## Doing Clustering to find Major Topics

### Init/All Podcasts

In [None]:
!pip install sentence_transformers
!pip install hdbscan
!pip install bertopic

Collecting sentence_transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/171.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━[0m [32m143.4/171.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
import random
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from spacy.lang.en import English
from scipy.special import softmax

In [None]:
# Blank English pipeline with only the sentencizer component added;
# the rest of the notebook uses `nlp` solely to split text into sentences.
nlp = English()
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x786624f6abc0>

In [None]:
# Load the full podcast dataset; later cells read its "document" and "source" columns.
# NOTE(review): absolute Google Drive path — assumes Drive is already mounted at /content/drive
# (no mount cell is visible in this notebook).
df = pd.read_csv("/content/drive/MyDrive/NLP2024/project/full_dataset.csv")

In [None]:
# Raw transcripts as an array.
# NOTE(review): `docs` is reassigned from the "ad_free" column further down before the model is fitted,
# so this value looks unused in the visible cells.
docs = df["document"].values

In [None]:
# Split each transcript into sentences and store them as plain strings.
# Converting inside the lambda avoids keeping spaCy Span objects in the DataFrame.
# NOTE(review): assumes every "document" value is a string; a missing (NaN) entry would make nlp(x) fail.
df["split_doc"] = df["document"].apply(lambda x: [str(sent) for sent in nlp(x).sents])

In [None]:
# Ensure every sentence held in "split_doc" is a plain Python string.
df["split_doc"] = df["split_doc"].apply(lambda sents: list(map(str, sents)))

In [None]:
# Substrings that mark a sentence as boilerplate (ads, show credits, outlet self-references).
# apply_removal drops any sentence containing one of these, using a case-sensitive substring test.
# NOTE(review): "producer" is listed twice, and "declined comment" / "request for comment"
# are already covered by the shorter entry "comment".
# NOTE(review): "sandra" is lowercase, so it cannot match a capitalised "Sandra" — confirm which
# form appears in the transcripts.
bad_words = ["WSJ", "declined comment", "wsj.com", "theme music", "produced today", "voicemail", "request for comment",
                "comment", "Amazon slash", "producer", "prime membership", "sandra", "Progressive", "Prime", "Amazon Music", "NPR", "streaming",
             "Wall Street", "Fox", "Reuters News", "producer", "Thank you"]

In [None]:
# Strip the recurring ad reads (Amazon Prime, Progressive Insurance) and other boilerplate.
def apply_removal(doc):
  """Return the sentences of `doc` that contain no `bad_words` entry, joined by single spaces.

  `doc` is an iterable of sentence strings; matching is a case-sensitive substring test.
  """
  kept = [sentence for sentence in doc
          if all(marker not in sentence for marker in bad_words)]
  return " ".join(kept)

In [None]:
# Rebuild each transcript from its sentences, minus those flagged by apply_removal.
df["ad_free"] = df["split_doc"].apply(apply_removal)

In [None]:
# Cleaned transcripts; this is the corpus passed to the all-podcast topic model below.
docs = df["ad_free"].values

In [None]:
check_point = 'all-MiniLM-L6-v2'
# Embedding model
embedding_model = SentenceTransformer(check_point)
# Clustering model for BERT
# min_cluster_size=3: a cluster needs at least 3 documents (podcasts) to form a topic.
# NOTE(review): this comment previously said "at least 4 podcasts", which does not match the value 3 —
# confirm the intended threshold.
cluster_model = HDBSCAN(min_cluster_size=3,
                        metric='euclidean',
                        cluster_selection_method='leaf',
                        prediction_data=True)
# Following this advice after getting stop words https://maartengr.github.io/BERTopic/faq.html#how-do-i-reduce-topic-outliers
# English stop words are removed and 1- to 3-word n-grams are counted for the topic representations.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1,3))
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Assemble the topic model from the components configured above.
# nr_topics=10 requests a reduction to 10 topics after clustering; top_n_words=10 keeps 10 words per topic.
# NOTE(review): min_topic_size is documented as configuring BERTopic's default HDBSCAN; with a custom
# hdbscan_model it probably has no effect — confirm and drop it if so.
topic_model = BERTopic(embedding_model=embedding_model,
                       hdbscan_model=cluster_model,
                       vectorizer_model=vectorizer_model,
                       ctfidf_model=ctfidf_model,
                       nr_topics=10,
                       top_n_words=10,
                       min_topic_size=3,
                       verbose=True)

In [None]:
# There is a degree of randomness here; results will not always be constant! However they will be similar.
# NOTE(review): no random seed is set in the visible cells, so topic assignments can differ between runs.
# Fits on the cleaned whole-episode transcripts (`docs`).
topics, probs = topic_model.fit_transform(docs)

2024-05-12 18:50:08,332 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-12 18:50:11,883 - BERTopic - Embedding - Completed ✓
2024-05-12 18:50:11,884 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-12 18:50:21,790 - BERTopic - Dimensionality - Completed ✓
2024-05-12 18:50:21,792 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-12 18:50:21,806 - BERTopic - Cluster - Completed ✓
2024-05-12 18:50:21,808 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-12 18:50:23,564 - BERTopic - Representation - Completed ✓
2024-05-12 18:50:23,571 - BERTopic - Topic reduction - Reducing number of topics
2024-05-12 18:50:25,394 - BERTopic - Topic reduction - Reduced number of topics from 11 to 10


In [None]:
# Bar charts of the top words per topic (up to 12 topics requested).
topic_model.visualize_barchart(top_n_topics = 12)

In [None]:
# Plot the fitted documents, passing the same `docs` that the model was trained on.
topic_model.visualize_documents(docs)

In [None]:
# Show how the fitted topics relate to one another hierarchically.
topic_model.visualize_hierarchy()

### WSJ Analysis

In [None]:
# Keep only the rows whose source is WSJ.
wsj = df.loc[df["source"] == "WSJ"]

In [None]:
# Ad-stripped WSJ transcripts, one entry per episode.
wsj_docs = wsj["ad_free"].values

In [None]:
# Flatten every WSJ transcript into a single list of spaCy sentence spans.
wsj_sents = [sentence
             for transcript in wsj_docs
             for sentence in nlp(transcript).sents]


In [None]:
# Convert each sentence to a plain string.
wsj_sents = list(map(str, wsj_sents))

In [None]:
# Remove intro/specific advertisements
# This is slashing with a pretty big sword but I do not know a better way to do it
# An advertisement model failed to help me here so.
def apply_removal_wsj(doc, substrings=None, verbose=True, ignore_case=False):
  """Drop every sentence that contains one of the boilerplate substrings.

  Args:
    doc: iterable of sentence strings.
    substrings: markers to filter on; defaults to the WSJ-specific list below.
    verbose: when True (the default), print each removed sentence for inspection.
    ignore_case: when True, compare case-insensitively. The default (False) keeps
      the original case-sensitive substring test.

  Returns:
    A list of the sentences that matched none of the markers, in input order.
  """
  if substrings is None:
    # NOTE(review): "sandra" is lowercase and cannot match a capitalised "Sandra"
    # unless ignore_case=True — confirm which form the transcripts use.
    substrings = ["WSJ", "declined comment", "wsj.com", "theme music", "produced today", "voicemail", "request for comment",
                  "comment", "Amazon slash", "producer", "prime membership", "sandra"]
  # Normalise the markers once instead of once per sentence.
  markers = [s.lower() for s in substrings] if ignore_case else list(substrings)
  final_list = []
  for sentence in doc:
    haystack = sentence.lower() if ignore_case else sentence
    if any(marker in haystack for marker in markers):
      if verbose:
        print(sentence)
    else:
      final_list.append(sentence)
  return final_list

In [None]:
# Filter the WSJ sentences (each removed sentence is printed by the function).
# NOTE(review): the fit cell in this section is given wsj_docs, so this filtered list appears unused.
wsj_sents_mod = apply_removal_wsj(wsj_sents)

In [None]:
check_point = 'all-MiniLM-L6-v2'
# Embedding model
embedding_model = SentenceTransformer(check_point)
# Making more strict on an individual sentence level
# NOTE(review): the fit cell in this section passes whole documents (wsj_docs), not sentences —
# confirm which granularity min_cluster_size=2 is meant for.
cluster_model = HDBSCAN(min_cluster_size=2,
                        metric='euclidean',
                        cluster_selection_method='leaf',
                        prediction_data=True)
# Following this advice after getting stop words https://maartengr.github.io/BERTopic/faq.html#how-do-i-reduce-topic-outliers
# Unigrams only here, unlike the all-podcast section which uses ngram_range=(1,3).
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1,1))
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [None]:
# Build the WSJ topic model; this rebinds the notebook-wide `topic_model` name.
# NOTE(review): min_topic_size probably has no effect when a custom hdbscan_model is supplied — confirm.
topic_model = BERTopic(embedding_model=embedding_model,
                       hdbscan_model=cluster_model,
                       vectorizer_model=vectorizer_model,
                       ctfidf_model=ctfidf_model,
                       nr_topics=10,
                       top_n_words=10,
                       min_topic_size=3,
                       verbose=True)

In [None]:
# There is a degree of randomness here; results will not always be constant! However they will be similar.
# NOTE(review): this fits on whole documents (wsj_docs) while the Reuters and Fox sections fit on the
# filtered sentence lists; wsj_sents_mod is computed above but not used — confirm this is intended.
topics, probs = topic_model.fit_transform(wsj_docs)

2024-05-11 20:59:34,002 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-05-11 20:59:39,104 - BERTopic - Embedding - Completed ✓
2024-05-11 20:59:39,108 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-11 20:59:41,501 - BERTopic - Dimensionality - Completed ✓
2024-05-11 20:59:41,503 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-11 20:59:41,519 - BERTopic - Cluster - Completed ✓
2024-05-11 20:59:41,522 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-11 20:59:41,706 - BERTopic - Representation - Completed ✓
2024-05-11 20:59:41,708 - BERTopic - Topic reduction - Reducing number of topics
2024-05-11 20:59:41,712 - BERTopic - Topic reduction - Reduced number of topics from 6 to 6


In [None]:
# Bar charts of the top words for up to 6 WSJ topics.
topic_model.visualize_barchart(top_n_topics = 6)

### NPR

In [None]:
# Restrict to NPR episodes and pull out their ad-stripped transcripts.
npr = df.loc[df["source"] == "NPR"]
npr_docs = npr["ad_free"].values

In [None]:
# Flatten every NPR transcript into a single list of spaCy sentence spans.
npr_sents = [sentence
             for transcript in npr_docs
             for sentence in nlp(transcript).sents]


In [None]:
# Convert each sentence to a plain string.
npr_sents = list(map(str, npr_sents))

In [None]:
# Remove intro/specific advertisements
# This is slashing with a pretty big sword but I do not know a better way to do it
# An advertisement model failed to help me here so.
def apply_removal_npr(doc, substrings=None, verbose=True, ignore_case=False):
  """Drop every sentence that contains one of the boilerplate substrings.

  Args:
    doc: iterable of sentence strings.
    substrings: markers to filter on; defaults to the NPR-specific list below.
    verbose: when True (the default), print each removed sentence for inspection.
    ignore_case: when True, compare case-insensitively. The default (False) keeps
      the original case-sensitive substring test.

  Returns:
    A list of the sentences that matched none of the markers, in input order.
  """
  if substrings is None:
    # NOTE(review): NPR's site is npr.org, so "npr.com" may never occur in the text — confirm.
    # NOTE(review): "sandra" is lowercase and cannot match "Sandra" unless ignore_case=True.
    substrings = ["NPR", "declined comment", "npr.com", "theme music", "produced today", "voicemail", "request for comment",
                  "comment", "Amazon slash", "producer", "prime membership", "sandra", "Good morning", "Thanks for listening", "Today is"]
  # Normalise the markers once instead of once per sentence.
  markers = [s.lower() for s in substrings] if ignore_case else list(substrings)
  final_list = []
  for sentence in doc:
    haystack = sentence.lower() if ignore_case else sentence
    if any(marker in haystack for marker in markers):
      if verbose:
        print(sentence)
    else:
      final_list.append(sentence)
  return final_list

In [None]:
# Filter the NPR sentences (each removed sentence is printed by the function).
# NOTE(review): the fit cell in this section is given npr_docs, so this filtered list appears unused.
npr_sents_mod = apply_removal_npr(npr_sents)

In [None]:
check_point = 'all-MiniLM-L6-v2'
# Embedding model
embedding_model = SentenceTransformer(check_point)
# Making more strict on an individual sentence level
# NOTE(review): the fit cell in this section passes whole documents (npr_docs), not sentences —
# confirm which granularity min_cluster_size=2 is meant for.
cluster_model = HDBSCAN(min_cluster_size=2,
                        metric='euclidean',
                        cluster_selection_method='leaf',
                        prediction_data=True)
# Following this advice after getting stop words https://maartengr.github.io/BERTopic/faq.html#how-do-i-reduce-topic-outliers
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1,1))
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [None]:
# Build the NPR topic model; this rebinds the notebook-wide `topic_model` name.
# NOTE(review): min_topic_size probably has no effect when a custom hdbscan_model is supplied — confirm.
topic_model = BERTopic(embedding_model=embedding_model,
                       hdbscan_model=cluster_model,
                       vectorizer_model=vectorizer_model,
                       ctfidf_model=ctfidf_model,
                       nr_topics=10,
                       top_n_words=10,
                       min_topic_size=3,
                       verbose=True)

In [None]:
# There is a degree of randomness here; results will not always be constant! However they will be similar.
# NOTE(review): this fits on whole documents (npr_docs) while the Reuters and Fox sections fit on the
# filtered sentence lists; npr_sents_mod is computed above but not used — confirm this is intended.
topics, probs = topic_model.fit_transform(npr_docs)

2024-05-11 20:57:55,096 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-05-11 20:58:01,807 - BERTopic - Embedding - Completed ✓
2024-05-11 20:58:01,814 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-11 20:58:05,974 - BERTopic - Dimensionality - Completed ✓
2024-05-11 20:58:05,975 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-11 20:58:05,984 - BERTopic - Cluster - Completed ✓
2024-05-11 20:58:05,987 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-11 20:58:06,132 - BERTopic - Representation - Completed ✓
2024-05-11 20:58:06,134 - BERTopic - Topic reduction - Reducing number of topics
2024-05-11 20:58:06,140 - BERTopic - Topic reduction - Reduced number of topics from 8 to 8


In [None]:
# Bar charts of the top words for up to 10 NPR topics.
topic_model.visualize_barchart(top_n_topics = 10)

### Reuters

In [None]:
# Restrict to Reuters episodes and pull out their ad-stripped transcripts.
reuters = df.loc[df["source"] == "REUTERS"]
reuters_docs = reuters["ad_free"].values

In [None]:
# Flatten every Reuters transcript into a single list of spaCy sentence spans.
reuters_sents = [sentence
                 for transcript in reuters_docs
                 for sentence in nlp(transcript).sents]


In [None]:
# Convert each sentence to a plain string.
reuters_sents = list(map(str, reuters_sents))

In [None]:
# Remove intro/specific advertisements
# This is slashing with a pretty big sword but I do not know a better way to do it
# An advertisement model failed to help me here so.
def apply_removal_reuters(doc, substrings=None, verbose=True, ignore_case=False):
  """Drop every sentence that contains one of the boilerplate substrings.

  Args:
    doc: iterable of sentence strings.
    substrings: markers to filter on; defaults to the Reuters-specific list below.
    verbose: when True (the default), print each removed sentence for inspection.
    ignore_case: when True, compare case-insensitively. The default (False) keeps
      the original case-sensitive substring test.

  Returns:
    A list of the sentences that matched none of the markers, in input order.
  """
  if substrings is None:
    # NOTE(review): "reuters" and "sandra" are lowercase, so with the default case-sensitive
    # test they cannot match "Reuters" / "Sandra" — pass ignore_case=True if that is the intent.
    substrings = ["reuters", "declined comment", "reuters.com", "theme music", "produced today", "voicemail", "request for comment",
                  "comment", "Amazon slash", "producer", "prime membership", "sandra", "Good morning", "Thanks for listening", "Today is"]
  # Normalise the markers once instead of once per sentence.
  markers = [s.lower() for s in substrings] if ignore_case else list(substrings)
  final_list = []
  for sentence in doc:
    haystack = sentence.lower() if ignore_case else sentence
    if any(marker in haystack for marker in markers):
      if verbose:
        print(sentence)
    else:
      final_list.append(sentence)
  return final_list

In [None]:
# Filter the Reuters sentences (each removed sentence is printed by the function);
# the result is the corpus fitted in this section.
reuters_sents_mod = apply_removal_reuters(reuters_sents)

In [None]:
check_point = 'all-MiniLM-L6-v2'
# Embedding model
embedding_model = SentenceTransformer(check_point)
# Making more strict on an individual sentence level
# min_cluster_size=20 here, versus 2 in the WSJ and NPR sections.
cluster_model = HDBSCAN(min_cluster_size=20,
                        metric='euclidean',
                        cluster_selection_method='leaf',
                        prediction_data=True)
# Following this advice after getting stop words https://maartengr.github.io/BERTopic/faq.html#how-do-i-reduce-topic-outliers
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1,1))
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [None]:
# Build the Reuters topic model; this rebinds the notebook-wide `topic_model` name.
# NOTE(review): min_topic_size=3 disagrees with the HDBSCAN min_cluster_size=20 configured for this
# section and probably has no effect when a custom hdbscan_model is supplied — confirm.
topic_model = BERTopic(embedding_model=embedding_model,
                       hdbscan_model=cluster_model,
                       vectorizer_model=vectorizer_model,
                       ctfidf_model=ctfidf_model,
                       nr_topics=10,
                       top_n_words=10,
                       min_topic_size=3,
                       verbose=True)

In [None]:
# There is a degree of randomness here; results will not always be constant! However they will be similar.
# NOTE(review): no random seed is set in the visible cells, so topic assignments can differ between runs.
# Fits on the filtered Reuters sentences rather than on whole documents.
topics, probs = topic_model.fit_transform(reuters_sents_mod)

2024-05-11 21:10:16,722 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/86 [00:00<?, ?it/s]

2024-05-11 21:10:57,821 - BERTopic - Embedding - Completed ✓
2024-05-11 21:10:57,825 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-11 21:11:14,315 - BERTopic - Dimensionality - Completed ✓
2024-05-11 21:11:14,321 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-11 21:11:14,437 - BERTopic - Cluster - Completed ✓
2024-05-11 21:11:14,438 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-11 21:11:14,567 - BERTopic - Representation - Completed ✓
2024-05-11 21:11:14,569 - BERTopic - Topic reduction - Reducing number of topics
2024-05-11 21:11:14,683 - BERTopic - Topic reduction - Reduced number of topics from 27 to 10


In [None]:
# Bar charts of the top words for up to 10 Reuters topics.
topic_model.visualize_barchart(top_n_topics = 10)

### Fox

In [None]:
# Restrict to Fox episodes and pull out their ad-stripped transcripts.
fox = df.loc[df["source"] == "FOX"]
fox_docs = fox["ad_free"].values

In [None]:
# Flatten every Fox transcript into a single list of spaCy sentence spans.
fox_sents = [sentence
             for transcript in fox_docs
             for sentence in nlp(transcript).sents]


In [None]:
# Convert each sentence to a plain string.
fox_sents = list(map(str, fox_sents))

In [None]:
# Remove intro/specific advertisements
# This is slashing with a pretty big sword but I do not know a better way to do it
# An advertisement model failed to help me here so.
def apply_removal_fox(doc, substrings=None, verbose=True, ignore_case=False):
  """Drop every sentence that contains one of the boilerplate substrings.

  Args:
    doc: iterable of sentence strings.
    substrings: markers to filter on; defaults to the Fox-specific list below.
    verbose: when True (the default), print each removed sentence for inspection.
    ignore_case: when True, compare case-insensitively. The default (False) keeps
      the original case-sensitive substring test.

  Returns:
    A list of the sentences that matched none of the markers, in input order.
  """
  if substrings is None:
    # NOTE(review): "fox", "sandra", "dave", "anthony" and "chris" are lowercase, so with the
    # default case-sensitive test they cannot match capitalised names — pass ignore_case=True
    # if that is the intent.
    # NOTE(review): matching is by substring, so short markers also hit inside longer words
    # (e.g. "uh" occurs in "Muhammad", "Thanks" in "Thanksgiving").
    substrings = ["fox", "declined comment", "fox.com", "theme music", "produced today", "voicemail", "request for comment",
                  "Amazon slash", "producer", "prime membership", "sandra", "Good morning", "Thanks for listening", "Today is",
                  "Thanks", "Jessica", "podcast", "dave", "anthony", "chris", "yeah", "okay", "uh", "appreciate"]
  # Normalise the markers once instead of once per sentence.
  markers = [s.lower() for s in substrings] if ignore_case else list(substrings)
  final_list = []
  for sentence in doc:
    haystack = sentence.lower() if ignore_case else sentence
    if any(marker in haystack for marker in markers):
      if verbose:
        print(sentence)
    else:
      final_list.append(sentence)
  return final_list

In [None]:
# Filter the Fox sentences (each removed sentence is printed by the function);
# the result is the corpus fitted in this section.
fox_sents_mod = apply_removal_fox(fox_sents)

In [None]:
check_point = 'all-MiniLM-L6-v2'
# Embedding model
embedding_model = SentenceTransformer(check_point)
# Making more strict on an individual sentence level
# min_cluster_size=20 here, versus 2 in the WSJ and NPR sections.
cluster_model = HDBSCAN(min_cluster_size=20,
                        metric='euclidean',
                        cluster_selection_method='leaf',
                        prediction_data=True)
# Following this advice after getting stop words https://maartengr.github.io/BERTopic/faq.html#how-do-i-reduce-topic-outliers
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1,1))
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [None]:
# Build the Fox topic model; this rebinds the notebook-wide `topic_model` name.
# NOTE(review): min_topic_size=3 disagrees with the HDBSCAN min_cluster_size=20 configured for this
# section and probably has no effect when a custom hdbscan_model is supplied — confirm.
topic_model = BERTopic(embedding_model=embedding_model,
                       hdbscan_model=cluster_model,
                       vectorizer_model=vectorizer_model,
                       ctfidf_model=ctfidf_model,
                       nr_topics=10,
                       top_n_words=10,
                       min_topic_size=3,
                       verbose=True)

In [None]:
# There is a degree of randomness here; results will not always be constant! However they will be similar.
# NOTE(review): no random seed is set in the visible cells, so topic assignments can differ between runs.
# Fits on the filtered Fox sentences rather than on whole documents.
topics, probs = topic_model.fit_transform(fox_sents_mod)

2024-05-11 21:42:40,419 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/261 [00:00<?, ?it/s]

2024-05-11 21:44:46,853 - BERTopic - Embedding - Completed ✓
2024-05-11 21:44:46,855 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-11 21:45:06,245 - BERTopic - Dimensionality - Completed ✓
2024-05-11 21:45:06,248 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-11 21:45:06,659 - BERTopic - Cluster - Completed ✓
2024-05-11 21:45:06,661 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-11 21:45:07,015 - BERTopic - Representation - Completed ✓
2024-05-11 21:45:07,017 - BERTopic - Topic reduction - Reducing number of topics
2024-05-11 21:45:07,329 - BERTopic - Topic reduction - Reduced number of topics from 85 to 10


In [None]:
# Bar charts of the top words for up to 10 Fox topics.
topic_model.visualize_barchart(top_n_topics = 10)