## BERTopic

In [None]:
# Base name (without the .csv extension) of the comments dataset used in this notebook.
dataset_name = "jack_vs_calley_1000"

Create and run BERTopic

In [None]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer


# Load the comments dataset (the path is relative to the notebook's working directory).
df = pd.read_csv(f"../datasets/youtube-comments/{dataset_name}.csv")

# The comments are expected in a column named 'text'. Fail early with a
# descriptive message rather than a bare KeyError if the CSV is laid out differently.
if 'text' not in df.columns:
    raise KeyError(
        f"Expected a 'text' column in {dataset_name}.csv, found: {list(df.columns)}"
    )

# Drop missing comments and coerce the remainder to str: pandas reads empty
# cells as NaN (a float), and the embedding model needs string documents.
comments = df['text'].dropna().astype(str).tolist()

# Bag-of-words model used for the topic representations; English stop words are
# removed so they do not dominate the topic keywords.
vectorizer_model = CountVectorizer(stop_words="english")
# Sentence embedding model used to embed the comments before clustering.
sentence_model = SentenceTransformer('all-mpnet-base-v2')

# Create and fit the BERTopic model.
# NOTE(review): no random seed is set anywhere, so topic assignments may differ
# between runs -- confirm whether reproducibility matters for this analysis.
# NOTE(review): `language` is presumably redundant when an explicit
# `embedding_model` is supplied -- verify against the BERTopic documentation.
model = BERTopic(
    vectorizer_model=vectorizer_model,
    embedding_model=sentence_model,
    language='english',
    calculate_probabilities=True,
    verbose=True)

# `topics` holds one topic id per entry of `comments` (same order);
# `probabilities` is the second value returned by fit_transform.
topics, probabilities = model.fit_transform(comments)


An example of cluster labeling using KeyBERT

In [None]:
from keybert import KeyBERT

kw_model = KeyBERT()

# Maps topic id -> short label extracted from that topic's top words.
topic_labels = {}

# Iterate over the topic ids that were actually assigned, skipping the outlier
# topic (-1). The previous `range(len(set(topics)) - 1)` assumed an outlier
# topic always exists and silently skipped the last real topic when it did not.
for topic in sorted(set(topics) - {-1}):
    words = model.get_topic(topic)
    if not words:
        # Nothing to build a label from for this id; leave it unlabeled.
        continue

    # Each entry of `words` is a (word, score) pair; only the words are needed.
    top_words = [word for word, _score in words]
    keywords = kw_model.extract_keywords(
        ' '.join(top_words),
        keyphrase_ngram_range=(1, 2),
        top_n=1,
    )
    # Fall back to the topic's first word if no keyphrase was extracted,
    # instead of failing with IndexError on an empty result.
    topic_labels[topic] = keywords[0][0] if keywords else top_words[0]

# Register the labels on the model so plots can use them via custom_labels=True.
model.set_topic_labels(topic_labels=topic_labels)
topic_labels

Comments and their labels

In [None]:
# Show the first ten comments next to the label of the topic each one was
# assigned to. The label must be looked up through the comment's topic id
# (`topics[i]`), not through the comment's position `i`. Comments whose topic
# has no label (e.g. the outlier topic -1) get a placeholder instead of
# raising KeyError. Slicing also copes with fewer than ten comments.
for comment, topic in zip(comments[:10], topics[:10]):
    label = topic_labels.get(topic, "(unlabeled)")
    print(f'{label}: {comment}')

Top 10 clusters

In [None]:
# Per-topic summary table from the fitted model; display only its first ten rows.
# NOTE(review): if an outlier topic (-1) exists it presumably appears in this
# table as well -- confirm before reading the rows as "top 10 clusters".
freq = model.get_topic_info()
freq.head(n=10)

### Visualizations

In [None]:
# Plot the topics, labelled with the custom labels registered on the model.
topics_fig = model.visualize_topics(custom_labels=True)
topics_fig

In [None]:
# Plot the topic hierarchy, labelled with the custom labels registered on the model.
hierarchy_fig = model.visualize_hierarchy(custom_labels=True)
hierarchy_fig

In [None]:
# Plot the per-topic bar charts, titled with the custom labels registered on the model.
barchart_fig = model.visualize_barchart(custom_labels=True)
barchart_fig

In [None]:
# Plot the topic heatmap, labelled with the custom labels registered on the model.
heatmap_fig = model.visualize_heatmap(custom_labels=True)
heatmap_fig