# BERTopic

## Create And Run

In [None]:
%load_ext autoreload
%autoreload 2

import os
import sys
import importlib
# Put the parent directory on the import path so the project packages
# imported below (model, utils, data) can be resolved from this notebook.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

import model.model_utils as model_utils
import utils.utils3d as utils3d
import data.data_utils as data_utils

def reload_utils():
    """Reload the project helper modules so on-disk edits are picked up.

    Modules are reloaded in the order: model_utils, utils3d, data_utils.
    """
    for helper_module in (model_utils, utils3d, data_utils):
        importlib.reload(helper_module)


# Reload once right away so this kernel starts from the current sources.
reload_utils()

In [None]:
# Name of the dataset to analyse; handed to the project helper below.
dataset_name = "jack_vs_calley_1000"
# NOTE(review): presumably a sequence of comment strings (later cells encode
# it and zip it with topics) — confirm against data_utils.get_comments.
comments = data_utils.get_comments(dataset_name)

In [None]:
from sentence_transformers import SentenceTransformer
# The embedding model comes from a project helper; which checkpoint it loads
# is not visible in this notebook.
sentence_transformer: SentenceTransformer = model_utils.get_sentence_transformer_model()
# Encode the comments once; the result is reused for topic fitting and for
# the 3D plot further down.
embeddings = sentence_transformer.encode(comments)

In [None]:
# Topic model built by a project helper — presumably a configured BERTopic
# instance (per the notebook title); confirm in model_utils.get_model.
model = model_utils.get_model()
# Fit on the comments using the precomputed embeddings. `topics` is later
# zipped one-to-one with `comments`, i.e. one topic id per comment.
topics, probs = model.fit_transform(comments, embeddings)

## KeyBERT Cluster Labeling

In [None]:
from keybert import KeyBERT

# Derive one short label per topic by running KeyBERT over the topic's own
# top words, then register the labels on the model as custom labels.
kw_model = KeyBERT()
topic_labels = {}

# Iterate over the topic ids that actually occur, skipping the outlier topic
# (-1). The previous `range(len(set(topics)) - 1)` silently dropped the last
# real topic whenever no comment was assigned to -1.
real_topic_ids = sorted({int(topic) for topic in topics} - {-1})
for topic in real_topic_ids:
    words = model.get_topic(topic)
    if not words:
        # No representation available for this id; leave the default label.
        continue
    topic_text = ' '.join(word for word, _ in words)
    keywords = kw_model.extract_keywords(topic_text, keyphrase_ngram_range=(1, 2), top_n=1)
    # KeyBERT can return an empty list (e.g. no usable candidates); fall back
    # to the topic's first word instead of raising IndexError.
    topic_labels[topic] = keywords[0][0] if keywords else words[0][0]

model.set_topic_labels(topic_labels=topic_labels)

# Bare expression so the notebook displays the resulting mapping.
topic_labels

### Comments and their label

In [None]:
from itertools import islice

# Print the first 10 comments next to the label of the topic each comment was
# assigned to. The label must be looked up by the comment's topic id, not by
# the comment's position in the list.
for comment, topic in islice(zip(comments, topics), 10):
    # Outliers (-1) and any topic without a generated label have no entry in
    # topic_labels, so fall back to a readable placeholder.
    label = topic_labels.get(topic, f'unlabeled topic {topic}')
    print(f'{label}: {comment}')

### Top 10 clusters

In [None]:
# Display the 'Representation' column of the model's topic summary table
# (all rows of the table, not only the first ten).
model.get_topic_info()['Representation']

In [None]:
# Keep the full topic summary table in `freq` and display its first 10 rows.
freq = model.get_topic_info()
freq.head(10)

### Visualizations

In [None]:
# Topic overview plot; custom_labels=True asks for the custom labels
# (registered earlier via set_topic_labels) instead of the default names.
model.visualize_topics(custom_labels=True)

In [None]:
# Hierarchy plot of the topics, drawn with the custom labels
# (custom_labels=True) rather than the default topic names.
model.visualize_hierarchy(custom_labels=True)

In [None]:
# Bar charts for the topics, titled with the custom labels
# (custom_labels=True) rather than the default topic names.
model.visualize_barchart(custom_labels=True)

In [None]:
# Heatmap across topics, with axes using the custom labels
# (custom_labels=True) rather than the default topic names.
model.visualize_heatmap(custom_labels=True)

## 3D Visualize

### Filter Top Topics

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Tally how many comments fall into each topic id (ids come back sorted).
unique_topics, topic_counts = np.unique(topics, return_counts=True)

# Draw one bar per topic id on a 10x6 inch figure, using the Axes interface.
plt.figure(figsize=(10, 6))
distribution_axes = plt.gca()
distribution_axes.bar(unique_topics, topic_counts)

# Axis captions and title.
distribution_axes.set_title("Topic Distribution")
distribution_axes.set_xlabel("Topic ID")
distribution_axes.set_ylabel("Count")

# Put a tick on every topic id, then render.
distribution_axes.set_xticks(unique_topics)
plt.show()

In [None]:
# Inspect the representation of topic 0. Other cells unpack the result as
# (word, score) pairs.
model.get_topic(0)

In [None]:
# pandas is imported here because this cell builds a DataFrame and the
# notebook otherwise imports pandas only in a later cell (NameError when the
# cells are run top to bottom).
import pandas as pd

# Keep only the comments assigned to topics 0..9, together with their topic
# id and embedding. The three lists stay aligned index-by-index.
filtered_labels = []
filtered_comments = []
filtered_embeddings = []

for topic, comment, embedding in zip(topics, comments, embeddings):
    # Topic -1 (outliers) and topics with id >= 10 are dropped.
    if 0 <= topic <= 9:
        filtered_labels.append(topic)
        filtered_comments.append(comment)
        filtered_embeddings.append(embedding)

# One row per kept comment: topic id, comment text, embedding vector.
final_df_sorted_labels = pd.DataFrame({'label': filtered_labels, 'sentence': filtered_comments, 'embedding': filtered_embeddings})

### Visualized

In [None]:
# Build the 3D figure from the filtered embeddings, comments and topic ids
# using the project helper utils3d.get_3d_fig.
fig = utils3d.get_3d_fig(filtered_embeddings, filtered_comments, filtered_labels)
# Bare expression as the last line of the cell so the notebook renders it.
fig

In [None]:
import pandas as pd

# Look counts up by topic id instead of by position. Indexing
# topic_counts[topic_num + 1] is only correct when the outlier topic -1 is
# the first entry of unique_topics; without outliers every count is shifted
# by one (or the last lookup raises IndexError).
counts_by_topic = {int(topic_id): int(count) for topic_id, count in zip(unique_topics, topic_counts)}

# Summary table for topics 0..9: top words and number of assigned comments.
data = []
for topic_num in range(10):
    topic_words = model.get_topic(topic_num)
    if not topic_words:
        # Fewer than 10 topics were found: there is nothing to unpack for
        # this id, so skip it rather than fail.
        continue
    top_words = [word for word, _ in topic_words]
    data.append({'Topic': topic_num, 'Top Words': ', '.join(top_words), 'Num Comments': counts_by_topic.get(topic_num, 0)})


topics_top_words_df = pd.DataFrame(data)
# Widen the column display so the joined word lists are not truncated.
pd.set_option('display.max_colwidth', 200)
topics_top_words_df