# Visualizations

In [None]:
import pandas as pd
# Load the sentence DataFrame from a pickle stored under ../data.
# SECURITY: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
# NOTE(review): df_sentences is reassigned from 'sentences.csv' in a later
# cell -- confirm which of the two sources is the intended one.
sentences_pickle_path = "../data/df_sentences.pkl"
df_sentences = pd.read_pickle(sentences_pickle_path)


In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

In [2]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
import phate
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

### Utilizing BERTopic to create embeddings

In [None]:
# Read the sentence table from a CSV file in the current working directory.
# The following cells read its "sentences" column.
sentences_csv_path = 'sentences.csv'
df_sentences = pd.read_csv(sentences_csv_path)

In [None]:
# Documents to model: the "sentences" column as a plain Python list.
sentence_column = df_sentences["sentences"]
sentences = sentence_column.tolist()

# Sentence encoder handed to BERTopic. "all-MiniLM-L6-v2" can be swapped for
# any other SentenceTransformer model name.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
topic_model = BERTopic(verbose=True, embedding_model=embedding_model)

# Fit the topic model on the sentences and keep both values it returns:
# the topic assigned to each sentence and the associated probabilities.
fit_result = topic_model.fit_transform(sentences)
topics, probs = fit_result

In [None]:
# Encode every sentence with the SentenceTransformer to obtain the raw
# embedding vectors that the dimensionality-reduction cells operate on.
# NOTE(review): the sentences were already encoded once inside
# topic_model.fit_transform; this call encodes them a second time.
embeddings = embedding_model.encode(
    sentences,
    show_progress_bar=True,
)

### Creating visualization

We will use the dimensionality reduction method PHATE and compare it with three other methods: UMAP, t-SNE, and PCA.

In [None]:
# Project the sentence embeddings with each reduction method. Each result
# variable (pca, tsne, umap, phate_op) holds the transformed coordinates
# returned by fit_transform, not the fitted estimator.
pca_reducer = PCA(n_components=2)
pca = pca_reducer.fit_transform(embeddings)

tsne_reducer = TSNE(n_components=2, perplexity=30)
tsne = tsne_reducer.fit_transform(embeddings)

umap_reducer = UMAP(n_components=2)
umap = umap_reducer.fit_transform(embeddings)

# PHATE is left at its library defaults.
phate_reducer = phate.PHATE()
phate_op = phate_reducer.fit_transform(embeddings)

In [None]:
# Per-sentence topic ids as a NumPy array; used as the hue when plotting.
topic_nums = np.array(topics)

# Topic representations as returned by the fitted BERTopic model.
labels = topic_model.get_topics()

In [None]:
# Plot every 2-D projection in its own panel, coloured by topic id.
# Only the four reductions computed in this notebook are shown (PCA, t-SNE,
# UMAP, PHATE); no diffusion-map embedding is computed, so none is plotted.
methods = [pca, tsne, umap, phate_op]
titles = ["PCA", "t-SNE", "UMAP", "PHATE"]

# Size the grid from the number of methods so adding or removing a method
# only requires editing the two lists above.
n_cols = 2
n_rows = -(-len(methods) // n_cols)  # ceiling division
# squeeze=False keeps `axes` a 2-D array even for a single row or column.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(6 * n_cols, 5 * n_rows), squeeze=False)
panels = axes.ravel()

for ax, method, title in zip(panels, methods, titles):
    sns.scatterplot(
        x=method[:, 0],
        y=method[:, 1],
        hue=topic_nums,
        palette="tab10",
        s=10,
        ax=ax,
        legend=False,
    )
    ax.set_title(title)

# Hide only the panels that received no plot (none when the grid is full).
for unused_ax in panels[len(methods):]:
    unused_ax.axis("off")

plt.suptitle("Topic Clusters Visualized with Dimensionality Reduction", fontsize=16)
plt.tight_layout()
plt.show()