# 🧠 BERT + Topic Modeling

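This notebook demonstrates how to cluster documents by embedding them with Sentence-BERT and grouping the embeddings with HDBSCAN, how to visualize the resulting clusters with UMAP, and how to extract descriptive keywords for each cluster with KeyBERT.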

In [None]:
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import umap
import hdbscan
from keybert import KeyBERT
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score

# Apply seaborn's "whitegrid" style up front so it is in effect before any figure is drawn.
sns.set(style="whitegrid")


In [None]:
# Load the corpus. The code below expects the file to hold a JSON array of
# objects that each have a "text" key.
# JSON is UTF-8 by specification, so open the file with an explicit encoding
# rather than relying on the platform default.
with open("data/documents.json", encoding="utf-8") as f:
    docs = json.load(f)

# Keep only the raw text of each document; everything downstream works on this list.
texts = [doc["text"] for doc in docs]
print(f"Loaded {len(texts)} documents.")

# Fail here with a clear message instead of letting the encoder or clusterer
# fail later on an empty input.
if not texts:
    raise ValueError("data/documents.json contains no documents; nothing to embed or cluster.")


## 🔢 Sentence-BERT Embeddings

In [None]:
# Pretrained sentence-embedding checkpoint; the same `model` object is reused
# later in the notebook, so keep it around under this name.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encoder options: return a NumPy array and ask for normalized embeddings.
encode_options = {
    "convert_to_numpy": True,
    "normalize_embeddings": True,
}
embeddings = model.encode(texts, **encode_options)

# Bare last expression so the notebook displays the array shape.
embeddings.shape


## 🔍 HDBSCAN Clustering

In [None]:
# Cluster the sentence embeddings with HDBSCAN; groups of two documents
# already count as a cluster (min_cluster_size=2).
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=2,
    metric='euclidean',
)
labels = clusterer.fit_predict(embeddings)

# Label -1 is treated as noise, so drop it before counting distinct clusters.
n_clusters = np.unique(labels[labels != -1]).size

print("Cluster labels:", labels)
print("Number of clusters (excluding noise):", n_clusters)


## 📊 UMAP Visualization

In [None]:
# Reduce the embeddings to two dimensions so the clusters can be plotted.
# random_state=42 pins UMAP's seed.
reducer = umap.UMAP(n_neighbors=5, min_dist=0.3, metric='cosine', random_state=42)
embedding_2d = reducer.fit_transform(embeddings)

# "Set2" defines only 8 colours, so with more distinct labels several clusters
# would end up sharing a colour. Keep "Set2" while it is large enough and
# otherwise switch to a palette with exactly one colour per label.
n_labels = len(np.unique(labels))
palette = "Set2" if n_labels <= 8 else sns.color_palette("husl", n_labels)

# Explicit figure/axes interface so title, axis labels and legend are set on
# the same axes the points are drawn on.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=embedding_2d[:, 0],
    y=embedding_2d[:, 1],
    hue=labels,
    palette=palette,
    s=80,
    ax=ax,
)
ax.set_title("UMAP projection of clustered docs")
ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")
# Points labelled -1 are the ones the clustering step left as noise.
ax.legend(title="Cluster (-1 = noise)")
plt.show()


## 🧠 Topic Extraction with KeyBERT

In [None]:
# Reuse the sentence-embedding model as KeyBERT's backend.
kw_model = KeyBERT(model)

# Group document texts by cluster label, skipping noise points (label -1).
# Clusters are stored in the order in which they are first encountered.
cluster_to_docs = {}
clustered_pairs = [(cluster_id, doc_text) for doc_text, cluster_id in zip(texts, labels) if cluster_id != -1]
for cluster_id, doc_text in clustered_pairs:
    if cluster_id not in cluster_to_docs:
        cluster_to_docs[cluster_id] = []
    cluster_to_docs[cluster_id].append(doc_text)

# Treat each cluster as one long document and print its top 5 keyphrases
# (unigrams and bigrams, English stop words removed).
for label, group in cluster_to_docs.items():
    merged_text = " ".join(group)
    keywords = kw_model.extract_keywords(
        merged_text,
        keyphrase_ngram_range=(1, 2),
        stop_words="english",
        top_n=5,
    )
    # Each entry's first element is the keyphrase itself; only that is printed.
    phrases = [entry[0] for entry in keywords]
    print(f"Cluster {label}: {phrases}")
