# 📊 BERT-Based Document Clustering

This notebook demonstrates how to cluster documents using sentence embeddings and visualize the results.

In [None]:
import json
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import seaborn as sns
import umap

# Apply seaborn's "whitegrid" style globally so every figure below shares one look.
sns.set(style="whitegrid")


In [None]:
# Load the corpus. Each entry is indexed by the "id" and "text" keys below, so the
# file is expected to hold a JSON array of objects carrying both fields.
# The encoding is given explicitly: without it, open() falls back to the platform
# default, which can mis-decode or reject non-ASCII text.
with open("data/documents.json", encoding="utf-8") as f:
    docs = json.load(f)

# Parallel lists: texts[i] is the body of the document whose identifier is ids[i].
texts = [doc["text"] for doc in docs]
ids = [doc["id"] for doc in docs]

# Fail fast with a readable message instead of an obscure error later in the
# embedding / clustering cells.
if not texts:
    raise ValueError("data/documents.json contains no documents to cluster.")

print(f"Loaded {len(texts)} documents.")


## 🧠 Generate Embeddings with Sentence-BERT

In [None]:
# Instantiate the sentence-transformers model by its checkpoint name.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every document in one call, asking for a NumPy array of normalized
# vectors (one row per entry of `texts`).
encode_options = {"convert_to_numpy": True, "normalize_embeddings": True}
embeddings = model.encode(texts, **encode_options)

# Last expression of the cell: show the shape of the embedding matrix.
embeddings.shape


## 🔹 KMeans Clustering

In [None]:
# Partition the embeddings into 3 clusters.
# n_init is pinned explicitly: scikit-learn changed its default (10 -> "auto") in
# release 1.4, so leaving it unset makes the number of restarts, and potentially
# the resulting labels, depend on the installed version even with a fixed seed.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans_labels = kmeans.fit_predict(embeddings)

# kmeans_labels[i] is the cluster assigned to texts[i]; list each document with it.
for label, text in zip(kmeans_labels, texts):
    print(f"[Cluster {label}] {text}")


In [None]:
# Score the KMeans labelling with the silhouette metric, computed on the same
# embedding matrix that was clustered, and report it to four decimals.
score = silhouette_score(X=embeddings, labels=kmeans_labels)
print(f"Silhouette Score: {score:.4f}")


## 📈 Visualize with UMAP

In [None]:
# Reduce the embeddings to two dimensions for plotting. Cosine distance is used
# as the UMAP metric and the seed is fixed so the layout is repeatable.
reducer = umap.UMAP(
    n_neighbors=5,
    min_dist=0.3,
    metric="cosine",
    random_state=42,
)
embedding_2d = reducer.fit_transform(embeddings)

# Scatter the 2-D projection, colouring each point by its KMeans cluster label.
fig, ax = plt.subplots(figsize=(8, 6))
umap_x, umap_y = embedding_2d[:, 0], embedding_2d[:, 1]
sns.scatterplot(x=umap_x, y=umap_y, hue=kmeans_labels, palette="Set2", s=80, ax=ax)
ax.set_title("UMAP projection of clustered docs")
plt.show()


## 🧬 Agglomerative Clustering (Optional)

In [None]:
# Average-linkage agglomerative clustering on cosine distance, 3 clusters.
# scikit-learn renamed the distance keyword from `affinity` to `metric`
# (deprecated in 1.2, removed in 1.4), so the old spelling raises TypeError on
# current releases. Try the current keyword first and fall back for older installs.
try:
    agglo = AgglomerativeClustering(n_clusters=3, metric="cosine", linkage="average")
except TypeError:
    # scikit-learn < 1.2 only accepts the legacy keyword name.
    agglo = AgglomerativeClustering(n_clusters=3, affinity="cosine", linkage="average")
agglo_labels = agglo.fit_predict(embeddings)

# Reuse the 2-D UMAP coordinates (`embedding_2d`) computed in the visualization
# cell so this plot is directly comparable with the KMeans one; only the
# colouring (agglomerative labels) differs.
plt.figure(figsize=(8, 6))
sns.scatterplot(x=embedding_2d[:, 0], y=embedding_2d[:, 1], hue=agglo_labels, palette="Set1", s=80)
plt.title("UMAP + Agglomerative Clustering")
plt.show()
