# Document Clustering with LLM Embeddings
Uses sentence-transformers to embed the documents, clusters the embeddings with KMeans, and plots a 2-D PCA projection of the result.

In [None]:
# Install
!pip install -q sentence-transformers scikit-learn

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Toy corpus: three topics (healthcare/ML, finance, cooking), two docs each.
docs = [
    "AI is transforming healthcare.",
    "Hospitals use machine learning for diagnosis.",
    "Stock markets fluctuate daily.",
    "Investors analyze financial reports.",
    "Cooking recipes involve ingredients and steps.",
    "Baking bread requires yeast and flour.",
]

# Tunables kept in one place so they are easy to change along with `docs`.
N_CLUSTERS = 3     # number of KMeans clusters to fit
RANDOM_STATE = 42  # fixed seed passed to KMeans

# Embed every document; `emb` is indexed row-per-document below.
model = SentenceTransformer('all-MiniLM-L6-v2')
emb = model.encode(docs)

# Cluster the embeddings; `labels[i]` is the cluster id assigned to `docs[i]`.
km = KMeans(
    n_clusters=N_CLUSTERS,
    random_state=RANDOM_STATE,
    n_init='auto',
).fit(emb)
labels = km.labels_

# Project the embeddings to 2-D for plotting only (clustering used `emb`).
# NOTE: `pca` holds the projected coordinates, not the PCA estimator.
pca = PCA(n_components=2).fit_transform(emb)

# Scatter the projected points, coloured by cluster label.
plt.scatter(pca[:, 0], pca[:, 1], c=labels, cmap='tab10')

# Tag each point with its index into `docs` so it can be looked up.
for i, (x, y) in enumerate(pca):
    plt.annotate(str(i), (x, y))

plt.title('Doc clusters (PCA of embeddings)')
plt.show()


## Notes
- Replace `docs` with your own corpus, and adjust the number of clusters (`n_clusters`) to the number of topics you expect.
- Try different embedding models (OpenAI, Instructor).