In [None]:

!pip install -q sentence-transformers scikit-learn pandas nltk faiss-cpu torch gensim

In [None]:

# 1) Imports
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import nltk
nltk.download('punkt')

In [None]:

# 2) Load dataset and preprocess
# Pull every split of 20 Newsgroups, asking scikit-learn to strip headers,
# footers and quoted replies from each post.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers','footers','quotes'))
docs, labels = newsgroups.data, newsgroups.target
target_names = newsgroups.target_names

# Flatten each post onto a single line and trim surrounding whitespace.
raw_frame = pd.DataFrame({'document': docs, 'label': labels})
flattened_text = raw_frame['document'].str.replace('\n', ' ').str.strip()
df = raw_frame.assign(document=flattened_text)

# Keep only posts longer than 20 characters; the index is rebuilt so row
# labels are 0..len(df)-1 again.
long_enough = df['document'].str.len() > 20
df = df[long_enough].reset_index(drop=True)
print("Loaded dataset, documents:", len(df))

In [None]:

# 3) Load SentenceTransformer model (safe)
from sentence_transformers import SentenceTransformer

# Checkpoint used for every document/query embedding; inference is pinned to the CPU.
encoder_checkpoint = 'all-MiniLM-L6-v2'
print("Loading SentenceTransformer (all-MiniLM-L6-v2). This may take a moment...")
model = SentenceTransformer(encoder_checkpoint, device='cpu')
print("Model loaded.")

In [None]:
# 4) Generate BERT embeddings for all documents (safe batching)
# Encode the cleaned posts in batches of 64 and get the result back as a NumPy array.
documents = list(df['document'])
encode_options = dict(batch_size=64, show_progress_bar=True, convert_to_numpy=True)
embeddings = model.encode(documents, **encode_options)
print("Embeddings shape:", embeddings.shape)

In [None]:
# 5) KMeans clustering
from sklearn.cluster import KMeans

# Partition the embeddings into 20 groups; the seed is fixed and the best of
# 10 initialisations is kept.
num_clusters = 20
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
kmeans.fit(embeddings)
df['cluster'] = kmeans.labels_  # one cluster id per row of df

cluster_sizes = df['cluster'].value_counts().sort_index()
print("KMeans clustering done. Cluster counts:")
print(cluster_sizes)

In [None]:
# ==========================================
# AUTO-LABEL ALL 20 CLUSTERS
# ==========================================
from sentence_transformers import SentenceTransformer, util
import torch

# Real-world categories you want to assign
categories = [
    "sports",
    "politics",
    "technology",
    "religion",
    "business",
    "health",
    "education",
    "science",
    "entertainment",
    "crime",
    "travel",
    "law",
    "finance",
    "history"
]

# Load embedding model
label_model = SentenceTransformer("all-MiniLM-L6-v2")

# Pre-compute category embeddings
category_embs = label_model.encode(categories, convert_to_tensor=True)

cluster_real_labels = {}

for cluster_id in sorted(cluster_topics.keys()):

    keywords = cluster_topics[cluster_id]
    keyword_text = " ".join(keywords)

    # Embed cluster keywords
    key_emb = label_model.encode(keyword_text, convert_to_tensor=True)

    # Compare with each category
    sim_scores = util.cos_sim(key_emb, category_embs)[0]

    # Best matching category
    best_idx = torch.argmax(sim_scores).item()
    best_label = categories[best_idx]
    best_score = sim_scores[best_idx].item()

    cluster_real_labels[cluster_id] = (best_label, best_score)

# ==========================
# PRINT FINAL RESULTS
# ==========================
print("\n==============================")
print(" AUTO LABELS FOR ALL 20 CLUSTERS ")
print("==============================\n")

for cid, (label, score) in cluster_real_labels.items():
    print(f"Cluster {cid}:  {label.upper()}   (confidence={score:.3f})")


In [None]:

# 6) Topic modelling per cluster (TF-IDF top keywords)
from sklearn.feature_extraction.text import TfidfVectorizer
def extract_cluster_topics(df, num_keywords=10):
    cluster_topics = {}
    for cid in sorted(df['cluster'].unique()):
        docs_cluster = df[df['cluster']==cid]['document'].values
        if len(docs_cluster) == 0:
            cluster_topics[cid] = []
            continue
        vect = TfidfVectorizer(stop_words='english', max_features=2000)
        X = vect.fit_transform(docs_cluster)
        avg = X.mean(axis=0).A1
        terms = vect.get_feature_names_out()
        top_idx = avg.argsort()[::-1][:num_keywords]
        cluster_topics[cid] = [terms[i] for i in top_idx]
    return cluster_topics

cluster_topics = extract_cluster_topics(df, num_keywords=10)
print("\nSample cluster topics (first 5 clusters):")
for i in range(5):
    print(f"Cluster {i}: {', '.join(cluster_topics.get(i,[]))}")

In [None]:

# 7) Build FAISS index for global semantic search (cosine via normalized inner product)
import faiss

# astype() hands back a fresh array, so the in-place L2 normalisation below
# does not touch `embeddings` itself.
embed_matrix = embeddings.astype('float32')
d = embed_matrix.shape[1]
faiss.normalize_L2(embed_matrix)   # unit-length rows: inner product == cosine similarity

# Exact (flat) inner-product index over every document vector.
index = faiss.IndexFlatIP(d)
index.add(embed_matrix)
print("\nFAISS index created with {} vectors (dim={}).".format(index.ntotal, d))

In [None]:
# 8) Train a small MLP classifier (PyTorch) on embeddings -> cluster labels (for confidence)
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

# Features are the float32 sentence embeddings; targets are the KMeans cluster ids.
X = embeddings.astype(np.float32)
y = df['cluster'].values

# Hold out 12% of the rows for validation, keeping cluster proportions equal
# in both parts (stratified split, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.12,
    random_state=42,
    stratify=y,
)

# Train on the GPU when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

X_train_t = torch.tensor(X_train, device=device)
X_val_t = torch.tensor(X_val, device=device)

# Labels are created as LONG tensors (the training loop feeds them to CrossEntropyLoss).
y_train_t = torch.tensor(y_train, dtype=torch.long, device=device)
y_val_t = torch.tensor(y_val, dtype=torch.long, device=device)

class MLPClassifier(nn.Module):
    """Three-layer feed-forward classifier that outputs raw class logits.

    Layout: Linear(input_dim -> hidden_dim) -> ReLU -> Dropout ->
    Linear(hidden_dim -> hidden_dim // 2) -> ReLU -> Dropout ->
    Linear(hidden_dim // 2 -> num_classes).

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: width of the first hidden layer; the second is half as wide.
        num_classes: number of output logits.
        dropout: dropout probability applied after each hidden activation.
    """

    def __init__(self, input_dim, hidden_dim, num_classes, dropout=0.2):
        super().__init__()
        narrow_dim = hidden_dim // 2
        # Layers are created in forward order and stored under `net`, so the
        # state-dict keys are net.0.*, net.3.* and net.6.*.
        stack = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, narrow_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(narrow_dim, num_classes),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return unnormalised logits for a batch of inputs ``x``."""
        logits = self.net(x)
        return logits

input_dim = X.shape[1]
# Size the output layer from the largest label so every cluster id is a valid
# class index even if some id were absent from `y`.
num_classes = int(y.max()) + 1
hidden_dim = 512

# Seed torch so weight initialisation, batch shuffling and dropout are
# repeatable from one run of the notebook to the next.
torch.manual_seed(42)

mlp = MLPClassifier(input_dim, hidden_dim, num_classes).to(device)
optimizer = optim.Adam(mlp.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
epochs = 12
batch_size = 128

print("\nTraining MLP classifier (fixed version)...")

n = X_train_t.shape[0]

for epoch in range(epochs):
    # Dropout is active only while fitting; it is switched off for validation below.
    mlp.train()
    # Fresh shuffle every epoch, generated on the same device as the data.
    perm = torch.randperm(n, device=device)
    epoch_loss = 0.0

    for i in range(0, n, batch_size):
        idx = perm[i:i+batch_size]
        xb = X_train_t[idx]
        yb = y_train_t[idx]

        optimizer.zero_grad()
        logits = mlp(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        # Weight by batch size so the epoch figure is a per-sample mean.
        epoch_loss += loss.item() * xb.size(0)

    mlp.eval()
    with torch.no_grad():
        val_logits = mlp(X_val_t)
        val_preds = val_logits.argmax(dim=1)
        val_acc = (val_preds == y_val_t).float().mean().item()

    print(f"Epoch {epoch+1}/{epochs}   Loss: {epoch_loss/n:.4f}   Val Acc: {val_acc:.4f}")

# The model is deliberately left in eval mode so later inference cells do not
# run with dropout enabled.
mlp.eval()

In [None]:

# 9) Optional: Train Doc2Vec (gensim) to add deep-learning doc similarity
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim
# Whitespace-tokenise every post and tag it with its row position in df.
token_lists = df['document'].str.split()
tagged = [
    TaggedDocument(words=tokens, tags=[row])
    for row, tokens in enumerate(token_lists)
]

# 100-dimensional vectors, words seen fewer than 2 times dropped, 30 passes, 4 workers.
doc2vec_model = Doc2Vec(vector_size=100, min_count=2, epochs=30, workers=4)
doc2vec_model.build_vocab(tagged)
doc2vec_model.train(
    tagged,
    total_examples=doc2vec_model.corpus_count,
    epochs=doc2vec_model.epochs,
)
print("Doc2Vec trained.")

In [None]:

# 10) Helper search functions
from sklearn.metrics.pairwise import cosine_similarity

# FAISS search
def faiss_search(query, top_k=5):
    """Look up the ``top_k`` nearest documents to ``query`` in the FAISS index.

    The query is embedded with the notebook-level ``model``, cast to float32
    and L2-normalised in place before being passed to ``index.search``.

    Returns:
        (scores, ids): the first row of the score matrix and of the id matrix
        returned by ``index.search``.
    """
    query_vec = model.encode([query], convert_to_numpy=True)
    query_vec = query_vec.astype('float32')
    faiss.normalize_L2(query_vec)
    hit_scores, hit_ids = index.search(query_vec, top_k)
    return hit_scores[0], hit_ids[0]

# NN + similarity in-cluster search (confidence + cosine)
prob_threshold = 0.55  # tuneable

def nn_cluster_search(query, top_k=5, prob_threshold=prob_threshold):
    """Classify ``query`` into a cluster with the MLP, then rank that cluster's documents.

    Parameters
    ----------
    query : str
        Free text to search for.
    top_k : int, default 5
        Maximum number of documents to return.
    prob_threshold : float
        Minimum softmax probability of the winning cluster; below it the query
        is treated as having no match.

    Returns
    -------
    tuple
        ``(pred_cluster, top_prob, docs_scores)`` where ``docs_scores`` is a
        list of ``(document_text, cosine_similarity)`` pairs ordered from most
        to least similar. When the MLP is not confident enough, or the
        predicted cluster has no documents, the tuple is
        ``("No similar text found", top_prob, None)``.
    """
    q_emb = model.encode([query], convert_to_numpy=True).astype(np.float32)
    q_tensor = torch.tensor(q_emb).to(device)
    mlp.eval()  # make sure dropout is off for inference
    with torch.no_grad():
        probs = torch.softmax(mlp(q_tensor), dim=1).cpu().numpy()[0]
    pred_cluster = int(probs.argmax())
    top_prob = float(probs[pred_cluster])
    if top_prob < prob_threshold:
        return "No similar text found", top_prob, None

    # Row POSITIONS of the cluster members; positions (not index labels) are
    # what line up with the rows of the embedding matrix X.
    cluster_rows = np.flatnonzero(df['cluster'].to_numpy() == pred_cluster)
    if cluster_rows.size == 0:
        return "No similar text found", top_prob, None

    sims = cosine_similarity(q_emb, X[cluster_rows])[0]
    top_local = sims.argsort()[::-1][:top_k]
    # `j` indexes into the cluster-local arrays, so each document is paired
    # with its own similarity (not with the score at its rank position).
    docs_scores = [
        (df['document'].iloc[cluster_rows[j]], float(sims[j]))
        for j in top_local
    ]
    return pred_cluster, top_prob, docs_scores

# Doc2Vec search
def doc2vec_search(query, top_k=5):
    """Find the ``top_k`` documents whose Doc2Vec vectors are closest to ``query``.

    The query is split on whitespace, a vector is inferred for it with the
    notebook-level ``doc2vec_model``, and the most similar document tags are
    looked up in ``df``.

    Returns:
        list of ``(document_text, similarity)`` tuples in the order returned
        by ``most_similar``.
    """
    tokens = query.split()
    inferred = doc2vec_model.infer_vector(tokens)
    neighbours = doc2vec_model.dv.most_similar([inferred], topn=top_k)

    results = []
    for doc_id, score in neighbours:
        text = df.loc[int(doc_id), 'document']
        results.append((text, float(score)))
    return results

In [None]:
# 11) Interactive demo loop: choose search mode
# Banner listing the three retrieval back-ends the prompt accepts.
print("\nInteractive demo: Choose search mode:")
print(" - 'nn_similarity' : MLP confidence + in-cluster cosine similarity (recommended)")
print(" - 'faiss'         : Global FAISS semantic search (fast & global)")
print(" - 'doc2vec'       : Doc2Vec-based similarity")
print("Type 'exit' to quit.\n")


def _print_faiss_matches(query):
    """Print the five best global FAISS hits with a 400-character snippet each."""
    scores, idxs = faiss_search(query, top_k=5)
    print("\nTop FAISS matches (score, snippet):")
    for s, i in zip(scores, idxs):
        print(f"\nScore: {s:.4f}\n{df.loc[i,'document'][:400]}...\n")


def _print_cluster_matches(query):
    """Print the MLP-predicted cluster, its keywords and its closest documents."""
    cluster_or_msg, conf, docs_scores = nn_cluster_search(query, top_k=5)
    # The helper signals "no match" by returning this sentinel string.
    if cluster_or_msg == "No similar text found":
        print(f"\nNo similar text found (NN confidence={conf:.3f})\n")
        return
    print(f"\nPredicted cluster: {cluster_or_msg}  (NN confidence={conf:.3f})")
    if cluster_or_msg in cluster_topics:
        print("Cluster topics:", ", ".join(cluster_topics[cluster_or_msg]))
    print("\nTop documents in cluster (score, snippet):")
    for doc, sc in docs_scores:
        print(f"\nScore: {sc:.4f}\n{doc[:400]}...\n")


def _print_doc2vec_matches(query):
    """Print the five closest documents according to Doc2Vec."""
    res = doc2vec_search(query, top_k=5)
    print("\nDoc2Vec matches (score, snippet):")
    for doc, sc in res:
        print(f"\nScore: {sc:.4f}\n{doc[:400]}...\n")


# Mode name -> function that runs the search and prints its results.
demo_printers = {
    'nn_similarity': _print_cluster_matches,
    'faiss': _print_faiss_matches,
    'doc2vec': _print_doc2vec_matches,
}

# Keep prompting until the user types 'exit' as either the mode or the query.
while True:
    mode = input("Choose mode (nn_similarity/faiss/doc2vec) or 'exit': ").strip().lower()
    if mode == 'exit':
        break
    if mode not in demo_printers:
        print("Invalid mode. Try again.")
        continue
    query = input("\nEnter document text (paste full paragraph recommended):\n")
    if query.strip().lower() == 'exit':
        break
    demo_printers[mode](query)
    print("="*80 + "\n")

# Optionally save trained MLP and Doc2Vec
torch.save(mlp.state_dict(), "mlp_cluster_classifier.pt")
doc2vec_model.save("doc2vec_model.model")
print("Models saved: mlp_cluster_classifier.pt , doc2vec_model.model")

In [None]:
# 11) Interactive demo loop: choose search mode
# Variant of the demo that also reports the cluster of every FAISS / Doc2Vec hit.
print("\nInteractive demo: Choose search mode:")
print(" - 'nn_similarity' : MLP confidence + in-cluster cosine similarity (recommended)")
print(" - 'faiss'         : Global FAISS semantic search (fast & global)")
print(" - 'doc2vec'       : Doc2Vec-based similarity")
print("Type 'exit' to quit.\n")

# doc2vec_search returns document TEXT only, so the cluster of a hit has to be
# recovered from the text. Build the text -> cluster lookup once (keeping the
# first occurrence of duplicated texts) instead of comparing the whole
# 'document' column against every single hit.
cluster_by_text = (
    df.drop_duplicates('document')
      .set_index('document')['cluster']
      .to_dict()
)

while True:
    mode = input("Choose mode (nn_similarity/faiss/doc2vec) or 'exit': ").strip().lower()
    if mode == 'exit':
        break
    if mode not in {'nn_similarity','faiss','doc2vec'}:
        print("Invalid mode. Try again.")
        continue

    query = input("\nEnter document text (paste full paragraph recommended):\n")
    if query.strip().lower() == 'exit':
        break

    # ---------------------------
    # 1) FAISS MODE
    # ---------------------------
    if mode == 'faiss':
        scores, idxs = faiss_search(query, top_k=5)
        print("\nTop FAISS matches (score, cluster, snippet):")
        for s, i in zip(scores, idxs):
            cluster_id = df.loc[i, 'cluster']
            snippet = df.loc[i, 'document'][:400]
            print(f"\nScore: {s:.4f} | Cluster: {cluster_id}\n{snippet}...\n")

    # ---------------------------
    # 2) NEURAL-NET + COSINE
    # ---------------------------
    elif mode == 'nn_similarity':
        cluster_or_msg, conf, docs_scores = nn_cluster_search(query, top_k=5)

        # nn_cluster_search returns this sentinel string when it has no match.
        if cluster_or_msg == "No similar text found":
            print(f"\nNo similar text found (NN confidence={conf:.3f})\n")
        else:
            print(f"\nPredicted cluster: {cluster_or_msg}  (NN confidence={conf:.3f})")

            if cluster_or_msg in cluster_topics:
                print("Cluster topics:", ", ".join(cluster_topics[cluster_or_msg]))

            print("\nTop documents in this cluster (score, snippet):")
            for doc, sc in docs_scores:
                print(f"\nScore: {sc:.4f}\n{doc[:400]}...\n")

    # ---------------------------
    # 3) DOC2VEC MODE
    # ---------------------------
    else:  # doc2vec
        res = doc2vec_search(query, top_k=5)
        print("\nDoc2Vec matches (score, cluster, snippet):")
        for doc, sc in res:
            # Constant-time lookup of the hit's cluster
            cluster_id = cluster_by_text[doc]
            print(f"\nScore: {sc:.4f} | Cluster: {cluster_id}\n{doc[:400]}...\n")

    print("="*80 + "\n")

# save models
torch.save(mlp.state_dict(), "mlp_cluster_classifier.pt")
doc2vec_model.save("doc2vec_model.model")
print("Models saved: mlp_cluster_classifier.pt , doc2vec_model.model")

In [None]:
# 11) Interactive demo loop: choose search mode
# Variant of the demo that first assigns the query to a KMeans cluster and
# prints that cluster's keywords before running the selected search.
print("\nInteractive demo: Choose search mode:")
print(" - 'nn_similarity' : MLP confidence + in-cluster cosine similarity (recommended)")
print(" - 'faiss'         : Global FAISS semantic search (fast & global)")
print(" - 'doc2vec'       : Doc2Vec-based similarity")
print("Type 'exit' to quit.\n")

while True:
    mode = input("Choose mode (nn_similarity/faiss/doc2vec) or 'exit': ").strip().lower()
    if mode == 'exit':
        break
    if mode not in {'nn_similarity', 'faiss', 'doc2vec'}:
        print("Invalid mode. Try again.")
        continue

    query = input("\nEnter document text (paste full paragraph recommended):\n")
    if query.strip().lower() == 'exit':
        break

    # Encode query
    query_emb = model.encode([query], convert_to_numpy=True)

    # Predict main cluster of the query (nearest KMeans centroid)
    predicted_cluster = kmeans.predict(query_emb)[0]

    # Print main cluster label
    print(f"\nPredicted Cluster: {predicted_cluster}")
    if predicted_cluster in cluster_topics:
        print("Cluster Topics:", ", ".join(cluster_topics[predicted_cluster]))

    # ---------------------------
    # 1) FAISS MODE (Fixed)
    # ---------------------------
    if mode == 'faiss':
        scores, idxs = faiss_search(query, top_k=5)

        print("\nTop FAISS matches (score, snippet):")
        for s, i in zip(scores, idxs):
            snippet = df.loc[i, 'document'][:400]
            print(f"\nScore: {s:.4f}\n{snippet}...\n")

    # ---------------------------
    # 2) NEURAL-NET + COSINE (Fixed)
    # ---------------------------
    elif mode == 'nn_similarity':

        # Probability the MLP gives to the KMeans-predicted cluster. The
        # network outputs logits, so softmax is required to get a probability;
        # eval()/no_grad() switch dropout and gradient tracking off, and the
        # tensor is moved to the MLP's device (and back to the CPU for NumPy).
        x_tensor = torch.tensor(query_emb, dtype=torch.float32).to(device)
        mlp.eval()
        with torch.no_grad():
            probs = torch.softmax(mlp(x_tensor), dim=1).cpu().numpy()
        nn_conf = probs[0][predicted_cluster]

        # Row positions of the documents in that cluster; positions line up
        # with the rows of `embeddings` (df has no per-row embedding column).
        cluster_rows = np.flatnonzero(df['cluster'].to_numpy() == predicted_cluster)

        print(f"\nNeural Network Confidence: {nn_conf:.4f}")
        print("\nTop documents in this cluster:")

        if cluster_rows.size > 0:
            # Compute cosine similarity against the cluster's embeddings only
            sim_scores = cosine_similarity(query_emb, embeddings[cluster_rows])[0]

            # Top similar docs within cluster
            top_idx = sim_scores.argsort()[::-1][:5]

            for idx in top_idx:
                doc_text = df['document'].iloc[cluster_rows[idx]][:400]
                score = sim_scores[idx]
                print(f"\nScore: {score:.4f}\n{doc_text}...\n")

    # ---------------------------
    # 3) DOC2VEC MODE (Fixed)
    # ---------------------------
    else:
        results = doc2vec_search(query, top_k=5)
        print("\nDoc2Vec matches (score, snippet):")
        for doc, sc in results:
            print(f"\nScore: {sc:.4f}\n{doc[:400]}...\n")

    print("="*80 + "\n")

# Save models
torch.save(mlp.state_dict(), "mlp_cluster_classifier.pt")
doc2vec_model.save("doc2vec_model.model")
print("Models saved: mlp_cluster_classifier.pt , doc2vec_model.model")
