# BERTopic Topic Modeling with Different Transformers
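This notebook fits a BERTopic model with each of several encoder (embedding) models and compares the results by topic coherence, topic diversity, and number of topics.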

In [None]:
import os
import torch
from bertopic import BERTopic
from bertopic.evaluation import coherence_score, diversity_score
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import pandas as pd

class HFTransformerEmbedder:
    """Sentence embedder built on a plain Hugging Face ``AutoModel``.

    Texts are tokenized in batches, run through the model without gradient
    tracking, and reduced to one vector per text by averaging the final
    hidden states over the real (non-padding) tokens.
    """

    def __init__(self, model_name, device='cuda' if torch.cuda.is_available() else 'cpu'):
        """Load the tokenizer and model for *model_name* and move the model to *device*.

        Args:
            model_name: Identifier passed to ``from_pretrained``.
            device: Torch device string; defaults to CUDA when it is available.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # ``encode`` pads every batch to its longest text; tokenizers that ship
        # without a pad token would raise there, so fall back to the EOS token.
        if self.tokenizer.pad_token is None and self.tokenizer.eos_token is not None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = AutoModel.from_pretrained(model_name).to(device)
        self.model.eval()  # inference only: switch off dropout and similar layers
        self.device = device

    def encode(self, texts, batch_size=16):
        """Embed *texts* and return one row per text.

        Args:
            texts: A single string or an iterable of strings.
            batch_size: Number of texts tokenized and encoded per forward pass.

        Returns:
            A 2-D ``numpy.ndarray`` of float32 with ``len(texts)`` rows; each
            row is the attention-mask-weighted mean of the last hidden states.
        """
        texts = [texts] if isinstance(texts, str) else list(texts)
        if not texts:
            # Nothing to encode: keep the 2-D contract with zero rows.
            config = getattr(self.model, "config", None)
            return torch.empty((0, getattr(config, "hidden_size", 0))).numpy()

        chunks = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = self.tokenizer(batch, return_tensors='pt', padding=True, truncation=True).to(self.device)
            with torch.no_grad():
                # Upcast so half-precision models pool accurately and can be
                # converted to numpy afterwards.
                hidden = self.model(**inputs).last_hidden_state.float()
            # Zero out padding positions before averaging; a plain mean over
            # the sequence axis would make the embedding of a short text depend
            # on how long the other texts in its batch are.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled = (hidden * mask).sum(dim=1) / token_counts
            chunks.append(pooled.cpu())
        return torch.cat(chunks).numpy()

def run_pipeline(texts, encoders):
    """Fit one BERTopic model per encoder and tabulate its quality metrics.

    For every encoder the documents are embedded up front and the vectors are
    handed to ``BERTopic.fit_transform`` explicitly, so the topics are built
    from the encoder named in the result row.  Each fitted model is saved
    under ``models/``.

    Args:
        texts: Iterable of document strings.
        encoders: Iterable of dicts with a ``"name"`` (model identifier) and a
            ``"type"`` of ``"sentence-transformer"`` or ``"huggingface"``.
            Entries with any other type are reported and skipped.

    Returns:
        ``pandas.DataFrame`` with columns ``encoder``, ``type``, ``coherence``,
        ``diversity`` and ``num_topics`` (one row per encoder that ran; the
        columns are present even when no encoder ran).

    NOTE(review): ``coherence_score`` and ``diversity_score`` are imported
    from ``bertopic.evaluation`` at the top of the file; confirm that module
    exists in the installed BERTopic release.
    """
    import numpy as np  # function-scope: numpy is not among the file's top-level imports

    columns = ["encoder", "type", "coherence", "diversity", "num_topics"]
    docs = list(texts)

    results = []
    for enc in encoders:
        name, enc_type = enc['name'], enc['type']
        print(f"\nRunning BERTopic with encoder: {name}")
        if enc_type == "sentence-transformer":
            embedder = SentenceTransformer(name)
            backend = embedder
        elif enc_type == "huggingface":
            embedder = HFTransformerEmbedder(name)
            # A bare ``encode`` callable is not an embedding backend BERTopic
            # recognises, so no backend is registered; the precomputed
            # embeddings below carry the encoder's output instead.
            # Consequence: the saved model cannot embed new documents itself.
            backend = None
        else:
            print(f"Skipping {name}: unsupported encoder type {enc_type!r}")
            continue

        embeddings = np.asarray(embedder.encode(docs))
        topic_model = BERTopic(embedding_model=backend, verbose=True)
        topics, _ = topic_model.fit_transform(docs, embeddings=embeddings)

        coherence = coherence_score(topic_model, docs, 'c_v')
        diversity = diversity_score(topic_model)

        results.append({
            "encoder": name,
            "type": enc_type,
            "coherence": coherence,
            "diversity": diversity,
            # Topic -1 is BERTopic's outlier bucket, not a real topic.
            "num_topics": len(set(topics) - {-1}),
        })

        # Do not rely on the caller having created the output directory.
        os.makedirs("models", exist_ok=True)
        topic_model.save(f"models/bertopic_{name.replace('/', '_')}")
    return pd.DataFrame(results, columns=columns)


In [None]:
# Toy in-memory dataset: four short documents, each on a different subject.
# NOTE(review): a corpus this small is presumably a placeholder; BERTopic's
# default dimensionality-reduction and clustering settings are intended for
# far more documents, so confirm this runs or swap in a real corpus.
texts = [
    "Climate change is real.",
    "Artificial intelligence is transforming healthcare.",
    "Elections are coming soon.",
    "New technology in robotics."
]

In [None]:
# Encoders to compare. Each entry gives the model identifier ("name") and the
# loader run_pipeline should use for it ("type"): "sentence-transformer" or
# "huggingface".
# NOTE(review): some Hub models require accepting a licence and an access
# token before they can be downloaded; verify access for "google/gemma-2b".
encoders = [
    {"name": "all-MiniLM-L6-v2", "type": "sentence-transformer"},
    {"name": "google/gemma-2b", "type": "huggingface"}
]

In [None]:
import os

# run_pipeline saves each fitted model under "models/", so make sure the
# directory exists before the first save.
os.makedirs(name="models", exist_ok=True)

# Fit every configured encoder, write the metrics table to disk, and leave the
# DataFrame as the cell's final expression so the notebook renders it.
df_results = run_pipeline(texts=texts, encoders=encoders)
df_results.to_csv(path_or_buf="results.csv", index=False)
df_results