# BERTopic Training with Pre-computed Embeddings

This notebook loads pre-computed sentence embeddings, trains one BERTopic model for every subject / embedding-model combination, and evaluates the topic coherence (c_v) of each trained model.

In [9]:
import gc
import os
import warnings
from pathlib import Path
from typing import List, Optional, Tuple

import pandas as pd
import numpy as np
from tqdm import tqdm
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel

# Never truncate long cell values when a DataFrame is displayed.
pd.options.display.max_colwidth = None
# Ignore every SyntaxWarning raised from this point on.
warnings.simplefilter("ignore", category=SyntaxWarning)

In [10]:
# Version tag embedded in dataset, embedding, model and result file names.
VERSION = "v1"
LIST_SUBJECT = ["cs", "math", "physics"]

# Embedding width of each model; used as the second dimension when the stored
# vectors are memory-mapped. Insertion order is the processing order.
EMBEDDING_DIMS = {
    "sentence-transformers/all-MiniLM-L6-v2": 384,
    "all-distilroberta-v1": 768,
    "intfloat/e5-base-v2": 768,
    "all-mpnet-base-v2": 768,
    "BAAI/bge-base-en-v1.5": 768,
    "allenai/specter2": 768,
}

# Model names to iterate over, in the same order as EMBEDDING_DIMS.
TRANSFORMERS = list(EMBEDDING_DIMS)

BASE_DIR = Path("../../dataset")      # per-subject input CSVs
EMBEDDING_DIR = Path("./embedding")   # pre-computed .mmap embedding files
RESULT_DIR = Path("./transformer")    # saved models and coherence results

# Make sure the output directory exists before anything is written to it.
os.makedirs(RESULT_DIR, exist_ok=True)

In [3]:
def get_model_safe_name(model_name: str) -> str:
    """Return ``model_name`` with every ``/`` and ``-`` replaced by ``_``.

    The result is used as the stem of embedding and saved-model file names.
    All other characters (including ``.``) are left unchanged.
    """
    return model_name.translate(str.maketrans("/-", "__"))


def load_dataset(subject: str) -> Optional[pd.DataFrame]:
    """Read the CSV stored at ``BASE_DIR/<subject>/emb/<VERSION>.csv``.

    Args:
        subject: Name of the subject sub-directory under ``BASE_DIR``.

    Returns:
        The parsed DataFrame, or ``None`` (after printing a notice) when the
        file does not exist. Callers must check for ``None``.
    """
    file_path = BASE_DIR / subject / "emb" / f"{VERSION}.csv"
    if not file_path.exists():
        print(f"File not found: {file_path}")
        return None
    return pd.read_csv(file_path)


def load_mmap_embeddings(
    mmap_path: str,
    num_documents: int,
    embedding_dim: int,
    dtype: str = "float32"
) -> Optional[np.memmap]:
    """Memory-map a raw embedding file as a read-only 2-D array.

    Args:
        mmap_path: Path of the raw binary file holding the embeddings.
        num_documents: Expected number of rows (one per document).
        embedding_dim: Expected number of columns (embedding width).
        dtype: Element type the file was written with.

    Returns:
        A read-only ``np.memmap`` of shape ``(num_documents, embedding_dim)``,
        or ``None`` (after printing the reason) when the file is missing, its
        size does not match the expected shape, or mapping fails.
    """
    try:
        # np.memmap with an explicit shape only maps the bytes it needs, so a
        # file that is larger than expected would be accepted silently and the
        # rows would no longer line up with the documents. Check the size first.
        expected_bytes = num_documents * embedding_dim * np.dtype(dtype).itemsize
        actual_bytes = os.path.getsize(mmap_path)
        if actual_bytes != expected_bytes:
            print(
                f"Embedding size mismatch: {mmap_path} has {actual_bytes:,} bytes, "
                f"expected {expected_bytes:,} for shape "
                f"({num_documents}, {embedding_dim}) of {dtype}"
            )
            return None

        return np.memmap(
            mmap_path,
            dtype=dtype,
            mode="r",
            shape=(num_documents, embedding_dim)
        )
    except FileNotFoundError:
        print(f"Embedding not found: {mmap_path}")
        return None
    except Exception as e:
        # Best effort: report the problem and let the caller skip this model.
        print(f"Error loading embeddings: {e}")
        return None


def train_bertopic_model(
    documents: List[str],
    embeddings: np.ndarray,
    n_neighbors: int = 15,
    n_components: int = 5,
    min_dist: float = 0.0,
    min_cluster_size: int = 150,
    min_samples: int = 10,
    random_state: int = 42
) -> Tuple[BERTopic, List[int], Optional[np.ndarray]]:
    """Fit a BERTopic model on ``documents`` using pre-computed ``embeddings``.

    A UMAP reducer (cosine metric) and an HDBSCAN clusterer (euclidean metric,
    ``"eom"`` cluster selection) are built from the keyword arguments and
    handed to BERTopic, which is then fitted with the supplied embeddings.

    Args:
        documents: Raw texts, one per embedding row.
        embeddings: Pre-computed document embeddings passed to ``fit_transform``.
        n_neighbors: UMAP ``n_neighbors``.
        n_components: UMAP ``n_components``.
        min_dist: UMAP ``min_dist``.
        min_cluster_size: HDBSCAN ``min_cluster_size``.
        min_samples: HDBSCAN ``min_samples``.
        random_state: Seed given to UMAP.

    Returns:
        ``(model, topics, probs)``: the fitted BERTopic instance followed by
        the two values returned by ``BERTopic.fit_transform``.
    """
    reducer_options = dict(
        n_neighbors=n_neighbors,
        n_components=n_components,
        min_dist=min_dist,
        metric="cosine",
        random_state=random_state,
        verbose=False,
    )
    clusterer_options = dict(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    )

    fitted_model = BERTopic(
        umap_model=UMAP(**reducer_options),
        hdbscan_model=HDBSCAN(**clusterer_options),
        calculate_probabilities=False,
        verbose=False,
    )

    assignments, probabilities = fitted_model.fit_transform(documents, embeddings=embeddings)
    return fitted_model, assignments, probabilities


def tokenize_for_coherence(text: str) -> List[str]:
    """Tokenize one document for coherence scoring.

    Args:
        text: Document text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as an empty document.

    Returns:
        The tokens produced by gensim's ``simple_preprocess`` with accents
        removed (``deacc=True``); an empty list for missing input.
    """
    # Without this guard a missing value is stringified to "None"/"nan" and
    # ends up in the corpus as if it were a real word.
    if text is None or (isinstance(text, float) and text != text):
        return []
    return simple_preprocess(str(text), deacc=True)


def calculate_coherence(
    topic_model: BERTopic,
    texts_tokenized: List[List[str]],
    dictionary: Dictionary,
    top_n: int = 10
) -> float:
    """Compute the c_v coherence of a fitted BERTopic model.

    Args:
        topic_model: Fitted model whose topics are scored.
        texts_tokenized: Tokenized reference corpus, one token list per document.
        dictionary: gensim dictionary built from ``texts_tokenized``.
        top_n: Number of leading words taken from each topic.

    Returns:
        The coherence reported by gensim's ``CoherenceModel``, or ``0.0`` when
        no topic (other than the outlier topic ``-1``) has a usable word.
    """
    known_tokens = dictionary.token2id
    topics_list = []

    for topic_id in topic_model.get_topics():
        if topic_id == -1:
            # -1 is skipped: it is the outlier bucket, not a real topic.
            continue

        top_words = [word for word, _ in topic_model.get_topic(topic_id)[:top_n]]
        # Keep only non-empty words that exist in the dictionary. Topic words
        # come from BERTopic's own tokenization, so they are not guaranteed to
        # appear in a dictionary built with simple_preprocess; gensim cannot
        # score such words and may raise on them depending on its version.
        usable_words = [word for word in top_words if word and word in known_tokens]
        if usable_words:
            topics_list.append(usable_words)

    if not topics_list:
        return 0.0

    cm = CoherenceModel(
        topics=topics_list,
        texts=texts_tokenized,
        dictionary=dictionary,
        coherence='c_v',
        processes=1
    )

    return cm.get_coherence()

In [4]:
# Per-subject caches shared by the cells below, all keyed by subject name.
all_data = {}              # subject -> DataFrame
all_texts_tokenized = {}   # subject -> list of token lists
all_dictionaries = {}      # subject -> gensim Dictionary

for subject in LIST_SUBJECT:
    df = load_dataset(subject)
    if df is None:
        # load_dataset has already printed which file is missing.
        continue

    all_data[subject] = df
    print(f"{subject}: {len(df):,} documents loaded")

    print("  Tokenizing for coherence...")
    # Blank out missing texts the same way the training cell does
    # (df["text"].fillna("")), so a NaN row is not tokenized as the word "nan"
    # and the coherence corpus matches the documents the model is trained on.
    subject_texts = df['text'].fillna("")
    texts_tokenized = [
        tokenize_for_coherence(text)
        for text in tqdm(subject_texts, desc=f"  {subject}")
    ]
    all_texts_tokenized[subject] = texts_tokenized
    all_dictionaries[subject] = Dictionary(texts_tokenized)

print(f"\nTotal subjects loaded: {len(all_data)}")

math: 157,085 documents loaded
  Tokenizing for coherence...


  math: 100%|██████████| 157085/157085 [00:17<00:00, 8975.04it/s]



Total subjects loaded: 1


In [5]:
# One dict per (subject, model) attempt; turned into a DataFrame afterwards.
results = []


def _count_topics(model: BERTopic) -> int:
    """Count the topics of ``model``, ignoring the outlier topic ``-1``.

    Counting explicitly (instead of subtracting one from the topic table
    length) stays correct when the model has no outlier topic.
    """
    return sum(1 for topic_id in model.get_topics() if topic_id != -1)


def _result_row(subject, model_name, n_topics=None, coherence=None, error=None) -> dict:
    """Build one results record; the ``error`` key is present only on failure."""
    row = {
        "subject": subject,
        "model": model_name,
        "n_topics": n_topics,
        "coherence": coherence
    }
    if error is not None:
        row["error"] = error
    return row


for subject, df in all_data.items():
    print(f"\n{'='*70}")
    print(f"Subject: {subject.upper()} ({len(df):,} documents)")
    print(f"{'='*70}")

    texts = df["text"].fillna("").tolist()
    texts_tokenized = all_texts_tokenized[subject]
    dictionary = all_dictionaries[subject]

    subject_result_dir = RESULT_DIR / subject
    subject_result_dir.mkdir(parents=True, exist_ok=True)

    for model_name in TRANSFORMERS:
        model_safe_name = get_model_safe_name(model_name)

        embedding_path = str(EMBEDDING_DIR / subject / f"{model_safe_name}_{VERSION}.mmap")
        model_save_path = str(subject_result_dir / f"{model_safe_name}_{VERSION}")
        print(f"\n[{model_name}]")

        # Reuse a previously saved model when possible; on any failure fall
        # through and retrain from the embeddings.
        if os.path.exists(model_save_path):
            print("  Loading existing model...")
            try:
                topic_model = BERTopic.load(model_save_path)
                n_topics = _count_topics(topic_model)
                coherence = calculate_coherence(topic_model, texts_tokenized, dictionary)
                print(f"  Topics: {n_topics} | Coherence: {coherence:.4f}")
                results.append(_result_row(subject, model_name, n_topics, coherence))
                continue
            except Exception as e:
                print(f"  Failed to load, retraining: {e}")

        emb_dim = EMBEDDING_DIMS[model_name]
        embeddings = load_mmap_embeddings(embedding_path, len(texts), emb_dim)

        if embeddings is None:
            print("  Skipping (no embeddings)")
            continue

        print("  Training BERTopic...")
        topic_model = None
        try:
            topic_model, topics, probs = train_bertopic_model(texts, embeddings)
            n_topics = _count_topics(topic_model)
            print(f"  Topics found: {n_topics}")

            # Save before scoring so a failure in the coherence step does not
            # throw away the trained model.
            topic_model.save(model_save_path)
            print(f"  ✓ Saved to {model_save_path}")

            print("  Calculating coherence...")
            coherence = calculate_coherence(topic_model, texts_tokenized, dictionary)
            print(f"  Coherence (c_v): {coherence:.4f}")

            results.append(_result_row(subject, model_name, n_topics, coherence))

        except Exception as e:
            print(f"  ✗ Error: {e}")
            results.append(_result_row(subject, model_name, error=str(e)))

        finally:
            # Drop the memmap and the model on success and on failure alike
            # before the next model is processed.
            del embeddings, topic_model
            gc.collect()

print("\n" + "="*70)
print("All training complete!")
print("="*70)


Subject: MATH (157,085 documents)

[sentence-transformers/all-MiniLM-L6-v2]
  Training BERTopic...
  Topics found: 166
  Calculating coherence...




  Coherence (c_v): 0.7221
  ✓ Saved to transformer/math/sentence_transformers_all_MiniLM_L6_v2_v1

[all-distilroberta-v1]
  Training BERTopic...
  Topics found: 167
  Calculating coherence...




  Coherence (c_v): 0.7115
  ✓ Saved to transformer/math/all_distilroberta_v1_v1

[intfloat/e5-base-v2]
  Training BERTopic...
  Topics found: 127
  Calculating coherence...




  Coherence (c_v): 0.7165
  ✓ Saved to transformer/math/intfloat_e5_base_v2_v1

[all-mpnet-base-v2]
  Training BERTopic...
  Topics found: 165
  Calculating coherence...




  Coherence (c_v): 0.7160
  ✓ Saved to transformer/math/all_mpnet_base_v2_v1

[BAAI/bge-base-en-v1.5]
  Training BERTopic...
  Topics found: 150
  Calculating coherence...




  Coherence (c_v): 0.7229
  ✓ Saved to transformer/math/BAAI_bge_base_en_v1.5_v1

[allenai/specter2]
  Training BERTopic...
  Topics found: 1
  Calculating coherence...




  Coherence (c_v): 0.4117
  ✓ Saved to transformer/math/allenai_specter2_v1

All training complete!


In [6]:
# Persist the raw per-run records first, then print a model x subject overview.
results_df = pd.DataFrame(results)
summary_csv = RESULT_DIR / f"coherence_results_{VERSION}.csv"
results_df.to_csv(summary_csv, index=False)

banner = "=" * 70
print("\n" + banner)
print("COHERENCE SCORE SUMMARY")
print(banner + "\n")

if len(results_df):
    # Rows are models, columns are subjects; 'mean' averages across subjects
    # and determines the (descending) row order.
    pivot = results_df.pivot(index='model', columns='subject', values='coherence')
    pivot = pivot.assign(mean=pivot.mean(axis=1)).sort_values('mean', ascending=False)
    print(pivot.round(4))

    print(f"\nResults saved to: {summary_csv}")


COHERENCE SCORE SUMMARY

subject                                   math    mean
model                                                 
BAAI/bge-base-en-v1.5                   0.7229  0.7229
sentence-transformers/all-MiniLM-L6-v2  0.7221  0.7221
intfloat/e5-base-v2                     0.7165  0.7165
all-mpnet-base-v2                       0.7160  0.7160
all-distilroberta-v1                    0.7115  0.7115
allenai/specter2                        0.4117  0.4117

Results saved to: transformer/coherence_results_v1.csv


In [11]:
# Number of best-scoring models listed per subject. The heading is built from
# this value so the title and the number of printed rows cannot drift apart.
TOP_N = 6

print(f"\nTop {TOP_N} Models per Subject:")
print("-" * 110)
results_df = pd.read_csv(RESULT_DIR / f"coherence_results_{VERSION}.csv")

for subject in LIST_SUBJECT:
    subject_results = results_df[results_df['subject'] == subject]
    # Skip subjects that have no rows or only failed runs (coherence is NaN).
    if len(subject_results) > 0 and subject_results['coherence'].notna().any():
        top_models = subject_results.nlargest(TOP_N, 'coherence')
        for rank, (_, row) in enumerate(top_models.iterrows(), 1):
            print(f"{subject.upper():10s} | #{rank} | {row['model']:45s} | {row['coherence']:.4f}")
        print("-" * 110)


Top 2 Models per Subject:
--------------------------------------------------------------------------------------------------------------
CS         | #1 | all-distilroberta-v1                          | 0.7322
CS         | #2 | intfloat/e5-base-v2                           | 0.7242
CS         | #3 | allenai/specter2                              | 0.7210
CS         | #4 | BAAI/bge-base-en-v1.5                         | 0.7198
CS         | #5 | sentence-transformers/all-MiniLM-L6-v2        | 0.7148
CS         | #6 | all-mpnet-base-v2                             | 0.7146
--------------------------------------------------------------------------------------------------------------
MATH       | #1 | BAAI/bge-base-en-v1.5                         | 0.7229
MATH       | #2 | sentence-transformers/all-MiniLM-L6-v2        | 0.7221
MATH       | #3 | intfloat/e5-base-v2                           | 0.7165
MATH       | #4 | all-mpnet-base-v2                             | 0.7160
MATH       | #5 | all

In [12]:
# Reload the saved results from the location defined in the configuration cell
# (same file the summary cell writes), rather than a hard-coded path.
results_df = pd.read_csv(RESULT_DIR / f"coherence_results_{VERSION}.csv")

def rank_models(df):
    """Rank models by their per-subject coherence.

    Within each subject the models are ranked by coherence in ascending order
    (ties share the lowest rank), so the most coherent model gets the highest
    score. A missing coherence value, or a (model, subject) pair with no row
    at all, scores 0 instead of raising.

    Args:
        df: Long-format results with ``subject``, ``model`` and ``coherence``
            columns and at most one row per (model, subject) pair.

    Returns:
        DataFrame indexed by model with ``<subject>_coherence`` and
        ``<subject>_score`` columns plus ``mean_coherence``, ``total_score``
        and ``overall_rank`` (1 = highest total score), sorted by
        ``overall_rank``.
    """
    scored = df.copy()

    # For each subject, rank coherence: N = highest, 1 = lowest (N = number of
    # models with a coherence value). NaN coherence ranks as NaN -> score 0.
    per_subject_rank = scored.groupby('subject')['coherence'].rank(method='min')
    scored['score'] = per_subject_rank.fillna(0).astype(int)

    # Pivot to show scores per subject; pairs absent from df also score 0.
    score_pivot = scored.pivot(index='model', columns='subject', values='score')
    score_pivot = score_pivot.fillna(0).astype(int)
    score_pivot.columns = [f"{col}_score" for col in score_pivot.columns]

    # Pivot coherence values too for reference
    coherence_pivot = scored.pivot(index='model', columns='subject', values='coherence')
    coherence_pivot.columns = [f"{col}_coherence" for col in coherence_pivot.columns]

    # Merge and calculate total
    ranking = pd.concat([coherence_pivot, score_pivot], axis=1)
    ranking['mean_coherence'] = coherence_pivot.mean(axis=1)
    ranking['total_score'] = score_pivot.sum(axis=1).astype(int)
    ranking['overall_rank'] = ranking['total_score'].rank(ascending=False, method='min').astype(int)
    ranking = ranking.sort_values('overall_rank')

    return ranking

ranking = rank_models(results_df)

# Display
# Only subjects that actually have result columns are shown, so a partial run
# (for example a single subject) prints a table instead of raising KeyError.
subjects = [s for s in LIST_SUBJECT if f"{s}_coherence" in ranking.columns]
n_models = len(ranking)


def _format_score(value) -> str:
    """Render a per-subject score; '-' when the model has no score there."""
    return "-" if pd.isna(value) else str(int(value))


print("=" * 115)
print("MODEL RANKING BY COHERENCE SCORE")
print("=" * 115)
print(f"\n{'Model':<45}", end="")
for s in subjects:
    print(f"| {s.upper():>8} (Score)", end="")
print(f"| {'MEAN':>7} | {'TOTAL':>6} | {'RANK':>4}")
print("-" * 115)

for model, row in ranking.iterrows():
    print(f"{model:<45}", end="")
    for s in subjects:
        print(f"| {row[f'{s}_coherence']:.4f}  ({_format_score(row[f'{s}_score'])})", end="  ")
    print(f"| {row['mean_coherence']:.4f}  |   {int(row['total_score']):>2}   |  #{int(row['overall_rank'])}")

print("=" * 115)
# The best possible score equals the number of ranked models.
print(f"\nScoring: {n_models} = best coherence per subject, 1 = worst")

MODEL RANKING BY COHERENCE SCORE

Model                                        |       CS (Score)|     MATH (Score)|  PHYSICS (Score)|    MEAN |  TOTAL | RANK
-------------------------------------------------------------------------------------------------------------------
all-distilroberta-v1                         | 0.7322  (6)  | 0.7115  (2)  | 0.7387  (6)  | 0.7274  |   14   |  #1
BAAI/bge-base-en-v1.5                        | 0.7198  (3)  | 0.7229  (6)  | 0.7331  (4)  | 0.7253  |   13   |  #2
sentence-transformers/all-MiniLM-L6-v2       | 0.7148  (2)  | 0.7221  (5)  | 0.7327  (3)  | 0.7232  |   10   |  #3
intfloat/e5-base-v2                          | 0.7242  (5)  | 0.7165  (4)  | 0.7220  (1)  | 0.7209  |   10   |  #3
all-mpnet-base-v2                            | 0.7146  (1)  | 0.7160  (3)  | 0.7336  (5)  | 0.7214  |    9   |  #5
allenai/specter2                             | 0.7210  (4)  | 0.4117  (1)  | 0.7253  (2)  | 0.6193  |    7   |  #6

Scoring: 6 = best coherence per su