# BERTopic Hyperparameter Tuning (Grid Search)

This notebook performs grid search over UMAP and HDBSCAN hyperparameters
using the best embedding model per subject:
- **CS & Physics**: all-distilroberta-v1
- **Math**: BAAI/bge-base-en-v1.5

The grid-search results (`coherence_results.csv`) and the retrained best model for each subject (`best_<subject>`) are saved under `tunning/hdbscan_<VERSION>/`.

In [1]:
import os
import gc
import itertools
import pandas as pd
import numpy as np
from pathlib import Path
from typing import List, Optional, Tuple
from tqdm import tqdm
import warnings

from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel

# Do not truncate long cell values when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)
# Ignore SyntaxWarning messages from here on.
# NOTE(review): this filter is registered after the third-party imports above, so a
# SyntaxWarning raised while those modules are first compiled would still be shown;
# confirm whether the filter should be installed before the imports instead.
warnings.filterwarnings("ignore", category=SyntaxWarning)

In [2]:
# --- Run configuration -------------------------------------------------------
VERSION = "v1"
LIST_SUBJECT = ["cs", "math", "physics"]

# Embedding model per subject: display name, file-system-safe name (used to
# build the embedding file name) and embedding dimensionality.
SUBJECT_MODEL = {
    "cs": {
        "name": "all-distilroberta-v1",
        "safe_name": "all_distilroberta_v1",
        "dim": 768,
    },
    "math": {
        "name": "BAAI/bge-base-en-v1.5",
        "safe_name": "BAAI_bge_base_en_v1.5",
        "dim": 768,
    },
    "physics": {
        "name": "all-distilroberta-v1",
        "safe_name": "all_distilroberta_v1",
        "dim": 768,
    },
}

# Search space: the first two keys configure HDBSCAN, the last two UMAP.
PARAM_GRID = {
    "min_cluster_size": [100, 150, 200],
    "min_samples": [5, 10, 20],
    "n_neighbors": [10, 15, 25],
    "n_components": [5, 10],
}

# --- Locations ---------------------------------------------------------------
BASE_DIR = Path("../../dataset")
EMBEDDING_DIR = Path("./embedding")
TUNING_DIR = Path(f"./tunning/hdbscan_{VERSION}")

# Make sure the output directory exists before anything is written to it.
TUNING_DIR.mkdir(parents=True, exist_ok=True)

# --- Summary -----------------------------------------------------------------
# Size of the Cartesian product of all value lists in the grid.
total_combos = len(list(itertools.product(*PARAM_GRID.values())))
total_trials = total_combos * len(LIST_SUBJECT)

print("Model per subject:")
for subject_key, model_cfg in SUBJECT_MODEL.items():
    print(f"  {subject_key}: {model_cfg['name']}")
print(f"\nGrid search: {total_combos} combinations per subject")
print(f"Total trials: {total_trials}")
print(f"Results will be saved to: {TUNING_DIR}")

Model per subject:
  cs: all-distilroberta-v1
  math: BAAI/bge-base-en-v1.5
  physics: all-distilroberta-v1

Grid search: 54 combinations per subject
Total trials: 162
Results will be saved to: tunning/hdbscan_v1


In [3]:
def load_dataset(subject: str) -> Optional[pd.DataFrame]:
    """Load the CSV dataset for one subject.

    The file is read from ``BASE_DIR / subject / "emb" / f"{VERSION}.csv"``.

    Args:
        subject: Subject key, used as the directory name under ``BASE_DIR``.

    Returns:
        The loaded DataFrame, or ``None`` (after printing a message) when the
        file does not exist. Callers must handle the ``None`` case.
    """
    file_path = BASE_DIR / subject / "emb" / f"{VERSION}.csv"
    if not file_path.exists():
        print(f"File not found: {file_path}")
        return None
    return pd.read_csv(file_path)


def load_mmap_embeddings(
    mmap_path: str,
    num_documents: int,
    embedding_dim: int,
    dtype: str = "float32"
) -> Optional[np.memmap]:
    """Open an embedding file as a read-only memory map.

    Args:
        mmap_path: Path of the raw embedding file.
        num_documents: Number of rows to map.
        embedding_dim: Number of columns to map.
        dtype: Element type of the stored values.

    Returns:
        A ``np.memmap`` of shape ``(num_documents, embedding_dim)``, or ``None``
        (after printing the error) if the file cannot be mapped.
    """
    expected_shape = (num_documents, embedding_dim)
    try:
        embeddings = np.memmap(mmap_path, dtype=dtype, mode="r", shape=expected_shape)
    except Exception as err:
        # Best-effort loader: report the problem and let the caller decide.
        print(f"Error loading embeddings: {err}")
        embeddings = None
    return embeddings


def tokenize_for_coherence(text: str) -> List[str]:
    """Tokenize one document for coherence scoring.

    The input is coerced to ``str`` and passed to gensim's ``simple_preprocess``
    with ``deacc=True``; the resulting tokens are returned as a new list.
    """
    tokens = simple_preprocess(str(text), deacc=True)
    return list(tokens)


def calculate_coherence(
    topic_model: BERTopic,
    texts_tokenized: List[List[str]],
    dictionary: Dictionary,
    top_n: int = 10
) -> float:
    """Compute the ``c_v`` coherence of a fitted topic model.

    Args:
        topic_model: Fitted model; topics are read through ``get_topics()``
            and ``get_topic()``.
        texts_tokenized: Tokenized corpus handed to ``CoherenceModel`` as ``texts``.
        dictionary: gensim dictionary handed to ``CoherenceModel``.
        top_n: Maximum number of leading words taken from each topic.

    Returns:
        The coherence score, or ``0.0`` when the model has no topic other
        than ``-1``.
    """
    # Topic -1 is left out of the score; every other topic contributes its
    # first ``top_n`` words.
    words_per_topic = [
        [word for word, _ in topic_model.get_topic(topic_id)[:top_n]]
        for topic_id in topic_model.get_topics().keys()
        if topic_id != -1
    ]

    if len(words_per_topic) == 0:
        return 0.0

    coherence_model = CoherenceModel(
        topics=words_per_topic,
        texts=texts_tokenized,
        dictionary=dictionary,
        coherence='c_v',
        processes=1,
    )
    return coherence_model.get_coherence()

In [4]:
# Per-subject inputs for the grid search, all keyed by subject name.
all_data = {}             # raw DataFrame
all_texts_tokenized = {}  # tokenized documents used for coherence scoring
all_dictionaries = {}     # gensim Dictionary built from the tokenized documents
all_embeddings = {}       # memory-mapped embeddings (only subjects that loaded)

for subject in LIST_SUBJECT:
    df = load_dataset(subject)
    if df is None:
        # load_dataset has already reported the missing file.
        continue

    all_data[subject] = df
    # Missing texts become empty strings so that every row keeps its position.
    texts = df["text"].fillna("").tolist()
    print(f"{subject}: {len(df):,} documents loaded")

    print("  Tokenizing for coherence...")
    texts_tokenized = [tokenize_for_coherence(t) for t in tqdm(texts, desc=f"  {subject}")]
    all_texts_tokenized[subject] = texts_tokenized
    all_dictionaries[subject] = Dictionary(texts_tokenized)

    # The embedding file is mapped with one row per document in the dataset.
    model_info = SUBJECT_MODEL[subject]
    emb_path = str(EMBEDDING_DIR / subject / f"{model_info['safe_name']}_{VERSION}.mmap")
    embeddings = load_mmap_embeddings(emb_path, len(texts), model_info['dim'])
    if embeddings is not None:
        all_embeddings[subject] = embeddings
        print(f"  Embeddings loaded: {embeddings.shape} (model: {model_info['name']})")
    else:
        print(f"  ⚠ Failed to load embeddings from {emb_path}")

print(f"\nSubjects ready: {list(all_data.keys())}")

# A subject can have data but no embeddings; make that visible here rather
# than only when the grid search silently skips it.
subjects_without_embeddings = [s for s in all_data if s not in all_embeddings]
if subjects_without_embeddings:
    print(f"Subjects without embeddings (grid search will skip them): {subjects_without_embeddings}")

cs: 165,756 documents loaded
  Tokenizing for coherence...


  cs: 100%|██████████| 165756/165756 [00:27<00:00, 6025.01it/s]


  Embeddings loaded: (165756, 768) (model: all-distilroberta-v1)
math: 157,085 documents loaded
  Tokenizing for coherence...


  math: 100%|██████████| 157085/157085 [00:18<00:00, 8678.59it/s]


  Embeddings loaded: (157085, 768) (model: BAAI/bge-base-en-v1.5)
physics: 146,311 documents loaded
  Tokenizing for coherence...


  physics: 100%|██████████| 146311/146311 [00:22<00:00, 6571.69it/s]


  Embeddings loaded: (146311, 768) (model: all-distilroberta-v1)

Subjects ready: ['cs', 'math', 'physics']


In [5]:
# Every combination of the values in PARAM_GRID, in the grid's key order.
param_keys = list(PARAM_GRID.keys())
param_values = list(PARAM_GRID.values())
all_combos = list(itertools.product(*param_values))

results = []
csv_path = TUNING_DIR / "coherence_results.csv"
# Checkpoints are written to this sibling file and then renamed over csv_path,
# so an interrupted write cannot leave a truncated results file behind.
csv_tmp_path = csv_path.with_name(csv_path.name + ".tmp")

# Resume support: every (subject, *params) row already in the CSV is skipped.
# Failed trials are stored in the CSV as well, so they are not retried either.
existing_results = set()
if csv_path.exists():
    existing_df = pd.read_csv(csv_path)
    # Key columns follow param_keys so the key always has the same layout as
    # the (subject, *combo) tuples it is compared against below.
    existing_results = set(
        existing_df[["subject", *param_keys]].itertuples(index=False, name=None)
    )
    results = existing_df.to_dict('records')
    print(f"Loaded {len(results)} existing results from {csv_path}")

for subject in LIST_SUBJECT:
    if subject not in all_embeddings:
        print(f"\nSkipping {subject} (no embeddings)")
        continue

    model_info = SUBJECT_MODEL[subject]
    texts = all_data[subject]["text"].fillna("").tolist()
    embeddings = all_embeddings[subject]
    texts_tokenized = all_texts_tokenized[subject]
    dictionary = all_dictionaries[subject]

    print(f"\n{'='*80}")
    print(f"Subject: {subject.upper()} ({len(texts):,} documents) | Model: {model_info['name']}")
    print(f"{'='*80}")

    subject_combos = [
        combo for combo in all_combos
        if (subject, *combo) not in existing_results
    ]
    print(f"Combinations to run: {len(subject_combos)} / {len(all_combos)}")

    for i, combo in enumerate(subject_combos, 1):
        params = dict(zip(param_keys, combo))

        print(f"\n[{i}/{len(subject_combos)}] {params}")

        # Start from a "failed" record; the metrics are filled in only when the
        # whole trial succeeds, so both outcomes share one row layout.
        record = {
            "subject": subject,
            "model": model_info["name"],
            "min_cluster_size": params["min_cluster_size"],
            "min_samples": params["min_samples"],
            "n_neighbors": params["n_neighbors"],
            "n_components": params["n_components"],
            "n_topics": None,
            "coherence": None,
            "outlier_ratio": None,
        }

        topic_model = None
        try:
            umap_model = UMAP(
                n_neighbors=params["n_neighbors"],
                n_components=params["n_components"],
                metric="cosine",
                random_state=42,
                min_dist=0.0,
                verbose=False
            )

            hdbscan_model = HDBSCAN(
                min_cluster_size=params["min_cluster_size"],
                min_samples=params["min_samples"],
                metric="euclidean",
                cluster_selection_method="eom",
                prediction_data=True
            )

            topic_model = BERTopic(
                umap_model=umap_model,
                hdbscan_model=hdbscan_model,
                calculate_probabilities=False,
                verbose=False
            )

            topics, _ = topic_model.fit_transform(texts, embeddings=embeddings)
            # Count distinct topics other than the outlier label (-1). Unlike
            # "number of topic-info rows minus one", this stays correct when a
            # run happens to produce no outliers.
            n_topics = len(set(topics) - {-1})
            outlier_count = sum(1 for t in topics if t == -1)
            outlier_ratio = outlier_count / len(topics)

            coherence = calculate_coherence(topic_model, texts_tokenized, dictionary)

            print(f"  Topics: {n_topics} | Coherence: {coherence:.4f} | Outliers: {outlier_ratio:.2%}")

            record.update(
                n_topics=n_topics,
                coherence=coherence,
                outlier_ratio=outlier_ratio,
            )

        except Exception as e:
            # A failing combination must not abort the whole search; it is
            # recorded with empty metrics instead.
            print(f"  ✗ Error: {e}")

        finally:
            # Release the fitted model after every trial, including failed ones.
            topic_model = None
            gc.collect()

        results.append(record)

        # Checkpoint after every trial so an interrupted run can be resumed.
        results_df = pd.DataFrame(results)
        results_df.to_csv(csv_tmp_path, index=False)
        csv_tmp_path.replace(csv_path)

print(f"\n{'='*80}")
print(f"Grid search complete! Results saved to: {csv_path}")
print(f"Total trials: {len(results)}")
print(f"{'='*80}")

Loaded 108 existing results from tunning/hdbscan_v1/coherence_results.csv

Subject: CS (165,756 documents) | Model: all-distilroberta-v1
Combinations to run: 0 / 54

Subject: MATH (157,085 documents) | Model: BAAI/bge-base-en-v1.5
Combinations to run: 54 / 54

[1/54] {'min_cluster_size': 100, 'min_samples': 5, 'n_neighbors': 10, 'n_components': 5}
  Topics: 254 | Coherence: 0.7091 | Outliers: 43.88%

[2/54] {'min_cluster_size': 100, 'min_samples': 5, 'n_neighbors': 10, 'n_components': 10}
  Topics: 250 | Coherence: 0.7053 | Outliers: 44.73%

[3/54] {'min_cluster_size': 100, 'min_samples': 5, 'n_neighbors': 15, 'n_components': 5}
  Topics: 225 | Coherence: 0.7072 | Outliers: 45.22%

[4/54] {'min_cluster_size': 100, 'min_samples': 5, 'n_neighbors': 15, 'n_components': 10}
  Topics: 233 | Coherence: 0.7114 | Outliers: 47.00%

[5/54] {'min_cluster_size': 100, 'min_samples': 5, 'n_neighbors': 25, 'n_components': 5}
  Topics: 205 | Coherence: 0.7158 | Outliers: 49.04%

[6/54] {'min_cluster_s

In [6]:
# Report the best configuration and the five highest-coherence configurations
# for each subject, using the results file written by the grid search.
results_df = pd.read_csv(csv_path)

print("\nBest Hyperparameters per Subject")
print("=" * 100)

# Trials without a coherence value are excluded from the ranking.
scored_df = results_df.dropna(subset=['coherence'])

for subject in LIST_SUBJECT:
    subject_df = scored_df[scored_df['subject'] == subject]
    if subject_df.empty:
        continue

    best = subject_df.loc[subject_df['coherence'].idxmax()]
    model_name = SUBJECT_MODEL[subject]['name']

    summary_lines = [
        f"\n{subject.upper()} (model: {model_name}):",
        f"  Best Coherence: {best['coherence']:.4f}",
        f"  Topics: {int(best['n_topics'])}",
        f"  Outlier Ratio: {best['outlier_ratio']:.2%}",
        "  Parameters:",
        f"    min_cluster_size = {int(best['min_cluster_size'])}",
        f"    min_samples      = {int(best['min_samples'])}",
        f"    n_neighbors      = {int(best['n_neighbors'])}",
        f"    n_components     = {int(best['n_components'])}",
    ]
    print("\n".join(summary_lines))

    # Fixed-width table of the five best rows for this subject.
    top5 = subject_df.nlargest(5, 'coherence')
    print("\n  Top 5 Configurations:")
    print(f"  {'Rank':<5} {'Coherence':<11} {'Topics':<8} {'Outliers':<10} {'min_cluster':<13} {'min_samples':<13} {'n_neighbors':<13} {'n_components'}")
    print(f"  {'-'*90}")
    for rank, entry in enumerate(top5.itertuples(index=False), start=1):
        print(f"  #{rank:<4} {entry.coherence:.4f}     {int(entry.n_topics):<8} {entry.outlier_ratio:.2%}     {int(entry.min_cluster_size):<13} {int(entry.min_samples):<13} {int(entry.n_neighbors):<13} {int(entry.n_components)}")
    print("-" * 100)


Best Hyperparameters per Subject

CS (model: all-distilroberta-v1):
  Best Coherence: 0.7364
  Topics: 261
  Outlier Ratio: 40.55%
  Parameters:
    min_cluster_size = 100
    min_samples      = 10
    n_neighbors      = 25
    n_components     = 5

  Top 5 Configurations:
  Rank  Coherence   Topics   Outliers   min_cluster   min_samples   n_neighbors   n_components
  ------------------------------------------------------------------------------------------
  #1    0.7364     261      40.55%     100           10            25            5
  #2    0.7352     202      36.52%     150           5             15            5
  #3    0.7345     243      41.23%     100           20            25            5
  #4    0.7340     256      37.30%     100           5             25            10
  #5    0.7332     273      37.47%     100           10            15            5
----------------------------------------------------------------------------------------------------

MATH (model: BAAI/b

In [7]:
# Refit one model per subject with its highest-coherence configuration and
# save it under TUNING_DIR as "best_<subject>".
results_df = pd.read_csv(csv_path)

print("Retraining and saving best models per subject...")
print("=" * 80)

saved_subjects = []

for subject in LIST_SUBJECT:
    if subject not in all_embeddings:
        print(f"\nSkipping {subject} (no embeddings)")
        continue

    # Only trials that produced a coherence value can be ranked.
    subject_df = results_df[results_df['subject'] == subject].dropna(subset=['coherence'])
    if len(subject_df) == 0:
        print(f"\nSkipping {subject} (no successful trials)")
        continue

    best = subject_df.loc[subject_df['coherence'].idxmax()]
    model_info = SUBJECT_MODEL[subject]
    texts = all_data[subject]["text"].fillna("").tolist()
    embeddings = all_embeddings[subject]

    # Values read back from the CSV are cast to int before being passed on.
    best_params = {
        key: int(best[key])
        for key in ("min_cluster_size", "min_samples", "n_neighbors", "n_components")
    }

    print(f"\n{subject.upper()} (model: {model_info['name']}): Retraining with best params (coherence={best['coherence']:.4f})")
    print(f"  min_cluster_size={best_params['min_cluster_size']}, min_samples={best_params['min_samples']}, "
          f"n_neighbors={best_params['n_neighbors']}, n_components={best_params['n_components']}")

    # Same fixed settings as the grid search, so only the tuned values differ.
    umap_model = UMAP(
        n_neighbors=best_params["n_neighbors"],
        n_components=best_params["n_components"],
        metric="cosine",
        random_state=42,
        min_dist=0.0,
        verbose=False
    )

    hdbscan_model = HDBSCAN(
        min_cluster_size=best_params["min_cluster_size"],
        min_samples=best_params["min_samples"],
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True
    )

    topic_model = BERTopic(
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        calculate_probabilities=False,
        verbose=False
    )

    topics, _ = topic_model.fit_transform(texts, embeddings=embeddings)
    # Count distinct topics other than the outlier label (-1); this stays
    # correct even when the fit produces no outliers.
    n_topics = len(set(topics) - {-1})
    print(f"  Topics: {n_topics}")

    # NOTE(review): save() is called with its default serialization; confirm
    # that this is the format expected by whatever loads these models later.
    model_save_path = str(TUNING_DIR / f"best_{subject}")
    topic_model.save(model_save_path)
    print(f"  ✓ Saved to {model_save_path}")
    saved_subjects.append(subject)

    # Free the fitted model before moving on to the next subject.
    del topic_model
    gc.collect()

print(f"\n{'='*80}")
if len(saved_subjects) == len(LIST_SUBJECT):
    print("All best models saved!")
else:
    # Do not claim completeness when some subjects were skipped above.
    print(f"Best models saved for: {saved_subjects}")
print(f"{'='*80}")

Retraining and saving best models per subject...

CS (model: all-distilroberta-v1): Retraining with best params (coherence=0.7364)
  min_cluster_size=100, min_samples=10, n_neighbors=25, n_components=5




  Topics: 261
  ✓ Saved to tunning/hdbscan_v1/best_cs

MATH (model: BAAI/bge-base-en-v1.5): Retraining with best params (coherence=0.7229)
  min_cluster_size=150, min_samples=10, n_neighbors=15, n_components=5




  Topics: 150
  ✓ Saved to tunning/hdbscan_v1/best_math

PHYSICS (model: all-distilroberta-v1): Retraining with best params (coherence=0.7466)
  min_cluster_size=100, min_samples=5, n_neighbors=25, n_components=10




  Topics: 232
  ✓ Saved to tunning/hdbscan_v1/best_physics

All best models saved!
