# Top2Vec Hyperparameter Tuning

Based on `coherence_results_v1.csv`, `sentence-transformers/all-MiniLM-L6-v2` achieved the highest average coherence across all subjects. This notebook performs grid-search hyperparameter tuning over UMAP and HDBSCAN parameters to find the best Top2Vec configuration.

In [1]:
import os
import gc
import pandas as pd
import numpy as np
from pathlib import Path
from typing import List, Optional, Dict, Any
from tqdm import tqdm
from itertools import product
import warnings
import time

from top2vec import Top2Vec
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel

warnings.filterwarnings("ignore", category=FutureWarning)

## Configuration

In [2]:
VERSION = "v1"
LIST_SUBJECT = ["cs", "math", "physics"]

TRANSFORMER = "sentence-transformers/all-MiniLM-L6-v2"
EMBEDDING_DIM = 384

BASE_DIR = Path("../../dataset")
EMBEDDING_DIR = Path("../bertopic/embedding")
TUNNING_DIR = Path("./tunning")

SAFE_MODEL_NAME = TRANSFORMER.replace("/", "_").replace("-", "_")
OUTPUT_DIR = TUNNING_DIR / SAFE_MODEL_NAME
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Embedding: {TRANSFORMER}")
print(f"Output directory: {OUTPUT_DIR}")

Embedding: sentence-transformers/all-MiniLM-L6-v2
Output directory: tunning/sentence_transformers_all_MiniLM_L6_v2


## Hyperparameter Grid

In [3]:
PARAM_GRID = {
    "umap_n_neighbors": [10, 15, 30],
    "umap_n_components": [5, 10, 30],
    "hdbscan_min_cluster_size": [15, 30, 50],
    "hdbscan_cluster_selection_method": ["eom"],
    "min_count": [50],
}

keys = list(PARAM_GRID.keys())
values = list(PARAM_GRID.values())
all_combos = list(product(*values))

print(f"Total parameter combinations: {len(all_combos)}")
print(f"Total runs (combinations x subjects): {len(all_combos) * len(LIST_SUBJECT)}")

Total parameter combinations: 27
Total runs (combinations x subjects): 81


## Helper Functions

In [4]:
def get_model_safe_name(model_name: str) -> str:
    return model_name.replace("/", "_").replace("-", "_")


def load_dataset(subject: str) -> Optional[pd.DataFrame]:
    file_path = BASE_DIR / subject / "emb" / f"{VERSION}.csv"
    if not file_path.exists():
        print(f"File not found: {file_path}")
        return None
    return pd.read_csv(file_path)


def load_mmap_embeddings(
    mmap_path: str,
    num_documents: int,
    embedding_dim: int,
    dtype: str = "float32"
) -> Optional[np.ndarray]:
    try:
        embs = np.array(np.memmap(
            mmap_path, dtype=dtype, mode="r",
            shape=(num_documents, embedding_dim)
        ))
        return normalize(embs)
    except FileNotFoundError:
        print(f"Embedding not found: {mmap_path}")
        return None
    except Exception as e:
        print(f"Error loading embeddings: {e}")
        return None


def train_top2vec_with_precomputed(
    documents: List[str],
    precomputed_embeddings: np.ndarray,
    transformer_name: str,
    umap_args: Dict[str, Any] = None,
    hdbscan_args: Dict[str, Any] = None,
    min_count: int = 50,
) -> Top2Vec:
    num_docs = len(documents)
    st_model = SentenceTransformer(transformer_name)

    original_embed_docs = Top2Vec._embed_documents

    def patched_embed_documents(self, train_corpus, batch_size):
        if len(train_corpus) == num_docs:
            return precomputed_embeddings
        else:
            return st_model.encode(train_corpus, batch_size=batch_size, show_progress_bar=False)

    Top2Vec._embed_documents = patched_embed_documents

    model = Top2Vec(
        documents=documents,
        embedding_model='all-MiniLM-L6-v2',
        min_count=min_count,
        contextual_top2vec=False,
        ngram_vocab=False,
        umap_args=umap_args,
        hdbscan_args=hdbscan_args,
        verbose=False,
    )

    Top2Vec._embed_documents = original_embed_docs
    del st_model

    return model


def calculate_coherence(
    model: Top2Vec,
    texts_tokenized: List[List[str]],
    dictionary: Dictionary,
    top_n: int = 5
) -> float:
    num_topics = model.get_num_topics()
    topic_words, _, _ = model.get_topics(num_topics)
    topic_words_sliced = topic_words[:, :top_n]

    cm = CoherenceModel(
        topics=topic_words_sliced.tolist(),
        texts=texts_tokenized,
        dictionary=dictionary,
        coherence='c_v',
        processes=1
    )

    return cm.get_coherence()

## Load Datasets, Embeddings & Tokenize

In [5]:
all_data = {}
all_embeddings = {}
all_texts_tokenized = {}
all_dictionaries = {}

safe_name = get_model_safe_name(TRANSFORMER)

for subject in LIST_SUBJECT:
    df = load_dataset(subject)
    if df is None:
        continue

    all_data[subject] = df
    print(f"{subject}: {len(df):,} documents loaded")

    mmap_path = EMBEDDING_DIR / subject / f"{safe_name}_{VERSION}.mmap"
    embs = load_mmap_embeddings(str(mmap_path), len(df), EMBEDDING_DIM)
    if embs is None:
        print(f"  âš  Skipping {subject}: embedding not found")
        continue
    all_embeddings[subject] = embs
    print(f"  Embeddings loaded: {embs.shape}")

    print(f"  Tokenizing for coherence...")
    texts_tokenized = [text.split() for text in tqdm(df['text'].fillna('').tolist(), desc=f"  {subject}")]
    all_texts_tokenized[subject] = texts_tokenized
    all_dictionaries[subject] = Dictionary(texts_tokenized)

print(f"\nSubjects ready: {list(all_embeddings.keys())}")

cs: 165,756 documents loaded
  Embeddings loaded: (165756, 384)
  Tokenizing for coherence...


  cs: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 165756/165756 [00:02<00:00, 74037.54it/s]


math: 126,192 documents loaded
  Embeddings loaded: (126192, 384)
  Tokenizing for coherence...


  math: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 126192/126192 [00:00<00:00, 137450.95it/s]


physics: 146,311 documents loaded
  Embeddings loaded: (146311, 384)
  Tokenizing for coherence...


  physics: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 146311/146311 [00:02<00:00, 63863.18it/s] 



Subjects ready: ['cs', 'math', 'physics']


## Hyperparameter Tuning Grid Search

In [6]:
results = []
csv_path = OUTPUT_DIR / "tuning_results.csv"

total_runs = len(all_combos) * len(all_embeddings)
run_count = 0

for subject in all_embeddings:
    df = all_data[subject]
    documents = df['text'].fillna('').tolist()
    embs = all_embeddings[subject]

    print(f"\n{'=' * 70}")
    print(f"Subject: {subject.upper()} ({len(documents):,} documents)")
    print(f"{'=' * 70}")

    for combo in all_combos:
        run_count += 1
        params = dict(zip(keys, combo))

        umap_args = {
            "n_neighbors": params["umap_n_neighbors"],
            "n_components": params["umap_n_components"],
            "metric": "cosine",
        }
        hdbscan_args = {
            "min_cluster_size": params["hdbscan_min_cluster_size"],
            "metric": "euclidean",
            "cluster_selection_method": params["hdbscan_cluster_selection_method"],
        }

        print(f"\n[{run_count}/{total_runs}] {subject} | "
              f"nn={params['umap_n_neighbors']} nc={params['umap_n_components']} "
              f"mcs={params['hdbscan_min_cluster_size']} csm={params['hdbscan_cluster_selection_method']} "
              f"mc={params['min_count']}")

        try:
            start_time = time.time()

            model = train_top2vec_with_precomputed(
                documents=documents,
                precomputed_embeddings=embs,
                transformer_name=TRANSFORMER,
                umap_args=umap_args,
                hdbscan_args=hdbscan_args,
                min_count=params["min_count"],
            )

            n_topics = model.get_num_topics()
            elapsed = time.time() - start_time

            if n_topics <= 1:
                print(f"  âš  Only {n_topics} topic(s) found, skipping coherence ({elapsed:.1f}s)")
                coherence = None
            else:
                coherence = calculate_coherence(
                    model,
                    all_texts_tokenized[subject],
                    all_dictionaries[subject]
                )
                print(f"  âœ“ Topics: {n_topics} | Coherence: {coherence:.4f} ({elapsed:.1f}s)")

            result_row = {
                "subject": subject,
                "umap_n_neighbors": params["umap_n_neighbors"],
                "umap_n_components": params["umap_n_components"],
                "hdbscan_min_cluster_size": params["hdbscan_min_cluster_size"],
                "hdbscan_cluster_selection_method": params["hdbscan_cluster_selection_method"],
                "min_count": params["min_count"],
                "n_topics": n_topics,
                "coherence": coherence,
                "time_seconds": round(elapsed, 1),
            }
            results.append(result_row)

            del model
            gc.collect()

        except Exception as e:
            print(f"  âœ— Error: {e}")
            result_row = {
                "subject": subject,
                "umap_n_neighbors": params["umap_n_neighbors"],
                "umap_n_components": params["umap_n_components"],
                "hdbscan_min_cluster_size": params["hdbscan_min_cluster_size"],
                "hdbscan_cluster_selection_method": params["hdbscan_cluster_selection_method"],
                "min_count": params["min_count"],
                "n_topics": None,
                "coherence": None,
                "time_seconds": None,
            }
            results.append(result_row)

        if run_count % 10 == 0:
            pd.DataFrame(results).to_csv(csv_path, index=False)
            print(f"  ðŸ’¾ Checkpoint saved ({run_count}/{total_runs})")

results_df = pd.DataFrame(results)
results_df.to_csv(csv_path, index=False)
print(f"\nâœ… All results saved to {csv_path}")
print(f"Total runs: {len(results_df)}")


Subject: CS (165,756 documents)

[1/81] cs | nn=10 nc=5 mcs=15 csm=eom mc=50


2026-02-12 23:21:27,494 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 919 | Coherence: 0.5735 (80.6s)

[2/81] cs | nn=10 nc=5 mcs=30 csm=eom mc=50


2026-02-12 23:24:56,741 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 499 | Coherence: 0.5744 (65.6s)

[3/81] cs | nn=10 nc=5 mcs=50 csm=eom mc=50


2026-02-12 23:27:45,301 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 302 | Coherence: 0.5702 (65.3s)

[4/81] cs | nn=10 nc=10 mcs=15 csm=eom mc=50


2026-02-12 23:30:13,978 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 906 | Coherence: 0.5712 (67.7s)

[5/81] cs | nn=10 nc=10 mcs=30 csm=eom mc=50


2026-02-12 23:33:31,619 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 516 | Coherence: 0.5802 (67.1s)

[6/81] cs | nn=10 nc=10 mcs=50 csm=eom mc=50


2026-02-12 23:36:18,975 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 303 | Coherence: 0.5740 (67.2s)

[7/81] cs | nn=10 nc=30 mcs=15 csm=eom mc=50


2026-02-12 23:38:50,875 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 935 | Coherence: 0.5703 (84.3s)

[8/81] cs | nn=10 nc=30 mcs=30 csm=eom mc=50


2026-02-12 23:42:26,616 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 492 | Coherence: 0.5744 (83.5s)

[9/81] cs | nn=10 nc=30 mcs=50 csm=eom mc=50


2026-02-12 23:45:32,041 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 311 | Coherence: 0.5785 (83.2s)

[10/81] cs | nn=15 nc=5 mcs=15 csm=eom mc=50


2026-02-12 23:48:18,092 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 810 | Coherence: 0.5761 (70.4s)
  ðŸ’¾ Checkpoint saved (10/81)

[11/81] cs | nn=15 nc=5 mcs=30 csm=eom mc=50


2026-02-12 23:51:33,615 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 455 | Coherence: 0.5722 (69.8s)

[12/81] cs | nn=15 nc=5 mcs=50 csm=eom mc=50


2026-02-12 23:54:20,901 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 292 | Coherence: 0.5727 (69.0s)

[13/81] cs | nn=15 nc=10 mcs=15 csm=eom mc=50


2026-02-12 23:56:52,355 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 829 | Coherence: 0.5722 (70.7s)

[14/81] cs | nn=15 nc=10 mcs=30 csm=eom mc=50


2026-02-13 00:00:07,260 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 488 | Coherence: 0.5778 (71.0s)

[15/81] cs | nn=15 nc=10 mcs=50 csm=eom mc=50


2026-02-13 00:02:57,309 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 284 | Coherence: 0.5770 (70.6s)

[16/81] cs | nn=15 nc=30 mcs=15 csm=eom mc=50


2026-02-13 00:05:29,360 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 843 | Coherence: 0.5712 (88.5s)

[17/81] cs | nn=15 nc=30 mcs=30 csm=eom mc=50


2026-02-13 00:08:58,210 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 457 | Coherence: 0.5736 (87.5s)

[18/81] cs | nn=15 nc=30 mcs=50 csm=eom mc=50


2026-02-13 00:12:00,673 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 297 | Coherence: 0.5793 (87.9s)

[19/81] cs | nn=30 nc=5 mcs=15 csm=eom mc=50


2026-02-13 00:14:51,227 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 678 | Coherence: 0.5818 (76.9s)

[20/81] cs | nn=30 nc=5 mcs=30 csm=eom mc=50


2026-02-13 00:18:02,087 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 410 | Coherence: 0.5757 (77.6s)
  ðŸ’¾ Checkpoint saved (20/81)

[21/81] cs | nn=30 nc=5 mcs=50 csm=eom mc=50


2026-02-13 00:20:49,605 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 261 | Coherence: 0.5853 (77.5s)

[22/81] cs | nn=30 nc=10 mcs=15 csm=eom mc=50


2026-02-13 00:23:25,165 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 711 | Coherence: 0.5761 (81.3s)

[23/81] cs | nn=30 nc=10 mcs=30 csm=eom mc=50


2026-02-13 00:26:43,403 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 417 | Coherence: 0.5761 (81.2s)

[24/81] cs | nn=30 nc=10 mcs=50 csm=eom mc=50


2026-02-13 00:29:34,586 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 258 | Coherence: 0.5828 (81.9s)

[25/81] cs | nn=30 nc=30 mcs=15 csm=eom mc=50


2026-02-13 00:32:15,681 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 700 | Coherence: 0.5760 (99.4s)

[26/81] cs | nn=30 nc=30 mcs=30 csm=eom mc=50


2026-02-13 00:35:49,721 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 423 | Coherence: 0.5771 (100.4s)

[27/81] cs | nn=30 nc=30 mcs=50 csm=eom mc=50


2026-02-13 00:39:04,622 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 239 | Coherence: 0.5893 (99.6s)

Subject: MATH (126,192 documents)

[28/81] math | nn=10 nc=5 mcs=15 csm=eom mc=50


2026-02-13 00:41:45,495 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 637 | Coherence: 0.5457 (42.6s)

[29/81] math | nn=10 nc=5 mcs=30 csm=eom mc=50


2026-02-13 00:43:07,810 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 316 | Coherence: 0.5599 (42.5s)

[30/81] math | nn=10 nc=5 mcs=50 csm=eom mc=50


2026-02-13 00:44:22,104 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 208 | Coherence: 0.5534 (43.1s)
  ðŸ’¾ Checkpoint saved (30/81)

[31/81] math | nn=10 nc=10 mcs=15 csm=eom mc=50


2026-02-13 00:45:33,706 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 629 | Coherence: 0.5433 (43.5s)

[32/81] math | nn=10 nc=10 mcs=30 csm=eom mc=50


2026-02-13 00:46:55,251 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 337 | Coherence: 0.5543 (43.4s)

[33/81] math | nn=10 nc=10 mcs=50 csm=eom mc=50


2026-02-13 00:48:11,627 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 190 | Coherence: 0.5410 (43.9s)

[34/81] math | nn=10 nc=30 mcs=15 csm=eom mc=50


2026-02-13 00:49:23,685 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 634 | Coherence: 0.5444 (52.7s)

[35/81] math | nn=10 nc=30 mcs=30 csm=eom mc=50


2026-02-13 00:50:56,609 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 330 | Coherence: 0.5617 (54.3s)

[36/81] math | nn=10 nc=30 mcs=50 csm=eom mc=50


2026-02-13 00:52:22,633 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 178 | Coherence: 0.5446 (54.1s)

[37/81] math | nn=15 nc=5 mcs=15 csm=eom mc=50


2026-02-13 00:53:42,921 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 565 | Coherence: 0.5470 (44.9s)

[38/81] math | nn=15 nc=5 mcs=30 csm=eom mc=50


2026-02-13 00:55:05,138 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 300 | Coherence: 0.5586 (44.6s)

[39/81] math | nn=15 nc=5 mcs=50 csm=eom mc=50


2026-02-13 00:56:20,866 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 175 | Coherence: 0.5615 (44.7s)

[40/81] math | nn=15 nc=10 mcs=15 csm=eom mc=50


2026-02-13 00:57:31,409 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 569 | Coherence: 0.5534 (45.4s)
  ðŸ’¾ Checkpoint saved (40/81)

[41/81] math | nn=15 nc=10 mcs=30 csm=eom mc=50


2026-02-13 00:58:55,234 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 307 | Coherence: 0.5509 (45.5s)

[42/81] math | nn=15 nc=10 mcs=50 csm=eom mc=50


2026-02-13 01:00:12,799 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 166 | Coherence: 0.5586 (45.6s)

[43/81] math | nn=15 nc=30 mcs=15 csm=eom mc=50


2026-02-13 01:01:24,826 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 569 | Coherence: 0.5525 (56.8s)

[44/81] math | nn=15 nc=30 mcs=30 csm=eom mc=50


2026-02-13 01:02:59,024 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 313 | Coherence: 0.5615 (57.4s)

[45/81] math | nn=15 nc=30 mcs=50 csm=eom mc=50


2026-02-13 01:04:28,703 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 176 | Coherence: 0.5645 (55.5s)

[46/81] math | nn=30 nc=5 mcs=15 csm=eom mc=50


2026-02-13 01:05:51,502 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 468 | Coherence: 0.5494 (51.7s)

[47/81] math | nn=30 nc=5 mcs=30 csm=eom mc=50


2026-02-13 01:07:19,367 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 252 | Coherence: 0.5684 (52.0s)

[48/81] math | nn=30 nc=5 mcs=50 csm=eom mc=50


2026-02-13 01:08:40,755 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 154 | Coherence: 0.5676 (51.6s)

[49/81] math | nn=30 nc=10 mcs=15 csm=eom mc=50


2026-02-13 01:09:57,267 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 481 | Coherence: 0.5498 (52.1s)

[50/81] math | nn=30 nc=10 mcs=30 csm=eom mc=50


2026-02-13 01:11:26,459 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 247 | Coherence: 0.5534 (51.8s)
  ðŸ’¾ Checkpoint saved (50/81)

[51/81] math | nn=30 nc=10 mcs=50 csm=eom mc=50


2026-02-13 01:12:47,790 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 157 | Coherence: 0.5607 (51.9s)

[52/81] math | nn=30 nc=30 mcs=15 csm=eom mc=50


2026-02-13 01:14:05,246 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 465 | Coherence: 0.5540 (63.8s)

[53/81] math | nn=30 nc=30 mcs=30 csm=eom mc=50


2026-02-13 01:15:45,257 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 256 | Coherence: 0.5661 (63.5s)

[54/81] math | nn=30 nc=30 mcs=50 csm=eom mc=50


2026-02-13 01:17:19,165 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 2 | Coherence: 0.4889 (64.9s)

Subject: PHYSICS (146,311 documents)

[55/81] physics | nn=10 nc=5 mcs=15 csm=eom mc=50


2026-02-13 01:18:44,008 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 762 | Coherence: 0.6293 (57.2s)

[56/81] physics | nn=10 nc=5 mcs=30 csm=eom mc=50


2026-02-13 01:21:00,040 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 381 | Coherence: 0.6435 (56.2s)

[57/81] physics | nn=10 nc=5 mcs=50 csm=eom mc=50


2026-02-13 01:23:00,282 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 249 | Coherence: 0.6520 (55.8s)

[58/81] physics | nn=10 nc=10 mcs=15 csm=eom mc=50


2026-02-13 01:24:54,050 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 778 | Coherence: 0.6307 (57.7s)

[59/81] physics | nn=10 nc=10 mcs=30 csm=eom mc=50


2026-02-13 01:27:09,966 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 396 | Coherence: 0.6343 (57.6s)

[60/81] physics | nn=10 nc=10 mcs=50 csm=eom mc=50


2026-02-13 01:29:13,297 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 253 | Coherence: 0.6465 (58.2s)
  ðŸ’¾ Checkpoint saved (60/81)

[61/81] physics | nn=10 nc=30 mcs=15 csm=eom mc=50


2026-02-13 01:31:08,556 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 769 | Coherence: 0.6222 (69.6s)

[62/81] physics | nn=10 nc=30 mcs=30 csm=eom mc=50


2026-02-13 01:33:37,821 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 398 | Coherence: 0.6424 (69.5s)

[63/81] physics | nn=10 nc=30 mcs=50 csm=eom mc=50


2026-02-13 01:35:52,935 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 235 | Coherence: 0.6456 (70.1s)

[64/81] physics | nn=15 nc=5 mcs=15 csm=eom mc=50


2026-02-13 01:37:59,842 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 671 | Coherence: 0.6332 (58.9s)

[65/81] physics | nn=15 nc=5 mcs=30 csm=eom mc=50


2026-02-13 01:40:14,905 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 364 | Coherence: 0.6466 (58.5s)

[66/81] physics | nn=15 nc=5 mcs=50 csm=eom mc=50


2026-02-13 01:42:16,939 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 237 | Coherence: 0.6600 (58.9s)

[67/81] physics | nn=15 nc=10 mcs=15 csm=eom mc=50


2026-02-13 01:44:12,612 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 678 | Coherence: 0.6341 (60.0s)

[68/81] physics | nn=15 nc=10 mcs=30 csm=eom mc=50


2026-02-13 01:46:29,799 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 359 | Coherence: 0.6406 (58.7s)

[69/81] physics | nn=15 nc=10 mcs=50 csm=eom mc=50


2026-02-13 01:48:32,023 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 243 | Coherence: 0.6479 (58.9s)

[70/81] physics | nn=15 nc=30 mcs=15 csm=eom mc=50


2026-02-13 01:50:28,747 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 698 | Coherence: 0.6335 (72.5s)
  ðŸ’¾ Checkpoint saved (70/81)

[71/81] physics | nn=15 nc=30 mcs=30 csm=eom mc=50


2026-02-13 01:53:03,300 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 373 | Coherence: 0.6365 (77.4s)

[72/81] physics | nn=15 nc=30 mcs=50 csm=eom mc=50


2026-02-13 01:55:22,765 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 237 | Coherence: 0.6496 (72.8s)

[73/81] physics | nn=30 nc=5 mcs=15 csm=eom mc=50


2026-02-13 01:57:32,028 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 578 | Coherence: 0.6346 (66.0s)

[74/81] physics | nn=30 nc=5 mcs=30 csm=eom mc=50


2026-02-13 01:59:49,948 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 309 | Coherence: 0.6503 (65.7s)

[75/81] physics | nn=30 nc=5 mcs=50 csm=eom mc=50


2026-02-13 02:01:57,304 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 203 | Coherence: 0.6534 (65.5s)

[76/81] physics | nn=30 nc=10 mcs=15 csm=eom mc=50


2026-02-13 02:03:56,503 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 588 | Coherence: 0.6366 (67.1s)

[77/81] physics | nn=30 nc=10 mcs=30 csm=eom mc=50


2026-02-13 02:06:19,074 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 323 | Coherence: 0.6440 (67.8s)

[78/81] physics | nn=30 nc=10 mcs=50 csm=eom mc=50


2026-02-13 02:08:28,931 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 205 | Coherence: 0.6642 (67.5s)

[79/81] physics | nn=30 nc=30 mcs=15 csm=eom mc=50


2026-02-13 02:10:31,457 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 569 | Coherence: 0.6314 (81.8s)

[80/81] physics | nn=30 nc=30 mcs=30 csm=eom mc=50


2026-02-13 02:13:06,771 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 301 | Coherence: 0.6425 (81.9s)
  ðŸ’¾ Checkpoint saved (80/81)

[81/81] physics | nn=30 nc=30 mcs=50 csm=eom mc=50


2026-02-13 02:15:29,351 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  âœ“ Topics: 210 | Coherence: 0.6605 (82.4s)

âœ… All results saved to tunning/sentence_transformers_all_MiniLM_L6_v2/tuning_results.csv
Total runs: 81


## Results Summary

In [7]:
results_df = pd.read_csv(csv_path)
valid_results = results_df.dropna(subset=["coherence"])

print(f"Total runs: {len(results_df)}")
print(f"Valid runs (>1 topic): {len(valid_results)}")
print(f"Skipped (1 topic or error): {len(results_df) - len(valid_results)}")

print("\n" + "=" * 90)
print("Best Parameters per Subject")
print("=" * 90)

best_per_subject = {}
for subject in LIST_SUBJECT:
    subj_results = valid_results[valid_results["subject"] == subject]
    if subj_results.empty:
        print(f"\n{subject.upper()}: No valid results")
        continue

    best_idx = subj_results["coherence"].idxmax()
    best_row = subj_results.loc[best_idx]
    best_per_subject[subject] = best_row

    print(f"\n{subject.upper()}:")
    print(f"  Best coherence: {best_row['coherence']:.4f}")
    print(f"  Topics: {int(best_row['n_topics'])}")
    print(f"  umap_n_neighbors: {int(best_row['umap_n_neighbors'])}")
    print(f"  umap_n_components: {int(best_row['umap_n_components'])}")
    print(f"  hdbscan_min_cluster_size: {int(best_row['hdbscan_min_cluster_size'])}")
    print(f"  hdbscan_cluster_selection_method: {best_row['hdbscan_cluster_selection_method']}")
    print(f"  min_count: {int(best_row['min_count'])}")

print("\n" + "=" * 90)
print("Top 5 per Subject")
print("=" * 90)
for subject in LIST_SUBJECT:
    subj_results = valid_results[valid_results["subject"] == subject]
    if subj_results.empty:
        continue
    top5 = subj_results.nlargest(5, "coherence")
    print(f"\n{subject.upper()}:")
    print(top5[["umap_n_neighbors", "umap_n_components", "hdbscan_min_cluster_size",
                "hdbscan_cluster_selection_method", "min_count", "n_topics", "coherence"]].to_string(index=False))

Total runs: 81
Valid runs (>1 topic): 81
Skipped (1 topic or error): 0

Best Parameters per Subject

CS:
  Best coherence: 0.5893
  Topics: 239
  umap_n_neighbors: 30
  umap_n_components: 30
  hdbscan_min_cluster_size: 50
  hdbscan_cluster_selection_method: eom
  min_count: 50

MATH:
  Best coherence: 0.5684
  Topics: 252
  umap_n_neighbors: 30
  umap_n_components: 5
  hdbscan_min_cluster_size: 30
  hdbscan_cluster_selection_method: eom
  min_count: 50

PHYSICS:
  Best coherence: 0.6642
  Topics: 205
  umap_n_neighbors: 30
  umap_n_components: 10
  hdbscan_min_cluster_size: 50
  hdbscan_cluster_selection_method: eom
  min_count: 50

Top 5 per Subject

CS:
 umap_n_neighbors  umap_n_components  hdbscan_min_cluster_size hdbscan_cluster_selection_method  min_count  n_topics  coherence
               30                 30                        50                              eom         50       239   0.589346
               30                  5                        50                  

## Save Best Models

In [8]:
for subject, best_row in best_per_subject.items():
    print(f"\n{'=' * 70}")
    print(f"Retraining best model for {subject.upper()}")
    print(f"{'=' * 70}")

    df = all_data[subject]
    documents = df['text'].fillna('').tolist()
    embs = all_embeddings[subject]

    umap_args = {
        "n_neighbors": int(best_row["umap_n_neighbors"]),
        "n_components": int(best_row["umap_n_components"]),
        "metric": "cosine",
    }
    hdbscan_args = {
        "min_cluster_size": int(best_row["hdbscan_min_cluster_size"]),
        "metric": "euclidean",
        "cluster_selection_method": best_row["hdbscan_cluster_selection_method"],
    }

    print(f"  UMAP: {umap_args}")
    print(f"  HDBSCAN: {hdbscan_args}")
    print(f"  min_count: {int(best_row['min_count'])}")

    model = train_top2vec_with_precomputed(
        documents=documents,
        precomputed_embeddings=embs,
        transformer_name=TRANSFORMER,
        umap_args=umap_args,
        hdbscan_args=hdbscan_args,
        min_count=int(best_row["min_count"]),
    )

    n_topics = model.get_num_topics()
    coherence = calculate_coherence(
        model,
        all_texts_tokenized[subject],
        all_dictionaries[subject]
    )
    print(f"  Topics: {n_topics} | Coherence: {coherence:.4f}")

    save_path = OUTPUT_DIR / f"best_model_{subject}"
    save_path.mkdir(parents=True, exist_ok=True)
    model.save(str(save_path / "model"))
    print(f"  âœ“ Model saved to {save_path}")

    del model
    gc.collect()

print("\nâœ… All best models saved!")


Retraining best model for CS
  UMAP: {'n_neighbors': 30, 'n_components': 30, 'metric': 'cosine'}
  HDBSCAN: {'min_cluster_size': 50, 'metric': 'euclidean', 'cluster_selection_method': 'eom'}
  min_count: 50


2026-02-13 02:17:54,817 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  Topics: 239 | Coherence: 0.5782
  âœ“ Model saved to tunning/sentence_transformers_all_MiniLM_L6_v2/best_model_cs

Retraining best model for MATH
  UMAP: {'n_neighbors': 30, 'n_components': 5, 'metric': 'cosine'}
  HDBSCAN: {'min_cluster_size': 30, 'metric': 'euclidean', 'cluster_selection_method': 'eom'}
  min_count: 50


2026-02-13 02:20:38,520 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  Topics: 254 | Coherence: 0.5768
  âœ“ Model saved to tunning/sentence_transformers_all_MiniLM_L6_v2/best_model_math

Retraining best model for PHYSICS
  UMAP: {'n_neighbors': 30, 'n_components': 10, 'metric': 'cosine'}
  HDBSCAN: {'min_cluster_size': 50, 'metric': 'euclidean', 'cluster_selection_method': 'eom'}
  min_count: 50


2026-02-13 02:22:10,618 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


  Topics: 202 | Coherence: 0.6567
  âœ“ Model saved to tunning/sentence_transformers_all_MiniLM_L6_v2/best_model_physics

âœ… All best models saved!
