# BERTopic with KMeans Clustering

This notebook trains BERTopic models using **all-distilroberta-v1** embeddings
with **KMeans** clustering instead of HDBSCAN across all subjects.

In [1]:
import os
import gc
import pandas as pd
import numpy as np
from pathlib import Path
from typing import List, Optional
from tqdm import tqdm
import warnings

from umap import UMAP
from sklearn.cluster import KMeans
from bertopic import BERTopic
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel

pd.set_option('display.max_colwidth', None)
warnings.filterwarnings("ignore", category=SyntaxWarning)

In [None]:
VERSION = "v1"
MODEL_NAME = "all-distilroberta-v1"
MODEL_SAFE_NAME = "all_distilroberta_v1"
EMBEDDING_DIM = 768
LIST_SUBJECT = ["cs", "math", "physics"]

N_CLUSTERS_LIST = [200, 250, 300, 500]

BASE_DIR = Path("../../dataset")
EMBEDDING_DIR = Path("./embedding")
RESULT_DIR = Path(f"./tunning/kmeans_{MODEL_SAFE_NAME}_{VERSION}")

RESULT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Model: {MODEL_NAME}")
print(f"Clustering: KMeans")
print(f"K values to test: {N_CLUSTERS_LIST}")
print(f"Results will be saved to: {RESULT_DIR}")

Model: all-distilroberta-v1
Clustering: KMeans
K values to test: [200, 250, 300, 500]
Results will be saved to: result/kmeans_all_distilroberta_v1_v1


In [3]:
def load_dataset(subject: str) -> pd.DataFrame:
    file_path = BASE_DIR / subject / "emb" / f"{VERSION}.csv"
    if not file_path.exists():
        print(f"File not found: {file_path}")
        return None
    return pd.read_csv(file_path)


def load_mmap_embeddings(
    mmap_path: str,
    num_documents: int,
    embedding_dim: int,
    dtype: str = "float32"
) -> Optional[np.memmap]:
    try:
        return np.memmap(
            mmap_path, dtype=dtype, mode="r",
            shape=(num_documents, embedding_dim)
        )
    except Exception as e:
        print(f"Error loading embeddings: {e}")
        return None


def tokenize_for_coherence(text: str) -> List[str]:
    return [token for token in simple_preprocess(str(text), deacc=True)]


def calculate_coherence(
    topic_model: BERTopic,
    texts_tokenized: List[List[str]],
    dictionary: Dictionary,
    top_n: int = 10
) -> float:
    topics_list = []
    for topic_id in topic_model.get_topics().keys():
        if topic_id == -1:
            continue
        topic_words = [word for word, _ in topic_model.get_topic(topic_id)[:top_n]]
        topics_list.append(topic_words)

    if not topics_list:
        return 0.0

    cm = CoherenceModel(
        topics=topics_list,
        texts=texts_tokenized,
        dictionary=dictionary,
        coherence='c_v',
        processes=1
    )
    return cm.get_coherence()

In [4]:
all_data = {}
all_texts_tokenized = {}
all_dictionaries = {}
all_embeddings = {}

for subject in LIST_SUBJECT:
    df = load_dataset(subject)
    if df is None:
        continue

    all_data[subject] = df
    texts = df["text"].fillna("").tolist()
    print(f"{subject}: {len(df):,} documents loaded")

    print(f"  Tokenizing for coherence...")
    texts_tokenized = [tokenize_for_coherence(t) for t in tqdm(texts, desc=f"  {subject}")]
    all_texts_tokenized[subject] = texts_tokenized
    all_dictionaries[subject] = Dictionary(texts_tokenized)

    emb_path = str(EMBEDDING_DIR / subject / f"{MODEL_SAFE_NAME}_{VERSION}.mmap")
    embeddings = load_mmap_embeddings(emb_path, len(texts), EMBEDDING_DIM)
    if embeddings is not None:
        all_embeddings[subject] = embeddings
        print(f"  Embeddings loaded: {embeddings.shape}")
    else:
        print(f"  ⚠ Failed to load embeddings")

print(f"\nSubjects ready: {list(all_data.keys())}")

cs: 165,756 documents loaded
  Tokenizing for coherence...


  cs: 100%|██████████| 165756/165756 [00:27<00:00, 6030.82it/s]


  Embeddings loaded: (165756, 768)
math: 126,192 documents loaded
  Tokenizing for coherence...


  math: 100%|██████████| 126192/126192 [00:13<00:00, 9092.65it/s]


  Embeddings loaded: (126192, 768)
physics: 146,311 documents loaded
  Tokenizing for coherence...


  physics: 100%|██████████| 146311/146311 [00:22<00:00, 6426.13it/s]


  Embeddings loaded: (146311, 768)

Subjects ready: ['cs', 'math', 'physics']


In [5]:
results = []

for subject in LIST_SUBJECT:
    if subject not in all_embeddings:
        print(f"\nSkipping {subject} (no embeddings)")
        continue

    texts = all_data[subject]["text"].fillna("").tolist()
    embeddings = all_embeddings[subject]
    texts_tokenized = all_texts_tokenized[subject]
    dictionary = all_dictionaries[subject]

    print(f"\n{'='*80}")
    print(f"Subject: {subject.upper()} ({len(texts):,} documents)")
    print(f"{'='*80}")

    for n_clusters in N_CLUSTERS_LIST:
        print(f"\n[KMeans k={n_clusters}]")

        model_save_path = str(RESULT_DIR / f"{subject}_k{n_clusters}")

        if os.path.exists(model_save_path):
            print(f"  Loading existing model...")
            try:
                topic_model = BERTopic.load(model_save_path)
                n_topics = len(topic_model.get_topic_info()) - 1
                coherence = calculate_coherence(topic_model, texts_tokenized, dictionary)
                outlier_count = sum(1 for t in topic_model.topics_ if t == -1)
                outlier_ratio = outlier_count / len(topic_model.topics_)

                print(f"  Topics: {n_topics} | Coherence: {coherence:.4f} | Outliers: {outlier_ratio:.2%}")
                results.append({
                    "subject": subject,
                    "model": MODEL_NAME,
                    "clustering": "kmeans",
                    "n_clusters": n_clusters,
                    "n_topics": n_topics,
                    "coherence": coherence,
                    "outlier_ratio": outlier_ratio
                })
                del topic_model
                gc.collect()
                continue
            except Exception as e:
                print(f"  Failed to load, retraining: {e}")

        try:
            umap_model = UMAP(
                n_neighbors=15,
                n_components=5,
                metric="cosine",
                random_state=42,
                min_dist=0.0,
                verbose=False
            )

            kmeans_model = KMeans(
                n_clusters=n_clusters,
                random_state=42,
                n_init=10,
                max_iter=300
            )

            topic_model = BERTopic(
                umap_model=umap_model,
                hdbscan_model=kmeans_model,
                calculate_probabilities=False,
                verbose=False
            )

            print(f"  Training BERTopic...")
            topics, _ = topic_model.fit_transform(texts, embeddings=embeddings)
            n_topics = len(topic_model.get_topic_info()) - 1
            outlier_count = sum(1 for t in topics if t == -1)
            outlier_ratio = outlier_count / len(topics)

            print(f"  Topics found: {n_topics}")
            print(f"  Calculating coherence...")
            coherence = calculate_coherence(topic_model, texts_tokenized, dictionary)
            print(f"  Coherence (c_v): {coherence:.4f} | Outliers: {outlier_ratio:.2%}")

            topic_model.save(model_save_path)
            print(f"  ✓ Saved to {model_save_path}")

            results.append({
                "subject": subject,
                "model": MODEL_NAME,
                "clustering": "kmeans",
                "n_clusters": n_clusters,
                "n_topics": n_topics,
                "coherence": coherence,
                "outlier_ratio": outlier_ratio
            })

            del topic_model
            gc.collect()

        except Exception as e:
            print(f"  ✗ Error: {e}")
            results.append({
                "subject": subject,
                "model": MODEL_NAME,
                "clustering": "kmeans",
                "n_clusters": n_clusters,
                "n_topics": None,
                "coherence": None,
                "outlier_ratio": None
            })

csv_path = RESULT_DIR / "coherence_results_kmeans.csv"
results_df = pd.DataFrame(results)
results_df.to_csv(csv_path, index=False)

print(f"\n{'='*80}")
print(f"All training complete! Results saved to: {csv_path}")
print(f"{'='*80}")


Subject: CS (165,756 documents)

[KMeans k=200]
  Loading existing model...
  Topics: 199 | Coherence: 0.7009 | Outliers: 0.00%

[KMeans k=250]
  Training BERTopic...
  Topics found: 249
  Calculating coherence...




  Coherence (c_v): 0.7033 | Outliers: 0.00%
  ✓ Saved to result/kmeans_all_distilroberta_v1_v1/cs_k250

[KMeans k=300]
  Training BERTopic...
  Topics found: 299
  Calculating coherence...




  Coherence (c_v): 0.7049 | Outliers: 0.00%
  ✓ Saved to result/kmeans_all_distilroberta_v1_v1/cs_k300

[KMeans k=500]
  Training BERTopic...
  Topics found: 499
  Calculating coherence...




  Coherence (c_v): 0.6986 | Outliers: 0.00%
  ✓ Saved to result/kmeans_all_distilroberta_v1_v1/cs_k500

Subject: MATH (126,192 documents)

[KMeans k=200]
  Training BERTopic...
  Topics found: 199
  Calculating coherence...




  Coherence (c_v): 0.6893 | Outliers: 0.00%
  ✓ Saved to result/kmeans_all_distilroberta_v1_v1/math_k200

[KMeans k=250]
  Training BERTopic...
  Topics found: 249
  Calculating coherence...




  Coherence (c_v): 0.6910 | Outliers: 0.00%
  ✓ Saved to result/kmeans_all_distilroberta_v1_v1/math_k250

[KMeans k=300]
  Training BERTopic...
  Topics found: 299
  Calculating coherence...




  Coherence (c_v): 0.6916 | Outliers: 0.00%
  ✓ Saved to result/kmeans_all_distilroberta_v1_v1/math_k300

[KMeans k=500]
  Training BERTopic...
  Topics found: 499
  Calculating coherence...




  Coherence (c_v): 0.6676 | Outliers: 0.00%
  ✓ Saved to result/kmeans_all_distilroberta_v1_v1/math_k500

Subject: PHYSICS (146,311 documents)

[KMeans k=200]
  Training BERTopic...
  Topics found: 199
  Calculating coherence...




  Coherence (c_v): 0.7173 | Outliers: 0.00%
  ✓ Saved to result/kmeans_all_distilroberta_v1_v1/physics_k200

[KMeans k=250]
  Training BERTopic...
  Topics found: 249
  Calculating coherence...




  Coherence (c_v): 0.7211 | Outliers: 0.00%
  ✓ Saved to result/kmeans_all_distilroberta_v1_v1/physics_k250

[KMeans k=300]
  Training BERTopic...
  Topics found: 299
  Calculating coherence...




  Coherence (c_v): 0.7224 | Outliers: 0.00%
  ✓ Saved to result/kmeans_all_distilroberta_v1_v1/physics_k300

[KMeans k=500]
  Training BERTopic...
  Topics found: 499
  Calculating coherence...




  Coherence (c_v): 0.7175 | Outliers: 0.00%
  ✓ Saved to result/kmeans_all_distilroberta_v1_v1/physics_k500

All training complete! Results saved to: result/kmeans_all_distilroberta_v1_v1/coherence_results_kmeans.csv


In [6]:
results_df = pd.read_csv(RESULT_DIR / "coherence_results_kmeans.csv")

print("\nKMeans Coherence Results")
print("=" * 90)

pivot = results_df.pivot(index='n_clusters', columns='subject', values='coherence')
pivot['mean'] = pivot.mean(axis=1)
pivot = pivot.sort_values('mean', ascending=False)
print(pivot.round(4))

print(f"\n\nBest K per Subject")
print("=" * 90)

for subject in LIST_SUBJECT:
    subject_df = results_df[results_df['subject'] == subject].dropna(subset=['coherence'])
    if len(subject_df) == 0:
        continue

    best = subject_df.loc[subject_df['coherence'].idxmax()]
    print(f"\n{subject.upper()}:")
    print(f"  Best K: {int(best['n_clusters'])} | Coherence: {best['coherence']:.4f} | "
          f"Topics: {int(best['n_topics'])} | Outliers: {best['outlier_ratio']:.2%}")


KMeans Coherence Results
subject         cs    math  physics    mean
n_clusters                                 
300         0.7049  0.6916   0.7224  0.7063
250         0.7033  0.6910   0.7211  0.7051
200         0.7009  0.6893   0.7173  0.7025
500         0.6986  0.6676   0.7175  0.6946


Best K per Subject

CS:
  Best K: 300 | Coherence: 0.7049 | Topics: 299 | Outliers: 0.00%

MATH:
  Best K: 300 | Coherence: 0.6916 | Topics: 299 | Outliers: 0.00%

PHYSICS:
  Best K: 300 | Coherence: 0.7224 | Topics: 299 | Outliers: 0.00%
