# LDA Hyperparameter Tuning

Grid-search hyperparameter tuning for Gensim LDA, run separately for each subject listed in `LIST_SUBJECT` (supported subjects: cs, math, physics).
Evaluates coherence (C_v) and saves the best model immediately when a new best is found.

In [1]:
import os
import gc
import pickle
import time
import pandas as pd
import numpy as np
from pathlib import Path
from itertools import product
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

## Configuration

In [2]:
# Subjects to process; each one gets its own output folder under TUNNING_DIR.
LIST_SUBJECT = ["math"]

# BASE_DIR holds the per-subject input CSVs; VERSION is the CSV file stem
# (see load_and_preprocess). TUNNING_DIR receives results and the best model.
BASE_DIR = Path("../../dataset")
TUNNING_DIR = Path("./tunning")
VERSION = "v1"

# Create the output folders up front so later writes cannot fail on a missing directory.
for subject in LIST_SUBJECT:
    Path(TUNNING_DIR, subject).mkdir(parents=True, exist_ok=True)

print(f"Subjects: {LIST_SUBJECT}")
print(f"Output directory: {TUNNING_DIR}")

Subjects: ['math']
Output directory: tunning


## Hyperparameter Grid

In [3]:

# Search space: the value lists below are crossed with each other, so the
# number of combinations is the product of the list lengths.
PARAM_GRID = {
    "num_topics": [25, 50, 75, 150],
    "alpha": ["asymmetric", 0.01],
    "eta": ["auto", 0.01],
    "passes": [15],
}

# Settings passed unchanged to every training run (see train_lda).
FIXED_PARAMS = dict(chunksize=2000, random_state=42, workers=5)

# `keys` and `values` are index-aligned, so dict(zip(keys, combo)) rebuilds
# one parameter set from a tuple in `all_combos`.
keys = [name for name in PARAM_GRID]
values = [PARAM_GRID[name] for name in keys]
all_combos = list(product(*values))

n_combos = len(all_combos)
print(f"Tunable parameters: {keys}")
print(f"Total parameter combinations: {n_combos}")
print(f"Total runs (combinations x subjects): {n_combos * len(LIST_SUBJECT)}")

Tunable parameters: ['num_topics', 'alpha', 'eta', 'passes']
Total parameter combinations: 16
Total runs (combinations x subjects): 16


## Helper Functions

In [4]:
def load_and_preprocess(subject: str, no_below: int = 15, no_above: float = 0.5):
    """Load one subject's CSV and build its gensim dictionary and bag-of-words corpus.

    Reads ``BASE_DIR / subject / "emb" / f"{VERSION}.csv"``, whitespace-splits
    the ``text`` column into tokens, builds a ``Dictionary`` from them, prunes
    it with ``filter_extremes`` and converts every document to bag-of-words.

    Args:
        subject: Sub-folder name under ``BASE_DIR``.
        no_below: Passed to ``Dictionary.filter_extremes`` (default 15).
        no_above: Passed to ``Dictionary.filter_extremes`` (default 0.5).

    Returns:
        A ``(df, tokens, dictionary, corpus)`` tuple. ``tokens`` and ``corpus``
        have one entry per row of ``df``, in the same order.

    Raises:
        KeyError: If the CSV has no ``text`` column.
    """
    file_path = BASE_DIR / subject / "emb" / f"{VERSION}.csv"
    df = pd.read_csv(file_path)

    if "text" not in df.columns:
        raise KeyError(f"'text' column not found in {file_path}")

    # read_csv parses empty cells as NaN (a float), which has no .split().
    # Treat those rows as empty documents instead of dropping them, so that
    # tokens/corpus stay row-aligned with df.
    texts = df["text"].fillna("").astype(str)
    tokens = [text.split() for text in texts]

    dictionary = Dictionary(tokens)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    # Kept from the original; likely redundant after filter_extremes, which
    # appears to re-pack ids itself -- TODO confirm against the gensim docs.
    dictionary.compactify()

    corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

    return df, tokens, dictionary, corpus


def train_lda(corpus, dictionary, params: dict) -> LdaMulticore:
    """Fit an ``LdaMulticore`` model for one parameter combination.

    Args:
        corpus: Bag-of-words corpus to train on.
        dictionary: Dictionary passed to the model as ``id2word``.
        params: Mapping with the tunable keys ``num_topics``, ``alpha``,
            ``eta`` and ``passes``.

    Returns:
        The trained model. ``chunksize``, ``random_state`` and ``workers``
        always come from the module-level ``FIXED_PARAMS``.
    """
    tuned = {name: params[name] for name in ("num_topics", "alpha", "eta", "passes")}
    fixed = {name: FIXED_PARAMS[name] for name in ("chunksize", "random_state", "workers")}
    return LdaMulticore(corpus=corpus, id2word=dictionary, **tuned, **fixed)


def calculate_coherence(model: LdaMulticore, texts, dictionary: Dictionary, processes=None) -> float:
    """Compute the C_v coherence of a trained model from its top-10 topic words.

    Args:
        model: Trained LDA model; all ``model.num_topics`` topics are scored.
        texts: Tokenized documents passed to ``CoherenceModel`` as ``texts``.
        dictionary: Dictionary passed to ``CoherenceModel``.
        processes: Number of processes given to ``CoherenceModel``. ``None``
            (the default) uses ``FIXED_PARAMS["workers"]`` so that training
            and evaluation share one parallelism setting instead of a second
            hard-coded value.

    Returns:
        The coherence score returned by ``CoherenceModel.get_coherence()``.
    """
    if processes is None:
        processes = FIXED_PARAMS["workers"]

    # formatted=False yields (topic_id, [(word, prob), ...]) pairs; only the
    # words are needed for coherence.
    topic_tuples = model.show_topics(
        num_topics=model.num_topics,
        num_words=10,
        formatted=False
    )
    topics = [[word for word, _ in words_probs] for _, words_probs in topic_tuples]

    cm = CoherenceModel(
        topics=topics,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
        processes=processes
    )
    return cm.get_coherence()

## Load & Preprocess All Subjects

In [5]:
# One dict per artefact, each keyed by subject name.
all_data, all_tokens, all_dictionaries, all_corpora = {}, {}, {}, {}

# Same order as the tuple returned by load_and_preprocess.
_stores = (all_data, all_tokens, all_dictionaries, all_corpora)

for subject in LIST_SUBJECT:
    print(f"Loading {subject}...")
    loaded = load_and_preprocess(subject)
    for store, artefact in zip(_stores, loaded):
        store[subject] = artefact
    df, tokens, dictionary, corpus = loaded
    print(f"  {subject}: {len(df):,} documents, {len(dictionary):,} terms in dictionary")

print(f"\nSubjects ready: {list(all_data.keys())}")

Loading math...
  math: 157,085 documents, 16,835 terms in dictionary

Subjects ready: ['math']


## Hyperparameter Tuning Grid Search

In [6]:
def _param_key(row) -> tuple:
    """Return the hashable identity of one parameter combination.

    ``alpha`` and ``eta`` are compared as strings because that is the form in
    which they are written to the results CSV.
    """
    return (int(row["num_topics"]), str(row["alpha"]), str(row["eta"]), int(row["passes"]))


def _result_row(subject, params, coherence=None, elapsed=None) -> dict:
    """Build one results-CSV row; ``coherence``/``elapsed`` stay None for a failed run."""
    return {
        "subject": subject,
        "num_topics": params["num_topics"],
        "alpha": str(params["alpha"]),
        "eta": str(params["eta"]),
        "passes": params["passes"],
        "chunksize": FIXED_PARAMS["chunksize"],
        "random_state": FIXED_PARAMS["random_state"],
        "workers": FIXED_PARAMS["workers"],
        "coherence_cv": coherence,
        "time_seconds": None if elapsed is None else round(elapsed, 1),
    }


# Resumable grid search: for every subject, train one model per parameter
# combination, score it, persist the results CSV after every run and keep the
# best-scoring model on disk.
total_runs = len(all_combos) * len(LIST_SUBJECT)
run_counter = 0

for subject in LIST_SUBJECT:
    corpus = all_corpora[subject]
    dictionary = all_dictionaries[subject]
    tokens = all_tokens[subject]
    n_docs = len(all_data[subject])

    results_csv_path = TUNNING_DIR / subject / "tuning_results.csv"
    best_model_path = TUNNING_DIR / subject / "best_model.pkl"
    best_coherence = -1.0

    results = []
    if results_csv_path.exists():
        # Read alpha/eta as text so resumed keys match str(params[...]) exactly.
        existing_df = pd.read_csv(results_csv_path, dtype={"alpha": str, "eta": str})
        results = existing_df.to_dict('records')
        # Failed runs have an empty coherence. If *all* previous runs failed,
        # .max() would be NaN and every later `coherence > best_coherence`
        # comparison would be False, so no model would ever be saved.
        valid_scores = existing_df["coherence_cv"].dropna()
        if not valid_scores.empty:
            best_coherence = float(valid_scores.max())
        if len(results) > 0:
            print(f"Resuming {subject}: {len(results)} previous runs found, best coherence so far: {best_coherence:.4f}")

    # NOTE(review): rows of failed runs count as completed too, so a failed
    # combination is not retried on resume -- delete its CSV row to retry it.
    completed_param_sets = {_param_key(r) for r in results}

    print(f"\n{'='*70}")
    print(f"Subject: {subject.upper()} ({n_docs:,} documents)")
    print(f"{'='*70}")

    for combo in all_combos:
        # Counted even when skipped so the [i/total] label stays stable across resumes.
        run_counter += 1
        params = dict(zip(keys, combo))

        if _param_key(params) in completed_param_sets:
            continue

        print(f"\n[{run_counter}/{total_runs}] {subject} | "
              f"k={params['num_topics']} α={params['alpha']} η={params['eta']} "
              f"passes={params['passes']}")

        model = None
        try:
            start_time = time.time()

            model = train_lda(corpus, dictionary, params)
            coherence = calculate_coherence(model, tokens, dictionary)

            elapsed = time.time() - start_time

            if coherence > best_coherence:
                # Write to a temp file and swap it in, so an interruption
                # mid-dump cannot leave a truncated best_model.pkl behind.
                tmp_path = best_model_path.with_name(best_model_path.name + ".tmp")
                with open(tmp_path, "wb") as f:
                    pickle.dump(model, f)
                os.replace(tmp_path, best_model_path)
                best_coherence = coherence
                print(f"  ⭐ NEW BEST | Coherence: {coherence:.4f} ({elapsed:.1f}s) → Saved to {best_model_path}")
            else:
                print(f"  ✓ Coherence: {coherence:.4f} ({elapsed:.1f}s) | Best: {best_coherence:.4f}")

            # Appended only after everything above succeeded, so one run can
            # never produce both a success row and a failure row.
            results.append(_result_row(subject, params, coherence, elapsed))

        except Exception as e:
            # Best effort: record the failure and carry on with the next combination.
            print(f"  ✗ ERROR: {e}")
            results.append(_result_row(subject, params))

        finally:
            # Release the model on both the success and the error path.
            del model
            gc.collect()

        # Persist after every run so an interrupted search can be resumed.
        pd.DataFrame(results).to_csv(results_csv_path, index=False)

    print(f"\n{'='*70}")
    print(f"✅ {subject.upper()} COMPLETE | Best coherence: {best_coherence:.4f}")
    print(f"Results saved to: {results_csv_path}")
    if best_model_path.exists():
        print(f"Best model saved to: {best_model_path}")
    else:
        print(f"Best model saved to: (none - no model file at {best_model_path})")
    print(f"{'='*70}")


Subject: MATH (157,085 documents)

[1/16] math | k=25 α=asymmetric η=auto passes=15
  ⭐ NEW BEST | Coherence: 0.5598 (123.8s) → Saved to tunning/math/best_model.pkl

[2/16] math | k=25 α=asymmetric η=0.01 passes=15
  ⭐ NEW BEST | Coherence: 0.5638 (140.7s) → Saved to tunning/math/best_model.pkl

[3/16] math | k=25 α=0.01 η=auto passes=15
  ⭐ NEW BEST | Coherence: 0.5694 (126.6s) → Saved to tunning/math/best_model.pkl

[4/16] math | k=25 α=0.01 η=0.01 passes=15
  ✓ Coherence: 0.5611 (142.7s) | Best: 0.5694

[5/16] math | k=50 α=asymmetric η=auto passes=15
  ✓ Coherence: 0.5562 (223.4s) | Best: 0.5694

[6/16] math | k=50 α=asymmetric η=0.01 passes=15
  ✓ Coherence: 0.5613 (227.2s) | Best: 0.5694

[7/16] math | k=50 α=0.01 η=auto passes=15
  ✓ Coherence: 0.5559 (160.0s) | Best: 0.5694

[8/16] math | k=50 α=0.01 η=0.01 passes=15
  ✓ Coherence: 0.5600 (191.5s) | Best: 0.5694

[9/16] math | k=75 α=asymmetric η=auto passes=15
  ✓ Coherence: 0.5472 (397.5s) | Best: 0.5694

[10/16] math | k=75

## Summary: Best Parameters Per Subject

In [7]:
# Report, per subject, the best-scoring configuration and then the five
# highest-coherence configurations, both read back from the results CSV.
banner = "=" * 90
rule = "─" * 50

print("\n" + banner)
print("FINAL RESULTS: Best Parameters Per Subject")
print(banner)

for subject in LIST_SUBJECT:
    results_csv_path = TUNNING_DIR / subject / "tuning_results.csv"
    if not results_csv_path.exists():
        print(f"\n{subject.upper()}: No results found")
        continue

    df = pd.read_csv(results_csv_path)
    # Failed runs are stored with an empty coherence; leave them out.
    df_valid = df.dropna(subset=["coherence_cv"])

    if df_valid.empty:
        print(f"\n{subject.upper()}: No valid results")
        continue

    best_row = df_valid.loc[df_valid["coherence_cv"].idxmax()]

    print(f"\n{rule}")
    print(f"  Subject: {subject.upper()}")
    print(f"  Total runs: {len(df_valid)}")
    print(f"  Best Coherence (C_v): {best_row['coherence_cv']:.4f}")
    print("  Parameters:")
    # Left-pad every name to the width of the longest one ("num_topics").
    for name in ("num_topics", "alpha", "eta", "passes", "chunksize"):
        print(f"    {name:<10} = {best_row[name]}")
    print(rule)

print("\n" + banner)
print("Top 5 configurations per subject:")
print(banner)

top_columns = ["num_topics", "alpha", "eta", "passes", "coherence_cv", "time_seconds"]

for subject in LIST_SUBJECT:
    results_csv_path = TUNNING_DIR / subject / "tuning_results.csv"
    if results_csv_path.exists():
        df = pd.read_csv(results_csv_path)
        df_valid = df.dropna(subset=["coherence_cv"]).sort_values("coherence_cv", ascending=False)

        print(f"\n{subject.upper()}:")
        print(df_valid[top_columns].head(5).to_string(index=False))
        print()
    print()


FINAL RESULTS: Best Parameters Per Subject

──────────────────────────────────────────────────
  Subject: MATH
  Total runs: 16
  Best Coherence (C_v): 0.5694
  Parameters:
    num_topics = 25
    alpha      = 0.01
    eta        = auto
    passes     = 15
    chunksize  = 2000
──────────────────────────────────────────────────

Top 5 configurations per subject:

MATH:
 num_topics      alpha  eta  passes  coherence_cv  time_seconds
         25       0.01 auto      15      0.569442         126.6
         25 asymmetric 0.01      15      0.563847         140.7
         50 asymmetric 0.01      15      0.561318         227.2
         25       0.01 0.01      15      0.561129         142.7
         50       0.01 0.01      15      0.559955         191.5

