# DTM Hyperparameter Tuning (tomotopy)

Grid-search hyperparameter tuning for tomotopy's `DTModel`, run separately for each subject (cs, math, physics).
Each configuration is scored by C_v coherence, and a subject's best model is saved as soon as a new best is found.

## Imports & Configuration

In [1]:
import os
import gc
import ast
import time
import pandas as pd
import numpy as np
import tomotopy as tp
from pathlib import Path
from itertools import product
from tqdm.notebook import tqdm
import warnings

# Silence FutureWarning noise from the libraries imported above.
warnings.filterwarnings("ignore", category=FutureWarning)
# `tp.isa` is the SIMD instruction set tomotopy runs with (e.g. "avx2"), not
# the package version, so report the real version and show the ISA separately.
print(f"tomotopy version: {tp.__version__} (SIMD: {tp.isa})")

tomotopy version: avx2


In [2]:
# Subjects tuned independently; each one gets its own output folder.
LIST_SUBJECT = ["cs", "math", "physics"]

# Dataset version to read and the root folder of the input data.
VERSION = "v1"
BASE_DIR = Path("../../dataset")

# Root folder for tuning artefacts (results CSV and best model per subject).
TUNNING_DIR = Path("./tunning")

# Create every per-subject output folder up front so later writes cannot
# fail on a missing directory.
for subject in LIST_SUBJECT:
    subject_dir = TUNNING_DIR / subject
    subject_dir.mkdir(parents=True, exist_ok=True)

print(f"Subjects: {LIST_SUBJECT}")
print(f"Output directory: {TUNNING_DIR}")

Subjects: ['cs', 'math', 'physics']
Output directory: tunning


## Hyperparameter Grid

In [3]:
# Number of iterations requested per individual training call.
ITERATION_STEP = 50

# Values explored by the grid search. The key order fixes the position of
# each value inside every combination tuple.
PARAM_GRID = {
    "k": [25, 50, 75, 150],
    "alpha_var": [0.01, 0.1],
    "phi_var": [0.01, 0.1],
    "train_iter": [500],
}

# Settings held constant for every run.
FIXED_PARAMS = dict(
    eta_var=0.1,
    lr_a=0.01,
    lr_b=0.1,
    lr_c=0.55,
    min_cf=5,
    min_df=3,
    seed=42,
)

# Cartesian product of all tunable values, one tuple per configuration.
keys = list(PARAM_GRID)
values = [PARAM_GRID[name] for name in keys]
all_combos = list(product(*values))

n_combos = len(all_combos)
print(f"Tunable parameters: {keys}")
print(f"Total parameter combinations: {n_combos}")
print(f"Total runs (combinations x subjects): {n_combos * len(LIST_SUBJECT)}")

Tunable parameters: ['k', 'alpha_var', 'phi_var', 'train_iter']
Total parameter combinations: 16
Total runs (combinations x subjects): 48


## Helper Functions

In [4]:
def load_and_preprocess(subject: str):
    """Load one subject's bag-of-words CSV and derive per-document time steps.

    Reads ``BASE_DIR/<subject>/bow/<VERSION>.csv``. The file must provide a
    ``submitted_date`` column and a ``text`` column whose cells are Python
    literals (presumably token lists -- they are parsed with
    ``ast.literal_eval`` and later handed to the model as documents).

    Every distinct submission year becomes one time step. Steps are numbered
    consecutively from 0 in ascending year order, so gaps between years do
    not produce empty steps.

    Returns:
        ``(df, processed_docs, doc_timesteps, num_time_steps, all_years)``
        where ``df`` is the loaded frame with added ``year`` and ``timestep``
        columns, ``processed_docs`` and ``doc_timesteps`` are row-aligned
        lists, and ``all_years`` is the sorted list of distinct years.
    """
    csv_path = BASE_DIR / subject / "bow" / f"{VERSION}.csv"
    df = pd.read_csv(csv_path)

    # Parse the date column only when pandas did not already type it.
    if not pd.api.types.is_datetime64_any_dtype(df["submitted_date"]):
        df["submitted_date"] = pd.to_datetime(df["submitted_date"])
    df["year"] = df["submitted_date"].dt.year

    # Map each distinct year to its rank among the sorted years.
    all_years = sorted(df["year"].unique())
    timestep_of_year = {yr: step for step, yr in enumerate(all_years)}
    df["timestep"] = df["year"].map(timestep_of_year)

    # The text column stores serialized Python literals; turn them back into objects.
    processed_docs = [ast.literal_eval(raw) for raw in df["text"].tolist()]
    doc_timesteps = df["timestep"].tolist()

    return df, processed_docs, doc_timesteps, len(all_years), all_years


def build_and_train_dtm(processed_docs, doc_timesteps, num_time_steps, params):
    """Build a tomotopy ``DTModel``, add the corpus and train it.

    Args:
        processed_docs: One token sequence per document. Empty documents
            are skipped.
        doc_timesteps: Time-step index of each document, aligned with
            ``processed_docs``.
        num_time_steps: Number of time slices, passed to the model as ``t``.
        params: Tunable hyperparameters; must contain ``"k"``,
            ``"alpha_var"``, ``"phi_var"`` and ``"train_iter"``. All other
            settings come from the module-level ``FIXED_PARAMS``.

    Returns:
        The trained ``tp.DTModel``.

    Raises:
        ValueError: If ``processed_docs`` and ``doc_timesteps`` differ in
            length, since documents could then be paired with wrong time steps.
    """
    if len(processed_docs) != len(doc_timesteps):
        raise ValueError(
            f"processed_docs ({len(processed_docs)}) and doc_timesteps "
            f"({len(doc_timesteps)}) must have the same length"
        )

    dt_model = tp.DTModel(
        t=num_time_steps,
        k=params["k"],
        alpha_var=params["alpha_var"],
        eta_var=FIXED_PARAMS["eta_var"],
        phi_var=params["phi_var"],
        lr_a=FIXED_PARAMS["lr_a"],
        lr_b=FIXED_PARAMS["lr_b"],
        lr_c=FIXED_PARAMS["lr_c"],
        min_cf=FIXED_PARAMS["min_cf"],
        min_df=FIXED_PARAMS["min_df"],
        seed=FIXED_PARAMS["seed"],
    )

    for tokens, timepoint in zip(processed_docs, doc_timesteps):
        if tokens:
            dt_model.add_doc(tokens, timepoint=timepoint)

    # Train in chunks of ITERATION_STEP, shortening the last chunk so the
    # total is exactly train_iter even when it is not a multiple of the step
    # (previously e.g. 120 requested iterations ran as 3 x 50 = 150).
    # NOTE(review): tomotopy reportedly warns that results may differ between
    # runs despite a fixed seed unless workers=1 -- confirm whether exact
    # reproducibility matters for this comparison.
    train_iter = params["train_iter"]
    for done in range(0, train_iter, ITERATION_STEP):
        dt_model.train(min(ITERATION_STEP, train_iter - done))

    return dt_model


def calculate_coherence(model) -> float:
    """Return the model's C_v coherence from tomotopy's built-in scorer.

    The scorer is configured with ``top_n=10``.
    """
    scorer = tp.coherence.Coherence(model, coherence="c_v", top_n=10)
    score = scorer.get_score()
    return score

## Load & Preprocess All Subjects

In [5]:
# Per-subject caches, each keyed by subject name, so the grid search can
# reuse the parsed corpora without re-reading the CSV files.
all_data, all_processed_docs, all_doc_timesteps = {}, {}, {}
all_num_time_steps, all_years = {}, {}

for subject in LIST_SUBJECT:
    print(f"Loading {subject}...")
    (
        df,
        processed_docs,
        doc_timesteps,
        num_time_steps,
        years,
    ) = load_and_preprocess(subject)

    all_data[subject] = df
    all_processed_docs[subject] = processed_docs
    all_doc_timesteps[subject] = doc_timesteps
    all_num_time_steps[subject] = num_time_steps
    all_years[subject] = years

    # One summary line per subject: corpus size and covered year range.
    print(f"  {subject}: {len(df):,} documents, {num_time_steps} time steps ({years[0]}-{years[-1]})")

print(f"\nSubjects ready: {list(all_data.keys())}")

Loading cs...
  cs: 165,756 documents, 26 time steps (2000-2025)
Loading math...
  math: 157,085 documents, 26 time steps (2000-2025)
Loading physics...
  physics: 146,311 documents, 26 time steps (2000-2025)

Subjects ready: ['cs', 'math', 'physics']


## Hyperparameter Tuning Grid Search

In [6]:
def _make_result_row(subject, params, coherence=None, ll_per_word=None, time_seconds=None):
    """Build one row of the tuning-results CSV for a single run.

    The three metric arguments default to ``None``, which is how a failed
    run is recorded.
    """
    return {
        "subject": subject,
        "k": params["k"],
        "alpha_var": params["alpha_var"],
        "phi_var": params["phi_var"],
        "train_iter": params["train_iter"],
        "eta_var": FIXED_PARAMS["eta_var"],
        "lr_a": FIXED_PARAMS["lr_a"],
        "lr_b": FIXED_PARAMS["lr_b"],
        "lr_c": FIXED_PARAMS["lr_c"],
        "min_cf": FIXED_PARAMS["min_cf"],
        "min_df": FIXED_PARAMS["min_df"],
        "seed": FIXED_PARAMS["seed"],
        "coherence_cv": coherence,
        "ll_per_word": ll_per_word,
        "time_seconds": time_seconds,
    }


def _param_key(row):
    """Return the tuple of tunable values that identifies one configuration."""
    return (row["k"], row["alpha_var"], row["phi_var"], row["train_iter"])


total_combos = len(all_combos)

for subject in LIST_SUBJECT:
    processed_docs = all_processed_docs[subject]
    doc_timesteps = all_doc_timesteps[subject]
    num_time_steps = all_num_time_steps[subject]
    n_docs = len(all_data[subject])

    results_csv_path = TUNNING_DIR / subject / "tuning_results.csv"
    best_model_path = TUNNING_DIR / subject / "best_model.bin"
    best_coherence = -1.0

    # Resume support: reload earlier rows so finished combinations are skipped.
    existing_results = []
    if results_csv_path.exists():
        existing_df = pd.read_csv(results_csv_path)
        existing_results = existing_df.to_dict("records")
        if existing_results:
            prior_best = existing_df["coherence_cv"].max()
            # max() is NaN when every earlier run failed. Keeping NaN as the
            # running best would make every `coherence > best` test False,
            # so no model would ever be saved; keep the -1.0 sentinel instead.
            if pd.notna(prior_best):
                best_coherence = float(prior_best)
                best_text = f"{best_coherence:.4f}"
            else:
                best_text = "n/a (no successful runs)"
            print(f"Resuming {subject}: {len(existing_results)} previous runs found, best coherence so far: {best_text}")

    results = list(existing_results)

    # Every recorded combination counts as done, including failed ones, so
    # failed runs are not retried on resume.
    completed_param_sets = {_param_key(r) for r in existing_results}

    print(f"\n{'='*70}")
    print(f"Subject: {subject.upper()} ({n_docs:,} documents, {num_time_steps} time steps)")
    print(f"{'='*70}")

    for idx, combo in enumerate(all_combos, 1):
        params = dict(zip(keys, combo))

        if _param_key(params) in completed_param_sets:
            continue

        print(f"\n[{idx}/{total_combos}] {subject} | "
              f"k={params['k']} α_var={params['alpha_var']} "
              f"φ_var={params['phi_var']} iter={params['train_iter']}")

        model = None
        try:
            start_time = time.time()

            model = build_and_train_dtm(
                processed_docs, doc_timesteps, num_time_steps, params
            )
            coherence = calculate_coherence(model)

            elapsed = time.time() - start_time

            result_row = _make_result_row(
                subject,
                params,
                coherence=coherence,
                ll_per_word=model.ll_per_word,
                time_seconds=round(elapsed, 1),
            )

            if coherence > best_coherence:
                # Save first and only then advance the running best, so the
                # best score always corresponds to a model that is on disk.
                model.save(str(best_model_path))
                best_coherence = coherence
                print(f"  ⭐ NEW BEST | Coherence: {coherence:.4f} ({elapsed:.1f}s) → Saved to {best_model_path}")
            else:
                print(f"  ✓ Coherence: {coherence:.4f} ({elapsed:.1f}s) | Best: {best_coherence:.4f}")

        except Exception as e:
            # Keep the sweep going: record the combination as failed.
            print(f"  ✗ ERROR: {e}")
            result_row = _make_result_row(subject, params)

        finally:
            # Release the model even when scoring or saving failed.
            model = None
            gc.collect()

        # Exactly one row per combination, appended after the try block so a
        # late failure cannot leave both a success and a failure row.
        results.append(result_row)

        # Save results incrementally
        pd.DataFrame(results).to_csv(results_csv_path, index=False)

    print(f"\n{'='*70}")
    print(f"✅ {subject.upper()} COMPLETE | Best coherence: {best_coherence:.4f}")
    print(f"Results saved to: {results_csv_path}")
    print(f"Best model saved to: {best_model_path}")
    print(f"{'='*70}")

Resuming cs: 13 previous runs found, best coherence so far: 0.5036

Subject: CS (165,756 documents, 26 time steps)

[14/16] cs | k=150 α_var=0.01 φ_var=0.1 iter=500


  dt_model.train(ITERATION_STEP)


  ✓ Coherence: 0.4673 (1504.0s) | Best: 0.5036

[15/16] cs | k=150 α_var=0.1 φ_var=0.01 iter=500
  ✓ Coherence: 0.4449 (1750.5s) | Best: 0.5036

[16/16] cs | k=150 α_var=0.1 φ_var=0.1 iter=500
  ✓ Coherence: 0.4746 (1692.8s) | Best: 0.5036

✅ CS COMPLETE | Best coherence: 0.5036
Results saved to: tunning/cs/tuning_results.csv
Best model saved to: tunning/cs/best_model.bin

Subject: MATH (157,085 documents, 26 time steps)

[1/16] math | k=25 α_var=0.01 φ_var=0.01 iter=500
  ⭐ NEW BEST | Coherence: 0.4316 (276.7s) → Saved to tunning/math/best_model.bin

[2/16] math | k=25 α_var=0.01 φ_var=0.1 iter=500
  ✓ Coherence: 0.4254 (287.4s) | Best: 0.4316

[3/16] math | k=25 α_var=0.1 φ_var=0.01 iter=500
  ⭐ NEW BEST | Coherence: 0.4338 (278.4s) → Saved to tunning/math/best_model.bin

[4/16] math | k=25 α_var=0.1 φ_var=0.1 iter=500
  ✓ Coherence: 0.4250 (292.3s) | Best: 0.4338

[5/16] math | k=50 α_var=0.01 φ_var=0.01 iter=500
  ✓ Coherence: 0.4306 (445.5s) | Best: 0.4338

[6/16] math | k=50 α_va

## Summary: Best Parameters Per Subject

In [7]:
wide_rule = "=" * 90
thin_rule = "─" * 50

# Part 1: the single best configuration of every subject.
print("\n" + wide_rule)
print("FINAL RESULTS: Best Parameters Per Subject")
print(wide_rule)

for subject in LIST_SUBJECT:
    results_csv_path = TUNNING_DIR / subject / "tuning_results.csv"
    if not results_csv_path.exists():
        print(f"\n{subject.upper()}: No results found")
        continue

    df = pd.read_csv(results_csv_path)
    # Failed runs are stored without a coherence value; leave them out.
    df_valid = df.dropna(subset=["coherence_cv"])
    if df_valid.empty:
        print(f"\n{subject.upper()}: No valid results")
        continue

    best_row = df_valid.loc[df_valid["coherence_cv"].idxmax()]

    report_lines = [
        f"\n{thin_rule}",
        f"  Subject: {subject.upper()}",
        f"  Total runs: {len(df_valid)}",
        f"  Best Coherence (C_v): {best_row['coherence_cv']:.4f}",
        f"  Log-likelihood/word: {best_row['ll_per_word']:.4f}",
        "  Parameters:",
        f"    k          = {int(best_row['k'])}",
        f"    alpha_var  = {best_row['alpha_var']}",
        f"    phi_var    = {best_row['phi_var']}",
        f"    train_iter = {int(best_row['train_iter'])}",
        f"    min_cf     = {int(best_row['min_cf'])}",
        f"    min_df     = {int(best_row['min_df'])}",
        thin_rule,
    ]
    print("\n".join(report_lines))

# Part 2: the five highest-coherence configurations of every subject.
print("\n" + wide_rule)
print("Top 5 configurations per subject:")
print(wide_rule)

top_columns = ["k", "alpha_var", "phi_var", "train_iter", "coherence_cv", "ll_per_word", "time_seconds"]

for subject in LIST_SUBJECT:
    results_csv_path = TUNNING_DIR / subject / "tuning_results.csv"
    if not results_csv_path.exists():
        continue

    df = pd.read_csv(results_csv_path)
    df_valid = df.dropna(subset=["coherence_cv"]).sort_values("coherence_cv", ascending=False)

    print(f"\n{subject.upper()}:")
    print(df_valid[top_columns].head(5).to_string(index=False))
    print()


FINAL RESULTS: Best Parameters Per Subject

──────────────────────────────────────────────────
  Subject: CS
  Total runs: 16
  Best Coherence (C_v): 0.5036
  Log-likelihood/word: 5.3020
  Parameters:
    k          = 150
    alpha_var  = 0.01
    phi_var    = 0.01
    train_iter = 500
    min_cf     = 5
    min_df     = 3
──────────────────────────────────────────────────

──────────────────────────────────────────────────
  Subject: MATH
  Total runs: 16
  Best Coherence (C_v): 0.4549
  Log-likelihood/word: 11.1159
  Parameters:
    k          = 150
    alpha_var  = 0.1
    phi_var    = 0.01
    train_iter = 500
    min_cf     = 5
    min_df     = 3
──────────────────────────────────────────────────

──────────────────────────────────────────────────
  Subject: PHYSICS
  Total runs: 16
  Best Coherence (C_v): 0.4228
  Log-likelihood/word: -10.5394
  Parameters:
    k          = 150
    alpha_var  = 0.01
    phi_var    = 0.1
    train_iter = 500
    min_cf     = 5
    min_df     = 3
