In [None]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the project folder; the relative paths used
# later in the notebook ("./octis/", "./data/...") are resolved against it.
%cd /content/drive/MyDrive/ChiLit_Topic_Modeling

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/ChiLit_Topic_Modeling


In [None]:
!pip install git+https://github.com/tonazzog/OCTIS.git
!pip install optuna

Collecting git+https://github.com/tonazzog/OCTIS.git
  Cloning https://github.com/tonazzog/OCTIS.git to /tmp/pip-req-build-wjjk8v0n
  Running command git clone --filter=blob:none --quiet https://github.com/tonazzog/OCTIS.git /tmp/pip-req-build-wjjk8v0n
  Resolved https://github.com/tonazzog/OCTIS.git to commit 6e0bf318498aee88d43c7e4792648f902aff6b3d
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.4-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [

In [None]:
import octis
from octis.models.LDA import LDA
from octis.models.ProdLDA import ProdLDA
from octis.models.ETM import ETM
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.dataset.dataset import Dataset
import optuna
from typing import Tuple, List
import pandas as pd
import json
import os

In [None]:
# Folder (relative to the working directory) that receives the corpus files
# written by prepare_octis_corpus and from which the OCTIS Dataset is loaded.
octis_folder = "./octis/"

### Create OCTIS dataset

In [None]:
def prepare_octis_corpus(output_folder, docs):
    """Write the corpus and vocabulary files of a custom dataset.

    Two files are produced inside ``output_folder``:

    * ``corpus.tsv`` - one document per line, written exactly as given.
    * ``vocab.json`` - a sorted JSON list of the distinct whitespace-separated
      tokens that are purely alphabetic and longer than two characters.

    Args:
        output_folder: Destination directory. It is created (including any
            missing parents) when it does not exist yet.
        docs: Iterable of documents, each a string of whitespace-separated
            tokens.

    Returns:
        None. The function only writes files.
    """
    # Materialize once: the documents are traversed twice (corpus + vocabulary),
    # which would silently yield an empty vocabulary for a generator.
    docs = list(docs)

    # On a fresh checkout/runtime the folder may not exist yet, in which case
    # open() below would raise FileNotFoundError.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Write corpus.tsv: one document per line.
    with open(os.path.join(output_folder, "corpus.tsv"), "w", encoding="utf-8") as f:
        f.writelines(f"{doc}\n" for doc in docs)

    # Vocabulary: alphabetic tokens longer than two characters, sorted so the
    # file content is stable between runs.
    vocab = sorted(
        {w for doc in docs for w in doc.split() if w.isalpha() and len(w) > 2}
    )

    # Save vocab.json
    with open(os.path.join(output_folder, "vocab.json"), "w", encoding="utf-8") as f:
        json.dump(vocab, f)

In [None]:
def evaluate_metrics(output, topk=10, measure='c_npmi', texts=None):
    """Score a trained topic model on topic diversity and coherence.

    Args:
        output: Model output as returned by ``train_model``; it is passed
            unchanged to the ``score`` method of both metrics.
        topk: Number of top words per topic, forwarded to both metrics.
        measure: Coherence measure name forwarded to ``Coherence``.
        texts: Reference documents for the coherence metric. When omitted,
            the corpus of the notebook-level ``dataset`` is used.

    Returns:
        dict with the keys ``'coherence_score'`` and ``'diversity_score'``.
    """
    if texts is None:
        # Fall back to the global dataset loaded earlier in the notebook.
        texts = dataset.get_corpus()

    diversity_metric = TopicDiversity(topk=topk)
    diversity_score = diversity_metric.score(output)

    # topk is forwarded here as well; previously only the diversity metric
    # received it, so a non-default topk was ignored for coherence.
    coherence_metric = Coherence(texts=texts, topk=topk, measure=measure)
    coherence_score = coherence_metric.score(output)

    return {'coherence_score': coherence_score, 'diversity_score': diversity_score}

In [None]:
# Load the chunked ChiLit corpus, replace missing values with empty strings
# and keep only the rows whose 'tokens' column is non-empty.
df_chilit = (
    pd.read_csv("./data/ChiLit_Chunks.csv")
    .fillna("")
    .loc[lambda frame: frame['tokens'] != '']
)

In [None]:
# Preview the first rows to check the loaded columns.
df_chilit.head()

Unnamed: 0,book_id,chapter_num,paragraph_num,paragraph_text,tokens
0,stalky,6,1,CHAPTER 6. A LITTLE PREP. Easter term was but...,prep term month old major day boy contract dip...
1,stalky,6,2,“How was I to know that the Head ‘ud collar us...,head collar ghastly clothes try raise side iss...
2,stalky,6,3,“I’ve brought over an Indian paper that his mo...,bring indian paper mother send hefty piece wor...
3,stalky,6,4,"“Hullo, Stettson,” said Stalky, checking. “Is ...",check safe right let match mouth plummy trod a...
4,stalky,6,5,“All right. In a minute. But your cuts--your c...,right minute cut cut wound body bit scrimmage ...


In [None]:
# One document per row: the space-separated 'tokens' string of each chunk.
# Reading the column directly replaces the slow row-by-row iterrows() loop.
docs = df_chilit["tokens"].tolist()

# Write corpus.tsv and vocab.json into the OCTIS folder.
prepare_octis_corpus(octis_folder, docs)

#### Load OCTIS Dataset

In [None]:
# Build an OCTIS Dataset from the files previously written to octis_folder.
# `dataset` is used as a notebook-global by the training and metric code below.
dataset = Dataset()
dataset.load_custom_dataset_from_folder(octis_folder)

### Test LDA, ProdLDA and ETM models with default settings and 20 topics

#### LDA

In [None]:
# Baseline LDA run: 20 topics, 3 passes. The seed is fixed (same value as in
# the LDA tuning code further down) so that the run is repeatable.
model = LDA(num_topics=20, passes=3, random_state=42)
output = model.train_model(dataset)



In [None]:
# Print every LDA topic on its own line as its space-separated words.
for t in output['topics']:
  print(" ".join(t))

thing want give mother time ask other day put like
old time thing day room mother lady ask hand find
intendant warming day reply cottage leave time give pan other
time other cuckoo leave way old more find thing hand
reply horse ready sir man find time day mile party
weasel bailiff pigeon russian wood find hear thrush fox other
man time find day girl bring old give lady castle
fifth boy man other wicket time last old play run
king knight lord person city man great holy many hand
man hand time great other give place stand fall side
dragon cockatrice drakling town baby blacksmith time mayor eat thing
force army man regiment attack enemy battle order horse advance
thing hear old great way other time tree water find
find other day thing eye time room old turn door
time man find give other hear own day great seem
king queen royal person family palace lady day time other
boy house other man school time old young sir form
time give day speak own hear more poor find great
bairn puir thing man w

In [None]:
# Coherence (default measure 'c_npmi') and diversity of the LDA output.
scores = evaluate_metrics(output)
print(scores)

{'coherence_score': 0.01104965933116147, 'diversity_score': 0.51}


#### ProdLDA

In [None]:
# Baseline ProdLDA run: 20 topics, 3 epochs, use_partitions disabled.
# Note: this rebinds the notebook-global `model` / `output` from the LDA run.
model = ProdLDA(num_topics=20, use_partitions=False, num_epochs = 3)
output = model.train_model(dataset)

Epoch: [1/3]	Samples: [2748/8244]	Train Loss: 4289.605519923581	Time: 0:00:00.481388
Epoch: [2/3]	Samples: [5496/8244]	Train Loss: 4243.291211790393	Time: 0:00:00.367408
Epoch: [3/3]	Samples: [8244/8244]	Train Loss: 4196.506192003275	Time: 0:00:00.391181


In [None]:
# Print every ProdLDA topic on its own line as its space-separated words.
for t in output['topics']:
  print(" ".join(t))

run head air lie apple grow foot pull pocket small
gun captain approach soldier ship sail effect side observe continue
hand wait own life face attend smile attendant remind daughter
ready strong day water remain drive boat fight night join
whisper more bed ask moment stair bear whole try time
sea white sun dark catch water run shine stop green
nice lot else suggest idea sort red hate least right
full conduct church sign swing kindly race tremble justice sway
vers kinswoman cast inspiring bugbear shadow strive prostrate anglo geologist
door room last speak father voice open turn right gentleman
place man follow blood number discover forest same spear pass
crown matter happen remember guinea rich week chimney clever one
home write understand year glad girl happy day first mind
fall stone wild cut fly moon thick pull top piece
able visit keep hasten power letter thrush son other fox
own feeling give person bear subject sister child read family
fear fall danger great ship dead man king cas

In [None]:
# Coherence (default measure 'c_npmi') and diversity of the ProdLDA output.
scores = evaluate_metrics(output)
print(scores)

{'coherence_score': -0.02175888091450341, 'diversity_score': 0.925}


#### ETM

In [None]:
# Baseline ETM run: 20 topics, 3 epochs, use_partitions disabled, with word
# embeddings read from the given file.
# NOTE(review): device='gpu' is hard-coded - presumably this cell needs a GPU
# runtime; confirm how the library behaves when none is available.
model = ETM(num_topics=20, num_epochs = 3, use_partitions=False, device = 'gpu', embeddings_path='./data/chilit-19th-century-averaged-embeddings.txt')
output = model.train_model(dataset)

model: ETM(
  (t_drop): Dropout(p=0.5, inplace=False)
  (theta_act): ReLU()
  (rho): Linear(in_features=300, out_features=23066, bias=False)
  (alphas): Linear(in_features=300, out_features=20, bias=False)
  (q_theta): Sequential(
    (0): Linear(in_features=23066, out_features=800, bias=True)
    (1): ReLU()
    (2): Linear(in_features=800, out_features=800, bias=True)
    (3): ReLU()
  )
  (mu_q_theta): Linear(in_features=800, out_features=20, bias=True)
  (logsigma_q_theta): Linear(in_features=800, out_features=20, bias=True)
)
Epoch: 1 .. batch: 20/22 .. LR: 0.005 .. KL_theta: 0.69 .. Rec_loss: 3813.05 .. NELBO: 3813.74
****************************************************************************************************
Epoch----->1 .. LR: 0.005 .. KL_theta: 0.67 .. Rec_loss: 3795.2 .. NELBO: 3795.87
****************************************************************************************************
Epoch: 2 .. batch: 20/22 .. LR: 0.005 .. KL_theta: 0.27 .. Rec_loss: 3315.34 .. NELB

In [None]:
# Print every ETM topic on its own line as its space-separated words.
for t in output['topics']:
  print(" ".join(t))

time day man other give great find more thing hand
time day man other great give thing find old hand
time day man other give great old thing find more
time day other man give find hand great more thing
time day man other give thing find more great old
time day man other find give more great old thing
time day man other give great thing find hear old
time day man other thing give great more find hand
time day man other give thing great find hand old
time day man other give thing find great hand more
time day man other give thing find great old more
time day other man give thing great old find more
time day man other give find great thing hand more
time day man other give thing old find hand great
time day man other give great find thing old more
time day man other give great find thing more old
time day man other thing give find old great hand
time day man other give thing more great find hand
time man day other give find great thing hand old
time man day other find give thing more hand

In [None]:
# Coherence (default measure 'c_npmi') and diversity of the ETM output.
scores = evaluate_metrics(output)
print(scores)

{'coherence_score': -0.009471491443359455, 'diversity_score': 0.06}


### LDA multi-objective optimization

In [None]:
def objectiveLDA(trial) -> Tuple[float, float]:
    """Optuna objective: train one LDA model and score it on two metrics.

    Args:
        trial: Optuna trial used to sample the hyperparameters
            ``num_topics``, ``alpha``, ``eta``, ``passes`` and ``iterations``.

    Returns:
        ``(coherence, diversity)``: the 'c_npmi' coherence and the top-10
        topic diversity of the model trained on the notebook-level
        ``dataset``.
    """
    # Search space (the sampling order is kept stable on purpose).
    num_topics = trial.suggest_int("num_topics", 10, 50)
    alpha = trial.suggest_float("alpha", 0.01, 0.2)
    eta = trial.suggest_float("eta", 0.01, 0.2)
    passes = trial.suggest_int("passes", 5, 30)
    iterations = trial.suggest_int("iterations", 50, 200)

    # Train LDA model with a fixed seed so trials differ only by their
    # hyperparameters.
    model = LDA(
        num_topics=num_topics,
        alpha=alpha,
        eta=eta,
        passes=passes,
        iterations=iterations,
        random_state=42,
    )
    output = model.train_model(dataset)

    # Reuse the shared metric helper instead of duplicating the metric setup.
    scores = evaluate_metrics(output, topk=10, measure='c_npmi')

    # Both values are maximized by the study.
    return scores['coherence_score'], scores['diversity_score']

# Run the multi-objective search (maximize coherence and diversity).
# NSGA-II is the sampler Optuna selects by default for multi-objective
# studies; it is created explicitly here only to fix its seed, so that the
# sequence of suggested hyperparameters is repeatable.
study = optuna.create_study(
    directions=["maximize", "maximize"],
    sampler=optuna.samplers.NSGAIISampler(seed=42),
)
study.optimize(objectiveLDA, n_trials=50)

[I 2025-08-07 14:06:38,745] A new study created in memory with name: no-name-057d2d35-35e2-4688-b797-3cd80d1dd0e3
[I 2025-08-07 14:11:05,091] Trial 0 finished with values: [-0.052136537968792196, 0.8193548387096774] and parameters: {'num_topics': 31, 'alpha': 0.07210061858087007, 'eta': 0.059849185952258105, 'passes': 27, 'iterations': 116}.
[I 2025-08-07 14:15:09,293] Trial 1 finished with values: [-0.11238423449249609, 0.8125] and parameters: {'num_topics': 40, 'alpha': 0.17615896282492316, 'eta': 0.08460935932900758, 'passes': 25, 'iterations': 75}.
[I 2025-08-07 14:16:28,822] Trial 2 finished with values: [-0.047842842104858556, 0.6815789473684211] and parameters: {'num_topics': 38, 'alpha': 0.16022161078623326, 'eta': 0.1181562743904139, 'passes': 7, 'iterations': 55}.
[I 2025-08-07 14:17:52,506] Trial 3 finished with values: [-0.07657450649087337, 0.7545454545454545] and parameters: {'num_topics': 22, 'alpha': 0.1473831026003162, 'eta': 0.07491147937729875, 'passes': 10, 'iterati

In [None]:
def train_final_model(params):
    """Train an LDA model with the selected hyperparameters and report metrics.

    Args:
        params: Mapping with the keys ``num_topics``, ``alpha``, ``eta``,
            ``passes`` and ``iterations``.

    Returns:
        ``(output, coherence, diversity)`` where ``output`` is the value
        returned by ``train_model`` (the training output, not the model
        object), ``coherence`` is the 'c_npmi' coherence and ``diversity``
        the top-10 topic diversity.

    Raises:
        KeyError: if one of the expected keys is missing from ``params``.
    """
    print(f"\nTraining final model with parameters: {params}")

    model = LDA(
        num_topics=params["num_topics"],
        alpha=params["alpha"],
        eta=params["eta"],
        passes=params["passes"],
        iterations=params["iterations"],
        random_state=42,
    )
    output = model.train_model(dataset)

    # Reuse the shared metric helper instead of duplicating the metric setup.
    scores = evaluate_metrics(output, topk=10, measure='c_npmi')
    coherence = scores['coherence_score']
    diversity = scores['diversity_score']

    print("Final model metrics:")
    print(f"  Coherence: {coherence:.4f}")
    print(f"  Diversity: {diversity:.4f}")

    return output, coherence, diversity

In [1]:
# Requires `study` from the optimization cell to exist in the current kernel
# session; on a fresh kernel this cell fails with NameError.
if study.best_trials:
    # Trials returned by the multi-objective study as its best trials.
    pareto_trials = study.best_trials

    # For demo purposes the first of these trials is taken as-is; no balancing
    # between coherence and diversity is performed here.
    selected_params = pareto_trials[0].params
    # train_final_model returns the train_model output first, so `final_model`
    # holds that output rather than a model object.
    final_model, final_coherence, final_diversity = train_final_model(selected_params)

    print(f"\nFinal model trained successfully!")

NameError: name 'study' is not defined