## Topic modeling on ChiLit corpus with BERTopic and OPTUNA optimization

In [20]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/ChiLit_Topic_Modeling

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/ChiLit_Topic_Modeling


In [None]:
!pip install git+https://github.com/tonazzog/OCTIS.git

In [22]:
!pip install bertopic
!pip install umap-learn
!pip install hdbscan
!pip install optuna
!pip install gensim



In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers import SentenceTransformer
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence
import optuna
from umap import UMAP
from hdbscan import HDBSCAN
from gensim.corpora import Dictionary
import pandas as pd
from typing import Tuple, List
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

ImportError: cannot import name 'BERTopic' from 'bertopic' (unknown location)

In [None]:
optuna_folder = "./optuna_200/"

In [None]:
df_chilit = pd.read_csv("./data/ChiLit_Chunks_200.csv")
df_chilit = df_chilit.fillna("")
sentences = df_chilit['tokens'].to_list()

In [None]:
df_chilit.head()

Unnamed: 0,book_id,chapter_num,paragraph_num,paragraph_text,tokens
0,alice,1,1,CHAPTER I. Down the Rabbit-Hole Alice was begi...,hole begin tired sit sister bank peep book sis...
1,alice,1,2,"Alice was not a bit hurt, and she jumped up on...",bit hurt jump foot moment dark long passage wh...
2,alice,2,1,‘Curiouser and curiouser!’ cried Alice (she wa...,curiouser curiouser cry surprise moment forget...
3,alice,2,2,‘How doth the little crocodile Impr...,crocodile improve shine tail pour water golden...
4,alice,3,1,CHAPTER III. A Caucus-Race and a Long Tale The...,race long tale queer party assemble bank bird ...


### Remove noisy tokens

In [23]:
df_books = df_chilit.groupby('book_id', as_index=False).agg({
    'tokens': lambda x: ' '.join(x)
})

In [None]:
df_books.head()

Unnamed: 0,book_id,tokens
0,alice,hole begin tired sit sister bank peep book sis...
1,alone,close sultry day hot dog day open country dus...
2,amulet,chapter psammead child spend summer holiday wh...
3,beauty,chapter early home first place remember large ...
4,brass,receive commission day week week half pull wat...


In [24]:
docs = df_books['tokens'].to_list()

# Create TF-IDF matrix
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

# Get feature (term) names
terms = vectorizer.get_feature_names_out()

# Convert to a DataFrame for easy viewing
tfidf_df = pd.DataFrame(X.toarray(), columns=terms)
print(tfidf_df)

    aback     abaft  aband   abandon  abandonment  abase  abasement     abash  \
0     0.0  0.000000    0.0  0.000000     0.000000    0.0        0.0  0.000000   
1     0.0  0.000000    0.0  0.000000     0.000000    0.0        0.0  0.000000   
2     0.0  0.000000    0.0  0.000000     0.000000    0.0        0.0  0.000000   
3     0.0  0.000000    0.0  0.000000     0.000000    0.0        0.0  0.000000   
4     0.0  0.000000    0.0  0.008608     0.000000    0.0        0.0  0.003657   
..    ...       ...    ...       ...          ...    ...        ...       ...   
66    0.0  0.004483    0.0  0.000000     0.000000    0.0        0.0  0.000000   
67    0.0  0.000000    0.0  0.008460     0.003863    0.0        0.0  0.000000   
68    0.0  0.000000    0.0  0.003145     0.000000    0.0        0.0  0.000000   
69    0.0  0.000000    0.0  0.001089     0.000000    0.0        0.0  0.001851   
70    0.0  0.000000    0.0  0.001891     0.000000    0.0        0.0  0.003213   

       abate  abatement  ..

In [25]:
idf_values = vectorizer.idf_
idf_df = pd.DataFrame({
    'term': terms,
    'idf': idf_values
}).sort_values(by='idf', ascending=False)

In [26]:
idf_df[-50:]

Unnamed: 0,term,idf
13486,name,1.04256
1560,become,1.04256
2344,break,1.04256
1273,bad,1.04256
12079,live,1.04256
11712,last,1.04256
18097,set,1.04256
11394,keep,1.04256
21141,tree,1.04256
23064,way,1.04256


In [27]:
filtered_terms = idf_df[idf_df['idf'] < 1.05]['term'].to_list()

In [28]:
len(filtered_terms)

60

### Remove other not interesting tokens

In [29]:
filter_tokens = ['thing','other','day','time','give','old','great','man','way','hand','boy','find','seem']

In [30]:
filter_sent = []
for sentence in sentences:
  words = ''
  for word in sentence.split():
    if word not in filter_tokens + filtered_terms:
      words += word + ' '
  filter_sent.append(words)


In [31]:
sentences = filter_sent

In [32]:
len(sentences)

5843

In [33]:
with open("./data/BERTopic_Documents.txt", 'w') as file:
    file.write('\n'.join(sentences))

### Pre-compute embedding

In [34]:
print("Loading data...")
documents = sentences

# Pre-compute embeddings to save time during optimization
print("Computing embeddings...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(documents, show_progress_bar=True)

Loading data...
Computing embeddings...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/183 [00:00<?, ?it/s]

### Trial and error parameter setting

In [None]:
# Set a random_state in UMAP to prevent any stochastic behavior (at the expense of performance)
umap_model = UMAP(
    n_neighbors=10,
    min_dist=0.01,
    n_components=7,
    random_state=42
)

# Initialize BERTopic
topic_model = BERTopic(
    language="english",
    calculate_probabilities=True,
    verbose=True,
    umap_model=umap_model, nr_topics= 18 , top_n_words=7, min_topic_size=8  )

# Fit and transform
topics, probs = topic_model.fit_transform(sentences)


2025-08-19 17:06:04,641 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/183 [00:00<?, ?it/s]

2025-08-19 17:06:23,648 - BERTopic - Embedding - Completed ✓
2025-08-19 17:06:23,649 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-08-19 17:06:35,981 - BERTopic - Dimensionality - Completed ✓
2025-08-19 17:06:35,982 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-08-19 17:06:37,711 - BERTopic - Cluster - Completed ✓
2025-08-19 17:06:37,712 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-08-19 17:06:38,555 - BERTopic - Representation - Completed ✓
2025-08-19 17:06:38,556 - BERTopic - Topic reduction - Reducing number of topics
2025-08-19 17:06:38,580 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-08-19 17:06:39,453 - BERTopic - Representation - Completed ✓
2025-08-19 17:06:39,456 - BERTopic - Topic reduction - Reduced number of topics from 87 to 18


In [None]:
model_dir = "./bertopic_manual"
# Serialize with safetensors
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
topic_model.save(model_dir, serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)

### OPTUNA multi-objective optimization

In [None]:
def create_bertopic_model(trial, documents):
    """Create BERTopic model with trial-suggested parameters"""

    # UMAP parameters
    umap_n_neighbors = trial.suggest_int('umap_n_neighbors', 5, 15)
    umap_n_components = trial.suggest_int('umap_n_components', 2, 10)
    umap_min_dist = trial.suggest_float('umap_min_dist', 0.0, 0.2)
    umap_metric = trial.suggest_categorical('umap_metric', ['euclidean', 'manhattan', 'cosine'])

    # HDBSCAN parameters
    hdbscan_min_cluster_size = trial.suggest_int('hdbscan_min_cluster_size', 5, 50)
    hdbscan_min_samples = trial.suggest_int('hdbscan_min_samples', 1, 10)
    hdbscan_metric = trial.suggest_categorical('hdbscan_metric', ['euclidean', 'manhattan'])
    hdbscan_cluster_selection_method = trial.suggest_categorical(
        'hdbscan_cluster_selection_method', ['eom', 'leaf']
    )

    # BERTopic parameters
    nr_topics = trial.suggest_int('nr_topics', 10, 50)

    # Create components
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

    umap_model = UMAP(
        n_neighbors=umap_n_neighbors,
        n_components=umap_n_components,
        min_dist=umap_min_dist,
        metric=umap_metric,
        random_state=42
    )

    hdbscan_model = HDBSCAN(
        min_cluster_size=hdbscan_min_cluster_size,
        min_samples=hdbscan_min_samples,
        metric=hdbscan_metric,
        cluster_selection_method=hdbscan_cluster_selection_method
    )


    # Create BERTopic model
    topic_model = BERTopic(
        #nr_topics=nr_topics,
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        verbose=False
    )

    return topic_model

def evaluate_model(topic_model, documents, embeddings) -> Tuple[float, float]:
    """Evaluate the topic model using multiple metrics"""
    try:
        # Fit the model
        topics, _ = topic_model.fit_transform(documents, embeddings)

        # Get topic info
        topic_info = topic_model.get_topic_info()

        topics = []
        for key, value in topic_model.get_topics().items():
          topic = [v[0] for v in value]
          topics.append(topic)

        texts = [s.split() for s in sentences]

        output = {}
        output['topics'] = topics

        coherence_metrics = Coherence(texts=texts, #list of our documents
                            measure='c_npmi')
        coherence = coherence_metrics.score(output)

        diverisity_metric = TopicDiversity(topk=10)
        diversity = diverisity_metric.score(output)

        return coherence, diversity

    except Exception as e:
        print(f"Error in evaluation: {e}")
        return -1000, -1000

def objective(trial) -> Tuple[float, float]:
    """Objective function for Optuna optimization"""

    # Create model with trial parameters
    topic_model = create_bertopic_model(trial, documents)

    # Evaluate model
    cohrence, diversity = evaluate_model(topic_model, documents, embeddings)

    return cohrence, diversity



In [None]:
optuna.delete_study(
    storage=f"sqlite:///{optuna_folder}BERTopic_Study.db",
    study_name="BERTopic_Study"
)

In [None]:
# Main optimization code

# Create study
print("Starting optimization...")
study = optuna.create_study(
    directions=['maximize', 'maximize'],
    storage=f"sqlite:///{optuna_folder}BERTopic_Study.db",
    study_name="BERTopic_Study"
)

# Optimize
study.optimize(objective, n_trials=100)


[I 2025-08-19 17:33:21,274] A new study created in RDB with name: BERTopic_Study


Starting optimization...


[I 2025-08-19 17:34:00,196] Trial 0 finished with values: [-0.008262954505367686, 0.5676328502415459] and parameters: {'umap_n_neighbors': 11, 'umap_n_components': 4, 'umap_min_dist': 0.0470072746289125, 'umap_metric': 'manhattan', 'hdbscan_min_cluster_size': 8, 'hdbscan_min_samples': 2, 'hdbscan_metric': 'manhattan', 'hdbscan_cluster_selection_method': 'leaf', 'nr_topics': 48}.
[I 2025-08-19 17:34:43,057] Trial 1 finished with values: [0.07017730930647059, 0.609375] and parameters: {'umap_n_neighbors': 13, 'umap_n_components': 7, 'umap_min_dist': 0.07939330693118757, 'umap_metric': 'cosine', 'hdbscan_min_cluster_size': 22, 'hdbscan_min_samples': 2, 'hdbscan_metric': 'euclidean', 'hdbscan_cluster_selection_method': 'leaf', 'nr_topics': 47}.
[I 2025-08-19 17:35:14,930] Trial 2 finished with values: [0.08847918429428404, 0.6692307692307692] and parameters: {'umap_n_neighbors': 10, 'umap_n_components': 8, 'umap_min_dist': 0.07548277963135062, 'umap_metric': 'cosine', 'hdbscan_min_cluster_

In [56]:
def train_final_BERTopic_model(params, embeddings):
    """Create BERTopic model with trial-suggested parameters"""

    # UMAP parameters
    umap_n_neighbors = params['umap_n_neighbors']
    umap_n_components = params['umap_n_components']
    umap_min_dist = params['umap_min_dist']
    umap_metric = params['umap_metric']

    # HDBSCAN parameters
    hdbscan_min_cluster_size = params['hdbscan_min_cluster_size']
    hdbscan_min_samples = params['hdbscan_min_samples']
    hdbscan_metric = params['hdbscan_metric']
    hdbscan_cluster_selection_method = params['hdbscan_cluster_selection_method']

    # BERTopic parameters
    nr_topics = params['nr_topics']

    # Create components
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

    umap_model = UMAP(
        n_neighbors=umap_n_neighbors,
        n_components=umap_n_components,
        min_dist=umap_min_dist,
        metric=umap_metric,
        random_state=42
    )

    hdbscan_model = HDBSCAN(
        min_cluster_size=hdbscan_min_cluster_size,
        min_samples=hdbscan_min_samples,
        metric=hdbscan_metric,
        cluster_selection_method=hdbscan_cluster_selection_method,
        prediction_data=True
    )

    # Create BERTopic model
    topic_model = BERTopic(
        nr_topics=nr_topics,
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        verbose=False,
        calculate_probabilities=True
    )

    topics, probs = topic_model.fit_transform(documents, embeddings)

    # Get topic info
    topic_info = topic_model.get_topic_info()

    topics = []
    for key, value in topic_model.get_topics().items():
      topic = [v[0] for v in value]
      topics.append(topic)

    texts = [s.split() for s in sentences]

    output = {}
    output['topics'] = topics

    coherence_metrics = Coherence(texts=texts, #list of our documents
                        measure='c_npmi')
    coherence = coherence_metrics.score(output)

    diverisity_metric = TopicDiversity(topk=10)
    diversity = diverisity_metric.score(output)

    return topic_model, coherence, diversity


In [35]:
BERTopic_study = optuna.load_study(
    storage=f"sqlite:///{optuna_folder}BERTopic_Study.db",
    study_name="BERTopic_Study"
)

In [57]:
# Use the representation model in BERTopic on top of the default pipeline
if BERTopic_study.best_trials:
    # Get balanced solution
    pareto_trials = BERTopic_study.best_trials

    # Pick the first Pareto optimal solution
    selected_params = pareto_trials[0].params
    final_model, final_coherence, final_diversity = train_final_BERTopic_model(selected_params, embeddings)

    print(f"\nFinal model trained successfully!")


Final model trained successfully!


In [58]:
selected_params

{'umap_n_neighbors': 14,
 'umap_n_components': 6,
 'umap_min_dist': 0.07800012047188168,
 'umap_metric': 'manhattan',
 'hdbscan_min_cluster_size': 50,
 'hdbscan_min_samples': 9,
 'hdbscan_metric': 'euclidean',
 'hdbscan_cluster_selection_method': 'eom',
 'nr_topics': 12}

In [54]:
print(f"Final model metrics:")
print(f"  Coherence: {final_coherence:.4f}")
print(f"  Diversity: {final_diversity:.4f}")

Final model metrics:
  Coherence: 0.0901
  Diversity: 0.8667


In [61]:
doc_topics = final_model.get_document_info(documents)

In [64]:
doc_topics.to_csv("./bertopic/doc_topics.csv", encoding="utf-8")

In [63]:
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
final_model.save("./bertopic/", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)

In [None]:
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
final_model = BERTopic.load("./bertopic", embedding_model=embedding_model)

In [59]:
final_model.visualize_topics(top_n_topics = 60)

In [44]:
final_model.visualize_barchart(top_n_topics=60).show()

In [45]:
topics = final_model.get_topics()

In [46]:
topic_words = []
for key, topic in topics.items():
  if key != -1:
    topic_words.append(' '.join([word_info[0]  for word_info in topic]))


In [47]:
topic_words

['father own dear much room poor hope love wish letter',
 'weasel fly mulgar long tail big bird night stand squirrel',
 'ship sea boat captain island sail deck shore wind vessel',
 'school fellow study paper room fifth mind mean own form',
 'mother child father ask want baby girl room lady like',
 'attack force enemy fire troops fort gun army french advance',
 'king queen prince royal person family lady palace knight princess',
 'night light dream cry fall face voice sleep lie heart',
 'horse master driver cab engine rein train drive carriage stable',
 'reply cottage indian forest animal dog lodge rifle intendant venison',
 'rock chalk volcano earthquake lava ice sea stone sand mountain']