<a href="https://colab.research.google.com/github/tsido/lda-thesis/blob/main/topic_extraction/ctm/CTM_topic_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CTM tuning and topic generation

In this notebook, a Contextual Topic Model (CTM) is tuned using OCTIS, and topics are extracted.

In [None]:
!mkdir data
!wget --no-check-certificate --output-document=data/enriched_data.csv 'https://raw.githubusercontent.com/tsido/lda-thesis/main/data/enriched_data.csv'
!wget --no-check-certificate --output-document=data/optimization_results.csv 'https://raw.githubusercontent.com/tsido/lda-thesis/main/topic_extraction/ctm/optimization_results.csv'

!mkdir data/octis

--2023-12-15 15:43:00--  https://raw.githubusercontent.com/tsido/lda-thesis/main/data/enriched_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31862266 (30M) [text/plain]
Saving to: ‘data/enriched_data.csv’


2023-12-15 15:43:03 (282 MB/s) - ‘data/enriched_data.csv’ saved [31862266/31862266]

--2023-12-15 15:43:03--  https://raw.githubusercontent.com/tsido/lda-thesis/main/topic_extraction/ctm/optimization_results.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27387 (27K) [text/plain]
Saving to: ‘data/optimization_res

In [None]:
!pip install octis

Collecting octis
  Downloading octis-1.13.1-py2.py3-none-any.whl (130 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.0/131.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gensim==4.2.0 (from octis)
  Downloading gensim-4.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn==1.1.0 (from octis)
  Downloading scikit_learn-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-optimize>=0.8.1 (from octis)
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting numpy==1.23.0 (from octi

In [None]:

import pandas as pd
import numpy as np
import json

from octis.models.CTM import CTM
from octis.dataset.dataset import Dataset
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity

#from octis.models.contextualized_topic_models.datasets.dataset import CTMDataset
from octis.models.contextualized_topic_models.utils.data_preparation import QuickText

from gensim.corpora.dictionary import Dictionary


In [None]:
# Tunable settings shared by the cells below

num_topics = 46  # how many topics to generate
top_k = 5        # how many top words the metrics inspect


In [None]:
# Build an OCTIS custom dataset (corpus.tsv, metadata.json, vocabulary.txt) in
# data/octis, in the format described here:
# https://github.com/MIND-Lab/OCTIS/tree/master/preprocessed_datasets/sample_dataset

SPLIT_SEED = 42    # fixed seed so the train/val/test partition is reproducible
VOCAB_SIZE = 2000  # maximum number of words kept in the vocabulary

# read in the document data
df = pd.read_csv('data/enriched_data.csv')
descriptions = df['PreprocessedDescription'].str.split()

# Generate labels for the dataset: the last entry of each row's genre list.
# NOTE(review): eval() executes whatever the CSV cell contains. If
# AppStoreGenres always holds a plain list literal, ast.literal_eval is the
# safe replacement -- confirm against the data before switching.
df['label'] = pd.Categorical(df['AppStoreGenres'].map(lambda genres: eval(genres)[-1]))

# We need to split the data into training, validation and testing sets, i.e.
# include additional columns in addition to the texts. A dedicated, seeded
# generator keeps the assignment identical across runs.
rng = np.random.default_rng(SPLIT_SEED)
df['split'] = rng.choice(['train', 'test', 'val'], size=len(df), p=[0.8, 0.1, 0.1])
# The ordered categorical makes sort_values put train first, then val, then test.
df['split'] = pd.Categorical(df['split'], categories=['train', 'val', 'test'], ordered=True)
df = df[['PreprocessedDescription', 'split', 'label']].sort_values(by='split')

df.to_csv('data/octis/corpus.tsv', sep='\t', header=False, index=False)

# Partition sizes, converted to plain ints so json.dump can serialize them.
n_train = int((df['split'] == 'train').sum())
n_val = int((df['split'] == 'val').sum())

# Create the vocabulary using Gensim and keep the VOCAB_SIZE most relevant words only:
# remove tokens that occur in fewer than 3 documents or in more than 50% of
# the documents.
dictionary = Dictionary(descriptions)
dictionary.filter_extremes(no_below=3, no_above=0.5, keep_n=VOCAB_SIZE)

# Create the metadata file. The document boundaries are derived from the
# actual split instead of being hardcoded, because the partition sizes depend
# on the random assignment above.
metadata = {
    'total_documents': len(df),
    'vocabulary_length': len(dictionary),  # may be below VOCAB_SIZE after filtering
    'preprocessing-info': [],
    'labels': [],
    'total_labels': 0,
    'last-training-doc': n_train,
    'last-validation-doc': n_train + n_val,
}
with open('data/octis/metadata.json', 'w') as f:
    json.dump(metadata, f)

# create vocabulary.txt file by writing one dictionary word per line
with open("data/octis/vocabulary.txt", "w") as f:
    f.writelines(word + '\n' for word in dictionary.itervalues())




In [None]:
# Read the custom dataset files from data/octis into an OCTIS Dataset object
octis_dataset = Dataset()
octis_dataset.load_custom_dataset_from_folder(
    'data/octis'
);


In [None]:
# Tune the CTM hyperparameters with the OCTIS Optimizer and save the results.
#
# NOTE: this only works if the workspace is clean of _*.pkl files!

# We're interested in English only, so a RoBERTa-based model is used as the
# contextual (SBERT) model underneath the CTM.
# See https://colab.research.google.com/github/MIND-Lab/OCTIS/blob/master/examples/OCTIS_Optimizing_CTM.ipynb#scrollTo=i6Sywe4vCJW5 for
# guidance
model = CTM(num_topics=num_topics, num_epochs=30, inference_type='combined', bert_model="paraphrase-distilroberta-base-v2")

# Metric to optimize (coherence) and an extra metric that is only tracked (diversity).
# NOTE(review): Coherence is created without topk, so it uses the library
# default rather than top_k -- confirm this is intended.
npmi = Coherence(texts=octis_dataset.get_corpus())
diversity = TopicDiversity(topk=top_k)

# Hyperparameters to explore. The categorical choices are given as lists so
# their order is explicit rather than dependent on set iteration order.
search_space = {"num_layers": Categorical([1, 2, 3]),
                "num_neurons": Categorical([100, 300, 500, 1000]),
                "dropout": Real(0.0, 0.95)
}

optimizer = Optimizer()
optimization_result = optimizer.optimize(
    model, octis_dataset, npmi, search_space, number_of_call=100,
    model_runs=1, save_models=True,
    extra_metrics=[diversity], # to keep track of other metrics
    save_path='data/'
)

# Write the per-call results to a CSV file in the working directory
optimization_result.save_to_csv("optimization_results.csv")


Current call:  0
Current call:  1
Current call:  2
Current call:  3
Current call:  4
Current call:  5
Current call:  6
Current call:  7
Current call:  8
Current call:  9
Current call:  10
Current call:  11
Current call:  12
Current call:  13
Current call:  14
Current call:  15
Current call:  16
Current call:  17
Current call:  18
Current call:  19
Current call:  20
Current call:  21
Current call:  22
Current call:  23
Current call:  24
Current call:  25
Current call:  26
Current call:  27
Current call:  28
Current call:  29
Current call:  30
Current call:  31
Current call:  32
Current call:  33
Current call:  34
Current call:  35
Current call:  36
Current call:  37
Current call:  38
Current call:  39
Current call:  40
Current call:  41
Current call:  42
Current call:  43
Current call:  44
Current call:  45
Current call:  46
Current call:  47
Current call:  48
Current call:  49
Current call:  50
Current call:  51
Current call:  52
Current call:  53
Current call:  54
Current call:  55
Cu

# Optimization results

<octis.optimization.optimizer_evaluation.OptimizerEvaluation at 0x7a1cf74a37c0>

In [None]:
# Load the optimization results and rank the calls by the optimized metric,
# best first. Prefer the CSV in the working directory; if it is missing, fall
# back to the copy that was downloaded into data/, so the analysis can run
# without repeating the optimization.
try:
    results = pd.read_csv('optimization_results.csv')
except FileNotFoundError:
    results = pd.read_csv('data/optimization_results.csv')
results = results.sort_values('Mean(model_runs)', ascending=False)
results



Unnamed: 0,dataset,surrogate model,acquisition function,num_iteration,time,Median(model_runs),Mean(model_runs),Standard_Deviation(model_runs),dropout,num_layers,num_neurons,Topic diversity(not optimized)
48,dataset_name,RF,LCB,48,61.755902,0.106907,0.106907,0.0,0.024308,1,1000,0.739130
35,dataset_name,RF,LCB,35,65.931262,0.100528,0.100528,0.0,0.024229,3,100,0.743478
55,dataset_name,RF,LCB,55,55.848886,0.100334,0.100334,0.0,0.024332,1,100,0.678261
89,dataset_name,RF,LCB,89,65.384042,0.097116,0.097116,0.0,0.023831,3,300,0.652174
52,dataset_name,RF,LCB,52,65.520842,0.094514,0.094514,0.0,0.024332,3,100,0.691304
...,...,...,...,...,...,...,...,...,...,...,...,...
14,dataset_name,RF,LCB,14,67.478830,0.012966,0.012966,0.0,0.246403,3,1000,0.521739
13,dataset_name,RF,LCB,13,61.237686,0.012541,0.012541,0.0,0.284374,3,1000,0.513043
0,dataset_name,RF,LCB,0,65.271374,0.008819,0.008819,0.0,0.461745,2,1000,0.626087
12,dataset_name,RF,LCB,12,52.810198,-0.011420,-0.011420,0.0,0.344336,3,1000,0.347826


In [None]:
# Pick the rows to report: a random sample of calls plus the calls with the
# highest topic diversity and the highest coherence.
SAMPLE_SEED = 42  # fixed seed so the same rows are reported on every run

max_diversity_idx = results['Topic diversity(not optimized)'].idxmax()
max_coherence_idx = results['Mean(model_runs)'].idxmax()

# Cap the sample size so the cell also works when fewer than 18 calls exist.
n_random = min(18, len(results))
random_indexes = results.sample(n=n_random, random_state=SAMPLE_SEED).index.tolist()
random_indexes.append(max_diversity_idx)
random_indexes.append(max_coherence_idx)

# isin() keeps the row order of `results` and ignores duplicate index labels.
print_df = results[results.index.isin(random_indexes)]
print_df = print_df[['Mean(model_runs)', 'Topic diversity(not optimized)',  'dropout', 'num_layers', 'num_neurons']]
print_df


Unnamed: 0,Mean(model_runs),Topic diversity(not optimized),dropout,num_layers,num_neurons
48,0.106907,0.73913,0.024308,1,1000
35,0.100528,0.743478,0.024229,3,100
41,0.086877,0.643478,0.024226,3,500
81,0.085949,0.708696,0.024317,2,100
42,0.085754,0.634783,0.024073,3,100
54,0.084858,0.804348,0.024298,1,1000
24,0.083714,0.613043,0.023175,3,300
72,0.082663,0.747826,0.021451,2,1000
98,0.079655,0.669565,0.023874,3,100
79,0.07902,0.708696,0.024307,2,100


In [None]:
# Print the selected rows as a LaTeX table, floats with six decimals.
# The former formatters={"name": ...} argument was dropped: print_df has no
# "name" column, so it never affected the output.
print(print_df.to_latex(index=False,
                  float_format="{:.6f}".format,
))

\begin{tabular}{rrrrr}
\toprule
 Mean(model\_runs) &  Topic diversity(not optimized) &  dropout &  num\_layers &  num\_neurons \\
\midrule
         0.106907 &                        0.739130 & 0.024308 &           1 &         1000 \\
         0.100528 &                        0.743478 & 0.024229 &           3 &          100 \\
         0.086877 &                        0.643478 & 0.024226 &           3 &          500 \\
         0.085949 &                        0.708696 & 0.024317 &           2 &          100 \\
         0.085754 &                        0.634783 & 0.024073 &           3 &          100 \\
         0.084858 &                        0.804348 & 0.024298 &           1 &         1000 \\
         0.083714 &                        0.613043 & 0.023175 &           3 &          300 \\
         0.082663 &                        0.747826 & 0.021451 &           2 &         1000 \\
         0.079655 &                        0.669565 & 0.023874 &           3 &          100 \\
      

  print(print_df.to_latex(index=False,
