<a href="https://colab.research.google.com/github/tsido/lda-thesis/blob/main/topic_extraction/ctm/CTM_topic_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CTM tuning and topic generation

In this notebook, a Contextualized Topic Model (CTM) is tuned using OCTIS, and topics are then extracted with it.

In [1]:
!mkdir data
!wget --no-check-certificate --output-document=data/enriched_data.csv 'https://raw.githubusercontent.com/tsido/lda-thesis/main/data/enriched_data.csv'
!wget --no-check-certificate --output-document=data/optimization_results.csv 'https://raw.githubusercontent.com/tsido/lda-thesis/main/topic_extraction/ctm/optimization_results.csv'

!mkdir data/octis

--2023-12-15 10:47:52--  https://raw.githubusercontent.com/tsido/lda-thesis/main/data/enriched_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31862266 (30M) [text/plain]
Saving to: ‘data/enriched_data.csv’


2023-12-15 10:47:54 (216 MB/s) - ‘data/enriched_data.csv’ saved [31862266/31862266]

--2023-12-15 10:47:55--  https://raw.githubusercontent.com/tsido/lda-thesis/main/topic_extraction/ctm/optimization_results.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2023-12-15 10:47:55 ERROR 404: Not Found.



In [2]:
!pip install octis

Collecting octis
  Downloading octis-1.13.1-py2.py3-none-any.whl (130 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.0/131.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gensim==4.2.0 (from octis)
  Downloading gensim-4.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn==1.1.0 (from octis)
  Downloading scikit_learn-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-optimize>=0.8.1 (from octis)
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting numpy==1.23.0 (from octi

In [3]:

import ast
import json
from pathlib import Path

import numpy as np
import pandas as pd

from gensim.corpora.dictionary import Dictionary
from skopt.space.space import Real, Categorical, Integer

from octis.models.CTM import CTM
from octis.dataset.dataset import Dataset
from octis.optimization.optimizer import Optimizer
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity

#from octis.models.contextualized_topic_models.datasets.dataset import CTMDataset
from octis.models.contextualized_topic_models.utils.data_preparation import QuickText


In [4]:
# Notebook settings

# Whether the optimization step is executed; set to False to skip it when the
# optimization data is already available.
RUN_OPTIMIZER = True

# num_topics: how many topics to generate
# top_k:      how many top words the metrics inspect
num_topics, top_k = 46, 5


In [5]:
# Build the OCTIS custom-dataset folder (corpus.tsv, metadata.json, vocabulary.txt)
# in the format described here:
# https://github.com/MIND-Lab/OCTIS/tree/master/preprocessed_datasets/sample_dataset

SPLIT_SEED = 42          # fixed seed so the train/val/test partition is reproducible
VOCABULARY_SIZE = 2000   # upper bound on the number of words kept in the vocabulary

# read in the document data
df = pd.read_csv('data/enriched_data.csv')
descriptions = df['PreprocessedDescription'].str.split()

# Generate labels for the dataset: the last entry of each row's genre list.
# The column stores the list as text; ast.literal_eval only parses Python literals,
# so unlike eval() it cannot execute code embedded in the downloaded CSV.
df['label'] = pd.Categorical(
    df['AppStoreGenres'].map(lambda genres: ast.literal_eval(genres)[-1])
)

# we need to split the data to training + testing sets, i.e. include additional columns
# in addition to the texts
rng = np.random.default_rng(SPLIT_SEED)
df['split'] = rng.choice(['train', 'test', 'val'], size=len(df), p=[0.8, 0.1, 0.1])
df['split'] = pd.Categorical(df['split'], categories=['train', 'val', 'test'], ordered=True)
# Order the rows train -> val -> test; a stable sort keeps the row order within each
# partition deterministic.
df = df[['PreprocessedDescription', 'split', 'label']].sort_values(by='split', kind='stable')

# TODO Also we likely want to use the non-processed texts for CTM since it wants the
# non-processed texts as well

df[['PreprocessedDescription', 'split', 'label']].to_csv('data/octis/corpus.tsv', sep='\t', header=False, index=False)

# Create the vocabulary.txt file using Gensim and keep the most relevant words only
dictionary = Dictionary(descriptions)

# remove tokens that occur in fewer than 3 documents or in over 50% of the documents,
# then keep at most VOCABULARY_SIZE words for the vocabulary
dictionary.filter_extremes(no_below=3, no_above=0.5, keep_n=VOCABULARY_SIZE)

# create vocabulary.txt file by getting unique words from the dictionary
with open("data/octis/vocabulary.txt", "w") as f:
    f.writelines(word + '\n' for word in dictionary.itervalues())

# create the metadata file
# The partition boundaries are derived from the actual split sizes: because the rows
# are sorted train -> val -> test, the training documents occupy the first n_train
# rows and the validation documents the next n_val rows.
split_counts = df['split'].value_counts()
n_train = int(split_counts['train'])   # int(): numpy integers are not JSON serializable
n_val = int(split_counts['val'])
label_names = [str(category) for category in df['label'].cat.categories]

metadata = {'total_documents': len(df),
            'vocabulary_length': len(dictionary),   # may be below VOCABULARY_SIZE after filtering
            'preprocessing-info': [],
            'labels': label_names,
            'total_labels': len(label_names),
            'last-training-doc': n_train,
            'last-validation-doc': n_train + n_val
            }
with open('data/octis/metadata.json', 'w') as f:
    json.dump(metadata, f)




In [6]:
# For CTM we also require the BERT embeddings

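# TODO: generate the contextual (BERT) embeddings for the documents here; the
# QuickText helper sketched below looks like the intended starting point (unverified).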
#ctm_dataset = QuickText()

#octis_dataset.CTMDataset

TypeError: ignored

In [7]:
# Load the dataset
# Reads the custom dataset files from the data/octis folder into an OCTIS Dataset
# object that the following cells use.
octis_dataset = Dataset()
# The trailing semicolon suppresses any cell output from the call.
octis_dataset.load_custom_dataset_from_folder('data/octis');


In [10]:
# Hyperparameter search for CTM using the OCTIS optimizer.
#
# TODO there is an issue here in the sense that OCTIS
# only works for the zeroshot version, which doesn't use the
# BoW representation at all.
# See if that works for us.
#
# We're interested in English only, so roberta is used as the base contextual model
# for the SBERT that sits underneath the CTM.
# See https://colab.research.google.com/github/MIND-Lab/OCTIS/blob/master/examples/OCTIS_Optimizing_CTM.ipynb#scrollTo=i6Sywe4vCJW5 for
# guidance
# NOTE: this only works for zeroshot inference, not the Combined that we thought we
# would use...
model = CTM(num_topics=num_topics, num_epochs=30, inference_type='zeroshot', bert_model="paraphrase-distilroberta-base-v2")

# Evaluation metrics: coherence is the optimization target, diversity is only tracked.
# NOTE(review): top_k is passed to TopicDiversity but not to Coherence, so the two
# metrics may look at a different number of top words -- confirm this is intended.
npmi = Coherence(texts=octis_dataset.get_corpus())
diversity = TopicDiversity(topk=top_k)

# Categories are given as lists rather than sets so that their order is the same on
# every run (set iteration order of strings depends on the process hash seed).
search_space = {"num_layers": Categorical([1, 2, 3]),
                "num_neurons": Categorical([100, 300, 500]),
                "activation": Categorical(['rrelu', 'relu']),
                "dropout": Real(0.0, 0.95)
}

optimization_runs=200
model_runs=1

if RUN_OPTIMIZER:
    # Honor the RUN_OPTIMIZER switch from the configuration cell: the search is
    # only executed when it is enabled.
    optimizer=Optimizer()
    optimization_result = optimizer.optimize(
        model, octis_dataset, npmi, search_space, number_of_call=optimization_runs,
        model_runs=model_runs, save_models=True,
        extra_metrics=[diversity], # to keep track of other metrics
        save_path='data/',
        early_stop=True,
        early_step=5
        )

    optimization_result.save_to_csv("optimization_results.csv")
else:
    # Optimization skipped; keep the name defined so cells that display it do not
    # fail with a NameError.
    optimization_result = None


Current call:  0
Current call:  1
Current call:  2
Current call:  3
Current call:  4
Current call:  5
Current call:  6
Current call:  7
Current call:  8
Current call:  9
Current call:  10
Current call:  11
Current call:  12
Current call:  13
Current call:  14
Current call:  15
Current call:  16
Current call:  17
Current call:  18
Current call:  19
Current call:  20
Current call:  21
Current call:  22
Current call:  23
Current call:  24
Current call:  25
Current call:  26
Current call:  27
Current call:  28
Current call:  29
Current call:  30
Current call:  31
Current call:  32
Current call:  33
Current call:  34
Current call:  35
Current call:  36
Current call:  37
Current call:  38
Current call:  39
Current call:  40
Current call:  41
Current call:  42
Current call:  43
Current call:  44
Current call:  45
Current call:  46
Current call:  47
Current call:  48
Current call:  49
Current call:  50
Current call:  51
Current call:  52
Current call:  53
Current call:  54
Current call:  55
Cu

In [19]:
# Display the object returned by optimizer.optimize() as this cell's output.
optimization_result


'c_npmi'

In [36]:
# Load the optimization results and show the run with the highest mean score.
#
# The optimization cell writes optimization_results.csv to the working directory,
# while the download cell stores a previously saved copy under data/. Prefer the
# freshly written file and fall back to the downloaded one. Empty files are skipped
# because a failed download can leave a zero-byte file behind.
results_candidates = [Path('optimization_results.csv'), Path('data/optimization_results.csv')]
results_path = next(
    (path for path in results_candidates if path.is_file() and path.stat().st_size > 0),
    None,
)
if results_path is None:
    raise FileNotFoundError(
        'No optimization results found; run the optimization step or provide '
        'optimization_results.csv'
    )

results = pd.read_csv(results_path)

# Row of the best run: the one with the maximum mean metric value over the model runs.
results.loc[results['Mean(model_runs)'].idxmax()]

dataset                           dataset_name
surrogate model                             RF
acquisition function                       LCB
num_iteration                              121
time                                 50.845753
Median(model_runs)                    0.119142
Mean(model_runs)                      0.119142
Standard_Deviation(model_runs)             0.0
activation                               rrelu
dropout                               0.018282
num_layers                                   1
num_neurons                                300
Topic diversity(not optimized)        0.791304
Name: 121, dtype: object