<a href="https://colab.research.google.com/github/tsido/lda-thesis/blob/main/topic_extraction/ctm/CTM_topic_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install octis

Collecting octis
  Downloading octis-1.13.1-py2.py3-none-any.whl (130 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.0/131.0 kB[0m [31m885.5 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting gensim==4.2.0 (from octis)
  Downloading gensim-4.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn==1.1.0 (from octis)
  Downloading scikit_learn-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-optimize>=0.8.1 (from octis)
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting numpy==1.23.0 (from octi

In [2]:
!mkdir data
!wget --no-check-certificate --output-document=data/enriched_data.csv 'https://raw.githubusercontent.com/tsido/lda-thesis/main/data/enriched_data.csv'
!wget --no-check-certificate --output-document=data/optimization_results.csv 'https://raw.githubusercontent.com/tsido/lda-thesis/main/topic_extraction/ctm/optimization_results.csv'


# Get the persistent OCTIS dataset so we don't need to re-create it every time
!mkdir data/octis
!wget --no-check-certificate --output-document=data/octis/corpus.tsv 'https://raw.githubusercontent.com/tsido/lda-thesis/main/data/octis/corpus.tsv'
!wget --no-check-certificate --output-document=data/octis/metadata.json 'https://raw.githubusercontent.com/tsido/lda-thesis/main/data/octis/metadata.json'
!wget --no-check-certificate --output-document=data/octis/vocabulary.txt 'https://raw.githubusercontent.com/tsido/lda-thesis/main/data/octis/vocabulary.txt'



--2023-12-28 14:10:11--  https://raw.githubusercontent.com/tsido/lda-thesis/main/data/enriched_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31862266 (30M) [text/plain]
Saving to: ‘data/enriched_data.csv’


2023-12-28 14:10:12 (128 MB/s) - ‘data/enriched_data.csv’ saved [31862266/31862266]

--2023-12-28 14:10:12--  https://raw.githubusercontent.com/tsido/lda-thesis/main/topic_extraction/ctm/optimization_results.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13162 (13K) [text/plain]
Saving to: ‘data/optimization_res

In [3]:
import pandas as pd
import numpy as np
import json

from octis.models.CTM import CTM
from octis.dataset.dataset import Dataset

from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity

from gensim.corpora.dictionary import Dictionary



In [4]:
# Configuration options

# whether to re-create the OCTIS dataset in case the underlying enriched dataset
# has changed
RECREATE_OCTIS_DATASET=True

# Topic Model configuration
num_topics = 46 # Number of topics to generate
top_k = 5      # Top words to inspect in metrics

# Neural network hyperparameters
num_neurons=100
num_layers=1
dropout=0.24308


In [5]:
df = None

if RECREATE_OCTIS_DATASET:
  # Create dataset readable by OCTIS

  # read in the document data
  df = pd.read_csv('data/enriched_data.csv')
  descriptions = df['PreprocessedDescription'].str.split()

  # Prepare custom dataset in a format described
  # here: https://github.com/MIND-Lab/OCTIS/tree/master/preprocessed_datasets/sample_dataset

  # Generate labels for the dataset
  df['label'] = pd.Categorical(df.apply(lambda x : eval(x['AppStoreGenres'])[-1], axis=1))

  # we need to split the data to training + testing sets, i.e. include additional columns
  # in addition to the texts
  df['split'] = np.random.choice(['train', 'test', 'val'], size=len(df), p=[0.8, 0.1, 0.1])
  df['split'] = pd.Categorical(df['split'], categories=['train', 'val', 'test'], ordered=True)
  df = df[['PreprocessedDescription', 'split', 'label']].sort_values(by='split')

  df[['PreprocessedDescription', 'split', 'label']].to_csv('data/octis/corpus.tsv', sep='\t', header=False, index=False)


  # create the metadata file
  # FIXME indexes for the last docs are hardcoded
  metadata = { 'total_documents': len(df),
              'vocabulary_length': 2000,
              'preprocessing-info': [],
              'labels': np.asarray(df['label'].unique()).tolist(),
              'total_labels': 0,
              'last-training-doc': 10318,
              'last-validation-doc': 11598
              }
  with open('data/octis/metadata.json', 'w') as f:
      json.dump(metadata, f)

  # Create the vocabulary.txt file using Gensim and keep 2000 most relevant words only
  dictionary = Dictionary(descriptions)

  # remove tokens that don't occur in at least 3 documents
  # and occur in over 50% of docs, keep 2000 words for vocabulary
  dictionary.filter_extremes(no_below=3, no_above=0.5, keep_n=2000)

  # create vocabulary.txt file by getting unique words from the dictionary
  with open("data/octis/vocabulary.txt", "w") as f:
      for (word) in dictionary.itervalues():
        f.write(word +'\n')


In [6]:
# Load the dataset
octis_dataset = Dataset()
octis_dataset.load_custom_dataset_from_folder('data/octis');


In [7]:
octis_dataset.__dict__.keys()
octis_dataset._Dataset__metadata


{'total_documents': 12901,
 'vocabulary_length': 2000,
 'preprocessing-info': [],
 'labels': ['Books',
  'Adventure',
  'Puzzle',
  'Sports',
  'Family',
  'Entertainment',
  'Role Playing',
  'Food & Drink',
  'Action',
  'Casual',
  'Racing',
  'Trivia',
  'Simulation',
  'Reference',
  'News',
  'Strategy',
  'Games',
  'Lifestyle',
  'Board',
  'Card',
  'Education',
  'Utilities',
  'Word',
  'Casino',
  'Music',
  'Travel',
  'Social Networking',
  'Productivity',
  'Photo & Video',
  'Navigation',
  'Shopping',
  'Health & Fitness',
  'Finance',
  'Medical',
  'Magazines & Newspapers',
  'Business',
  'Graphics & Design',
  'Educational'],
 'total_labels': 0,
 'last-training-doc': 10316,
 'last-validation-doc': 11618}

In [8]:
model = CTM(num_topics=num_topics, num_epochs=300,
            inference_type='combined',
            bert_model="paraphrase-distilroberta-base-v2",
            num_neurons=num_neurons,
            num_layers=num_layers,
            dropout=dropout)


In [9]:
output = model.train_model(octis_dataset)

.gitattributes:   0%|          | 0.00/736 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Batches:   0%|          | 0/104 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

In [10]:
npmi = Coherence(texts=octis_dataset.get_corpus(), topk=top_k)
diversity = TopicDiversity(topk=top_k)

coherence_score = npmi.score(output)
diversity_score = diversity.score(output)

print("Coherence ", coherence_score);
print("Diversity ", diversity_score);

# 60 epochs
# Coherence 0.09720620599374605
# Diversity 0.7434782608695653
# 150 epochs
#Coherence  0.095868922314712753
#Diversity  0.7913043478260869
# 300 epochs
#Coherence  0.11287936376416666
#Diversity  0.7913043478260869
# 350 epochs
#Coherence  0.09043808924931326
#Diversity  0.7739130434782608
# 400 epochs
#Coherence  0.10007923777679921
#Diversity  0.7347826086956522
# 500 epochs
#Coherence  0.09543121657464237
#Diversity  0.6608695652173913

Coherence  0.11624365704406045
Diversity  0.7130434782608696


In [18]:

output['topics']


[['app',
  'puzzles',
  'kids',
  'puzzle',
  'contain',
  'fun',
  'inside',
  'positive',
  'com',
  'hint'],
 ['app',
  'purposes',
  'shop',
  'like',
  'serve',
  'certain',
  'fix',
  'services',
  'fashion',
  'choose'],
 ['world',
  'new',
  'friends',
  'dragons',
  'village',
  'craft',
  'collect',
  'unique',
  'dragon',
  'trade'],
 ['new',
  'ipad',
  'world',
  'iphone',
  'god',
  'best',
  'universe',
  'ios',
  'elements',
  'create'],
 ['children',
  'boards',
  'basic',
  'everyday',
  'consists',
  'games',
  'size',
  'learn',
  'child',
  'patterns'],
 ['truck',
  'boat',
  'iphone',
  'wood',
  'fun',
  'generation',
  'snow',
  'ipads',
  'touches',
  'interaction'],
 ['games',
  'free',
  'real',
  'gambling',
  'success',
  'card',
  'vegas',
  'imply',
  'cards',
  'intended'],
 ['it’s',
  'can’t',
  'doesn’t',
  'phones',
  'that’s',
  'customization',
  'won’t',
  'you’re',
  'offering',
  'let’s'],
 ['puzzle',
  'puzzles',
  'jigsaw',
  'fit',
  'animals'

In [14]:
# persist distributions for further analysis
output['topic-word-matrix']
np.savetxt('topic-word-matrix.np', output['topic-word-matrix'])

In [15]:
output['topic-document-matrix']
np.savetxt('topic-document-matrix.np', output['topic-document-matrix'])