<a href="https://colab.research.google.com/github/tsido/lda-thesis/blob/main/topic_extraction/ctm/CTM_topic_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install octis

Collecting octis
  Downloading octis-1.13.1-py2.py3-none-any.whl (130 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.0/131.0 kB[0m [31m766.2 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting gensim==4.2.0 (from octis)
  Downloading gensim-4.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn==1.1.0 (from octis)
  Downloading scikit_learn-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-optimize>=0.8.1 (from octis)
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting numpy==1.23.0 (from oc

In [2]:
!mkdir data
!wget --no-check-certificate --output-document=data/enriched_data.csv 'https://raw.githubusercontent.com/tsido/lda-thesis/main/data/enriched_data.csv'
!wget --no-check-certificate --output-document=data/optimization_results.csv 'https://raw.githubusercontent.com/tsido/lda-thesis/main/topic_extraction/ctm/optimization_results.csv'


# Get the persistent OCTIS dataset so we don't need to re-create it every time
!mkdir data/octis
!wget --no-check-certificate --output-document=data/octis/corpus.tsv 'https://raw.githubusercontent.com/tsido/lda-thesis/main/data/octis/corpus.tsv'
!wget --no-check-certificate --output-document=data/octis/metadata.json 'https://raw.githubusercontent.com/tsido/lda-thesis/main/data/octis/metadata.json'
!wget --no-check-certificate --output-document=data/octis/vocabulary.txt 'https://raw.githubusercontent.com/tsido/lda-thesis/main/data/octis/vocabulary.txt'



--2023-12-22 08:39:03--  https://raw.githubusercontent.com/tsido/lda-thesis/main/data/enriched_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31862266 (30M) [text/plain]
Saving to: ‘data/enriched_data.csv’


2023-12-22 08:39:04 (148 MB/s) - ‘data/enriched_data.csv’ saved [31862266/31862266]

--2023-12-22 08:39:04--  https://raw.githubusercontent.com/tsido/lda-thesis/main/topic_extraction/ctm/optimization_results.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13162 (13K) [text/plain]
Saving to: ‘data/optimization_res

In [3]:
import pandas as pd
import numpy as np
import json

from octis.models.CTM import CTM
from octis.dataset.dataset import Dataset

from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity

from gensim.corpora.dictionary import Dictionary



In [4]:
# Configuration options

# whether to re-create the OCTIS dataset in case the underlying enriched dataset
# has changed
RECREATE_OCTIS_DATASET=True

# Topic Model configuration
num_topics = 46 # Number of topics to generate
top_k = 5      # Top words to inspect in metrics

# Neural network hyperparameters
num_neurons=100
num_layers=1
dropout=0.24308


In [5]:
df = None

if RECREATE_OCTIS_DATASET:
  # Create dataset readable by OCTIS

  # read in the document data
  df = pd.read_csv('data/enriched_data.csv')
  descriptions = df['PreprocessedDescription'].str.split()

  # Prepare custom dataset in a format described
  # here: https://github.com/MIND-Lab/OCTIS/tree/master/preprocessed_datasets/sample_dataset

  # Generate labels for the dataset
  df['label'] = pd.Categorical(df.apply(lambda x : eval(x['AppStoreGenres'])[-1], axis=1))

  # we need to split the data to training + testing sets, i.e. include additional columns
  # in addition to the texts
  df['split'] = np.random.choice(['train', 'test', 'val'], size=len(df), p=[0.8, 0.1, 0.1])
  df['split'] = pd.Categorical(df['split'], categories=['train', 'val', 'test'], ordered=True)
  df = df[['PreprocessedDescription', 'split', 'label']].sort_values(by='split')

  df[['PreprocessedDescription', 'split', 'label']].to_csv('data/octis/corpus.tsv', sep='\t', header=False, index=False)


  # create the metadata file
  # FIXME indexes for the last docs are hardcoded
  metadata = { 'total_documents': len(df),
              'vocabulary_length': 2000,
              'preprocessing-info': [],
              'labels': np.asarray(df['label'].unique()).tolist(),
              'total_labels': 0,
              'last-training-doc': 10318,
              'last-validation-doc': 11598
              }
  with open('data/octis/metadata.json', 'w') as f:
      json.dump(metadata, f)

  # Create the vocabulary.txt file using Gensim and keep 2000 most relevant words only
  dictionary = Dictionary(descriptions)

  # remove tokens that don't occur in at least 3 documents
  # and occur in over 50% of docs, keep 2000 words for vocabulary
  dictionary.filter_extremes(no_below=3, no_above=0.5, keep_n=2000)

  # create vocabulary.txt file by getting unique words from the dictionary
  with open("data/octis/vocabulary.txt", "w") as f:
      for (word) in dictionary.itervalues():
        f.write(word +'\n')


In [6]:
# Load the dataset
octis_dataset = Dataset()
octis_dataset.load_custom_dataset_from_folder('data/octis');


In [7]:
octis_dataset.__dict__.keys()
octis_dataset._Dataset__metadata


{'total_documents': 12901,
 'vocabulary_length': 2000,
 'preprocessing-info': [],
 'labels': ['Books',
  'Adventure',
  'Simulation',
  'Casual',
  'Sports',
  'Puzzle',
  'Action',
  'Word',
  'Reference',
  'News',
  'Racing',
  'Trivia',
  'Entertainment',
  'Board',
  'Family',
  'Education',
  'Role Playing',
  'Food & Drink',
  'Card',
  'Strategy',
  'Music',
  'Lifestyle',
  'Casino',
  'Travel',
  'Productivity',
  'Utilities',
  'Social Networking',
  'Photo & Video',
  'Games',
  'Shopping',
  'Health & Fitness',
  'Navigation',
  'Finance',
  'Educational',
  'Medical',
  'Graphics & Design',
  'Magazines & Newspapers',
  'Business'],
 'total_labels': 0,
 'last-training-doc': 10325,
 'last-validation-doc': 11616}

In [35]:
model = CTM(num_topics=num_topics, num_epochs=300,
            inference_type='combined',
            bert_model="paraphrase-distilroberta-base-v2",
            num_neurons=num_neurons,
            num_layers=num_layers,
            dropout=dropout)


In [36]:
output = model.train_model(octis_dataset)

In [38]:
npmi = Coherence(texts=octis_dataset.get_corpus(), topk=top_k)
diversity = TopicDiversity(topk=top_k)

coherence_score = npmi.score(output)
diversity_score = diversity.score(output)

print("Coherence ", coherence_score);
print("Diversity ", diversity_score);

# 60 epochs
# Coherence 0.09720620599374605
# Diversity 0.7434782608695653
# 150 epochs
#Coherence  0.095868922314712753
#Diversity  0.7913043478260869
# 300 epochs
#Coherence  0.11287936376416666
#Diversity  0.7913043478260869
# 350 epochs
#Coherence  0.09043808924931326
#Diversity  0.7739130434782608
# 400 epochs
#Coherence  0.10007923777679921
#Diversity  0.7347826086956522
# 500 epochs
#Coherence  0.09543121657464237
#Diversity  0.6608695652173913

Coherence  0.09043808924931326
Diversity  0.7739130434782608


In [42]:
output['topics']

[['golden',
  'hidden',
  'array',
  'object',
  'detective',
  'secrets',
  'virtual',
  'pirate',
  'tuned',
  'city'],
 ['kids',
  'words',
  'learn',
  'children',
  'app',
  'names',
  'letter',
  'teaches',
  'different',
  'puzzles'],
 ['dress',
  'fashion',
  'girl',
  'accessories',
  'model',
  'styles',
  'hair',
  'beauty',
  'prefer',
  'girls'],
 ['coloring',
  'puzzles',
  'app',
  'pages',
  'pictures',
  'puzzle',
  'kids',
  'short',
  'jigsaw',
  'easy'],
 ['fun',
  'kids',
  'truck',
  'puzzle',
  'boat',
  'trucks',
  'great',
  'snow',
  'bugs',
  'puzzles'],
 ['iphone',
  'ipad',
  'solitaire',
  'touch',
  'features',
  'portrait',
  'graphics',
  'best',
  'games',
  'mode'],
 ['puzzle',
  'puzzles',
  'logic',
  'sudoku',
  'solved',
  'squares',
  'solving',
  'ranging',
  'easy',
  'sizes'],
 ['friends',
  'word',
  'fun',
  'challenge',
  'score',
  'words',
  'new',
  'facebook',
  'games',
  'best'],
 ['app',
  'use',
  'like',
  'restrict',
  'choose',
 

In [44]:
output['topic-word-matrix']

array([[-0.07386519, -0.17356138, -0.02747964, ...,  0.19174366,
        -0.14757513, -0.1265269 ],
       [ 0.01241002,  0.28974652, -0.24029875, ..., -0.22808543,
        -0.13536076, -0.21015172],
       [ 0.0936621 , -0.15290412,  0.13500112, ...,  0.22157393,
         0.19962041,  0.2524576 ],
       ...,
       [ 0.3987831 , -0.04213056, -0.04507596, ...,  0.21517164,
         0.15831506,  0.2127891 ],
       [-0.02829048, -0.1798288 ,  0.27910906, ..., -0.06684038,
         0.2973013 ,  0.19116871],
       [ 0.01056332, -0.19066122,  0.09993573, ...,  0.24001272,
         0.17772561,  0.20934622]], dtype=float32)

In [45]:
output['topic-document-matrix']

array([[0.01385613, 0.01236514, 0.00082063, ..., 0.01622689, 0.00894619,
        0.01031666],
       [0.01384771, 0.03362858, 0.00161811, ..., 0.01438247, 0.00443614,
        0.00888108],
       [0.02796866, 0.00613438, 0.00331713, ..., 0.02733929, 0.00186275,
        0.0074621 ],
       ...,
       [0.12153164, 0.00391432, 0.00084231, ..., 0.01189789, 0.00464166,
        0.02904256],
       [0.01055065, 0.0110459 , 0.00170352, ..., 0.01981091, 0.00113418,
        0.01656071],
       [0.02556471, 0.01548695, 0.00118123, ..., 0.02128453, 0.00421483,
        0.1270153 ]])