<a href="https://colab.research.google.com/github/tsido/lda-thesis/blob/main/colabs/contextual_tm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Usage of CombinedTM is based on the [Combined Topic Modeling tutorial](https://colab.research.google.com/drive/1fXJjr_rwqvpp1IdNQ4dxqN4Dp88cxO97?usp=sharing#scrollTo=-SEBG6wj9Zdu).

Calculate coherence scores for different numbers of topics using the Combined Topic Model (CombinedTM).


In [1]:
!pip install contextualized-topic-models


Collecting contextualized-topic-models
  Downloading contextualized_topic_models-2.5.0-py2.py3-none-any.whl (36 kB)
Collecting gensim==4.2.0 (from contextualized-topic-models)
  Downloading gensim-4.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers>=2.1.1 (from contextualized-topic-models)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ipywidgets==7.5.1 (from contextualized-topic-models)
  Downloading ipywidgets-7.5.1-py2.py3-none-any.whl (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ipython==8.10.0 (from co

In [2]:
# Fetch the app store data from Google Drive
!mkdir data
!wget --no-check-certificate --output-document=data/enriched_data.csv 'https://docs.google.com/uc?export=download&id=1JIWIP_Hvzu69bCDz4Dz1xgs6sXzQXzG-'



--2023-12-07 09:46:01--  https://docs.google.com/uc?export=download&id=1JIWIP_Hvzu69bCDz4Dz1xgs6sXzQXzG-
Resolving docs.google.com (docs.google.com)... 108.177.111.100, 108.177.111.101, 108.177.111.139, ...
Connecting to docs.google.com (docs.google.com)|108.177.111.100|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-04-ao-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/upmhvf7542549td7v7or2b6cmhh9e39g/1701942375000/09640189477530773141/*/1JIWIP_Hvzu69bCDz4Dz1xgs6sXzQXzG-?e=download&uuid=f769b128-41c0-4c8e-ad00-a4befbc8ee7d [following]
--2023-12-07 09:46:21--  https://doc-04-ao-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/upmhvf7542549td7v7or2b6cmhh9e39g/1701942375000/09640189477530773141/*/1JIWIP_Hvzu69bCDz4Dz1xgs6sXzQXzG-?e=download&uuid=f769b128-41c0-4c8e-ad00-a4befbc8ee7d
Resolving doc-04-ao-docs.googleusercontent.com (doc-04-ao-docs.googleusercontent.com)... 173.194.193.132, 2607:

In [None]:
import pandas as pd
import numpy as np

from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.evaluation.measures import CoherenceNPMI, CoherenceUCI, CoherenceUMASS
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords
import nltk
from nltk.corpus import stopwords as stop_words


In [None]:
# prepare the documents

# the NLTK stop word corpus has to be downloaded before it is read further down
nltk.download('stopwords')

df = pd.read_csv('data/enriched_data.csv')

# raw description strings with leading/trailing whitespace removed
documents = []
for raw_text in df['Description'].values:
  documents.append(raw_text.strip())

# whitespace-tokenised descriptions
descriptions = df['Description'].str.split()

# whitespace-tokenised preprocessed descriptions;
# this is used by the coherence model for context windows
coherence_docs = df['PreprocessedDescription'].str.split()

# build the bag-of-words side: English stop words are passed to the preprocessor
# together with a vocabulary size of 2000
stopwords = list(stop_words.words("english"))
sp = WhiteSpacePreprocessingStopwords(
    documents,
    stopwords,
    vocabulary_size=2000,
)
(preprocessed_documents,
 unpreprocessed_corpus,
 vocab,
 retained_indices) = sp.preprocess()

In [None]:
# inspection helper, left disabled: uncomment the next line to display the unpreprocessed corpus
#unpreprocessed_corpus

In [None]:
# The corpus is English only, so a RoBERTa-based SBERT model is used to produce
# the contextual embeddings that the CTM consumes.
tp = TopicModelDataPreparation("paraphrase-distilroberta-base-v2")

# Fit on both views of the corpus: the unpreprocessed text feeds the contextual
# side, the preprocessed text feeds the bag-of-words side.
training_dataset = tp.fit(
    text_for_bow=preprocessed_documents,
    text_for_contextual=unpreprocessed_corpus,
)

In [None]:
# show the first 10 entries of the vocabulary held by the data preparation object
tp.vocab[:10]


In [None]:
# peek at the first two documents of each corpus: unpreprocessed first, then preprocessed
for corpus in (unpreprocessed_corpus, preprocessed_documents):
  print(corpus[:2])

In [None]:
# Train one CombinedTM per candidate topic count and record the NPMI coherence
# of each model's top-10 topic words.
number_of_topics = range(45, 65)
# alternative (wider) sweep:
#number_of_topics = [1,2,3,5,8,13,21,34,55,89,144]
ctm_coherence_scores = []

for n_topics in number_of_topics:
  print(n_topics, "topics")
  ctm = CombinedTM(
      bow_size=len(tp.vocab),
      contextual_size=768,
      n_components=n_topics,
      num_epochs=10,
  )
  ctm.fit(training_dataset)

  top_words = ctm.get_topic_lists(10)
  npmi = CoherenceNPMI(texts=coherence_docs, topics=top_words)
  # alternative measure:
  #npmi = CoherenceUMASS(texts=coherence_docs, topics=top_words)
  ctm_coherence_scores.append(npmi.score())

# display the collected scores, one per entry in number_of_topics
ctm_coherence_scores

In [None]:
# display the collected coherence scores (one per entry in number_of_topics)
ctm_coherence_scores

In [None]:
import matplotlib.pyplot as plt

# Plot the coherence score obtained for each number of topics tried in the sweep.
plt.plot(
    number_of_topics,
    ctm_coherence_scores,
    marker='x',
    label='CTM Coherence Scores'
)

plt.xlabel('Number of Topics')
# ctm_coherence_scores holds values computed with CoherenceNPMI, so the axis is labelled NPMI
plt.ylabel('Coherence Score (NPMI)')
plt.title('Coherence Scores for CTM topic models')
plt.legend()
plt.show()

In [None]:
# Train a single model with a fixed number of topics and show the top 5 words of each topic.
# NOTE(review): a previous TODO on this cell asked for k=5, but num_topics is 8 —
# confirm which value is intended.
num_topics = 8
ctm = CombinedTM(
    bow_size=len(tp.vocab),
    contextual_size=768,
    n_components=num_topics,
    num_epochs=10,
)
ctm.fit(training_dataset)
ctm.get_topic_lists(5)


In [None]:
# print the topics in a LaTeX-friendly format: one \makecell{ ... } line per topic,
# containing that topic's top 5 words separated by spaces
topics = ctm.get_topic_lists(5)
# a plain loop instead of a list comprehension: the comprehension was only used for
# its side effect and made the cell also display a list of None values
for topic in topics:
  print("\\makecell{", " ".join(topic), "}")