<a href="https://colab.research.google.com/github/tsido/lda-thesis/blob/main/topic_extraction/NMF/NMF_octis_coherence_scores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook for evaluating the NMF topic model (OCTIS coherence and diversity scores)



In [1]:
# Fetch the app store data from Google Drive
!mkdir data
!wget --no-check-certificate --output-document=data/enriched_data.csv 'https://docs.google.com/uc?export=download&id=1JIWIP_Hvzu69bCDz4Dz1xgs6sXzQXzG-'


--2023-12-11 12:28:09--  https://docs.google.com/uc?export=download&id=1JIWIP_Hvzu69bCDz4Dz1xgs6sXzQXzG-
Resolving docs.google.com (docs.google.com)... 172.217.214.138, 172.217.214.101, 172.217.214.113, ...
Connecting to docs.google.com (docs.google.com)|172.217.214.138|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-04-ao-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/p6u2u52nu22e1hfke6jelen5kej12r89/1702297650000/09640189477530773141/*/1JIWIP_Hvzu69bCDz4Dz1xgs6sXzQXzG-?e=download&uuid=e7dc134b-15d0-45b3-9a9a-74d8e40d7a6d [following]
--2023-12-11 12:28:37--  https://doc-04-ao-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/p6u2u52nu22e1hfke6jelen5kej12r89/1702297650000/09640189477530773141/*/1JIWIP_Hvzu69bCDz4Dz1xgs6sXzQXzG-?e=download&uuid=e7dc134b-15d0-45b3-9a9a-74d8e40d7a6d
Resolving doc-04-ao-docs.googleusercontent.com (doc-04-ao-docs.googleusercontent.com)... 142.250.128.132, 2607:

In [2]:
# install Octis and other required libraries
!pip install octis

Collecting octis
  Downloading octis-1.13.1-py2.py3-none-any.whl (130 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.0/131.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gensim==4.2.0 (from octis)
  Downloading gensim-4.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn==1.1.0 (from octis)
  Downloading scikit_learn-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-optimize>=0.8.1 (from octis)
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting numpy==1.23.0 (from oct

In [9]:
import pandas as pd

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

from octis.optimization.optimizer import Optimizer
from octis.models.NMF_scikit import NMF_scikit

from octis.evaluation_metrics.coherence_metrics import Coherence


In [4]:
# Prepare the TF-IDF input for the NMF model.
# The final comparison was done with 46 topics.
num_topics = 46

# Load the enriched app data; the preprocessed descriptions form the corpus.
df = pd.read_csv('data/enriched_data.csv')
documents = df['PreprocessedDescription']

vectorizer_settings = dict(
    ngram_range=(1, 1),  # single words only
    min_df=3,            # skip terms that occur in fewer than 3 descriptions
    max_features=2000,   # cap the vocabulary at the 2000 most frequent terms
)
vectorizer = TfidfVectorizer(**vectorizer_settings)

# Learn the vocabulary and weight the corpus in one pass.
tfidf_vocabulary = vectorizer.fit_transform(documents)  # TF-IDF document-term matrix
tfidf_word_id_map = vectorizer.get_feature_names_out()  # term belonging to each feature index


In [75]:
# Fit the NMF topic model on the TF-IDF matrix.
# Alternative configuration that was also evaluated: init='nndsvda' together with
# solver='mu' (the multiplicative update solver, as that is described in the theory).
nmf_settings = dict(
    n_components=num_topics,  # number of topics to generate
    init='nndsvd',
    solver='cd',
    random_state=1,
)
nmf = NMF(**nmf_settings).fit(tfidf_vocabulary)




In [76]:
# Document-by-topic weight matrix (one row per document, one column per topic).
# The corpus is already vectorised in `tfidf_vocabulary`, so that matrix is reused
# instead of running the vectorizer over every document a second time.
H_doc_by_topic = nmf.transform(tfidf_vocabulary)

n_topic_words = 5

# Describe every topic by its highest-weighted terms, strongest first.
# `argsort()[::-1][:n]` is the same "descending, take the first n" form used for
# the per-document topic ranking.
topics = {
    topic_idx: ' '.join(
        tfidf_word_id_map[i] for i in topic.argsort()[::-1][:n_topic_words]
    )
    for topic_idx, topic in enumerate(nmf.components_)
}


In [77]:
# For every document, pick its highest-weighted topics and build a data frame that
# holds the app name, the document text, the topic descriptions and their weights.
# NOTE: the `*_p` columns hold the values of `H_doc_by_topic` as-is; nothing in this
# cell normalises them, so they are weights rather than true probabilities.
docweights = H_doc_by_topic

n_top_topics = 3
# Never ask for more topics than the model has, so the column count stays consistent.
n_selected_topics = min(n_top_topics, docweights.shape[1])

topic_strings = []
topic_probabilities = []
for weight in docweights:
    # Topic indices ordered by descending weight, truncated to the requested count.
    top_topic_idx = weight.argsort()[::-1][:n_selected_topics]
    topic_strings.append([topics[i] for i in top_topic_idx])
    topic_probabilities.append([weight[i] for i in top_topic_idx])

# Column names follow the number of selected topics instead of being hard-coded for
# three; with n_top_topics = 3 they are Topic_1..Topic_3 and Topic_1_p..Topic_3_p.
topic_columns = [f'Topic_{rank}' for rank in range(1, n_selected_topics + 1)]
weight_columns = [f'{name}_p' for name in topic_columns]

# The helper frames reuse the documents' index so that concat aligns row by row
# even if `df` does not carry a default 0..n-1 index.
topic_df = pd.concat(
    [
        df['App_Name'],
        documents,
        pd.DataFrame(topic_strings, index=documents.index),
        pd.DataFrame(topic_probabilities, index=documents.index),
    ],
    axis=1,
)
topic_df.columns = ['AppName', 'TrimmedDescription', *topic_columns, *weight_columns]



In [78]:
# Score topic coherence (NPMI) with OCTIS.
# The topics and the documents are reshaped into the dict layout that the OCTIS
# metric is called with: topics as word lists, documents as token lists.
from octis.evaluation_metrics.coherence_metrics import Coherence

octis_topic_model = {
    'topics': [topic_words.split(' ') for topic_words in topics.values()],
    'texts': [doc.split(' ') for doc in documents],
}

nmpi = Coherence(
    measure='c_npmi',
    topk=n_topic_words,
    texts=octis_topic_model['texts'],
)
coherence_score = nmpi.score(octis_topic_model)
print('Coherence: ' + str(coherence_score))


# Recorded results
#
# init='nndsvda', solver='mu':
#   topk=5  Coherence: 0.2516171796392429
#   topk=10 Coherence: 0.16116779285799177
#
# init='nndsvd', solver='cd':
#   topk=5  Coherence: 0.27146724729511523
#   topk=10 Coherence: 0.16773117323704292


Coherence: 0.27146724729511523


In [79]:
# Score topic diversity with OCTIS over the top `n_topic_words` words of each topic.
from octis.evaluation_metrics.diversity_metrics import TopicDiversity

diversity = TopicDiversity(topk=n_topic_words)
diversity_score = diversity.score(octis_topic_model)
print('Diversity: ' + str(diversity_score))

# Recorded results
#
# init='nndsvda', solver='mu':
#   topk=5  Diversity: 0.908695652173913
#   topk=10 Diversity: 0.8108695652173913
#
# init='nndsvd', solver='cd':
#   topk=5  Diversity: 0.9347826086956522
#   topk=10 Diversity: 0.8217391304347826


Diversity: 0.9347826086956522


In [62]:
# Reference snapshots of earlier runs (top 10 words per topic) are kept below as plain
# string literals. A notebook cell only displays the value of its LAST expression, so
# `topics` is evaluated at the very end of the cell; when it came first, re-running
# the cell displayed a snapshot string instead of the current topics.
"""
Topics with CD:
{0: 'game center best playing simple enjoy good played version games',
 1: 'word words letters letter search vocabulary dictionary grid spelling list',
 2: 'cards card deck suit pile hand rules memory game ace',
 3: 'kids learning fun educational learn games puzzles skills preschool coloring',
 4: 'subscription period account renewal charged gameclub current auto purchase subscriptions',
 5: 'puzzles puzzle jigsaw pieces solve logic crossword solving picture free',
 6: 'fish big games enjoy discover virtually promotion release leading library',
 7: 'player board players pieces opponent single checkers tac tic toe',
 8: 'space enemy ships alien ship arcade shoot missions weapons shooter',
 9: 'car racing cars race tracks driving drive speed physics realistic',
 10: 'iphone ipad touch ipod universal ios support retina devices version',
 11: 'numbers number math brain correct time bingo addition memory training',
 12: 'english spanish french german italian languages chinese portuguese russian language',
 13: 'slots casino slot vegas win real money free machines coins',
 14: 'tiles mahjong tile board remove matching match layouts pairs puzzle',
 15: 'chess moves board games engine pieces piece opponent play improve',
 16: 'solitaire spider klondike games freecell pyramid classic undo draw golf',
 17: 'escape room objects solve download place need ahead free hidden',
 18: 'levels level difficulty challenging complete challenge different easy hard stars',
 19: 'ball balls bowling physics soccer goal pool tilt control football',
 20: 'bubble bubbles pop shooter popping blast shoot match shooting color',
 21: 'sudoku notes row grid column cell pencil grids difficulty number',
 22: 'questions trivia quiz knowledge answer answers choice categories history test',
 23: 'truck garbage trucks monster vehicles ice cream drive boat vehicle',
 24: 'battle enemies heroes monsters fight rpg powerful strategy battles epic',
 25: 'animals animal farm zoo sounds cute toddlers funny sheep wild',
 26: 'blocks color block match coloring puzzle colors clear lines board',
 27: 'dice roll rolls rolling die board backgammon und rules shake',
 28: 'fun great addictive graphics music gameplay sound simple effects easy',
 29: 'poker texas chips hold video hand tournaments table hands casino',
 30: 'zombies zombie weapons undead survive guns survival gun shooting kill',
 31: 'baby dress pet little cute fashion hair pets like help',
 32: 'tap screen button left right fly red bird tapping finger',
 33: 'mode modes challenge time normal arcade multiplayer classic single survival',
 34: 'children child learning learn parents educational skills scene apps sounds',
 35: 'play friends online players multiplayer opponents chat facebook challenge family',
 36: 'hidden adventure object objects story mystery cradle island city secrets',
 37: 'score points high scores bonus time possible beat highest try',
 38: 'new unlock brand experience create challenges you gameplay characters games',
 39: 'christmas santa presents holiday season time edition family year gifts',
 40: 'app free store purchases apps support thank best purchase review',
 41: 'tabtale privacy policy app limited certain users sites device parents',
 42: 'guess friends movie guessing football test quiz logo hints fun',
 43: 'jump coins run obstacles collect avoid coin jumping running ninja',
 44: 'world real best city players compete countries ranking time country',
 45: 'para que cartas los del com juego las online les'}
 """


""" topics with mu solver:
{0: 'game center simple best playing enjoy support good leaderboards play',
 1: 'word words letters letter search vocabulary dictionary grid spelling list',
 2: 'solitaire cards card klondike spider suit deck freecell pyramid games',
 3: 'kids learning fun educational games learn app skills preschool puzzles',
 4: 'subscription period account renewal gameclub charged current auto purchase subscriptions',
 5: 'puzzles puzzle jigsaw pieces solve logic crossword solving picture free',
 6: 'fish big games discover enjoy virtually promotion leading release sign',
 7: 'play player players online multiplayer friends opponents opponent single board',
 8: 'battle enemies enemy war weapons fight heroes space strategy combat',
 9: 'car racing cars race tracks driving drive speed physics realistic',
 10: 'iphone ipad touch ipod universal support ios retina app version',
 11: 'english spanish french german languages italian chinese portuguese language russian',
 12: 'tiles mahjong tile board remove layouts matching solitaire match pairs',
 13: 'slots casino slot vegas win real free money machines coins',
 14: 'numbers number math brain color colors correct time bingo grid',
 15: 'chess moves board pieces games piece engine opponent play human',
 16: 'guess friends facebook share challenge test family twitter fun brain',
 17: 'escape room objects solve download place need ahead free use',
 18: 'levels level difficulty challenging complete challenge different easy hard stars',
 19: 'ball balls bowling physics soccer goal control pool tilt football',
 20: 'bubble bubbles pop shooter popping blast shoot color shooting match',
 21: 'sudoku notes grid column row cell pencil difficulty grids unlimited',
 22: 'questions trivia quiz knowledge answer answers choice categories history multiple',
 23: 'truck garbage trucks monster vehicles ice cream drive boat vehicle',
 24: 'adventure story island characters mysterious dragon atlantis explore villagers dracula',
 25: 'animals animal farm zoo sounds cute toddlers funny sheep wild',
 26: 'blocks block color match puzzle clear colors coloring board lines',
 27: 'dice roll rolls board rolling backgammon games shake classic rules',
 28: 'fun great graphics music addictive gameplay sound easy effects play',
 29: 'poker texas chips hold video hand cards tournaments table hands',
 30: 'zombies zombie weapons undead survive shoot shooting guns survival gun',
 31: 'tac tic toe player row line pieces board grid classic',
 32: 'tap screen button left right fly red bird finger play',
 33: 'mode modes challenge time arcade normal multiplayer classic survival single',
 34: 'children child learning parents learn educational skills apps scene app',
 35: 'para que cartas com del los juego las online les',
 36: 'hidden object objects cradle city rome trails mystery golden match',
 37: 'score points high scores bonus possible highest time beat earn',
 38: 'new games unlock create best you brand experience like way',
 39: 'christmas santa presents holiday time season edition year pieces family',
 40: 'app free real money purchases like use want device settings',
 41: 'tabtale privacy app policy limited certain users contact purposes parents',
 42: 'und die der med des app les och man quiz',
 43: 'jump run coins obstacles collect avoid coin running jumping ninja',
 44: 'world compete real best countries ranking players friends country explore',
 45: 'baby dress pet little hair fashion cute pets care cat'}
 """

# Show the topics produced by the current run.
topics


{0: 'game center best playing simple enjoy good played version games',
 1: 'word words letters letter search vocabulary dictionary grid spelling list',
 2: 'cards card deck suit pile hand rules memory game ace',
 3: 'kids learning fun educational learn games puzzles skills preschool coloring',
 4: 'subscription period account renewal charged gameclub current auto purchase subscriptions',
 5: 'puzzles puzzle jigsaw pieces solve logic crossword solving picture free',
 6: 'fish big games enjoy discover virtually promotion release leading library',
 7: 'player board players pieces opponent single checkers tac tic toe',
 8: 'space enemy ships alien ship arcade shoot missions weapons shooter',
 9: 'car racing cars race tracks driving drive speed physics realistic',
 10: 'iphone ipad touch ipod universal ios support retina devices version',
 11: 'numbers number math brain correct time bingo addition memory training',
 12: 'english spanish french german italian languages chinese portuguese rus