In [None]:
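# Generate the top topics for each document using NMF (non-negative matrix factorization).
# Outputs a dataframe that lists, for each app, the 3 topics it is most strongly related to
# together with the corresponding NMF topic weights (unnormalised scores, not true probabilities).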

# Partly inspired by NMF topic modelling tutorial 
# at https://www.kaggle.com/code/rockystats/topic-modelling-using-nmf

import pandas as pd 

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer


# Read in the enriched app data; the preprocessed description column is the modelling corpus.
df = pd.read_csv('../../data/enriched_data.csv')

# read_csv turns empty cells into NaN, and TfidfVectorizer rejects NaN documents,
# so represent a missing description as an empty string instead.
documents = df['PreprocessedDescription'].fillna('')


In [None]:
# TF-IDF settings:
#   min_df=3           -> drop terms that appear in fewer than 3 descriptions
#   max_features=2000  -> keep only the 2000 most frequent terms in the vocabulary
#   ngram_range=(1, 1) -> single words only (unigrams)
vectorizer = TfidfVectorizer(min_df=3, max_features=2000, ngram_range=(1, 1))

# Learn the vocabulary and build the TF-IDF document-term matrix in one pass.
tfidf_vocabulary = vectorizer.fit_transform(documents)

# Array mapping each column id of the matrix back to its term.
tfidf_word_id_map = vectorizer.get_feature_names_out()


In [None]:
# Do the non-negative matrix factorization of the TF-IDF document-term matrix
# TODO: explore the number of topics we want separately
nmf = NMF(
    n_components=35, # number of topics to generate
    init='nndsvd',
    random_state=42  # pin the seed so the topics are reproducible across Restart & Run All
).fit(tfidf_vocabulary)

In [None]:
# Document-by-topic weight matrix: one row per document, one column per topic.
# The TF-IDF matrix built when the vectorizer was fitted is reused here; transforming
# the same documents again would only recompute an identical matrix.
H_doc_by_topic = nmf.transform(tfidf_vocabulary)

n_topic_words = 5

# Label every topic with its n_topic_words highest-weighted terms, strongest term first.
# argsort() is ascending, so reverse it before taking the leading entries.
topics = {
    topic_idx: ' '.join(tfidf_word_id_map[i] for i in topic.argsort()[::-1][:n_topic_words])
    for topic_idx, topic in enumerate(nmf.components_)
}


In [None]:
# Get the n_top_topics highest scoring topics of every document and create a data frame
# that contains those topics next to the app name and the original document text.
# NOTE: the scores are raw NMF weights; the "_p" column suffix is kept because later
# cells rely on it, but the values are not normalised probabilities.
docweights = H_doc_by_topic

topic_strings = []
topic_probabilities = []
n_top_topics = 3

for weight in docweights:
    # indices of the strongest topics for this document, best first
    top_topic_idx = weight.argsort()[::-1][:n_top_topics]
    topic_strings.append([topics[i] for i in top_topic_idx])
    topic_probabilities.append([weight[i] for i in top_topic_idx])

# pd.concat(axis=1) aligns rows on index labels, so give the helper frames df's index
# explicitly; otherwise rows would be mismatched as soon as df has a non-default index.
topic_df = pd.concat(
    [
        df['App_Name'],
        documents,
        pd.DataFrame(topic_strings, index=df.index),
        pd.DataFrame(topic_probabilities, index=df.index),
    ],
    axis=1,
)

# Derive the column names from n_top_topics so the two cannot drift apart.
topic_ranks = range(1, n_top_topics + 1)
topic_df.columns = (
    ['AppName', 'TrimmedDescription']
    + [f'Topic_{rank}' for rank in topic_ranks]
    + [f'Topic_{rank}_p' for rank in topic_ranks]
)



In [None]:
# Display the per-app topic table (pandas truncates the rendering of long frames)
topic_df

Unnamed: 0,AppName,TrimmedDescription,Topic_1,Topic_2,Topic_3,Topic_1_p,Topic_2_p,Topic_3_p
0,AAA³ Little Ponies & Unicorns,welcome land fairies puzzle game toddlers kids...,puzzles puzzle jigsaw pieces solve,kids children learning child learn,game center best games playing,0.061556,0.049068,0.022431
1,GA1: An Assassin in Orlandes,interactive fantasy adventure book game rpg ga...,dice roll board rolling die,battle enemies world fight new,iphone ipad touch ipod universal,0.048158,0.038820,0.020225
2,Baby Games App (by HAPPYTOUCH®),plenty press touch explore children ages happy...,kids children learning child learn,animals animal farm zoo cute,app free purchases like use,0.060599,0.050713,0.041175
3,Hadean Lands,winner xyzzy interactive fiction awards best p...,puzzles puzzle jigsaw pieces solve,game center best games playing,battle enemies world fight new,0.040001,0.027388,0.017816
4,Pango and friends,discover unpublished pango stories interactive...,kids children learning child learn,animals animal farm zoo cute,fish bigfi games big bigfishgames,0.038539,0.018421,0.013402
...,...,...,...,...,...,...,...,...
12896,Touchdown Hero: New Season,new season finally field outmanoeuvre defender...,player play players online friends,tap score screen points right,racing race tracks speed world,0.019077,0.018811,0.016797
12897,Hidden Objects USA Time Object,vacation usa explore beautiful travel spots me...,levels level difficulty challenge challenging,word words letters letter search,fish bigfi games big bigfishgames,0.035260,0.034364,0.030222
12898,Slots! Golden Cherry,number casino slot machine real play free fore...,slots casino slot vegas win,fun great addictive music graphics,player play players online friends,0.112477,0.027091,0.012178
12899,XAirports,xairports utility app allows owner popular fli...,app free purchases like use,racing race tracks speed world,numbers number math brain memory,0.012379,0.012096,0.010692


In [None]:
# Explore the results

# For each unique top topic, show the app in which that topic scores highest.
# groupby(...).max() would take the maximum of every column independently (strings
# alphabetically), producing rows that mix values from different apps. Selecting whole
# rows through idxmax keeps the app name, description and secondary topics consistent.
best_row_per_topic = topic_df.groupby('Topic_1')['Topic_1_p'].idxmax()
topic_df.loc[best_row_per_topic].set_index('Topic_1').sort_values(by='Topic_1_p', ascending=False)

Unnamed: 0_level_0,AppName,TrimmedDescription,Topic_2,Topic_3,Topic_1_p,Topic_2_p,Topic_3_p
Topic_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
truck garbage trucks monster vehicles,Wrecking Ball Truck,young children fascinated things colorful puzz...,zombies zombie weapons shoot shooting,zombies zombie weapons shoot shooting,0.355203,0.148641,0.070769
poker texas chips hold hand,Zynga Poker - Texas Holdem,＊＊＊ play way success ＊＊＊ video poker player wo...,tiles mahjong tile board remove,word words letters letter search,0.350608,0.129603,0.054856
zombies zombie weapons shoot shooting,Zompy Jumpy - Zombie Jump,★★★★★ tired building best farm try destroying ...,word words letters letter search,truck garbage trucks monster vehicles,0.300987,0.116184,0.076828
tabtale privacy policy com terms,Yatzy Multiplayer - Dice Game,★★★★★ sally salon limited time ★★★★★ sally sal...,truck garbage trucks monster vehicles,zombies zombie weapons shoot shooting,0.286363,0.104762,0.044331
escape room objects hidden solve,XIII Lost Identity – HD,“allghoi heard indiana heard allghoi legendary...,tiles mahjong tile board remove,zombies zombie weapons shoot shooting,0.276529,0.109622,0.049418
christmas santa presents holiday time,Xmas Jigsaws Puzzle Game: Farm,•pop groups christmas bubbles icon •collect sc...,zombies zombie weapons shoot shooting,word words letters letter search,0.272733,0.126515,0.065183
dice roll board rolling die,YourGameDice,„wer war ist genau die richtige wahl für logik...,zombies zombie weapons shoot shooting,zombies zombie weapons shoot shooting,0.264924,0.103524,0.084713
ball balls bowling soccer physics,ZigZag,★★★★★ highly addictive brick breaker style gam...,word words letters letter search,zombies zombie weapons shoot shooting,0.258712,0.107048,0.073983
tiles mahjong tile board remove,Words Alone Lite,◉this famous zhongyuan mahjong solitaire zmahj...,word words letters letter search,zombies zombie weapons shoot shooting,0.249792,0.138935,0.065652
bubble bubbles pop shooter popping,Winter Pop: Save the Snowman,welcome bubble bath knowledge people world com...,word words letters letter search,zombies zombie weapons shoot shooting,0.249585,0.103144,0.055993


In [None]:
# Most dominant topic across the documents based on the summed topic weights.
# The first parameter of GroupBy.sum is numeric_only, not a column name, so request
# the numeric-only aggregation explicitly instead of passing a truthy string.
topic_df.groupby('Topic_1').sum(numeric_only=True).sort_values(by='Topic_1_p', ascending=False)

Unnamed: 0_level_0,Topic_1_p,Topic_2_p,Topic_3_p
Topic_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
battle enemies world fight new,64.710132,32.006937,20.989679
puzzles puzzle jigsaw pieces solve,61.066347,28.756246,17.526684
ball balls bowling soccer physics,43.207403,15.904337,9.982221
solitaire cards card klondike spider,41.832404,14.347964,8.964667
com www http facebook twitter,41.250366,23.998754,16.330228
fun great addictive music graphics,40.354193,24.883268,17.127295
subscription period account gameclub renewal,37.761056,10.625606,5.23597
kids children learning child learn,35.590103,20.854341,12.880889
questions quiz trivia guess knowledge,35.144604,11.270089,7.109139
animals animal farm zoo cute,33.92533,15.034157,9.399699


In [None]:
# How often each topic is the top topic of a document, most frequent first.
# This counts rows only, ignoring how large the topic weights are.
(
    topic_df
    .groupby(by='Topic_1')
    .count()
    .sort_values('Topic_1_p', ascending=False)
)

Unnamed: 0_level_0,AppName,TrimmedDescription,Topic_2,Topic_3,Topic_1_p,Topic_2_p,Topic_3_p
Topic_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
battle enemies world fight new,1249,1249,1249,1249,1249,1249,1249
fun great addictive music graphics,803,803,803,803,803,803,803
puzzles puzzle jigsaw pieces solve,717,717,717,717,717,717,717
com www http facebook twitter,710,710,710,710,710,710,710
player play players online friends,577,577,577,577,577,577,577
kids children learning child learn,547,547,547,547,547,547,547
app free purchases like use,510,510,510,510,510,510,510
iphone ipad touch ipod universal,493,493,493,493,493,493,493
game center best games playing,492,492,492,492,492,492,492
solitaire cards card klondike spider,491,491,491,491,491,491,491


In [None]:

import topicwizard
from sklearn.pipeline import make_pipeline

# Interactive topic visualisation.
# The model here is a scikit-learn NMF, so wrap the already fitted vectorizer and model
# in a scikit-learn pipeline; gensim_pipeline is meant for gensim models, and the
# `dictionary` / `corpus` names it was called with are not defined in this notebook.
# NOTE(review): the ImportError recorded for this cell ('_fit_context' missing from
# sklearn.base) looks like an environment issue - the installed scikit-learn is
# presumably older than what topicwizard requires; upgrade it before running this cell.
pipeline = make_pipeline(vectorizer, nmf)
topicwizard.visualize(pipeline=pipeline, corpus=documents.tolist())



ImportError: cannot import name '_fit_context' from 'sklearn.base' (/Users/tsido/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py)

: 