In [17]:
# Generate top topics for documents using NMF
# Outputs a dataframe for each app with top 3 topics they are related to, and the topics' probabilities

# Partly inspired by NMF topic modelling tutorial
# at https://www.kaggle.com/code/rockystats/topic-modelling-using-nmf

import pandas as pd

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

# set number of topics
num_topics=5

# read in the document data
df = pd.read_csv('../../data/enriched_data.csv')

documents = df['PreprocessedDescription']


In [18]:
vectorizer = TfidfVectorizer(
    min_df=3,        #don't add terms that appear less than 3 of the descriptions to the vocabulary
    max_features=2000, # limit to 2000 most frequent terms
    ngram_range=(1, 1)
)


tfidf_vocabulary = vectorizer.fit_transform(documents)
tfidf_word_id_map = vectorizer.get_feature_names_out()


In [19]:
# Do the non-negative matrix factorization
# TODO: explore the number of topics we want separately
nmf = NMF(
    n_components=num_topics, # number of topics to generate
    init='nndsvd'
).fit(tfidf_vocabulary)


In [20]:
# Getting a df with each topic by document
H_doc_by_topic = nmf.transform(vectorizer.transform(documents))

n_topic_words = 5

topics = {}
for topic_idx, topic in enumerate(nmf.components_):
    t = (topic_idx)
    topics[t] = ' '.join([tfidf_word_id_map[i] for i in topic.argsort()[:(-n_topic_words - 1): -1]])


In [21]:
# get top 3 scoring topics and create a data frame that contains the topics and the original document text
docweights = H_doc_by_topic

topic_strings = []
topic_probabilities = []
n_top_topics = 3

for weight in docweights:
    top_topic_idx = weight.argsort()[::-1][:n_top_topics]
    topic_strings.append([topics[i] for i in top_topic_idx])
    topic_probabilities.append([weight[i] for i in top_topic_idx])

topic_df = pd.concat([df['App_Name'], documents, pd.DataFrame(topic_strings), pd.DataFrame(topic_probabilities)], axis=1)
topic_df.columns = ['AppName', 'TrimmedDescription', 'Topic_1', 'Topic_2', 'Topic_3', 'Topic_1_p', 'Topic_2_p', 'Topic_3_p']



In [22]:
topic_df


Unnamed: 0,AppName,TrimmedDescription,Topic_1,Topic_2,Topic_3,Topic_1_p,Topic_2_p,Topic_3_p
0,AAA³ Little Ponies & Unicorns,welcome land fairies puzzle game toddlers kids...,puzzles kids puzzle children app,subscription period account renewal gameclub,cards solitaire card game play,0.075075,0.000000,0.000000
1,GA1: An Assassin in Orlandes,interactive fantasy adventure book game rpg ga...,game play new levels world,puzzles kids puzzle children app,subscription period account renewal gameclub,0.038181,0.008148,0.003314
2,Baby Games App (by HAPPYTOUCH®),plenty press touch explore children ages happy...,puzzles kids puzzle children app,subscription period account renewal gameclub,game play new levels world,0.094701,0.002759,0.000243
3,Hadean Lands,winner xyzzy interactive fiction awards best p...,game play new levels world,puzzles kids puzzle children app,cards solitaire card game play,0.034640,0.025919,0.002735
4,Pango and friends,discover unpublished pango stories interactive...,puzzles kids puzzle children app,cards solitaire card game play,game play new levels world,0.055988,0.011154,0.010588
...,...,...,...,...,...,...,...,...
12896,Touchdown Hero: New Season,new season finally field outmanoeuvre defender...,game play new levels world,word words letters letter game,subscription period account renewal gameclub,0.026045,0.001838,0.000706
12897,Hidden Objects USA Time Object,vacation usa explore beautiful travel spots me...,word words letters letter game,game play new levels world,puzzles kids puzzle children app,0.061564,0.028081,0.019840
12898,Slots! Golden Cherry,number casino slot machine real play free fore...,game play new levels world,cards solitaire card game play,subscription period account renewal gameclub,0.034885,0.026399,0.021423
12899,XAirports,xairports utility app allows owner popular fli...,game play new levels world,puzzles kids puzzle children app,subscription period account renewal gameclub,0.015326,0.008405,0.005726


In [23]:
# Explore the results

# get the most high scores for each unique topic
topic_df.groupby('Topic_1').max().sort_values(by='Topic_1_p', ascending=False)


Unnamed: 0_level_0,AppName,TrimmedDescription,Topic_2,Topic_3,Topic_1_p,Topic_2_p,Topic_3_p
Topic_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
subscription period account renewal gameclub,Zombie Match Defense,町一個分のオープンワールドで冒険する精神的新作「wonder――あそびもふしぎも、大冒険も。...,word words letters letter game,word words letters letter game,0.299693,0.150469,0.042683
word words letters letter game,Zgadywanka - guess what party?,●●● word pics guessing quiz latest word game c...,subscription period account renewal gameclub,subscription period account renewal gameclub,0.238788,0.137703,0.038655
cards solitaire card game play,Zotz!,＊＊＊ play way success ＊＊＊ video poker player wo...,word words letters letter game,word words letters letter game,0.21718,0.139286,0.038306
puzzles kids puzzle children app,Zoombinis,💓✨💘🌟🚃travel “long long time ago” wake kid open...,word words letters letter game,word words letters letter game,0.187204,0.120197,0.040818
game play new levels world,Zynga Poker - Texas Holdem,＊＊＊ play keno caveman style ＊＊＊ hit eggs multi...,word words letters letter game,word words letters letter game,0.087785,0.056688,0.036313


In [24]:
# Most dominant topic across the documents based on the probability
topic_df.groupby('Topic_1').sum('Topic_1_p').sort_values(by='Topic_1_p', ascending=False)


Unnamed: 0_level_0,Topic_1_p,Topic_2_p,Topic_3_p
Topic_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
game play new levels world,323.21355,84.692894,30.539212
puzzles kids puzzle children app,135.503493,40.804679,14.0233
cards solitaire card game play,82.076307,16.151365,5.242225
word words letters letter game,75.155212,16.929191,5.212999
subscription period account renewal gameclub,51.191627,8.879229,2.737355


In [25]:
# Most frequent topics across the documents based on counts, i.e. which topic appears the most
# regardless of the probability sum
topic_df.groupby('Topic_1').count().sort_values(by='Topic_1_p', ascending=False)


Unnamed: 0_level_0,AppName,TrimmedDescription,Topic_2,Topic_3,Topic_1_p,Topic_2_p,Topic_3_p
Topic_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
game play new levels world,8653,8653,8653,8653,8653,8653,8653
puzzles kids puzzle children app,2297,2297,2297,2297,2297,2297,2297
cards solitaire card game play,884,884,884,884,884,884,884
word words letters letter game,698,698,698,698,698,698,698
subscription period account renewal gameclub,369,369,369,369,369,369,369


In [27]:

import topicwizard
from topicwizard.pipeline import make_topic_pipeline

from sklearn.feature_extraction.text import CountVectorizer

bow_vectorizer = CountVectorizer()
nmf = NMF(n_components=num_topics)
pipeline = make_topic_pipeline(bow_vectorizer, nmf)
pipeline.fit(documents)

app = topicwizard.visualize(corpus=documents, pipeline=pipeline)



Preprocessing
