In [378]:
# Generate top topics for documents using NMF 
# Outputs a single dataframe with one row per app: the app's top 3 topics and the NMF weight of each topic

# Partly inspired by NMF topic modelling tutorial 
# at https://www.kaggle.com/code/rockystats/topic-modelling-using-nmf

import pandas as pd 

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer


# Read in the enriched app data (the path is relative to the notebook's working directory).
df = pd.read_csv('../../data/enriched_data.csv')

# pandas reads empty CSV fields back as NaN, and TfidfVectorizer refuses NaN
# documents. Replace them with '' instead of dropping the rows so that
# `documents` stays row-aligned with `df`.
documents = df['PreprocessedDescription'].fillna('')


In [379]:
# TF-IDF settings, gathered in one place so they are easy to tune.
tfidf_options = {
    'min_df': 3,            # skip terms that appear in fewer than 3 descriptions
    'max_features': 2000,   # limit the vocabulary to the 2000 most frequent terms
    'ngram_range': (1, 1),  # single words only
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and build the document-by-term TF-IDF matrix in one pass.
tfidf_vocabulary = vectorizer.fit_transform(documents)

# Terms of the learned vocabulary; indexed later by the column positions of the NMF components.
tfidf_word_id_map = vectorizer.get_feature_names_out()


In [380]:
# Do the non-negative matrix factorization of the TF-IDF matrix.
# TODO: explore the number of topics we want separately
nmf = NMF(
    n_components=35,   # number of topics to generate
    init='nndsvd',
    random_state=42,   # fixed seed so that Restart & Run All reproduces the same topics
).fit(tfidf_vocabulary)

In [381]:
# Document-by-topic weight matrix (an array, one row per document).
# The TF-IDF matrix was already computed when the vectorizer was fitted, so
# reuse it here instead of vectorizing the whole corpus a second time.
H_doc_by_topic = nmf.transform(tfidf_vocabulary)

n_topic_words = 5

# Label every topic with its n_topic_words highest-weighted terms, strongest
# first, joined into a single space-separated string keyed by topic index.
topics = {
    topic_idx: ' '.join(
        tfidf_word_id_map[i] for i in topic.argsort()[::-1][:n_topic_words]
    )
    for topic_idx, topic in enumerate(nmf.components_)
}


In [382]:
# For every document, pick its n_top_topics highest-weighted topics and build a
# data frame with the app name, the preprocessed description, the topic labels
# and the matching weights.
# NOTE(review): the "*_p" columns hold the raw NMF weights; nothing in this cell
# normalises them, so they should not be read as true probabilities.
docweights = H_doc_by_topic

topic_strings = []
topic_probabilities = []
n_top_topics = 3

for weight in docweights:
    # argsort is ascending: reverse it, then keep the strongest topics
    top_topic_idx = weight.argsort()[::-1][:n_top_topics]
    topic_strings.append([topics[i] for i in top_topic_idx])
    topic_probabilities.append([weight[i] for i in top_topic_idx])

# Derive the column names from n_top_topics so changing it does not break the
# column assignment below (for 3 topics the names are unchanged).
topic_columns = [f'Topic_{rank}' for rank in range(1, n_top_topics + 1)]
weight_columns = [f'{name}_p' for name in topic_columns]

# Give the helper frames the documents' index explicitly so the column-wise
# concat lines rows up by position even if `df` does not use a default RangeIndex.
topic_df = pd.concat(
    [
        df['App_Name'],
        documents,
        pd.DataFrame(topic_strings, index=documents.index),
        pd.DataFrame(topic_probabilities, index=documents.index),
    ],
    axis=1,
)
topic_df.columns = ['AppName', 'TrimmedDescription'] + topic_columns + weight_columns



In [383]:
# Display the per-app topic table (pandas truncates the rendering of long frames).
topic_df

Unnamed: 0,AppName,TrimmedDescription,Topic_1,Topic_2,Topic_3,Topic_1_p,Topic_2_p,Topic_3_p
0,AAA³ Little Ponies & Unicorns,welcome land fairies puzzle game toddlers kids...,kids fun learning educational learn,puzzles puzzle jigsaw pieces solve,game fun simple center addictive,0.114558,0.035439,0.019932
1,GA1: An Assassin in Orlandes,interactive fantasy adventure book game rpg ga...,dice roll score points rolling,adventure new world story hidden,iphone ipad touch ipod universal,0.055986,0.051974,0.018577
2,Baby Games App (by HAPPYTOUCH®),plenty press touch explore children ages happy...,children child learning learn educational,app free purchases like use,animals animal farm zoo cute,0.090324,0.045237,0.034639
3,Hadean Lands,winner xyzzy interactive fiction awards best p...,adventure new world story hidden,game fun simple center addictive,puzzles puzzle jigsaw pieces solve,0.038714,0.029774,0.025086
4,Pango and friends,discover unpublished pango stories interactive...,adventure new world story hidden,children child learning learn educational,kids fun learning educational learn,0.038789,0.035318,0.031375
...,...,...,...,...,...,...,...,...
12896,Touchdown Hero: New Season,new season finally field outmanoeuvre defender...,player board players pieces game,tap jump score screen avoid,racing race tracks speed physics,0.041782,0.016877,0.014100
12897,Hidden Objects USA Time Object,vacation usa explore beautiful travel spots me...,levels level difficulty challenging easy,adventure new world story hidden,word words letters letter search,0.067113,0.040558,0.034805
12898,Slots! Golden Cherry,number casino slot machine real play free fore...,slots casino slot vegas win,cards card deck suit pile,friends play online players multiplayer,0.112647,0.014415,0.012516
12899,XAirports,xairports utility app allows owner popular fli...,player board players pieces game,app free purchases like use,racing race tracks speed physics,0.021295,0.013819,0.011499
