In [59]:
# Generate the top topics for each document using NMF.
# Outputs a dataframe listing, for each app, the 3 topics it is most strongly related to and those topics' weights (the `_p` columns).

# Partly inspired by NMF topic modelling tutorial 
# at https://www.kaggle.com/code/rockystats/topic-modelling-using-nmf

import pandas as pd 
import numpy as np
import string

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

# trim the vocabulary with stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# read in the document data
# Later cells read the 'Description' and 'App_Name' columns of this frame.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv('../../data/enriched_data.csv')


# NMF preprocessing functions
def clean_whitespace(row):
    """Collapse every run of whitespace in ``row`` into a single space.

    Splitting on whitespace and re-joining the pieces also drops any
    leading or trailing whitespace.
    """
    return ' '.join(row.split())

def strip_punctuation(row):
    """Return ``row`` with every character of ``string.punctuation`` removed.

    Characters are deleted rather than replaced by a space, so words that
    were separated only by punctuation end up joined ("a,b" -> "ab").
    Characters outside ``string.punctuation`` are left untouched.
    """
    # A translation table mapping a character to None deletes that character.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return row.translate(deletion_table)

def lowercase(row):
    """Return a lower-cased copy of the text in ``row``."""
    lowered = row.lower()
    return lowered

def remove_stop_words(row):
    """Drop every whitespace-separated token of ``row`` found in ENGLISH_STOP_WORDS.

    Membership is an exact string match against the stop-word set, so the
    text should already be lower-cased (``preprocess`` does this first).
    The surviving tokens are re-joined with single spaces.
    """
    kept_tokens = []
    for token in row.split():
        if token in ENGLISH_STOP_WORDS:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)


# combine separate preprocessing steps
def preprocess(row):
    """Run the full cleaning pipeline on a single description string.

    The steps are applied in this order: strip punctuation, collapse
    whitespace, lower-case, remove English stop words. Each step receives
    the output of the previous one.
    """
    pipeline = (strip_punctuation, clean_whitespace, lowercase, remove_stop_words)
    cleaned = row
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned


# TrimmedDescriptions will be used to construct the document matrix for
# NMF
# Each 'Description' value is passed through preprocess(); the result is stored
# as a new 'TrimmedDescription' column on df (df itself is modified).
df['TrimmedDescription'] = df['Description'].apply(preprocess)

# Series of cleaned descriptions, one entry per row of df; this is the corpus
# handed to the TF-IDF vectorizer in the next cell.
documents = df['TrimmedDescription']


In [60]:
# Build the TF-IDF document-term matrix that NMF will factorise.
vectorizer = TfidfVectorizer(
    min_df=3,           # skip terms that appear in fewer than 3 of the descriptions
    max_features=5000,  # keep only the 5000 most frequent terms
    ngram_range=(1, 1)  # single words only (unigrams)
)


# One row per document, one column per vocabulary term.
tfidf_vocabulary = vectorizer.fit_transform(documents)
# Vocabulary terms indexed by matrix column; used later to turn topic weights
# back into words. This must be read from `vectorizer`, the object fitted just
# above -- `tfidf_vectorizer` is not defined in the visible cells and raised a
# NameError on a fresh kernel.
tfidf_word_id_map = vectorizer.get_feature_names_out()


In [61]:
# Do the non-negative matrix factorization
# TODO: explore the number of topics we want
nmf = NMF(
    n_components=30,  # number of topics to generate
    init='nndsvd',
    random_state=0,   # fixed seed so re-running the cell reproduces the same topics
).fit(tfidf_vocabulary)

In [76]:
# Per-document topic weights as an array: one row per document, one column per topic.
# The TF-IDF matrix produced when the vectorizer was fitted is reused here;
# calling vectorizer.transform(documents) again would only rebuild the same matrix.
H_doc_by_topic = nmf.transform(tfidf_vocabulary)

n_topic_words = 5

# Label each topic with its n_topic_words highest-weighted vocabulary terms,
# strongest first, joined into one space-separated string.
topics = {}
for topic_idx, topic in enumerate(nmf.components_):
    # argsort is ascending, so reverse it and keep the leading entries
    top_term_ids = topic.argsort()[::-1][:n_topic_words]
    topics[topic_idx] = ' '.join(tfidf_word_id_map[i] for i in top_term_ids)


In [74]:
# Get the top-scoring topics for every document and build a data frame holding
# the app name, the cleaned description, the topic labels and their weights.
# Note: the "_p" columns hold the raw weights returned by nmf.transform; they are
# not normalised to sum to 1, so read them as relative scores rather than
# probabilities in the strict sense.
docweights = H_doc_by_topic

topic_strings = []
topic_probabilities = []
n_top_topics = 3

for weight in docweights:
    # topic indices for this document, highest weight first
    top_topic_idx = weight.argsort()[::-1][:n_top_topics]
    topic_strings.append([topics[i] for i in top_topic_idx])
    topic_probabilities.append([weight[i] for i in top_topic_idx])

topic_df = pd.concat([df['App_Name'], documents, pd.DataFrame(topic_strings), pd.DataFrame(topic_probabilities)], axis=1)
# Derive the column labels from n_top_topics so that changing the number of
# reported topics does not break the assignment (labels are identical to the
# previous hard-coded list when n_top_topics == 3).
topic_ranks = range(1, n_top_topics + 1)
topic_df.columns = (
    ['AppName', 'TrimmedDescription']
    + ['Topic_{}'.format(rank) for rank in topic_ranks]
    + ['Topic_{}_p'.format(rank) for rank in topic_ranks]
)



In [75]:
# Display the assembled per-app topic table (long frames are truncated when rendered).
topic_df

Unnamed: 0,AppName,TrimmedDescription,Topic_1,Topic_2,Topic_3,Topic_1_p,Topic_2_p,Topic_3_p
0,AAA³ Little Ponies & Unicorns,welcome land fairies puzzle game toddlers kids...,kids learning fun educational learn,puzzles puzzle jigsaw pieces solve,game fun simple addictive great,0.071003,0.036524,0.019223
1,GA1: An Assassin in Orlandes,interactive fantasy adventure book game rpg ga...,battle enemies weapons world fight,dice roll rolls yatzy points,iphone ipad touch ipod app,0.031390,0.019959,0.014119
2,Baby Games App (by HAPPYTOUCH®),theres plenty press touch explore children age...,children child scene learning app,animals animal farm zoo cute,iphone ipad touch ipod app,0.122700,0.032197,0.032015
3,Baby Games from HAPPYTOUCH®,theres plenty press touch explore children age...,children child scene learning app,animals animal farm zoo cute,iphone ipad touch ipod app,0.122700,0.032197,0.032015
4,Hadean Lands,winner xyzzy interactive fiction awards best p...,game fun simple addictive great,puzzles puzzle jigsaw pieces solve,battle enemies weapons world fight,0.036021,0.024655,0.016700
...,...,...,...,...,...,...,...,...
13169,Touchdown Hero: New Season,new season finally field outmanoeuvre defender...,play players player friends online,tap jump screen score avoid,car racing cars race tracks,0.015155,0.007901,0.006954
13170,Hidden Objects USA Time Object,vacation usa explore beautiful travel spots me...,levels level difficulty challenging challenge,word words letters letter search,fish big discover games enjoy,0.074346,0.038964,0.026477
13171,Slots! Golden Cherry,number 1 casino slot machine doesnt real play ...,slots casino slot vegas win,cards card deck suit pile,game fun simple addictive great,0.129149,0.013216,0.009825
13172,XAirports,xairports utility app allows owners popular fl...,iphone ipad touch ipod app,battle enemies weapons world fight,escape room objects hidden solve,0.010149,0.008640,0.008582
