In [6]:
# Generate topics for documents using NMF 

# Inspired by NMF topic modelling tutorial at https://www.kaggle.com/code/rockystats/topic-modelling-using-nmf
import pandas as pd 
import numpy as np
import string

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer


# trim the vocabulary with stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# read in the document data
df = pd.read_csv('../enriched_data.csv')


# NMF preprocessing functions
def clean_whitespace(row):
    """Collapse every run of whitespace in ``row`` into a single space.

    Leading and trailing whitespace is dropped as well, because
    ``str.split()`` with no arguments discards empty fields.
    """
    return ' '.join(row.split())

# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call, since the function is applied
# to each document in turn.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def strip_punctuation(row):
    """Return ``row`` with all ``string.punctuation`` characters removed.

    Characters are deleted, not replaced with spaces, so "don't" becomes
    "dont" and "a,b" becomes "ab". Characters outside
    ``string.punctuation`` are left untouched.
    """
    return row.translate(_PUNCTUATION_TABLE)

def lowercase(row):
    """Return a lower-cased copy of the string ``row``."""
    lowered = row.lower()
    return lowered

def remove_stop_words(row):
    """Drop every whitespace-separated token of ``row`` found in ENGLISH_STOP_WORDS.

    Each token is tested for exact membership in ENGLISH_STOP_WORDS, and the
    surviving tokens are re-joined with single spaces.
    """
    kept_words = []
    for token in row.split():
        if token in ENGLISH_STOP_WORDS:
            continue
        kept_words.append(token)
    return ' '.join(kept_words)


# combine separate preprocessing steps
def preprocess(row):
    """Run the full cleaning pipeline on one string.

    Steps are applied in this order: strip punctuation, collapse whitespace,
    lower-case, then remove English stop words.
    """
    # Stop-word removal runs last so it sees punctuation-free, lower-cased
    # tokens.
    pipeline = (strip_punctuation, clean_whitespace, lowercase, remove_stop_words)
    cleaned = row
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned


# Store the cleaned text of every description in a new column; this column
# is the corpus used to construct the document matrix for NMF.
df['TrimmedDescription'] = df['Description'].map(preprocess)

documents = df['TrimmedDescription']


0        welcome land fairies puzzle game toddlers kids...
1        interactive fantasy adventure book game rpg ga...
2        theres plenty press touch explore children age...
3        theres plenty press touch explore children age...
4        winner xyzzy interactive fiction awards best p...
                               ...                        
13169    new season finally field outmanoeuvre defender...
13170    vacation usa explore beautiful travel spots me...
13171    number 1 casino slot machine doesnt real play ...
13172    xairports utility app allows owners popular fl...
13173    esta aplicación es uso exclusivo para salones ...
Name: TrimmedDescription, Length: 13174, dtype: object

In [13]:
# Build the TF-IDF document-term matrix from the preprocessed descriptions.
vectorizer = TfidfVectorizer(
    min_df=3,           # ignore terms that appear in fewer than 3 descriptions
    max_features=5000,  # limit the vocabulary to the 5000 most frequent terms
    ngram_range=(1, 1)  # single words only
)


# Matrix with one row per document and one column per vocabulary term.
tfidf_vocabulary = vectorizer.fit_transform(documents)
# Array mapping each matrix column index back to its term.
tfidf_word_id_map = vectorizer.get_feature_names_out()


In [19]:
# Factorize the TF-IDF matrix with non-negative matrix factorization.
# TODO: explore the number of topics we want
nmf = NMF(
    init='nndsvd',
    n_components=30,  # number of topics to generate
)
nmf.fit(tfidf_vocabulary)

In [24]:
# Per-document topic weights: one row per document, one column per topic.
# `tfidf_vocabulary` already holds the TF-IDF matrix for `documents`, so it is
# reused here instead of vectorizing the whole corpus a second time.
H_doc_by_topic = nmf.transform(tfidf_vocabulary)

n_topic_words = 5

# Label each topic with its highest-weighted terms, strongest first.
topics = {
    topic_idx: ' '.join(
        tfidf_word_id_map[i] for i in topic.argsort()[::-1][:n_topic_words]
    )
    for topic_idx, topic in enumerate(nmf.components_)
}

topics

# TODO: transform H_doc_by_topic so that it contains the actual topics[i]
# label as well as the topic weights (NMF weights are not normalized
# probabilities)



{0: 'game fun simple addictive great',
 1: 'word words letters letter search',
 2: 'cards card deck suit pile',
 3: 'kids learning fun educational learn',
 4: 'fish big discover games enjoy',
 5: 'subscription period account charged current',
 6: 'puzzles puzzle jigsaw pieces solve',
 7: 'play players player friends online',
 8: 'battle enemies weapons world fight',
 9: 'slots casino slot vegas win',
 10: 'car racing cars race tracks',
 11: 'solitaire spider klondike games freecell',
 12: 'app tabtale privacy make use',
 13: 'blocks color block coloring mode',
 14: 'sudoku numbers number notes grid',
 15: 'bubble bubbles pop shooter blast',
 16: 'tiles mahjong tile board remove',
 17: 'chess moves board pieces mate',
 18: 'iphone ipad touch ipod app',
 19: 'truck garbage trucks monster vehicles',
 20: 'escape room objects hidden solve',
 21: 'ball balls bowling soccer physics',
 22: 'questions quiz trivia guess knowledge',
 23: 'dice roll rolls yatzy points',
 24: 'tap jump screen scor

In [42]:
# Attach each document's three highest-weighted topics to the dataframe.
topic_columns = ['Topic_1', 'Topic_2', 'Topic_3']

# Sort every row of the document-by-topic weights in descending order and keep
# the leading topic ids, one per output column.
top_topic_ids = np.argsort(H_doc_by_topic, axis=1)[:, ::-1][:, :len(topic_columns)]

# Column k holds the label of each document's k-th strongest topic.
for rank, column in enumerate(topic_columns):
    df[column] = [topics[topic_id] for topic_id in top_topic_ids[:, rank]]

# Sanity check: the top 3 scoring topics for the first document.
[topics[topic_id] for topic_id in top_topic_ids[0]]



['kids learning fun educational learn',
 'puzzles puzzle jigsaw pieces solve',
 'game fun simple addictive great']