In [61]:
import pandas as pd
from pandas import option_context
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer 

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

from collections import Counter

from textblob import TextBlob

In [62]:
# Load the preprocessed app table from disk, then print its column summary
# (dtype and non-null count per column).
df = pd.read_csv(filepath_or_buffer='04-data/preprocessed_app_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40475 entries, 0 to 40474
Data columns (total 56 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   title                     40475 non-null  object 
 1   description               40475 non-null  object 
 2   summary                   40463 non-null  object 
 3   installs                  40475 non-null  object 
 4   minInstalls               40475 non-null  float64
 5   score                     40475 non-null  float64
 6   ratings                   40475 non-null  float64
 7   reviews                   40475 non-null  float64
 8   histogram                 40475 non-null  object 
 9   price                     40475 non-null  float64
 10  free                      40475 non-null  int64  
 11  currency                  40475 non-null  object 
 12  sale                      40475 non-null  bool   
 13  offersIAP                 40475 non-null  bool   
 14  inAppP

In [63]:
# Keep only the rows that actually have a cleaned description to analyse.
df = df.dropna(subset=['description_clean'])

In [64]:
# Working frame for topic modelling: just the title and the cleaned description,
# re-indexed 0..n-1 so row positions line up with the rows of the matrices
# built from it later.
df_analysis = df[['title', 'description_clean']].reset_index(drop=True)

In [65]:
# Preview the two-column working frame (title + cleaned description).
df_analysis

Unnamed: 0,title,description_clean
0,World War 2: Offline Strategy,command allies in 25 epic world war 2 locatio...
1,AndroXLS editor for XLS sheets,androxls is an android app to edit xls spreads...
2,SoundSeeder -Play music simultaneously and in ...,soundseeder syncs up your music playback on mu...
3,LibreOffice & OpenOffice document reader | ODF,view and modify documents created using libreo...
4,"All PDF - PDF Reader, PDF Viewer & PDF Converter",pdf reader for android pdfs converter free is ...
...,...,...
40466,Marriage Anniversary Photo Frame,specially designed for weddings or anniversary...
40467,Lucky Block for MCPE,lucky block race for mcpe is the best app to ...
40468,PokeCraft : Monsters Mod,the pokecraft monsters mod for minecraft is a...
40469,TongitsXtreme,enjoy this exciting filipino card game called ...


In [66]:
# NLTK's English stop words plus extra terms that are excluded from the
# vocabulary for this corpus.
extra_stop_words = ['free','new','get','hd','use','game','games','make','makes','play','fun']
stop = stopwords.words('english') + extra_stop_words

In [67]:
# Bag-of-words counts over the cleaned descriptions, ignoring the stop-word list.
vectorizer = CountVectorizer(stop_words=stop)

# Learn the vocabulary and build the document-term matrix in one pass.
descriptions = df_analysis['description_clean']
doc_word = vectorizer.fit_transform(descriptions)

In [68]:
# Factorise the document-term counts into 30 non-negative topics.
#
# random_state is pinned so that the factorisation -- and therefore the topic
# *numbering* -- is reproducible across kernel restarts. Topic ids are written
# into df_analysis['topic'] further down, so an unseeded model could silently
# relabel every document on a re-run.
# NOTE(review): any hand-written topic-id -> label mapping elsewhere in the
# notebook should be re-checked against a fresh run with this seed.
nmf_model = NMF(n_components=30, random_state=42)

# doc_topic: one row per document, one column per topic (topic weights).
doc_topic = nmf_model.fit_transform(doc_word)

# topic_word: one row per topic, one column per vocabulary term.
topic_word = nmf_model.components_

In [69]:
# Vocabulary terms, indexed by column of the document-term matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
# Wrapped in list() so `words` stays a plain list regardless of version.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = list(vectorizer.get_feature_names())

# For each topic (row), the column indices of its 5 heaviest terms,
# ordered from highest weight to lowest.
t = nmf_model.components_.argsort(axis=1)[:, -1:-6:-1]

# Translate those indices into the actual terms: one list of 5 words per topic.
topic_words = [[words[term_idx] for term_idx in top_idx] for top_idx in t]
topic_words

[['app', 'apps', 'also', 'download', 'using'],
 ['car', 'driving', 'cars', 'drive', 'simulator'],
 ['photo', 'editor', 'photos', 'frames', 'effects'],
 ['video', 'videos', 'maker', 'status', 'download'],
 ['keyboard', 'theme', 'typing', 'themes', 'emoji'],
 ['robot', 'transform', 'transforming', 'flying', 'robots'],
 ['truck', 'driving', 'simulator', 'cargo', 'transport'],
 ['shooting', 'fps', 'war', 'sniper', 'gun'],
 ['bike', 'stunt', 'moto', 'stunts', 'driving'],
 ['bus', 'driving', 'simulator', 'city', 'coach'],
 ['wallpaper', 'live', 'wallpapers', 'background', 'backgrounds'],
 ['world', 'like', 'time', 'best', 'one'],
 ['music', 'player', 'songs', 'audio', 'mp3'],
 ['coloring', 'book', 'color', 'pages', 'glitter'],
 ['screen', 'lock', 'phone', 'apps', 'android'],
 ['police', 'city', 'crime', 'gangster', 'hero'],
 ['pdf', 'files', 'file', 'reader', 'text'],
 ['gps', 'map', 'live', 'navigation', 'location'],
 ['english', 'words', 'language', 'word', 'learn'],
 ['phone', 'call', 'fl

In [70]:
# Label every document with the topic that carries its largest weight.
# Note: argmax returns 0 for a row whose weights are all equal (e.g. all zero).
df_analysis['topic'] = np.argmax(doc_topic, axis=1)

In [71]:
# Show the working frame again, now including the assigned `topic` column.
df_analysis

Unnamed: 0,title,description_clean,topic
0,World War 2: Offline Strategy,command allies in 25 epic world war 2 locatio...,7
1,AndroXLS editor for XLS sheets,androxls is an android app to edit xls spreads...,16
2,SoundSeeder -Play music simultaneously and in ...,soundseeder syncs up your music playback on mu...,12
3,LibreOffice & OpenOffice document reader | ODF,view and modify documents created using libreo...,16
4,"All PDF - PDF Reader, PDF Viewer & PDF Converter",pdf reader for android pdfs converter free is ...,16
...,...,...,...
40466,Marriage Anniversary Photo Frame,specially designed for weddings or anniversary...,2
40467,Lucky Block for MCPE,lucky block race for mcpe is the best app to ...,0
40468,PokeCraft : Monsters Mod,the pokecraft monsters mod for minecraft is a...,11
40469,TongitsXtreme,enjoy this exciting filipino card game called ...,11
