In [312]:
import pandas as pd
from pandas import option_context
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, PCA

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [313]:
df = pd.read_csv('04-data/preprocessed_app_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22015 entries, 0 to 22014
Data columns (total 56 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   title                     22015 non-null  object 
 1   description               22015 non-null  object 
 2   summary                   22014 non-null  object 
 3   installs                  22015 non-null  object 
 4   minInstalls               22015 non-null  float64
 5   score                     22015 non-null  float64
 6   ratings                   22015 non-null  float64
 7   reviews                   22015 non-null  float64
 8   histogram                 22015 non-null  object 
 9   price                     22015 non-null  float64
 10  free                      22015 non-null  int64  
 11  currency                  22015 non-null  object 
 12  sale                      22015 non-null  bool   
 13  offersIAP                 22015 non-null  bool   
 14  inAppP

In [314]:
df = df[df['description_clean'].notna()]
df = df[df['score'] >= 3.8]

In [315]:
df_analysis = df[['title','description_clean']].copy()
df_analysis.reset_index(drop=True,inplace=True)

In [316]:
df_analysis

Unnamed: 0,title,description_clean
0,World War 2: Offline Strategy,command allies in 25 epic world war 2 locatio...
1,"All PDF - PDF Reader, PDF Viewer & PDF Converter",pdf reader for android pdfs converter free is ...
2,MSN Sports - Scores & Schedule,be in a league of your ownget real-time game u...
3,QRbot: QR & barcode reader,scan all kinds of qr codes and barcodes with t...
4,QR & Barcode Scanner,qr barcode scannerthis qr barcode scanner wi...
...,...,...
17136,Aksara Jawa - Nulis Aksara Jawa | Ketik & Konv...,aksara jawa adalah salah satu aset budaya indo...
17137,Mp3 Songs Download,the application provides search stream and dow...
17138,PDF Maker,main features of orangepalm s pdf maker app- s...
17139,DSLR Blur Photo,this app lets you blur parts of your photo whi...


In [317]:
stop = stopwords.words('english')
stop.extend(['free','new','get','hd','use','game','games','make','makes','play',
             'fun','features', 'need','live', 'also','using','best','us','app','apps', 'one', '2020','2021',
            'this','like','enjoy','thing','free'])

# # Extend for most common English adverbs

stop.extend(['up','so','out','just','now','how','then','more','also','here',
            'well','only','very','even','back','there','down','still','in',
            'as','to','when','never','really','most','on','why','about','over',
            'again','where','right','off','always','today','all','far','long',
            'away','yet','often','ever','however','almost','later','much',
            'once','least','ago','together','around','already','enough','both',
            'maybe','actually','probably','home','of course','perhaps','little',
            'else','sometimes','finally','less','better','early','especially',
            'either','quite','simply','nearly','soon','certainly','quickly',
            'no','recently','before','usually','thus','exactly','hard',
            'particularly','forward','ok','okay','clearly','indeed',
            'rather','that','tonight','close','suddenly','best','instead',
            'ahead','fast','alone','eventually','directly'])

# # Extend for most common irregular verbs (except pay,lose,send,buy,spend)

stop.extend(['say','make','go','take','come','see','know','get','got','give',
            'find','think','tell','become','show','leave','feel','put','bring',
            'begin','keep','hold','stand','hear','let','mean','set',
            'sit','speak','lie','lead','grow','fall','understand',
             'break','cut','rise','drive','choose'])

# ## Extend for prepositions
stop.extend(['without','among'])

In [318]:
vectorizer = CountVectorizer(stop_words = stop)

doc_word = vectorizer.fit_transform(df_analysis.description_clean)



In [319]:
nmf_model = NMF(40)
doc_topic = nmf_model.fit_transform(doc_word)
topic_word = nmf_model.components_



In [320]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-10:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['car',
  'driving',
  'parking',
  'cars',
  'simulator',
  'real',
  'drift',
  'city',
  'driver'],
 ['photo',
  'editor',
  'photos',
  'frames',
  'camera',
  'effects',
  'background',
  'frame',
  'collage'],
 ['keyboard',
  'theme',
  'themes',
  'emoji',
  'typing',
  'cute',
  'emojis',
  'download',
  'note'],
 ['robot',
  'transform',
  'transforming',
  'flying',
  'robots',
  'transformation',
  'battle',
  'war',
  'futuristic'],
 ['video',
  'videos',
  'maker',
  'chat',
  'audio',
  'editor',
  'player',
  'download',
  'effects'],
 ['maker',
  'text',
  'create',
  'stickers',
  'add',
  'card',
  'logo',
  'photos',
  'sticker'],
 ['truck',
  'driving',
  'simulator',
  'cargo',
  'transport',
  'driver',
  'offroad',
  'heavy',
  'road'],
 ['shooting',
  'fps',
  'gun',
  'sniper',
  'shooter',
  'commando',
  'strike',
  'action',
  'terrorist'],
 ['coloring',
  'book',
  'color',
  'pages',
  'glitter',
  'girls',
  'number',
  'beautiful',
  'kids'],
 ['bike',


In [321]:
df_analysis['topic'] = doc_topic.argmax(axis=1)

In [322]:
df_analysis.sort_values('topic').sample(10)

Unnamed: 0,title,description_clean,topic
1883,Cubedise,cubedise - game puzzle from the first person i...,36
7239,Farm Harvest 3- Match 3 Game,tony planted a farm at the foot of alpshe want...,36
2815,Ripio Bitcoin Wallet: the new digital economy,were the biggest crypto-platform in latin amer...,30
1257,Doraemon MusicPad,welcome to abc doraemoncelebrating the 80th an...,12
16870,Strategy for League of Legends,this app provided the information about league...,4
8085,Mountain Lion Family Sim : Animal Simulator,dive into the wildlife of mountain lion family...,36
10228,Fitvate - Home & Gym Workout Trainer Fitness P...,fitvate - home gym workout trainer is an easy...,35
15000,Mountain Bike Xtreme,mountain bike xtreme allows you to become a pr...,9
13103,Human Life,can you escape - holidays - out nowin our live...,36
66,Balls Bricks Breaker - Stack Blast,balls bricks breaker 3 - stack blast is a simp...,36


## Lemma

In [323]:
df_analysis2 = df[['title','description_clean']].copy()
df_analysis2.reset_index(drop=True,inplace=True)

In [324]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

vectorizer2 = CountVectorizer(tokenizer = LemmaTokenizer(),
                              stop_words = stop)

doc_word2 = vectorizer2.fit_transform(df_analysis.description_clean)



In [325]:
nmf_model2 = NMF(40)
doc_topic2 = nmf_model2.fit_transform(doc_word2)
topic_word2 = nmf_model2.components_

In [326]:
words2 = vectorizer2.get_feature_names()
t2 = nmf_model2.components_.argsort(axis=1)[:,-1:-10:-1]
topic_words2 = [[words2[e] for e in l] for l in t2]
topic_words2

[['--', 'u', 'device', 'hindi', 'bubble', 'please', 'create', '2', 'hotspot'],
 ['photo',
  'frame',
  'editor',
  'effect',
  'picture',
  'image',
  'background',
  'collage',
  'camera'],
 ['car',
  'driving',
  'parking',
  'racing',
  'simulator',
  'real',
  'drift',
  'city',
  'vehicle'],
 ['phone',
  'android',
  'device',
  'mobile',
  'ringtones',
  'number',
  'battery',
  'sound',
  'application'],
 ['video',
  'maker',
  'download',
  'chat',
  'audio',
  'create',
  'share',
  'effect',
  'downloader'],
 ['robot',
  'transform',
  'transforming',
  'flying',
  'war',
  'transformation',
  'battle',
  'futuristic',
  'shooting'],
 ['keyboard',
  'theme',
  'emoji',
  'typing',
  'font',
  'language',
  'note',
  'cute',
  'emojis'],
 ['truck',
  'driving',
  'simulator',
  'transport',
  'cargo',
  'driver',
  'offroad',
  'army',
  'heavy'],
 ['-',
  'feature',
  'support',
  'setting',
  'mode',
  'color',
  'change',
  'background',
  'download'],
 ['shooting',
  'gun'

In [327]:
df_analysis2['topic'] = doc_topic2.argmax(axis=1)

## Stemmed

In [328]:
df['tokens'] = df.description_clean.apply(nltk.word_tokenize)

In [329]:
stemmer = PorterStemmer()

df['stemmed'] = df['tokens'].apply(lambda x: [stemmer.stem(y) for y in x])

In [330]:
df.tokens = df.tokens.apply(lambda x: ' '.join(x))

In [331]:
df.stemmed = df.stemmed.apply(lambda x: ' '.join(x))

In [332]:
df_analysis3 = df[['title','stemmed']].copy()
df_analysis3.reset_index(drop=True,inplace=True)

In [333]:
df_analysis3

Unnamed: 0,title,stemmed
0,World War 2: Offline Strategy,command alli in 25 epic world war 2 locat we h...
1,"All PDF - PDF Reader, PDF Viewer & PDF Converter",pdf reader for android pdf convert free is one...
2,MSN Sports - Scores & Schedule,be in a leagu of your ownget real-tim game upd...
3,QRbot: QR & barcode reader,scan all kind of qr code and barcod with the q...
4,QR & Barcode Scanner,qr barcod scannerthi qr barcod scanner will le...
...,...,...
17136,Aksara Jawa - Nulis Aksara Jawa | Ketik & Konv...,aksara jawa adalah salah satu aset budaya indo...
17137,Mp3 Songs Download,the applic provid search stream and download a...
17138,PDF Maker,main featur of orangepalm s pdf maker app- sel...
17139,DSLR Blur Photo,thi app let you blur part of your photo which ...


In [334]:
vectorizer3 = CountVectorizer(stop_words = stop)

doc_word3 = vectorizer3.fit_transform(df_analysis3.stemmed)



In [335]:
nmf_model3 = NMF(30)
doc_topic3 = nmf_model3.fit_transform(doc_word3)
topic_word3 = nmf_model3.components_



In [336]:
words3 = vectorizer3.get_feature_names()
t3 = nmf_model3.components_.argsort(axis=1)[:,-1:-10:-1]
topic_words3 = [[words3[e] for e in l] for l in t3]
topic_words3

[['thi', 'download', 'differ', 'level', 'applic', 'time', 'veri', 'ani', 'ha'],
 ['car', 'race', 'park', 'drift', 'stunt', 'simul', 'real', 'extrem', 'speed'],
 ['photo',
  'frame',
  'editor',
  'effect',
  'pictur',
  'imag',
  'background',
  'collag',
  'camera'],
 ['video',
  'download',
  'statu',
  'maker',
  'chat',
  'share',
  'creat',
  'audio',
  'player'],
 ['robot',
  'transform',
  'fli',
  'war',
  'fight',
  'battl',
  'futurist',
  'shoot',
  'car'],
 ['keyboard',
  'theme',
  'emoji',
  'type',
  'font',
  'cute',
  'languag',
  'note',
  'person'],
 ['color',
  'book',
  'page',
  'number',
  'girl',
  'paint',
  'glitter',
  'beauti',
  'kid'],
 ['shoot',
  'gun',
  'fp',
  'sniper',
  'shooter',
  'mission',
  'commando',
  'terrorist',
  'armi'],
 ['truck',
  'transport',
  'simul',
  'cargo',
  'driver',
  'offroad',
  'road',
  'armi',
  'heavi'],
 ['bike',
  'race',
  'stunt',
  'ramp',
  'track',
  'imposs',
  'moto',
  'extrem',
  'ride'],
 ['wallpap',
  'ba

In [337]:
df_analysis3['topic'] = doc_topic3.argmax(axis=1)

In [338]:
df_analysis.topic.tail(20)

17121    18
17122    17
17123    14
17124     3
17125    21
17126     2
17127    33
17128     2
17129    37
17130     5
17131    35
17132    36
17133     1
17134    19
17135    16
17136    33
17137    12
17138    19
17139     1
17140    25
Name: topic, dtype: int64

In [339]:
#Alarm Adzan Otomatis Muslim Indonesia 2020 : T...

In [340]:
df_analysis3[df_analysis3['topic'] == 1].sample(60)

Unnamed: 0,title,stemmed,topic
16782,Extreme Car Mountain Climb 3D,in thi awesom mountain game you can drive your...,1
5160,Real City Racer,drift and race ha never been so fun real citi ...,1
12523,Furious Payback - 2020's new Action Racing Game,note thi is not an offici ff gamefuri payback ...,1
17104,Fanatics Car Drive,fanat car drive - open world physic engin car ...,1
12268,RC City Police Heavy Traffic Racer,rc citi polic heavi traffic racer is the most ...,1
14826,Train Vs Car Racing 2 Player,are you readi racer for someth big excit train...,1
770,BR Style,race with brazilian car drag race and wheeli i...,1
1997,Crossy Brakes : Blocky Toon Racer,whi couldnt the chicken cross the roadcrossi b...,1
1933,Crime Car Driving Simulator,take control of amaz car speed car suv and 4x4...,1
12599,Cars Transport Trailer : cars transporter 2020,you will have to load car on the big trailer d...,1


In [341]:
doc_topic_nmf = pd.DataFrame(doc_topic3.round(5),
                             index = df_analysis3['title'])
doc_topic_nmf

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
World War 2: Offline Strategy,0.02081,0.00000,0.00000,0.00000,0.04090,0.00000,0.00000,0.02372,0.00000,0.00000,...,0.11688,0.00208,0.02755,0.00000,0.00151,0.0,0.00000,0.00000,0.00000,0.00149
"All PDF - PDF Reader, PDF Viewer & PDF Converter",0.11631,0.00090,0.01826,0.00000,0.00000,0.00000,0.10433,0.00055,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.06315,0.00679,0.0,0.00000,0.00000,0.00925,0.02079
MSN Sports - Scores & Schedule,0.00000,0.00676,0.00000,0.04157,0.00000,0.00000,0.00000,0.00175,0.00004,0.00553,...,0.06148,0.00000,0.00000,0.00317,0.00000,0.0,0.00000,0.00000,0.00000,0.00150
QRbot: QR & barcode reader,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.01079,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.16957,0.00000,0.00000,0.0,0.00000,0.00000,0.00000,0.00000
QR & Barcode Scanner,0.00000,0.00000,0.00104,0.00000,0.00000,0.00033,0.05412,0.00000,0.00000,0.00095,...,0.00692,0.00638,0.00000,0.00359,0.00000,0.0,0.00000,0.00000,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Aksara Jawa - Nulis Aksara Jawa | Ketik & Konversi,0.00000,0.00000,0.00000,0.00317,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00124,0.00000,0.00000,0.0,0.00874,0.00000,0.00000,0.05191
Mp3 Songs Download,0.06727,0.00172,0.00000,0.01632,0.00000,0.01701,0.00000,0.00000,0.00128,0.04117,...,0.00000,0.00000,0.02838,0.01494,0.03807,0.0,0.00000,0.00000,0.03796,0.02507
PDF Maker,0.04779,0.00000,0.31073,0.03184,0.00000,0.00000,0.00000,0.00181,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.02774,0.08071,0.0,0.00000,0.03155,0.00565,0.00434
DSLR Blur Photo,0.02410,0.00000,0.10400,0.00667,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.01666,0.00289,0.0,0.00000,0.00000,0.00000,0.00025


In [342]:
# pca2 = PCA(n_components=15)
# pca2.fit(df_analysis3['stemmed'])
# pcafeatures_train2 = pca2.transform(df_analysis3['stemmed'])

In [343]:
# plt.plot(pca2.explained_variance_ratio_)
# plt.xlabel('# components')
# plt.ylabel('explained variance');
# plt.title('Scree plot for digits dataset');

In [344]:
# plt.plot(np.cumsum(pca2.explained_variance_ratio_))
# plt.xlabel('# components')
# plt.ylabel('cumulative explained variance');
# plt.title('Cumulative explained variance by PCA for digits');

In [345]:
# from matplotlib import pyplot as plt
# %matplotlib inline

In [346]:
# doc_word3_array = doc_word3.toarray()

In [347]:
# NUM_COMPONENTS = 200
# pca = PCA(NUM_COMPONENTS)
# reduced = pca.fit_transform(doc_word3_array)

In [348]:
# variance_explained = np.cumsum(pca.explained_variance_)

In [349]:
# fig, ax = plt.subplots(figsize=(15, 8))
# plt.plot(range(NUM_COMPONENTS),variance_explained, color='r')
# ax.grid(True)
# plt.xlabel("Number of components")
# plt.ylabel("Cumulative explained variance")
# #It takes around 300 components to explain 60% of variance. While 300 components post that explain only 10%