In [281]:
import pandas as pd
from pandas import option_context
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, PCA

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

from collections import Counter

from textblob import TextBlob

In [282]:
df = pd.read_csv('04-data/preprocessed_app_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22050 entries, 0 to 22049
Data columns (total 56 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   title                     22050 non-null  object 
 1   description               22050 non-null  object 
 2   summary                   22049 non-null  object 
 3   installs                  22050 non-null  object 
 4   minInstalls               22050 non-null  float64
 5   score                     22050 non-null  float64
 6   ratings                   22050 non-null  float64
 7   reviews                   22050 non-null  float64
 8   histogram                 22050 non-null  object 
 9   price                     22050 non-null  float64
 10  free                      22050 non-null  int64  
 11  currency                  22050 non-null  object 
 12  sale                      22050 non-null  bool   
 13  offersIAP                 22050 non-null  bool   
 14  inAppP

In [283]:
df = df[df['description_clean'].notna()]
df = df[df['score'] >= 3.8]

In [284]:
df_analysis = df[['title','description_clean']].copy()
df_analysis.reset_index(drop=True,inplace=True)

In [285]:
df_analysis

Unnamed: 0,title,description_clean
0,World War 2: Offline Strategy,command allies in 25 epic world war 2 locatio...
1,"All PDF - PDF Reader, PDF Viewer & PDF Converter",pdf reader for android pdfs converter free is ...
2,MSN Sports - Scores & Schedule,be in a league of your ownget real-time game u...
3,QRbot: QR & barcode reader,scan all kinds of qr codes and barcodes with t...
4,QR & Barcode Scanner,qr barcode scannerthis qr barcode scanner wi...
...,...,...
17169,Mp3 Songs Download,the application provides search stream and dow...
17170,PDF Maker,main features of orangepalm s pdf maker app- s...
17171,DSLR Blur Photo,this app lets you blur parts of your photo whi...
17172,Hyderabad RTC Info,1find out the details of the bus numbers2get b...


In [286]:
stop = stopwords.words('english')
stop.extend(['free','new','get','hd','use','game','games','make','makes','play',
             'fun','features', 'need','live', 'also','using','best','us','app','apps', 'one', '2020','2021',
            'this','like','enjoy','thing','free'])

# # Extend for most common English adverbs

stop.extend(['up','so','out','just','now','how','then','more','also','here',
            'well','only','very','even','back','there','down','still','in',
            'as','to','when','never','really','most','on','why','about','over',
            'again','where','right','off','always','today','all','far','long',
            'away','yet','often','ever','however','almost','later','much',
            'once','least','ago','together','around','already','enough','both',
            'maybe','actually','probably','home','of course','perhaps','little',
            'else','sometimes','finally','less','better','early','especially',
            'either','quite','simply','nearly','soon','certainly','quickly',
            'no','recently','before','usually','thus','exactly','hard',
            'particularly','forward','ok','okay','clearly','indeed',
            'rather','that','tonight','close','suddenly','best','instead',
            'ahead','fast','alone','eventually','directly'])

# # Extend for most common irregular verbs (except pay,lose,send,buy,spend)

stop.extend(['say','make','go','take','come','see','know','get','got','give',
            'find','think','tell','become','show','leave','feel','put','bring',
            'begin','keep','hold','stand','hear','let','mean','set',
            'sit','speak','lie','lead','grow','fall','understand',
             'break','cut','rise','drive','choose'])

# ## Extend for prepositions
stop.extend(['without','among'])

In [287]:
vectorizer = CountVectorizer(stop_words = stop)

doc_word = vectorizer.fit_transform(df_analysis.description_clean)



In [288]:
nmf_model = NMF(40)
doc_topic = nmf_model.fit_transform(doc_word)
topic_word = nmf_model.components_

In [289]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-10:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['car',
  'driving',
  'cars',
  'simulator',
  'drift',
  'driver',
  'city',
  'drifting',
  'real'],
 ['photo',
  'editor',
  'photos',
  'frames',
  'camera',
  'effects',
  'background',
  'frame',
  'collage'],
 ['keyboard',
  'theme',
  'themes',
  'emoji',
  'typing',
  'cute',
  'emojis',
  'download',
  'note'],
 ['robot',
  'transform',
  'transforming',
  'flying',
  'robots',
  'transformation',
  'battle',
  'war',
  'futuristic'],
 ['video',
  'videos',
  'maker',
  'chat',
  'audio',
  'player',
  'editor',
  'download',
  'effects'],
 ['time',
  'friends',
  'card',
  'help',
  'online',
  'world',
  'puzzle',
  'match',
  'levels'],
 ['truck',
  'driving',
  'simulator',
  'transport',
  'cargo',
  'offroad',
  'driver',
  'army',
  'transporter'],
 ['shooting',
  'fps',
  'gun',
  'sniper',
  'shooter',
  'commando',
  'strike',
  'action',
  'terrorist'],
 ['coloring',
  'book',
  'color',
  'pages',
  'glitter',
  'girls',
  'number',
  'beautiful',
  'kids'],
 ['

In [290]:
df_analysis['topic'] = doc_topic.argmax(axis=1)

In [291]:
df_analysis.sort_values('topic').sample(10)

Unnamed: 0,title,description_clean,topic
11295,Railway PNR Check,railway pnr check enjoy the lightning speed ra...,24
6252,Story Saver for Instagram,save stories easily download photo or videos s...,24
9099,Ground Breaking 3D,take aim and pull the trigger in this satisfyi...,32
3303,KLWP Live Wallpaper Maker,make your android launcher look unique with ku...,31
1418,Car transporter - vehicle transport trailer truck,welcome to the car transporter games - vehicle...,6
4327,Puzzle Combat: Match-3 RPG,puzzle combat is a dynamic match-3 action game...,32
16384,Mehndi Designs fashion Free,using this best mehndi designs of all time app...,27
6509,Royal Revolt!,royal revolt is a brand new reverse t-defense ...,32
5224,Jiggy: Full Body Swap Videos & Reface GIFs,make your friends parents grandma and grandpa ...,4
1964,Wall Jump,lets challenge how high you can jump up across...,5


## Lemma

In [292]:
df_analysis2 = df[['title','description_clean']].copy()
df_analysis2.reset_index(drop=True,inplace=True)

In [293]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

vectorizer2 = CountVectorizer(tokenizer = LemmaTokenizer(),
                              stop_words = stop)

doc_word2 = vectorizer2.fit_transform(df_analysis.description_clean)



In [294]:
nmf_model2 = NMF(40)
doc_topic2 = nmf_model2.fit_transform(doc_word2)
topic_word2 = nmf_model2.components_

In [295]:
words2 = vectorizer2.get_feature_names()
t2 = nmf_model2.components_.argsort(axis=1)[:,-1:-10:-1]
topic_words2 = [[words2[e] for e in l] for l in t2]
topic_words2

[['--', 'u', 'device', 'hindi', 'bubble', 'please', 'create', '2', 'hotspot'],
 ['photo',
  'frame',
  'editor',
  'effect',
  'picture',
  'image',
  'background',
  'collage',
  'camera'],
 ['car',
  'driving',
  'parking',
  'racing',
  'simulator',
  'real',
  'drift',
  'city',
  'vehicle'],
 ['phone',
  'android',
  'device',
  'mobile',
  'ringtones',
  'number',
  'battery',
  'sound',
  'application'],
 ['video',
  'maker',
  'chat',
  'download',
  'audio',
  'create',
  'share',
  'effect',
  'downloader'],
 ['robot',
  'transform',
  'transforming',
  'flying',
  'war',
  'transformation',
  'battle',
  'futuristic',
  'shooting'],
 ['keyboard',
  'theme',
  'emoji',
  'typing',
  'font',
  'language',
  'note',
  'cute',
  'emojis'],
 ['truck',
  'driving',
  'simulator',
  'transport',
  'cargo',
  'driver',
  'offroad',
  'army',
  'heavy'],
 ['-',
  'feature',
  'support',
  'setting',
  'mode',
  'color',
  'change',
  'background',
  'download'],
 ['shooting',
  'gun'

In [296]:
df_analysis2['topic'] = doc_topic2.argmax(axis=1)

## Stemmed

In [297]:
df['tokens'] = df.description_clean.apply(nltk.word_tokenize)

In [298]:
stemmer = PorterStemmer()

df['stemmed'] = df['tokens'].apply(lambda x: [stemmer.stem(y) for y in x])

In [299]:
df.tokens = df.tokens.apply(lambda x: ' '.join(x))

In [300]:
df.stemmed = df.stemmed.apply(lambda x: ' '.join(x))

In [301]:
df_analysis3 = df[['title','stemmed']].copy()
df_analysis3.reset_index(drop=True,inplace=True)

In [302]:
df_analysis3

Unnamed: 0,title,stemmed
0,World War 2: Offline Strategy,command alli in 25 epic world war 2 locat we h...
1,"All PDF - PDF Reader, PDF Viewer & PDF Converter",pdf reader for android pdf convert free is one...
2,MSN Sports - Scores & Schedule,be in a leagu of your ownget real-tim game upd...
3,QRbot: QR & barcode reader,scan all kind of qr code and barcod with the q...
4,QR & Barcode Scanner,qr barcod scannerthi qr barcod scanner will le...
...,...,...
17169,Mp3 Songs Download,the applic provid search stream and download a...
17170,PDF Maker,main featur of orangepalm s pdf maker app- sel...
17171,DSLR Blur Photo,thi app let you blur part of your photo which ...
17172,Hyderabad RTC Info,1find out the detail of the bu numbers2get bu ...


In [303]:
vectorizer3 = CountVectorizer(stop_words = stop)

doc_word3 = vectorizer3.fit_transform(df_analysis3.stemmed)



In [304]:
nmf_model3 = NMF(40)
doc_topic3 = nmf_model3.fit_transform(doc_word3)
topic_word3 = nmf_model3.components_

In [305]:
words3 = vectorizer3.get_feature_names()
t3 = nmf_model3.components_.argsort(axis=1)[:,-1:-10:-1]
topic_words3 = [[words3[e] for e in l] for l in t3]
topic_words3

[['design',
  'maker',
  'creat',
  'logo',
  'text',
  'background',
  'font',
  'add',
  'imag'],
 ['car', 'race', 'park', 'drift', 'stunt', 'simul', 'extrem', 'real', 'speed'],
 ['photo',
  'frame',
  'editor',
  'effect',
  'pictur',
  'imag',
  'collag',
  'camera',
  'background'],
 ['video',
  'maker',
  'chat',
  'audio',
  'download',
  'edit',
  'share',
  'slideshow',
  'creat'],
 ['robot',
  'transform',
  'fli',
  'war',
  'fight',
  'battl',
  'futurist',
  'shoot',
  'citi'],
 ['keyboard',
  'theme',
  'type',
  'emoji',
  'font',
  'languag',
  'arab',
  'hindi',
  'cute'],
 ['color',
  'book',
  'page',
  'number',
  'kid',
  'girl',
  'paint',
  'glitter',
  'beauti'],
 ['shoot',
  'gun',
  'fp',
  'shooter',
  'sniper',
  'mission',
  'commando',
  'terrorist',
  'armi'],
 ['truck',
  'transport',
  'cargo',
  'simul',
  'offroad',
  'driver',
  'road',
  'armi',
  'monster'],
 ['bike',
  'race',
  'stunt',
  'ramp',
  'track',
  'imposs',
  'moto',
  'extrem',
  'me

In [306]:
df_analysis3['topic'] = doc_topic3.argmax(axis=1)

In [307]:
#Alarm Adzan Otomatis Muslim Indonesia 2020 : T...

In [308]:
df_analysis3[df_analysis3['topic'] == 39].sample(60)

Unnamed: 0,title,stemmed,topic
3394,Facebook Analytics,easili track your growth engag and monet effor...,39
11511,Computer Course,free comput cours learn basic to use the compu...,39
3276,Fast Charging,fast charger also call fast charg is an app th...,39
4187,Furby Connect World,connect to a virtual world of surpris with the...,39
13166,Free VPN - Unblock & Fast Hotspot Security Proxy,freevpn is a lightning-fast app provid free vp...,39
5652,Govee Home,gove home is an app to help you manag your sma...,39
226,Parallels Client (legacy),thi version of the app should be use with para...,39
14661,Combo VPN,free vpn support http tcp and ssl connect mode...,39
13512,MyYes,now you can have access to your postpaid and p...,39
13828,Personality Trait Test,albert einstein onc said everyon is a geniu bu...,39


In [309]:
# pca2 = PCA(n_components=15)
# pca2.fit(df_analysis3['stemmed'])
# pcafeatures_train2 = pca2.transform(df_analysis3['stemmed'])

In [310]:
# plt.plot(pca2.explained_variance_ratio_)
# plt.xlabel('# components')
# plt.ylabel('explained variance');
# plt.title('Scree plot for digits dataset');

In [311]:
# plt.plot(np.cumsum(pca2.explained_variance_ratio_))
# plt.xlabel('# components')
# plt.ylabel('cumulative explained variance');
# plt.title('Cumulative explained variance by PCA for digits');

In [312]:
# from matplotlib import pyplot as plt
# %matplotlib inline

In [313]:
# doc_word3_array = doc_word3.toarray()

In [314]:
# NUM_COMPONENTS = 200
# pca = PCA(NUM_COMPONENTS)
# reduced = pca.fit_transform(doc_word3_array)

In [315]:
# variance_explained = np.cumsum(pca.explained_variance_)

In [316]:
# fig, ax = plt.subplots(figsize=(15, 8))
# plt.plot(range(NUM_COMPONENTS),variance_explained, color='r')
# ax.grid(True)
# plt.xlabel("Number of components")
# plt.ylabel("Cumulative explained variance")
# #It takes around 300 components to explain 60% of variance. While 300 components post that explain only 10%