In [166]:
import pandas as pd
from pandas import option_context
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

from collections import Counter

from textblob import TextBlob

In [167]:
df = pd.read_csv('04-data/preprocessed_app_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22144 entries, 0 to 22143
Data columns (total 56 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   title                     22144 non-null  object 
 1   description               22144 non-null  object 
 2   summary                   22143 non-null  object 
 3   installs                  22144 non-null  object 
 4   minInstalls               22144 non-null  float64
 5   score                     22144 non-null  float64
 6   ratings                   22144 non-null  float64
 7   reviews                   22144 non-null  float64
 8   histogram                 22144 non-null  object 
 9   price                     22144 non-null  float64
 10  free                      22144 non-null  int64  
 11  currency                  22144 non-null  object 
 12  sale                      22144 non-null  bool   
 13  offersIAP                 22144 non-null  bool   
 14  inAppP

In [168]:
df = df[df['description_clean'].notna()]
df = df[df['score'] >= 3.8]

In [169]:
df_analysis = df[['title','description_clean']].copy()
df_analysis.reset_index(drop=True,inplace=True)

In [170]:
df_analysis

Unnamed: 0,title,description_clean
0,World War 2: Offline Strategy,command allies in 25 epic world war 2 locatio...
1,"All PDF - PDF Reader, PDF Viewer & PDF Converter",pdf reader for android pdfs converter free is ...
2,MSN Sports - Scores & Schedule,be in a league of your ownget real-time game u...
3,QRbot: QR & barcode reader,scan all kinds of qr codes and barcodes with t...
4,QR & Barcode Scanner,qr barcode scannerthis qr barcode scanner wi...
...,...,...
17263,Mp3 Songs Download,the application provides search stream and dow...
17264,PDF Maker,main features of orangepalm s pdf maker app- s...
17265,DSLR Blur Photo,this app lets you blur parts of your photo whi...
17266,Hyderabad RTC Info,1find out the details of the bus numbers2get b...


In [171]:
stop = stopwords.words('english')
stop.extend(['free','new','get','hd','use','game','games','make','makes','play',
             'fun','features', 'need','live', 'also','using','best','us','app','apps', 'one', '2020','2021',
            'this','like','enjoy'])

In [193]:
vectorizer = CountVectorizer(stop_words = stop)

doc_word = vectorizer.fit_transform(df_analysis.description_clean)

In [194]:
nmf_model = NMF(60)
doc_topic = nmf_model.fit_transform(doc_word)
topic_word = nmf_model.components_

In [195]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-10:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['time',
  'data',
  'access',
  'information',
  'mobile',
  'available',
  'application',
  'account',
  'help'],
 ['car',
  'cars',
  'driving',
  'simulator',
  'drift',
  'drive',
  'wash',
  'city',
  'drifting'],
 ['photo',
  'editor',
  'photos',
  'frames',
  'frame',
  'collage',
  'effects',
  'suit',
  'gallery'],
 ['robot',
  'transform',
  'transforming',
  'flying',
  'robots',
  'transformation',
  'battle',
  'war',
  'futuristic'],
 ['video',
  'videos',
  'player',
  'download',
  'editor',
  'downloader',
  'maker',
  'effects',
  'slideshow'],
 ['keyboard',
  'theme',
  'themes',
  'emoji',
  'typing',
  'cute',
  'download',
  'emojis',
  'huawei'],
 ['truck',
  'cargo',
  'transport',
  'driving',
  'simulator',
  'driver',
  'drive',
  'offroad',
  'monster'],
 ['shooting',
  'fps',
  'gun',
  'shooter',
  'sniper',
  'commando',
  'strike',
  'terrorist',
  'action'],
 ['coloring',
  'book',
  'color',
  'pages',
  'glitter',
  'girls',
  'beautiful',
  'numbe

In [196]:
df_analysis['topic'] = doc_topic.argmax(axis=1)

In [197]:
df_analysis.sort_values('topic').sample(10)

Unnamed: 0,title,description_clean,topic
12112,Kids Cars hill Racing games - Toddler Driving,the game is actually designed for kids learnin...,57
16336,Tamil Comedy,watch super hit tamil comedy videos latest ful...,50
10074,Birthday Countdown Widget,countdown to birthdays in special ways such as...,15
181,Moshi: Sleep and Mindfulness,enjoy calmer day times and quicker bedtimes wi...,51
14777,Zombie Shooter Hell 4 Survival,zombie hell 4 takes you back for a ride in hel...,37
10390,Gym Exercises & Workouts,join over 1000000 users get fit and strong in...,35
12939,Secret High School 4: Love Triangle,what is the feeling of love and betrayal bella...,51
16506,SouzaSim - Drag Race,souzasim drag race is all about racing those m...,9
6837,Oil Tycoon - Idle Tap Factory & Miner Clicker ...,be an oil tycoon miner by managing your oil we...,59
12554,MR RACER : Car Racing Game 2020 - Car Race Games,1 free offline car racing game mr racer game i...,57


## Lemma

In [178]:
df_analysis2 = df[['title','description_clean']].copy()
df_analysis2.reset_index(drop=True,inplace=True)

In [179]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

vectorizer2 = CountVectorizer(tokenizer = LemmaTokenizer(),
                              stop_words = stop)

doc_word2 = vectorizer2.fit_transform(df_analysis.description_clean)



In [180]:
nmf_model2 = NMF(50)
doc_topic2 = nmf_model2.fit_transform(doc_word2)
topic_word2 = nmf_model2.components_

In [181]:
words2 = vectorizer2.get_feature_names()
t2 = nmf_model2.components_.argsort(axis=1)[:,-1:-10:-1]
topic_words2 = [[words2[e] for e in l] for l in t2]
topic_words2

[['--', 'u', 'device', 'hindi', 'bubble', 'create', 'please', '2', 'hotspot'],
 ['effect',
  'camera',
  'photo',
  'background',
  'blur',
  'image',
  'editor',
  'filter',
  'picture'],
 ['car',
  'driving',
  'parking',
  'drive',
  'drift',
  'vehicle',
  'city',
  'real',
  'wash'],
 ['photo',
  'frame',
  'editor',
  'collage',
  'picture',
  'gallery',
  'beautiful',
  'pic',
  'add'],
 ['video',
  'maker',
  'download',
  'chat',
  'audio',
  'downloader',
  'share',
  'editor',
  'slideshow'],
 ['robot',
  'transform',
  'transforming',
  'flying',
  'war',
  'transformation',
  'battle',
  'futuristic',
  'mech'],
 ['theme', 'phone', 'samsung', 'apply', 'black', 's8', 'gold', 'cool', 'neon'],
 ['truck',
  'cargo',
  'driving',
  'transport',
  'driver',
  'drive',
  'simulator',
  'offroad',
  'monster'],
 ['-',
  'feature',
  'setting',
  'support',
  'mode',
  'beat',
  'download',
  'version',
  'change'],
 ['shooting',
  'gun',
  'fps',
  'shooter',
  'sniper',
  'missio

In [182]:
df_analysis2['topic'] = doc_topic2.argmax(axis=1)

## Stemmed

In [183]:
df['tokens'] = df.description_clean.apply(nltk.word_tokenize)

In [184]:
stemmer = PorterStemmer()

df['stemmed'] = df['tokens'].apply(lambda x: [stemmer.stem(y) for y in x])

In [185]:
df.tokens = df.tokens.apply(lambda x: ' '.join(x))

In [186]:
df.stemmed = df.stemmed.apply(lambda x: ' '.join(x))

In [187]:
df_analysis3 = df[['title','stemmed']].copy()
df_analysis3.reset_index(drop=True,inplace=True)

In [188]:
df_analysis3

Unnamed: 0,title,stemmed
0,World War 2: Offline Strategy,command alli in 25 epic world war 2 locat we h...
1,"All PDF - PDF Reader, PDF Viewer & PDF Converter",pdf reader for android pdf convert free is one...
2,MSN Sports - Scores & Schedule,be in a leagu of your ownget real-tim game upd...
3,QRbot: QR & barcode reader,scan all kind of qr code and barcod with the q...
4,QR & Barcode Scanner,qr barcod scannerthi qr barcod scanner will le...
...,...,...
17263,Mp3 Songs Download,the applic provid search stream and download a...
17264,PDF Maker,main featur of orangepalm s pdf maker app- sel...
17265,DSLR Blur Photo,thi app let you blur part of your photo which ...
17266,Hyderabad RTC Info,1find out the detail of the bu numbers2get bu ...


In [189]:
vectorizer3 = CountVectorizer(stop_words = stop)

doc_word3 = vectorizer3.fit_transform(df_analysis3.stemmed)

In [198]:
nmf_model3 = NMF(60)
doc_topic3 = nmf_model3.fit_transform(doc_word3)
topic_word3 = nmf_model3.components_



In [199]:
words3 = vectorizer3.get_feature_names()
t3 = nmf_model3.components_.argsort(axis=1)[:,-1:-10:-1]
topic_words3 = [[words3[e] for e in l] for l in t3]
topic_words3

[['thi', 'applic', 'veri', 'like', 'ha', 'ani', 'differ', 'enjoy', 'go'],
 ['car',
  'drive',
  'simul',
  'drift',
  'vehicl',
  'driver',
  'citi',
  'wash',
  'real'],
 ['photo',
  'frame',
  'editor',
  'collag',
  'pictur',
  'galleri',
  'add',
  'beauti',
  'edit'],
 ['robot',
  'transform',
  'fli',
  'war',
  'fight',
  'battl',
  'futurist',
  'shoot',
  'mech'],
 ['video',
  'maker',
  'chat',
  'edit',
  'share',
  'editor',
  'effect',
  'slideshow',
  'creat'],
 ['keyboard',
  'theme',
  'type',
  'emoji',
  'font',
  'languag',
  'cute',
  'arab',
  'note'],
 ['color',
  'book',
  'page',
  'girl',
  'glitter',
  'paint',
  'kid',
  'number',
  'beauti'],
 ['truck',
  'drive',
  'transport',
  'simul',
  'cargo',
  'offroad',
  'driver',
  'road',
  'armi'],
 ['shoot',
  'gun',
  'fp',
  'shooter',
  'sniper',
  'mission',
  'commando',
  'terrorist',
  'strike'],
 ['bike',
  'moto',
  'drive',
  'rider',
  'stunt',
  'ride',
  'motorcycl',
  'motorbik',
  'taxi'],
 ['wa

In [200]:
df_analysis3['topic'] = doc_topic3.argmax(axis=1)