In [10]:
import pandas as pd
from pandas import option_context
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

from collections import Counter

from textblob import TextBlob

In [2]:
df = pd.read_csv('04-data/preprocessed_app_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22015 entries, 0 to 22014
Data columns (total 56 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   title                     22015 non-null  object 
 1   description               22015 non-null  object 
 2   summary                   22014 non-null  object 
 3   installs                  22015 non-null  object 
 4   minInstalls               22015 non-null  float64
 5   score                     22015 non-null  float64
 6   ratings                   22015 non-null  float64
 7   reviews                   22015 non-null  float64
 8   histogram                 22015 non-null  object 
 9   price                     22015 non-null  float64
 10  free                      22015 non-null  int64  
 11  currency                  22015 non-null  object 
 12  sale                      22015 non-null  bool   
 13  offersIAP                 22015 non-null  bool   
 14  inAppP

In [3]:
df.title.nunique()

22015

In [4]:
df[df['title'].duplicated() == True]

Unnamed: 0,title,description,summary,installs,minInstalls,score,ratings,reviews,histogram,price,...,star_2,star_3,star_4,star_5,top_developer,current_date,days,installs_day,updated_days,has_video


In [5]:
df_analysis = df[['title','score','comments']].copy()
df_analysis.reset_index(drop=True,inplace=True)

In [6]:
df_analysis = pd.DataFrame(df_analysis.comments.str.split(",").tolist(), index=df_analysis.title).stack()
df_analysis = df_analysis.reset_index([0, 'title'])
df_analysis.columns = ['title', 'comments']

In [7]:
df_analysis

Unnamed: 0,title,comments
0,World War 2: Offline Strategy,"[""I just started playing"
1,World War 2: Offline Strategy,I'm on the 3rd level and it's already so hard...
2,World War 2: Offline Strategy,games are fun when they are easy to play but ...
3,World War 2: Offline Strategy,plus there are way too many ads on this game....
4,World War 2: Offline Strategy,"""Fun game to play with decent graphics"
...,...,...
1473040,Tarot Card Reading,
1473041,Tarot Card Reading,we all are destined to be here
1473042,Tarot Card Reading,there and maybe for the unsure anywhere for a...
1473043,Tarot Card Reading,'This app is very awesome and amazing and eve...


In [8]:
def clean_text(text):
    '''Make text lowercase, remove punctuation, remove links and mentions'''
    text = text.lower()
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)
    text = re.sub('[0-9\n]',' ',text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub("\s\s+", " ", text)
    
    return text

df_analysis['comments'] = df_analysis.comments.map(clean_text)

In [9]:
df['tokens'] = df.comments.apply(nltk.word_tokenize)

In [11]:
stemmer = PorterStemmer()

df['stemmed'] = df['tokens'].apply(lambda x: [stemmer.stem(y) for y in x])

In [12]:
df.tokens = df.tokens.apply(lambda x: ' '.join(x))

In [13]:
df.stemmed = df.stemmed.apply(lambda x: ' '.join(x))

In [14]:
df_analysis = df[['title','stemmed']].copy()
df_analysis.reset_index(drop=True,inplace=True)

In [15]:
df_analysis

Unnamed: 0,title,stemmed
0,World War 2: Offline Strategy,"[ `` I just start play , I 'm on the 3rd level..."
1,SoundSeeder -Play music simultaneously and in ...,[ `` onli need to buy the app on the devic you...
2,"All PDF - PDF Reader, PDF Viewer & PDF Converter","[ 'fine , with mani featur . but somewhat fail..."
3,MSN Sports - Scores & Schedule,[ `` I had been experienc the same lost favori...
4,QRbot: QR & barcode reader,[ `` the ad free version of thi app is the bes...
...,...,...
22010,Gas Pedal,[ 'when I start the car it goe sicko mode and ...
22011,PDF Maker,"[ 'it work . but rotat by itself the imag , ev..."
22012,DSLR Blur Photo,[ `` thi app is great and I can blur my pic to...
22013,Shoppers Stop Fashion Shopping,[ 'atroci app ... I place an order in the mont...


In [88]:
stop = stopwords.words('english')
stop.extend(['free','new','get','hd','use','game','games','make','makes','play',
             'fun','features', 'need','live', 'also','using','best','us','app','apps', 'one', '2020','2021',
            'this','like','enjoy','thing','free'])

# # Extend for most common English adverbs

stop.extend(['up','so','out','just','now','how','then','more','also','here',
            'well','only','very','even','back','there','down','still','in',
            'as','to','when','never','really','most','on','why','about','over',
            'again','where','right','off','always','today','all','far','long',
            'away','yet','often','ever','however','almost','later','much',
            'once','least','ago','together','around','already','enough','both',
            'maybe','actually','probably','home','of course','perhaps','little',
            'else','sometimes','finally','less','better','early','especially',
            'either','quite','simply','nearly','soon','certainly','quickly',
            'no','recently','before','usually','thus','exactly','hard',
            'particularly','forward','ok','okay','clearly','indeed',
            'rather','that','tonight','close','suddenly','best','instead',
            'ahead','fast','alone','eventually','directly'])


stop.extend(['car','race','go','thi','thing','becaus','thank','give','people',
            'interest','pleas','thing','every','way','player','win',
            'onli','coin','spend','say','want','realli','veri','wa',
            'because','very','many','puzzle','onli','challeng','great',
            'nice','cute','good','would','easi','peopl','time','tri',
            'take','buy','mani','everi','pop','doe','set','amaz','easi',
            'ha','robot','drive','video','download','record','song','music',
            'level','ca','photo','wonder','star','show','think','awesom',
            'could','got','shoot','lot','easili','love','cool','tv','fantast',
            '30','truck','ani','first','add','bad','editor','beauti','recommend',
            'day'])

# ## Extend for prepositions
stop.extend(['without','among'])

In [89]:
vectorizer = CountVectorizer(stop_words = stop)

doc_word = vectorizer.fit_transform(df_analysis.stemmed)



In [90]:
nmf_model = NMF(10)
doc_topic = nmf_model.fit_transform(doc_word)
topic_word = nmf_model.components_

In [91]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-10:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['work',
  'phone',
  'connect',
  'devic',
  'android',
  'instal',
  'screen',
  'camera',
  'turn'],
 ['ad',
  'watch',
  'second',
  'uninstal',
  'annoy',
  'advertis',
  'remov',
  'start',
  'minut'],
 ['money',
  'pay',
  'start',
  'keep',
  'watch',
  'charact',
  'point',
  'fix',
  'upgrad'],
 ['graphic',
  'control',
  'featur',
  'simul',
  'gameplay',
  'real',
  'sound',
  'mission',
  'develop'],
 ['applic',
  'featur',
  'work',
  'help',
  'instal',
  'edit',
  'store',
  'mobil',
  'develop'],
 ['learn',
  'help',
  'word',
  'english',
  'languag',
  'translat',
  'know',
  'understand',
  'find'],
 ['updat',
  'fix',
  'open',
  'problem',
  'version',
  'issu',
  'crash',
  'load',
  'keep'],
 ['wallpap',
  'color',
  'pictur',
  'look',
  'screen',
  'option',
  'phone',
  'chang',
  'see'],
 ['read',
  'book',
  'stori',
  'chapter',
  'charact',
  'page',
  'news',
  'ticket',
  'find'],
 ['order',
  'servic',
  'call',
  'custom',
  'account',
  'card',
  'n

In [92]:
df_analysis['topic'] = doc_topic.argmax(axis=1)

In [95]:
df_analysis[df_analysis['topic'] ==7].sample(60)

Unnamed: 0,title,stemmed,topic
13090,Bokeh Camera Effects,[ 'nice effect but most are unus as a full pho...,7
1039,Anime Kawaii Dress Up,[ 'so fun ! there is a litrl bug though ... I ...,7
48,Neon Tiger Keyboard Theme,[ 'nice what a app ! ! fast and easi to use tr...,7
21012,Super Fashion Designer HD,[ `` thi game look like a great game until you...,7
16365,Kids Doodle Glow,[ `` game wa good my child love it and i found...,7
12554,Remove & Add Watermark,"[ `` i'wil write a honest one , thi app ca n't...",7
12744,Wallpapers for Chat Backgrounds,[ `` I am give thi app 4 star becaus when I wa...,7
6486,Picta Photo Print - Free Same Day Photo Prints...,"[ ' I enjoy it . easi to use , but I am not su...",7
15820,Happy new year 2021 live wallpaper,[ `` thi a realli veri nice app ..... but i do...,7
5427,Glitter Ice Cream Coloring,[ `` It is veri relax when you just need somet...,7


In [None]:
# [['call',
#   'account',
#   'servic',
#   'phone',
#   'card',
#   'number',
#   'messag',
#   'custom',
#   'email'],
#  ['ad',
#   'watch',
#   'add',
#   'second',
#   'uninstal',
#   'annoy',
#   'bad',
#   'advertis',
#   'remov'],
#  ['work',
#   'phone',
#   'connect',
#   'devic',
#   'android',
#   'screen',
#   'instal',
#   'camera',
#   'turn'],
#  ['graphic',
#   'control',
#   'bad',
#   'add',
#   'featur',
#   'simul',
#   'real',
#   'gameplay',
#   'fantast'],
#  ['applic',
#   'featur',
#   'recommend',
#   'fantast',
#   'work',
#   'beauti',
#   'help',
#   'instal',
#   'store'],
#  ['learn',
#   'help',
#   'word',
#   'english',
#   'read',
#   'languag',
#   'translat',
#   'know',
#   'find'],
#  ['updat',
#   'fix',
#   'open',
#   'problem',
#   'version',
#   'issu',
#   'crash',
#   'load',
#   'keep'],
#  ['order',
#   'item',
#   'servic',
#   'deliveri',
#   'custom',
#   'product',
#   'food',
#   'shop',
#   'price'],
#  ['money',
#   'pay',
#   'day',
#   'start',
#   'stori',
#   'keep',
#   'first',
#   'charact',
#   'watch'],
#  ['wallpap',
#   'color',
#   'pictur',
#   'add',
#   'look',
#   'option',
#   'screen',
#   'phone',
#   'chang']]