In [53]:
import pandas as pd
from pandas import option_context
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

from collections import Counter

from textblob import TextBlob

In [54]:
df = pd.read_csv('04-data/preprocessed_app_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22015 entries, 0 to 22014
Data columns (total 56 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   title                     22015 non-null  object 
 1   description               22015 non-null  object 
 2   summary                   22014 non-null  object 
 3   installs                  22015 non-null  object 
 4   minInstalls               22015 non-null  float64
 5   score                     22015 non-null  float64
 6   ratings                   22015 non-null  float64
 7   reviews                   22015 non-null  float64
 8   histogram                 22015 non-null  object 
 9   price                     22015 non-null  float64
 10  free                      22015 non-null  int64  
 11  currency                  22015 non-null  object 
 12  sale                      22015 non-null  bool   
 13  offersIAP                 22015 non-null  bool   
 14  inAppP

In [55]:
df.title.nunique()

22015

In [56]:
df[df['title'].duplicated() == True]

Unnamed: 0,title,description,summary,installs,minInstalls,score,ratings,reviews,histogram,price,...,star_2,star_3,star_4,star_5,top_developer,current_date,days,installs_day,updated_days,has_video


In [57]:
df_analysis = df[['title','score','comments']].copy()
df_analysis.reset_index(drop=True,inplace=True)

In [58]:
df_analysis = pd.DataFrame(df_analysis.comments.str.split(",").tolist(), index=df_analysis.title).stack()
df_analysis = df_analysis.reset_index([0, 'title'])
df_analysis.columns = ['title', 'comments']

In [60]:
def clean_text(text):
    '''Make text lowercase, remove punctuation, remove links and mentions'''
    text = text.lower()
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)
    text = re.sub('[0-9\n]',' ',text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub("\s\s+", " ", text)
    
    return text

df_analysis['comments'] = df_analysis.comments.map(clean_text)

In [61]:
df_analysis['tokens'] = df_analysis.comments.apply(nltk.word_tokenize)

In [62]:
df_analysis

Unnamed: 0,title,comments,tokens
0,World War 2: Offline Strategy,i just started playing,"[i, just, started, playing]"
1,World War 2: Offline Strategy,im on the rd level and its already so hard th...,"[im, on, the, rd, level, and, its, already, so..."
2,World War 2: Offline Strategy,games are fun when they are easy to play but ...,"[games, are, fun, when, they, are, easy, to, p..."
3,World War 2: Offline Strategy,plus there are way too many ads on this game ...,"[plus, there, are, way, too, many, ads, on, th..."
4,World War 2: Offline Strategy,fun game to play with decent graphics,"[fun, game, to, play, with, decent, graphics]"
...,...,...,...
1473040,Tarot Card Reading,,[]
1473041,Tarot Card Reading,we all are destined to be here,"[we, all, are, destined, to, be, here]"
1473042,Tarot Card Reading,there and maybe for the unsure anywhere for a...,"[there, and, maybe, for, the, unsure, anywhere..."
1473043,Tarot Card Reading,this app is very awesome and amazing and ever...,"[this, app, is, very, awesome, and, amazing, a..."


In [63]:
stemmer = PorterStemmer()

df_analysis['stemmed'] = df_analysis['tokens'].apply(lambda x: [stemmer.stem(y) for y in x])

In [64]:
df_analysis.tokens = df_analysis.tokens.apply(lambda x: ' '.join(x))

In [65]:
df_analysis.stemmed = df_analysis.stemmed.apply(lambda x: ' '.join(x))

In [67]:
#df_analysis1 = df[['title','stemmed']].copy()
df_analysis.reset_index(drop=True,inplace=True)

In [257]:
df_analysis = df_analysis[df_analysis['stemmed'].map(len) > 15]

In [258]:
df_analysis.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1257775 entries, 0 to 1473044
Data columns (total 6 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   title           1257775 non-null  object
 1   comments        1257775 non-null  object
 2   tokens          1257775 non-null  object
 3   stemmed         1257775 non-null  object
 4   topic           1257775 non-null  int64 
 5   comments_check  1257775 non-null  object
dtypes: int64(1), object(5)
memory usage: 67.2+ MB


In [265]:
stop = stopwords.words('english')
stop.extend(['free','get','hd','use','game','games','make','makes','play',
             'fun','features', 'need','live', 'also','using','best','us','app','apps', 'one', '2020','2021',
            'this','like','enjoy','thing','free'])

# # Extend for most common English adverbs

stop.extend(['up','so','out','just','now','how','then','more','also','here',
            'well','only','very','even','back','there','down','still','in',
            'as','to','when','never','really','most','on','why','about','over',
            'again','where','right','off','always','today','all','far','long',
            'away','yet','often','ever','however','almost','later','much',
            'once','least','ago','together','around','already','enough','both',
            'maybe','actually','probably','home','of course','perhaps','little',
            'else','sometimes','finally','less','better','early','especially',
            'either','quite','simply','nearly','soon','certainly','quickly',
            'no','recently','before','usually','thus','exactly','hard',
            'particularly','forward','ok','okay','clearly','indeed',
            'rather','that','tonight','close','suddenly','best','instead',
            'ahead','fast','alone','eventually','directly'])


stop.extend(['car','race','go','thi','thing','becaus','thank','give','people',
            'interest','pleas','thing','every','way','player','win',
            'onli','coin','spend','say','want','realli','veri','wa',
            'because','very','many','puzzle','onli','challeng','great',
            'nice','cute','good','would','easi','peopl','time','tri',
            'take','buy','mani','everi','pop','doe','set','amaz','easi',
            'ha','robot','drive','video','download','record','song','music',
            'level','ca','photo','wonder','star','show','think','awesom',
            'could','got','shoot','lot','easili','love','cool','tv','fantast',
            '30','truck','ani','first','add','editor','beauti', 'instal',
            'day','turn','charact','second','kid','see','doesnt','im','ive',
            'sure','sinc','fine','didnt','dont','know','sinc','whi','anyth',
            'properli','perfectli','hope','wont','cant','highli', 'android',
            'recommend','excel','worst','wast','look','new','pl','keep','come','bad',
            'annoy','connect','seem','sometim','featur'])

# ## Extend for prepositions
stop.extend(['without','among'])

In [276]:
vectorizer = CountVectorizer(stop_words = stop)

doc_word = vectorizer.fit_transform(df_analysis.stemmed)



In [277]:
nmf_model = NMF(8)
doc_topic = nmf_model.fit_transform(doc_word)
topic_word = nmf_model.components_

In [278]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-6:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['ad', 'watch', 'remov', 'uninstal', 'minut'],
 ['work', 'stop', 'devic', 'button', 'samsung'],
 ['fix', 'problem', 'issu', 'bug', 'crash'],
 ['phone', 'screen', 'call', 'number', 'old'],
 ['updat', 'version', 'last', 'latest', 'year'],
 ['money', 'option', 'pay', 'start', 'chang'],
 ['applic', 'develop', 'graphic', 'mobil', 'store'],
 ['help', 'learn', 'find', 'word', 'problem']]

In [279]:
df_analysis['topic'] = doc_topic.argmax(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_analysis['topic'] = doc_topic.argmax(axis=1)


In [280]:
with option_context('display.max_colwidth', 600):
    display(df_analysis[df_analysis['topic'] == 3].sample(5))

Unnamed: 0,title,comments,tokens,stemmed,topic,comments_check
487275,MirrorGo (Stream & Recorder),even if i had to force close to cut off pc even im thinking task manager might be useful in stopping process so that safely eject phone,even if i had to force close to cut off pc even im thinking task manager might be useful in stopping process so that safely eject phone,even if i had to forc close to cut off pc even im think task manag might be use in stop process so that safe eject phone,3,even if i had to force close to cut off pc even im thinking task manager might be useful in stopping process so that safely eject phone
302899,Dunia Games,up wise rhyyd i will be there for interview on the phone with the kids so we can go over to my place for me and i can come by and pick it up at your house and i will get you a check for the full day on the job site i was going on with the house is a good time to come by and see you soon and have a great day and i will be there at the same time as the other one is,up wise rhyyd i will be there for interview on the phone with the kids so we can go over to my place for me and i can come by and pick it up at your house and i will get you a check for the full day on the job site i was going on with the house is a good time to come by and see you soon and have a great day and i will be there at the same time as the other one is,up wise rhyyd i will be there for interview on the phone with the kid so we can go over to my place for me and i can come by and pick it up at your hous and i will get you a check for the full day on the job site i wa go on with the hous is a good time to come by and see you soon and have a great day and i will be there at the same time as the other one is,3,up wise rhyyd i will be there for interview on the phone with the kids so we can go over to my place for me and i can come by and pick it up at your house and i will get you a check for the full day on the job site i was going on with the house is a good time to come by and see you soon and have a great day and i will be there at the same time as the other one is
1238749,Ringtones - Wallpapers,i love minecraft ringtones i get to personalize the ring tone and text tones to all of my favorite people so i know exactly whos calling or texting without looking at my phone i even have all others apps personalized so i know which notification im receiving so many to choose from,i love minecraft ringtones i get to personalize the ring tone and text tones to all of my favorite people so i know exactly whos calling or texting without looking at my phone i even have all others apps personalized so i know which notification im receiving so many to choose from,i love minecraft rington i get to person the ring tone and text tone to all of my favorit peopl so i know exactli who call or text without look at my phone i even have all other app person so i know which notif im receiv so mani to choos from,3,i love minecraft ringtones i get to personalize the ring tone and text tones to all of my favorite people so i know exactly whos calling or texting without looking at my phone i even have all others apps personalized so i know which notification im receiving so many to choose from
290289,AppRadio,every few mins your phone trys to start the app even if you dont want it too so the music just stops as your the pioneer app takes over it makes me regret buying this pioneer stereo unit for my truck i even deleted the app,every few mins your phone trys to start the app even if you dont want it too so the music just stops as your the pioneer app takes over it makes me regret buying this pioneer stereo unit for my truck i even deleted the app,everi few min your phone tri to start the app even if you dont want it too so the music just stop as your the pioneer app take over it make me regret buy thi pioneer stereo unit for my truck i even delet the app,3,every few mins your phone trys to start the app even if you dont want it too so the music just stops as your the pioneer app takes over it makes me regret buying this pioneer stereo unit for my truck i even deleted the app
999040,Valley Parking 3D,this game is nice but why it not fully fit in my phones screen at the lower portion of the screen is a thick black lineand the main game is displayed above the black linewhy this is happening to me i cant get the full screen gaming experience from this gameplease fix it now,this game is nice but why it not fully fit in my phones screen at the lower portion of the screen is a thick black lineand the main game is displayed above the black linewhy this is happening to me i cant get the full screen gaming experience from this gameplease fix it now,thi game is nice but whi it not fulli fit in my phone screen at the lower portion of the screen is a thick black lineand the main game is display abov the black linewhi thi is happen to me i cant get the full screen game experi from thi gamepleas fix it now,3,this game is nice but why it not fully fit in my phones screen at the lower portion of the screen is a thick black lineand the main game is displayed above the black linewhy this is happening to me i cant get the full screen gaming experience from this gameplease fix it now


## Sentiment analysis

In [281]:
sid_obj = SentimentIntensityAnalyzer()
sentiment = []
for comment in df_analysis.comments:
    sentiment.append(sid_obj.polarity_scores(comment))

In [282]:
sentiment_df = pd.DataFrame(sentiment)
sentiment_df

Unnamed: 0,neg,neu,pos,compound
0,0.000,0.625,0.375,0.2023
1,0.206,0.794,0.000,-0.4409
2,0.071,0.647,0.281,0.8555
3,0.119,0.766,0.115,-0.0258
4,0.000,0.467,0.533,0.6908
...,...,...,...,...
1257770,0.000,0.696,0.304,0.5859
1257771,0.000,1.000,0.000,0.0000
1257772,0.031,0.933,0.036,0.0644
1257773,0.000,0.510,0.490,0.9109


In [283]:
df_merged = pd.concat([df_analysis,sentiment_df],axis=1)

In [284]:
df_merged

Unnamed: 0,title,comments,tokens,stemmed,topic,comments_check,neg,neu,pos,compound
0,World War 2: Offline Strategy,i just started playing,i just started playing,i just start play,5.0,i just started playing,0.000,0.625,0.375,0.2023
1,World War 2: Offline Strategy,im on the rd level and its already so hard th...,im on the rd level and its already so hard tha...,im on the rd level and it alreadi so hard that...,5.0,im on the rd level and its already so hard th...,0.206,0.794,0.000,-0.4409
2,World War 2: Offline Strategy,games are fun when they are easy to play but ...,games are fun when they are easy to play but w...,game are fun when they are easi to play but wh...,5.0,games are fun when they are easy to play but ...,0.071,0.647,0.281,0.8555
3,World War 2: Offline Strategy,plus there are way too many ads on this game ...,plus there are way too many ads on this game i...,plu there are way too mani ad on thi game ill ...,0.0,plus there are way too many ads on this game ...,0.119,0.766,0.115,-0.0258
4,World War 2: Offline Strategy,fun game to play with decent graphics,fun game to play with decent graphics,fun game to play with decent graphic,5.0,fun game to play with decent graphics,0.000,0.467,0.533,0.6908
...,...,...,...,...,...,...,...,...,...,...
1473037,Tarot Card Reading,good energies are passed on and just to go wi...,good energies are passed on and just to go wit...,good energi are pass on and just to go with th...,5.0,good energies are passed on and just to go wi...,,,,
1473041,Tarot Card Reading,we all are destined to be here,we all are destined to be here,we all are destin to be here,5.0,we all are destined to be here,,,,
1473042,Tarot Card Reading,there and maybe for the unsure anywhere for a...,there and maybe for the unsure anywhere for as...,there and mayb for the unsur anywher for as lo...,5.0,there and maybe for the unsure anywhere for a...,,,,
1473043,Tarot Card Reading,this app is very awesome and amazing and ever...,this app is very awesome and amazing and every...,thi app is veri awesom and amaz and everyth it...,5.0,this app is very awesome and amazing and ever...,,,,


In [285]:
df_merged.groupby('topic')['compound'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,112466.0,0.260407,0.497273,-1.0,0.0,0.3289,0.7003,1.0
1.0,66929.0,0.260869,0.495983,-0.9999,0.0,0.3291,0.6996,1.0
2.0,45409.0,0.26508,0.493878,-0.9994,0.0,0.34,0.7003,1.0
3.0,36265.0,0.255408,0.498196,-0.9989,0.0,0.3182,0.6956,1.0
4.0,35231.0,0.26369,0.492799,-1.0,0.0,0.3291,0.7003,0.9998
5.0,718056.0,0.261397,0.495384,-1.0,0.0,0.3252,0.7003,1.0
6.0,19398.0,0.244858,0.504374,-0.9999,0.0,0.3089,0.6901,1.0
7.0,45526.0,0.255024,0.495801,-0.9999,0.0,0.3182,0.6908,1.0


In [None]:
[['ad', 'watch', 'remov', 'uninstal', 'minut'],
 ['work', 'stop', 'devic', 'button', 'seem'],
 ['fix', 'problem', 'issu', 'bug', 'crash'],
 ['phone', 'screen', 'call', 'number', 'old'],
 ['updat', 'version', 'last', 'latest', 'year'],
 ['help', 'money', 'option', 'pay', 'start'],
 ['applic', 'featur', 'develop', 'help', 'mobil'],
 ['graphic', 'control', 'featur', 'sound', 'gameplay']]

In [None]:
[['ad', 'watch', 'remov', 'uninstal', 'minut'],
 ['work', 'stop', 'devic', 'button', 'samsung'],
 ['fix', 'problem', 'issu', 'bug', 'crash'],
 ['phone', 'screen', 'call', 'number', 'old'],
 ['updat', 'version', 'last', 'latest', 'year'],
 ['money', 'option', 'pay', 'start', 'chang'],
 ['applic', 'featur', 'develop', 'mobil', 'store'],
 ['graphic', 'control', 'featur', 'sound', 'gameplay'],
 ['help', 'learn', 'find', 'problem', 'word']]

In [149]:
# [['ad', 'watch', 'annoy', 'remov', 'uninstal'],
#  ['work', 'stop', 'connect', 'devic', 'button'],
#  ['money', 'option', 'pay', 'start', 'chang'],
#  ['phone', 'screen', 'instal', 'call', 'number'],
#  ['updat', 'version', 'last', 'latest', 'year'],
#  ['fix', 'problem', 'issu', 'bug', 'crash'],
#  ['bad', 'graphic', 'control', 'instal', 'featur'],
#  ['applic', 'featur', 'instal', 'develop', 'mobil'],
#  ['help', 'learn', 'find', 'problem', 'word']]

In [None]:
#Odnoklassniki Moderator