In [1]:
import pandas as pd
from pandas import option_context
import numpy as np
import re
import string
import pickle

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

In [2]:
df = pd.read_csv('04-data/test_data_scraped.csv')

In [3]:
print('Shape:', df.shape)
print( )
print('Columns:', df.columns)

Shape: (3376676, 13)

Columns: Index(['Unnamed: 0', 'reviewId', 'userName', 'userImage', 'content', 'score',
       'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent',
       'repliedAt', 'sortOrder', 'appId'],
      dtype='object')


In [4]:
df['count'] = 1

In [5]:
df_agg = df.groupby(['userName']).agg({'count':sum})

In [6]:
df_analysis = df[['appId','content']].copy()
df_analysis.reset_index(drop=True,inplace=True)

In [7]:
df_analysis['content'] = df_analysis['content'].astype(str)

In [8]:
def clean_text(text):
    '''Make text lowercase, remove punctuation, remove links and mentions'''
    text = re.sub('[0-9\n]',' ',text)
    text = text.lower()
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub("\s\s+", " ", text)
    text = re.sub("[^a-zA-Z0-9 -]","",text)
    
    return text

df_analysis['content'] = df_analysis.content.map(clean_text)

## Topic modeling

In [9]:
df_analysis_topic = df_analysis.copy()

In [10]:
df_analysis_topic['tokens'] = df_analysis_topic.content.apply(nltk.word_tokenize)

In [11]:
stemmer = PorterStemmer()

df_analysis_topic['stemmed'] = df_analysis_topic['tokens'].apply(lambda x: [stemmer.stem(y) for y in x])

In [12]:
df_analysis_topic.tokens = df_analysis_topic.tokens.apply(lambda x: ' '.join(x))

In [13]:
df_analysis_topic.stemmed = df_analysis_topic.stemmed.apply(lambda x: ' '.join(x))

In [14]:
df_analysis_topic = df_analysis_topic[df_analysis_topic['stemmed'].map(len) > 15]

In [15]:
#df_analysis1 = df[['title','stemmed']].copy()
df_analysis_topic.reset_index(drop=True,inplace=True)

In [16]:
df_analysis_topic.shape

(1938897, 4)

In [17]:
stop = stopwords.words('english')
stop.extend(['free','get','hd','use','game','games','make','makes','play',
             'fun','features', 'need','live', 'also','using','best','us','app','apps', 'one', '2020','2021',
            'this','like','enjoy','thing','free'])

# # Extend for most common English adverbs

stop.extend(['up','so','out','just','now','how','then','more','also','here',
            'well','only','very','even','back','there','down','still','in',
            'as','to','when','never','really','most','on','why','about','over',
            'again','where','right','off','always','today','all','far','long',
            'away','yet','often','ever','however','almost','later','much',
            'once','least','ago','together','around','already','enough','both',
            'maybe','actually','probably','home','of course','perhaps','little',
            'else','sometimes','finally','less','better','early','especially',
            'either','quite','simply','nearly','soon','certainly','quickly',
            'no','recently','before','usually','thus','exactly','hard',
            'particularly','forward','ok','okay','clearly','indeed',
            'rather','that','tonight','close','suddenly','best','instead',
            'ahead','fast','alone','eventually','directly'])


stop.extend(['car','race','go','thi','thing','becaus','thank','give','people',
            'interest','pleas','thing','every','way','player','win',
            'onli','coin','spend','say','want','realli','veri','wa',
            'because','very','many','puzzle','onli','challeng','great',
            'nice','cute','good','would','easi','peopl','tri', 'abl',
            'take','buy','mani','everi','pop','doe','set','amaz',
            'ha','robot','drive','video','download','record','song','music',
            'level','ca','photo','wonder','star','show','think','awesom',
            'could','got','shoot','lot','easili','love','cool','tv','fantast',
            '30','truck','ani','first','add','editor','beauti', 'instal',
            'day','turn','charact','second','kid','see','doesnt','im','ive',
            'sure','sinc','fine','didnt','dont','know','sinc','whi','anyth',
            'properli','perfectli','hope','wont','cant','highli', 'android',
            'recommend','excel','worst','wast','look','new','pl','keep','come','bad',
            'annoy','sometim','seem','screen','connect','samsung','uninstal','open',
            'time','pass','problem','paid','applic','call','number','phone','version'])

# ## Extend for prepositions
stop.extend(['without','among'])

In [18]:
vectorizer = CountVectorizer(stop_words = stop)

doc_word = vectorizer.fit_transform(df_analysis_topic.stemmed)



In [19]:
nmf_model = NMF(7)
doc_topic = nmf_model.fit_transform(doc_word)
topic_word = nmf_model.components_

In [20]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-6:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['ad', 'watch', 'remov', 'click', 'forc'],
 ['work', 'stop', 'devic', 'reinstal', 'button'],
 ['updat', 'last', 'latest', 'year', 'befor'],
 ['option', 'chang', 'start', 'find', 'featur'],
 ['fix', 'bug', 'issu', 'crash', 'load'],
 ['money', 'pay', 'real', 'card', 'purchas'],
 ['help', 'learn', 'find', 'account', 'support']]

Topics:
* Ads in the app
* Device compatability
* App updates
* Finding, using and requesting features
* App bugs
* Payments-related
* Using the app and app support

In [21]:
df_analysis_topic['topic'] = doc_topic.argmax(axis=1)

In [22]:
with option_context('display.max_colwidth', 600):
    display(df_analysis_topic[df_analysis_topic['topic'] == 2].sample(5))

Unnamed: 0,appId,content,tokens,stemmed,topic
197174,com.superluckycasino.nolimit.slots.vegas.android.free,this game wont even launch cant get past the update to new version message looooooser,this game wont even launch cant get past the update to new version message looooooser,thi game wont even launch cant get past the updat to new version messag looooooser,2
541091,com.app.notepad.vault.hider,after recent update the app opening without password,after recent update the app opening without password,after recent updat the app open without password,2
540155,com.bloomberg.android.anywhere,new update a disaster shuts down in minutes,new update a disaster shuts down in minutes,new updat a disast shut down in minut,2
444894,com.survivalcrafting.gunship.battle.crafting.building.flying.shooting.ww2.game.army.war.air.survival.minecraft.helicopter.mcpe,hey i have a question do you still update this game,hey i have a question do you still update this game,hey i have a question do you still updat thi game,2
931235,kr.co.angames.astrokings.google.android,update after weeks playing im now giving up too after attacks by same playeri lost everything days shield and planet relocated by the gameits too little too late at a month i cant afford daily shields i cant keep building resources only to get back to square one federation cer in game are nothing but bullies and even said theyd stop the war if all players stayed out of public chat not a game for children or those with financial responsibilities,update after weeks playing im now giving up too after attacks by same playeri lost everything days shield and planet relocated by the gameits too little too late at a month i cant afford daily shields i cant keep building resources only to get back to square one federation cer in game are nothing but bullies and even said theyd stop the war if all players stayed out of public chat not a game for children or those with financial responsibilities,updat after week play im now give up too after attack by same playeri lost everyth day shield and planet reloc by the gameit too littl too late at a month i cant afford daili shield i cant keep build resourc onli to get back to squar one feder cer in game are noth but bulli and even said theyd stop the war if all player stay out of public chat not a game for children or those with financi respons,2


## Sentiment analysis 

In [23]:
sid_obj = SentimentIntensityAnalyzer()
sentiment = []
for comment in df_analysis_topic.content:
    sentiment.append(sid_obj.polarity_scores(comment))

In [24]:
sentiment_df = pd.DataFrame(sentiment)
sentiment_df

Unnamed: 0,neg,neu,pos,compound
0,0.341,0.659,0.000,-0.4767
1,0.000,1.000,0.000,0.0000
2,0.108,0.700,0.192,0.8271
3,0.103,0.897,0.000,-0.4449
4,0.241,0.759,0.000,-0.5423
...,...,...,...,...
1938892,0.000,0.284,0.716,0.8775
1938893,0.000,0.618,0.382,0.4754
1938894,0.000,0.423,0.577,0.6249
1938895,0.000,1.000,0.000,0.0000


## Merge topic model and sentiment analysis

In [25]:
df_merged = pd.concat([df_analysis_topic,sentiment_df],axis=1)
df_merged

Unnamed: 0,appId,content,tokens,stemmed,topic,neg,neu,pos,compound
0,com.skizze.wwii,poor game just another copy of game,poor game just another copy of game,poor game just anoth copi of game,3,0.341,0.659,0.000,-0.4767
1,com.skizze.wwii,soooo many addsavoid,soooo many addsavoid,soooo mani addsavoid,3,0.000,1.000,0.000,0.0000
2,com.skizze.wwii,i just started playing im on the rd level and ...,i just started playing im on the rd level and ...,i just start play im on the rd level and it al...,5,0.108,0.700,0.192,0.8271
3,com.skizze.wwii,adware youll spend more time watching ads then...,adware youll spend more time watching ads then...,adwar youll spend more time watch ad then pay ...,5,0.103,0.897,0.000,-0.4449
4,com.skizze.wwii,this does not even work at all the enemy are n...,this does not even work at all the enemy are n...,thi doe not even work at all the enemi are not...,1,0.241,0.759,0.000,-0.5423
...,...,...,...,...,...,...,...,...,...
1938892,tarotcardreadingfree.bitapps,exceptional readings with great talent no chea...,exceptional readings with great talent no chea...,except read with great talent no cheat,3,0.000,0.284,0.716,0.8775
1938893,tarotcardreadingfree.bitapps,really liked my first reading itself,really liked my first reading itself,realli like my first read itself,3,0.000,0.618,0.382,0.4754
1938894,tarotcardreadingfree.bitapps,awesome and spot on,awesome and spot on,awesom and spot on,3,0.000,0.423,0.577,0.6249
1938895,tarotcardreadingfree.bitapps,i have to see if its accurate,i have to see if its accurate,i have to see if it accur,3,0.000,1.000,0.000,0.0000


In [43]:
df_merged.to_csv('sentiment_tableau.csv')

In [26]:
df_merged.groupby('topic')['compound'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,246881.0,0.24575,0.467249,-1.0,0.0,0.3612,0.6369,0.9998
1,142208.0,0.16377,0.468525,-0.9989,-0.070425,0.0292,0.5719,0.9989
2,70331.0,0.141569,0.472511,-0.9941,-0.1027,0.0,0.5267,0.9999
3,1224437.0,0.181526,0.48566,-0.9997,-0.1027,0.1531,0.6249,1.0
4,91556.0,0.170869,0.485518,-0.9918,-0.1531,0.1779,0.5927,0.9977
5,95198.0,0.153032,0.51995,-0.9928,-0.2263,0.1406,0.6183,0.9995
6,68286.0,0.444348,0.433809,-0.9935,0.21955,0.5574,0.7783,0.9977


In [32]:
df_comp = df_merged.groupby(['appId'])['compound'].mean().reset_index()

In [33]:
df_comp

Unnamed: 0,appId,compound
0,AutomateIt.mainPackage,0.179567
1,B4A.BigFivePersonalityTest,0.107415
2,DOCECG2.doctor,0.135315
3,Gecko.Droid.PhysicsHelper,0.254861
4,MyING.be,0.084341
...,...,...
22140,zombie.survival.online.craft,0.151007
22141,zoo.rescue,0.231101
22142,zsj.android.systemappremover,0.181393
22143,zumbafitness.weightlossdance,0.206941


### Create dataframe which is average sentiment per topic per app
This will be used in the linear regression modeling

In [27]:
final_df = df_merged.groupby(['appId','topic'])['compound'].mean().reset_index()

In [28]:
final_df = final_df.pivot_table(index=['appId'],columns=['topic'],values='compound').fillna(0)

In [29]:
final_df.rename(columns = {0: 'app_ads',
                                  1: 'compatibility',
                                  2: 'updates',
                                  3: 'features',
                                  4: 'bugs',
                                  5: 'payments',
                                  6: 'use_support'},
                                   inplace=True)

In [40]:
final_df = pd.merge(
    final_df,
    df_comp,
    how="inner",
    on='appId',
    left_index=False,
    right_index=False,
    sort=True,
    suffixes=("_x", "_y"),
    copy=True,
    indicator=False,
    validate=None,
)

In [41]:
final_df

Unnamed: 0,appId,app_ads,compatibility,updates,features,bugs,payments,use_support,compound
0,AutomateIt.mainPackage,0.219857,0.219800,0.004050,0.157251,0.078522,0.000000,0.796167,0.179567
1,B4A.BigFivePersonalityTest,0.006000,0.081300,0.000000,0.107957,0.165250,0.000000,0.421500,0.107415
2,DOCECG2.doctor,0.103000,0.516950,0.307100,0.041460,0.726900,0.000000,0.329025,0.135315
3,Gecko.Droid.PhysicsHelper,0.014882,0.547950,0.516650,0.229898,0.134900,-0.612400,0.657855,0.254861
4,MyING.be,0.100560,0.058135,0.084291,0.109709,0.159910,-0.162960,0.116720,0.084341
...,...,...,...,...,...,...,...,...,...
22140,zombie.survival.online.craft,0.477687,-0.633700,0.227817,0.237792,0.198900,-0.300818,0.000000,0.151007
22141,zoo.rescue,0.276058,-0.014333,-0.004500,0.249309,0.184100,0.143767,0.459833,0.231101
22142,zsj.android.systemappremover,0.336800,0.210760,-0.119300,0.146147,0.025800,-0.283200,0.375044,0.181393
22143,zumbafitness.weightlossdance,-0.035430,0.303322,0.000000,0.210964,0.000000,-0.239500,0.540414,0.206941


In [42]:
final_df.to_csv('04-data/sentiment_topics_apps.csv')