In [1]:
import pandas as pd
from pandas import option_context
import numpy as np
import re
import string
import pickle

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

In [2]:
# Load the scraped review export; every column is kept as pandas infers it.
df = pd.read_csv(filepath_or_buffer='04-data/test_data_scraped.csv')

In [3]:
# Quick sanity check of what was loaded: dimensions, a blank line, column names.
print('Shape:', df.shape, end='\n\n')
print('Columns:', df.columns)

Shape: (3376676, 13)

Columns: Index(['Unnamed: 0', 'reviewId', 'userName', 'userImage', 'content', 'score',
       'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent',
       'repliedAt', 'sortOrder', 'appId'],
      dtype='object')


In [4]:
# Helper column of ones: summing it per group yields the number of rows in that group.
df['count'] = 1

In [5]:
# Number of reviews written under each user name (sum of the helper column of ones).
# The aggregation is named by string: handing the builtin `sum` to `.agg` triggers a
# FutureWarning on recent pandas, whereas 'sum' is stable across versions.
df_agg = df.groupby(['userName']).agg({'count': 'sum'})

In [6]:
# Working frame for the text analysis: only the app identifier and the review text,
# on a fresh 0..n-1 index.
df_analysis = df.loc[:, ['appId', 'content']].reset_index(drop=True)

In [7]:
# Force every review to a Python string so the regex cleaning below can run on it.
# Missing reviews are replaced by an empty string first; a bare `astype(str)` would
# turn them into the literal text 'nan', which would then be treated as a real word.
df_analysis['content'] = df_analysis['content'].fillna('').astype(str)

In [8]:
def clean_text(text):
    '''Normalise a raw review string into lowercase, space-separated a-z words.

    Steps, in order:
      1. lowercase the text;
      2. remove links (``http://`` / ``https://``) and ``@mentions`` together with
         everything up to the next whitespace;
      3. replace ASCII digits with a space;
      4. delete ASCII punctuation;
      5. delete every remaining character that is neither ``a-z`` nor whitespace;
      6. collapse each whitespace run into a single space and strip the ends.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text; may be empty.
    '''
    text = text.lower()
    # Links and mentions go first: blanking digits beforehand would split a URL such
    # as "http://a1b.com" at the digit and leave its tail behind as junk tokens.
    text = re.sub(r"(?:@|https?://)\S+", "", text)
    text = re.sub(r"[0-9]", " ", text)
    text = re.sub("[%s]" % re.escape(string.punctuation), "", text)
    # Whitespace of any kind is kept here so that a tab or line break still
    # separates the words on either side of it.
    text = re.sub(r"[^a-z\s]", "", text)
    # Collapse last, so gaps left by the removals above are normalised as well.
    return re.sub(r"\s+", " ", text).strip()

# Overwrite the review text with its cleaned form, one review at a time.
df_analysis['content'] = df_analysis['content'].apply(clean_text)

## Topic modeling

In [13]:
# Independent copy for the topic-modelling branch, so the columns added below
# do not appear on df_analysis.
df_analysis_topic = df_analysis.copy(deep=True)

In [14]:
# Split every cleaned review into a list of word tokens with NLTK's tokenizer.
df_analysis_topic['tokens'] = df_analysis_topic['content'].apply(word_tokenize)

In [15]:
# Reduce every token to its Porter stem; each row stays a list, parallel to 'tokens'.
stemmer = PorterStemmer()

df_analysis_topic['stemmed'] = df_analysis_topic['tokens'].apply(
    lambda token_list: list(map(stemmer.stem, token_list))
)

In [16]:
# Turn each token list back into one space-separated string.
df_analysis_topic['tokens'] = df_analysis_topic['tokens'].apply(' '.join)

In [17]:
# Same for the stems: one space-separated string per review.
df_analysis_topic['stemmed'] = df_analysis_topic['stemmed'].apply(' '.join)

In [18]:
# Keep only reviews whose stemmed text is longer than 15 characters.
df_analysis_topic = df_analysis_topic.loc[df_analysis_topic['stemmed'].str.len().gt(15)]

In [19]:
# Renumber the surviving rows 0..n-1; later cells line rows up by position.
df_analysis_topic = df_analysis_topic.reset_index(drop=True)

In [20]:
# Rows and columns left after the length filter.
df_analysis_topic.shape

(1938897, 4)

In [140]:
# Stop-word list: NLTK's English list followed by hand-picked additions.
# Order and duplicates are kept exactly as entered.
#
# NOTE(review): the list is handed to a vectorizer that is fitted on the *stemmed*
# text, so unstemmed entries (e.g. 'features', 'before') presumably never match
# their stems, and 'of course' contains a space so it presumably cannot match a
# single token -- confirm whether stems should be added.
stop = stopwords.words('english')

# Generic review / app vocabulary.
stop += [
    'free', 'get', 'hd', 'use', 'game', 'games', 'make', 'makes', 'play',
    'fun', 'features', 'need', 'live', 'also', 'using', 'best', 'us',
    'app', 'apps', 'one', '2020', '2021',
    'this', 'like', 'enjoy', 'thing', 'free',
]

# Most common English adverbs.
stop += [
    'up', 'so', 'out', 'just', 'now', 'how', 'then', 'more', 'also', 'here',
    'well', 'only', 'very', 'even', 'back', 'there', 'down', 'still', 'in',
    'as', 'to', 'when', 'never', 'really', 'most', 'on', 'why', 'about', 'over',
    'again', 'where', 'right', 'off', 'always', 'today', 'all', 'far', 'long',
    'away', 'yet', 'often', 'ever', 'however', 'almost', 'later', 'much',
    'once', 'least', 'ago', 'together', 'around', 'already', 'enough', 'both',
    'maybe', 'actually', 'probably', 'home', 'of course', 'perhaps', 'little',
    'else', 'sometimes', 'finally', 'less', 'better', 'early', 'especially',
    'either', 'quite', 'simply', 'nearly', 'soon', 'certainly', 'quickly',
    'no', 'recently', 'before', 'usually', 'thus', 'exactly', 'hard',
    'particularly', 'forward', 'ok', 'okay', 'clearly', 'indeed',
    'rather', 'that', 'tonight', 'close', 'suddenly', 'best', 'instead',
    'ahead', 'fast', 'alone', 'eventually', 'directly',
]

# Further hand-picked words, many of them already in stemmed form.
stop += [
    'car', 'race', 'go', 'thi', 'thing', 'becaus', 'thank', 'give', 'people',
    'interest', 'pleas', 'thing', 'every', 'way', 'player', 'win',
    'onli', 'coin', 'spend', 'say', 'want', 'realli', 'veri', 'wa',
    'because', 'very', 'many', 'puzzle', 'onli', 'challeng', 'great',
    'nice', 'cute', 'good', 'would', 'easi', 'peopl', 'tri', 'abl',
    'take', 'buy', 'mani', 'everi', 'pop', 'doe', 'set', 'amaz',
    'ha', 'robot', 'drive', 'video', 'download', 'record', 'song', 'music',
    'level', 'ca', 'photo', 'wonder', 'star', 'show', 'think', 'awesom',
    'could', 'got', 'shoot', 'lot', 'easili', 'love', 'cool', 'tv', 'fantast',
    '30', 'truck', 'ani', 'first', 'add', 'editor', 'beauti', 'instal',
    'day', 'turn', 'charact', 'second', 'kid', 'see', 'doesnt', 'im', 'ive',
    'sure', 'sinc', 'fine', 'didnt', 'dont', 'know', 'sinc', 'whi', 'anyth',
    'properli', 'perfectli', 'hope', 'wont', 'cant', 'highli', 'android',
    'recommend', 'excel', 'worst', 'wast', 'look', 'new', 'pl', 'keep', 'come', 'bad',
    'annoy', 'sometim', 'seem', 'screen', 'connect', 'samsung', 'uninstal', 'open',
    'time', 'pass', 'problem', 'paid', 'applic', 'call', 'number', 'phone', 'version',
]

# Prepositions.
stop += ['without', 'among']

In [141]:
# Bag-of-words counts over the stemmed reviews, ignoring every word in `stop`.
vectorizer = CountVectorizer(stop_words=stop)

# Learn the vocabulary and build the document-term count matrix in one pass.
doc_word = vectorizer.fit_transform(df_analysis_topic['stemmed'])



In [142]:
# Factorise the document-term counts into 7 topics.
# random_state is pinned so that re-running the notebook reproduces the same
# factorisation: later cells refer to topics by their integer index, so the topic
# order has to be stable between runs.
nmf_model = NMF(n_components=7, random_state=42)
doc_topic = nmf_model.fit_transform(doc_word)   # one row of topic weights per review
topic_word = nmf_model.components_              # one row of word weights per topic

In [143]:
# Vocabulary in column order. `get_feature_names()` no longer exists in current
# scikit-learn releases; prefer its replacement and fall back only on old installs.
try:
    words = vectorizer.get_feature_names_out()
except AttributeError:
    words = vectorizer.get_feature_names()
# Plain Python strings, so the displayed lists do not show NumPy scalar wrappers.
words = [str(word) for word in words]

# Column indices of the five largest weights in each topic, largest first.
t = nmf_model.components_.argsort(axis=1)[:, -1:-6:-1]
topic_words = [[words[col] for col in top_cols] for top_cols in t]
topic_words

[['ad', 'watch', 'remov', 'click', 'forc'],
 ['work', 'stop', 'devic', 'reinstal', 'button'],
 ['updat', 'last', 'latest', 'year', 'befor'],
 ['option', 'chang', 'start', 'find', 'featur'],
 ['fix', 'bug', 'issu', 'crash', 'load'],
 ['money', 'pay', 'real', 'card', 'purchas'],
 ['help', 'learn', 'find', 'account', 'support']]

Topics:
* Ads in the app
* Device compatibility
* App updates
* Finding, using and requesting features
* App bugs
* Payments-related
* Using the app and app support

In [144]:
# Label every review with the index of its highest-weighted topic.
# NOTE(review): argmax returns 0 when all weights in a row are equal (e.g. all
# zeros) -- confirm such rows cannot occur or are acceptable under topic 0.
df_analysis_topic['topic'] = np.argmax(doc_topic, axis=1)

In [145]:
# Eyeball five random reviews assigned to topic 2, with wide columns so the
# review text is not cut off.
with option_context('display.max_colwidth', 600):
    display(df_analysis_topic.query('topic == 2').sample(n=5))

Unnamed: 0,appId,content,tokens,stemmed,topic
1778164,de.dwd.warnapp,funktioniert nicht hinter einem proxyserver keine daten,funktioniert nicht hinter einem proxyserver keine daten,funktioniert nicht hinter einem proxyserv kein daten,2
327543,de.elfsoft.bestbrokers,its a good app but its needs to be updated to show prices for the stocks you own not just percentages as well as other things,its a good app but its needs to be updated to show prices for the stocks you own not just percentages as well as other things,it a good app but it need to be updat to show price for the stock you own not just percentag as well as other thing,2
996413,com.com2us.soccerspirits.normal2.freefull.google.global.android.common,after the reborn update this game feels bad appreciate the dev to change the character upgrade systems with spirit and stone upgrade skip result the downside was grindings means more stamina to be consumed yet the game have a lot of stages and or events to play especially to get those character spirit from every stages,after the reborn update this game feels bad appreciate the dev to change the character upgrade systems with spirit and stone upgrade skip result the downside was grindings means more stamina to be consumed yet the game have a lot of stages and or events to play especially to get those character spirit from every stages,after the reborn updat thi game feel bad appreci the dev to chang the charact upgrad system with spirit and stone upgrad skip result the downsid wa grind mean more stamina to be consum yet the game have a lot of stage and or event to play especi to get those charact spirit from everi stage,2
728116,com.generalmagic.magicearth,my phone was automatically disconnected and out of this app once i received the update,my phone was automatically disconnected and out of this app once i received the update,my phone wa automat disconnect and out of thi app onc i receiv the updat,2
1361468,de.kroegerama.android4batpercent,after my latest update my galaxy s it doest work anymore,after my latest update my galaxy s it doest work anymore,after my latest updat my galaxi s it doest work anymor,2


## Sentiment analysis 

In [146]:
# Score every cleaned review with VADER; each result is one dict of scores
# (the columns shown when the list is turned into a frame below).
sid_obj = SentimentIntensityAnalyzer()
sentiment = [
    sid_obj.polarity_scores(review_text)
    for review_text in df_analysis_topic['content']
]

In [147]:
# One row per review, one column per VADER score.
sentiment_df = pd.DataFrame(data=sentiment)
sentiment_df

Unnamed: 0,neg,neu,pos,compound
0,0.341,0.659,0.000,-0.4767
1,0.000,1.000,0.000,0.0000
2,0.108,0.700,0.192,0.8271
3,0.103,0.897,0.000,-0.4449
4,0.241,0.759,0.000,-0.5423
...,...,...,...,...
1938892,0.000,0.284,0.716,0.8775
1938893,0.000,0.618,0.382,0.4754
1938894,0.000,0.423,0.577,0.6249
1938895,0.000,1.000,0.000,0.0000


## Merge topic model and sentiment analysis

In [148]:
# Put the sentiment scores next to the review/topic columns. Rows are matched on
# the index: df_analysis_topic was renumbered 0..n-1 earlier and sentiment_df was
# built from a list in the same row order.
df_merged = pd.concat(objs=[df_analysis_topic, sentiment_df], axis='columns')
df_merged

In [150]:
# Distribution of the compound sentiment score within each topic.
df_merged['compound'].groupby(df_merged['topic']).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,246835.0,0.245739,0.467223,-1.0,0.0,0.3612,0.6369,0.9998
1,142233.0,0.163732,0.468477,-0.9989,-0.0705,0.029,0.5719,0.9989
2,70292.0,0.141557,0.472527,-0.9941,-0.1027,0.0,0.5267,0.9999
3,1224932.0,0.181478,0.485676,-0.9997,-0.1027,0.1531,0.6249,1.0
4,91384.0,0.1709,0.485608,-0.9918,-0.1531,0.1779,0.5927,0.9977
5,94654.0,0.152984,0.520062,-0.9928,-0.2263,0.1406,0.6183,0.9995
6,68567.0,0.444057,0.433744,-0.9935,0.2163,0.5574,0.7783,0.9977


### Create a dataframe of the average sentiment per topic per app
This will be used as input to the linear regression modeling.

In [175]:
# Mean compound sentiment for every (app, topic) pair, in long format.
final_df = df_merged.groupby(['appId', 'topic'], as_index=False)['compound'].mean()

In [176]:
# Reshape to one row per app and one column per topic.
# An app with no review on a topic gets 0 for that topic.
final_df = final_df.pivot_table(
    values='compound',
    index='appId',
    columns='topic',
    aggfunc='mean',
).fillna(value=0)

In [185]:
# Replace the integer topic indices with readable column names.
final_df = final_df.rename(
    columns={
        0: 'app_ads',
        1: 'compatibility',
        2: 'updates',
        3: 'features',
        4: 'bugs',
        5: 'payments',
        6: 'use_support',
    }
)

In [186]:
# Write the per-app topic sentiment table to disk (index = appId is written too).
final_df.to_csv(path_or_buf='04-data/sentiment_topics_apps.csv')