In [104]:
import pandas as pd
from pandas import option_context
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer 

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

from collections import Counter

from textblob import TextBlob

In [105]:
# Load the preprocessed app table (path is relative to the notebook's working
# directory) and print its column / dtype / non-null summary.
df = pd.read_csv('04-data/preprocessed_app_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40475 entries, 0 to 40474
Data columns (total 56 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   title                     40475 non-null  object 
 1   description               40475 non-null  object 
 2   summary                   40463 non-null  object 
 3   installs                  40475 non-null  object 
 4   minInstalls               40475 non-null  float64
 5   score                     40475 non-null  float64
 6   ratings                   40475 non-null  float64
 7   reviews                   40475 non-null  float64
 8   histogram                 40475 non-null  object 
 9   price                     40475 non-null  float64
 10  free                      40475 non-null  int64  
 11  currency                  40475 non-null  object 
 12  sale                      40475 non-null  bool   
 13  offersIAP                 40475 non-null  bool   
 14  inAppP

In [106]:
# Number of distinct app titles; equal to the row count when no title repeats.
df.title.nunique()

40475

In [107]:
# Show rows whose title repeats an earlier row's title (second and later
# occurrences only). `duplicated()` already returns a boolean mask, so it is
# used directly instead of being compared with True.
df[df['title'].duplicated()]

Unnamed: 0,title,description,summary,installs,minInstalls,score,ratings,reviews,histogram,price,...,star_2,star_3,star_4,star_5,top_developer,current_date,days,installs_day,updated_days,has_video


In [108]:
# Working copy with only the columns this analysis touches, on a fresh
# 0..n-1 index. Column selection already returns a new frame, so the index
# reset is chained instead of mutating in place.
analysis_columns = ['title', 'score', 'comments']
df_analysis = df[analysis_columns].reset_index(drop=True)

In [109]:
# Turn each app's comma-joined comment string into one row per fragment.
# `explode` produces the same long (title, comments) frame as building a wide
# rows-by-fragments table and stacking it, but without the wide intermediate
# and without relying on `reset_index([0, 'title'])` silently discarding the
# positional index level. Rows with a missing comment are dropped rather than
# crashing the split.
# NOTE(review): the raw `comments` values look like stringified Python lists
# (fragments start with `["` or `'`), so splitting on "," also cuts a single
# review at every comma -- confirm whether per-review parsing was intended.
df_analysis = (
    df_analysis[['title', 'comments']]
    .assign(comments=lambda frame: frame['comments'].str.split(","))
    .explode('comments')
    .dropna(subset=['comments'])
    .reset_index(drop=True)
)

In [110]:
# Preview the long (title, comments) frame: one row per comment fragment.
df_analysis

Unnamed: 0,title,comments
0,World War 2: Offline Strategy,"[""I just started playing"
1,World War 2: Offline Strategy,I'm on the 3rd level and it's already so hard...
2,World War 2: Offline Strategy,games are fun when they are easy to play but ...
3,World War 2: Offline Strategy,plus there are way too many ads on this game....
4,World War 2: Offline Strategy,"""Fun game to play with decent graphics"
...,...,...
2683186,Tarot Card Reading,
2683187,Tarot Card Reading,we all are destined to be here
2683188,Tarot Card Reading,there and maybe for the unsure anywhere for a...
2683189,Tarot Card Reading,'This app is very awesome and amazing and eve...


In [111]:
def clean_text(text):
    '''Normalise one comment string.

    Steps, in order: lowercase; drop @mentions and http(s) links; turn digits
    and newlines into spaces; delete ASCII punctuation; collapse runs of two
    or more whitespace characters into a single space.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned text. Leading and trailing whitespace is not stripped.
    '''
    text = text.lower()
    # Mentions and links are removed first, while "@", ":" and "/" are still present.
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)
    text = re.sub('[0-9\n]', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Raw string: "\s" in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    text = re.sub(r"\s\s+", " ", text)

    return text

# Overwrite every comment fragment with its cleaned form.
df_analysis['comments'] = df_analysis['comments'].map(clean_text)

In [112]:
# Score every cleaned comment fragment with VADER; each call returns one
# dict of scores, collected in row order.
sid_obj = SentimentIntensityAnalyzer()
sentiment = [sid_obj.polarity_scores(fragment) for fragment in df_analysis.comments]

In [113]:
# One row per scored fragment, one column per key of the score dicts.
sentiment_df = pd.DataFrame(data=sentiment)
sentiment_df

Unnamed: 0,neg,neu,pos,compound
0,0.000,0.625,0.375,0.2023
1,0.206,0.794,0.000,-0.4409
2,0.071,0.647,0.281,0.8555
3,0.119,0.766,0.115,-0.0258
4,0.000,0.467,0.533,0.6908
...,...,...,...,...
2683186,0.000,0.000,0.000,0.0000
2683187,0.000,1.000,0.000,0.0000
2683188,0.031,0.933,0.036,0.0644
2683189,0.000,0.510,0.490,0.9109


In [114]:
# Place the scores beside the fragments they were computed from. `concat`
# aligns on index labels, so this relies on both frames sharing the same index.
df_merged = pd.concat(objs=[df_analysis, sentiment_df], axis="columns")

In [115]:
# (rows, columns) of the merged fragment + score frame.
df_merged.shape

(2683191, 6)

In [126]:
# Mean compound score per app title, kept as a one-column frame indexed by title.
mean_compound_by_title = df_merged.groupby('title')['compound'].mean()
new = mean_compound_by_title.to_frame()

In [128]:
# Preview the per-title mean compound score.
new

Unnamed: 0_level_0,compound
title,Unnamed: 1_level_1
"""Memory Game for kids"" - Memory Game",0.428942
"#1 Vocab App: Hindu Editorial, Grammar, Dictionary",0.475844
#DRIVE,0.351430
#SelfCare,0.416312
(FPL) Fantasy Football Manager for Premier League,0.269035
...,...
zumba deluxe 2020,0.337665
✨Impossible Draw👆: Color helix puzzle maze,0.236921
蝦皮購物 | 花得更少買得更好,0.350481
👻My Town : Haunted House - Scary Game for Kids 👻,0.410969


In [127]:
# Narrow view: just the title and the compound score of each fragment.
test = df_merged.loc[:, ['title', 'compound']]

In [124]:
# Preview the (title, compound) frame.
test

Unnamed: 0,title,compound
0,World War 2: Offline Strategy,0.2023
1,World War 2: Offline Strategy,-0.4409
2,World War 2: Offline Strategy,0.8555
3,World War 2: Offline Strategy,-0.0258
4,World War 2: Offline Strategy,0.6908
...,...,...
2683186,Tarot Card Reading,0.0000
2683187,Tarot Card Reading,0.0000
2683188,Tarot Card Reading,0.0644
2683189,Tarot Card Reading,0.9109


In [119]:
# Summary statistics of the compound score across all fragments.
test['compound'].describe()

count    2.683191e+06
mean     2.456523e-01
std      4.725654e-01
min     -1.000000e+00
25%      0.000000e+00
50%      2.737000e-01
75%      6.369000e-01
max      1.000000e+00
Name: compound, dtype: float64

In [120]:
# Preview fragments alongside their neg / neu / pos / compound scores.
df_merged

Unnamed: 0,title,comments,neg,neu,pos,compound
0,World War 2: Offline Strategy,i just started playing,0.000,0.625,0.375,0.2023
1,World War 2: Offline Strategy,im on the rd level and its already so hard th...,0.206,0.794,0.000,-0.4409
2,World War 2: Offline Strategy,games are fun when they are easy to play but ...,0.071,0.647,0.281,0.8555
3,World War 2: Offline Strategy,plus there are way too many ads on this game ...,0.119,0.766,0.115,-0.0258
4,World War 2: Offline Strategy,fun game to play with decent graphics,0.000,0.467,0.533,0.6908
...,...,...,...,...,...,...
2683186,Tarot Card Reading,,0.000,0.000,0.000,0.0000
2683187,Tarot Card Reading,we all are destined to be here,0.000,1.000,0.000,0.0000
2683188,Tarot Card Reading,there and maybe for the unsure anywhere for a...,0.031,0.933,0.036,0.0644
2683189,Tarot Card Reading,this app is very awesome and amazing and ever...,0.000,0.510,0.490,0.9109


In [129]:
# Write the per-title mean compound score to disk (relative path, so it lands
# in the notebook's working directory); the title index becomes the first column.
new.to_csv('merged_sentiment.csv')

In [122]:
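# TODO(review): leftover scratch note -- app title "蝦皮購物 | 花得更少買得更好"; remove this cell or explain why the title was singled out.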