# Sentiment Analysis on Google Play Store Apps

<h2>Import Data</h2>

In [21]:
import pandas as pd
import numpy as np

import re
import os
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [22]:
# Load the user-review export and keep only the rows that have no missing values.
reviews = pd.read_csv('googleplaystore_user_reviews.csv').dropna()
reviews.head(10)

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3
5,10 Best Foods for You,Best way,Positive,1.0,0.3
6,10 Best Foods for You,Amazing,Positive,0.6,0.9
8,10 Best Foods for You,"Looking forward app,",Neutral,0.0,0.0
9,10 Best Foods for You,It helpful site ! It help foods get !,Neutral,0.0,0.0
10,10 Best Foods for You,good you.,Positive,0.7,0.6
11,10 Best Foods for You,Useful information The amount spelling errors ...,Positive,0.2,0.1


<h2>Data cleaning &amp; wrangling</h2>

In [23]:
# encode sentiment into numeric values
# A dict lookup through Series.map produces an integer column when every label
# is present in the mapping (labels missing from it become NaN), so later
# steps do not have to cope with an object-dtype label column.
sentiment_codes = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
reviews['Sentiment_encode'] = reviews['Sentiment'].map(sentiment_codes)

In [24]:
# Count how many reviews fall into each encoded sentiment class.
reviews['Sentiment_encode'].value_counts()

 1    23998
-1     8271
 0     5158
Name: Sentiment_encode, dtype: int64

In [26]:
# clean text data
def clean_text(sentence, stop_words=None):
    """Normalize a review string for bag-of-words modeling.

    The text is lowercased, punctuation is replaced by spaces, digits are
    removed, and stop words are dropped. The remaining tokens are joined
    with single spaces.

    Args:
        sentence: Raw review text (a ``str``).
        stop_words: Collection of lowercase tokens to drop. When ``None``
            the ``STOP_WORDS`` set imported at the top of the file is used.

    Returns:
        The cleaned, space-separated string; empty if nothing survives.
    """
    if stop_words is None:
        stop_words = STOP_WORDS
    sent = sentence.lower()  # lowercase
    sent = re.sub(r'[^\w\s]', ' ', sent)  # punctuation -> space so neighbours stay separate
    sent = re.sub(r'\d+', '', sent)  # remove digits
    # split() treats every whitespace run (including line breaks) as a
    # separator, so line breaks need no special handling; deleting them
    # outright would glue the words on either side together.
    return ' '.join(tok for tok in sent.split() if tok not in stop_words)

In [32]:
# Store the cleaned version of every translated review in a new 'reviews' column.
cleaned_reviews = reviews['Translated_Review'].apply(clean_text)
reviews['reviews'] = cleaned_reviews
reviews.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Sentiment_encode,reviews
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333,1,like eat delicious food s m cooking food case ...
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462,1,help eating healthy exercise regular basis
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875,1,works great especially going grocery store
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3,1,best idea
5,10 Best Foods for You,Best way,Positive,1.0,0.3,1,best way


<h2>Split data into training and test sets</h2>

In [25]:
# Hold out 30% of the reviews for testing; random_state is fixed so the
# partition is the same on every run.
from sklearn.model_selection import train_test_split
train, test = train_test_split(reviews, random_state=0, test_size=0.3)
for subset in (train, test):
    print(subset.shape)

(26198, 6)
(11229, 6)


<h2>Naive Bayes classifier</h2>

In [30]:
# Naive Bayes
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Token counts -> TF-IDF weighting -> multinomial Naive Bayes.
nb_steps = [
    ('vect', CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf_NB = Pipeline(nb_steps)

# NOTE: the labels must be cast to int explicitly here, otherwise fit() raises
# an error (presumably because the label column is not a plain integer dtype).
train_labels = train.Sentiment_encode.astype('int')
text_clf_NB.fit(train.reviews.values, train_labels)
predicted_NB = text_clf_NB.predict(test.reviews.values)
predicted_NB

array([ 1,  1, -1, ...,  1,  1,  1])

In [34]:
from sklearn.metrics import accuracy_score
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions
# second. Accuracy itself is symmetric, but keeping the documented order avoids
# wrong results if this call is later swapped for an asymmetric metric.
accuracy_score(test.Sentiment_encode.astype('int'), predicted_NB)  # only about 0.69 accuracy

0.6873274556950753