In [1]:
import pandas as pd
import numpy as np

import re
import os
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
reviews = pd.read_csv('googleplaystore_user_reviews.csv')
reviews = reviews.dropna()
reviews.head(10)

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3
5,10 Best Foods for You,Best way,Positive,1.0,0.3
6,10 Best Foods for You,Amazing,Positive,0.6,0.9
8,10 Best Foods for You,"Looking forward app,",Neutral,0.0,0.0
9,10 Best Foods for You,It helpful site ! It help foods get !,Neutral,0.0,0.0
10,10 Best Foods for You,good you.,Positive,0.7,0.6
11,10 Best Foods for You,Useful information The amount spelling errors ...,Positive,0.2,0.1


In [6]:
# clean text data
def clean_text(sentence):
    sent = sentence.lower()  # lowercase
    sent = re.sub(r'[^\w\s]',' ',sent) # remove punctuation
    sent = sent.replace(os.linesep,"")  # remove line break
    sent = re.sub(r'\d+','',sent)  # remove digits
    sent = ' '.join([tok for tok in sent.split() if tok not in STOP_WORDS]) # remove stop words
    return sent

In [3]:
# encode sentiment into numeric values
conditions = [
    (reviews['Sentiment'] == 'Positive'),
    (reviews['Sentiment'] == 'Neutral'),
    (reviews['Sentiment'] == 'Negative')]

choices = [1, 0, -1]
reviews['Sentiment_encode'] = np.select(conditions, choices, default= None)

In [11]:
reviews['reviews'] = reviews.Translated_Review.apply(clean_text) 
reviews.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Sentiment_encode,reviews
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333,1,like eat delicious food s m cooking food case ...
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462,1,help eating healthy exercise regular basis
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875,1,works great especially going grocery store
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3,1,best idea
5,10 Best Foods for You,Best way,Positive,1.0,0.3,1,best way


In [12]:
# take a look at the sentiment distribution
reviews.head(10)
reviews.Sentiment_encode.value_counts() 

 1    23998
-1     8271
 0     5158
Name: Sentiment_encode, dtype: int64

In [5]:
# split data into training and test data
# train, test = train_test_split(reviews,test_size = 0.3,random_state = 0) 
# print(train.shape)
# print(test.shape)

(26198, 6)
(11229, 6)


In [8]:
# train['reviews'] = train.Translated_Review.apply(clean_text) 
# train.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Sentiment_encode,reviews
8671,Apex Launcher,Been using paid version years now. Originally ...,Positive,0.34,0.68,1,paid version years originally brilliant update...
29070,ConvertPad - Unit Converter,"I love app, using ages, however latest ver 3.1...",Positive,0.425,0.55,1,love app ages latest ver scaling issues samsun...
58115,H&M,I hate 2 weeks waiting items I find NOT gettin...,Negative,-0.9,0.95,-1,hate weeks waiting items find getting items pu...
12111,Bagan - Myanmar Keyboard,The best,Positive,1.0,0.3,1,best
2609,AC - Tips & News for Android™,New TOS data collection.. I'm out!!! (Uninstal...,Positive,0.266335,0.454545,1,new tos data collection m uninstalled


In [26]:
# save data to csv file
save_df = reviews[['Sentiment_encode','reviews']]
save_df = save_df.rename(columns={'Sentiment_encode':'sentiment'})
save_df = save_df.dropna()
save_df.to_csv('clean_reviews.csv',encoding = 'utf-8',index = False)

In [27]:
# read csv file again
df = pd.read_csv('clean_reviews.csv')
df.head()

Unnamed: 0,sentiment,reviews
0,1,like eat delicious food s m cooking food case ...
1,1,help eating healthy exercise regular basis
2,1,works great especially going grocery store
3,1,best idea
4,1,best way


In [32]:
# modification: try countVectorize the whole data set
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['reviews'])
print(X.shape)

y = df['sentiment']
print(y.shape)

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state =0)

NameError: name 'y' is not defined

In [70]:
# Naive Bayes
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

text_clf_NB = Pipeline([('vect', CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())])

text_clf_NB.fit(train.reviews.values, train.Sentiment_encode.astype('int'))  # 注意这里要specify type为 "int"
predicted_NB = text_clf_NB.predict(test.reviews.values)
predicted_NB

array([ 1,  1, -1, ...,  1,  1,  1])

In [84]:
print(train.reviews.values.shape)
train.Sentiment_encode.astype('int').shape
train.reviews.values[0]

(26198,)


'paid version years originally brilliant update introduced bugs tangible improvements m point default icons displaying monochrome folder blank page change system launcher access home page badge notifications work years ago iffy depressing brilliant launcher'

In [85]:
print(X_train.toarray().shape)
print(y_train.astype('int').shape)
print(X_test.toarray().shape)   # 因为train跟test是分开去countVectorize的，所以col数不一样 (word feature 数量不一致)

(26198, 17331)
(26198,)
(11229, 11890)


In [76]:
# 另一种方法 Naive Bayes
# Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
clf = MultinomialNB()
clf.fit(X_train.toarray(),y_train.astype('int'))
y_pred = clf.predict(X_test.toarray())
y_pred[:10]

ValueError: shapes (11229,11890) and (17331,3) not aligned: 11890 (dim 1) != 17331 (dim 0)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred, y_test)

# clf.score(X_test.toarray(), y_test)