# Supervised Learning Use Case: Product Review Sentiment Classifier

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn 

#### Steps completed:
1. Read in data
2. Check for null/NA values (only `reviewerName` has any, and that column is dropped)
3. Split positive/negative categories

#### Next Steps:
1. Preprocess data (stem, lemmatize, remove stopwords)
2. Undersample to address class imbalance
3. Bag-of-words approach: compute TF-IDF vectors
4. Use supervised techniques to classify valence

## Read in Data

In [2]:
# Load the Amazon review data; lines=True reads the file as one JSON record per line.
reviews_path = 'reviews_Electronics_5.json'
df = pd.read_json(reviews_path, lines=True)

In [3]:
# Show the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,528881469,"[0, 0]",5,We got this GPS for my husband who is an (OTR)...,"06 2, 2013",AO94DHGC771SJ,amazdnu,Gotta have GPS!,1370131200
1,528881469,"[12, 15]",1,"I'm a professional OTR truck driver, and I bou...","11 25, 2010",AMO214LNFCEI4,Amazon Customer,Very Disappointed,1290643200
2,528881469,"[43, 45]",3,"Well, what can I say. I've had this unit in m...","09 9, 2010",A3N7T0DY83Y4IG,C. A. Freeman,1st impression,1283990400
3,528881469,"[9, 10]",2,"Not going to write a long review, even thought...","11 24, 2010",A1H8PY3QHMQQA0,"Dave M. Shaw ""mack dave""","Great grafics, POOR GPS",1290556800
4,528881469,"[0, 0]",1,I've had mine for a year and here's what we go...,"09 29, 2011",A24EV6RXELQZ63,Wayne Smith,"Major issues, only excuses for support",1317254400


In [4]:
# Number of reviews (rows) in the frame.
df.shape[0]

1689188

In [5]:
# Count missing values per column.
df.isna().sum()

asin                  0
helpful               0
overall               0
reviewText            0
reviewTime            0
reviewerID            0
reviewerName      24730
summary               0
unixReviewTime        0
dtype: int64

In [6]:
# Drop columns the rest of this analysis does not use. Reassigning instead of
# using inplace=True, and ignoring labels that are already gone, keeps this
# cell safe to re-run without a KeyError.
df = df.drop(columns=['asin', 'reviewerName', 'reviewTime'], errors='ignore')

In [7]:
# Confirm the three columns are gone.
df.head(n=5)

Unnamed: 0,helpful,overall,reviewText,reviewerID,summary,unixReviewTime
0,"[0, 0]",5,We got this GPS for my husband who is an (OTR)...,AO94DHGC771SJ,Gotta have GPS!,1370131200
1,"[12, 15]",1,"I'm a professional OTR truck driver, and I bou...",AMO214LNFCEI4,Very Disappointed,1290643200
2,"[43, 45]",3,"Well, what can I say. I've had this unit in m...",A3N7T0DY83Y4IG,1st impression,1283990400
3,"[9, 10]",2,"Not going to write a long review, even thought...",A1H8PY3QHMQQA0,"Great grafics, POOR GPS",1290556800
4,"[0, 0]",1,I've had mine for a year and here's what we go...,A24EV6RXELQZ63,"Major issues, only excuses for support",1317254400


In [8]:
# Number of distinct reviewers; dropna=False counts a missing ID as a value,
# exactly as len(unique()) would.
df['reviewerID'].nunique(dropna=False)

192403

In [20]:
# Discard neutral 3-star reviews so only the remaining ratings are labelled.
# .copy() makes the result an independent frame, so adding columns to it in
# later cells does not trigger pandas' SettingWithCopyWarning.
df = df.loc[df['overall'] != 3].copy()

In [21]:
# Binary sentiment label stored as a string: '1' for ratings of 4 or more, '0' otherwise.
is_positive = df['overall'] >= 4
df['valence'] = np.where(is_positive, '1', '0')
# NOTE(review): the previews below also show a `textLength` column that no visible
# cell creates (execution counts jump from 8 to 20) — presumably the character
# length of reviewText from a deleted cell; confirm and restore it for Run All.

In [22]:
# Preview the labelled frame.
df.head(n=5)

Unnamed: 0,helpful,overall,reviewText,reviewerID,summary,unixReviewTime,valence,textLength
0,"[0, 0]",5,We got this GPS for my husband who is an (OTR)...,AO94DHGC771SJ,Gotta have GPS!,1370131200,1,805
1,"[12, 15]",1,"I'm a professional OTR truck driver, and I bou...",AMO214LNFCEI4,Very Disappointed,1290643200,0,2175
3,"[9, 10]",2,"Not going to write a long review, even thought...",A1H8PY3QHMQQA0,"Great grafics, POOR GPS",1290556800,0,2246
4,"[0, 0]",1,I've had mine for a year and here's what we go...,A24EV6RXELQZ63,"Major issues, only excuses for support",1317254400,0,1076
5,"[3, 3]",5,I am using this with a Nook HD+. It works as d...,A2JXAZZI9PHK9Z,HDMI Nook adapter cable,1388707200,1,109


In [24]:
# Class balance check — the imbalance between the two labels still needs to be addressed.
df['valence'].value_counts()

1    1356067
0     190864
Name: valence, dtype: int64

In [25]:
# Look at a single example row.
df.iloc[:1]

Unnamed: 0,helpful,overall,reviewText,reviewerID,summary,unixReviewTime,valence,textLength
0,"[0, 0]",5,We got this GPS for my husband who is an (OTR)...,AO94DHGC771SJ,Gotta have GPS!,1370131200,1,805


In [47]:
# Features are the raw review text; the target is the binary valence label.
X, y = df['reviewText'], df['valence']

## Examine Data 

In [48]:
# Re-check for missing values before modelling.
df.isna().sum()

helpful           0
overall           0
reviewText        0
reviewerID        0
summary           0
unixReviewTime    0
valence           0
textLength        0
dtype: int64

## Model

In [49]:
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split


def train(classifier, X, y, test_size=0.25, random_state=33, stratify=None):
    """Fit ``classifier`` on a train split and print its score on the held-out split.

    Parameters
    ----------
    classifier : estimator or Pipeline exposing ``fit`` and ``score``.
    X : the samples (here, the review texts).
    y : the labels aligned with ``X``.
    test_size : fraction of the data held out for scoring (default 0.25).
    random_state : seed for the shuffle, so the split is reproducible (default 33).
    stratify : optional labels to stratify the split on; ``None`` (the default)
        keeps the plain shuffled split. Pass ``y`` to preserve class proportions.

    Returns
    -------
    The same ``classifier`` object, fitted on the training split.

    Notes
    -----
    The printed value is whatever ``classifier.score`` returns. With the class
    counts shown earlier in this notebook (1,356,067 positive vs 190,864
    negative), always predicting the positive class already scores about 0.877,
    so read the printed accuracy against that baseline.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=stratify
    )

    classifier.fit(X_train, y_train)
    print("Accuracy: %s" % classifier.score(X_test, y_test))
    return classifier

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline: default TF-IDF features feeding a default multinomial naive Bayes.
trial1_steps = [
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB()),
]
trial1 = Pipeline(steps=trial1_steps)

# Fits on the train split, prints the held-out accuracy, and displays the fitted pipeline.
train(trial1, X, y)

Accuracy: 0.876421200156


Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr...      vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [51]:
from nltk.corpus import stopwords

# Trial 2: same as the baseline, but NLTK's English stop words are removed
# by the vectorizer.
t2_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
t2 = Pipeline(steps=[
    ('vectorizer', t2_vectorizer),
    ('classifier', MultinomialNB()),
])

train(t2, X, y)

 

Accuracy: 0.877401204449


Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr...      vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [52]:
# Trial 3: stop-word removal plus a smaller naive Bayes smoothing term
# (alpha=0.05 instead of the default).
t3_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
t3_classifier = MultinomialNB(alpha=0.05)
t3 = Pipeline(steps=[
    ('vectorizer', t3_vectorizer),
    ('classifier', t3_classifier),
])

train(t3, X, y)

 

Accuracy: 0.89665996954


Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr...     vocabulary=None)), ('classifier', MultinomialNB(alpha=0.05, class_prior=None, fit_prior=True))])

In [53]:
# Trial 4: as trial 3, and the vectorizer additionally ignores terms that
# appear in fewer than 5 documents (min_df=5).
t4_vectorizer = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    min_df=5,
)
t4_classifier = MultinomialNB(alpha=0.05)
t4 = Pipeline(steps=[
    ('vectorizer', t4_vectorizer),
    ('classifier', t4_classifier),
])

train(t4, X, y)

Accuracy: 0.898485518433


Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr...     vocabulary=None)), ('classifier', MultinomialNB(alpha=0.05, class_prior=None, fit_prior=True))])