In [1]:
# Load libraries needed
import sqlite3
import amazonFn as af # user defined
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

#import matplotlib.pyplot as plt

In [2]:
# load data
conn = sqlite3.connect('amazon_reviews.db')
df = pd.read_sql_query(
    """
    SELECT *
    FROM amazon_reviews
    """, conn)
df.head(3)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,21269168,RSH1OZ87OYK92,B013PURRZW,603406193,Madden NFL 16 - Xbox One Digital Code,Digital_Video_Games,2,2,3,N,N,A slight improvement from last year.,I keep buying madden every year hoping they ge...,2015-08-31
1,US,133437,R1WFOQ3N9BO65I,B00F4CEHNK,341969535,Xbox Live Gift Card,Digital_Video_Games,5,0,0,N,Y,Five Stars,Awesome,2015-08-31
2,US,45765011,R3YOOS71KM5M9,B00DNHLFQA,951665344,Command & Conquer The Ultimate Collection [Ins...,Digital_Video_Games,5,0,0,N,Y,Hail to the great Yuri!,If you are prepping for the end of the world t...,2015-08-31


In [3]:
# Preprocessing
# Drop rows containing NA
df_na = df[['star_rating', 'review_headline']].dropna()
df_na = df_na.reset_index()
df_na = df_na.drop('index', 1)

In [4]:
# Clean up textual data using user defined function 
for i in range(len(df_na['review_headline'])):
    df_na['review_headline'][i] = af.removeTags(df_na['review_headline'][i])
    df_na['review_headline'][i] = af.removeAccents(df_na['review_headline'][i])
    df_na['review_headline'][i] = af.appendContractions(df_na['review_headline'][i])
    df_na['review_headline'][i] = af.lemmatizeWords(df_na['review_headline'][i])
#    df_na['review_headline'][i] = af.removeStopwords(df_na['review_headline'][i]) excluded since it lowers score
    df_na['review_headline'][i] = af.removeWhitespaces(df_na['review_headline'][i])    
    df_na['review_headline'][i] = df_na['review_headline'][i].lower()

In [6]:
# Transform ratings ranging from 1 to 5 into two factors 
rating = df_na['star_rating']
rating = rating.map(lambda x: "pos" if x > 3 else "neg")
review = df_na['review_headline']

# Split the data set with fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(review, rating, test_size=0.2, random_state=42) 

In [7]:
# Convert data into tf-idf
cv = CountVectorizer()

X_trainCV = cv.fit_transform(X_train)
tfidfTransformer = TfidfTransformer()
X_trainTfidf = tfidfTransformer.fit_transform(X_trainCV)

X_testCV = cv.transform(X_test)
X_testTfidf = tfidfTransformer.transform(X_testCV)

In [8]:
# Fit the classifier model to training data and predict
pred = {}

logisticRegression = linear_model.LogisticRegression()
logisticRegression.fit(X_trainTfidf, Y_train)
pred['logistic']=logisticRegression.predict(X_testTfidf)

randomForest = RandomForestClassifier(n_estimators=100)
randomForest.fit(X_trainTfidf, Y_train)
pred['randomForest']= randomForest.predict(X_testTfidf)

svm = SGDClassifier(loss='modified_huber')
svm.fit(X_trainTfidf, Y_train)
pred['svm'] = svm.predict(X_testTfidf)

In [9]:
# Show the result utilizing AUC-ROC
rocAucScore = {}

factor = lambda x: 1 if x == 'pos' else 0
apply = np.vectorize(factor)

rocAucScore['logistic'] = metrics.roc_auc_score(Y_test.map(factor), apply(pred['logistic']))
rocAucScore['randomForest'] = metrics.roc_auc_score(Y_test.map(factor), apply(pred['randomForest']))
rocAucScore['svm'] = metrics.roc_auc_score(Y_test.map(factor), apply(pred['svm']))

result = pd.Series(rocAucScore, name='AUC-ROC')
result.index.name = 'Method'
result.reset_index()

Unnamed: 0,Method,AUC-ROC
0,logistic,0.82673
1,randomForest,0.84037
2,svm,0.829839
