In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('Amazon.csv')
print(data.shape) #print the (rows,cols) of matrix
data.head(5) #print 1st 5 rows

(455000, 13)


Unnamed: 0.1,Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,helpScore,helpful
0,138806,138807,B000E63LME,A1CQGW1AOD0LF2,"Alena K. ""Alena""",1,2,2,1294185600,Not as pictured.,I was looking forward to try cranberry apple f...,0.5,False
1,469680,469681,B004ZIH4KM,A37S7U1OX2MCWI,Becky Cole,0,0,5,1349740800,seeds,"TY for everything. The seeds arrived quickly,...",,False
2,238202,238203,B003ZXE9QA,A2OM6G73E64EQ9,jeff,0,0,5,1329264000,I'm addicted!,I've finally found the best cereal in the worl...,,False
3,485307,485308,B001RVFERK,A25W349EE97NBK,Tangent4,1,1,4,1248307200,I wanted to love these...,I originally bought these chips because I'd he...,1.0,False
4,375283,375284,B000OQZNTS,A3CPPW0HUC07YS,Amy Nicolai,0,0,5,1333238400,Excellent chamomile tea,"Really excellent tea, flowers are visible in t...",,False


In [3]:
# features from Amazon.csv to add to feature set

In [4]:
# add review length col to dataset (pandas)
data['ReviewLen'] = data['Text'].str.len()

# fix summary NaNs
data.Summary = data.Summary.fillna(' ') # put a space instead

# summary length
data['SummaryLen'] = data.Summary.str.len()

In [5]:
# NOTE: REMOVED THIS STEP
# # add cols for one and five star booleans
# data['FiveStars'] = 0 # set all rows to 0
# data['OneStar'] = 0

# # ix == if then
# data.ix[data['Score'] == 1, 'OneStar'] = 1 # if score is 1, set to 1
# data.ix[data['Score'] == 5, 'FiveStars'] = 1 # if score is 5, set to 1

In [6]:
# # maybe plot stars vs helpfulness to see if correlation - can't figure it out

#data['HelpfulInt'] = data.helpful.astype(int)
#data.head(10)

#plt.plot(data['Score'],data['HelpfulBool'])
#plt.xlabel('Score')
#plt.ylabel('HelpfulBool')
#plt.show()

In [7]:
# convert unix timestamp to datetime
data['DateTime'] = pd.to_datetime(data['Time'], unit='s')
# all these Times are midnight of a day

# find day of week (0-6, Mon-Sun)
data['DayOfWeek'] = data['DateTime'].dt.weekday
data['Month'] = data['DateTime'].dt.month
data['Year'] = data['DateTime'].dt.year



In [8]:
# n reviews per product ID
data['NReviewsProduct'] = data.groupby(['ProductId'])['Id'].transform('count')

# n reviews per user ID
data['NReviewsUser'] = data.groupby(['UserId'])['UserId'].transform('count')

# avg score of product
data['AvgScore'] = data.groupby(['ProductId'])['Score'].transform('mean')
#'lambda x: x.mean()'

# # order of this review on item (first = 0, second = 1, etc.)

data['ReviewOrder'] = data.sort_values(['Time']).groupby(['ProductId']).cumcount() # cumcount() returns order # within group
data[data.ProductId == 'B003ZXE9QA'].sort_values(['Time']).head(20) # check if worked on one productId

Unnamed: 0.1,Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,...,ReviewLen,SummaryLen,DateTime,DayOfWeek,Month,Year,NReviewsProduct,NReviewsUser,AvgScore,ReviewOrder
247216,238193,238194,B003ZXE9QA,A1CMDD3M56MJ2K,aphex2win,6,7,5,1289865600,What fiber cheerios would taste like (if there...,...,1268,65,2010-11-16,1,11,2010,20,1,4.6,0
379827,238211,238212,B003ZXE9QA,A1VCJWLAJAFRGP,"K. Rose ""any-weather friend ;)""",0,0,5,1308268800,Yum!!,...,246,5,2011-06-17,4,6,2011,20,1,4.6,1
191183,238196,238197,B003ZXE9QA,AC7AY7GU4GKID,Mom of the Year,1,1,5,1309132800,My kids like them,...,207,17,2011-06-27,0,6,2011,20,5,4.6,2
243688,238210,238211,B003ZXE9QA,ADJMETBPYJL7Q,"D. Campbell ""Doll Parts""",0,0,5,1310083200,Love This Stuff.,...,553,16,2011-07-08,4,7,2011,20,9,4.6,3
108289,238192,238193,B003ZXE9QA,A25UZ7MA72SMKM,Brent Butler,6,6,5,1310342400,"Good taste, good nutrition, low calories (even...",...,1698,64,2011-07-11,0,7,2011,20,22,4.6,4
311949,238214,238215,B003ZXE9QA,A1BW2GDOT61RIT,Cheapo,1,5,2,1310342400,Tastes bland,...,1827,12,2011-07-11,0,7,2011,20,1,4.6,5
161311,238209,238210,B003ZXE9QA,A1KPPFZ4Z59HM5,Ricky Nigro,0,0,5,1314835200,"Great tasting, great source of fiber!",...,298,37,2011-09-01,3,9,2011,20,2,4.6,6
46038,238208,238209,B003ZXE9QA,A39EH0SU5305W6,Karen A. Daniels,0,0,5,1315008000,Love this cereal!!,...,98,18,2011-09-03,5,9,2011,20,4,4.6,7
367610,238207,238208,B003ZXE9QA,A30PXP11RAT9NZ,NeverTooOld,0,0,5,1323993600,Fantastic,...,197,9,2011-12-16,4,12,2011,20,1,4.6,8
174554,238206,238207,B003ZXE9QA,A1H8E342ICEQ6W,Amazon Regular,0,0,5,1325116800,Very tasty and ton of fiber.,...,380,28,2011-12-29,3,12,2011,20,7,4.6,9


In [26]:
# time between this review and oldest review
# col for oldest review
data['FirstReviewTime'] = data.groupby(['ProductId'])['Time'].cummin()

Unnamed: 0.1,Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,...,SummaryLen,DateTime,DayOfWeek,Month,Year,NReviewsProduct,NReviewsUser,AvgScore,ReviewOrder,FirstReviewTime
0,138806,138807,B000E63LME,A1CQGW1AOD0LF2,"Alena K. ""Alena""",1,2,2,1294185600,Not as pictured.,...,16,2011-01-05,2,1,2011,34,1,4.147059,14,1294185600
1,469680,469681,B004ZIH4KM,A37S7U1OX2MCWI,Becky Cole,0,0,5,1349740800,seeds,...,5,2012-10-09,1,10,2012,2,1,4.5,1,1349740800
2,238202,238203,B003ZXE9QA,A2OM6G73E64EQ9,jeff,0,0,5,1329264000,I'm addicted!,...,13,2012-02-15,2,2,2012,20,1,4.6,13,1329264000
3,485307,485308,B001RVFERK,A25W349EE97NBK,Tangent4,1,1,4,1248307200,I wanted to love these...,...,25,2009-07-23,3,7,2009,461,8,4.32321,45,1248307200
4,375283,375284,B000OQZNTS,A3CPPW0HUC07YS,Amy Nicolai,0,0,5,1333238400,Excellent chamomile tea,...,23,2012-04-01,6,4,2012,7,5,4.571429,5,1333238400


In [27]:
data[data.ProductId == 'B003ZXE9QA'].sort_values(['Time']).head(20) # check if worked on one productId
# not working???

Unnamed: 0.1,Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,...,SummaryLen,DateTime,DayOfWeek,Month,Year,NReviewsProduct,NReviewsUser,AvgScore,ReviewOrder,FirstReviewTime
247216,238193,238194,B003ZXE9QA,A1CMDD3M56MJ2K,aphex2win,6,7,5,1289865600,What fiber cheerios would taste like (if there...,...,65,2010-11-16,1,11,2010,20,1,4.6,0,1289865600
379827,238211,238212,B003ZXE9QA,A1VCJWLAJAFRGP,"K. Rose ""any-weather friend ;)""",0,0,5,1308268800,Yum!!,...,5,2011-06-17,4,6,2011,20,1,4.6,1,1289865600
191183,238196,238197,B003ZXE9QA,AC7AY7GU4GKID,Mom of the Year,1,1,5,1309132800,My kids like them,...,17,2011-06-27,0,6,2011,20,5,4.6,2,1309132800
243688,238210,238211,B003ZXE9QA,ADJMETBPYJL7Q,"D. Campbell ""Doll Parts""",0,0,5,1310083200,Love This Stuff.,...,16,2011-07-08,4,7,2011,20,9,4.6,3,1309132800
108289,238192,238193,B003ZXE9QA,A25UZ7MA72SMKM,Brent Butler,6,6,5,1310342400,"Good taste, good nutrition, low calories (even...",...,64,2011-07-11,0,7,2011,20,22,4.6,4,1310342400
311949,238214,238215,B003ZXE9QA,A1BW2GDOT61RIT,Cheapo,1,5,2,1310342400,Tastes bland,...,12,2011-07-11,0,7,2011,20,1,4.6,5,1289865600
161311,238209,238210,B003ZXE9QA,A1KPPFZ4Z59HM5,Ricky Nigro,0,0,5,1314835200,"Great tasting, great source of fiber!",...,37,2011-09-01,3,9,2011,20,2,4.6,6,1310342400
46038,238208,238209,B003ZXE9QA,A39EH0SU5305W6,Karen A. Daniels,0,0,5,1315008000,Love this cereal!!,...,18,2011-09-03,5,9,2011,20,4,4.6,7,1315008000
367610,238207,238208,B003ZXE9QA,A30PXP11RAT9NZ,NeverTooOld,0,0,5,1323993600,Fantastic,...,9,2011-12-16,4,12,2011,20,1,4.6,8,1289865600
174554,238206,238207,B003ZXE9QA,A1H8E342ICEQ6W,Amazon Regular,0,0,5,1325116800,Very tasty and ton of fiber.,...,28,2011-12-29,3,12,2011,20,7,4.6,9,1310342400


In [29]:
# count punctuation

data['countPeriods'] = data['Text'].str.count('.')
data['countBangs'] = data['Text'].str.count('!')
data['countQuestions'] = data['Text'].str.count('\?')
data['countCommas'] = data['Text'].str.count(',')
data['countSpaces'] = data['Text'].str.count(' ')

In [30]:
# convert score + reviewLen cols to vectors
# [:, 7] denotes all rows in col 7

XScore = data.Score.values.reshape(data.shape[0], 1)
XTime = data.Time.values.reshape(data.shape[0], 1)
XReviewLen = data.ReviewLen.values.reshape(data.shape[0], 1)
XSummaryLen = data.SummaryLen.values.reshape(data.shape[0], 1)
XDayOfWeek = data.DayOfWeek.values.reshape(data.shape[0], 1)
XMonth = data.Month.values.reshape(data.shape[0], 1)
XYear = data.Year.values.reshape(data.shape[0], 1)
XNReviewsProduct = data.NReviewsProduct.values.reshape(data.shape[0], 1)
XNReviewsUser = data.NReviewsUser.values.reshape(data.shape[0], 1)
XAvgScore = data.AvgScore.values.reshape(data.shape[0], 1)
XReviewOrder = data.ReviewOrder.values.reshape(data.shape[0], 1)
XCountPeriods = data.countPeriods.values.reshape(data.shape[0], 1)
XCountBangs = data.countBangs.values.reshape(data.shape[0], 1)
XCountQuestions = data.countQuestions.values.reshape(data.shape[0], 1)
XCountCommas = data.countCommas.values.reshape(data.shape[0], 1)
XCountSpaces = data.countSpaces.values.reshape(data.shape[0], 1)

# concatenate to numpy dataset
XToAdd = np.concatenate((XScore, XTime, XReviewLen, XDayOfWeek, XMonth, XYear, XNReviewsProduct, XNReviewsUser, XAvgScore, XReviewOrder, XCountPeriods, XCountBangs, XCountQuestions, XCountCommas, XCountSpaces), axis=1)

XToAdd.shape

(455000, 15)

In [31]:
# vectorize Bag of Words from review text; as sparse matrix
from sklearn.feature_extraction.text import HashingVectorizer
# Text
hvText = HashingVectorizer(stop_words='english', n_features=2 ** 17, non_negative=True)
XText = hvText.transform(data.Text)
# Summary
hvSummary = HashingVectorizer(stop_words='english', n_features=2 ** 17, non_negative=True)
XSummary = hvSummary.transform(data.Summary)

In [32]:
# Tf-idf transformer with pipeline from hash vectorizer?
#### example ####
# hashing = HashingVectorizer(non_negative=True, norm=None)
# tfidf = TfidfTransformer()
# hashing_tfidf = Pipeline([("hashing", hashing), ("tidf", tfidf)])

In [33]:
# convert additional features to sparse matrix and concatenate onto the bag of words sparse matrix
from scipy.sparse import csr_matrix, hstack
XToAddSparse = csr_matrix(XToAdd)
XFinal = hstack([XText, XToAddSparse])
X = csr_matrix(XFinal)

In [34]:
# size of feature set
print(X.shape)

(455000, 131087)


In [35]:
# define y
y = data.helpful.values
y.shape

(455000,)

In [36]:
# create training and test sets
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.3, random_state=0)

In [37]:
# feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [38]:
# report on training and test sets
def print_results():
    print('Error rate on training set: ')
    print((y_train != y_pred).sum() / X_train.shape[0])
    print('Accuracy rate on training set: ')
    print(1 - (y_train != y_pred).sum() / X_train.shape[0])
    print('True positive rate on training set:')
    print(((y_train==True) & (y_pred==True)).sum() / y_train.sum())
    print('**************')
    print('Error rate on test set: ')
    print((y_test != y_pred_test).sum() / X_test.shape[0])
    print('Accuracy rate on test set: ')
    print(1 - (y_test != y_pred_test).sum() / X_test.shape[0])
    print('True positive rate on test set')
    print(((y_test==True) & (y_pred_test==True)).sum() / y_test.sum())
    print('True negative rate on test set')
    print(((y_test==False) & (y_pred_test==False)).sum() / (y_test.shape[0] - y_test.sum()))

In [39]:
# MODEL: SVM, linear
from sklearn import linear_model
clf = linear_model.SGDClassifier()
clf.fit(X_train_std, y_train)
y_pred = clf.fit(X_train_std, y_train).predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

Error rate on training set: 
0.067601255887
Accuracy rate on training set: 
0.932398744113
True positive rate on training set:
0.517392801791
**************
Error rate on test set: 
0.0971941391941
Accuracy rate on test set: 
0.902805860806
True positive rate on test set
0.322374337963
True negative rate on test set
0.948724435344


In [40]:
# MODEL: logistic regression
from sklearn import linear_model
clf = linear_model.SGDClassifier(loss='log', n_iter=50, alpha=0.00001)
clf.fit(X_train_std, y_train)
y_pred = clf.fit(X_train_std, y_train).predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

Error rate on training set: 
0.0455604395604
Accuracy rate on training set: 
0.95443956044
True positive rate on training set:
0.397881866713
**************
Error rate on test set: 
0.0734212454212
Accuracy rate on test set: 
0.926578754579
True positive rate on test set
0.177575697012
True negative rate on test set
0.985833208162


In [41]:
# MODEL: Naive Bayes
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_std, y_train)
y_pred = clf.fit(X_train_std, y_train).predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

Error rate on training set: 
0.108040816327
Accuracy rate on training set: 
0.891959183673
True positive rate on training set:
0.581754778715
**************
Error rate on test set: 
0.148593406593
Accuracy rate on test set: 
0.851406593407
True positive rate on test set
0.323973218747
True negative rate on test set
0.8931324263


In [42]:
# Perceptron
from sklearn import linear_model
clf = linear_model.SGDClassifier(loss='perceptron')
clf.fit(X_train_std, y_train)
y_pred = clf.fit(X_train_std, y_train).predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

Error rate on training set: 
0.0679591836735
Accuracy rate on training set: 
0.932040816327
True positive rate on training set:
0.0683657654555
**************
Error rate on test set: 
0.0732820512821
Accuracy rate on test set: 
0.926717948718
True positive rate on test set
0.0271809733187
True negative rate on test set
0.997881305685
