In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('Amazon.csv')
print(data.shape) #print the (rows,cols) of matrix
data.head(5) #print 1st 5 rows

(455000, 13)


Unnamed: 0.1,Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,helpScore,helpful
0,138806,138807,B000E63LME,A1CQGW1AOD0LF2,"Alena K. ""Alena""",1,2,2,1294185600,Not as pictured.,I was looking forward to try cranberry apple f...,0.5,False
1,469680,469681,B004ZIH4KM,A37S7U1OX2MCWI,Becky Cole,0,0,5,1349740800,seeds,"TY for everything. The seeds arrived quickly,...",,False
2,238202,238203,B003ZXE9QA,A2OM6G73E64EQ9,jeff,0,0,5,1329264000,I'm addicted!,I've finally found the best cereal in the worl...,,False
3,485307,485308,B001RVFERK,A25W349EE97NBK,Tangent4,1,1,4,1248307200,I wanted to love these...,I originally bought these chips because I'd he...,1.0,False
4,375283,375284,B000OQZNTS,A3CPPW0HUC07YS,Amy Nicolai,0,0,5,1333238400,Excellent chamomile tea,"Really excellent tea, flowers are visible in t...",,False


In [3]:
# features from Amazon.csv to add to feature set

In [4]:
# add review length col to dataset (pandas)
data['ReviewLen'] = data['Text'].str.len()

# fix summary NaNs
data.Summary = data.Summary.fillna(' ') # put a space instead

# summary length
data['SummaryLen'] = data.Summary.str.len()

In [5]:
# NOTE: REMOVED THIS STEP
# # add cols for one and five star booleans
# data['FiveStars'] = 0 # set all rows to 0
# data['OneStar'] = 0

# # ix == if then
# data.ix[data['Score'] == 1, 'OneStar'] = 1 # if score is 1, set to 1
# data.ix[data['Score'] == 5, 'FiveStars'] = 1 # if score is 5, set to 1

# # maybe plot stars vs helpfulness to see if correlation
# use matplotlib?

In [6]:
# convert unix timestamp to datetime
data['DateTime'] = pd.to_datetime(data['Time'], unit='s')
# all these Times are midnight of a day

# find day of week (0-6, Mon-Sun)
data['DayOfWeek'] = data['DateTime'].dt.weekday
data['Month'] = data['DateTime'].dt.month
data['Year'] = data['DateTime'].dt.year

# col for oldest review date

In [7]:
# n reviews per product ID
data['NReviewsProduct'] = data.groupby(['ProductId'])['Id'].transform('count')

# n reviews per user ID
data['NReviewsUser'] = data.groupby(['UserId'])['UserId'].transform('count')

# avg score of product
data['AvgScore'] = data.groupby(['ProductId'])['Score'].transform('mean')
#'lambda x: x.mean()'

# # order of this review on item (first = 0, second = 1, etc.)
# data['ReviewOrder'] = 0
# # for each row
# # group data by ProductId and then order by timestamp

# # then save order to 'ReviewOrder'
# productId_group = data.groupby('ProductId')

In [9]:
# count punctuation

data['countPeriods'] = data['Text'].str.count('.')
data['countBangs'] = data['Text'].str.count('!')
data['countQuestions'] = data['Text'].str.count('\?')
data['countCommas'] = data['Text'].str.count(',')
data['countSpaces'] = data['Text'].str.count(' ')

In [11]:
# convert score + reviewLen cols to vectors
# [:, 7] denotes all rows in col 7

XScore = data.Score.values.reshape(data.shape[0], 1)
XTime = data.Time.values.reshape(data.shape[0], 1)
XReviewLen = data.ReviewLen.values.reshape(data.shape[0], 1)
XSummaryLen = data.SummaryLen.values.reshape(data.shape[0], 1)
XDayOfWeek = data.DayOfWeek.values.reshape(data.shape[0], 1)
XMonth = data.Month.values.reshape(data.shape[0], 1)
XYear = data.Year.values.reshape(data.shape[0], 1)
XNReviewsProduct = data.NReviewsProduct.values.reshape(data.shape[0], 1)
XNReviewsUser = data.NReviewsUser.values.reshape(data.shape[0], 1)
XCountPeriods = data.countPeriods.values.reshape(data.shape[0], 1)
XCountBangs = data.countBangs.values.reshape(data.shape[0], 1)
XCountQuestions = data.countQuestions.values.reshape(data.shape[0], 1)
XCountCommas = data.countCommas.values.reshape(data.shape[0], 1)
XCountSpaces = data.countSpaces.values.reshape(data.shape[0], 1)

# concatenate to numpy dataset
XToAdd = np.concatenate((XScore, XTime, XReviewLen, XDayOfWeek, XMonth, XYear, XNReviewsProduct, XNReviewsUser, XCountPeriods, XCountBangs, XCountQuestions, XCountCommas, XCountSpaces), axis=1)

XToAdd.shape

(455000, 13)

In [12]:
# vectorize Bag of Words from review text; as sparse matrix
from sklearn.feature_extraction.text import HashingVectorizer
# Text
hvText = HashingVectorizer(stop_words='english', n_features=2 ** 17, non_negative=True)
XText = hvText.transform(data.Text)
# Summary
hvSummary = HashingVectorizer(stop_words='english', n_features=2 ** 17, non_negative=True)
XSummary = hvSummary.transform(data.Summary)

In [13]:
# Tf-idf transformer with pipeline from hash vectorizer?
#### example ####
# hashing = HashingVectorizer(non_negative=True, norm=None)
# tfidf = TfidfTransformer()
# hashing_tfidf = Pipeline([("hashing", hashing), ("tidf", tfidf)])

In [14]:
# convert additional features to sparse matrix and concatenate onto the bag of words sparse matrix
from scipy.sparse import csr_matrix, hstack
XToAddSparse = csr_matrix(XToAdd)
XFinal = hstack([XText, XToAddSparse])
X = csr_matrix(XFinal)

In [15]:
# size of feature set
print(X.shape)

(455000, 131085)


In [16]:
# define y
y = data.helpful.values
y.shape

(455000,)

In [17]:
# create training and test sets
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.3, random_state=0)

In [18]:
# feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [19]:
# report on training and test sets
def print_results():
    print('Error rate on training set: ')
    print((y_train != y_pred).sum() / X_train.shape[0])
    print('Accuracy rate on training set: ')
    print(1 - (y_train != y_pred).sum() / X_train.shape[0])
    print('True positive rate on training set:')
    print(((y_train==True) & (y_pred==True)).sum() / y_train.sum())
    print('**************')
    print('Error rate on test set: ')
    print((y_test != y_pred_test).sum() / X_test.shape[0])
    print('Accuracy rate on test set: ')
    print(1 - (y_test != y_pred_test).sum() / X_test.shape[0])
    print('True positive rate on test set')
    print(((y_test==True) & (y_pred_test==True)).sum() / y_test.sum())
    print('True negative rate on test set')
    print(((y_test==False) & (y_pred_test==False)).sum() / (y_test.shape[0] - y_test.sum()))

In [20]:
# MODEL: SVM, linear
from sklearn import linear_model
clf = linear_model.SGDClassifier()
clf.fit(X_train_std, y_train)
y_pred = clf.fit(X_train_std, y_train).predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

Error rate on training set: 
0.0661569858713
Accuracy rate on training set: 
0.933843014129
True positive rate on training set:
0.507404856208
**************
Error rate on test set: 
0.0957802197802
Accuracy rate on test set: 
0.90421978022
True positive rate on test set
0.308583991206
True negative rate on test set
0.951341180935


In [21]:
# MODEL: logistic regression
from sklearn import linear_model
clf = linear_model.SGDClassifier(loss='log', n_iter=50, alpha=0.00001)
clf.fit(X_train_std, y_train)
y_pred = clf.fit(X_train_std, y_train).predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

Error rate on training set: 
0.0642480376766
Accuracy rate on training set: 
0.935751962323
True positive rate on training set:
0.719907008783
**************
Error rate on test set: 
0.112161172161
Accuracy rate on test set: 
0.887838827839
True positive rate on test set
0.438792845008
True negative rate on test set
0.923363348169


In [22]:
# MODEL: Naive Bayes
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_std, y_train)
y_pred = clf.fit(X_train_std, y_train).predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

Error rate on training set: 
0.108185243328
Accuracy rate on training set: 
0.891814756672
True positive rate on training set:
0.581582572757
**************
Error rate on test set: 
0.148622710623
Accuracy rate on test set: 
0.851377289377
True positive rate on test set
0.324073148796
True negative rate on test set
0.893092898421


In [23]:
# Perceptron
from sklearn import linear_model
clf = linear_model.SGDClassifier(loss='perceptron')
clf.fit(X_train_std, y_train)
y_pred = clf.fit(X_train_std, y_train).predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

Error rate on training set: 
0.0689481946625
Accuracy rate on training set: 
0.931051805338
True positive rate on training set:
0.0547184432581
**************
Error rate on test set: 
0.0734945054945
Accuracy rate on test set: 
0.926505494505
True positive rate on test set
0.0208853802338
True negative rate on test set
0.998150095262
