In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('Amazon.csv')
print(data.shape) #print the (rows,cols) of matrix
data.head(5) #print 1st 5 rows

(455000, 13)


Unnamed: 0.1,Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,helpScore,helpful
0,138806,138807,B000E63LME,A1CQGW1AOD0LF2,"Alena K. ""Alena""",1,2,2,1294185600,Not as pictured.,I was looking forward to try cranberry apple f...,0.5,False
1,469680,469681,B004ZIH4KM,A37S7U1OX2MCWI,Becky Cole,0,0,5,1349740800,seeds,"TY for everything. The seeds arrived quickly,...",,False
2,238202,238203,B003ZXE9QA,A2OM6G73E64EQ9,jeff,0,0,5,1329264000,I'm addicted!,I've finally found the best cereal in the worl...,,False
3,485307,485308,B001RVFERK,A25W349EE97NBK,Tangent4,1,1,4,1248307200,I wanted to love these...,I originally bought these chips because I'd he...,1.0,False
4,375283,375284,B000OQZNTS,A3CPPW0HUC07YS,Amy Nicolai,0,0,5,1333238400,Excellent chamomile tea,"Really excellent tea, flowers are visible in t...",,False


In [3]:
#### extract new feature columns in pandas dataframe: ###


## text features: ##

# fix summary NaNs
data.Summary = data.Summary.fillna(' ') # put a space instead

# summary length
data['SummaryLen'] = data.Summary.str.len()

# review length
data['ReviewLen'] = data['Text'].str.len()

# count punctuation
data['countPeriods'] = data['Text'].str.count('.')
data['countBangs'] = data['Text'].str.count('!')
data['countQuestions'] = data['Text'].str.count('\?')
data['countCommas'] = data['Text'].str.count(',')
data['countSpaces'] = data['Text'].str.count(' ')


## time features: ##

# convert unix timestamp to datetime
data['DateTime'] = pd.to_datetime(data['Time'], unit='s')
# all these Times are midnight of a day

# find day of week (0-6, Mon-Sun)
data['DayOfWeek'] = data['DateTime'].dt.weekday
data['Month'] = data['DateTime'].dt.month
data['Year'] = data['DateTime'].dt.year


## product and user features: ##

# n reviews per product ID
data['NReviewsProduct'] = data.groupby(['ProductId'])['Id'].transform('count')

# n reviews per user ID
data['NReviewsUser'] = data.groupby(['UserId'])['UserId'].transform('count')

# avg score of product
data['AvgScore'] = data.groupby(['ProductId'])['Score'].transform('mean')

# order of this review on product by time posted (first = 0, second = 1, etc.)
data['ReviewOrder'] = data.sort_values(['Time']).groupby(['ProductId']).cumcount() # cumcount() returns order # within group

# oldest review time on product
data['FirstReviewTime'] = data.sort_values(['Time']).groupby(['ProductId'])['Time'].cummin() # cummin() returns min value

# time between this review and oldest review:
data['TimeSinceFirstReview'] = data['Time'].sub(data['FirstReviewTime'], axis=0) # subtract cols, comparing by row

# data[data.ProductId == 'B003ZXE9QA'].sort_values(['Time']).head(20) # check if worked on one productId


In [4]:
# NOTE: REMOVED THIS STEP
# # add cols for one and five star booleans
# data['FiveStars'] = 0 # set all rows to 0
# data['OneStar'] = 0

# # ix == if then
# data.ix[data['Score'] == 1, 'OneStar'] = 1 # if score is 1, set to 1
# data.ix[data['Score'] == 5, 'FiveStars'] = 1 # if score is 5, set to 1

In [5]:
# convert cols to vectors

# given features
XScore = data.Score.values.reshape(data.shape[0], 1)
XTime = data.Time.values.reshape(data.shape[0], 1)

# text features
XReviewLen = data.ReviewLen.values.reshape(data.shape[0], 1)
XSummaryLen = data.SummaryLen.values.reshape(data.shape[0], 1)
XCountPeriods = data.countPeriods.values.reshape(data.shape[0], 1)
XCountBangs = data.countBangs.values.reshape(data.shape[0], 1)
XCountQuestions = data.countQuestions.values.reshape(data.shape[0], 1)
XCountCommas = data.countCommas.values.reshape(data.shape[0], 1)
XCountSpaces = data.countSpaces.values.reshape(data.shape[0], 1)

# time features
XDayOfWeek = data.DayOfWeek.values.reshape(data.shape[0], 1)
XMonth = data.Month.values.reshape(data.shape[0], 1)
XYear = data.Year.values.reshape(data.shape[0], 1)

# product/user features
XNReviewsProduct = data.NReviewsProduct.values.reshape(data.shape[0], 1)
XNReviewsUser = data.NReviewsUser.values.reshape(data.shape[0], 1)
XAvgScore = data.AvgScore.values.reshape(data.shape[0], 1)
XReviewOrder = data.ReviewOrder.values.reshape(data.shape[0], 1)
XFirstReviewTime = data.FirstReviewTime.values.reshape(data.shape[0], 1)
XTimeSinceFirstReview = data.TimeSinceFirstReview.reshape(data.shape[0], 1)

# concatenate to numpy dataset
XToAdd = np.concatenate((XScore, XTime, XReviewLen, XSummaryLen, XCountPeriods, XCountBangs, XCountQuestions, XCountCommas, XCountSpaces, XDayOfWeek, XMonth, XYear, XNReviewsProduct, XNReviewsUser, XAvgScore, XReviewOrder, XFirstReviewTime, XTimeSinceFirstReview), axis=1)

XToAdd.shape

(455000, 18)

In [6]:
# vectorize Bag of Words from review text; as sparse matrix
from sklearn.feature_extraction.text import HashingVectorizer
# Text
hvText = HashingVectorizer(stop_words='english', n_features=2 ** 17, non_negative=True)
XText = hvText.transform(data.Text)
# Summary
hvSummary = HashingVectorizer(stop_words='english', n_features=2 ** 17, non_negative=True)
XSummary = hvSummary.transform(data.Summary)

In [7]:
# Tf-idf transformer with pipeline from hash vectorizer?
#### example ####
# hashing = HashingVectorizer(non_negative=True, norm=None)
# tfidf = TfidfTransformer()
# hashing_tfidf = Pipeline([("hashing", hashing), ("tidf", tfidf)])

In [8]:
# convert additional features to sparse matrix and concatenate onto the bag of words sparse matrix
from scipy.sparse import csr_matrix, hstack
XToAddSparse = csr_matrix(XToAdd)
XFinal = hstack([XText, XToAddSparse])
X = csr_matrix(XFinal)

In [9]:
# size of feature set
print(X.shape)

(455000, 131090)


In [10]:
# define y
y = data.helpful.values
y.shape

(455000,)

In [11]:
# create training and test sets
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.3, random_state=0)

In [12]:
# feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [13]:
# report on training and test sets
def print_results():
    print('Error rate on training set: ')
    print((y_train != y_pred).sum() / X_train.shape[0])
    print('Accuracy rate on training set: ')
    print(1 - (y_train != y_pred).sum() / X_train.shape[0])
    print('True positive rate on training set:')
    print(((y_train==True) & (y_pred==True)).sum() / y_train.sum())
    print('True negative rate on training set')
    print(((y_train==False) & (y_pred==False)).sum() / (y_train.shape[0] - y_train.sum()))
    print('**************')
    print('Error rate on test set: ')
    print((y_test != y_pred_test).sum() / X_test.shape[0])
    print('Accuracy rate on test set: ')
    print(1 - (y_test != y_pred_test).sum() / X_test.shape[0])
    print('True positive rate on test set')
    print(((y_test==True) & (y_pred_test==True)).sum() / y_test.sum())
    print('True negative rate on test set')
    print(((y_test==False) & (y_pred_test==False)).sum() / (y_test.shape[0] - y_test.sum()))

In [14]:
# MODEL: SVM, linear
from sklearn import linear_model
clf = linear_model.SGDClassifier()
clf.fit(X_train_std, y_train)
y_pred = clf.fit(X_train_std, y_train).predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

Error rate on training set: 
0.121864992151
Accuracy rate on training set: 
0.878135007849
True positive rate on training set:
0.713492336835
True negative rate on training set
0.891086862283
**************
Error rate on test set: 
0.155172161172
Accuracy rate on test set: 
0.844827838828
True positive rate on test set
0.5314280004
True negative rate on test set
0.869621243863


In [16]:
# MODEL: logistic regression
from sklearn import linear_model
clf = linear_model.SGDClassifier(loss='log', n_iter=50, alpha=0.00001)
clf.fit(X_train_std, y_train)
y_pred = clf.fit(X_train_std, y_train).predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

Error rate on training set: 
0.208656200942
Accuracy rate on training set: 
0.791343799058
True positive rate on training set:
0.918589633201
True negative rate on training set
0.781333821019
**************
Error rate on test set: 
0.254996336996
Accuracy rate on test set: 
0.745003663004
True positive rate on test set
0.722594184071
True negative rate on test set
0.746776501466


In [17]:
# MODEL: Naive Bayes
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_std, y_train)
y_pred = clf.fit(X_train_std, y_train).predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

Error rate on training set: 
0.10787755102
Accuracy rate on training set: 
0.89212244898
True positive rate on training set:
0.581711727226
True negative rate on training set
0.916541358476
**************
Error rate on test set: 
0.148322344322
Accuracy rate on test set: 
0.851677655678
True positive rate on test set
0.324073148796
True negative rate on test set
0.893417027029


In [23]:
# Perceptron
from sklearn import linear_model
clf = linear_model.SGDClassifier(loss='perceptron')
clf.fit(X_train_std, y_train)
y_pred = clf.fit(X_train_std, y_train).predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

Error rate on training set: 
0.0585337519623
Accuracy rate on training set: 
0.941466248038
True positive rate on training set:
0.205484759773
True negative rate on training set
0.999363298924
**************
Error rate on test set: 
0.0725201465201
Accuracy rate on test set: 
0.92747985348
True positive rate on test set
0.0914359948036
True negative rate on test set
0.993620200327


In [24]:
# BEST MODEL:

# MODEL: logistic regression
from sklearn import linear_model
clf = linear_model.SGDClassifier(loss='log', n_iter=50, alpha=0.00001)
clf.fit(X_train_std, y_train)
y_pred = clf.fit(X_train_std, y_train).predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

Error rate on training set: 
0.138087912088
Accuracy rate on training set: 
0.861912087912
True positive rate on training set:
0.866066815912
True negative rate on training set
0.86158525021
**************
Error rate on test set: 
0.188249084249
Accuracy rate on test set: 
0.811750915751
True positive rate on test set
0.63385630059
True negative rate on test set
0.825824353917
