In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the review table from the CSV that sits one directory above the notebook.
csv_path = '../Amazon.csv'
data = pd.read_csv(csv_path)

# Show the table dimensions as (rows, columns).
print(data.shape)
# data.head(50)  # uncomment to preview the first 50 rows

(455000, 13)


In [3]:
# features from Amazon.csv to add to feature set

In [4]:
# Character count of every review body, stored as a new column.
data['ReviewLen'] = data['Text'].str.len()

# Replace missing summaries with a single space so the length computed
# below is defined for every row.
data['Summary'] = data['Summary'].fillna(' ')

# Character count of every (now NaN-free) summary.
data['SummaryLen'] = data['Summary'].str.len()


In [5]:
# Indicator columns for the two extreme ratings: 1 when the row's Score
# matches, 0 otherwise.
# The boolean comparison is cast straight to int64, which yields the same
# values as zero-filling the column and then overwriting the matching rows,
# but does not rely on the `.ix` indexer (removed from pandas in 1.0).
data['FiveStars'] = (data['Score'] == 5).astype('int64')
data['OneStar'] = (data['Score'] == 1).astype('int64')
# TODO: maybe visualize the score distribution for helpful reviews

# TODO: per-product average rating -- not sure yet whether it is useful.
# Untested sketch (the earlier draft referenced a 'Stars' column; the rating
# column used in this notebook is 'Score'):
# data['AvgStars'] = data.groupby('ProductId')['Score'].transform('mean')

In [6]:
# Interpret 'Time' as seconds since the Unix epoch and store real timestamps.
# (Author's observation: every timestamp falls on midnight of its day.)
data['DateTime'] = pd.to_datetime(data['Time'], unit='s')

# Calendar components of the timestamp; weekday runs 0 (Mon) .. 6 (Sun).
date_parts = data['DateTime'].dt
data['DayOfWeek'] = date_parts.weekday
data['Month'] = date_parts.month
data['Year'] = date_parts.year

# TODO: column for the oldest review date

In [7]:
# For each row, count how many rows share its product id and how many share
# its user id. `transform('count')` broadcasts the per-group count back onto
# every member row, so the result lines up with `data`.
for id_col, count_col in (('ProductId', 'NReviewsProduct'),
                          ('UserId', 'NReviewsUser')):
    data[count_col] = data.groupby(id_col)[id_col].transform('count')

# TODO: 'ReviewOrder' -- position of a review within its product's history
# (first = 0, second = 1, ...): group by ProductId, order each group by
# timestamp, and store the rank. Not implemented yet.

In [8]:
# Turn each engineered column into an (n_rows, 1) NumPy column vector so the
# vectors can be stacked side by side.
n_rows = data.shape[0]


def _column_vector(column_name):
    """Return data[column_name] as an (n_rows, 1) NumPy array."""
    return data[column_name].values.reshape(n_rows, 1)


XScore = _column_vector('Score')
XTime = _column_vector('Time')
XReviewLen = _column_vector('ReviewLen')
XSummaryLen = _column_vector('SummaryLen')
XFiveStars = _column_vector('FiveStars')
XOneStar = _column_vector('OneStar')
XDayOfWeek = _column_vector('DayOfWeek')
XMonth = _column_vector('Month')
XYear = _column_vector('Year')
XNReviewsProduct = _column_vector('NReviewsProduct')
XNReviewsUser = _column_vector('NReviewsUser')

# Stack the column vectors horizontally into one dense feature block.
# NOTE(review): XSummaryLen is built above but is not part of this stack --
# confirm whether leaving it out is intentional.
XToAdd = np.hstack((
    XScore, XOneStar, XFiveStars, XTime, XReviewLen,
    XDayOfWeek, XMonth, XYear, XNReviewsProduct, XNReviewsUser,
))

XToAdd.shape

(455000, 10)

In [9]:
# Hash the bag of words of each text field into a fixed-width sparse matrix.
from sklearn.feature_extraction.text import HashingVectorizer

# `alternate_sign=False` keeps every hashed value non-negative. It is the
# documented replacement for the `non_negative=True` argument, which
# scikit-learn has removed. Each vectorizer uses 2 ** 17 hash buckets and
# drops English stop words.

# Text
hvText = HashingVectorizer(stop_words='english', n_features=2 ** 17, alternate_sign=False)
XText = hvText.transform(data.Text)
# Summary
hvSummary = HashingVectorizer(stop_words='english', n_features=2 ** 17, alternate_sign=False)
XSummary = hvSummary.transform(data.Summary)

In [10]:
# TODO: consider TF-IDF weighting on top of the hashed counts, chained in a Pipeline.
#### example ####
# hashing = HashingVectorizer(non_negative=True, norm=None)
# tfidf = TfidfTransformer()
# hashing_tfidf = Pipeline([("hashing", hashing), ("tfidf", tfidf)])

In [11]:
# Append the dense engineered features to the hashed review-text matrix.
from scipy.sparse import csr_matrix, hstack

# The dense block must be sparse before it can be stacked with XText.
XToAddSparse = csr_matrix(XToAdd)

# Column-wise stack, then convert to CSR for the steps that follow.
# NOTE(review): only XText is stacked here; XSummary is not part of X --
# confirm whether that is intentional.
XFinal = hstack((XText, XToAddSparse))
X = XFinal.tocsr()

In [12]:
# size of the assembled feature matrix X, printed as (rows, columns)
print(X.shape)

(455000, 131082)


In [13]:
# Target vector: the 'helpful' column as a plain NumPy array.
y = data['helpful'].values

# Echo its shape as the cell output.
y.shape

(455000,)

In [14]:
# Create training and test sets.
# `train_test_split` lives in `sklearn.model_selection`; the old
# `sklearn.cross_validation` module has been removed from scikit-learn and is
# kept only as a fallback for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# identical across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

In [15]:
# Scale every feature using statistics learned from the training rows only,
# then apply that same fitted scaler to both splits.
from sklearn.preprocessing import StandardScaler

# Mean-centering is switched off (with_mean=False); the scaler is fitted
# once on the training split.
sc = StandardScaler(with_mean=False).fit(X_train)

X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [16]:
# report on training and test sets
def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if denominator == 0:
        return float('nan')
    return numerator / denominator


def _error_rate(truth, predicted):
    """Fraction of positions where the prediction differs from the label."""
    return _ratio((truth != predicted).sum(), truth.shape[0])


def _true_positive_rate(truth, predicted):
    """Share of positive labels that were also predicted positive."""
    hits = ((truth == True) & (predicted == True)).sum()  # noqa: E712
    return _ratio(hits, truth.sum())


def _true_negative_rate(truth, predicted):
    """Share of negative labels that were also predicted negative."""
    hits = ((truth == False) & (predicted == False)).sum()  # noqa: E712
    # Negatives are counted as "all rows minus the positive ones".
    return _ratio(hits, truth.shape[0] - truth.sum())


def print_results(train_true=None, train_pred=None, test_true=None, test_pred=None):
    """Print error, accuracy and true-positive/negative rates for both splits.

    Parameters
    ----------
    train_true, train_pred : array-like, optional
        Labels and predictions for the training split. When omitted they
        default to the notebook globals ``y_train`` and ``y_pred``.
    test_true, test_pred : array-like, optional
        Labels and predictions for the test split. When omitted they default
        to the notebook globals ``y_test`` and ``y_pred_test``.

    Notes
    -----
    Calling ``print_results()`` with no arguments keeps the previous
    behaviour. Rates are divided by the number of labels in the split. A rate
    whose denominator is zero (for example, no positive labels) is printed as
    ``nan``.
    """
    # Fall back to the notebook-level variables for any argument not given.
    if train_true is None:
        train_true = y_train
    if train_pred is None:
        train_pred = y_pred
    if test_true is None:
        test_true = y_test
    if test_pred is None:
        test_pred = y_pred_test

    train_true, train_pred = np.asarray(train_true), np.asarray(train_pred)
    test_true, test_pred = np.asarray(test_true), np.asarray(test_pred)

    train_error = _error_rate(train_true, train_pred)
    print('Error rate on training set: ')
    print(train_error)
    print('Accuracy rate on training set: ')
    print(1 - train_error)
    print('True positive rate on training set:')
    print(_true_positive_rate(train_true, train_pred))
    print('**************')
    test_error = _error_rate(test_true, test_pred)
    print('Error rate on test set: ')
    print(test_error)
    print('Accuracy rate on test set: ')
    print(1 - test_error)
    print('True positive rate on test set')
    print(_true_positive_rate(test_true, test_pred))
    print('True negative rate on test set')
    print(_true_negative_rate(test_true, test_pred))

In [17]:
# MODEL: SVM, linear (SGDClassifier with its default loss)
from sklearn import linear_model

# A fixed random_state makes the stochastic training repeatable, so rerunning
# the cell reproduces the same metrics.
clf = linear_model.SGDClassifier(random_state=0)

# Fit once. Fitting the same estimator a second time before predicting only
# repeated the training work.
clf.fit(X_train_std, y_train)
y_pred = clf.predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

Error rate on training set: 
0.0720031397174
Accuracy rate on training set: 
0.927996860283
True positive rate on training set:
0.0127001894266
**************
Error rate on test set: 
0.0741025641026
Accuracy rate on test set: 
0.925897435897
True positive rate on test set
0.00559608274208
True negative rate on test set
0.998703485568


In [18]:
# MODEL: logistic regression trained with stochastic gradient descent
from sklearn import linear_model

# Current scikit-learn spellings of the original settings:
#   loss='log'  -> loss='log_loss'
#   n_iter=50   -> max_iter=50 with tol=None, so training always runs the
#                  full 50 epochs instead of stopping early on convergence.
# A fixed random_state makes the stochastic training repeatable.
clf = linear_model.SGDClassifier(loss='log_loss', max_iter=50, tol=None,
                                 alpha=0.00001, random_state=0)

# Fit once. Fitting the same estimator a second time before predicting only
# repeated the training work.
clf.fit(X_train_std, y_train)
y_pred = clf.predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

Error rate on training set: 
0.0468477237049
Accuracy rate on training set: 
0.953152276295
True positive rate on training set:
0.372825899776
**************
Error rate on test set: 
0.0729010989011
Accuracy rate on test set: 
0.927098901099
True positive rate on test set
0.164384930549
True negative rate on test set
0.98743804005


In [19]:
# MODEL: Naive Bayes (multinomial)
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()

# Fit once. The second fit on the same data before predicting was redundant
# and only doubled the training time.
clf.fit(X_train_std, y_train)
y_pred = clf.predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

Error rate on training set: 
0.108150706436
Accuracy rate on training set: 
0.891849293564
True positive rate on training set:
0.581496469778
**************
Error rate on test set: 
0.148666666667
Accuracy rate on test set: 
0.851333333333
True positive rate on test set
0.323773358649
True negative rate on test set
0.893069181694


In [20]:
# MODEL: Perceptron (SGDClassifier with the perceptron loss)
from sklearn import linear_model

# A fixed random_state makes the stochastic training repeatable, so rerunning
# the cell reproduces the same metrics.
clf = linear_model.SGDClassifier(loss='perceptron', random_state=0)

# Fit once. Fitting the same estimator a second time before predicting only
# repeated the training work.
clf.fit(X_train_std, y_train)
y_pred = clf.predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

Error rate on training set: 
0.0564615384615
Accuracy rate on training set: 
0.943538461538
True positive rate on training set:
0.424487687274
**************
Error rate on test set: 
0.0837289377289
Accuracy rate on test set: 
0.916271062271
True positive rate on test set
0.230338762866
True negative rate on test set
0.970535918984
