In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the review dataset from the working directory, report its
# (rows, columns) shape and preview the first five rows.
data = pd.read_csv('Amazon.csv')
print(data.shape)
data.head(5)

In [None]:
# Hand-crafted features derived from the review text, added as new columns on `data`.
# pandas is the convenient place for this kind of wrangling, subsetting and aggregation.
data['reviewLen'] = data['Text'].str.len()

# Punctuation flags. `str.contains` treats its pattern as a regular expression by
# default, where a bare '$' is an end-of-string anchor that matches every review.
# regex=False searches for the literal character and needs no backslash escapes.
data['hasEP'] = data['Text'].str.contains('!', regex=False)
data['hasQM'] = data['Text'].str.contains('?', regex=False)
data['hasAST'] = data['Text'].str.contains('*', regex=False)
data['hasDS'] = data['Text'].str.contains('$', regex=False)
data['hasSC'] = data['Text'].str.contains(';', regex=False)
data.head(5)


In [None]:
# Turn each extra feature column into an (n_rows, 1) column vector.
# The derived features are selected by name, so every variable holds the column it
# is named after and nothing shifts if the CSV gains or loses a column.
XScore = data.iloc[:, 7].values.reshape(-1, 1)  # position 7 -- presumably the Score column; TODO confirm against Amazon.csv
XreviewLen = data['reviewLen'].values.reshape(-1, 1)
XhasEP = data['hasEP'].values.reshape(-1, 1)
XhasDS = data['hasDS'].values.reshape(-1, 1)
XhasSC = data['hasSC'].values.reshape(-1, 1)
XhasQM = data['hasQM'].values.reshape(-1, 1)
XhasAST = data['hasAST'].values.reshape(-1, 1)

# Stack side by side, in the order the feature columns were created on `data`.
Xtoadd = np.concatenate((XScore, XreviewLen, XhasEP, XhasQM, XhasAST, XhasDS, XhasSC), axis=1)

In [None]:
# report on training and test sets
def print_results(y_train=None, y_pred=None, y_test=None, y_pred_test=None):
    """Print error, accuracy and true-positive/negative rates of a classifier.

    Parameters
    ----------
    y_train, y_pred : array-like, optional
        True labels and predicted labels for the training set.
    y_test, y_pred_test : array-like, optional
        True labels and predicted labels for the test set.

    Any argument left as None is looked up under the same name in the
    module (notebook) namespace, so the bare call ``print_results()`` keeps
    working after a model cell has set those variables.

    Returns
    -------
    None
        The rates are printed, one label line followed by one value line.

    Raises
    ------
    NameError
        If an argument is omitted and no variable of that name exists.
    """
    namespace = globals()

    def resolve(name, value):
        # Fall back to the notebook-level variable when no argument was passed.
        if value is None:
            if name not in namespace:
                raise NameError("name '%s' is not defined" % name)
            value = namespace[name]
        return np.asarray(value)

    def report(label, value):
        print(label)
        print(value)

    y_train = resolve('y_train', y_train)
    y_pred = resolve('y_pred', y_pred)
    y_test = resolve('y_test', y_test)
    y_pred_test = resolve('y_pred_test', y_pred_test)

    n_train = y_train.shape[0]
    n_test = y_test.shape[0]
    train_error = (y_train != y_pred).sum() / n_train
    test_error = (y_test != y_pred_test).sum() / n_test

    report('Error rate on training set: ', train_error)
    report('Accuracy rate on training set: ', 1 - train_error)
    # Share of actual positives that were also predicted positive.
    report('True positive rate on training set:',
           ((y_train == True) & (y_pred == True)).sum() / y_train.sum())
    print('**************')
    report('Error rate on test set: ', test_error)
    report('Accuracy rate on test set: ', 1 - test_error)
    report('True positive rate on test set',
           ((y_test == True) & (y_pred_test == True)).sum() / y_test.sum())
    # Share of actual negatives that were also predicted negative.
    report('True negative rate on test set',
           ((y_test == False) & (y_pred_test == False)).sum() / (n_test - y_test.sum()))

In [9]:
# vectorize Bag of Words from review text; as sparse matrix
from sklearn.feature_extraction.text import HashingVectorizer

# The hashing trick fixes the width of the feature space up front: every token is
# hashed into one of 2 ** 17 columns, however large the vocabulary turns out to be.
try:
    # alternate_sign=False keeps every hashed value non-negative.
    hv = HashingVectorizer(n_features=2 ** 17, alternate_sign=False)
except TypeError:
    # Older scikit-learn releases have no `alternate_sign`; they use `non_negative`.
    hv = HashingVectorizer(n_features=2 ** 17, non_negative=True)
X = hv.transform(data.Text)

In [8]:
# Append the hand-crafted feature columns to the right of the bag-of-words matrix.
# NOTE: X is overwritten in place, so this cell must run exactly once, after the
# vectorizer cell; running it earlier loses the extra columns and running it twice
# appends them twice.
from scipy.sparse import csr_matrix, hstack
XtoaddSparse = csr_matrix(Xtoadd)
Xfinal = hstack((X, XtoaddSparse))
X = Xfinal.tocsr()

In [10]:
# size of feature set: prints the (rows, columns) shape of the sparse matrix X
print(X.shape)

(455000, 131072)


In [11]:
# define y: the target is taken by position (column 12 of `data`) as a NumPy array
# NOTE(review): presumably the label column of Amazon.csv -- confirm; selecting it
# by name would survive changes to the column layout
y = data.iloc[:, 12].values
y.shape

(455000,)

In [12]:
# create training and test sets
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the function in this module.
    from sklearn.cross_validation import train_test_split

# 70/30 split; the fixed random_state makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.3, random_state=0)

In [13]:
# feature scaling
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training split only and then applied to both splits.
sc = StandardScaler(with_mean=False)
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [14]:
# MODEL: SVM, linear
# SGD makes repeated passes over the training rows, adjusting the weights as it goes
# to reduce the cost; the number of passes is a constructor setting.

from sklearn import linear_model
# random_state pins the shuffling so the reported rates are repeatable.
clf = linear_model.SGDClassifier(random_state=0)
clf.fit(X_train_std, y_train)  # fit once; both predictions reuse the fitted model
y_pred = clf.predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

# In the recorded run only about a quarter of the test-set positives were classified correctly.

Error rate on training set: 
0.0756797488226
Accuracy rate on training set: 
0.924320251177
True positive rate on training tet:
0.446960564836
**************
Error rate on test set: 
0.117377289377
Accuracy rate on test set: 
0.882622710623
True positive rate on test set
0.234635754972
True negative rate on test set
0.933885669563


In [15]:
# MODEL: logistic regression
from sklearn import linear_model
# alpha is the regularization strength: larger values penalize the weights more
# heavily, so 0.00001 is a light penalty.
# NOTE(review): newer scikit-learn releases spell these arguments loss='log_loss'
# and max_iter=50 -- confirm against the installed version.
# random_state pins the shuffling so the reported rates are repeatable.
clf = linear_model.SGDClassifier(loss='log', n_iter=50, alpha=0.00001, random_state=0)
clf.fit(X_train_std, y_train)  # fit once; both predictions reuse the fitted model
y_pred = clf.predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

Error rate on training set: 
0.0716452119309
Accuracy rate on training set: 
0.928354788069
True positive rate on training tet:
0.487558119511
**************
Error rate on test set: 
0.11936996337
Accuracy rate on test set: 
0.88063003663
True positive rate on test set
0.244229039672
True negative rate on test set
0.930976417667


In [None]:
# MODEL: Naive Bayes
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_std, y_train)  # fit once; both predictions reuse the fitted model
y_pred = clf.predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

In [None]:
# Perceptron
from sklearn import linear_model
# random_state pins the shuffling so the reported rates are repeatable.
clf = linear_model.SGDClassifier(loss='perceptron', random_state=0)
clf.fit(X_train_std, y_train)  # fit once; both predictions reuse the fitted model
y_pred = clf.predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()