In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the review data, report its (rows, columns) shape, and preview the
# first five rows (five is the default for DataFrame.head).
data = pd.read_csv('Amazon.csv')
print(data.shape)
data.head()

(455000, 13)


Unnamed: 0.1,Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,helpScore,helpful
0,138806,138807,B000E63LME,A1CQGW1AOD0LF2,"Alena K. ""Alena""",1,2,2,1294185600,Not as pictured.,I was looking forward to try cranberry apple f...,0.5,False
1,469680,469681,B004ZIH4KM,A37S7U1OX2MCWI,Becky Cole,0,0,5,1349740800,seeds,"TY for everything. The seeds arrived quickly,...",,False
2,238202,238203,B003ZXE9QA,A2OM6G73E64EQ9,jeff,0,0,5,1329264000,I'm addicted!,I've finally found the best cereal in the worl...,,False
3,485307,485308,B001RVFERK,A25W349EE97NBK,Tangent4,1,1,4,1248307200,I wanted to love these...,I originally bought these chips because I'd he...,1.0,False
4,375283,375284,B000OQZNTS,A3CPPW0HUC07YS,Amy Nicolai,0,0,5,1333238400,Excellent chamomile tea,"Really excellent tea, flowers are visible in t...",,False


In [3]:
# Features from Amazon.csv to add to the feature set.
# Columns are selected by name rather than by position, so this cell keeps
# picking the intended data even if the CSV gains, loses, or reorders columns.
data['reviewLen'] = data['Text'].str.len()  # length of each review text, in characters
XScore = data['Score'].values.reshape(-1, 1)
XreviewLen = data['reviewLen'].values.reshape(-1, 1)
# Two-column dense array: [Score, reviewLen], one row per review.
Xtoadd = np.concatenate((XScore, XreviewLen), axis=1)

In [4]:
# report on training and test sets
def print_results(y_train=None, y_pred=None, y_test=None, y_pred_test=None):
    """Print error, accuracy and true-positive/negative rates for a classifier.

    Every argument is optional. Any argument left as ``None`` is read from the
    notebook-level variable of the same name, so the zero-argument call
    ``print_results()`` keeps working. Passing the arrays explicitly avoids
    depending on whatever happens to be in the kernel's global state.

    Parameters
    ----------
    y_train : array-like, optional
        True labels of the training set.
    y_pred : array-like, optional
        Predicted labels for the training set (same length as ``y_train``).
    y_test : array-like, optional
        True labels of the test set.
    y_pred_test : array-like, optional
        Predicted labels for the test set (same length as ``y_test``).

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    NameError
        If an argument is omitted and no notebook-level variable of that name
        exists.
    """
    passed = (('y_train', y_train), ('y_pred', y_pred),
              ('y_test', y_test), ('y_pred_test', y_pred_test))
    notebook_vars = globals()
    resolved = []
    for name, value in passed:
        if value is None:
            if name not in notebook_vars:
                raise NameError("name '%s' is not defined; pass it explicitly" % name)
            value = notebook_vars[name]
        resolved.append(np.asarray(value))
    y_train, y_pred, y_test, y_pred_test = resolved

    # The number of samples is taken from the label vectors themselves, which
    # have one entry per row of the corresponding feature matrix.
    n_train = y_train.shape[0]
    n_test = y_test.shape[0]
    train_errors = (y_train != y_pred).sum()
    test_errors = (y_test != y_pred_test).sum()

    print('Error rate on training set: ')
    print(train_errors / n_train)
    print('Accuracy rate on training set: ')
    print(1 - train_errors / n_train)
    print('True positive rate on training set:')
    print(((y_train == True) & (y_pred == True)).sum() / y_train.sum())
    print('**************')
    print('Error rate on test set: ')
    print(test_errors / n_test)
    print('Accuracy rate on test set: ')
    print(1 - test_errors / n_test)
    print('True positive rate on test set')
    print(((y_test == True) & (y_pred_test == True)).sum() / y_test.sum())
    print('True negative rate on test set')
    # Denominator: number of actual negatives (all samples minus the positives).
    print(((y_test == False) & (y_pred_test == False)).sum() / (n_test - y_test.sum()))

In [5]:
# vectorize Bag of Words from review text; as sparse matrix
from sklearn.feature_extraction.text import HashingVectorizer
try:
    # Original call; accepted by scikit-learn releases before 0.21.
    hv = HashingVectorizer(n_features=2 ** 17, non_negative=True)
except TypeError:
    # `non_negative` was removed in scikit-learn 0.21. `alternate_sign=False`
    # is its documented replacement and also yields non-negative features.
    # NOTE(review): individual feature values can differ slightly from the
    # old option when hashed tokens collide -- confirm this is acceptable.
    hv = HashingVectorizer(n_features=2 ** 17, alternate_sign=False)
X = hv.transform(data.Text)

In [6]:
# convert additional features to sparse matrix and concatenate onto the bag of words sparse matrix
from scipy.sparse import csr_matrix, hstack
XtoaddSparse = csr_matrix(Xtoadd)
# X is overwritten at the end of this cell. Without the slice below, running
# the cell a second time would append the extra columns again. Always start
# from the first hv.n_features (hashed bag-of-words) columns so the cell can
# be re-run safely; the slice is skipped when X has not been extended yet.
X_bow = X if X.shape[1] == hv.n_features else X[:, :hv.n_features]
Xfinal = hstack([X_bow, XtoaddSparse])
X = csr_matrix(Xfinal)

In [7]:
# size of feature set: (rows, columns) of X, i.e. the hashed bag-of-words
# columns plus the extra feature columns appended in the previous cell
print(X.shape)

(455000, 131074)


In [8]:
# define y: the `helpful` label column, selected by name so that a change in
# the CSV's column order cannot silently swap in a different column
y = data['helpful'].values
y.shape

(455000,)

## Create an index set
When the training and test sets are created, the rows are shuffled. To keep track of each row's original position (so that it can be matched back to the original Amazon.csv file), also split an array holding the original row indices into matching training and test sets.

In [9]:
# create training and test sets
try:
    # scikit-learn >= 0.18
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older releases only provide the cross_validation module, which was
    # removed in scikit-learn 0.20.
    from sklearn.cross_validation import train_test_split

# Original row positions, split alongside X and y so that every training/test
# row can be traced back to its row in `data`.
indices = np.arange(X.shape[0])

X_train, X_test, y_train, y_test, i_train, i_test = train_test_split(
         X, y, indices, test_size=0.3, random_state=0)

In [10]:
# Feature scaling: the scaler is fitted on the training split only, then the
# same scaling is applied to both splits. Centering is switched off
# (with_mean=False) because the feature matrices are sparse.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [11]:
# MODEL: SVM, linear
from sklearn import linear_model
# SGD training shuffles the data, so fix the seed to make the fitted model and
# the reported rates reproducible (same seed value as the train/test split).
model_svm = linear_model.SGDClassifier(random_state=0)
model_svm.fit(X_train_std, y_train)
y_pred = model_svm.predict(X_train_std)
y_pred_test = model_svm.predict(X_test_std)
print_results()

Error rate on training set: 
0.0729419152276
Accuracy rate on training set: 
0.927058084772
True positive rate on training tet:
0.464310315137
**************
Error rate on test set: 
0.114659340659
Accuracy rate on test set: 
0.885340659341
True positive rate on test set
0.240631557909
True negative rate on test set
0.936344303637


## Merge to original data frame and report on classification results
### Assumes Amazon.csv has not been modified
Merge the model's y predictions into the raw data frame (Amazon.csv) and show samples by classification category (true negatives, true positives, false negatives, false positives).

In [12]:
class ReportResults():
    """Report on classification results to see actual reviews by their classification.

    ``write_report`` joins the model's test-set predictions to the raw review
    data and prints a random sample of reviews for each classification
    category (true negatives, true positives, false negatives, false
    positives) via ``printRep``.
    """

    # Labels printed by printRep for the columns at positions 1..8 of the
    # frame it receives (position 0, the predicted class, is not printed).
    # 'Text: ' intentionally keeps its trailing space so the printed output
    # is unchanged.
    _FIELD_LABELS = ('ProductId:', 'HelpfulnessNumerator:', 'HelpfulnessDenominator:',
                     'Score:', 'Summary:', 'Text: ', 'helpScore:', 'helpful:')

    # Raw-data columns kept in the report, in printing order.
    _REPORT_COLUMNS = ('ProductId', 'HelpfulnessNumerator', 'HelpfulnessDenominator',
                       'Score', 'Summary', 'Text', 'helpScore', 'helpful')

    def __init__(self):
        """initialize attributes"""

    def printRep(self, dataInx, n_samples, report_title):
        """Print a titled report of randomly sampled rows.

        Parameters
        ----------
        dataInx : pandas.DataFrame
            Rows to sample from. Column 0 is skipped; columns 1..8 are printed
            with the labels in ``_FIELD_LABELS`` (fewer if the frame is
            narrower).
        n_samples : int
            Maximum number of rows to print. If the frame has fewer rows,
            all of them are printed instead of raising an error.
        report_title : str
            Plural category title, e.g. ``'True Negatives'``. It is printed in
            upper case as the header, and in lower case without its last
            character before each row.
        """
        n_available = dataInx.shape[0]
        if n_available > 0:
            # DataFrame.sample raises when asked for more rows than exist.
            dataIn = dataInx.sample(min(n_samples, n_available))
        else:
            dataIn = dataInx

        print(75 * '*')
        print(report_title.upper())
        print(75 * '*')

        row_label = report_title[:-1].lower()
        n_fields = min(len(self._FIELD_LABELS), dataIn.shape[1] - 1)
        for i in range(dataIn.shape[0]):
            print(row_label)
            for offset in range(n_fields):
                # offset + 1 skips column 0 (the predicted class)
                print(self._FIELD_LABELS[offset], dataIn.iloc[i, offset + 1])
            print(75 * '-')

    def write_report(self, y_pred_test, index_test, raw_data, n_to_report):
        """Write report to show reports of correctly and incorrectly classified reviews.

        Parameters
        ----------
        y_pred_test : array-like
            The model's y predictions for the test set.
        index_test : array-like
            Index labels, in ``raw_data``, of the test-set rows, in the same
            order as ``y_pred_test``.
        raw_data : pandas.DataFrame
            The data frame with the original Amazon.csv contents. It must
            contain the columns listed in ``_REPORT_COLUMNS``; any other
            columns are ignored. Its index is assumed to have no duplicate
            labels.
        n_to_report : int
            The number of samples you want to see in each classification
            category.

        Raises
        ------
        ValueError
            If ``y_pred_test`` and ``index_test`` differ in length.
        KeyError
            If ``raw_data`` lacks one of the report columns.
        """
        index_test = np.asarray(index_test)
        predicted = np.asarray(y_pred_test).astype(int)
        if predicted.shape[0] != index_test.shape[0]:
            raise ValueError('y_pred_test and index_test must have the same length')

        # Select the report columns by name, then pick the test rows by their
        # original index labels; rows come out in index_test order, so the
        # predictions line up positionally.
        forReports_final = raw_data[list(self._REPORT_COLUMNS)].reindex(index_test)
        forReports_final.insert(0, 'predicted_class', predicted)

        # (report title, predicted class, actual `helpful` value)
        categories = (
            ('True Negatives', 0, False),
            ('True Positives', 1, True),
            ('False Negatives', 0, True),
            ('False Positives', 1, False),
        )
        for title, predicted_class, actual in categories:
            subset = forReports_final[(forReports_final.predicted_class == predicted_class)
                                      & (forReports_final.helpful == actual)]
            self.printRep(subset, n_to_report, title)
        


In [13]:
# Build the reporter and print 10 randomly sampled test-set reviews for each
# classification category; i_test links each prediction back to its row in `data`.
rep = ReportResults()
rep.write_report(y_pred_test, i_test, data, 10)

***************************************************************************
TRUE NEGATIVES
***************************************************************************
true negative
ProductId: B000EMQG4I
HelpfulnessNumerator: 0
HelpfulnessDenominator: 0
Score: 4
Summary: Not QUITE a chocolate lover's delight
Text:  My family started putting chocolate chips in a chocolate cake mix quite a while ago, then usually cook them in cupcake pans for chocolate-chocolate-chip muffins.  When I saw this mix, I thought, right on! They've done it for me!  Unfortunately, they use mini chocolate chips (which pretty much blend into the surrounding cake) and not enough of them, so it doesn't quite add up to the Chocolate Lover's Dream Snack/Dessert.  We don't frost them, however; maybe that's the extra needed "push" to make it a 5 star rating.  Frosting costs more than the cake mix, however, so I try to stay away from it.  If they would have used regular sized chocolate chips, and a few more of them, you 