In [1]:
# imports and specifications
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

#To validate/cross validate--create training and test data
from sklearn.cross_validation import train_test_split

In [2]:
# Call data
amazon = pd.read_csv('Amazon.csv')

In [4]:
#create 4 object for labels and features of TRAIN and TEST

raw_data_train, raw_data_test, y_train, y_test = train_test_split(amazon, amazon['helpful'],
                                                                 test_size=0.2, random_state = 71)

In [5]:
print(raw_data_train.shape)
print(raw_data_test.shape)
print(y_train.shape)
print(y_test.shape)

(364000, 13)
(91000, 13)
(364000,)
(91000,)


In [6]:
raw_data_train.to_csv('raw_data_train.csv')
raw_data_test.to_csv('raw_data_test.csv')
y_train.to_csv('y_train.csv')
y_test.to_csv('y_test.csv')

### Week 6 notebook Training Set

In [7]:
#import numpy as np
#import pandas as pd
#import matplotlib.pyplot as plt
from sklearn.externals import joblib
#%matplotlib inline

In [8]:
class BinaryClassificationPerformance():
    '''Performance measures to evaluate the fit of a binary classification model'''
    
    def __init__(self, predictions, labels, desc, probabilities=None):
        '''Initialize attributes: predictions-vector of predicted values for Y, labels-vector of labels for Y'''
        '''probabilities-optional, probability that Y is equal to True'''
        self.probabilities = probabilities
        self.performance_df = pd.concat([pd.DataFrame(predictions), pd.DataFrame(labels)], axis=1)
        self.performance_df.columns = ['preds', 'labls']
        self.desc = desc
        self.performance_measures = {}
  
    def compute_measures(self):
        '''Compute performance measures defined by Flach p. 57'''
        self.performance_measures['Pos'] = self.performance_df['preds'].sum()
        self.performance_measures['Neg'] = self.performance_df.shape[0] - self.performance_df['preds'].sum()
        self.performance_measures['TP'] = ((self.performance_df['preds'] == True) & (self.performance_df['labls'] == True)).sum()
        self.performance_measures['TN'] = ((self.performance_df['preds'] == False) & (self.performance_df['labls'] == False)).sum()
        self.performance_measures['FP'] = ((self.performance_df['preds'] == True) & (self.performance_df['labls'] == False)).sum()
        self.performance_measures['FN'] = ((self.performance_df['preds'] == False) & (self.performance_df['labls'] == True)).sum()
        self.performance_measures['Accuracy'] = (self.performance_measures['TP'] + self.performance_measures['TN']) / (self.performance_measures['Pos'] + self.performance_measures['Neg'])

### Read raw training data

In [9]:
amazon = pd.read_csv('raw_data_test.csv')
print(amazon.shape)

(91000, 14)


In [10]:
print(amazon.head())
print(amazon['helpful'].mean())

   Unnamed: 0  Unnamed: 0.1      Id   ProductId          UserId  \
0      400196        411245  411246  B0040WCR6O  A3FFKU2MTCOBM1   
1       38020        110761  110762  B003XUJ3RK   AC2SMT7WEOBQM   
2      366458        192489  192490  B006GA666U  A39FOS1KTT1T8Z   
3       43625        544264  544265  B00125PX8Q  A1XZXAV5OXD08P   
4      211610        494698  494699  B000BZZKVS  A15P774MWM8W4R   

             ProfileName  HelpfulnessNumerator  HelpfulnessDenominator  Score  \
0  new yorker "drealyea"                     0                       0      1   
1                   Bill                     0                       0      5   
2                 kendon                     0                       0      2   
3               Good 4 U                     0                       0      5   
4             D. Dutcher                     8                      11      2   

         Time                            Summary  \
0  1340323200           Overpriced, disapointing   
1  132

### Feature extraction on natural language data

In [13]:
# vectorize Bag of Words from review text; as sparse matrix
#from sklearn.feature_extraction.text import HashingVectorizer
#hv = HashingVectorizer(n_features=2 ** 17, non_negative=True)

#Get the instance
hv = joblib.load('hv.pkl')
X_hv = hv.fit_transform(amazon.Text)
print(X_hv.shape)

(91000, 131072)


In [14]:
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
#from sklearn.feature_extraction.text import TfidfTransformer
#transformer = TfidfTransformer()

#Get the transformer
transformer = joblib.load('transformer.pkl')
X_tfidf = transformer.fit_transform(X_hv)



In [27]:
print(type(X_tfidf))

<class 'scipy.sparse.csr.csr_matrix'>


### Create additional quantitative features

In [30]:
# features from Amazon.csv to add to feature set
amazon['reviewLen'] = amazon['Text'].str.len()

X_quant_features = amazon[["Score", "reviewLen"]]
print(X_quant_features.head(10))




   Score  reviewLen
0      5        519
1      5       1395
2      5        821
3      5         82
4      5        169
5      5        159
6      4        297
7      2        628
8      1        607
9      3       3188


### try person who wrote it
### try sentiment analysis

### Combine all quantitative features into a single sparse matrix

In [32]:
from scipy.sparse import csr_matrix, hstack
X_quant_features_csr = csr_matrix(X_quant_features)
X_combined = hstack([X_tfidf, X_quant_features_csr])
X_matrix = csr_matrix(X_combined) # convert to sparse matrix
print(X_matrix.shape)

(364000, 131074)


### Create `X`, scaled matrix of features


In [34]:
# feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
X = sc.fit_transform(X_matrix)
print(X.shape)

joblib.dump(sc, 'sc.pkl') # pickle

(364000, 131074)


['sc.pkl', 'sc.pkl_01.npy', 'sc.pkl_02.npy', 'sc.pkl_03.npy']

### create `y`, vector of Labels

In [36]:
y = amazon['helpful'].values
print(type(y))

<class 'numpy.ndarray'>


### fit models


In [38]:
# MODEL: SVM, linear
from sklearn import linear_model
svm = linear_model.SGDClassifier()
svm.fit(X, y)
joblib.dump(svm, 'svm.pkl') # pickle

svm_performance = BinaryClassificationPerformance(svm.predict(X), y, 'svm')
svm_performance.compute_measures()
print(svm_performance.performance_measures)

{'Neg': 337477, 'TP': 12194, 'FN': 14426, 'TN': 323051, 'FP': 14329, 'Pos': 26523, 'Accuracy': 0.92100274725274722}


In [39]:
# MODEL: logistic regression
from sklearn import linear_model
lgs = linear_model.SGDClassifier(loss='log', n_iter=50, alpha=0.00001)
lgs.fit(X, y)
joblib.dump(lgs, 'lgs.pkl') # pickle

lgs_performance = BinaryClassificationPerformance(lgs.predict(X), y, 'lgs')
lgs_performance.compute_measures()
print(lgs_performance.performance_measures)

{'Neg': 338281, 'TP': 13336, 'FN': 13284, 'TN': 324997, 'FP': 12383, 'Pos': 25719, 'Accuracy': 0.92948626373626375}


In [40]:
# MODEL: Naive Bayes
from sklearn.naive_bayes import MultinomialNB
nbs = MultinomialNB()
nbs.fit(X, y)
joblib.dump(nbs, 'nbs.pkl') # pickle

nbs_performance = BinaryClassificationPerformance(nbs.predict(X), y, 'nbs')
nbs_performance.compute_measures()
print(nbs_performance.performance_measures)

{'Neg': 305222, 'TP': 17117, 'FN': 9503, 'TN': 295719, 'FP': 41661, 'Pos': 58778, 'Accuracy': 0.85943956043956049}
