In [7]:
# imports and specifications
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

#To validate/cross validate--create training and test data
from sklearn.cross_validation import train_test_split

In [4]:
# Load the source data set (Amazon.csv, expected in the working directory)
amazon = pd.read_csv('Amazon.csv')

In [10]:
# Create four objects: features and labels for TRAIN and for TEST.
# 20% of the rows are held out for testing; the fixed random_state makes the
# split repeatable from run to run.
raw_data_train, raw_data_test, y_train, y_test = train_test_split(
    amazon,
    amazon['helpful'],
    test_size=0.2,
    random_state=71,
)

In [12]:
# Sanity check: print the shape of each piece of the split, one per line
for split_part in (raw_data_train, raw_data_test, y_train, y_test):
    print(split_part.shape)

(364000, 13)
(91000, 13)
(364000,)
(91000,)


In [13]:
# Write each piece of the split to its own CSV file in the working directory.
# to_csv() is called with its defaults, so the row index is written as well;
# reading a file back with a plain read_csv() therefore yields one extra
# 'Unnamed: 0' column.
for split_part, csv_name in [
    (raw_data_train, 'raw_data_train.csv'),
    (raw_data_test, 'raw_data_test.csv'),
    (y_train, 'y_train.csv'),
    (y_test, 'y_test.csv'),
]:
    split_part.to_csv(csv_name)

### Week 6 notebook Training Set

In [14]:
#import numpy as np
#import pandas as pd
#import matplotlib.pyplot as plt
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23.
# Prefer the standalone `joblib` package (installed as a scikit-learn dependency)
# and fall back to the vendored copy on older installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
#%matplotlib inline

In [16]:
class BinaryClassificationPerformance():
    '''Performance measures to evaluate the fit of a binary classification model'''

    def __init__(self, predictions, labels, desc, probabilities=None):
        '''Store predictions and labels side by side for later scoring.

        predictions   -- vector of predicted values for Y
        labels        -- vector of true labels for Y, in the same order as predictions
        desc          -- description of the model/run being evaluated (stored as-is)
        probabilities -- optional, probability that Y is equal to True (stored as-is)

        Raises ValueError if predictions and labels differ in length.
        '''
        self.probabilities = probabilities
        # Pair predictions and labels by POSITION. Going through plain arrays
        # discards any pandas index, so a label Series with a shuffled index
        # (e.g. one produced by train_test_split) cannot be re-aligned against
        # the predictions and silently padded with NaN rows.
        preds = np.asarray(predictions).ravel()
        labls = np.asarray(labels).ravel()
        if preds.shape[0] != labls.shape[0]:
            raise ValueError(
                'predictions and labels must have the same length '
                '(got %d and %d)' % (preds.shape[0], labls.shape[0]))
        self.performance_df = pd.DataFrame({'preds': preds, 'labls': labls},
                                           columns=['preds', 'labls'])
        self.desc = desc
        self.performance_measures = {}

    def compute_measures(self):
        '''Compute performance measures defined by Flach p. 57

        Fills self.performance_measures with:
          Pos, Neg        -- number of actual positive / negative examples (from labels)
          TP, TN, FP, FN  -- confusion-matrix counts
          Accuracy        -- (TP + TN) / (Pos + Neg); NaN when there are no examples
        '''
        pred_pos = self.performance_df['preds'] == True
        pred_neg = self.performance_df['preds'] == False
        labl_pos = self.performance_df['labls'] == True
        labl_neg = self.performance_df['labls'] == False
        total = self.performance_df.shape[0]

        # Pos/Neg describe the true class distribution, so they are counted
        # from the labels rather than from the predictions.
        self.performance_measures['Pos'] = labl_pos.sum()
        self.performance_measures['Neg'] = total - self.performance_measures['Pos']
        self.performance_measures['TP'] = (pred_pos & labl_pos).sum()
        self.performance_measures['TN'] = (pred_neg & labl_neg).sum()
        self.performance_measures['FP'] = (pred_pos & labl_neg).sum()
        self.performance_measures['FN'] = (pred_neg & labl_pos).sum()

        correct = self.performance_measures['TP'] + self.performance_measures['TN']
        # float() guards against integer division; an empty input yields NaN
        # instead of a division-by-zero warning or error.
        self.performance_measures['Accuracy'] = float(correct) / total if total else float('nan')

### Read raw training data

In [18]:
# Reload the training split that was written to disk earlier in this notebook.
# NOTE: this rebinds `amazon` -- from here on it holds only the training rows.
# The file was saved together with its row index, so the reloaded frame has one
# more column ('Unnamed: 0') than raw_data_train had in memory.
amazon = pd.read_csv('raw_data_train.csv')
print(amazon.shape)

(364000, 14)


In [19]:
# Show the first rows, then the mean of the 'helpful' label column
# (presumably the share of reviews labelled helpful -- confirm the column's encoding)
print(amazon.head())
print(amazon['helpful'].mean())

   Unnamed: 0  Unnamed: 0.1      Id   ProductId          UserId  \
0      188941         73792   73793  B000HDOPZG   AWIW6ZQ47MNJH   
1      220592        200455  200456  B008O2EHNC  A21S0K5PU4YO9L   
2       20265        529969  529970  B000O2APH2  A3621NVN1FSGMO   
3      265113        197131  197132  B000FNH1C2  A2SHXT0YBG49TO   
4       14678         57724   57725  B000EVOSE4  A1JFIH71386GBV   

                          ProfileName  HelpfulnessNumerator  \
0                          S F Norman                     1   
1                      third time mom                    17   
2  Norman J. Pieniazek "Orchid lover"                     1   
3                            K. Lantz                     1   
4             Jennifer Anderson "Jen"                     0   

   HelpfulnessDenominator  Score        Time  \
0                       1      5  1333929600   
1                      17      5  1338854400   
2                       1      5  1245888000   
3                       1 

### Feature extraction on natural language data

In [20]:
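# (Disabled) Alternative bag-of-words extraction with CountVectorizer, kept for
# reference only; the HashingVectorizer cell below is the one that is used.
# Note that .toarray() converts the sparse result into a dense array.
# # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer()
# corpus = amazon.Text.as_matrix()
# X_bag_of_words = vectorizer.fit_transform(corpus)
# print(X_bag_of_words.toarray())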

In [22]:
# vectorize Bag of Words from review text; as sparse matrix
from sklearn.feature_extraction.text import HashingVectorizer
# 2 ** 17 hash buckets -> 131072 feature columns.
# `non_negative=True` was deprecated in scikit-learn 0.19 and removed in 0.21.
# `alternate_sign=False` is the supported way to get non-negative features;
# fall back to the old keyword on versions that predate `alternate_sign`.
try:
    hv = HashingVectorizer(n_features=2 ** 17, alternate_sign=False)
except TypeError:
    hv = HashingVectorizer(n_features=2 ** 17, non_negative=True)
X_hv = hv.fit_transform(amazon.Text)
print(X_hv.shape)

(364000, 131072)


In [23]:
# We want to be able to apply this same vectorizer to other data (the test set),
# so save a copy of this HashingVectorizer instance to disk for later reuse.
# http://scikit-learn.org/stable/modules/model_persistence.html
# NOTE: joblib files are pickles -- only load them from sources you trust.
joblib.dump(hv, 'hv.pkl') # pickle

['hv.pkl']

In [25]:
# Fit a TF-IDF transformer on the hashed training features and apply it to them.
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
X_tfidf = transformer.fit_transform(X_hv)

# Save the fitted transformer to disk, like the vectorizer above
joblib.dump(transformer, 'transformer.pkl') # pickle

['transformer.pkl', 'transformer.pkl_01.npy', 'transformer.pkl_02.npy']

In [27]:
# Check the container type returned by the TF-IDF transform
print(type(X_tfidf))

<class 'scipy.sparse.csr.csr_matrix'>


### Create additional quantitative features

In [30]:
# features from Amazon.csv to add to feature set
# reviewLen: length of each review's Text, via pandas .str.len();
# this adds a new column to `amazon` in place
amazon['reviewLen'] = amazon['Text'].str.len()

# Keep the two quantitative columns together and preview the first 10 rows
X_quant_features = amazon[["Score", "reviewLen"]]
print(X_quant_features.head(10))




   Score  reviewLen
0      5        519
1      5       1395
2      5        821
3      5         82
4      5        169
5      5        159
6      4        297
7      2        628
8      1        607
9      3       3188


### TODO: try a feature based on the person who wrote the review
### TODO: try sentiment analysis