# Training Set

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.externals import joblib
%matplotlib inline   

In [2]:
import os 
os.system('ps aux | grep wolfm2')
#os.system('killall -s SIGKILL -u wolfm2')
#os.system('cp /home/wolfm2/job.sh .; echo test 1>&2') #; cp ../job.log ../jerbb.txt')

0

### Read raw training data

In [3]:
amazon = pd.read_csv('/home/wolfm2/amazon_data/raw_data_train.csv')
print(amazon.shape)

(364000, 14)


In [4]:
print(amazon.head())
print(amazon['helpful'].mean())

   Unnamed: 0  Unnamed: 0.1      Id   ProductId          UserId  \
0      150581        487850  487851  B0025UCD76  A28B2M0XRXHXIG   
1      334018         21518   21519  B002QWP89S   A7JJX3KMDZD2F   
2       76657        319457  319458  B001GVIUX6  A2S8RJ6DRKGYON   
3      357903        248851  248852  B0009JRH1C  A1FLQ698D9C0C8   
4      301824        394613  394614  B001B4VOQI  A2KJO9EPX17ZXE   

                   ProfileName  HelpfulnessNumerator  HelpfulnessDenominator  \
0                         B622                     0                       0   
1  Shinichi Isozaki "shincyan"                     1                       2   
2                   M. Ronning                     1                       2   
3                     G. Zhang                     4                       8   
4                    Musical E                     0                       0   

   Score        Time                                            Summary  \
0      5  1313020800                     

### Feature extraction on natural language data

In [5]:
# # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer()
# corpus = amazon.Text.as_matrix()
# X_bag_of_words = vectorizer.fit_transform(corpus)
# print(X_bag_of_words.toarray())

In [6]:
import nltk
# nltk.download('punkt')
# nltk.download('popular')

from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [7]:
# vectorize Bag of Words from review text; as sparse matrix
from sklearn.feature_extraction.text import HashingVectorizer
#hv = HashingVectorizer(n_features=2 ** 17, non_negative=True)
#  analyzer=stemmed_words,

# look at the text 
hv0 = HashingVectorizer(n_features=2 ** 19, non_negative=True, strip_accents=ascii, tokenizer=LemmaTokenizer(), stop_words={'english'}, 
                           ngram_range=(1,3)) #, token_pattern = r'\b[a-zA-Z0-9]{3,}\b')
X_hv0 = hv0.fit_transform(amazon.Text) # mw adds uid as token

# and a second domain where we look at the summary
amazon['summaryFilter'] = amazon['Summary'].apply(lambda x: " " if x is np.nan else x) # some were np.nans
hv1 = HashingVectorizer(n_features=2 ** 18, non_negative=True, strip_accents=ascii, tokenizer=LemmaTokenizer(), stop_words={'english'}, 
                           ngram_range=(1,3), token_pattern = r'\b[a-zA-Z0-9]{3,}\b')
X_hv1 = hv1.fit_transform(amazon.summaryFilter) 

# Another hash domain we want to count but not scale
amazon['timeFilter'] = amazon['Time'].apply(lambda x: str(int(x)%(86400 * 7))) # converts to day of week
hv2 = HashingVectorizer(n_features=2 ** 17, non_negative=True, strip_accents=ascii, 
                           ngram_range=(1,1)) 
X_hv2 = hv2.fit_transform(amazon.timeFilter + " " + amazon.ProductId + " " + amazon.UserId) # mw adds uid as token


# hv0 = HashingVectorizer(n_features=2 ** 17, non_negative=True)
# X_hv0 = hv0.fit_transform(amazon.ProductId + " " + amazon.UserId + " " + amazon.Text) # mw adds uid as token

# amazon['summaryFilter'] = amazon['Summary'].apply(lambda x: " " if x is np.nan else x) # some were np.nans

# hv1 = HashingVectorizer(n_features=2 ** 17, non_negative=True)
# X_hv1 = hv1.fit_transform(amazon.summaryFilter)

import scipy.sparse as sp
X_hv = sp.hstack([X_hv0, X_hv1], format='csr')

print(X_hv.shape)







(364000, 786432)


In [8]:
# x = amazon.UserId + " " +  amazon.Text
# x.head(10)

In [9]:
# We want to be able to use this model fit on other data (the test set)
# So let's save a copy of this instance of HashingVectorizer to be able to transform other data with this fit
# http://scikit-learn.org/stable/modules/model_persistence.html
joblib.dump(hv0, 'hv0.pkl') # pickle
joblib.dump(hv1, 'hv1.pkl') # pickle
joblib.dump(hv2, 'hv2.pkl') # pickle

['hv2.pkl']

In [10]:
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
X_tfidf = transformer.fit_transform(X_hv)

joblib.dump(transformer, 'transformer.pkl') # pickle

['transformer.pkl']

In [11]:
print(type(X_tfidf))

<class 'scipy.sparse.csr.csr_matrix'>


### Create additional quantitative features

In [12]:
# features from Amazon.csv to add to feature set
amazon['reviewLen'] = amazon['Text'].str.len()
amazon['summaryLen'] = amazon['summaryFilter'].str.len()

amazon['rlMeanDist'] = amazon['reviewLen'].apply(lambda x: abs(x-80)) # 80 is avg summary len. Thx George!
amazon['slMeanDist'] = amazon['summaryLen'].apply(lambda x: abs(x-8)) # 8. just guessing here.

#import zlib
#amazon['nameHash'] = zlib.crc32(str(amazon['UserId']).encode('utf8'))
#amazon['nameHash'] = amazon['UserId'].apply(lambda x: zlib.crc32(str(x).encode('utf8'))) # bad. don't do it this way

X_quant_features = amazon[["Score", "reviewLen", "summaryLen", "rlMeanDist", "slMeanDist"]]
print(X_quant_features.head(10))
print(type(X_quant_features))

   Score  reviewLen  summaryLen  rlMeanDist  slMeanDist
0      5        110          10          30           2
1      5        140          30          60          22
2      2        471          55         391          47
3      5      10800          33       10720          25
4      5        152          30          72          22
5      4        231          60         151          52
6      5        271          22         191          14
7      5        320          19         240          11
8      2        362          58         282          50
9      5        283          16         203           8
<class 'pandas.core.frame.DataFrame'>


### Combine all quantitative features into a single sparse matrix

In [13]:
from scipy.sparse import csr_matrix, hstack
X_quant_features_csr = csr_matrix(X_quant_features)
X_combined = hstack([X_tfidf, X_quant_features_csr, X_hv2])  # we dont want to penalize hv2 w tfidf MW
X_matrix = csr_matrix(X_combined) # convert to sparse matrix
print(X_matrix.shape)

(364000, 917509)


### Create `X`, scaled matrix of features

In [14]:
# feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
X = sc.fit_transform(X_matrix)
print(X.shape)

joblib.dump(sc, 'sc.pkl') # pickle

(364000, 917509)


['sc.pkl']

### create `y`, vector of Labels

In [15]:
y = amazon['helpful'].values
print(type(y))

<class 'numpy.ndarray'>


### fit models

In [16]:
from my_measures import BinaryClassificationPerformance

In [17]:
# # MODEL: SVM, linear
# from sklearn import linear_model
# svm = linear_model.SGDClassifier()
# svm.fit(X, y)
# joblib.dump(svm, 'svm.pkl') # pickle

# svm_performance = BinaryClassificationPerformance(svm.predict(X), y, 'svm')
# svm_performance.compute_measures()
# print(svm_performance.performance_measures)

In [18]:
# # MODEL: logistic regression
# from sklearn import linear_model
# #lgs = linear_model.SGDClassifier(loss='log', n_iter=50, alpha=0.00001)
# lgs = linear_model.SGDClassifier(loss='log', n_iter=1000, alpha=0.1)

# lgs.fit(X, y)
# joblib.dump(lgs, 'lgs.pkl') # pickle

# lgs_performance = BinaryClassificationPerformance(lgs.predict(X), y, 'lgs')
# lgs_performance.compute_measures()
# print(lgs_performance.performance_measures)

In [19]:
# # MODEL: Naive Bayes
# from sklearn.naive_bayes import MultinomialNB
# nbs = MultinomialNB()
# nbs.fit(X, y)
# joblib.dump(nbs, 'nbs.pkl') # pickle

# nbs_performance = BinaryClassificationPerformance(nbs.predict(X), y, 'nbs')
# nbs_performance.compute_measures()
# print(nbs_performance.performance_measures)

In [20]:
# # MODEL: Ridge Regression Classifier
# from sklearn import linear_model
# rdg = linear_model.RidgeClassifier()
# rdg.fit(X, y)
# joblib.dump(rdg, 'rdg.pkl') # pickle

# rdg_performance = BinaryClassificationPerformance(rdg.predict(X), y, 'rdg')
# rdg_performance.compute_measures()
# print(rdg_performance.performance_measures)

In [21]:
# # MODEL: Perceptron
# from sklearn import linear_model
# prc = linear_model.SGDClassifier(loss='perceptron')
# prc.fit(X, y)
# joblib.dump(prc, 'prc.pkl') # pickle

# prc_performance = BinaryClassificationPerformance(prc.predict(X), y, 'prc')
# prc_performance.compute_measures()
# print(prc_performance.performance_measures)

In [22]:
import json
import datetime
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB

from sklearn.neural_network import MLPClassifier # mw

# prepare a range of alpha values to test
# alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
alphas = np.array([1, 0.1, 0.01, 0.001, 0.0001, 0.00001])
Cs = np.array([0.001, 0.01, 0.1, 1, 10, 100, 1000])
# create and fit a ridge regression model, testing each alpha
# model = linear_model.SGDClassifier(loss='perceptron', max_iter=50) # max_iter 1000

mlp = MLPClassifier(random_state=0)
svm = linear_model.SGDClassifier(n_iter=1000)
lgs = linear_model.SGDClassifier(loss='log', n_iter=1000)
nbs = MultinomialNB()
rdg = linear_model.RidgeClassifier()
prc = linear_model.SGDClassifier(loss='perceptron', n_iter=1000)

for model in [[svm,"svm"], [lgs,"lgs"], [prc,"prc"], [nbs,"nbs"], [rdg,"rdg"]]: 
# for model in []: 
# for model in [rdg]:    
  fh = open("GridSearch.txt", "a")
  grid = GridSearchCV(estimator=model[0], param_grid=dict(alpha=alphas), n_jobs=2) #
  grid.fit(X, y)
  print(grid)
  # summarize the results of the grid search
  print(grid.cv_results_)
  print(grid.best_score_)
  print(grid.best_estimator_.alpha)

  fh.write('\n########\n')
  fh.write(str(datetime.datetime.now()))
  fh.write('\n########\n')
  fh.write(str(model[0]) + '\n')  
  fh.write(str(grid.cv_results_).replace(", '", ",\n'") + '\n')
  fh.write(str(grid.best_score_) + '\n')  
  fh.write(str(grid.best_estimator_.alpha) + '\n')
  fh.close()

  # MODEL: BEST
  best = grid.best_estimator_

  best.fit(X, y)
  joblib.dump(best, 'best.{}.pkl'.format(model[1])) # pickle

  best_performance = BinaryClassificationPerformance(best.predict(X), y, 'best')
  best_performance.compute_measures()
  print(best_performance.performance_measures)
  







































GridSearchCV(cv=None, error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=1000,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'alpha': array([  1.00000e+00,   1.00000e-01,   1.00000e-02,   1.00000e-03,
         1.00000e-04,   1.00000e-05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
{'mean_fit_time': array([ 2513.23689262,  2499.95041291,  2497.95401637,  2513.98580631,
        2497.55488173,  2471.29775286]), 'std_fit_time': array([ 16.54327622,  12.61934305,  15.50059945,  17.37879491,
        19.0491099 ,  14.34067115]), 'mean_score_time': array([ 1.22955362,  1.30818216,  1.20221313,  1.24536784,  1.232784



{'Pos': 26647, 'Neg': 337353, 'TP': 26143, 'TN': 337327, 'FP': 26, 'FN': 504, 'Accuracy': 0.99854395604395607, 'Precision': 0.99900645802285148, 'Recall': 0.98108605096258494, 'desc': 'best'}








































GridSearchCV(cv=None, error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=None, n_iter=1000,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'alpha': array([  1.00000e+00,   1.00000e-01,   1.00000e-02,   1.00000e-03,
         1.00000e-04,   1.00000e-05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
{'mean_fit_time': array([ 2891.60017276,  2890.00271749,  2858.03176506,  2926.17372123,
        2809.0748469 ,  2772.55405347]), 'std_fit_time': array([ 24.37347354,   6.49542949,  30.53582074,  19.41614357,
        13.343644  ,   8.84672036]), 'mean_score_time': array([ 1.30648287,  1.22621608,  1.24368231,  1.1911068 ,  1.22464561



{'Pos': 26647, 'Neg': 337353, 'TP': 26501, 'TN': 337338, 'FP': 15, 'FN': 146, 'Accuracy': 0.99955769230769231, 'Precision': 0.99943430381656362, 'Recall': 0.99452095920741546, 'desc': 'best'}








































GridSearchCV(cv=None, error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='perceptron', max_iter=None,
       n_iter=1000, n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'alpha': array([  1.00000e+00,   1.00000e-01,   1.00000e-02,   1.00000e-03,
         1.00000e-04,   1.00000e-05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
{'mean_fit_time': array([ 2363.17973932,  2330.3597362 ,  2389.79790775,  2354.92683005,
        2411.35398134,  2400.76961613]), 'std_fit_time': array([  4.03379046,  48.28158492,   5.10521196,  50.49719111,
        21.08067747,  17.29612845]), 'mean_score_time': array([ 1.16373118,  1.20839723,  1.1319387 ,  1.1865507 ,  1.1



{'Pos': 26647, 'Neg': 337353, 'TP': 26641, 'TN': 337231, 'FP': 122, 'FN': 6, 'Accuracy': 0.99964835164835164, 'Precision': 0.99544146769794117, 'Recall': 0.99977483394003075, 'desc': 'best'}


GridSearchCV(cv=None, error_score='raise',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'alpha': array([  1.00000e+00,   1.00000e-01,   1.00000e-02,   1.00000e-03,
         1.00000e-04,   1.00000e-05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
{'mean_fit_time': array([ 17.4908127 ,  17.36153571,  18.20632513,  17.60894529,
        18.19629669,  19.13013204]), 'std_fit_time': array([ 0.34601582,  0.84865423,  0.85637034,  0.76416732,  0.48943123,
        0.47556047]), 'mean_score_time': array([ 2.76591516,  2.53903723,  2.6690534 ,  2.73185579,  2.7658968 ,
        2.94625457]), 'std_score_time': array([ 0.11146337,  0.07639836,  0.07262699,  0.08822257,  0.09161791,
        0.07540718]), 'param_alpha': masked_array(data = [1.0 0.10000000000000001 0.01 0.001 0.0001 1.0000000000000001e-05],
             mask = [False False False Fal

{'Pos': 26647, 'Neg': 337353, 'TP': 26533, 'TN': 335760, 'FP': 1593, 'FN': 114, 'Accuracy': 0.99531043956043952, 'Precision': 0.94336201379506501, 'Recall': 0.99572184486058468, 'desc': 'best'}


GridSearchCV(cv=None, error_score='raise',
       estimator=RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=None, solver='auto',
        tol=0.001),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'alpha': array([  1.00000e+00,   1.00000e-01,   1.00000e-02,   1.00000e-03,
         1.00000e-04,   1.00000e-05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
{'mean_fit_time': array([ 9689.98248696,  4231.48856195,  1652.46376975,  1245.72842964,
        1212.90305376,  1201.69976815]), 'std_fit_time': array([ 2189.45656102,   975.21519271,   320.53345847,   245.82696036,
         290.44430492,   274.13659547]), 'mean_score_time': array([ 0.99151977,  0.25327977,  0.13016494,  0.10057418,  0.09996764,
        0.09316635]), 'std_score_time': array([ 0.20606783,  0.0820606 ,  0.01850097,  0.00128297,  0.0008448 ,
        0.01048558]), 'p

{'Pos': 26647, 'Neg': 337353, 'TP': 26616, 'TN': 337339, 'FP': 14, 'FN': 31, 'Accuracy': 0.9998763736263736, 'Precision': 0.99947427713105519, 'Recall': 0.99883664202349232, 'desc': 'best'}


In [23]:
'''
pg = {'learning_rate': ["constant", "invscaling", "adaptive"],
'hidden_layer_sizes': [(100,1), (100,2), (100,3)],
#'alpha': [10.0 ** -np.arange(1, 7)],
'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
'activation': ["logistic", "relu", "Tanh"],
'tol': [1e-2, 1e-4, 1e-6],
'epsilon': [1e-3, 1e-7, 1e-8, 1e-9, 1e-8]
}

fh = open("GridSearch.txt", "a")
grid = GridSearchCV(estimator=mlp, param_grid=pg, n_jobs=2) #
grid.fit(X, y)
print(grid)
# summarize the results of the grid search
print(grid.cv_results_)
print(grid.best_score_)
print(grid.best_estimator_.alpha)

fh.write('\n########\n')
fh.write(str(datetime.datetime.now()))
fh.write('\n########\n')
fh.write(str(model) + '\n')  
fh.write(str(grid.cv_results_).replace(", '", ",\n'") + '\n')
fh.write(str(grid.best_score_) + '\n')  
fh.write(str(grid.best_estimator_.alpha) + '\n')
fh.close()
'''

'\npg = {\'learning_rate\': ["constant", "invscaling", "adaptive"],\n\'hidden_layer_sizes\': [(100,1), (100,2), (100,3)],\n#\'alpha\': [10.0 ** -np.arange(1, 7)],\n\'alpha\': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],\n\'activation\': ["logistic", "relu", "Tanh"],\n\'tol\': [1e-2, 1e-4, 1e-6],\n\'epsilon\': [1e-3, 1e-7, 1e-8, 1e-9, 1e-8]\n}\n\nfh = open("GridSearch.txt", "a")\ngrid = GridSearchCV(estimator=mlp, param_grid=pg, n_jobs=2) #\ngrid.fit(X, y)\nprint(grid)\n# summarize the results of the grid search\nprint(grid.cv_results_)\nprint(grid.best_score_)\nprint(grid.best_estimator_.alpha)\n\nfh.write(\'\n########\n\')\nfh.write(str(datetime.datetime.now()))\nfh.write(\'\n########\n\')\nfh.write(str(model) + \'\n\')  \nfh.write(str(grid.cv_results_).replace(", \'", ",\n\'") + \'\n\')\nfh.write(str(grid.best_score_) + \'\n\')  \nfh.write(str(grid.best_estimator_.alpha) + \'\n\')\nfh.close()\n'

In [24]:
'''
# MODEL: BEST
best = grid.best_estimator_

best.fit(X, y)
joblib.dump(best, 'best.pkl') # pickle

best_performance = BinaryClassificationPerformance(best.predict(X), y, 'best')
best_performance.compute_measures()
print(best_performance.performance_measures)
'''

"\n# MODEL: BEST\nbest = grid.best_estimator_\n\nbest.fit(X, y)\njoblib.dump(best, 'best.pkl') # pickle\n\nbest_performance = BinaryClassificationPerformance(best.predict(X), y, 'best')\nbest_performance.compute_measures()\nprint(best_performance.performance_measures)\n"

### ROC plot to compare performance of various models and fits

In [25]:
# #fits = [svm_performance, lgs_performance, nbs_performance, rdg_performance, prc_performance]
# fits = [svm_performance, lgs_performance, rdg_performance, prc_performance]

# for fit in fits:
#     plt.plot(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
#              fit.performance_measures['TP'] / fit.performance_measures['Pos'], 'ro')
#     plt.text(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
#              fit.performance_measures['TP'] / fit.performance_measures['Pos'], fit.desc)
# plt.axis([0, 1, 0, 1])
# plt.title('ROC plot: training set')
# plt.xlabel('False positive rate')
# plt.ylabel('True positive rate')
# plt.show()