In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import SVR, LinearSVR
from sklearn.grid_search import GridSearchCV
from scipy.stats import spearmanr
from sklearn.metrics import make_scorer
from time import time

def simple_spearman(x,y): return np.abs(spearmanr(x,y)[0])
spearmanr_scorer = make_scorer(simple_spearman)

In [7]:
reviews_features = pd.read_csv('data/eletronic_sample_counts.csv.gz')
reviews_features = reviews_features[reviews_features.overall==3]
reviews_features.shape

(2075, 19)

In [8]:
reviews_features.columns

Index([u'Unnamed: 0', u'Unnamed: 0.1', u'asin', u'helpful', u'overall',
       u'reviewText', u'reviewTime', u'reviewerID', u'reviewerName',
       u'summary', u'unixReviewTime', u'helpfulness', u'tot', u'word_count',
       u'sentence_count', u'unigram_count', u'pos_tag', u'adj', u'noun'],
      dtype='object')

In [9]:
df_columns = reviews_features.select_dtypes(include=['float64','int','int64']).columns
df_columns

Index([u'Unnamed: 0', u'Unnamed: 0.1', u'overall', u'unixReviewTime',
       u'helpfulness', u'tot', u'word_count', u'sentence_count',
       u'unigram_count', u'adj', u'noun'],
      dtype='object')

## grid search LinearSVM

In [10]:
features = reviews_features[list(['word_count','sentence_count','unigram_count','adj','noun'])].values
labels = reviews_features["helpfulness"].values

param_grid = [{'epsilon':[10**i for i in range(-4,-1)]},]
grid_search = GridSearchCV(LinearSVR(), param_grid=param_grid, scoring=spearmanr_scorer, cv=5, n_jobs=5, verbose=3)
start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] ......................... epsilon=0.0001, score=0.450530 -   0.8s
[CV] epsilon=0.001 ...................................................
[CV] ......................... epsilon=0.0001, score=0.359869 -   0.8s
[CV] epsilon=0.001 ...................................................
[CV] ......................... epsilon=0.0001, score=0.393454 -   0.9s
[CV] epsilon=0.001 ...................................................
[CV] ......................... epsilon=0.0001, score=0.419248 -   0.9s
[CV] epsilon=0.001 ...................................................
[CV] ............

[Parallel(n_jobs=5)]: Done  15 out of  15 | elapsed:    3.0s finished


GridSearchCV took 3.56 seconds for 3 candidates


[mean: 0.39959, std: 0.03227, params: {'epsilon': 0.0001},
 mean: 0.34223, std: 0.09847, params: {'epsilon': 0.001},
 mean: 0.22808, std: 0.10260, params: {'epsilon': 0.01}]

## grid search SVM RBF

In [12]:
gamma_range = 10 ** np.arange(-4,4,1)
C_range = 10.0 ** np.arange(-4,4,1)
param_grid = [{'kernel': ['rbf'], 'gamma': gamma_range, 'C': C_range}]
grid_search = GridSearchCV(SVR(), param_grid=param_grid, scoring=spearmanr_scorer, cv=5, n_jobs=5, verbose=3)

start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] .......... kernel=rbf, C=0.0001, gamma=0, score=0.244784 -   4.0s
[CV] .......... kernel=rbf, C=0.0001, gamma=0, score=0.144896 -   4.0s
[CV] .......... kernel=rbf, C=0.0001, gamma=0, score=0.187381 -   4.0s
[CV] .......... kernel=rbf, C=0.0001, gamma=0, score=0.191689 -   4.1s
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] .......... kernel=rbf, C=0.0001, gamma=0, score=0.200493 -   4.0s
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] kernel=rbf

[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:   15.0s


[CV] .......... kernel=rbf, C=0.0001, gamma=1, score=0.188140 -   2.2s
[CV] kernel=rbf, C=0.0001, gamma=10 ..................................
[CV] .......... kernel=rbf, C=0.0001, gamma=1, score=0.131657 -   2.4s
[CV] kernel=rbf, C=0.0001, gamma=10 ..................................
[CV] ......... kernel=rbf, C=0.0001, gamma=10, score=0.127145 -   2.1s
[CV] kernel=rbf, C=0.0001, gamma=10 ..................................
[CV] .......... kernel=rbf, C=0.0001, gamma=1, score=0.166986 -   2.7s
[CV] kernel=rbf, C=0.0001, gamma=100 .................................
[CV] ......... kernel=rbf, C=0.0001, gamma=10, score=0.004099 -   2.4s
[CV] kernel=rbf, C=0.0001, gamma=100 .................................
[CV] ......... kernel=rbf, C=0.0001, gamma=10, score=0.036446 -   2.3s
[CV] kernel=rbf, C=0.0001, gamma=100 .................................
[CV] ......... kernel=rbf, C=0.0001, gamma=10, score=0.001452 -   2.2s
[CV] kernel=rbf, C=0.0001, gamma=100 .................................
[CV] .

[Parallel(n_jobs=5)]: Done 118 tasks      | elapsed:  1.1min


[CV] .............. kernel=rbf, C=0.01, gamma=1000, score=nan -   1.8s
[CV] kernel=rbf, C=0.1, gamma=0 ......................................
[CV] .............. kernel=rbf, C=0.01, gamma=1000, score=nan -   1.9s
[CV] kernel=rbf, C=0.1, gamma=0 ......................................
[CV] ............. kernel=rbf, C=0.1, gamma=0, score=0.184622 -   2.3s
[CV] ............. kernel=rbf, C=0.1, gamma=0, score=0.209909 -   2.6s
[CV] kernel=rbf, C=0.1, gamma=0 ......................................
[CV] kernel=rbf, C=0.1, gamma=0 ......................................
[CV] ............. kernel=rbf, C=0.1, gamma=0, score=0.243772 -   2.9s
[CV] kernel=rbf, C=0.1, gamma=0 ......................................
[CV] ............. kernel=rbf, C=0.1, gamma=0, score=0.157489 -   2.4s
[CV] kernel=rbf, C=0.1, gamma=0 ......................................
[CV] ............. kernel=rbf, C=0.1, gamma=0, score=0.198083 -   2.5s
[CV] kernel=rbf, C=0.1, gamma=0 ......................................
[CV] .

[Parallel(n_jobs=5)]: Done 278 tasks      | elapsed:  2.5min


[CV] ............. kernel=rbf, C=100.0, gamma=1000, score=nan -   2.8s
[CV] kernel=rbf, C=1000.0, gamma=0 ...................................
[CV] ............. kernel=rbf, C=100.0, gamma=1000, score=nan -   2.5s
[CV] kernel=rbf, C=1000.0, gamma=0 ...................................
[CV] .......... kernel=rbf, C=1000.0, gamma=0, score=0.202003 -   3.6s
[CV] kernel=rbf, C=1000.0, gamma=0 ...................................
[CV] .......... kernel=rbf, C=1000.0, gamma=0, score=0.164263 -   4.3s
[CV] kernel=rbf, C=1000.0, gamma=0 ...................................
[CV] .......... kernel=rbf, C=1000.0, gamma=0, score=0.141901 -   3.7s
[CV] kernel=rbf, C=1000.0, gamma=0 ...................................
[CV] .......... kernel=rbf, C=1000.0, gamma=0, score=0.161439 -   3.5s
[CV] kernel=rbf, C=1000.0, gamma=0 ...................................
[CV] .......... kernel=rbf, C=1000.0, gamma=0, score=0.126215 -   3.8s
[CV] kernel=rbf, C=1000.0, gamma=0 ...................................
[CV] .

[Parallel(n_jobs=5)]: Done 320 out of 320 | elapsed:  2.9min finished


GridSearchCV took 176.55 seconds for 64 candidates


[mean: 0.19385, std: 0.03188, params: {'kernel': 'rbf', 'C': 0.0001, 'gamma': 0},
 mean: 0.19385, std: 0.03188, params: {'kernel': 'rbf', 'C': 0.0001, 'gamma': 0},
 mean: 0.19385, std: 0.03188, params: {'kernel': 'rbf', 'C': 0.0001, 'gamma': 0},
 mean: 0.19385, std: 0.03188, params: {'kernel': 'rbf', 'C': 0.0001, 'gamma': 0},
 mean: 0.17743, std: 0.03526, params: {'kernel': 'rbf', 'C': 0.0001, 'gamma': 1},
 mean: 0.04015, std: 0.04572, params: {'kernel': 'rbf', 'C': 0.0001, 'gamma': 10},
 mean: nan, std: nan, params: {'kernel': 'rbf', 'C': 0.0001, 'gamma': 100},
 mean: nan, std: nan, params: {'kernel': 'rbf', 'C': 0.0001, 'gamma': 1000},
 mean: 0.19551, std: 0.03275, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 0},
 mean: 0.19551, std: 0.03275, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 0},
 mean: 0.19551, std: 0.03275, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 0},
 mean: 0.19551, std: 0.03275, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 0},
 mean: 0.17899, std: 0.04246, 