In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import SVR, LinearSVR
from sklearn.grid_search import GridSearchCV
from scipy.stats import spearmanr
from sklearn.metrics import make_scorer
from time import time

def simple_spearman(x,y): return np.abs(spearmanr(x,y)[0])
spearmanr_scorer = make_scorer(simple_spearman)

In [2]:
reviews_features = pd.read_csv('data/eletronic_sample_counts.csv.gz')
reviews_features = reviews_features[reviews_features.overall==3]
reviews_features.shape

(2075, 19)

In [3]:
reviews_features.columns

Index([u'Unnamed: 0', u'Unnamed: 0.1', u'asin', u'helpful', u'overall',
       u'reviewText', u'reviewTime', u'reviewerID', u'reviewerName',
       u'summary', u'unixReviewTime', u'helpfulness', u'tot', u'word_count',
       u'sentence_count', u'unigram_count', u'pos_tag', u'adj', u'noun'],
      dtype='object')

In [4]:
df_columns = reviews_features.select_dtypes(include=['float64','int','int64']).columns
df_columns

Index([u'Unnamed: 0', u'Unnamed: 0.1', u'overall', u'unixReviewTime',
       u'helpfulness', u'tot', u'word_count', u'sentence_count',
       u'unigram_count', u'adj', u'noun'],
      dtype='object')

## grid search LinearSVM

In [7]:
features = reviews_features[list(['word_count','sentence_count','unigram_count','adj','noun'])].values
labels = reviews_features["helpfulness"].values

param_grid = [{'epsilon':[10**i for i in range(-4,-1)]},]
grid_search = GridSearchCV(LinearSVR(), param_grid=param_grid, scoring=spearmanr_scorer, cv=10, n_jobs=5)
start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

GridSearchCV took 5.87 seconds for 3 candidates


[mean: 0.23765, std: 0.09746, params: {'epsilon': 0.0001},
 mean: 0.37238, std: 0.05263, params: {'epsilon': 0.001},
 mean: 0.35469, std: 0.11936, params: {'epsilon': 0.01}]

## grid search SVM RBF

In [6]:
gamma_range = 10 ** np.arange(-4,4,1)
C_range = 10.0 ** np.arange(-4,4,1)
param_grid = [{'kernel': ['rbf'], 'gamma': gamma_range, 'C': C_range}]
grid_search = GridSearchCV(SVR(), param_grid=param_grid, scoring=spearmanr_scorer, cv=10, n_jobs=5, verbose=3)

start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

Fitting 10 folds for each of 64 candidates, totalling 640 fits
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] .......... kernel=rbf, C=0.0001, gamma=0, score=0.230829 -   3.2s
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] .......... kernel=rbf, C=0.0001, gamma=0, score=0.222947 -   3.5s
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] .......... kernel=rbf, C=0.0001, gamma=0, score=0.111148 -   3.6s
[CV] .......... kernel=rbf, C=0.0001, gamma=0, score=0.307791 -   3.6s
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] .........

[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:   16.4s


[CV] .......... kernel=rbf, C=0.0001, gamma=0, score=0.222947 -   3.4s
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] .......... kernel=rbf, C=0.0001, gamma=0, score=0.136014 -   3.2s
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] .......... kernel=rbf, C=0.0001, gamma=0, score=0.230829 -   3.5s
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] .......... kernel=rbf, C=0.0001, gamma=0, score=0.241082 -   3.2s
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] .......... kernel=rbf, C=0.0001, gamma=0, score=0.137385 -   3.3s
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] .......... kernel=rbf, C=0.0001, gamma=0, score=0.132021 -   3.2s
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] .......... kernel=rbf, C=0.0001, gamma=0, score=0.279863 -   3.4s
[CV] kernel=rbf, C=0.0001, gamma=0 ...................................
[CV] .

[Parallel(n_jobs=5)]: Done 118 tasks      | elapsed:  1.4min


[CV] ........... kernel=rbf, C=0.001, gamma=0, score=0.285351 -   4.3s
[CV] kernel=rbf, C=0.001, gamma=1 ....................................
[CV] ........... kernel=rbf, C=0.001, gamma=0, score=0.183292 -   3.9s
[CV] kernel=rbf, C=0.001, gamma=1 ....................................
[CV] ........... kernel=rbf, C=0.001, gamma=1, score=0.296710 -   2.9s
[CV] kernel=rbf, C=0.001, gamma=1 ....................................
[CV] ........... kernel=rbf, C=0.001, gamma=1, score=0.092228 -   3.1s
[CV] kernel=rbf, C=0.001, gamma=1 ....................................
[CV] ........... kernel=rbf, C=0.001, gamma=1, score=0.197632 -   3.1s
[CV] kernel=rbf, C=0.001, gamma=1 ....................................
[CV] ........... kernel=rbf, C=0.001, gamma=1, score=0.291821 -   3.3s
[CV] kernel=rbf, C=0.001, gamma=1 ....................................
[CV] ........... kernel=rbf, C=0.001, gamma=1, score=0.109435 -   3.3s
[CV] kernel=rbf, C=0.001, gamma=1 ....................................
[CV] .

[Parallel(n_jobs=5)]: Done 278 tasks      | elapsed:  3.0min


[CV] ............. kernel=rbf, C=0.1, gamma=0, score=0.278797 -   4.5s
[CV] kernel=rbf, C=0.1, gamma=1 ......................................
[CV] ............. kernel=rbf, C=0.1, gamma=0, score=0.179597 -   4.1s
[CV] kernel=rbf, C=0.1, gamma=1 ......................................
[CV] ............. kernel=rbf, C=0.1, gamma=1, score=0.279582 -   2.9s
[CV] kernel=rbf, C=0.1, gamma=1 ......................................
[CV] ............. kernel=rbf, C=0.1, gamma=1, score=0.117691 -   2.8s
[CV] kernel=rbf, C=0.1, gamma=1 ......................................
[CV] ............. kernel=rbf, C=0.1, gamma=1, score=0.178204 -   2.5s
[CV] kernel=rbf, C=0.1, gamma=1 ......................................
[CV] ............. kernel=rbf, C=0.1, gamma=1, score=0.279538 -   2.4s
[CV] kernel=rbf, C=0.1, gamma=1 ......................................
[CV] ............. kernel=rbf, C=0.1, gamma=1, score=0.119572 -   2.3s
[CV] kernel=rbf, C=0.1, gamma=1 ......................................
[CV] .

[Parallel(n_jobs=5)]: Done 502 tasks      | elapsed:  5.7min


[CV] ........... kernel=rbf, C=100.0, gamma=0, score=0.194976 -   4.0s
[CV] kernel=rbf, C=100.0, gamma=0 ....................................
[CV] ........... kernel=rbf, C=100.0, gamma=0, score=0.102349 -   4.4s
[CV] kernel=rbf, C=100.0, gamma=0 ....................................
[CV] ........... kernel=rbf, C=100.0, gamma=0, score=0.193196 -   3.8s
[CV] kernel=rbf, C=100.0, gamma=0 ....................................
[CV] ........... kernel=rbf, C=100.0, gamma=0, score=0.118954 -   3.7s
[CV] kernel=rbf, C=100.0, gamma=0 ....................................
[CV] ........... kernel=rbf, C=100.0, gamma=0, score=0.242103 -   3.5s
[CV] kernel=rbf, C=100.0, gamma=0 ....................................
[CV] ........... kernel=rbf, C=100.0, gamma=0, score=0.086522 -   3.9s
[CV] kernel=rbf, C=100.0, gamma=0 ....................................
[CV] ........... kernel=rbf, C=100.0, gamma=0, score=0.236201 -   3.1s
[CV] kernel=rbf, C=100.0, gamma=0 ....................................
[CV] .

[Parallel(n_jobs=5)]: Done 640 out of 640 | elapsed:  7.2min finished


GridSearchCV took 435.33 seconds for 64 candidates


[mean: 0.19827, std: 0.06476, params: {'kernel': 'rbf', 'C': 0.0001, 'gamma': 0},
 mean: 0.19827, std: 0.06476, params: {'kernel': 'rbf', 'C': 0.0001, 'gamma': 0},
 mean: 0.19827, std: 0.06476, params: {'kernel': 'rbf', 'C': 0.0001, 'gamma': 0},
 mean: 0.19827, std: 0.06476, params: {'kernel': 'rbf', 'C': 0.0001, 'gamma': 0},
 mean: 0.18714, std: 0.07264, params: {'kernel': 'rbf', 'C': 0.0001, 'gamma': 1},
 mean: 0.06432, std: 0.04556, params: {'kernel': 'rbf', 'C': 0.0001, 'gamma': 10},
 mean: nan, std: nan, params: {'kernel': 'rbf', 'C': 0.0001, 'gamma': 100},
 mean: nan, std: nan, params: {'kernel': 'rbf', 'C': 0.0001, 'gamma': 1000},
 mean: 0.20005, std: 0.06545, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 0},
 mean: 0.20005, std: 0.06545, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 0},
 mean: 0.20005, std: 0.06545, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 0},
 mean: 0.20005, std: 0.06545, params: {'kernel': 'rbf', 'C': 0.001, 'gamma': 0},
 mean: 0.19039, std: 0.07687, 