In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import SVR, LinearSVR
from sklearn.grid_search import GridSearchCV
from scipy.stats import spearmanr
from sklearn.metrics import make_scorer
from time import time

def simple_spearman(x, y):
    """Return the absolute Spearman rank correlation between ``x`` and ``y``.

    The sign of the correlation is discarded, so a perfectly inverted ranking
    scores 1.0 just like a perfectly matching one.

    ``spearmanr`` yields NaN when one of the inputs is constant (for example,
    predictions that are identical for every sample). A NaN score would
    propagate into any average taken over several scores, so NaN is reported
    as 0.0 (no measurable rank correlation) instead.
    """
    rho = spearmanr(x, y)[0]
    # nan_to_num maps NaN -> 0.0 and leaves finite correlations untouched.
    return np.nan_to_num(np.abs(rho))
# Wrap the metric with make_scorer so it can be passed as `scoring=` to GridSearchCV below.
spearmanr_scorer = make_scorer(simple_spearman)

In [2]:
# Load the gzipped CSV of review records into a DataFrame and display its (rows, columns) shape.
reviews_features = pd.read_csv('data/eletronic_sample_counts.csv.gz')
reviews_features.shape

(19756, 19)

In [5]:
# Show the column names available in the loaded frame.
reviews_features.columns

Index([u'Unnamed: 0', u'Unnamed: 0.1', u'asin', u'helpful', u'overall',
       u'reviewText', u'reviewTime', u'reviewerID', u'reviewerName',
       u'summary', u'unixReviewTime', u'helpfulness', u'tot', u'word_count',
       u'sentence_count', u'unigram_count', u'pos_tag', u'adj', u'noun'],
      dtype='object')

In [10]:
# Names of the columns whose dtype is float64, int or int64.
df_columns = (
    reviews_features
    .select_dtypes(include=['float64', 'int', 'int64'])
    .columns
)
df_columns

Index([u'Unnamed: 0', u'Unnamed: 0.1', u'overall', u'unixReviewTime',
       u'helpfulness', u'tot', u'word_count', u'sentence_count',
       u'unigram_count', u'adj', u'noun'],
      dtype='object')

## Grid search: LinearSVR

In [11]:
# Feature matrix: the five count columns; target: the `helpfulness` column.
feature_columns = ['word_count', 'sentence_count', 'unigram_count', 'adj', 'noun']
features = reviews_features[feature_columns].values
labels = reviews_features["helpfulness"].values

# Candidate epsilon values: 1e-4, 1e-3, 1e-2.
param_grid = [{'epsilon': [10**i for i in range(-4, -1)]}]
# random_state is pinned so that re-running the cell reproduces the same scores.
grid_search = GridSearchCV(
    LinearSVR(random_state=0),
    param_grid=param_grid,
    scoring=spearmanr_scorer,
    cv=5,
    n_jobs=5,
    verbose=3,
)
start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] ......................... epsilon=0.0001, score=0.287399 -  12.4s
[CV] epsilon=0.001 ...................................................
[CV] ......................... epsilon=0.0001, score=0.055373 -  12.6s
[CV] epsilon=0.001 ...................................................
[CV] ......................... epsilon=0.0001, score=0.364452 -  12.8s
[CV] epsilon=0.001 ...................................................
[CV] ......................... epsilon=0.0001, score=0.131618 -  13.0s
[CV] epsilon=0.001 ...................................................
[CV] ............

[Parallel(n_jobs=5)]: Done  15 out of  15 | elapsed:   35.3s finished


GridSearchCV took 39.60 seconds for 3 candidates


[mean: 0.19226, std: 0.11489, params: {'epsilon': 0.0001},
 mean: 0.33102, std: 0.05025, params: {'epsilon': 0.001},
 mean: 0.26378, std: 0.03323, params: {'epsilon': 0.01}]

## Grid search: SVR with RBF kernel

In [None]:
# Use a float base: an integer base raised to the negative integer exponents
# produced by np.arange is rejected by NumPy (or truncated to 0 on very old
# versions), which would make the gamma grid unusable.
gamma_range = 10.0 ** np.arange(-4, 4, 1)   # 8 values: 1e-4 ... 1e3
C_range = 10.0 ** np.arange(-3, 4, 1)       # 7 values: 1e-3 ... 1e3
param_grid = [{'kernel': ['rbf'], 'gamma': gamma_range, 'C': C_range}]
# 8 x 7 = 56 candidates, each fitted on 5 folds.
grid_search = GridSearchCV(SVR(), param_grid=param_grid, scoring=spearmanr_scorer, cv=5, n_jobs=5, verbose=3)

start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_