In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import SVR, LinearSVR
from sklearn.grid_search import GridSearchCV
from scipy.stats import spearmanr
from sklearn.metrics import make_scorer
from time import time

def simple_spearman(x,y): return np.abs(spearmanr(x,y)[0])
spearmanr_scorer = make_scorer(simple_spearman)

In [9]:
reviews_features = pd.read_csv('data/eletronic_sample_counts.csv.gz')
reviews_features.shape

(19756, 19)

In [10]:
df_columns = reviews_features.select_dtypes(include=['float64','int','int64']).columns
df_columns

Index([u'Unnamed: 0', u'Unnamed: 0.1', u'overall', u'unixReviewTime',
       u'helpfulness', u'tot', u'word_count', u'sentence_count',
       u'unigram_count', u'adj', u'noun'],
      dtype='object')

## try LinearSVM: word_count

In [11]:
features = reviews_features[list(['word_count'])].values
labels = reviews_features["helpfulness"].values

param_grid = [{'epsilon':[10**i for i in range(-4,-1)]},]
grid_search = GridSearchCV(LinearSVR(), param_grid=param_grid, scoring=spearmanr_scorer, cv=5, n_jobs=5)
start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

GridSearchCV took 29.74 seconds for 3 candidates


[mean: 0.37010, std: 0.03261, params: {'epsilon': 0.0001},
 mean: 0.37010, std: 0.03261, params: {'epsilon': 0.001},
 mean: 0.37010, std: 0.03261, params: {'epsilon': 0.01}]

## try LinearSVM: sentence_count

In [12]:
features = reviews_features[list(['sentence_count'])].values
labels = reviews_features["helpfulness"].values

param_grid = [{'epsilon':[10**i for i in range(-4,-1)]},]
grid_search = GridSearchCV(LinearSVR(), param_grid=param_grid, scoring=spearmanr_scorer, cv=5, n_jobs=5)
start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

GridSearchCV took 27.24 seconds for 3 candidates


[mean: 0.34157, std: 0.03231, params: {'epsilon': 0.0001},
 mean: 0.34157, std: 0.03231, params: {'epsilon': 0.001},
 mean: 0.34157, std: 0.03231, params: {'epsilon': 0.01}]

## try LinearSVM: unigram_count

In [13]:
features = reviews_features[list(['unigram_count'])].values
labels = reviews_features["helpfulness"].values

param_grid = [{'epsilon':[10**i for i in range(-4,-1)]},]
grid_search = GridSearchCV(LinearSVR(), param_grid=param_grid, scoring=spearmanr_scorer, cv=5, n_jobs=5)
start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

GridSearchCV took 28.20 seconds for 3 candidates


[mean: 0.37295, std: 0.03276, params: {'epsilon': 0.0001},
 mean: 0.37295, std: 0.03276, params: {'epsilon': 0.001},
 mean: 0.37295, std: 0.03276, params: {'epsilon': 0.01}]

## try LinearSVM: adj

In [14]:
features = reviews_features[list(['adj'])].values
labels = reviews_features["helpfulness"].values

param_grid = [{'epsilon':[10**i for i in range(-4,-1)]},]
grid_search = GridSearchCV(LinearSVR(), param_grid=param_grid, scoring=spearmanr_scorer, cv=5, n_jobs=5)
start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

GridSearchCV took 29.67 seconds for 3 candidates


[mean: 0.38692, std: 0.03720, params: {'epsilon': 0.0001},
 mean: 0.38692, std: 0.03720, params: {'epsilon': 0.001},
 mean: 0.38692, std: 0.03720, params: {'epsilon': 0.01}]

## try LinearSVM: noun

In [15]:
features = reviews_features[list(['noun'])].values
labels = reviews_features["helpfulness"].values

param_grid = [{'epsilon':[10**i for i in range(-4,-1)]},]
grid_search = GridSearchCV(LinearSVR(), param_grid=param_grid, scoring=spearmanr_scorer, cv=5, n_jobs=5)
start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

GridSearchCV took 28.70 seconds for 3 candidates


[mean: 0.36956, std: 0.03443, params: {'epsilon': 0.0001},
 mean: 0.36956, std: 0.03443, params: {'epsilon': 0.001},
 mean: 0.36956, std: 0.03443, params: {'epsilon': 0.01}]

## try LinearSVM: all

In [16]:
features = reviews_features[list(['word_count','sentence_count','unigram_count','adj','noun'])].values
labels = reviews_features["helpfulness"].values

param_grid = [{'epsilon':[10**i for i in range(-4,-1)]},]
grid_search = GridSearchCV(LinearSVR(), param_grid=param_grid, scoring=spearmanr_scorer, cv=5, n_jobs=5, verbose=3)
start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] ......................... epsilon=0.0001, score=0.392014 -  12.9s
[CV] epsilon=0.001 ...................................................
[CV] ......................... epsilon=0.0001, score=0.197241 -  13.4s
[CV] epsilon=0.001 ...................................................
[CV] ......................... epsilon=0.0001, score=0.402102 -  13.8s
[CV] epsilon=0.001 ...................................................
[CV] ......................... epsilon=0.0001, score=0.369468 -  14.0s
[CV] epsilon=0.001 ...................................................
[CV] ............

[Parallel(n_jobs=5)]: Done  15 out of  15 | elapsed:   40.5s finished


GridSearchCV took 44.51 seconds for 3 candidates


[mean: 0.34130, std: 0.07461, params: {'epsilon': 0.0001},
 mean: 0.30566, std: 0.15644, params: {'epsilon': 0.001},
 mean: 0.33222, std: 0.02457, params: {'epsilon': 0.01}]