In [23]:
import pandas as pd
import numpy as np

from sklearn.svm import SVR, LinearSVR
from sklearn.grid_search import GridSearchCV
from scipy.stats import spearmanr
from sklearn.metrics import make_scorer
from time import time

def simple_spearman(x, y):
    """Return the absolute Spearman rank correlation between ``x`` and ``y``.

    The sign of the correlation is discarded, so the result lies in [0, 1]
    and a perfectly inverted ranking scores the same as a perfect one.

    ``spearmanr`` yields NaN when the correlation is undefined (for example
    when one of the inputs is constant). That case is reported as 0.0 so a
    degenerate prediction counts as "no rank agreement" instead of turning
    averaged cross-validation scores into NaN.
    """
    rho = spearmanr(x, y)[0]
    if np.isnan(rho):
        return 0.0
    return np.abs(rho)
# Wrap the metric as a scikit-learn scorer so it can be handed to GridSearchCV
# via `scoring=`; make_scorer's default treats larger values as better.
spearmanr_scorer = make_scorer(simple_spearman)

In [19]:
# Read the review feature table and keep only the rows whose `overall`
# rating equals 3; the final expression shows (rows, columns).
reviews_features = (
    pd.read_csv('data/eletronic_sample_counts.csv.gz')
      .loc[lambda frame: frame['overall'] == 3]
)
reviews_features.shape

(2075, 19)

In [20]:
# Collect the names of the float/integer columns of the filtered frame
# and display them.
df_columns = (
    reviews_features
    .select_dtypes(include=['float64','int','int64'])
    .columns
)
df_columns

Index([u'Unnamed: 0', u'Unnamed: 0.1', u'overall', u'unixReviewTime',
       u'helpfulness', u'tot', u'word_count', u'sentence_count',
       u'unigram_count', u'adj', u'noun'],
      dtype='object')

## Try LinearSVR: word_count

In [24]:
# Single-feature baseline: predict helpfulness from word_count alone.
# Selecting with a one-element list keeps `features` two-dimensional.
features = reviews_features[['word_count']].values
labels = reviews_features["helpfulness"].values

# Candidate epsilon values: 1e-4, 1e-3, 1e-2.
# NOTE(review): the scorer is rank-based and sign-insensitive, so with a single
# feature every epsilon is expected to score (almost) the same -- confirm that
# this search is actually informative.
param_grid = [{'epsilon': [10**i for i in range(-4, -1)]}]

# Seed the estimator: LinearSVR shuffles the data internally, so without a
# fixed random_state the cross-validation scores can change from run to run.
grid_search = GridSearchCV(LinearSVR(random_state=0), param_grid=param_grid,
                           scoring=spearmanr_scorer, cv=10, n_jobs=5)
start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

GridSearchCV took 5.01 seconds for 3 candidates


[mean: 0.41164, std: 0.04278, params: {'epsilon': 0.0001},
 mean: 0.41164, std: 0.04278, params: {'epsilon': 0.001},
 mean: 0.41164, std: 0.04278, params: {'epsilon': 0.01}]

## Try LinearSVR: sentence_count

In [5]:
# Single-feature baseline: predict helpfulness from sentence_count alone.
# Selecting with a one-element list keeps `features` two-dimensional.
features = reviews_features[['sentence_count']].values
labels = reviews_features["helpfulness"].values

# Candidate epsilon values: 1e-4, 1e-3, 1e-2.
# NOTE(review): with one feature and a rank-based, sign-insensitive scorer,
# all epsilon values are expected to score (almost) identically.
param_grid = [{'epsilon': [10**i for i in range(-4, -1)]}]

# Seed the estimator: LinearSVR shuffles the data internally, so without a
# fixed random_state the cross-validation scores can change from run to run.
grid_search = GridSearchCV(LinearSVR(random_state=0), param_grid=param_grid,
                           scoring=spearmanr_scorer, cv=10, n_jobs=5)
start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

GridSearchCV took 6.32 seconds for 3 candidates


[mean: 0.37242, std: 0.05953, params: {'epsilon': 0.0001},
 mean: 0.37242, std: 0.05953, params: {'epsilon': 0.001},
 mean: 0.37242, std: 0.05953, params: {'epsilon': 0.01}]

## Try LinearSVR: unigram_count

In [6]:
# Single-feature baseline: predict helpfulness from unigram_count alone.
# Selecting with a one-element list keeps `features` two-dimensional.
features = reviews_features[['unigram_count']].values
labels = reviews_features["helpfulness"].values

# Candidate epsilon values: 1e-4, 1e-3, 1e-2.
# NOTE(review): with one feature and a rank-based, sign-insensitive scorer,
# all epsilon values are expected to score (almost) identically.
param_grid = [{'epsilon': [10**i for i in range(-4, -1)]}]

# Seed the estimator: LinearSVR shuffles the data internally, so without a
# fixed random_state the cross-validation scores can change from run to run.
grid_search = GridSearchCV(LinearSVR(random_state=0), param_grid=param_grid,
                           scoring=spearmanr_scorer, cv=10, n_jobs=5)
start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

GridSearchCV took 5.18 seconds for 3 candidates


[mean: 0.41163, std: 0.04380, params: {'epsilon': 0.0001},
 mean: 0.41163, std: 0.04380, params: {'epsilon': 0.001},
 mean: 0.41163, std: 0.04380, params: {'epsilon': 0.01}]

## Try LinearSVR: adj

In [7]:
# Single-feature baseline: predict helpfulness from the adj column alone.
# Selecting with a one-element list keeps `features` two-dimensional.
features = reviews_features[['adj']].values
labels = reviews_features["helpfulness"].values

# Candidate epsilon values: 1e-4, 1e-3, 1e-2.
# NOTE(review): with one feature and a rank-based, sign-insensitive scorer,
# all epsilon values are expected to score (almost) identically.
param_grid = [{'epsilon': [10**i for i in range(-4, -1)]}]

# Seed the estimator: LinearSVR shuffles the data internally, so without a
# fixed random_state the cross-validation scores can change from run to run.
grid_search = GridSearchCV(LinearSVR(random_state=0), param_grid=param_grid,
                           scoring=spearmanr_scorer, cv=10, n_jobs=5)
start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

GridSearchCV took 5.74 seconds for 3 candidates


[mean: 0.40161, std: 0.03868, params: {'epsilon': 0.0001},
 mean: 0.40161, std: 0.03868, params: {'epsilon': 0.001},
 mean: 0.40161, std: 0.03868, params: {'epsilon': 0.01}]

## Try LinearSVR: noun

In [8]:
# Single-feature baseline: predict helpfulness from the noun column alone.
# Selecting with a one-element list keeps `features` two-dimensional.
features = reviews_features[['noun']].values
labels = reviews_features["helpfulness"].values

# Candidate epsilon values: 1e-4, 1e-3, 1e-2.
# NOTE(review): with one feature and a rank-based, sign-insensitive scorer,
# all epsilon values are expected to score (almost) identically.
param_grid = [{'epsilon': [10**i for i in range(-4, -1)]}]

# Seed the estimator: LinearSVR shuffles the data internally, so without a
# fixed random_state the cross-validation scores can change from run to run.
grid_search = GridSearchCV(LinearSVR(random_state=0), param_grid=param_grid,
                           scoring=spearmanr_scorer, cv=10, n_jobs=5)
start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

GridSearchCV took 4.99 seconds for 3 candidates


[mean: 0.40895, std: 0.03813, params: {'epsilon': 0.0001},
 mean: 0.40895, std: 0.03813, params: {'epsilon': 0.001},
 mean: 0.40895, std: 0.03813, params: {'epsilon': 0.01}]

## Try LinearSVR: all features

In [9]:
# Combined model: use all five count features together.
feature_columns = ['word_count', 'sentence_count', 'unigram_count', 'adj', 'noun']
features = reviews_features[feature_columns].values
labels = reviews_features["helpfulness"].values

# Candidate epsilon values: 1e-4, 1e-3, 1e-2.
param_grid = [{'epsilon': [10**i for i in range(-4, -1)]}]

# Seed the estimator: LinearSVR shuffles the data internally, so without a
# fixed random_state the cross-validation scores can change from run to run.
# verbose=3 makes the search print a progress line for every fold.
grid_search = GridSearchCV(LinearSVR(random_state=0), param_grid=param_grid,
                           scoring=spearmanr_scorer, cv=10, n_jobs=5, verbose=3)
start = time()
grid_search.fit(features, labels)
print("GridSearchCV took %.2f seconds for %d candidates" % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_

Fitting 10 folds for each of 3 candidates, totalling 30 fits
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] ......................... epsilon=0.0001, score=0.217566 -   1.0s
[CV] epsilon=0.0001 ..................................................
[CV] ......................... epsilon=0.0001, score=0.429599 -   1.1s
[CV] ......................... epsilon=0.0001, score=0.307221 -   1.1s
[CV] epsilon=0.0001 ..................................................
[CV] epsilon=0.0001 ..................................................
[CV] ......................... epsilon=0.0001, score=0.367094 -   1.2s
[CV] epsilon=0.0001 ..................................................
[CV] ...........

[Parallel(n_jobs=5)]: Done  30 out of  30 | elapsed:    7.3s finished


GridSearchCV took 7.86 seconds for 3 candidates


[mean: 0.20818, std: 0.14411, params: {'epsilon': 0.0001},
 mean: 0.20783, std: 0.08509, params: {'epsilon': 0.001},
 mean: 0.37574, std: 0.08043, params: {'epsilon': 0.01}]