In [1]:
# Load data
import numpy as np

# Read the cleaned, outlier-free dataset from disk as a single 2-D array.
data = np.load('data/cleaned_data_no_outliers.npy')
# Every column except the last goes into `train`; the last column is `label`.
train, label = data[:, :-1], data[:, -1]

In [3]:
from sklearn.svm import LinearSVR
from sklearn.model_selection import cross_val_score

# Baseline: 10-fold cross-validated RMSE of a linear SVR with default hyperparameters.
# random_state is pinned because the LinearSVR solver uses pseudo-random data
# shuffling; without a seed the scores change slightly on every run.
svm_reg = LinearSVR(random_state=42)
# scikit-learn scorers are "greater is better", so the MSE comes back negated.
scores = cross_val_score(svm_reg, train, label, scoring="neg_mean_squared_error", cv=10)
# Undo the negation and take the square root to obtain one RMSE per fold.
svm_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    """Print a set of scores followed by their mean and standard deviation.

    Parameters
    ----------
    scores : array-like
        Numeric scores, e.g. the per-fold values returned by cross-validation.
        Plain lists/tuples are accepted as well as NumPy arrays.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    # Coerce to an ndarray so .mean()/.std() exist for any sequence input;
    # an ndarray argument is passed through unchanged.
    scores = np.asarray(scores)
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard Deviation:", scores.std())

# Report the per-fold RMSE values together with their mean and standard deviation.
display_scores(svm_rmse_scores)

Scores: [ 324.09318612  325.02120801  326.52471775  325.74765356  325.6904589
  321.94107636  323.77029721  323.55783441  322.6798445   322.96970632]
Mean: 324.199598314
Standard Deviation: 1.42235911333


In [4]:
# The cross-validated RMSE above is a lot higher than the one obtained with
# linear regression, so search over epsilon (the parameter of the
# epsilon-insensitive loss) to see whether tuning helps.

from sklearn.model_selection import GridSearchCV

# Coarse grid spanning two orders of magnitude.
param_grid = [
    {'epsilon': [0.1, 1, 10]},
]

# Seed the estimator so every candidate is fitted with the same solver
# randomness and the selected parameters are reproducible across runs.
svm_reg = LinearSVR(random_state=42)

# 5-fold CV per candidate; negated MSE because scikit-learn maximizes scores.
grid_search = GridSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(train, label)

GridSearchCV(cv=5, error_score='raise',
       estimator=LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'epsilon': [0.1, 1, 10]}], pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [5]:
# Which epsilon from the grid achieved the best mean cross-validated score?
grid_search.best_params_

{'epsilon': 0.1}

In [6]:
# The best epsilon so far (0.1) was the smallest value in the previous grid,
# so run another search that extends the range downwards to 0.
param_grid = [
    {'epsilon': [0, 0.01, 0.1]},
]

# Seed the estimator so every candidate is fitted with the same solver
# randomness and the selected parameters are reproducible across runs.
svm_reg = LinearSVR(random_state=42)

# 5-fold CV per candidate; negated MSE because scikit-learn maximizes scores.
grid_search = GridSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(train, label)

GridSearchCV(cv=5, error_score='raise',
       estimator=LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'epsilon': [0, 0.01, 0.1]}], pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [7]:
# Which epsilon from the refined grid achieved the best mean cross-validated score?
grid_search.best_params_

{'epsilon': 0}

In [9]:
# Evaluate the tuned model on the outlier-free data.
# NOTE: this is the same data the grid search was fitted on, so the figures
# below are in-sample (training) errors, not a held-out test score.

# Best model found by the grid search (refitted on all of `train`).
svm_reg = grid_search.best_estimator_

from sklearn.metrics import mean_squared_error
predictions = svm_reg.predict(train)
lin_mse = mean_squared_error(label, predictions)
lin_rmse = np.sqrt(lin_mse)
# Despite its name, this is log(RMSE), NOT the root mean squared logarithmic
# error (RMSLE compares log1p(prediction) with log1p(label) per sample).
# The variable name is kept so that anything reading it keeps working.
lin_rmsle = np.log(lin_rmse)

# Label each figure so the printed numbers cannot be misread.
print("MSE:", lin_mse)
print("log(RMSE):", lin_rmsle)

105051.235197
5.78110173219


In [10]:
# Evaluate the same model on the data set that still contains the outliers.
# NOTE(review): presumably this is a superset of the outlier-free training
# data, so the score is only partly out-of-sample -- confirm.
train_outliers = np.load('data/cleaned_train.npy')
label_outliers = np.load('data/cleaned_label.npy')

predictions = svm_reg.predict(train_outliers)
lin_mse = mean_squared_error(label_outliers, predictions)
lin_rmse = np.sqrt(lin_mse)
# Despite its name, this is log(RMSE), NOT the root mean squared logarithmic
# error (RMSLE compares log1p(prediction) with log1p(label) per sample).
# The variable name is kept so that anything reading it keeps working.
lin_rmsle = np.log(lin_rmse)

# Label each figure so the printed numbers cannot be misread.
print("MSE:", lin_mse)
print("log(RMSE):", lin_rmsle)

27364428.5003
8.56237724901


In [11]:
# Persist the tuned model to disk so it can be reloaded without retraining.

import os

try:
    # joblib is a standalone package; sklearn.externals.joblib was deprecated
    # in scikit-learn 0.21 and removed in 0.23.
    import joblib
except ImportError:
    # Fallback for old environments that only ship the vendored copy.
    from sklearn.externals import joblib

# Create the target directory if needed so the dump works on a fresh checkout.
os.makedirs('models', exist_ok=True)
joblib.dump(svm_reg, 'models/svm_reg_no_outlier.pkl')

['models/svm_reg_no_outlier.pkl']