## Important points:

* Scaled the features with `StandardScaler`; no other preprocessing was applied
* Used root mean squared error (RMSE) as the metric (the printed label says "accuracy"; read it as RMSE)
* Performed RandomizedSearchCV to find better $\gamma$ and C values
* Achieved 0.48 RMSE on the training data and 0.55 RMSE on the validation data

In [17]:
import numpy as np
import pandas as pd 
import os
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Seed NumPy's global random number generator so repeated runs draw the same values.
np.random.seed(42)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [8]:
# Workaround for urllib error
# https://stackoverflow.com/questions/27835619/urllib-and-ssl-certificate-verify-failed-error
# import ssl

# ssl._create_default_https_context = ssl._create_unverified_context

In [9]:
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset (downloaded and cached on first use),
# then split the returned bunch into the feature matrix and the target vector.
data = fetch_california_housing()
x, y = data.data, data.target

Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to /home/hellraizer/scikit_learn_data


In [11]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for validation; the fixed random_state keeps
# the split identical across runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [12]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(x_train)

x_train, x_valid = scaler.transform(x_train), scaler.transform(x_valid)

In [20]:
def score(y_true, y_pred, train=False):
    """Print the root mean squared error between targets and predictions.

    The printed label uses the word "accuracy", but the value is the square
    root of ``mean_squared_error(y_true, y_pred)``, so lower is better.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.
        train: If truthy, label the output as the training score; otherwise
            label it as the validation score.

    Returns:
        None. The result is printed, not returned.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    template = "Train accuracy:{}" if train else "Val accuracy:{}"
    print(template.format(rmse))

In [15]:
from sklearn.svm import SVR

# Baseline: support vector regressor with a linear kernel and otherwise
# default hyperparameters, fitted on the scaled training split.
svr = SVR(kernel='linear')
# fit() is the cell's last expression, so the notebook displays the fitted estimator's repr.
svr.fit(x_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [22]:
# Baseline linear SVR: report the error on the training and validation splits.
score(y_train, svr.predict(x_train), train=True)
score(y_valid, svr.predict(x_valid), train=False)

Train accuracy:0.8409929728284017
Val accuracy:0.7445613979865312


In [24]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

# Search space taken from the reference solutions.
# - gamma: log-uniform (reciprocal) between 1e-4 and 5.
# - C: scipy's uniform(loc, scale) covers [loc, loc + scale], i.e. [1, 11] here.
# - kernel: either 'linear' or 'rbf'.
params = dict(
    gamma=reciprocal(0.0001, 5),
    C=uniform(1, 10),
    kernel=['linear', 'rbf'],
)

# 30 sampled candidates x 4-fold CV; no `scoring` is passed, so candidates are
# ranked by the estimator's default score.
rand_search = RandomizedSearchCV(
    SVR(),
    params,
    n_iter=30,
    verbose=2,
    cv=4,
    random_state=42,
    n_jobs=3,
)

In [25]:
# Run the randomized search on the scaled training split (30 candidates x 4 folds = 120 fits).
# NOTE: slow — the recorded run took about 15 minutes with n_jobs=3.
rand_search.fit(x_train, y_train)

Fitting 4 folds for each of 30 candidates, totalling 120 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  2.6min
[Parallel(n_jobs=3)]: Done 120 out of 120 | elapsed: 15.4min finished


RandomizedSearchCV(cv=4, error_score=nan,
                   estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                                 epsilon=0.1, gamma='scale', kernel='rbf',
                                 max_iter=-1, shrinking=True, tol=0.001,
                                 verbose=False),
                   iid='deprecated', n_iter=30, n_jobs=3,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f5d6c02f950>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f5d6c02f350>,
                                        'kernel': ['linear', 'rbf']},
                   pre_dispatch='2*n_jobs', random_state=42, refit=True,
                   return_train_score=False, scoring=None, verbose=2)

In [26]:
# Display the estimator configured with the best hyperparameters found by the search.
rand_search.best_estimator_

SVR(C=10.695846277645586, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma=0.4388598863942794, kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [28]:
# Validation error of the tuned SVR (author's note: working better than the solution :-)).
y_pred = rand_search.best_estimator_.predict(x_valid)
score(y_valid, y_pred, train=False)

Val accuracy:0.5516755305773655


In [29]:
# Training error of the tuned SVR, for comparison with its validation error.
y_pred = rand_search.best_estimator_.predict(x_train)
score(y_train, y_pred, train=True)

Train accuracy:0.4821623953075446
