# California Housing with SVM Regression

## Imports

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [65]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVR, LinearSVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error

## Loading Data

In [41]:
def load_data():
    return fetch_california_housing()

In [46]:
housing = load_data()

In [47]:
housing.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR'])

In [48]:
X, y = housing['data'], housing['target']

In [51]:
X.shape, y.shape

((20640, 8), (20640,))

In [89]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Building a model

### Non-polynomial features for linear svr

In [90]:
def display_score(y_pred):
    print(f'mse : {mean_squared_error(y_train, y_pred)}')
    print(f'sqrt_mse : {np.sqrt(mean_squared_error(y_train, y_pred))}')

In [91]:
X_train_scaled = Pipeline([
    ('scaling', StandardScaler()),
]).fit_transform(X_train)

model = LinearSVR().fit(X_train_scaled, y_train)
y_pred = model.predict(X_train_scaled)

display_score(y_pred)

mse : 0.9591993038983119
sqrt_mse : 0.9793872083595496




### polynomial features for linear svr

In [92]:
X_train_scaled = Pipeline([
    ('poly_features', PolynomialFeatures(degree=2)),
    ('scaling', StandardScaler()),
]).fit_transform(X_train)

model = LinearSVR().fit(X_train_scaled, y_train)
y_pred = model.predict(X_train_scaled)

display_score(y_pred)

mse : 0.5367865653348606
sqrt_mse : 0.7326571949655996




###  SVR with Polynomial features

In [153]:
np.random.seed(42)
X_train_scaled = Pipeline([
    ('poly_features', PolynomialFeatures(degree=2)),
    ('scaling', StandardScaler()),
]).fit_transform(X_train)

model_2 = SVR().fit(X_train_scaled, y_train)
y_pred = model_2.predict(X_train_scaled)

display_score(y_pred)

mse : 0.3478474367126868
sqrt_mse : 0.5897859244782693


###  SVR without Polynomial features

In [162]:
X_train_scaled = Pipeline([
    ('scaling', StandardScaler()),
]).fit_transform(X_train)

model_3 = SVR().fit(X_train_scaled, y_train)
y_pred = model_3.predict(X_train_scaled)

display_score(y_pred)

mse : 0.3361301529185807
sqrt_mse : 0.5797673265358964


### Randomized Search CV

**SVR without Polynomial features** is the best model, performing RandomizedSearchCV to get the optimal hyperparameter values

In [123]:
grid_param = {
    'kernel' : ['rbf','linear', 'poly'],
    'degree' : [2, 3, 4, 6, 10],
    'C' : [0.001, 0.01, 1, 10, 100],
    'epsilon' : [0.001, 0.01, 1, 10, 100], 'gamma' :['scale', 'auto']
}
estimator = SVR()
model = RandomizedSearchCV(estimator=estimator, param_distributions=grid_param,
                          scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

In [124]:
model.fit(X_train_scaled[:3000], y_train[:3000])

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                                 epsilon=0.1, gamma='scale', kernel='rbf',
                                 max_iter=-1, shrinking=True, tol=0.001,
                                 verbose=False),
                   iid='deprecated', n_iter=10, n_jobs=-1,
                   param_distributions={'C': [0.001, 0.01, 1, 10, 100],
                                        'degree': [2, 3, 4, 6, 10],
                                        'epsilon': [0.001, 0.01, 1, 10, 100],
                                        'gamma': ['scale', 'auto'],
                                        'kernel': ['rbf', 'linear', 'poly']},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring='neg_mean_squared_error',
                   verbose=0)

In [125]:
model.best_params_

{'kernel': 'rbf', 'gamma': 'scale', 'epsilon': 0.001, 'degree': 2, 'C': 1}

In [126]:
y_pred = model.best_estimator_.predict(X_train_scaled)
display_score((-y_pred))

mse : 0.3780249726399801
sqrt_mse : 0.6148373546231394


## Running the model on the Test Set

In [163]:
scaler = Pipeline([
    ('scaling', StandardScaler())
])
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

y_test_pred = model_3.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_test_pred)
print(mse)

0.3570026426754463


In [164]:
np.sqrt(mse)

0.5974969813107396