## Support Vector Machines for Regression and Classification

Working with the per-district housing price data to train SVM models for both regression and classification.

#### Getting the data 

In [1]:
import os
import tarfile
from six.moves import urllib

# Remote location of the housing archive and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the gzipped tarball to download.
    housing_path : str
        Directory that receives ``housing.tgz`` and its extracted contents;
        it is created (including parents) when it does not exist yet.
    """
    # exist_ok avoids the check-then-create race of isdir() followed by makedirs().
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The archive comes from the network: where the interpreter supports
    # extraction filters, reject members that would land outside housing_path.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path, **extract_kwargs)

In [2]:
# Download and unpack the dataset using the default URL and target directory.
fetch_housing_data()

In [3]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [4]:
# Load the extracted CSV from the default dataset directory into a DataFrame.
housing = load_housing_data()

### Fixing categories for categorical variable

In [5]:
# Replace the raw category labels with names free of spaces and '<'.
d = {
    '<1H OCEAN': 'LESS_1H_OCEAN',
    'INLAND': 'INLAND',
    'ISLAND': 'ISLAND',
    'NEAR BAY': 'NEAR_BAY',
    'NEAR OCEAN': 'NEAR_OCEAN',
}
# A plain d[...] lookup raises KeyError on any label missing from the mapping.
housing['ocean_proximity'] = [d[label] for label in housing['ocean_proximity']]

### Adding two more features

In [6]:
# Per-household ratios derived from the district totals.
households = housing["households"]
housing["rooms_per_household"] = housing["total_rooms"].div(households)
housing["population_per_household"] = housing["population"].div(households)

### Fix missing data with median values

In [7]:
# Fill missing bedroom counts with the column median.
# The filled column is assigned back rather than calling
# housing["total_bedrooms"].fillna(..., inplace=True): that form operates on
# an intermediate Series, which newer pandas warns about and which leaves
# `housing` unchanged under Copy-on-Write.
# NOTE(review): the median is taken over the whole data set, before the
# train/test split below, so test rows influence the imputed value.
median = housing["total_bedrooms"].median()
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(median)

### Creating dummy variables

In [8]:
# One indicator column per ocean_proximity category, swapped in for the
# original categorical column.
one_hot = pd.get_dummies(housing['ocean_proximity'])
housing = housing.drop(columns='ocean_proximity').join(one_hot)

### Creating training and test set

70% of the data is used as the training set, which serves for both training and validation via cross-validation; the remaining 30% is held out as the test set.


In [10]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test set; random_state fixes the shuffle so the split is repeatable.
train_set, test_set = train_test_split(housing, test_size=0.3, random_state=42)

### Features

In [11]:
# Every column except the target is used as a predictor.
target = 'median_house_value'
features = [column for column in train_set.columns if column != target]

In [12]:
# Predictor matrices and single-column target frames for both splits.
X_tr = train_set.loc[:, features]
y_tr = train_set.loc[:, [target]]

X_te = test_set.loc[:, features]
y_te = test_set.loc[:, [target]]

### Scaling features

Using StandardScaler to standardize the training and test features; the scaler is fitted on the training data only.

In [13]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# the same transformation to both splits.
scaler = StandardScaler().fit(X_tr)
X_tr, X_te = scaler.transform(X_tr), scaler.transform(X_te)

#### Comparing models

In [14]:
from sklearn.model_selection import cross_val_score
import numpy as np

def display_scores(scores):
    """Print the given scores followed by their mean.

    ``scores`` must provide a ``mean()`` method (e.g. a NumPy array).
    """
    for label, value in (("Scores:", scores), ("Mean:", scores.mean())):
        print(label, value)

### Linear regression as a benchmark

In [15]:
from sklearn.linear_model import LinearRegression
# 4-fold cross-validated RMSE of ordinary least squares on the training set.
# NOTE(review): this benchmark uses the unscaled training columns and cv=4,
# whereas the SVR grid search uses the scaled features and cv=3.
lin_scores = cross_val_score(
    LinearRegression(),
    train_set[features],
    train_set[target],
    scoring="neg_mean_squared_error",
    cv=4,
)
# Scores are negated MSE values; flip the sign before taking the root.
lin_rmse_scores = np.sqrt(np.negative(lin_scores))
display_scores(lin_rmse_scores)

Scores: [ 70142.55721218  67456.39127204  67318.3258893   70866.26065275]
Mean: 68945.8837566


### 1. Support Vector Machines for Regression

In [16]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters for the RBF-kernel SVR.
C_vals = [60000, 65000, 70000]
gamma_vals = [0.01, 0.1, 1.0, 1.5]

param_grid = [dict(C=C_vals, gamma=gamma_vals)]
grid_search_rbf = GridSearchCV(
    SVR(kernel='rbf'),
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
)
# y_tr is a single-column DataFrame; flatten it to the 1-D target passed to fit().
grid_search_rbf.fit(X_tr, np.ravel(y_tr))

GridSearchCV(cv=3, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'C': [60000, 65000, 70000], 'gamma': [0.01, 0.1, 1.0, 1.5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [17]:
# Best hyper-parameter combination found by the grid search.
print(grid_search_rbf.best_params_)
# best_score_ is a negated MSE (scoring='neg_mean_squared_error'); convert it to an RMSE.
print(np.sqrt(-grid_search_rbf.best_score_))

{'C': 70000, 'gamma': 0.1}
57787.8239704


The RBF-kernel SVR reaches a cross-validated RMSE of about 57,788, lower than the linear-regression benchmark's 68,946.

### Performance on Test Set

In [18]:
from sklearn.metrics import mean_squared_error

# Best estimator selected by the grid search.
final_model = grid_search_rbf.best_estimator_  

# Predict on the scaled test features.
y_te_estimation = final_model.predict(X_te)

# Report the test error as an RMSE so it is on the same scale as the cross-validation figures.
final_mse = mean_squared_error(y_te, y_te_estimation)
final_rmse = np.sqrt(final_mse)
print(final_rmse)

57018.4815687


### 2. SVM for Classification

Transforming the target into a binary variable indicating whether the price is at or above the median house value of $179,700.


In [20]:
from sklearn.metrics import accuracy_score

In [21]:
# Median of the target over the full data set (train and test rows together);
# this value is used as the threshold for the binary labels.
np.median(housing[['median_house_value']])

179700.0

#### Binary target variable

In [22]:
# Label a district 1 when its median house value is at or above the
# threshold (the data-set median), otherwise 0.
price_threshold = 179700.0
y_tr_b = 1 * np.ravel(y_tr >= price_threshold)
y_te_b = 1 * np.ravel(y_te >= price_threshold)

#### Linear SVM for classification

In [23]:
from sklearn.svm import LinearSVC

In [24]:
# Fit a linear SVM on the scaled training features and the binary labels.
lin_clf = LinearSVC(random_state=42)
lin_clf.fit(X_tr, y_tr_b)
# NOTE(review): predictions are made on the training features, so the value
# below is training accuracy, not a held-out estimate.
y_pred = lin_clf.predict(X_tr)
accuracy_score(y_tr_b, y_pred)

0.83845514950166111

### SVC with default hyper-parameters

In [25]:
from sklearn.svm import SVC

In [26]:
# SVC with its default hyper-parameters, fitted on the scaled training features.
svc = SVC(random_state = 42)
svc.fit(X_tr, y_tr_b)
# NOTE(review): accuracy is again measured on the data the model was fitted on.
y_pred = svc.predict(X_tr)
accuracy_score(y_tr_b, y_pred)

0.86614064230343302

SVC with default hyper-parameters improves the training-set accuracy from 0.84 (LinearSVC) to 0.87.

### Using randomized search to tune SVC

In [27]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

In [28]:
# Sampling distributions for the search: gamma is drawn log-uniformly between
# 0.001 and 1.1; scipy's uniform(loc, scale) covers [loc, loc + scale], so C
# is drawn from [1, 11].
param_distribs = {"gamma": reciprocal(0.001, 1.1), "C": uniform(1, 10)}
# n_iter=5 samples five parameter settings; random_state makes the draws repeatable.
rnd_search = RandomizedSearchCV(SVC(), param_distribs, scoring ='accuracy', n_iter=5, random_state=42)
rnd_search.fit(X_tr, y_tr_b)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          fit_params=None, iid=True, n_iter=5, n_jobs=1,
          param_distributions={'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001C89B1DBCF8>, 'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001C89B1DBE80>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=0)

In [30]:
# Show the best model found by the randomized search. The recorded output of
# this cell is an SVC repr, which is what best_estimator_ returns;
# best_score_ would display a single accuracy value instead.
rnd_search.best_estimator_

SVC(C=7.0111501174320878, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.14240549858304632,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [31]:
# Predict with the search object on the training features.
# NOTE(review): this is training accuracy; the held-out labels y_te_b are not used here.
y_pred_rnd = rnd_search.predict(X_tr)
accuracy_score(y_tr_b, y_pred_rnd)

0.89258028792912514