# Exercises for Chapter 2
### Using the housing data from the chapter:

#### 1) Try a SVM regressor with various hyperparameters (kernel, C, gamma values). Don't worry about what the values mean right now.

(data loading code copied from chapter notebook)

In [1]:
import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

In [2]:
fetch_housing_data()

In [3]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

I know I'm not following the best way of doing things here. I haven't created functions, or made a Pipeline, OR created my own transformers to dummify the data. That will be something I practice in one of the next few exercises.

In [6]:
from sklearn.model_selection import train_test_split

housing = load_housing_data()
housing = pd.get_dummies(housing)
housing.fillna(0, inplace=True)
print(list(housing.columns))
label = 'median_house_value'
X, y = housing.drop([label], axis=1), housing[[label]]
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.8)
train_X.shape, test_X.shape, train_y.shape, test_y.shape

['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value', 'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND', 'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY', 'ocean_proximity_NEAR OCEAN']


((4128, 13), (16512, 13), (4128, 1), (16512, 1))

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
import scipy.stats as sp
import numpy as np

In [8]:
c = [c * 0.2 for c in range(1, 15, 4)]
print(c)

n_features = len(housing.columns)
tail = 5
g = [1/g for g in range(n_features - tail, n_features + tail, 5)]
print(g)

cv = 2

print("set to do {} runs...".format(len(c) * cv + len(c) * len(g) * cv))

[0.2, 1.0, 1.8, 2.6]
[0.1111111111111111, 0.07142857142857142]
set to do 24 runs...


In [9]:
param_grid = [
    {'kernel': ['linear'], 'C': c},
    {'kernel': ['rbf'], 'C': c, 'gamma': g}
]
svm = SVR()
grid = GridSearchCV(svm, param_grid=param_grid, verbose=2, scoring='neg_mean_squared_error', cv=cv)

grid.fit(train_X, train_y.values.ravel())

Fitting 2 folds for each of 12 candidates, totalling 24 fits
[CV] C=0.2, kernel=linear ............................................
[CV] ............................. C=0.2, kernel=linear, total=   0.7s
[CV] C=0.2, kernel=linear ............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


[CV] ............................. C=0.2, kernel=linear, total=   0.8s
[CV] C=1.0, kernel=linear ............................................
[CV] ............................. C=1.0, kernel=linear, total=   2.2s
[CV] C=1.0, kernel=linear ............................................
[CV] ............................. C=1.0, kernel=linear, total=   1.6s
[CV] C=1.8, kernel=linear ............................................
[CV] ............................. C=1.8, kernel=linear, total=   3.9s
[CV] C=1.8, kernel=linear ............................................
[CV] ............................. C=1.8, kernel=linear, total=   3.2s
[CV] C=2.6, kernel=linear ............................................
[CV] ............................. C=2.6, kernel=linear, total=   4.6s
[CV] C=2.6, kernel=linear ............................................
[CV] ............................. C=2.6, kernel=linear, total=   4.1s
[CV] C=0.2, gamma=0.1111111111111111, kernel=rbf .....................
[CV] .

[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:   48.9s finished


GridSearchCV(cv=2, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'kernel': ['linear'], 'C': [0.2, 1.0, 1.8, 2.6]}, {'kernel': ['rbf'], 'C': [0.2, 1.0, 1.8, 2.6], 'gamma': [0.1111111111111111, 0.07142857142857142]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=2)

In [10]:
grid.best_estimator_, np.sqrt(-grid.score(train_X, train_y)), np.sqrt(-grid.score(test_X, test_y))

(SVR(C=2.6, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
   kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
 97102.89414925371,
 97877.6489075402)

#### 2) Replace GridSearchCV with RandomizedSearchCV

In [11]:
rand_params = {'kernel': ['rbf'], 'C': sp.randint(1,5), 'gamma': sp.uniform()}

svm_2 = SVR()

rand_grid = RandomizedSearchCV(svm_2, param_distributions=rand_params, verbose=2, scoring='neg_mean_squared_error')

In [12]:
rand_grid.fit(train_X, train_y.values.ravel())
rand_grid.best_estimator_, np.sqrt(-rand_grid.score(train_X, train_y)), np.sqrt(-rand_grid.score(test_X, test_y))

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] C=4, gamma=0.9095899782412715, kernel=rbf .......................
[CV] ........ C=4, gamma=0.9095899782412715, kernel=rbf, total=   1.7s
[CV] C=4, gamma=0.9095899782412715, kernel=rbf .......................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.6s remaining:    0.0s


[CV] ........ C=4, gamma=0.9095899782412715, kernel=rbf, total=   1.5s
[CV] C=4, gamma=0.9095899782412715, kernel=rbf .......................
[CV] ........ C=4, gamma=0.9095899782412715, kernel=rbf, total=   1.4s
[CV] C=3, gamma=0.07388134683634917, kernel=rbf ......................
[CV] ....... C=3, gamma=0.07388134683634917, kernel=rbf, total=   1.4s
[CV] C=3, gamma=0.07388134683634917, kernel=rbf ......................
[CV] ....... C=3, gamma=0.07388134683634917, kernel=rbf, total=   1.5s
[CV] C=3, gamma=0.07388134683634917, kernel=rbf ......................
[CV] ....... C=3, gamma=0.07388134683634917, kernel=rbf, total=   1.6s
[CV] C=2, gamma=0.7975096398613276, kernel=rbf .......................
[CV] ........ C=2, gamma=0.7975096398613276, kernel=rbf, total=   1.5s
[CV] C=2, gamma=0.7975096398613276, kernel=rbf .......................
[CV] ........ C=2, gamma=0.7975096398613276, kernel=rbf, total=   1.6s
[CV] C=2, gamma=0.7975096398613276, kernel=rbf .......................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.3min finished


(SVR(C=3, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
   gamma=0.07388134683634917, kernel='rbf', max_iter=-1, shrinking=True,
   tol=0.001, verbose=False), 117858.69439644736, 118365.82082701934)

#### 3) Try adding a transformer that selects the most important attributes

##### Get Most important features

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder

forest_reg = RandomForestRegressor(random_state=42, max_features=3)
forest_reg.fit(train_X, train_y.values.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=3, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [83]:
feature_importances = forest_reg.feature_importances_

##### Build Transformer

In [98]:
from sklearn.base import BaseEstimator, TransformerMixin

def top_indices(feature_importances, k):
    feature_tuples = [(k, v) for k, v in zip(feature_importances, range(0,len(feature_importances)))]
    sorted_tuples = sorted(feature_tuples, reverse=True)
    return [sorted_tuples[i][1] for i in range(len(sorted_tuples)) if i < k]
    
class ImportantColumnsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances_, k_features_):
        self.feature_importances = feature_importances_
        self.k_features = k_features_
    
    def fit(self, X, y=None):
        self.feature_indices = top_indices(self.feature_importances, self.k_features)
        return self
    
    def transform(self, X, y=None):
        return X.iloc[:, self.feature_indices]
    
    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

num_features = 5
tformer = ImportantColumnsTransformer(feature_importances, num_features)
tformer.fit(train_X)
transformer_output = tformer.transform(train_X)

In [99]:
transformer_output.columns

Index(['median_income', 'longitude', 'ocean_proximity_INLAND', 'latitude',
       'population'],
      dtype='object')

#### 4) Create a pipeline that does the full data preparation plus the final prediction

In [104]:
from sklearn.linear_model import Ridge

tformer = ImportantColumnsTransformer(feature_importances, num_features)

pipe = Pipeline([
    ('tformr', tformer),
    ('model', Ridge())
])

{'feature_importances': array([0.08900182, 0.08236674, 0.05339859, 0.06350528, 0.0428013 ,
       0.06461588, 0.05228723, 0.41621742, 0.02658445, 0.08433808,
       0.        , 0.01060011, 0.0142831 ]), 'k_features': 5}


In [101]:
pipe.fit(train_X, train_y.values.ravel())

Pipeline(memory=None,
     steps=[('tformr', ImportantColumnsTransformer(feature_importances_=None, k_features_=None)), ('model', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])

#### 5) Automatically explore with GridSearchCV

In [102]:
param_grid_alpha = [{'model__alpha': [0.1, 0.5, 0.7]}]

grid = GridSearchCV(pipe, param_grid_alpha)

In [103]:
grid.fit(train_X, train_y.values.ravel())

TypeError: object of type 'NoneType' has no len()