In [1]:
import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = 'https://raw.githubusercontent.com/ageron/handson-ml/master/'
HOUSING_PATH = 'datasets/housing'
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + '/housing.tgz'

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, 'housing.tgz')
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

In [2]:
fetch_housing_data()

In [3]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path,"housing.csv")
    return pd.read_csv(csv_path)

In [4]:
housing = load_housing_data()

In [5]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [6]:
import numpy as np

housing["income_cat"] = np.ceil(housing["median_income"]/1.5)
housing["income_cat"].where(housing["income_cat"]<5,5.0,inplace=True)

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [8]:
for set in (strat_train_set, strat_test_set):
    set.drop(["income_cat"], axis=1, inplace=True)

In [9]:
housing = strat_train_set.copy()

In [10]:
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_rooms"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"] = housing["population"]/housing["households"]

In [11]:
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [12]:
housing_num = housing.drop("ocean_proximity", axis=1)

In [13]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, household_ix = [
    list(housing.columns).index(col)
    for col in ("total_rooms","total_bedrooms","population","households")]

In [14]:
def add_extra_features(X, add_bedrooms_per_room=True):
    rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
    population_per_household = X[:, population_ix] / X[:, household_ix]
    if add_bedrooms_per_room:
        bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
        return np.c_[X, rooms_per_household, population_per_household,
                     bedrooms_per_room]

In [15]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', FunctionTransformer(add_extra_features, validate=False)),
        ('std_scaler', StandardScaler()),
    ])

In [16]:
try:
    from sklearn.compose import ColumnTransformer
except ImportError:
    from future_encoders import ColumnTransformer # Scikit-Learn < 0.20

In [17]:
from sklearn.preprocessing import OneHotEncoder

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing)

1. Try a Support Vector Machine regressor (sklearn.svm.SVR), with various hyperparameters such as kernel="linear" (with various values for the C hyperparameter) or kernel="rbf" (with various values for the C and gamma hyperparameters). Don’t worry about what these hyperparameters mean for now. How does the best SVR predictor perform?

In [18]:
from sklearn.svm import SVR

svr = SVR()
svr.fit(housing_prepared, housing_labels)

In [19]:
from sklearn.metrics import mean_squared_error
housing_predictions = svr.predict(housing_prepared)
svr_mse = mean_squared_error(housing_labels,housing_predictions)
svr_rmse = np.sqrt(svr_mse)
svr_rmse

118578.69234925653

In [20]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(svr, housing_prepared, housing_labels,
                        scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)

In [21]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [22]:
display_scores(rmse_scores)

Scores: [116729.13778306 120113.99351281 113925.04340616 120399.11878641
 114687.49942071 122785.64737282 119853.79338279 118280.31108193
 120230.82615529 118840.1885232 ]
Mean: 118584.55594251942
Standard deviation: 2609.6120823493407


In [23]:
from sklearn.model_selection import GridSearchCV

param_grid = [
     {'kernel': ['linear', 'rbf'],
      'C':[1,2,3],
      'gamma': ['auto','scale']}
 ]

grid_search = GridSearchCV(svr, param_grid, cv=5,
                          scoring='neg_mean_squared_error')

In [24]:
grid_search.fit(housing_prepared, housing_labels)

In [25]:
grid_search.best_params_

{'C': 3, 'gamma': 'auto', 'kernel': 'linear'}

In [31]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres['params']):
    print(np.sqrt(-mean_score), params)

112571.06378605746 {'C': 1, 'gamma': 'auto', 'kernel': 'linear'}
118639.96425886606 {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}
112571.06378605746 {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
118638.40200558837 {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
107140.2596216261 {'C': 2, 'gamma': 'auto', 'kernel': 'linear'}
118370.28672307904 {'C': 2, 'gamma': 'auto', 'kernel': 'rbf'}
107140.2596216261 {'C': 2, 'gamma': 'scale', 'kernel': 'linear'}
118362.48332089635 {'C': 2, 'gamma': 'scale', 'kernel': 'rbf'}
102376.3300964536 {'C': 3, 'gamma': 'auto', 'kernel': 'linear'}
118094.00233470283 {'C': 3, 'gamma': 'auto', 'kernel': 'rbf'}
102376.3300964536 {'C': 3, 'gamma': 'scale', 'kernel': 'linear'}
118079.45501913907 {'C': 3, 'gamma': 'scale', 'kernel': 'rbf'}


In [39]:
np.sqrt(-grid_search.best_score_)

102376.3300964536

Try replacing GridSearchCV with RandomizedSearchCV.

In [27]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = [
     {'kernel': ['linear', 'rbf'],
      'C':[1,2,3],
      'gamma': ['auto','scale']}
]

rand_search = RandomizedSearchCV(svr, param_grid, cv=5,
                                scoring="neg_mean_squared_error")

In [28]:
rand_search.fit(housing_prepared, housing_labels)

In [29]:
rand_search.best_params_

{'kernel': 'linear', 'gamma': 'scale', 'C': 3}

In [40]:
rand = rand_search.cv_results_
for mean_score, params in zip(rand["mean_test_score"], rand['params']):
    print(np.sqrt(-mean_score), params)

112571.06378605746 {'kernel': 'linear', 'gamma': 'scale', 'C': 1}
107140.2596216261 {'kernel': 'linear', 'gamma': 'auto', 'C': 2}
112571.06378605746 {'kernel': 'linear', 'gamma': 'auto', 'C': 1}
118639.96425886606 {'kernel': 'rbf', 'gamma': 'auto', 'C': 1}
107140.2596216261 {'kernel': 'linear', 'gamma': 'scale', 'C': 2}
118370.28672307904 {'kernel': 'rbf', 'gamma': 'auto', 'C': 2}
102376.3300964536 {'kernel': 'linear', 'gamma': 'scale', 'C': 3}
102376.3300964536 {'kernel': 'linear', 'gamma': 'auto', 'C': 3}
118094.00233470283 {'kernel': 'rbf', 'gamma': 'auto', 'C': 3}
118638.40200558837 {'kernel': 'rbf', 'gamma': 'scale', 'C': 1}


In [42]:
print(np.sqrt(-rand_search.best_score_),np.sqrt(-grid_search.best_score_))

102376.3300964536 102376.3300964536


Try adding a transformer in the preparation pipeline to select only the most important attributes.

In [47]:
feature_importances = grid_search.

In [48]:
feature_importances

[{'kernel': ['linear', 'rbf'], 'C': [1, 2, 3], 'gamma': ['auto', 'scale']}]