# Load Library

In [2]:
import pandas as pd
import os
import numpy as np

# Load Data

In [3]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

In [4]:
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [5]:
housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


# Split into train test

In [6]:
housing['income_cat'] = pd.cut(housing['median_income'], bins=[0, 1.5, 3.0, 4.5, 6.0, np.inf], labels=[1,2,3,4,5])

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['income_cat']):
    train = housing.loc[train_index]
    test = housing.loc[test_index]

In [8]:
for set_ in (train, test):
    set_.drop("income_cat", axis=1, inplace=True)

In [9]:
train_labels = train["median_house_value"].copy()
train = train.drop("median_house_value", axis=1)

test_labels = test["median_house_value"].copy()
test = test.drop("median_house_value", axis=1)

# Prepare the data

In [14]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())
])

from sklearn.compose import ColumnTransformer

train_num = train.drop('ocean_proximity', axis=1)
num_attribs = list(train_num)
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs)
])

In [18]:
train_prepared = full_pipeline.fit_transform(train)

In [20]:
from sklearn.svm import SVR

regr = SVR(C=1.0, kernel='linear')
regr.fit(train_prepared, train_labels)

SVR(kernel='linear')

In [23]:
from sklearn.model_selection import cross_val_score

scores1 = cross_val_score(regr, train_prepared, train_labels, scoring='neg_mean_squared_error')
scores1_rmse = np.sqrt(-scores1)

In [27]:
regr2 = SVR(C=3.0, kernel='rbf')
regr2.fit(train_prepared, train_labels)
scores2 = cross_val_score(regr, train_prepared, train_labels, scoring='neg_mean_squared_error')
scores2_rmse = np.sqrt(-scores2)

In [28]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [29]:
display_scores(scores1_rmse)
display_scores(scores2_rmse)

Scores: [109554.16396803 112422.0947246  113970.08259411 113427.52864218
 113430.29743241]
Mean: 112560.8334722674
Standard deviation: 1584.4507648248702
Scores: [109554.16396803 112422.0947246  113970.08259411 113427.52864218
 113430.29743241]
Mean: 112560.8334722674
Standard deviation: 1584.4507648248702
