In [1]:
import numpy as np
import pandas as pd

In [2]:
import os

# Location of the raw housing CSV, relative to the notebook's working directory.
housing_path = os.path.join(os.pardir, "datasets", "housing", "housing.csv")
# Load the whole file into a DataFrame with pandas' default parsing options.
housing_data = pd.read_csv(filepath_or_buffer=housing_path)

In [3]:
# Bare last expression: the notebook renders the loaded DataFrame as the cell output.
housing_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [4]:
# Bucket median_income into five labelled bins (1..5); the last bin is
# open-ended upwards. The new column is used as the stratification key for the
# train/test split.
housing_data["income_cat"] = pd.cut(
    x=housing_data["median_income"],
    labels=[1, 2, 3, 4, 5],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, float("inf")],
)

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on income_cat keeps the
# income bucket proportions aligned between the two splits, and random_state
# pins the shuffle so the split is reproducible.
train_set, test_set = train_test_split(
    housing_data, test_size=0.2, random_state=42, stratify=housing_data["income_cat"]
)

# income_cat only exists to drive the stratification, so remove it from both
# splits. Rebinding to the result of drop() (rather than looping with a
# variable called `set` and inplace=True) leaves the builtin `set` intact for
# later cells and avoids in-place mutation of frames derived from housing_data.
train_set = train_set.drop(columns="income_cat")
test_set = test_set.drop(columns="income_cat")

In [6]:
# Separate the predictors from the regression target. drop() returns a new
# frame, and the target Series is copied, so train_set itself is left as-is.
housing: pd.DataFrame = train_set.drop(columns="median_house_value")
housing_labels = train_set.median_house_value.copy()

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions inside the 2-D feature array handed to
# CombineAttributeAdder.transform (rooms, bedrooms, population, households).
rooms_idx = 3
bedrooms_idx = 4
population_idx = 5
households_idx = 6


class CombineAttributeAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D feature array.

    The columns to combine are located through the module-level indices
    ``rooms_idx``, ``bedrooms_idx``, ``population_idx`` and ``households_idx``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When true, a third ratio column (bedrooms / rooms) is appended after
        rooms / households and population / households.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from ``X`` or ``y``."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns concatenated on the right.

        ``X`` is indexed as ``X[:, idx]``, so it must be a 2-D array-like that
        supports that form of indexing (e.g. a NumPy array).
        """
        rooms = X[:, rooms_idx]
        households = X[:, households_idx]

        # Always-present ratios, in output order.
        extra_columns = [rooms / households, X[:, population_idx] / households]

        # Optional third ratio, controlled by the constructor flag.
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_idx] / rooms)

        # Indexing np.c_ with a tuple is the same as the comma-separated form.
        return np.c_[(X, *extra_columns)]


# Try the transformer on its own, outside any pipeline, feeding it the raw
# NumPy view of the training features.
attr_adder = CombineAttributeAdder(add_bedrooms_per_room=True)
housing_extra_attribs = attr_adder.fit_transform(housing.to_numpy())

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Numeric preprocessing chain, applied in order: fill missing values with the
# column mean, append the ratio features, then standardize the columns.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("attr_adder", CombineAttributeAdder()),
        ("std_scaler", StandardScaler()),
    ]
)

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Partition the feature frame by dtype: numeric columns vs. everything else.
housing_num = housing.select_dtypes(include=np.number)
housing_cat = housing.select_dtypes(exclude=np.number)

# Column labels that route each group to its own transformer.
num_attr = housing_num.columns.tolist()
cat_attr = housing_cat.columns.tolist()

# Numeric columns go through num_pipeline; the remaining columns are one-hot
# encoded.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attr),
        ("cat", OneHotEncoder(), cat_attr),
    ]
)

In [10]:
# Fit the full preprocessing (numeric pipeline + one-hot encoder) on the
# training features and transform them in the same call.
housing_prepared = full_pipeline.fit_transform(housing)

In [11]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Baseline: an SVR with default hyperparameters, scored on the same data it
# was fitted on, so this RMSE is a training error rather than an estimate of
# generalization performance.
svr = SVR()
svr.fit(housing_prepared, housing_labels)
svr_predict = svr.predict(housing_prepared)
# scikit-learn metrics take (y_true, y_pred). MSE is symmetric, so the value is
# the same either way, but the documented order keeps the call correct if the
# metric is swapped or sample weights are added later.
svr_rmse = np.sqrt(mean_squared_error(housing_labels, svr_predict))
svr_rmse

118577.65197235334

In [12]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: linear kernel only, with C swept over three
# decades.
param_grid = {"kernel": ["linear"], "C": [10_000.0, 100_000.0, 1_000_000.0]}


# 5-fold cross-validated search. The scorer is the negated MSE, so a higher
# score means a lower error.
grid_search = GridSearchCV(
    estimator=svr,
    param_grid=param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
)

grid_search.fit(housing_prepared, housing_labels)

In [None]:
# Hyperparameter combination with the best mean cross-validated score.
grid_search.best_params_

{'C': 10000.0, 'kernel': 'linear'}

In [None]:
# The search was scored with "neg_mean_squared_error", so flip the sign back
# to an MSE before taking the square root to obtain the cross-validated RMSE.
np.sqrt(-grid_search.best_score_)

70593.18919824316