In [1]:
import sklearn
from sklearn.model_selection import train_test_split

from pathlib import Path
import pandas as pd
import numpy as np
import tarfile
import urllib.request

from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted


def load_housing_data():
    tarball_path = Path("datasets/housing.tgz")
    if not tarball_path.is_file():
        Path("datasets").mkdir(parents=True, exist_ok=True)
        url = "https://github.com/ageron/data/raw/main/housing.tgz"
        urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path="datasets")
    return pd.read_csv(Path("datasets/housing/housing.csv"))


housing = load_housing_data()
housing["income_cat"] = pd.cut(housing["median_income"], bins=[0., 1.5, 3.0, 4.5, 6., np.inf], labels=[1, 2, 3, 4, 5])

strat_train_set, strat_test_set = train_test_split(housing, test_size=0.2, stratify=housing["income_cat"],
                                                   random_state=42)
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [2]:
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import rbf_kernel


class ClusterSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        self.kmeans_ = KMeans(self.n_clusters, random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self  # always return self!

    def transform(self, X):
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]

In [3]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer


def column_ratio(X):
    return X[:, [0]] / X[:, [1]]


def ratio_name(function_transformer, feature_names_in):
    return ["ratio"]  # feature names out


def ratio_pipeline():
    return make_pipeline(
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler())


log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler())
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)
default_num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
)

preprocessing = ColumnTransformer([
    ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
    ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
    ("people_per_house", ratio_pipeline(), ["population", "households"]),
    ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population",
                           "households", "median_income"]),
    ("geo", cluster_simil, ["latitude", "longitude"]),
    ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
],
    remainder=default_num_pipeline)  # one column remaining: housing_median_age

## 1

Exercise: _Try a Support Vector Machine regressor (`sklearn.svm.SVR`) with various hyperparameters, such as `kernel="linear"` (with various values for the `C` hyperparameter) or `kernel="rbf"` (with various values for the `C` and `gamma` hyperparameters). Note that SVMs don't scale well to large datasets, so you should probably train your model on just the first 5,000 instances of the training set and use only 3-fold cross-validation, or else it will take hours. Don't worry about what the hyperparameters mean for now (see the SVM notebook if you're interested). How does the best `SVR` predictor perform?_

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline

param_grid = [
    {'svr__kernel': ['linear'], 'svr__C': [10., 30., 100., 300., 1000.,
                                           3000., 10000., 30000.0]},
    {'svr__kernel': ['rbf'], 'svr__C': [1.0, 3.0, 10., 30., 100., 300.,
                                        1000.0],
     'svr__gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},
]

svr_pipeline = Pipeline([("preprocessing", preprocessing), ("svr", SVR())])
grid_search = GridSearchCV(svr_pipeline, param_grid, cv=3, scoring='neg_root_mean_squared_error')
grid_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

In [5]:
grid_search.best_params_

{'svr__C': 10000.0, 'svr__kernel': 'linear'}

In [6]:
svr_grid_search_rmse = -grid_search.best_score_
svr_grid_search_rmse

69062.0651706999

## 2

Exercise: _Try replacing the `GridSearchCV` with a `RandomizedSearchCV`._

In [7]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, reciprocal, expon

param_distribs = {
    'svr__kernel': ['linear', 'rbf'],
    'svr__C': reciprocal(20, 200_000),
    'svr__gamma': expon(scale=1.0),
}

rnd_search = RandomizedSearchCV(
    svr_pipeline, param_distributions=param_distribs, n_iter=10, cv=3,
    scoring='neg_root_mean_squared_error', random_state=42)

rnd_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

In [8]:
svr_grid_search_rmse = -rnd_search.best_score_
svr_grid_search_rmse

56313.77847736149

In [9]:
rnd_search.best_params_

{'svr__C': 157055.10989448498,
 'svr__gamma': 0.26497040005002437,
 'svr__kernel': 'rbf'}

## 3

Exercise: _Try adding a `SelectFromModel` transformer in the preparation pipeline to select only the most important attributes._

In [10]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

select_from_model = SelectFromModel(RandomForestRegressor(), threshold=0.01)
new_pipeline = Pipeline([("preprocessing", preprocessing), ("selection", select_from_model),
                         ("svr", SVR(kernel="rbf", gamma=0.26497040005002437, C=57055.10989448498))])

new_pipeline.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

In [11]:
out_features = preprocessing.get_feature_names_out()[select_from_model.get_support()]

In [12]:
out_features

array(['bedrooms__ratio', 'rooms_per_house__ratio',
       'people_per_house__ratio', 'log__total_bedrooms',
       'log__total_rooms', 'log__median_income',
       'geo__Cluster 0 similarity', 'geo__Cluster 2 similarity',
       'geo__Cluster 3 similarity', 'geo__Cluster 4 similarity',
       'geo__Cluster 5 similarity', 'geo__Cluster 7 similarity',
       'geo__Cluster 8 similarity', 'cat__ocean_proximity_INLAND',
       'remainder__housing_median_age'], dtype=object)

In [13]:
from sklearn.model_selection import cross_val_score

selector_rmses = -cross_val_score(new_pipeline,
                                  housing.iloc[:5000],
                                  housing_labels.iloc[:5000],
                                  scoring="neg_root_mean_squared_error",
                                  cv=3)
pd.Series(selector_rmses).describe()

count        3.000000
mean     56551.290740
std       1689.944626
min      55159.645145
25%      55611.052416
50%      56062.459687
75%      57247.113537
max      58431.767387
dtype: float64

## 4

Exercise: _Try creating a custom transformer that trains a k-Nearest Neighbors regressor (`sklearn.neighbors.KNeighborsRegressor`) in its `fit()` method, and outputs the model's predictions in its `transform()` method. Then add this feature to the preprocessing pipeline, using latitude and longitude as the inputs to this transformer. This will add a feature in the model that corresponds to the housing median price of the nearest districts._

In [62]:
from sklearn.neighbors import KNeighborsRegressor
import numpy as np


class KNeighborsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, n_neighbors: int = 5):
        self.n_neighbors = n_neighbors
        self.regressor_ = None

    def fit(self, X, y=None, sample_weight=None):
        self.regressor_ = KNeighborsRegressor(n_neighbors=self.n_neighbors)
        self.regressor_.fit(X, y)
        return self

    def transform(self, X):
        arr = self.regressor_.predict(X)
        return arr[:, np.newaxis]

    def get_feature_names_out(self, names=None):
        return ["district_avg_price"]

knn = KNeighborsTransformer()

In [64]:
res = knn.fit_transform(housing[['longitude', 'latitude']], housing_labels)
pd.DataFrame(res, columns=knn.get_feature_names_out())

Unnamed: 0,district_avg_price
0,447500.6
1,366500.0
2,108880.0
3,106320.0
4,312160.0
...,...
16507,452020.2
16508,92600.0
16509,111020.0
16510,500001.0


In [65]:
preprocessing2 = ColumnTransformer([
    ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
    ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
    ("people_per_house", ratio_pipeline(), ["population", "households"]),
    ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population",
                           "households", "median_income"]),
    ("geo", cluster_simil, ["latitude", "longitude"]),
    ("knn", knn, ["latitude", "longitude"]),
    ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
],
    remainder=default_num_pipeline)  # one column remaining: housing_median_age

In [66]:
transformed = preprocessing2.fit_transform(housing, housing_labels)

In [67]:
pd.DataFrame(transformed, columns=preprocessing2.get_feature_names_out())

Unnamed: 0,bedrooms__ratio,rooms_per_house__ratio,people_per_house__ratio,log__total_bedrooms,log__total_rooms,log__population,log__households,log__median_income,geo__Cluster 0 similarity,geo__Cluster 1 similarity,...,geo__Cluster 8 similarity,geo__Cluster 9 similarity,knn__district_avg_price,cat__ocean_proximity_<1H OCEAN,cat__ocean_proximity_INLAND,cat__ocean_proximity_ISLAND,cat__ocean_proximity_NEAR BAY,cat__ocean_proximity_NEAR OCEAN,remainder__housing_median_age,remainder__income_cat
0,1.846624,-0.866027,-0.330204,1.324114,0.637892,0.456906,1.310369,-1.071522,3.788947e-18,1.319640e-01,...,7.073518e-08,5.824761e-01,447500.6,0.0,0.0,0.0,1.0,0.0,1.861119,-0.954687
1,-0.508121,0.024550,-0.253616,-0.252671,-0.063576,-0.711654,-0.142030,1.194712,3.346712e-01,1.735776e-09,...,1.275435e-01,2.078829e-10,366500.0,1.0,0.0,0.0,0.0,0.0,0.907630,1.890078
2,-0.202155,-0.041193,-0.051041,-0.925266,-0.859927,-0.941997,-0.913030,-0.756981,2.874903e-18,2.749482e-01,...,1.871204e-08,2.896858e-01,108880.0,0.0,1.0,0.0,0.0,0.0,0.351428,-0.954687
3,-0.149006,-0.034858,-0.141475,0.952773,0.943475,0.670700,0.925373,-0.912253,8.832834e-01,2.237307e-14,...,5.336846e-04,3.778950e-16,106320.0,0.0,1.0,0.0,0.0,0.0,-0.919891,-0.954687
4,0.963208,-0.666554,-0.306148,1.437622,1.003590,0.719093,1.481464,0.034537,4.893551e-01,2.891966e-11,...,3.490774e-02,3.361020e-12,312160.0,0.0,0.0,0.0,0.0,1.0,0.589800,-0.006432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,0.804368,-0.452111,-0.269780,0.465581,0.109720,-0.247054,0.379471,0.648953,3.180854e-01,2.306753e-10,...,8.255537e-02,3.886016e-11,452020.2,1.0,0.0,0.0,0.0,0.0,0.987087,0.941823
16508,-0.192328,0.036792,-0.073741,0.454022,0.477277,0.314542,0.402773,-0.637675,1.025197e-04,6.690741e-03,...,1.092970e-01,4.309336e-04,92600.0,0.0,1.0,0.0,0.0,0.0,-0.443146,-0.954687
16509,-0.242492,-0.109987,0.158542,0.824206,0.859552,1.243381,0.889897,0.333540,1.209845e-01,3.347757e-19,...,8.126491e-06,1.811644e-20,111020.0,0.0,0.0,0.0,0.0,1.0,-1.237721,-0.006432
16510,0.259775,-0.360937,-0.210332,0.987851,0.811293,0.579462,1.023329,0.377051,3.252711e-01,1.067490e-09,...,1.185436e-01,1.447641e-10,500001.0,1.0,0.0,0.0,0.0,0.0,0.669257,-0.006432


## 6

Exercise: _Try to implement the `StandardScalerClone` class again from scratch, then add support for the `inverse_transform()` method: executing `scaler.inverse_transform(scaler.fit_transform(X))` should return an array very close to `X`. Then add support for feature names: set `feature_names_in_` in the `fit()` method if the input is a DataFrame. This attribute should be a NumPy array of column names. Lastly, implement the `get_feature_names_out()` method: it should have one optional `input_features=None` argument. If passed, the method should check that its length matches `n_features_in_`, and it should match `feature_names_in_` if it is defined, then `input_features` should be returned. If `input_features` is `None`, then the method should return `feature_names_in_` if it is defined or `np.array(["x0", "x1", ...])` with length `n_features_in_` otherwise._

In [103]:
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted

class StandardScalerClone(BaseEstimator, TransformerMixin):
    def __init__(self, demean: bool = True):
        self.demean = demean
        self.mean_ = None
        self.scale_ = None
        self.n_features_in_ = 0
        self.feature_names_in_ = None

    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns.to_numpy()

        X = check_array(X)
        self.mean_ = X.mean(axis=0)
        self.scale_ = X.std(axis=0)
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X):
        X = self._check_can_transform(X)
        if self.demean:
            X = X - self.mean_
        return X / self.scale_

    def inverse_transform(self, X):
        X = self._check_can_transform(X)
        X = X * self.scale_
        if self.demean:
            X = X + self.mean_
        return X

    def _check_can_transform(self, X):
        check_is_fitted(self)
        X = check_array(X)
        if self.n_features_in_ != X.shape[1]:
            raise ValueError(f"Expected {self.n_features_in_} columns, got {X.shape[1]}")
        return X

    def get_feature_names_out(self, feature_names_in=None):
        if feature_names_in is not None:
            if len(feature_names_in) != self.n_features_in_:
                raise ValueError(f"Expected {self.n_features_in_} names, got {len(feature_names_in)}")
            if self.feature_names_in_ is not None and not np.all(feature_names_in == self.feature_names_in_):
                raise ValueError(f"Expected {self.feature_names_in_}, got {feature_names_in}")
            out = feature_names_in
        elif self.feature_names_in_ is not None:
            out = self.feature_names_in_
        else:
            out = [f"x{i}" for i in range(self.n_features_in_)]

        return out

In [104]:
scaler = StandardScalerClone()
longitude_ = housing.reset_index().loc[:, ["longitude"]]
scaler.fit(longitude_)
scaled = scaler.transform(longitude_)
inverted = pd.DataFrame(scaler.inverse_transform(scaled), columns=["longitude"])
pd.testing.assert_frame_equal(longitude_, inverted, check_index_type=False)