# Data loading and preprocessing

The setup code in this section is copied from the chapter.

In [1]:
import tarfile
import urllib.request
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.metrics.pairwise import rbf_kernel

def load_housing_data():
    """Load the housing dataset into a DataFrame, fetching it if necessary.

    The archive is downloaded to ``datasets/housing.tgz`` only when that file
    is not already present, and it is unpacked only when
    ``datasets/housing/housing.csv`` is missing. Keying the extraction on the
    CSV (rather than on the archive) means a previously downloaded but never
    unpacked archive is still extracted instead of failing in ``read_csv``.

    Returns
    -------
    pandas.DataFrame
        The contents of ``datasets/housing/housing.csv``.
    """
    tarball_path = Path("datasets/housing.tgz")
    csv_path = Path("datasets/housing/housing.csv")
    if not csv_path.is_file():
        if not tarball_path.is_file():
            Path("datasets").mkdir(parents=True, exist_ok=True)
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path="datasets")
    return pd.read_csv(csv_path)

# Load the full dataset. Note that a later cell rebinds `housing` to the
# training-set features only, so this name does not keep the raw frame.
housing = load_housing_data()

In [2]:
from sklearn.model_selection import train_test_split

# Bucket median income into five categories so the split can be stratified
# on them.
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

strat_train_set, strat_test_set = train_test_split(
    housing, test_size=0.2, stratify=housing["income_cat"], random_state=42)

# income_cat was only needed for stratification. Rebind the names to new
# frames rather than dropping in place: the split results are derived from
# `housing`, and in-place edits on derived frames are what pandas'
# SettingWithCopyWarning guards against.
strat_train_set = strat_train_set.drop(columns="income_cat")
strat_test_set = strat_test_set.drop(columns="income_cat")

In [3]:
# Split the training set into the target column and the predictor columns.
# The labels are copied so they are independent of strat_train_set.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns="median_house_value")

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Turn samples into RBF similarities to k-means cluster centers.

    Parameters
    ----------
    n_clusters : int, default=10
        Number of clusters given to ``KMeans``; also the number of output
        feature names.
    gamma : float, default=1.0
        ``gamma`` argument forwarded to ``rbf_kernel``.
    random_state : default=None
        Forwarded to ``KMeans``.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit a ``KMeans`` model on X (optionally weighted) and keep it as ``kmeans_``."""
        self.kmeans_ = KMeans(
            n_clusters=self.n_clusters,
            n_init=10,
            random_state=self.random_state,
        )
        self.kmeans_.fit(X, sample_weight=sample_weight)
        # Returning the estimator itself is what lets callers chain
        # fit(...).transform(...).
        return self

    def transform(self, X):
        """Return the RBF kernel between X and the fitted cluster centers."""
        centers = self.kmeans_.cluster_centers_
        return rbf_kernel(X, centers, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one name per cluster; ``names`` is accepted but not used."""
        feature_names = []
        for i in range(self.n_clusters):
            feature_names.append(f"Cluster {i} similarity")
        return feature_names

In [5]:
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import rbf_kernel

def column_ratio(X):
    """Divide column 0 of X by column 1, element-wise.

    X must support two-dimensional positional indexing (for example a NumPy
    array). Indexing with a one-element list keeps the column axis, so the
    result keeps a single column instead of collapsing to one dimension.
    """
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
    """Feature-names callback: always a single output feature called "ratio".

    Both arguments are ignored; they are present because the function is
    passed as ``feature_names_out`` to a ``FunctionTransformer``.
    """
    names_out = ["ratio"]
    return names_out

def ratio_pipeline():
    """Build a new impute -> column ratio -> standardize pipeline.

    Each call constructs fresh, unfitted steps.
    """
    imputer = SimpleImputer(strategy="median")
    ratio = FunctionTransformer(column_ratio, feature_names_out=ratio_name)
    scaler = StandardScaler()
    return make_pipeline(imputer, ratio, scaler)

# Fallback for numeric columns with no dedicated transformer:
# median-impute, then standardize.
default_num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Median-impute, take the natural log, then standardize.
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler(),
)

# Fill missing categories with the most frequent one, then one-hot encode;
# categories unseen during fit are ignored at transform time.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)

preprocessing = ColumnTransformer(
    transformers=[
        ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population",
                               "households", "median_income"]),
        ("geo", cluster_simil, ["latitude", "longitude"]),
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=default_num_pipeline,  # one column remaining: housing_median_age
)

#### 1. Try a support vector machine regressor (sklearn.svm.SVR) with various hyperparameters, such as kernel="linear" (with various values for the C hyperparameter) or kernel="rbf" (with various values for the C and gamma hyperparameters). Note that support vector machines don’t scale well to large datasets, so you should probably train your model on just the first 5,000 instances of the training set and use only 3-fold cross-validation, or else it will take hours. Don’t worry about what the hyperparameters mean for now; we’ll discuss them in Chapter 5. How does the best SVR predictor perform?

In [8]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR

# Linear-kernel SVR with default hyperparameters, scored by 3-fold CV on the
# first 5,000 training rows (the exercise's suggested cap for SVR).
svr_linear = make_pipeline(preprocessing, SVR(kernel="linear"))
lin_rmses = -cross_val_score(
    svr_linear,
    housing.iloc[:5000],
    housing_labels.iloc[:5000],
    scoring="neg_root_mean_squared_error",
    cv=3,
)
pd.Series(lin_rmses).describe()

count         3.000000
mean     117853.287772
std        3084.317205
min      114504.103147
25%      116491.531325
50%      118478.959503
75%      119527.880084
max      120576.800665
dtype: float64

In [7]:
# Same evaluation as the linear baseline, with the RBF kernel and default
# hyperparameters.
svr_rbf = make_pipeline(preprocessing, SVR(kernel="rbf"))
rbf_rmses = -cross_val_score(
    svr_rbf,
    housing.iloc[:5000],
    housing_labels.iloc[:5000],
    scoring="neg_root_mean_squared_error",
    cv=3,
)
pd.Series(rbf_rmses).describe()

count         3.000000
mean     119704.447822
std        3233.889528
min      116199.308030
25%      118270.592754
50%      120341.877478
75%      121457.017719
max      122572.157959
dtype: float64

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Name the steps explicitly so hyperparameters can be addressed as
# "predictor__<param>" in the search grids.
grid_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessing),
    ("predictor", SVR()),
])

# Two sub-grids: gamma is only searched for the RBF kernel.
param_grid = [
    dict(predictor__kernel=["linear"],
         predictor__C=[1, 5, 10, 20]),
    dict(predictor__kernel=["rbf"],
         predictor__C=[1, 5, 10, 20],
         predictor__gamma=["scale", "auto", 0.1, 0.5]),
]

grid_search = GridSearchCV(
    estimator=grid_pipeline,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=3,
)
grid_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

Best hyperparameters

In [9]:
# Hyperparameter combination with the best mean cross-validated score.
grid_search.best_params_

{'predictor__C': 20, 'predictor__kernel': 'linear'}

#### 2. Try replacing the GridSearchCV with a RandomizedSearchCV.

In [28]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Search space for the randomized search: C is drawn from the integers
# 1..49 (randint's `high` is exclusive); gamma is only sampled for the RBF
# kernel. NOTE: this rebinds `param_grid`, replacing the grid-search grid.
param_grid = [
    {'predictor__kernel':['linear'],
     'predictor__C': randint(low=1, high=50)},
    {'predictor__kernel':['rbf'],
     'predictor__C': randint(low=1, high=50),
     'predictor__gamma':['scale', 'auto', 0.1, 0.5, 1.0]}
]

# random_state fixes which candidates are sampled, so the reported best
# parameters can be reproduced on a re-run (same seed as the rest of the
# notebook).
random_search = RandomizedSearchCV(grid_pipeline, param_grid, cv=3,
                                   scoring='neg_root_mean_squared_error',
                                   random_state=42)
random_search.fit(housing[:5000], housing_labels[:5000])

In [29]:
# Best sampled hyperparameter combination found by the randomized search.
random_search.best_params_

{'predictor__C': 48, 'predictor__kernel': 'linear'}

#### 3. Try adding a SelectFromModel transformer in the preparation pipeline to select only the most important attributes.

In [27]:
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeRegressor

# Same column handling as `preprocessing`, except that housing_median_age is
# listed explicitly instead of being picked up through `remainder`.
preprocessing_columns = ColumnTransformer([
        ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population",
                               "households", "median_income"]),
        ("geo", cluster_simil, ["latitude", "longitude"]),
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
        ("median_age", default_num_pipeline, ["housing_median_age"])
])

# Keep only features whose importance in a fitted decision tree reaches
# 0.01. The tree is seeded because an unseeded tree may choose different
# splits between runs, which would change the selected columns.
preprocessing_fs = make_pipeline(
    preprocessing_columns,
    SelectFromModel(DecisionTreeRegressor(random_state=42), threshold=0.01)
)

In [29]:
# Reference shape: rows x feature columns produced without feature selection.
preprocessing.fit_transform(housing).shape

(16512, 24)

In [30]:
# Shape after feature selection. The labels are passed because the selector's
# tree is fitted against them.
preprocessing_fs.fit_transform(housing, housing_labels).shape

(16512, 15)

In [28]:
# Linear SVR on the feature-selected inputs. C=48 is the value reported by
# the randomized search's best_params_ output above.
svr_linear_fs = make_pipeline(preprocessing_fs, SVR(kernel="linear", C=48))
lin_rmses_fs = -cross_val_score(
    svr_linear_fs,
    housing.iloc[:5000],
    housing_labels.iloc[:5000],
    scoring="neg_root_mean_squared_error",
    cv=3,
)
pd.Series(lin_rmses_fs).describe()

count        3.000000
mean     86620.185070
std       1864.042158
min      84479.138697
25%      85989.276352
50%      87499.414008
75%      87690.708256
max      87882.002504
dtype: float64

#### 4. Try creating a custom transformer that trains a k-nearest neighbors regressor (sklearn.neighbors.KNeighborsRegressor) in its fit() method, and outputs the model’s predictions in its transform() method. Then add this feature to the preprocessing pipeline, using latitude and longitude as the inputs to this transformer. This will add a feature in the model that corresponds to the housing median price of the nearest districts.

In [77]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsRegressor

class KNRTransformer(BaseEstimator, TransformerMixin):
    """Transformer whose output is a regressor's predictions.

    Parameters
    ----------
    estimator : regressor or None, default=None
        Unfitted model to train in ``fit``. ``None`` means a default
        ``KNeighborsRegressor()``.

    Attributes
    ----------
    estimator_ : regressor
        Fitted clone of ``estimator`` (or of the default regressor), set by
        ``fit``.
    """

    def __init__(self, estimator=None):
        # Store the argument untouched under its own name: BaseEstimator's
        # get_params()/clone() read constructor parameters back as
        # attributes, and ColumnTransformer clones its transformers on fit.
        self.estimator = estimator

    def fit(self, X, y):
        """Fit a fresh clone of the regressor on (X, y) and return self."""
        base = KNeighborsRegressor() if self.estimator is None else self.estimator
        # Fit a clone so the constructor argument itself is never mutated.
        self.estimator_ = clone(base)
        self.estimator_.fit(X, y)
        return self

    def transform(self, X):
        """Return the fitted regressor's predictions for X as a 2-D array."""
        predictions = self.estimator_.predict(X)
        # A single-target regressor predicts a 1-D array; transformers must
        # output columns, so reshape it to (n_samples, 1).
        if predictions.ndim == 1:
            predictions = predictions.reshape(-1, 1)
        return predictions

In [82]:
# Explicit-column preprocessing plus one extra feature: a k-nearest-neighbors
# prediction computed from latitude/longitude.
preprocessing_columns_knr = ColumnTransformer([
    ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
    ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
    ("people_per_house", ratio_pipeline(), ["population", "households"]),
    ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population",
                           "households", "median_income"]),
    ("geo", cluster_simil, ["latitude", "longitude"]),
    ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ("median_age", default_num_pipeline, ["housing_median_age"]),
    # Pass the regressor explicitly, matching the constructor's `estimator`
    # parameter.
    ("knr", KNRTransformer(KNeighborsRegressor()), ["latitude", "longitude"])
])

# Seed the selector's tree so the set of retained features is reproducible.
preprocessing_knr = make_pipeline(
    preprocessing_columns_knr,
    SelectFromModel(DecisionTreeRegressor(random_state=42), threshold=0.01)
)

In [83]:
# Shape after adding the KNN feature and re-running feature selection; the
# labels are needed by both the KNN transformer and the selector.
preprocessing_knr.fit_transform(housing, housing_labels).shape

(16512, 5)

#### 5. Automatically explore some preparation options using GridSearchCV.

#### 6. Try to implement the StandardScalerClone class again from scratch, then add support for the inverse_transform() method: executing scaler.inverse_transform(scaler.fit_transform(X)) should return an array very close to X. Then add support for feature names: set feature_names_in_ in the fit() method if the input is a DataFrame. This attribute should be a NumPy array of column names. Lastly, implement the get_feature_names_out() method: it should have one optional input_features=None argument. If passed, the method should check that its length matches n_features_in_, and it should match feature_names_in_ if it is defined; then input_features should be returned. If input_features is None, then the method should either return feature_names_in_ if it is defined or np.array(["x0", "x1", ...]) with length n_features_in_ otherwise.