In [16]:
import numpy as np
import pandas as pd

In [17]:
import os

housing_path = os.path.join("..", "datasets", "housing", "housing.csv")
housing_data = pd.read_csv(housing_path)

In [18]:
housing_data["income_cat"] = pd.cut(
    housing_data["median_income"],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5],
)

In [19]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(
    housing_data, test_size=0.2, random_state=42, stratify=housing_data["income_cat"]
)

for set in (train_set, test_set):
    set.drop("income_cat", axis=1, inplace=True)

In [20]:
X_train: pd.DataFrame = train_set.drop("median_house_value", axis=1)
y_train: pd.DataFrame = train_set["median_house_value"]

X_test: pd.DataFrame = test_set.drop("median_house_value", axis=1)
y_test: pd.DataFrame = test_set["median_house_value"]

In [21]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.preprocessing import OneHotEncoder


class ClusterSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters, gamma, random_state) -> None:
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        self.kmeans_ = KMeans(self.n_clusters, random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self  # always return self

    def transform(self, X):
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, name=None):
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]


def column_ratio(X):
    return X[:, [0]] / X[:, [1]]


def ratio_name(function_transformer, feature_names_in):
    return ["ratio"]  # features_name_out_ for ration pipeline


def ratio_pipeline():
    return make_pipeline(
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler(),
    )


cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore")
)

log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler(),
)

cluster_similarity = ClusterSimilarity(n_clusters=10, gamma=1.0, random_state=42)

default_num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

preprocessing = ColumnTransformer(
    [
        ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        (
            "log",
            log_pipeline,
            [
                "total_bedrooms",
                "total_rooms",
                "population",
                "households",
                "median_income",
            ],
        ),
        ("geo", cluster_similarity, ["latitude", "longitude"]),
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=default_num_pipeline,
)  # one column remaining: housing_median_age

Rather than restrict ourselves to k-Nearest Neighbors regressors, let's create a transformer that accepts any regressor. For this, we can extend the `MetaEstimatorMixin` and have a required `estimator` argument in the constructor. The `fit()` method must work on a clone of this estimator, and it must also save `feature_names_in_`. The `MetaEstimatorMixin` will ensure that `estimator` is listed as a required parameters, and it will update `get_params()` and `set_params()` to make the estimator's hyperparameters available for tuning. Lastly, we create a `get_feature_names_out()` method: the output column name is the ...

In [22]:
from sklearn.base import TransformerMixin, BaseEstimator, MetaEstimatorMixin, clone
from sklearn.utils.validation import check_is_fitted
from sklearn.neighbors import KNeighborsRegressor


class FeatureFromRegressor(BaseEstimator, TransformerMixin, MetaEstimatorMixin):
    def __init__(self, estimator) -> None:
        self.estimator = estimator

    def fit(self, X, y):
        estimator_: BaseEstimator = clone(self.estimator)
        estimator_.fit(X, y)
        self.estimator_ = estimator_
        self.n_features_in_ = self.estimator_.n_features_in_
        if hasattr(self.estimator_, "feature_names_in_"):
            self.feature_names_in_ = self.estimator_.feature_names_in_
        return self

    def transform(self, X):
        check_is_fitted(self)
        predictions = self.estimator_.predict(X)
        # Because sklearn prefer input as an 2D array, not 1D, it is a good idea to reshape it to a 2D array.
        if predictions.ndim == 1:
            predictions = predictions.reshape(-1, 1)
        return predictions

    def get_feature_names_out_(self, names=None):
        check_is_fitted(self)
        n_outputs = getattr(self.estimator_, "n_outputs_", 1)
        estimator_class_name = self.estimator_.__class__.__name__
        estimator_short_name = estimator_class_name.lower().replace("_", "")
        return [f"{estimator_short_name}_prediction_{i}" for i in range(n_outputs)]

Let's check if it compiles to Scikit-Learn API:

In [23]:
from sklearn.utils.estimator_checks import check_estimator

check_estimator(FeatureFromRegressor(KNeighborsRegressor()))

Now, let test it!

In [24]:
knn_reg = KNeighborsRegressor(n_neighbors=3, weights="distance")
knn_transformer = FeatureFromRegressor(knn_reg)
geo_features = X_train[["latitude", "longitude"]]
knn_transformer.fit_transform(geo_features, y_train)

array([[ 68850.],
       [279600.],
       [ 79000.],
       ...,
       [135700.],
       [258100.],
       [ 62700.]])

In [25]:
knn_transformer.get_feature_names_out_()

['kneighborsregressor_prediction_0']

In [26]:
transformers = [
    (name, clone(transformer), columns)
    for name, transformer, columns in preprocessing.transformers
]
geo_index = [name for (name, _, _) in transformers].index("geo")
transformers[geo_index] = ("geo", knn_transformer, ["latitude", "longitude"])

new_geo_preprocessing = ColumnTransformer(transformers)

In [27]:
# rnd_search.best_params_ from housing notebook
svr_C = 157055.10989448498
svr_gamma = 0.26497040005002437
svr_kernel = "rbf"

In [28]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

new_geo_pipeline = Pipeline(
    [
        ("preprocessing", new_geo_preprocessing),
        ("svr", SVR(C=svr_C, gamma=svr_gamma, kernel=svr_kernel)),
    ]
)

In [29]:
from sklearn.model_selection import cross_val_score

new_pipe_rmses = -cross_val_score(
    new_geo_pipeline,
    X_train.iloc[:5000],
    y_train.iloc[:5000],
    scoring="neg_root_mean_squared_error",
    cv=3,
)
pd.Series(new_pipe_rmses).describe()

count         3.000000
mean     104487.511247
std        2940.605612
min      101582.902572
25%      102999.847802
50%      104416.793032
75%      105939.815585
max      107462.838138
dtype: float64

All the above code cells is from exercise_4 notebook.

Apparently, the model performs terribly! Hence, as the solution of exercise 5, we will automate the exploration process using RandomSearchCV.

In [30]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform, expon

param_distributions = {
    "preprocessing__geo__estimator__n_neighbors": range(1, 30),
    "preprocessing__geo__estimator__weights": ["distance", "uniform"],
    "svr__C": loguniform(20, 2e5),
    "svr__gamma": expon(scale=1.0),
}

geo_rnd_search = RandomizedSearchCV(
    new_geo_pipeline,
    param_distributions,
    n_iter=50,
    cv=3,
    random_state=42,
    scoring="neg_root_mean_squared_error",
)
geo_rnd_search.fit(X_train.iloc[:5000], y_train.iloc[:5000])

In [31]:
geo_rnd_search.best_params_

{'preprocessing__geo__estimator__n_neighbors': 20,
 'preprocessing__geo__estimator__weights': 'distance',
 'svr__C': 55456.48365602121,
 'svr__gamma': 0.006976409181650647}

In [32]:
-geo_rnd_search.best_score_

106367.2730733879

It seems like KNearestNeighbors performs worse than rbf_kernel in this case. Maybe we can try them both? And maybe training on the whole train set will help.