Make sure imblearn is installed:

In [1]:
!pip install imblearn



Note: if you are using scikit-learn 1.3, you need to downgrade it to 1.2.2 for the code below to work:

!pip uninstall scikit-learn --yes

!pip uninstall imblearn --yes

!pip install scikit-learn==1.2.2

!pip install imblearn

In [2]:
# Notebook-wide setup: logging, core imports, plotting theme, timing helpers,
# pandas display options and warning filters.

# only show ERROR-level messages from the "SKLEARNEX" logger
import logging
logging.basicConfig()
logging.getLogger("SKLEARNEX").setLevel(logging.ERROR)

import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split

# use the "Set2" colour palette for all seaborn plots
sns.set_theme(palette="Set2")

# helpers used to measure and pretty-print cell execution time
from timeit import default_timer as timer
from datetime import timedelta

# show up to 200 characters per cell when displaying dataframes
pd.set_option('display.max_colwidth', 200)

# silence warnings of category UserWarning (other categories are still shown)
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Load the data

In [3]:
import os
import tarfile
import urllib


# the file is a gzip-compressed tar archive (.tgz)
URL = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz"

def get_dataframe():
    """Download (if needed), unpack (if needed) and load the housing dataset.

    Files are cached in a local ``datasets`` folder, so each step is skipped
    when its result already exists:

    * ``datasets/housing.tgz`` is fetched from the module-level ``URL``;
    * ``datasets/housing.csv`` is extracted from that archive.

    Returns
    -------
    pandas.DataFrame
        The contents of ``datasets/housing.csv``.
    """
    # `import urllib` does not guarantee that the `request` submodule is
    # loaded, so import it explicitly before calling `urlretrieve`.
    import urllib.request

    archive_path = "datasets/housing.tgz"
    csv_path = "datasets/housing.csv"

    # create the "datasets" folder if it does not exist yet
    os.makedirs("datasets", exist_ok=True)

    # if the archived file does not exist, download it
    if not os.path.exists(archive_path):
        # download to a temporary name first so that an interrupted transfer
        # never leaves a truncated archive that later runs would trust
        partial_path = archive_path + ".part"
        urllib.request.urlretrieve(URL, partial_path)
        os.replace(partial_path, archive_path)

    # if the unpacked file does not exist, unpack it
    if not os.path.exists(csv_path):
        # the context manager closes the archive even if extraction fails
        with tarfile.open(archive_path) as archive:
            archive.extractall(path="datasets")

    # load the dataframe
    return pd.read_csv(csv_path)

df = get_dataframe()

# Train-test split

In [4]:
# Bucket the median income into five categories; they are used only to
# stratify the split below.
income_bins = [0., 1.5, 3.0, 4.5, 6., np.inf]
df["income_cat"] = pd.cut(df["median_income"], bins=income_bins, labels=[1, 2, 3, 4, 5])

trainset, testset = train_test_split(
    df,
    test_size=0.2,
    random_state=7,
    stratify=df["income_cat"],
)

# The helper column is no longer needed once the split is done.
for subset in (trainset, testset):
    del subset["income_cat"]

# Repeat initial preprocessing steps

## Feature engineering

In [5]:
# Rows whose "median_house_value" is at or above this value are treated as
# capped and removed from both sets.
HOUSE_VALUE_CAP = 500001


def add_engineered_features(frame):
    """Return a copy of ``frame`` with the engineered features applied.

    * adds ``rooms_per_household`` = ``total_rooms`` / ``households``
    * adds ``population_per_household`` = ``population`` / ``households``
    * renames the "ISLAND" category of ``ocean_proximity`` to "NEAR OCEAN"

    The input frame is left untouched.
    """
    return frame.assign(
        rooms_per_household=frame["total_rooms"] / frame["households"],
        population_per_household=frame["population"] / frame["households"],
        ocean_proximity=frame["ocean_proximity"].replace("ISLAND", "NEAR OCEAN"),
    )


# the same transformation is applied to both sets
trainset = add_engineered_features(trainset)
testset = add_engineered_features(testset)

# delete capped values

print(f"Before: {trainset.shape[0]} train instances, {testset.shape[0]} test instances")

# drop by index label, without mutating the frames in place
trainset = trainset.drop(trainset[trainset["median_house_value"] >= HOUSE_VALUE_CAP].index)
testset = testset.drop(testset[testset["median_house_value"] >= HOUSE_VALUE_CAP].index)

print(f"After: {trainset.shape[0]} train instances, {testset.shape[0]} test instances")

Before: 16512 train instances, 4128 test instances
After: 15735 train instances, 3940 test instances


## Dummy variables

In [6]:
from sklearn.preprocessing import OneHotEncoder

# categorical columns to transform
cat_cols = ["ocean_proximity"]

# the encoder is fitted on the **trainset** only; the first category is
# dropped and a dense array is produced
one_hot_encoder = OneHotEncoder(drop="first", sparse_output=False)
one_hot_encoder.fit(trainset[cat_cols])

# names of the dummy columns generated by the fitted encoder
new_col_names = one_hot_encoder.get_feature_names_out(cat_cols)

# apply the fitted encoder to the trainset first, then to the testset
for frame in (trainset, testset):
    cat_vals = frame[cat_cols]
    transformed = one_hot_encoder.transform(cat_vals)

    # attach every dummy column to the dataframe
    for i, new_col_name in enumerate(new_col_names):
        frame[new_col_name] = transformed[:, i]

    # the original categorical column is no longer needed
    del frame['ocean_proximity']

## Separate predictors and target

In [7]:
target_col = "median_house_value"

# training set: predictors and target
Xtrain = trainset.drop(columns=target_col)
ytrain = trainset[target_col].copy()

# test set: predictors and target
Xtest = testset.drop(columns=target_col)
ytest = testset[target_col].copy()

# Pipeline

In [8]:
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import r_regression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error


class LogTransformer:
    """Pipeline step that applies ``log(x + 1)`` to selected columns.

    Parameters
    ----------
    cols : iterable of int
        Positional indices of the columns to transform.
    **kwargs
        Accepted for interface compatibility and ignored.
    """

    def __init__(self, cols, **kwargs):
        self.cols = cols

    def fit(self, X, y=None, **kwargs):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None, **kwargs):
        """Return a copy of ``X`` with ``log(x + 1)`` applied to ``self.cols``.

        The input is never modified. Non-floating input (e.g. an integer
        array) is converted to float first, so the logarithms are not
        truncated when they are written into the result.

        Parameters
        ----------
        X : array-like, indexed as ``X[:, col]``
            Data to transform.
        y : ignored

        Returns
        -------
        numpy.ndarray
            Array of the same shape as ``X`` with the selected columns
            log-transformed.
        """
        # work on a copy so the caller's array is left untouched
        X_out = np.array(X, copy=True)
        if not np.issubdtype(X_out.dtype, np.floating):
            X_out = X_out.astype(float)

        for col in self.cols:
            X_out[:, col] = np.log(X_out[:, col] + 1)
        return X_out


class OutlierDetectionWrapper:
    """Resampling pipeline step that removes rows flagged by an IsolationForest.

    It exposes ``fit_resample`` so it can be used as a sampler step in the
    pipeline: the rows are filtered when the pipeline is fitted.

    Parameters
    ----------
    **kwargs
        Passed unchanged to ``IsolationForest``.
    """

    def __init__(self, **kwargs):
        self.clf = IsolationForest(**kwargs)

    def fit_resample(self, X, y):
        """Fit the detector on ``X`` and drop the rows it labels ``-1``.

        Parameters
        ----------
        X : array-like supporting boolean-mask indexing
            Predictors.
        y : array-like supporting boolean-mask indexing
            Targets aligned with the rows of ``X``.

        Returns
        -------
        tuple
            ``(X_kept, y_kept)`` containing only the rows not labelled ``-1``.
        """
        # compute the mask once and reuse it for both X and y
        inlier_mask = self.clf.fit(X).predict(X) != -1
        return X[inlier_mask], y[inlier_mask]

    def set_params(self, **kwargs):
        """Forward the parameters to the wrapped detector and return ``self``.

        Returning ``self`` allows calls to be chained, e.g.
        ``wrapper.set_params(...).fit_resample(X, y)``.
        """
        self.clf.set_params(**kwargs)
        return self


# indices of columns to be log-transformed
col_idx = [Xtrain.columns.get_loc(x) for x in 
           ["total_rooms", "total_bedrooms", "population", "households", "median_income"]]

pipe = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('logtransformer', LogTransformer(col_idx)),
    ('iforest', OutlierDetectionWrapper(n_estimators=100, random_state=7, contamination=0.03)),
    ('fsel', SelectKBest(r_regression, k=10)),
    ('scaler', StandardScaler()),
    ('dt', DecisionTreeRegressor(random_state=7))
])

pipe.fit(Xtrain, ytrain)

yhat = pipe.predict(Xtest)
tree_mse = mean_squared_error(ytest, yhat)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

74388.82096572242

# Tuning hyperparameters

In [9]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [10]:
start = timer()

# Same steps as the baseline pipeline, but the hyperparameters that are tuned
# below (contamination, k and the tree settings) are left at their defaults.
pipe = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('logtransformer', LogTransformer(col_idx)),
    ('iforest', OutlierDetectionWrapper(n_estimators=100, random_state=7)),
    ('fsel', SelectKBest(r_regression)),
    ('scaler', StandardScaler()),
    ('dt', DecisionTreeRegressor(random_state=7))
])

# Search space. Every entry is a (low, high) RANGE that the optimizer samples
# from, not a list of two candidate values, so the bounds are written as
# explicit scikit-optimize dimensions.
hp_grid = {
    'iforest__contamination': Real(0.001, 0.1),
    'fsel__k': Integer(3, 13),
    'dt__max_depth': Integer(2, 25),
    'dt__min_samples_leaf': Integer(1, 50),
    'dt__min_samples_split': Integer(2, 50),
    'dt__max_leaf_nodes': Integer(5000, 16000)
}

# 30 sampled configurations, each evaluated with 10-fold cross-validation.
# The score is the negated RMSE, so higher is better.
opt_grid_search = BayesSearchCV(
     pipe,
     hp_grid,
     n_iter=30,
     random_state=7,
     scoring='neg_root_mean_squared_error',
     return_train_score=True,
     cv=10
)

# Compatibility shim: recent NumPy releases no longer provide the `np.int`
# alias. Restore it only when it is missing -- presumably scikit-optimize
# still refers to it during the fit below.
if not hasattr(np, "int"):
    np.int = int
opt_grid_search.fit(Xtrain, ytrain)

print("Execution time HH:MM:SS:", timedelta(seconds=timer() - start))

Execution time HH:MM:SS: 0:05:46.196719


In [11]:
opt_grid_search.best_estimator_

In [12]:
# the search was scored with 'neg_root_mean_squared_error', so the sign is
# flipped to report the best cross-validated score as a positive RMSE
-opt_grid_search.best_score_

53727.62101145471

In [13]:
# keep the sampled parameters and the mean train/test scores of each iteration
score_cols = ['mean_train_score', 'mean_test_score']
cv_results = pd.DataFrame(opt_grid_search.cv_results_)[['params'] + score_cols]

# scores are negated RMSE values; flip the sign of both columns
for score_col in score_cols:
    cv_results[score_col] = -cv_results[score_col]

# relative gap between train and test error, as a percentage of the train error
train_rmse = cv_results["mean_train_score"]
test_rmse = cv_results["mean_test_score"]
cv_results["diff, %"] = 100*(train_rmse - test_rmse)/train_rmse

# best (lowest test error) configurations first
cv_results.sort_values('mean_test_score')

Unnamed: 0,params,mean_train_score,mean_test_score,"diff, %"
27,"{'dt__max_depth': 25, 'dt__max_leaf_nodes': 8045, 'dt__min_samples_leaf': 20, 'dt__min_samples_split': 42, 'fsel__k': 12, 'iforest__contamination': 0.1}",46996.465598,53727.621011,-14.322684
24,"{'dt__max_depth': 25, 'dt__max_leaf_nodes': 8694, 'dt__min_samples_leaf': 20, 'dt__min_samples_split': 50, 'fsel__k': 12, 'iforest__contamination': 0.09899993847152797}",47525.914595,53761.06449,-13.119474
28,"{'dt__max_depth': 25, 'dt__max_leaf_nodes': 6998, 'dt__min_samples_leaf': 20, 'dt__min_samples_split': 50, 'fsel__k': 12, 'iforest__contamination': 0.1}",47613.942146,53787.950484,-12.966808
25,"{'dt__max_depth': 25, 'dt__max_leaf_nodes': 10112, 'dt__min_samples_leaf': 19, 'dt__min_samples_split': 50, 'fsel__k': 12, 'iforest__contamination': 0.1}",47493.527144,53839.713604,-13.362213
26,"{'dt__max_depth': 25, 'dt__max_leaf_nodes': 7804, 'dt__min_samples_leaf': 21, 'dt__min_samples_split': 49, 'fsel__k': 12, 'iforest__contamination': 0.1}",47640.104614,53932.887239,-13.209003
29,"{'dt__max_depth': 25, 'dt__max_leaf_nodes': 7201, 'dt__min_samples_leaf': 21, 'dt__min_samples_split': 50, 'fsel__k': 12, 'iforest__contamination': 0.1}",47719.566847,53944.53016,-13.044886
23,"{'dt__max_depth': 25, 'dt__max_leaf_nodes': 9420, 'dt__min_samples_leaf': 20, 'dt__min_samples_split': 50, 'fsel__k': 12, 'iforest__contamination': 0.07857685395755462}",47918.012118,54748.212786,-14.253932
20,"{'dt__max_depth': 25, 'dt__max_leaf_nodes': 13136, 'dt__min_samples_leaf': 21, 'dt__min_samples_split': 47, 'fsel__k': 12, 'iforest__contamination': 0.0746241436926501}",48119.292152,54934.466192,-14.16308
22,"{'dt__max_depth': 25, 'dt__max_leaf_nodes': 10901, 'dt__min_samples_leaf': 17, 'dt__min_samples_split': 28, 'fsel__k': 12, 'iforest__contamination': 0.07540277091032764}",46480.605716,55037.214843,-18.408988
15,"{'dt__max_depth': 25, 'dt__max_leaf_nodes': 13410, 'dt__min_samples_leaf': 25, 'dt__min_samples_split': 32, 'fsel__k': 12, 'iforest__contamination': 0.07145685116577218}",48871.369577,55065.466167,-12.674285
