Make sure imblearn is installed:

In [1]:
!pip install imblearn



Note: if you are using scikit-learn 1.3, it needs to be downgraded for the code below to work:

!pip uninstall scikit-learn --yes

!pip uninstall imblearn --yes

!pip install scikit-learn==1.2.2

!pip install imblearn

In [2]:
# setting logging to print only error messages from Sklearnex
import logging
logging.basicConfig()
logging.getLogger("SKLEARNEX").setLevel(logging.ERROR)

import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split

sns.set_theme(palette="Set2")

# execution time
from timeit import default_timer as timer
from datetime import timedelta

# increase column width
pd.set_option('display.max_colwidth', 200)

# silence warnings
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Load the data

In [3]:
import os
import tarfile
import urllib


# the file is archived with tar and gunzip
URL = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz"

def get_dataframe(url=None, data_dir="datasets"):
    """Return the housing data as a DataFrame, fetching it when needed.

    The data is read from ``<data_dir>/housing.csv``. If that file is missing,
    ``<data_dir>/housing.tgz`` is unpacked to produce it; if the archive is
    missing as well, it is downloaded first. Files already on disk are reused,
    so nothing is downloaded once the CSV exists.

    Parameters
    ----------
    url : str, optional
        Location of the ``housing.tgz`` archive. Defaults to the module-level
        ``URL`` constant.
    data_dir : str, optional
        Directory that holds the archive and the extracted CSV.

    Returns
    -------
    pandas.DataFrame
        Contents of ``housing.csv``.
    """
    archive_path = os.path.join(data_dir, "housing.tgz")
    csv_path = os.path.join(data_dir, "housing.csv")

    if not os.path.exists(csv_path):
        os.makedirs(data_dir, exist_ok=True)

        # if the archived file does not exist, download it
        if not os.path.exists(archive_path):
            # a bare `import urllib` does not guarantee that the `request`
            # submodule is loaded, so import it explicitly
            import urllib.request
            urllib.request.urlretrieve(URL if url is None else url, archive_path)

        # unpack the archive; the context manager closes it even on failure
        with tarfile.open(archive_path) as archive:
            if hasattr(tarfile, "data_filter"):
                # refuse unsafe members (absolute paths, "..", device files)
                # on Python versions that support extraction filters
                archive.extractall(path=data_dir, filter="data")
            else:
                archive.extractall(path=data_dir)

    # load the dataframe
    return pd.read_csv(csv_path)

# load the housing data (downloading and unpacking the files if they are not on disk yet)
df = get_dataframe()

# Train-test split

In [4]:
# bucket the median income into five strata so that the split keeps the
# income distribution the same in both subsets
income_bins = [0., 1.5, 3.0, 4.5, 6., np.inf]
df["income_cat"] = pd.cut(df["median_income"], bins=income_bins, labels=[1, 2, 3, 4, 5])

trainset, testset = train_test_split(
    df,
    test_size=0.2,
    random_state=7,
    stratify=df["income_cat"],
)

# the helper column was only needed for stratification
for subset in (trainset, testset):
    del subset["income_cat"]

# Repeat initial preprocessing steps

## Feature engineering

In [5]:
# rows whose "median_house_value" is at or above this value are treated as capped
HOUSE_VALUE_CAP = 500001

# apply exactly the same steps to the train and the test set
for subset in (trainset, testset):
    # create two new features
    subset["rooms_per_household"] = subset["total_rooms"] / subset["households"]
    subset["population_per_household"] = subset["population"] / subset["households"]

    # rename "ISLAND" to "NEAR OCEAN"
    subset["ocean_proximity"] = subset["ocean_proximity"].replace("ISLAND", "NEAR OCEAN")

# delete capped values

print(f"Before: {trainset.shape[0]} train instances, {testset.shape[0]} test instances")

for subset in (trainset, testset):
    capped_rows = subset[subset["median_house_value"] >= HOUSE_VALUE_CAP].index
    subset.drop(capped_rows, inplace=True)

print(f"After: {trainset.shape[0]} train instances, {testset.shape[0]} test instances")

Before: 16512 train instances, 4128 test instances
After: 15735 train instances, 3940 test instances


## Dummy variables

In [6]:
from sklearn.preprocessing import OneHotEncoder

# `sparse_output` replaces the `sparse` argument, which was deprecated in
# scikit-learn 1.2 and removed in 1.4; it is available from 1.2 onwards
one_hot_encoder = OneHotEncoder(drop="first", sparse_output=False)

# categorical columns to transform
cat_cols = ["ocean_proximity"]

# fit the encoder on the **trainset** only, so the testset is encoded with
# the categories learned from the training data
one_hot_encoder.fit(trainset[cat_cols])

# one new column per category that is kept, named "<column>_<category>"
new_col_names = one_hot_encoder.get_feature_names_out(cat_cols)

# encode the trainset first, then the testset, with the same fitted encoder
for subset in (trainset, testset):
    cat_vals = subset[cat_cols]
    transformed = one_hot_encoder.transform(cat_vals)

    # put the transformed data as columns in the dataframe
    for i, new_col_name in enumerate(new_col_names):
        subset[new_col_name] = transformed[:, i]

    # delete the categorical column
    del subset['ocean_proximity']



## Separate predictors and target

In [7]:
# the column to predict; every other column is used as a predictor
target_col = "median_house_value"

Xtrain, ytrain = trainset.drop(columns=target_col), trainset[target_col].copy()
Xtest, ytest = testset.drop(columns=target_col), testset[target_col].copy()

# Pipeline

In [8]:
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import r_regression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error


class LogTransformer:
    """Pipeline step that applies ``log(x + 1)`` to selected columns.

    Parameters
    ----------
    cols : iterable of int
        Positional indices of the columns to transform.
    **kwargs
        Accepted and ignored.
    """

    def __init__(self, cols, **kwargs):
        self.cols = cols

    def fit(self, X, y=None, **kwargs):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X, y=None, **kwargs):
        """Return a float copy of ``X`` with ``log(x + 1)`` applied to ``self.cols``.

        ``X`` must be 2-D array-like. The input is never modified: writing the
        result back into the caller's array would silently change their data
        and would truncate the logarithms if that array had an integer dtype.
        """
        X = np.array(X, dtype=float)  # np.array copies by default
        for col in self.cols:
            X[:, col] = np.log(X[:, col] + 1)
        return X


class OutlierDetectionWrapper:
    """Pipeline step that drops the rows an IsolationForest flags as outliers.

    The class exposes ``fit_resample`` so that it can be used as a resampling
    step in ``imblearn.pipeline.Pipeline``.

    Parameters
    ----------
    **kwargs
        Forwarded unchanged to ``IsolationForest``.
    """

    def __init__(self, **kwargs):
        self.clf = IsolationForest(**kwargs)

    def fit_resample(self, X, y):
        """Fit the forest on ``X`` and return ``X`` and ``y`` without the outliers.

        Both ``X`` and ``y`` must support boolean-mask indexing.
        """
        # rows predicted as -1 are the outliers; build the mask only once
        keep = self.clf.fit(X).predict(X) != -1
        return X[keep], y[keep]

    def set_params(self, **kwargs):
        """Forward the parameters to the wrapped IsolationForest.

        Returns ``self`` so that calls can be chained, as with scikit-learn
        estimators.
        """
        self.clf.set_params(**kwargs)
        return self

# Tuning hyperparameters

In [9]:
from sklearn.model_selection import GridSearchCV

In [10]:
def run_pipeline(hp_grid, X=None, y=None, cv=10):
    """Grid-search the preprocessing + decision-tree pipeline and report the train/CV gap.

    The pipeline imputes missing values, log-transforms the skewed count and
    income columns, removes outliers, selects features, scales them and fits a
    decision tree. Every hyperparameter not listed in ``hp_grid`` is held at
    its baseline value.

    Parameters
    ----------
    hp_grid : dict
        Maps pipeline parameter names (e.g. ``'dt__max_depth'``) to the list
        of values to try. Entries override the baseline of the same name.
    X : pandas.DataFrame, optional
        Predictors; must contain the columns that are log-transformed.
        Defaults to the notebook-level ``Xtrain``.
    y : array-like, optional
        Target values. Defaults to the notebook-level ``ytrain``.
    cv : int, optional
        Passed to ``GridSearchCV`` as ``cv``. Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination with the columns ``"params"`` and
        ``"diff, %"``, sorted by ``"diff, %"`` in ascending order.
        ``"diff, %"`` is ``100 * (train RMSE - validation RMSE) / train RMSE``,
        so a negative value means the validation RMSE is larger than the
        training RMSE.
    """
    if X is None:
        X = Xtrain
    if y is None:
        y = ytrain

    # indices of columns to be log-transformed
    log_cols = ["total_rooms", "total_bedrooms", "population", "households", "median_income"]
    col_idx = [X.columns.get_loc(name) for name in log_cols]

    # NOTE(review): r_regression returns the signed Pearson correlation and
    # SelectKBest keeps the highest scores, so strongly negatively correlated
    # features rank last. Consider f_regression; the score function is left
    # unchanged here so that the recorded results stay reproducible.
    # The `contamination` and `k` values given here are always overridden by
    # the grid below.
    pipe = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('logtransformer', LogTransformer(col_idx)),
        ('iforest', OutlierDetectionWrapper(n_estimators=100, random_state=7, contamination=0.03)),
        ('fsel', SelectKBest(r_regression, k=10)),
        ('scaler', StandardScaler()),
        ('dt', DecisionTreeRegressor(random_state=7))
    ])

    # baseline: a single value per hyperparameter
    param_grid = {
        'dt__max_depth': [16],
        'dt__max_leaf_nodes': [9807],
        'dt__min_samples_leaf': [16],
        'dt__min_samples_split': [50],
        'fsel__k': [13],
        'iforest__contamination': [0.1]
    }
    # the caller's grid replaces the baseline entries it names
    param_grid.update(hp_grid)

    grid_search = GridSearchCV(pipe, param_grid, cv=cv,
                               scoring='neg_root_mean_squared_error',
                               return_train_score=True, verbose=0)
    grid_search.fit(X, y)

    scores = pd.DataFrame(grid_search.cv_results_)
    # the scorer reports negated RMSE; flip the sign to get the RMSE itself
    train_rmse = -scores["mean_train_score"]
    valid_rmse = -scores["mean_test_score"]

    report = pd.DataFrame({
        "params": scores["params"],
        "diff, %": 100 * (train_rmse - valid_rmse) / train_rmse,
    })
    return report.sort_values('diff, %')

## Effect of outlier removal

In [11]:
# vary only the IsolationForest contamination rate; every other
# hyperparameter keeps the baseline value set inside run_pipeline
hp_grid = {'iforest__contamination': [0.001, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2]}

start = timer()
result = run_pipeline(hp_grid)
elapsed = timedelta(seconds=timer() - start)

print("Execution time HH:MM:SS:", elapsed)

result

Execution time HH:MM:SS: 0:01:36.856055


Unnamed: 0,params,"diff, %"
0,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.001}",-18.853342
1,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.025}",-16.501983
2,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.05}",-15.551095
3,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.075}",-14.578627
4,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.1}",-13.808643
5,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.125}",-13.489788
6,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.15}",-13.206109
7,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.175}",-12.280228
8,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.2}",-11.348466


## Effect of feature selection

In [12]:
# vary only the number of features kept by SelectKBest; every other
# hyperparameter keeps the baseline value set inside run_pipeline
hp_grid = {'fsel__k': [3, 5, 7, 9, 11, 13]}

start = timer()
result = run_pipeline(hp_grid)
elapsed = timedelta(seconds=timer() - start)

print("Execution time HH:MM:SS:", elapsed)

result

Execution time HH:MM:SS: 0:01:03.714857


Unnamed: 0,params,"diff, %"
3,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 9, 'iforest__contamination': 0.1}",-14.513725
4,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 11, 'iforest__contamination': 0.1}",-14.146509
5,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.1}",-13.808643
2,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 7, 'iforest__contamination': 0.1}",-13.334744
0,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 3, 'iforest__contamination': 0.1}",-12.957378
1,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 5, 'iforest__contamination': 0.1}",-12.491134


## Effect of max_depth

In [13]:
# vary only the maximum depth of the tree; every other hyperparameter
# keeps the baseline value set inside run_pipeline
hp_grid = {'dt__max_depth': [2, 10, 15, 20, 25]}

start = timer()
result = run_pipeline(hp_grid)
elapsed = timedelta(seconds=timer() - start)

print("Execution time HH:MM:SS:", elapsed)

result

Execution time HH:MM:SS: 0:00:53.106482


Unnamed: 0,params,"diff, %"
3,"{'dt__max_depth': 20, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.1}",-13.827947
4,"{'dt__max_depth': 25, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.1}",-13.827947
2,"{'dt__max_depth': 15, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.1}",-13.774774
1,"{'dt__max_depth': 10, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.1}",-10.515221
0,"{'dt__max_depth': 2, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.1}",-0.387578


## Effect of min_samples_leaf

In [14]:
# vary only the minimum number of samples per leaf; every other
# hyperparameter keeps the baseline value set inside run_pipeline
hp_grid = {'dt__min_samples_leaf': [1, 10, 20, 30, 40, 50]}

start = timer()
result = run_pipeline(hp_grid)
elapsed = timedelta(seconds=timer() - start)

print("Execution time HH:MM:SS:", elapsed)

result

Execution time HH:MM:SS: 0:01:05.437495


Unnamed: 0,params,"diff, %"
0,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.1}",-21.059726
1,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 10, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.1}",-15.532081
2,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 20, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.1}",-13.140952
3,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 30, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.1}",-10.324975
4,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 40, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.1}",-8.019621
5,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 50, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.1}",-6.662314


## Effect of min_samples_split

In [15]:
# vary only the minimum number of samples needed to split a node; every
# other hyperparameter keeps the baseline value set inside run_pipeline
hp_grid = {'dt__min_samples_split': [2, 10, 20, 30, 40, 50]}

start = timer()
result = run_pipeline(hp_grid)
elapsed = timedelta(seconds=timer() - start)

print("Execution time HH:MM:SS:", elapsed)

result

Execution time HH:MM:SS: 0:01:05.319134


Unnamed: 0,params,"diff, %"
0,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 2, 'fsel__k': 13, 'iforest__contamination': 0.1}",-18.013621
1,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 10, 'fsel__k': 13, 'iforest__contamination': 0.1}",-18.013621
2,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 20, 'fsel__k': 13, 'iforest__contamination': 0.1}",-18.013621
3,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 30, 'fsel__k': 13, 'iforest__contamination': 0.1}",-18.013621
4,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 40, 'fsel__k': 13, 'iforest__contamination': 0.1}",-16.47147
5,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 9807, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.1}",-13.808643


## Effect of max_leaf_nodes

In [16]:
# vary only the cap on the number of leaves (None = no cap); every other
# hyperparameter keeps the baseline value set inside run_pipeline
hp_grid = {'dt__max_leaf_nodes': [2500, 5000, 7500, 10000, None]}

start = timer()
result = run_pipeline(hp_grid)
elapsed = timedelta(seconds=timer() - start)

print("Execution time HH:MM:SS:", elapsed)

result

Execution time HH:MM:SS: 0:00:43.972412


Unnamed: 0,params,"diff, %"
4,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': None, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.1}",-13.81098
0,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 2500, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.1}",-13.808643
1,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 5000, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.1}",-13.808643
2,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 7500, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.1}",-13.808643
3,"{'dt__max_depth': 16, 'dt__max_leaf_nodes': 10000, 'dt__min_samples_leaf': 16, 'dt__min_samples_split': 50, 'fsel__k': 13, 'iforest__contamination': 0.1}",-13.808643
