In [15]:
%load_ext lab_black


import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.utils import shuffle
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LassoCV

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [5]:
def load_ames_housing(n_samples=600, random_state=0):
    """Fetch the OpenML ``house_prices`` dataset and return a shuffled subset.

    Only a fixed list of 20 feature columns is kept. Rows are shuffled
    reproducibly, truncated to the first ``n_samples`` rows, and the target
    is returned on a natural-log scale.

    Parameters
    ----------
    n_samples : int or None, default=600
        Number of rows kept after shuffling. ``None`` keeps every row.
    random_state : int, default=0
        Seed forwarded to ``sklearn.utils.shuffle``.

    Returns
    -------
    X : the selected feature columns for the retained rows.
    y : ``np.log`` of the target for the same rows.

    Raises
    ------
    ValueError
        If ``n_samples`` is negative.
    """
    if n_samples is not None and n_samples < 0:
        raise ValueError(f"n_samples must be non-negative, got {n_samples!r}")

    df = fetch_openml(name="house_prices", as_frame=True)

    features = [
        "YrSold",
        "HeatingQC",
        "Street",
        "YearRemodAdd",
        "Heating",
        "MasVnrType",
        "BsmtUnfSF",
        "Foundation",
        "MasVnrArea",
        "MSSubClass",
        "ExterQual",
        "Condition2",
        "GarageCars",
        "GarageType",
        "OverallQual",
        "TotalBsmtSF",
        "BsmtFinSF1",
        "HouseStyle",
        "MiscFeature",
        "MoSold",
    ]

    # Shuffle features and target together so the rows stay aligned.
    X, y = shuffle(df.data[features], df.target, random_state=random_state)

    # Explicit positional slicing: keep the first n_samples shuffled rows
    # regardless of the index labels (a None stop keeps everything).
    X = X.iloc[:n_samples]
    y = y.iloc[:n_samples]
    return X, np.log(y)


X, y = load_ames_housing()

In [7]:
# Column selectors keyed on dtype: one for numeric columns, one for
# object-dtype columns. Both are reused by the preprocessors defined later.
num_selector = make_column_selector(dtype_include=np.number)
cat_selector = make_column_selector(dtype_include=object)

# Display the column names each selector picks from X
# (categorical list first, numeric list second).
(cat_selector(X), num_selector(X))

(['HeatingQC',
  'Street',
  'Heating',
  'MasVnrType',
  'Foundation',
  'ExterQual',
  'Condition2',
  'GarageType',
  'HouseStyle',
  'MiscFeature'],
 ['YrSold',
  'YearRemodAdd',
  'BsmtUnfSF',
  'MasVnrArea',
  'MSSubClass',
  'GarageCars',
  'OverallQual',
  'TotalBsmtSF',
  'BsmtFinSF1',
  'MoSold'])

In [12]:
# Preprocessing shared by the tree-based models.

# Numeric columns: mean imputation, with add_indicator=True so the imputer
# also emits missing-value indicator columns.
num_tree_processor = SimpleImputer(add_indicator=True, strategy="mean")

# Categorical columns: ordinal (integer) codes; categories not seen during
# fit are encoded as -1 instead of raising.
cat_tree_processor = OrdinalEncoder(
    unknown_value=-1,
    handle_unknown="use_encoded_value",
)

# The numeric transformer is listed before the categorical one.
tree_processor = make_column_transformer(
    (num_tree_processor, num_selector),
    (cat_tree_processor, cat_selector),
)
tree_processor

ColumnTransformer(transformers=[('simpleimputer',
                                 SimpleImputer(add_indicator=True),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fc6807a9490>),
                                ('ordinalencoder',
                                 OrdinalEncoder(handle_unknown='use_encoded_value',
                                                unknown_value=-1),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fc6807a9810>)])

In [14]:
# Preprocessing for the linear model.

# Categorical columns: one-hot encoding; handle_unknown="ignore" means
# categories unseen during fit do not raise at transform time.
cat_linear_processor = OneHotEncoder(handle_unknown="ignore")

# Numeric columns: standardize first, then mean-impute with
# missing-value indicators (scaler runs before the imputer).
num_linear_processor = make_pipeline(
    StandardScaler(),
    SimpleImputer(strategy="mean", add_indicator=True),
)

# The categorical transformer is listed before the numeric one.
linear_processor = make_column_transformer(
    (cat_linear_processor, cat_selector),
    (num_linear_processor, num_selector),
)
linear_processor

ColumnTransformer(transformers=[('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fc6807a9810>),
                                ('pipeline',
                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler()),
                                                 ('simpleimputer',
                                                  SimpleImputer(add_indicator=True))]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fc6807a9490>)])

In [16]:
# Linear base learner: the linear preprocessing followed by a
# cross-validated Lasso with default settings.
lasso_pipeline = make_pipeline(
    linear_processor,
    LassoCV(),
)
lasso_pipeline

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fc6807a9810>),
                                                 ('pipeline',
                                                  Pipeline(steps=[('standardscaler',
                                                                   StandardScaler()),
                                                                  ('simpleimputer',
                                                                   SimpleImputer(add_indicator=True))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fc6807a9490>)])),
                ('lassocv', LassoCV())])

In [18]:
from sklearn.ensemble import RandomForestRegressor

# Random-forest base learner on top of the tree preprocessing.
# random_state is fixed so repeated runs build the same forest.
rf_regressor = RandomForestRegressor(random_state=42)
rf_pipeline = make_pipeline(tree_processor, rf_regressor)
rf_pipeline

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('simpleimputer',
                                                  SimpleImputer(add_indicator=True),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fc6807a9490>),
                                                 ('ordinalencoder',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=-1),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fc6807a9810>)])),
                ('randomforestregressor',
                 RandomForestRegressor(random_state=42))])

In [19]:
# The experimental opt-in import must precede the estimator import.
# NOTE(review): presumably required by the installed scikit-learn version;
# confirm against that version before removing it.
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

# Gradient-boosting base learner; reuses the same tree preprocessing as the
# random forest and is seeded for reproducibility.
gbdt_regressor = HistGradientBoostingRegressor(random_state=0)
gbdt_pipeline = make_pipeline(tree_processor, gbdt_regressor)
gbdt_pipeline

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('simpleimputer',
                                                  SimpleImputer(add_indicator=True),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fc6807a9490>),
                                                 ('ordinalencoder',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=-1),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fc6807a9810>)])),
                ('histgradientboostingregressor',
                 HistGradientBoostingRegressor(random_state=0))])

In [22]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import cross_validate, cross_val_predict

# Base learners, each a full preprocessing + model pipeline.
base_estimators = [
    ("random forest", rf_pipeline),
    ("lasso", lasso_pipeline),
    ("gradient boosting", gbdt_pipeline),
]

# Stack the three pipelines, with RidgeCV as the final estimator.
stacking_regressor = StackingRegressor(
    estimators=base_estimators,
    final_estimator=RidgeCV(),
)

# Cross-validate the stacked model on R^2 and (negated) mean absolute error,
# using all available cores. The cv strategy is left at its default.
cross_validate(
    stacking_regressor,
    X,
    y,
    scoring=["r2", "neg_mean_absolute_error"],
    verbose=0,
    n_jobs=-1,
)

{'fit_time': array([2.89679909, 3.04144692, 3.03006387, 2.91020298, 2.88251114]),
 'score_time': array([0.02648282, 0.02798891, 0.02634287, 0.02596903, 0.0258379 ]),
 'test_r2': array([0.82586682, 0.78413678, 0.64197976, 0.83980335, 0.79267176]),
 'test_neg_mean_absolute_error': array([-0.11329703, -0.1165098 , -0.13046406, -0.11350389, -0.13363819])}

In [None]:
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate, cross_val_predict