In [1]:

import sys

import numpy as np
from sklearn.datasets import load_boston
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline

# Make the project-local ``src`` directory importable before the local import below.
sys.path.append('../src')

from stacking_transformer import RegressionStackingTransformer

In [2]:
# Demo data: the Boston housing feature matrix and target vector.
# NOTE: scikit-learn discourages the use of this dataset (see the warning printed
# below this cell), which suggests fetch_california_housing as an alternative.
boston = load_boston()
X = boston.data
y = boston.target

# Hold out 20% of the rows as a test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

In [3]:
# First-level (base) regressors: two shallow tree ensembles and a k-NN model.
estimators_L1 = [
    (
        'et',
        ExtraTreesRegressor(n_estimators=100, max_depth=3, n_jobs=-1, random_state=0),
    ),
    (
        'rf',
        RandomForestRegressor(n_estimators=100, max_depth=3, n_jobs=-1, random_state=0),
    ),
    ('knn', KNeighborsRegressor(n_neighbors=10)),
]

# Stacking transformer over the first-level models, configured with
# n_folds=5 and shuffling enabled.
n_folds = 5
stack = RegressionStackingTransformer(
    estimators=estimators_L1,
    n_folds=n_folds,
    shuffle=True,
    random_state=0,
    verbose=1,
)

# Fit on the training split, then derive the stacked features for both splits.
stack.fit(X_train, y_train)
S_train = stack.transform(X_train)
S_test = stack.transform(X_test)

# Second-level estimator: gradient boosting trained on the stacked features.
estimator_L2 = GradientBoostingRegressor(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    random_state=0,
)
estimator_L2 = estimator_L2.fit(S_train, y_train)
y_pred = estimator_L2.predict(S_test)

# Mean absolute error of the final prediction on the test split.
print('Final score: [%.8f]' % mean_absolute_error(y_test, y_pred))

metric:  mean_absolute_error 
 n_estimators:  3 

Estimator: [et: ExtraTreesRegressor]
Mean Scores: [2.87846624]  -  Std Scrores: [0.21312134]

Estimator: [rf: RandomForestRegressor]
Mean Scores: [2.75414795]  -  Std Scrores: [0.29634211]

Estimator: [knn: KNeighborsRegressor]
Mean Scores: [4.57087747]  -  Std Scrores: [0.42235558]

Train set was detected.
Final score: [2.75958853]


In [6]:
def make_base_estimators():
    """Return a fresh list of ``(name, estimator)`` pairs for one stacking level.

    Each call constructs new, unfitted estimator objects, so the two stacking
    levels never share an estimator instance.

    Returns:
        list[tuple[str, estimator]]: ExtraTrees ('et'), RandomForest ('rf')
        and k-nearest-neighbours ('knn') regressors.
    """
    return [
        ('et', ExtraTreesRegressor(
            random_state=0,
            n_jobs=-1,
            n_estimators=100,
            max_depth=3,
        )),
        ('rf', RandomForestRegressor(
            random_state=0,
            n_jobs=-1,
            n_estimators=100,
            max_depth=3,
        )),
        ('knn', KNeighborsRegressor(n_neighbors=10)),
    ]


def make_stack(estimators):
    """Wrap ``estimators`` in a RegressionStackingTransformer with the shared settings.

    ``n_folds`` is deliberately not passed, so the transformer's own default applies.

    Args:
        estimators: list of ``(name, estimator)`` pairs for this level.

    Returns:
        RegressionStackingTransformer: an unfitted stacking transformer.
    """
    return RegressionStackingTransformer(
        estimators=estimators,
        shuffle=True,
        random_state=0,
        verbose=1,
        n_jobs=-1,
    )


# 1st level: base estimators fed with the raw features.
estimators_l1 = make_base_estimators()
stack_l1 = make_stack(estimators_l1)

# 2nd level: the same estimator families, fed with the output of stack_l1
# (it follows stack_l1 in the pipeline below).
estimators_l2 = make_base_estimators()
stack_l2 = make_stack(estimators_l2)

# Two stacking levels followed by a linear regression as the final predictor.
pipeline = Pipeline([
    ('stack_l1', stack_l1),
    ('stack_l2', stack_l2),
    ("final_pred", LinearRegression()),
])

In [7]:
# Fit every pipeline step in order (stack_l1 -> stack_l2 -> final_pred) on the training split.
pipeline.fit(X_train, y_train)

metric:  mean_absolute_error 
 n_estimators:  3 

Estimator: [et: ExtraTreesRegressor]
Mean Scores: [2.86054217]  -  Std Scrores: [0.24351346]

Estimator: [rf: RandomForestRegressor]
Mean Scores: [2.65672577]  -  Std Scrores: [0.33812720]

Estimator: [knn: KNeighborsRegressor]
Mean Scores: [4.53943069]  -  Std Scrores: [0.30589029]

Train set was detected.
metric:  mean_absolute_error 
 n_estimators:  3 

Estimator: [et: ExtraTreesRegressor]
Mean Scores: [2.22556510]  -  Std Scrores: [0.29516346]

Estimator: [rf: RandomForestRegressor]
Mean Scores: [2.29382881]  -  Std Scrores: [0.32341186]

Estimator: [knn: KNeighborsRegressor]
Mean Scores: [2.27326733]  -  Std Scrores: [0.13162535]

Train set was detected.


In [8]:
# Predict the held-out test split with the fitted pipeline.
y_pred_test = pipeline.predict(X_test)

In [9]:
# Baseline: a plain linear regression fitted directly on the raw features,
# used as the "without stacking" reference.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_test_lr = lr.predict(X_test)

# mean_absolute_percentage_error returns a fraction, hence the factor of 100.
mape_stacking = 100 * mean_absolute_percentage_error(y_test, y_pred_test)
mape_baseline = 100 * mean_absolute_percentage_error(y_test, y_pred_test_lr)

print(f"MAPE with stacking: % {mape_stacking:.2f}")
print(f"MAPE without stacking: % {mape_baseline:.2f}")

MAPE with stacking: % 13.54
MAPE without stacking: % 18.36
