In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

data = load_boston()
data.keys()

X, y = data["data"], data["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

pipe = Pipeline([('std_scl',StandardScaler())])
X_train = pipe.fit_transform(X_train)
X_test = pipe.transform(X_test)

X_train = pd.DataFrame(X_train,columns=data["feature_names"])
X_test = pd.DataFrame(X_test,columns=data["feature_names"])
y_train = pd.DataFrame(y_train,columns=["target"])
y_test = pd.DataFrame(y_test,columns=["target"])


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

In [2]:
#alternative: use AdaBoost and put that into the base estimators or the final estimator
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import AdaBoostRegressor

#format is identical to the stacking classifiers (jupyter notebook 15)
#structure: estimator name, estimator
stack_reg = StackingRegressor(estimators=[('dt_depth_5',DecisionTreeRegressor(max_depth=5)),
                                             ('ridge',Ridge()),
                                             ('rbf_svm',SVR())
                                            ],
                                 final_estimator=Lasso(),
                                 cv=5,
                                 n_jobs=2,
                                 passthrough=False,
                                 verbose=0
                                )
stack_reg.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [3]:
print(stack_reg.estimators_)
print(stack_reg.named_estimators_.keys())
print(stack_reg.final_estimator_)

[DecisionTreeRegressor(max_depth=5), Ridge(), SVR()]
dict_keys(['dt_depth_5', 'ridge', 'rbf_svm'])
Lasso()


In [4]:
#looping over estimators to see how they scored: R² scores
for est_name in stack_reg.named_estimators_:
    print(f'{est_name} score:',stack_reg.named_estimators_[est_name].score(X_test, y_test))
print('stacked ensemble score:',stack_reg.score(X_test, y_test))

#result: nice performance boost

dt_depth_5 score: 0.6284153964871914
ridge score: 0.5881400471345533
rbf_svm score: 0.4957469419124396
stacked ensemble score: 0.6455029975251989


--------------

#### ***Play around, input different models, tune the hyperparameters and see how it all performs.***