In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor #using DecisionTreeRegressor (independent of feature scales)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the Boston housing data and pull out the feature matrix and the target vector.
# NOTE(review): load_boston is deprecated (it emits the warning shown in this cell's
# output) and is reportedly removed in scikit-learn >= 1.2 -- confirm the pinned version.
data = load_boston()
X = data["data"]
y = data["target"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Feature scaling: the scaler is fitted on the training split only (fit_transform)
# and then re-applied, unchanged, to the test split (transform).
# No separate validation split is made in this notebook (an exception for this case).
pipe = Pipeline(steps=[("std_scl", StandardScaler())])
X_train = pipe.fit_transform(X_train)
X_test = pipe.transform(X_test)

# Wrap the numpy arrays in DataFrames: features get the dataset's column names,
# targets get a single "target" column.
X_train, X_test = (
    pd.DataFrame(split, columns=data["feature_names"]) for split in (X_train, X_test)
)
y_train, y_test = (
    pd.DataFrame(split, columns=["target"]) for split in (y_train, y_test)
)


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

In [2]:
from sklearn.ensemble import BaggingRegressor

# Bagging ensemble of shallow regression trees: the ensemble prediction aggregates
# the predictions of the individual trees. Same parameters as the BaggingClassifier
# from the previous notebook, with the regressor counterparts swapped in.
#
# The base estimator is passed positionally on purpose: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (the old keyword was removed in
# 1.4), whereas the first positional argument works with both old and new releases.
bag_reg = BaggingRegressor(DecisionTreeRegressor(max_depth=3,
                                                 max_features='sqrt',
                                                 splitter='random'),  # random instead of best (best = slower)
                           n_estimators=30,  # 30 different decision tree regressors
                           max_samples=0.8,  # int (absolute number of samples) or float (fraction): size of the training subset
                                             # given to each model; here 80% of the original training set size
                           max_features=0.9, # int (absolute number of features) or float (fraction): size of the feature
                                             # subset given to each model (defines the random subspaces); here 90%
                           bootstrap=True,   # True = bagging (samples drawn with replacement), False = pasting
                           bootstrap_features=False, # False = features are sampled without replacement
                           oob_score=False,  # out-of-bag scoring disabled; enabling it slightly increases training time
                                             # because the out-of-bag predictions must also be made and evaluated
                           warm_start=False,
                           n_jobs=2,
                           random_state=0,
                           verbose=0
                           )

# y_train is a one-column DataFrame; pass a flat 1-D array so scikit-learn does not
# have to convert the column vector itself (which raises a DataConversionWarning).
bag_reg.fit(X_train, y_train.values.ravel())

  return column_or_1d(y, warn=True)


In [4]:
# The fitted ensemble keeps one entry per base estimator in each of these attributes:
#   estimators_          -> the trained estimators
#   estimators_samples_  -> the sample subset used to train each estimator
#   estimators_features_ -> the feature subset used to train each estimator
print(
    *(
        len(getattr(bag_reg, fitted_attr))
        for fitted_attr in ("estimators_", "estimators_samples_", "estimators_features_")
    ),
    sep="\n",
)

# With oob_score=True the fitted regressor would additionally expose the
# out-of-bag results: bag_reg.oob_score_ and bag_reg.oob_prediction_.

30
30
30


In [6]:
# Score every base estimator on the test set. Each estimator only gets the feature
# columns it was trained on, hence the column selection per estimator. The targets
# are already numerical, so no type conversion is needed.
scores = [
    est.score(X_test.values[:, features], y_test)
    for est, features in zip(bag_reg.estimators_, bag_reg.estimators_features_)
]

In [7]:
# Summarise the per-estimator test scores (R²) by their mean and standard deviation.
per_estimator_scores = np.asarray(scores)
print('Avg. estimator performance:', per_estimator_scores.mean())
print('Estimaor performance std. dev.:', per_estimator_scores.std())

Avg. estimator performance: 0.1847313130270765
Estimaor performance std. dev.: 0.19848630969677158


In [8]:
# Score of the whole bagging ensemble on the test set, for comparison with the
# average score of its individual estimators computed above.
bag_reg.score(X_test, y_test)

0.39328148432267007

#### Thanks to bagging we got a performance boost. It runs extremely fast after restarting the notebook (and it's a small dataset!). Don't forget grid search and hyperparameter tuning, which we skipped here; they would help to get even better performance.
But of course, in this case, with the bagging regressors and classifiers, we have the extra options of doing bagging, pasting, random subspaces, or random patches, and of tuning and playing with all of those parameters to accept an even higher bias in exchange for lower variance in each model. And then, by combining the models into an ensemble, we reduce the bias again.