<a href="https://colab.research.google.com/github/tarupathak30/machine_learning_algorithms/blob/main/Bagging/Bagging_Regressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
from sklearn import datasets

# Fetch the California housing dataset through scikit-learn's dataset helpers.
california_housing = datasets.fetch_california_housing()

# Feature matrix and regression target, each bound to its own name for later cells.
x_california_housing = california_housing.data
y_california_housing = california_housing.target

# Quick report of what was loaded; sep="" leaves each label's own trailing
# space as the only separator between label and value.
print("Dataset Features Names ", california_housing.feature_names, sep="")
print("Dataset Features Size ", x_california_housing.shape, sep="")
print("Dataset Target Size ", y_california_housing.shape, sep="")

Dataset Features Names ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
Dataset Features Size (20640, 8)
Dataset Target Size (20640,)


In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np

In [13]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_california_housing,
    y_california_housing,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Show the shape of every split, in the order x_train, x_test, y_train, y_test.
print(
    'Train/Test Sets Sizes : ',
    *(split.shape for split in (x_train, x_test, y_train, y_test)),
)

Train/Test Sets Sizes :  (16512, 8) (4128, 8) (16512,) (4128,)


In [15]:
# Baseline regressors whose test R^2 is compared in the cells that follow.
lr = LinearRegression()
# NOTE(review): KNN is distance-based and the features are passed in unscaled;
# consider standardising them before reading much into its score.
knn = KNeighborsRegressor()
# Seeded so the fitted tree (and the R^2 reported for it) is reproducible after a
# kernel restart; without random_state, ties between equally good splits may be
# broken differently from run to run.
dtree = DecisionTreeRegressor(random_state=42)

In [16]:
# Train every baseline on the same training split.
for base_model in (lr, knn, dtree):
    base_model.fit(x_train, y_train)

In [17]:
# Held-out predictions for each baseline, unpacked in the order lr, knn, dtree.
y_pred_lr, y_pred_knn, y_pred_dtree = (
    fitted.predict(x_test) for fitted in (lr, knn, dtree)
)

In [18]:
# Print the test-set R^2 of each baseline, one line per model.
for model_label, model_predictions in (
    ("R^2 Score for LR ", y_pred_lr),
    ("R^2 Score for DT ", y_pred_dtree),
    ("R^2 Score for KNN", y_pred_knn),
):
    print(model_label, r2_score(y_test, model_predictions))

R^2 Score for LR  0.5757877060324524
R^2 Score for DT  0.6254878274010965
R^2 Score for KNN 0.14631049965900345


In [20]:
from sklearn.ensemble import BaggingRegressor

In [21]:
# Bagging ensemble left at the library defaults apart from the seed, which is
# fixed so repeated runs build the same ensemble.
bag_regressor = BaggingRegressor(
    random_state=1,
)

In [22]:
# Fit the ensemble on the training split.
bag_regressor.fit(X=x_train, y=y_train)

In [23]:
# Ensemble predictions for the held-out rows.
y_pred_bagging_regressor = bag_regressor.predict(X=x_test)

In [33]:
# R^2 on the data the ensemble was fitted on versus the held-out split; a large
# gap between the two is what the note after this cell comments on.
r2_train, r2_test = (
    bag_regressor.score(features, targets)
    for features, targets in ((x_train, y_train), (x_test, y_test))
)
print(f"Training Coefficient of R^2 :  {r2_train:.3f}")
print(f"Test Coefficient of R^2 : {r2_test:.3f}")

Training Coefficient of R^2 :  0.962
Test Coefficient of R^2 : 0.795


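The training R² (0.962) is considerably higher than the test R² (0.795), which suggests the bagging regressor is overfitting the training data.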

In [34]:
# Row and column counts of the full feature matrix (rows first, then columns).
n_samples, n_features = california_housing.data.shape

In [38]:
# Hyper-parameter grid for the bagging ensemble. Each key is a BaggingRegressor
# constructor argument; each list holds the candidate values to try.
params = dict(
    # None keeps the ensemble's default base estimator.
    estimator=[None, LinearRegression(), KNeighborsRegressor()],
    n_estimators=[20, 50, 100],
    # Floats are fractions of the training rows / columns given to each member.
    max_samples=[0.5, 1.0],
    max_features=[0.5, 1.0],
    # Whether rows / columns are drawn with replacement.
    bootstrap=[True, False],
    bootstrap_features=[True, False],
)

In [39]:
# Exhaustive search over `params` with 5-fold cross-validation; the base
# BaggingRegressor is seeded so every candidate is evaluated reproducibly.
bagging_regressor_grid = GridSearchCV(
    estimator=BaggingRegressor(random_state=1),
    param_grid=params,
    cv=5,
)