In [2]:
import pandas as pd
import numpy as np

In [6]:
from sklearn import datasets

# Load the diabetes dataset through sklearn.datasets and keep the feature
# matrix and the target vector under their own names.
diabetes = datasets.load_diabetes()
X_diabetes = diabetes.data
Y_diabetes = diabetes.target

# Quick summary of what was loaded; sep='' keeps each label and value glued
# together exactly as a manual string concatenation would.
print('Dataset features names : ', diabetes.feature_names, sep='')
print('Dataset features size : ', diabetes.data.shape, sep='')
print('Dataset target size : ', diabetes.target.shape, sep='')

Dataset features names : ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
Dataset features size : (442, 10)
Dataset target size : (442,)


In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

In [8]:
from sklearn.model_selection import train_test_split

In [10]:
# 80/20 hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_diabetes,
    Y_diabetes,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

# Report the shape of every piece of the split, in the order they were returned.
print(
    "Train/Test Sets Sizes: ",
    *(part.shape for part in (X_train, X_test, y_train, y_test)),
)

Train/Test Sets Sizes:  (353, 10) (89, 10) (353,) (89,)


In [11]:
# Instantiate the three baseline regressors (fitting happens in the next cell).
lr = LinearRegression()
# Pin the tree's seed: without random_state the fitted tree (and therefore its
# R^2 on the test split) can differ from one run to the next, which makes the
# baseline comparison non-reproducible. 42 matches the seed used for the split.
dt = DecisionTreeRegressor(random_state=42)
knn = KNeighborsRegressor()

In [12]:
# Fit every baseline regressor on the training split only.
lr.fit(X=X_train, y=y_train)
dt.fit(X=X_train, y=y_train)
knn.fit(X=X_train, y=y_train)

In [13]:
# Held-out predictions, in the order LR, DT, KNN.
ypred1, ypred2, ypred3 = (
    fitted.predict(X_test) for fitted in (lr, dt, knn)
)

In [14]:
# Pair each caption with the predictions it describes, then score them all
# against the same held-out targets.
baseline_predictions = (
    ('R^2 score for LR: ', ypred1),
    ('R^2 score for DT: ', ypred2),
    ('R^2 score for KNN: ', ypred3),
)
for caption, predicted in baseline_predictions:
    print(caption, r2_score(y_test, predicted))

R^2 score for LR:  0.4526027629719195
R^2 score for DT:  0.1522165452289984
R^2 score for KNN:  0.43016439526042805


# Bagging Regressor

In [15]:
from sklearn.ensemble import BaggingRegressor

# Bagging ensemble with every setting left at its default except the seed,
# which is fixed so the fitted ensemble is repeatable.
bag_reg = BaggingRegressor(random_state=1)
bag_reg.fit(X=X_train, y=y_train)

In [20]:
# Predictions of the default bagging ensemble on the held-out split.
y_preds = bag_reg.predict(X_test)

# Score on the data the ensemble was fitted on, then on the held-out data,
# to expose the gap between the two.
for template, features, targets in (
    ('Training Coefficient of R^2 : %.3f', X_train, y_train),
    ('Test Coefficient of R^2 : %.3f', X_test, y_test),
):
    print(template % bag_reg.score(features, targets))

Training Coefficient of R^2 : 0.895
Test Coefficient of R^2 : 0.386


In [22]:
# Dimensions of the full diabetes feature matrix (rows = samples, columns = features).
# The original read them from `boston`, a name this notebook never defines, so
# the cell raised NameError on a fresh kernel; `diabetes` is the dataset loaded above.
n_samples, n_features = diabetes.data.shape
print(n_samples)
print(n_features)

442
10


In [24]:
%%time

params = {
    'estimator':[None,LinearRegression(),KNeighborsRegressor()],
    'n_estimators':[20,50,100],
    'max_samples':[0.5,1.0],
    'max_features':[0.5,1.0],
    'bootstrap':[True,False],
    'bootstrap_features':[True,False]
}

bagging_regressor_grid = GridSearchCV(BaggingRegressor(random_state=1, n_jobs=-1), param_grid =params, cv=3, n_jobs=-1, verbose=1)
bagging_regressor_grid.fit(X_train, y_train)

print('Train R^2 Score : %.3f'%bagging_regressor_grid.best_estimator_.score(X_train, y_train))
print('Test R^2 Score : %.3f'%bagging_regressor_grid.best_estimator_.score(X_test, y_test))
print('Best R^2 Score Through Grid Search : %.3f'%bagging_regressor_grid.best_score_)
print('Best Parameters : ',bagging_regressor_grid.best_params_)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
Train R^2 Score : 0.527
Test R^2 Score : 0.454
Best R^2 Score Through Grid Search : 0.490
Best Parameters :  {'bootstrap': False, 'bootstrap_features': False, 'estimator': LinearRegression(), 'max_features': 1.0, 'max_samples': 0.5, 'n_estimators': 20}
CPU times: total: 938 ms
Wall time: 1min 20s
