In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV
from sklearn.metrics import accuracy_score,r2_score,mean_absolute_error,mean_squared_error
from sklearn.ensemble import BaggingClassifier,BaggingRegressor
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

In [3]:
from sklearn.datasets import fetch_california_housing
# Load the California housing dataset bundle; its `.data`, `.feature_names`
# and `.target` attributes are consumed by the following cells.
housing = fetch_california_housing()

In [4]:
# Wrap the raw feature matrix in a DataFrame labelled with the dataset's feature names.
df = pd.DataFrame(
    data=housing.data,
    columns=housing.feature_names,
)

In [5]:
# Preview the first rows of the feature table.
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [6]:
# Attach the regression target as a `Target` column so features and label live in one frame.
df['Target'] = housing.target

In [7]:
# Show two rows to confirm the `Target` column was appended.
df.head(2)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585


In [8]:
# (rows, columns) of the combined features + target frame.
df.shape

(20640, 9)

In [9]:
# Count missing values per column.
df.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
Target        0
dtype: int64

In [10]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [11]:
# Select features and target by column name rather than by position, so the
# split stays correct if columns are ever added to or reordered in `df`.
feature_columns = list(housing.feature_names)
x = df[feature_columns]
y = df['Target']

In [12]:
# Display the feature matrix (the rendered output is truncated by pandas).
x

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [13]:
# Display the target series.
y

0        4.526
1        3.585
2        3.521
3        3.413
4        3.422
         ...  
20635    0.781
20636    0.771
20637    0.923
20638    0.847
20639    0.894
Name: Target, Length: 20640, dtype: float64

In [14]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Report train/test shapes, first for the features and then for the target.
for train_part, test_part in ((x_train, x_test), (y_train, y_test)):
    print(train_part.shape, test_part.shape)

(16512, 8) (4128, 8)
(16512,) (4128,)


In [16]:
# Baseline model: plain linear regression with default settings.
lr = LinearRegression()


In [17]:
# Fit the baseline on the training split only.
lr.fit(x_train,y_train)

In [18]:
# Baseline predictions on the held-out test features.
y1_pred1 = lr.predict(x_test)

In [19]:
# R^2 of the linear-regression baseline on the test split.
r2_score(y_true=y_test, y_pred=y1_pred1)

0.5757877060324514

In [20]:
# Mean squared error of the linear-regression baseline on the test split.
mean_squared_error(y_true=y_test, y_pred=y1_pred1)

0.5558915986952435

In [21]:
# Bagging ensemble with default settings. Bagging draws random resamples, so
# seed it; otherwise the scores reported below change on every run.
bag = BaggingRegressor(random_state=42)


In [22]:
# Single decision tree used as the comparison point for bagging. Seeded so that
# tie-breaking between equally good splits is repeatable across runs.
dt = DecisionTreeRegressor(random_state=42)

In [23]:
# Train the bagging ensemble on the training split.
bag.fit(x_train,y_train)


In [24]:
# Train the single decision tree on the same training split.
dt.fit(x_train,y_train)

In [29]:
# `score` on a regressor returns the coefficient of determination (R^2), not a
# classification accuracy, so the labels say "R2 score". Comparing train vs.
# test values shows how much each model overfits.
for model_name, model in (('Bagging', bag), ('Decision Tree', dt)):
    print(f'{model_name} R2 score on training data set :', model.score(x_train, y_train))
    print(f'{model_name} R2 score on test data set :', model.score(x_test, y_test))

Bagging accuracy on training data set : 0.9610030043833443
Bagging accuracy on test data set : 0.788672003307522
Decision Tree accuracy on training data set : 1.0
Decision Tree accuracy on test data set : 0.6258906579153797


In [25]:
# Bagging predictions on the held-out test features.
y1_pred2 = bag.predict(x_test)

In [26]:
# Sanity check: one prediction per test row.
y1_pred2.shape

(4128,)

In [27]:
# Decision-tree predictions on the held-out test features.
y1_pred3 = dt.predict(x_test)

In [28]:
# Test-set R^2 for each model, computed from the stored predictions.
for label, predictions in (
    ('R2 score of Bagging :', y1_pred2),
    ('R2 score of Decision Tree :', y1_pred3),
):
    print(label, r2_score(y_test, predictions))

R2 score of Bagging : 0.788672003307522
R2 score of Decision Tree : 0.6258906579153797


In [37]:
from sklearn.neighbors import KNeighborsRegressor

In [38]:
# Hyper-parameter grid for the bagging ensemble (3 * 3 * 2 * 2 * 2 * 2 = 144 combinations).
# The base-learner argument was renamed from `base_estimator` to `estimator` in
# newer scikit-learn releases and the old name was later removed, so use
# whichever name the installed version accepts instead of hard-coding one.
base_learner_key = (
    'estimator'
    if 'estimator' in BaggingRegressor().get_params()
    else 'base_estimator'
)
params = {
    # None lets BaggingRegressor fall back to its default base learner.
    base_learner_key: [None, LinearRegression(), KNeighborsRegressor()],
    'n_estimators': [20, 50, 100],
    'max_samples': [0.5, 1.0],
    'max_features': [0.5, 1.0],
    'bootstrap': [True, False],
    'bootstrap_features': [True, False],
}

In [42]:
# Exhaustive search over every combination in `params`; the bagging estimator
# is seeded and parallelised via n_jobs=-1.
bag_grid_search = GridSearchCV(
    estimator=BaggingRegressor(n_jobs=-1, random_state=1),
    param_grid=params,
)

In [43]:
# Run the grid search on the training split only; the test split stays untouched.
bag_grid_search.fit(x_train,y_train)



In [44]:
# Predict on the held-out test features with the fitted search object.
y1_pred4 = bag_grid_search.predict(x_test)

In [45]:
# Best cross-validation score found by the search.
bag_grid_search.best_score_

0.8077581518144885

In [46]:
# The winning estimator configuration selected by the search.
bag_grid_search.best_estimator_

In [48]:
# Test-set R^2 of the tuned bagging model.
r2_score(y_true=y_test, y_pred=y1_pred4)

0.8092032500062782