In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split


In [18]:
from sklearn.metrics import r2_score

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('BostonHousing.csv')

In [4]:
df

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [5]:
df.columns

Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'b', 'lstat', 'medv'],
      dtype='object')


CRIM - per capita crime rate by town

ZN - proportion of residential land zoned for lots over 25,000 sq.ft.

INDUS - proportion of non-retail business acres per town.

CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)

NOX - nitric oxides concentration (parts per 10 million)

RM - average number of rooms per dwelling

AGE - proportion of owner-occupied units built prior to 1940

DIS - weighted distances to five Boston employment centres

RAD - index of accessibility to radial highways

TAX - full-value property-tax rate per $10,000

PTRATIO - pupil-teacher ratio by town

B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town

LSTAT - % lower status of the population

MEDV - Median value of owner-occupied homes in $1000's

In [6]:
x = df[['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'b', 'lstat']]

In [7]:
x

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48


In [8]:
y = df['medv']

In [9]:
y

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Name: medv, Length: 506, dtype: float64

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  b        506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [11]:
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [24]:
gr = GradientBoostingRegressor(n_estimators=200,learning_rate=0.1,max_depth=2)

In [25]:
model = gr.fit(X_train,y_train)

In [26]:
y_pred = model.predict(X_test)

In [27]:
y_pred

array([11.57401495, 36.33644295, 13.18777499, 21.81863185, 17.19980138,
       44.02594028, 22.88870887, 19.78464051, 23.15081088, 43.4467896 ,
       17.71338574, 15.4855052 , 27.40445262, 39.73496777, 29.75999303,
       33.51134895, 20.64343805, 15.82132378, 19.99612461, 17.67126137,
       21.46787542, 30.46435384, 14.82961435, 17.37904284, 20.70003509,
       20.5893396 , 22.15669503, 24.26290892, 30.15319406, 28.83492292,
       14.22014069, 11.14691413, 21.16639675, 16.29855011, 32.87331304,
       35.90588282, 17.24974672, 27.80280976, 10.36872305, 24.81963992,
       20.00514235, 24.74081838, 24.44858922, 13.30184364, 13.79817214,
       23.56787311, 50.67404509, 21.23435938, 18.01298633,  9.91827581,
       23.62278955,  7.04069517, 16.07796131, 20.57821862, 33.11743558,
       16.17893199, 11.21728347, 31.22525511, 20.11153774, 12.7871847 ,
       29.18043285, 18.64108295, 24.69260661, 40.23664371,  9.25178994,
       42.25455924, 14.71573225, 15.61052156, 21.53769806, 20.93

In [28]:
print(f'R2 Score is {r2_score(y_test,y_pred)}')
# 

R2 Score is 0.8970643622836659


In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
LR = {'learning_rate':[0.15,0.1,0.10,0.05],'n_estimators':[100,150,200,250]}

tuning = GridSearchCV(estimator=GradientBoostingRegressor(),param_grid=LR,scoring='r2')

In [22]:
tuning.fit(X_train,y_train)

In [23]:
tuning.best_params_,tuning.best_score_

({'learning_rate': 0.1, 'n_estimators': 200}, 0.8819227836815495)