In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
concrete = pd.read_csv('datasets/concrete_data.csv')
concrete.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
X = concrete.drop('csMPa', axis= 1)
Y = concrete['csMPa']

In [4]:
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size= 0.2)

In [5]:
baseline = GradientBoostingRegressor(max_depth= 3, n_estimators= 50)
baseline.fit(x_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=50,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [6]:
y_pred = baseline.predict(x_test)
r2_score(y_test, y_pred)

0.8604967629906238

In [7]:
imp_fe = pd.Series(baseline.feature_importances_,
                   index = X.columns).sort_values(ascending= False)
imp_fe

age                 0.379397
cement              0.302443
water               0.102340
superplasticizer    0.086679
slag                0.069936
fineaggregate       0.041215
flyash              0.012838
coarseaggregate     0.005153
dtype: float64

### grid search

In [8]:
gb_reg = GradientBoostingRegressor(max_depth= 3)

In [9]:
params = {'n_estimators': [1,3,5,10,20,50,100,200,300,400,500,600]}
gridsearch = GridSearchCV(estimator= gb_reg, 
                          param_grid= params,
                          cv= 3)
gridsearch.fit(x_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_no_change=None,
                 

In [10]:
gridsearch.best_params_

{'n_estimators': 500}

In [11]:
gb_reg = GradientBoostingRegressor(max_depth= 3,
                                   n_estimators = gridsearch.best_params_['n_estimators'])
gb_reg.fit(x_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=500,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [12]:
y_pred = gb_reg.predict(x_test)
r2_score(y_test, y_pred)

0.9296249263138602

### warm start and using early stopping to find the opt no. of trees

In [13]:
gb_reg = GradientBoostingRegressor(max_depth= 3, warm_start= True)

In [17]:
min_error = float('inf')
error_increasing = 0

for n_estimators in range(1, 1000):
    gb_reg.n_estimators = n_estimators
    gb_reg.fit(x_train, y_train)
    
    y_pred = gb_reg.predict(x_test)
    val_error = mean_squared_error(y_test, y_pred)
    
    print('no.of estimators: ', gb_reg.n_estimators_)
    print('validation error: ',val_error)
    
    if val_error < min_error:
        min_val_error = val_error
        error_increasing = 0
        
    else:
        error_increasing +=1
        if error_increasing == 10:
            break
        

no.of estimators:  2
validation error:  249.37734003408346
no.of estimators:  2
validation error:  222.4337374091739
no.of estimators:  3
validation error:  197.62336684086762
no.of estimators:  4
validation error:  178.25073751932084
no.of estimators:  5
validation error:  161.16435044228888
no.of estimators:  6
validation error:  148.79964291920666
no.of estimators:  7
validation error:  138.30688045170547
no.of estimators:  8
validation error:  126.92995422113488
no.of estimators:  9
validation error:  119.45239346319752
no.of estimators:  10
validation error:  110.59086326471265
no.of estimators:  11
validation error:  103.22476616549764
no.of estimators:  12
validation error:  96.4780680546753
no.of estimators:  13
validation error:  91.96940563311549
no.of estimators:  14
validation error:  86.7554020484613
no.of estimators:  15
validation error:  82.75947507422002
no.of estimators:  16
validation error:  79.32455238202735
no.of estimators:  17
validation error:  75.9660143121102

no.of estimators:  156
validation error:  24.198058433442586
no.of estimators:  157
validation error:  24.23937042100582
no.of estimators:  158
validation error:  24.287963058083484
no.of estimators:  159
validation error:  24.223307506898443
no.of estimators:  160
validation error:  24.151530692523796
no.of estimators:  161
validation error:  24.07965203115571
no.of estimators:  162
validation error:  24.037304452610158
no.of estimators:  163
validation error:  24.060180205189518
no.of estimators:  164
validation error:  23.953237606709212
no.of estimators:  165
validation error:  23.957783589001725
no.of estimators:  166
validation error:  24.030556385823672
no.of estimators:  167
validation error:  23.981450495209106
no.of estimators:  168
validation error:  23.937275896703945
no.of estimators:  169
validation error:  23.848223114904922
no.of estimators:  170
validation error:  23.843963321913495
no.of estimators:  171
validation error:  23.782930949824532
no.of estimators:  172
val

no.of estimators:  302
validation error:  21.101007635110832
no.of estimators:  303
validation error:  21.082254869403627
no.of estimators:  304
validation error:  21.075211472407847
no.of estimators:  305
validation error:  21.096015433854195
no.of estimators:  306
validation error:  21.073125224849726
no.of estimators:  307
validation error:  21.07857983650231
no.of estimators:  308
validation error:  21.07308943815177
no.of estimators:  309
validation error:  21.076901386716564
no.of estimators:  310
validation error:  21.042289148787372
no.of estimators:  311
validation error:  21.043466574592202
no.of estimators:  312
validation error:  21.041268257669113
no.of estimators:  313
validation error:  21.028691740725083
no.of estimators:  314
validation error:  21.0634703226174
no.of estimators:  315
validation error:  21.047252457712545
no.of estimators:  316
validation error:  21.05054428211739
no.of estimators:  317
validation error:  21.00689938253092
no.of estimators:  318
validat

no.of estimators:  444
validation error:  20.34388345268988
no.of estimators:  445
validation error:  20.337503257465137
no.of estimators:  446
validation error:  20.303822479359404
no.of estimators:  447
validation error:  20.29214239394614
no.of estimators:  448
validation error:  20.354311459426373
no.of estimators:  449
validation error:  20.349343972984204
no.of estimators:  450
validation error:  20.3300401873068
no.of estimators:  451
validation error:  20.304818541060538
no.of estimators:  452
validation error:  20.30436309217612
no.of estimators:  453
validation error:  20.300149783787013
no.of estimators:  454
validation error:  20.294397167052797
no.of estimators:  455
validation error:  20.300451675858497
no.of estimators:  456
validation error:  20.29614777043796
no.of estimators:  457
validation error:  20.253654222698735
no.of estimators:  458
validation error:  20.245366910700085
no.of estimators:  459
validation error:  20.24558668417333
no.of estimators:  460
validati

no.of estimators:  588
validation error:  19.678159155413518
no.of estimators:  589
validation error:  19.669208862036953
no.of estimators:  590
validation error:  19.668243915204386
no.of estimators:  591
validation error:  19.6759965840997
no.of estimators:  592
validation error:  19.670617517143523
no.of estimators:  593
validation error:  19.65029619690104
no.of estimators:  594
validation error:  19.641055995802922
no.of estimators:  595
validation error:  19.643587115796798
no.of estimators:  596
validation error:  19.657485892749083
no.of estimators:  597
validation error:  19.635177841886293
no.of estimators:  598
validation error:  19.683214492581374
no.of estimators:  599
validation error:  19.679864294377865
no.of estimators:  600
validation error:  19.679913140005034
no.of estimators:  601
validation error:  19.669652244368606
no.of estimators:  602
validation error:  19.671047743381266
no.of estimators:  603
validation error:  19.6672582722924
no.of estimators:  604
valida

no.of estimators:  729
validation error:  19.46531703674761
no.of estimators:  730
validation error:  19.4649441677158
no.of estimators:  731
validation error:  19.46271279031608
no.of estimators:  732
validation error:  19.45388386834009
no.of estimators:  733
validation error:  19.45157891272297
no.of estimators:  734
validation error:  19.450500844435506
no.of estimators:  735
validation error:  19.436904055251144
no.of estimators:  736
validation error:  19.435084646421537
no.of estimators:  737
validation error:  19.43417651234952
no.of estimators:  738
validation error:  19.433984275765518
no.of estimators:  739
validation error:  19.432417447989554
no.of estimators:  740
validation error:  19.43127091416161
no.of estimators:  741
validation error:  19.452686556106183
no.of estimators:  742
validation error:  19.455913993923634
no.of estimators:  743
validation error:  19.45331199762044
no.of estimators:  744
validation error:  19.439974190359692
no.of estimators:  745
validation

no.of estimators:  867
validation error:  19.386380741868876
no.of estimators:  868
validation error:  19.384336612872037
no.of estimators:  869
validation error:  19.38403559506894
no.of estimators:  870
validation error:  19.406134653678784
no.of estimators:  871
validation error:  19.408991412436343
no.of estimators:  872
validation error:  19.402236634982515
no.of estimators:  873
validation error:  19.393995833954826
no.of estimators:  874
validation error:  19.395140836794386
no.of estimators:  875
validation error:  19.390222243518913
no.of estimators:  876
validation error:  19.38923926471469
no.of estimators:  877
validation error:  19.385547198883
no.of estimators:  878
validation error:  19.398518171891787
no.of estimators:  879
validation error:  19.399074654679488
no.of estimators:  880
validation error:  19.405765576278906
no.of estimators:  881
validation error:  19.40922917421375
no.of estimators:  882
validation error:  19.409278240433814
no.of estimators:  883
validat

In [18]:
n_estimators

999

In [19]:
gb_reg_best = GradientBoostingRegressor(max_depth= 3,
                                   n_estimators = n_estimators)
gb_reg_best.fit(x_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=999,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [20]:
y_pred = gb_reg_best.predict(x_test)
r2_score(y_test, y_pred)

0.9309264294095213