In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
df=pd.read_csv('cpp_cleaned_data')

In [3]:
kf=KFold(n_splits=10)


In [4]:
x=df.drop(['price'],axis=1)
y=df['price']

# dummy model

In [53]:
sco_tr=[]
sco_ts=[]
for train_index,test_index in kf.split(x):
    x_train,x_test,y_train,y_test=x.loc[train_index],x.loc[test_index],y[train_index],y[test_index]
    gb=GradientBoostingRegressor(n_estimators= 10,max_depth=2,learning_rate=1.0)
    gb.fit(x_train,y_train)
    sco_tr.append(gb.score(x_train,y_train))
    sco_ts.append(gb.score(x_test,y_test))
    
 

In [54]:
print(f'{sco_tr} and\nmean score: {np.array(sco_tr).mean()}')
print(f'{sco_ts} and\nmean score: {np.array(sco_ts).mean()}')

[0.9748918537693887, 0.9647422700703433, 0.9598756417553803, 0.9590906987910252, 0.9657555939930068, 0.9714985451035408, 0.9625669509011432, 0.9720217089356359, 0.9703805278843298, 0.9662273555096795] and
mean score: 0.9667051146713475
[0.5718936687224438, 0.5573586035739695, 0.8267904717243209, 0.7343586644137439, 0.5738341973792699, 0.1260492711047012, 0.8217063696246951, -0.18016284750208644, 0.1441075123476686, 0.6851990169647462] and
mean score: 0.4861134928353471


In [55]:
gb.predict([x_test.iloc[10,:]])

array([16044.07341013])

In [56]:
y_test.iloc[10]

13415.0

# using Hyperparameter tuning

In [38]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 1000, num = 100)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(2, 40, num = 20)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Create the random grid
parm_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'learning_rate':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,2.0],
               'loss' : ['ls', 'lad', 'huber', 'quantile']}

cv=KFold(n_splits=10,shuffle=True,random_state=12)
rfr=GradientBoostingRegressor()

In [39]:
clf=RandomizedSearchCV(rfr,parm_grid,scoring='accuracy',cv=cv,n_jobs=-1,verbose=3,n_iter=100)

In [40]:
clf.fit(x_train,y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan]


RandomizedSearchCV(cv=KFold(n_splits=10, random_state=12, shuffle=True),
                   estimator=GradientBoostingRegressor(), n_iter=100, n_jobs=-1,
                   param_distributions={'learning_rate': [0.1, 0.2, 0.3, 0.4,
                                                          0.5, 0.6, 0.7, 0.8,
                                                          0.9, 1.0, 2.0],
                                        'loss': ['ls', 'lad', 'huber',
                                                 'quantile'],
                                        'max_depth': [2, 4, 6, 8, 10, 12, 14,
                                                      16, 18, 20, 22, 24, 26,
                                                      28, 30, 32, 34, 36, 38,
                                                      40, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                       

In [41]:
clf.best_params_

{'n_estimators': 580,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 2,
 'loss': 'huber',
 'learning_rate': 0.3}

In [None]:
# using best params get from hyperparameter tuning

In [57]:
sco_tr=[]
sco_ts=[]
for train_index,test_index in kf.split(x):
    x_train,x_test,y_train,y_test=x.loc[train_index],x.loc[test_index],y[train_index],y[test_index]
    gb1=GradientBoostingRegressor(n_estimators= 580,min_samples_split=2,min_samples_leaf=1,max_features='auto',max_depth=2,loss='huber',learning_rate=0.3)
    gb1.fit(x_train,y_train)
    sco_tr.append(gb1.score(x_train,y_train))
    sco_ts.append(gb1.score(x_test,y_test))

In [58]:
print(f'{sco_tr} and\nmean score: {np.array(sco_tr).mean()}')
print(f'{sco_ts} and\nmean score: {np.array(sco_ts).mean()}')

[0.9983333294593434, 0.9978853404932531, 0.99839002089837, 0.9973395715119681, 0.9981994944776252, 0.9983377088234394, 0.9969604478524535, 0.9985953477590317, 0.9984464251562927, 0.9984839647153614] and
mean score: 0.9980971651147138
[0.7249013462141745, 0.20708650432480513, 0.8525623616996281, 0.8677592939390324, -0.7796753229113871, -0.037211117255899095, 0.7673202634261378, -0.8486590482612741, -0.3834645686243461, 0.3706468259154597] and
mean score: 0.17412665384663312


In [59]:
gb1.predict([x_test.iloc[10,:]]),y_test.iloc[10]

(array([15002.4345855]), 13415.0)