In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from utility import *
from botorch.acquisition.analytic import ExpectedImprovement,ProbabilityOfImprovement,UpperConfidenceBound

In [5]:
# Read the four input CSVs in one pass, then keep only the columns each name needs.
X_train, y_train, parameter, score = (
    pd.read_csv(csv_path)
    for csv_path in (
        './input/X_train.csv',
        './input/y_train.csv',
        './input/parameters.csv',
        './input/scores.csv',
    )
)
y_train = y_train.iloc[:, 0]  # target: first column of y_train.csv, as a Series
score = score.iloc[:, 1:]     # every column of scores.csv except the first

In [8]:
def log_rmse(yhat, y):
    """Root-mean-squared error between ``log(1 + y)`` and ``log(1 + yhat)``.

    Parameters
    ----------
    yhat : array-like
        Predicted values.
    y : array-like
        Reference values, same length as ``yhat``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((log1p(y) - log1p(yhat)) ** 2))``.

    Notes
    -----
    Both inputs are converted to plain float arrays first, so values are paired
    by position. Two pandas Series with different index labels would otherwise
    be aligned by label when subtracted and silently yield NaN.
    """
    y = np.asarray(y, dtype=float)
    yhat = np.asarray(yhat, dtype=float)
    # log1p is the numerically accurate form of log(1 + x) for small x.
    return np.sqrt(np.mean((np.log1p(y) - np.log1p(yhat)) ** 2))
def xgboost2perf(learning_rate, max_depth, min_child_weight, gamma, alpha, reg_lambda,
                 subsample, colsample_bytree, objective, tree_method):
    """Cross-validated log-RMSE of an XGBoost regressor for one hyper-parameter set.

    All ten arguments are forwarded unchanged to ``XGBRegressor`` under the same
    keyword names; ``n_estimators`` is fixed at 600.

    The model is evaluated with a 6-fold shuffled ``KFold`` (``random_state=7``)
    over the module-level ``X_train`` / ``y_train``. Predictions on each held-out
    fold are clipped at 0 before being scored with the module-level ``log_rmse``.

    Returns
    -------
    float
        Mean of the six per-fold ``log_rmse`` values.
    """
    model = XGBRegressor(
        learning_rate=learning_rate,
        n_estimators=600,
        # n_jobs is the scikit-learn-API name for the thread count;
        # `nthread` is its deprecated alias.
        n_jobs=-1,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        gamma=gamma,
        alpha=alpha,
        reg_lambda=reg_lambda,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        objective=objective,
        tree_method=tree_method,
    )

    n_fold = 6
    kf = KFold(n_fold, shuffle=True, random_state=7)

    # Collected per fold under a name that does not shadow the notebook-level
    # `score` DataFrame.
    fold_scores = []
    for train_index, test_index in kf.split(X_train):
        model.fit(X_train.iloc[train_index], y_train.iloc[train_index])
        # Clip negative predictions so log(1 + yhat) stays defined.
        yhat = np.maximum(model.predict(X_train.iloc[test_index]), 0)
        fold_scores.append(log_rmse(yhat, y_train.iloc[test_index]))
    return sum(fold_scores) / n_fold

In [9]:
# Categorical hyper-parameters and the integer code assigned to each allowed value.
# 'reg:squaredlogerror' and 'reg:pseudohubererror' are left out because they do not work.
cat_feat = {
    'objective': {
        name: code
        for code, name in enumerate(
            ['reg:squarederror', 'reg:absoluteerror', 'reg:gamma', 'count:poisson']
        )
    },
    'tree_method': {name: code for code, name in enumerate(['exact', 'hist'])},
}

# [lower, upper] range for each numeric hyper-parameter
# (presumably in the positional order of xgboost2perf's numeric arguments -- confirm against Transform).
bounds = [
    [0.1, 0.6],
    [2, 16],
    [1.0, 100.0],
    [0.0, 100.0],
    [0.0, 100.0],
    [0.0, 100.0],
    [0.25, 1.0],
    [0.25, 1.0],
]

IsMax = False  # presumably: the objective is minimised -- confirm against Transform/BO
integer = [1]  # presumably: positions in `bounds` that take integer values -- confirm

In [10]:
# Build the search-space transform from the categorical codes, the integer
# positions, the numeric bounds and the IsMax flag.
# `Transform` is not imported by name; presumably it comes from `from utility import *` -- confirm.
T = Transform(cat_feat,integer,bounds,IsMax)

In [11]:
# Settings passed to BO(...) as keyword arguments.
# NOTE(review): MaternKernel is not imported by name in this notebook; presumably it is
# provided by `from utility import *` -- confirm.
bo_parameters = dict(
    acq_fun=ExpectedImprovement,
    acq_kwargs={},
    # fit=fit_gpytorch_mll,  (left disabled, as in the original cell)
    BaseKernel=MaternKernel,
    q=1,
    num_restarts=4,
    raw_samples=128,
    Bo_iter=200,
    verbose=10,
)

In [12]:
# Silence every warning category from here on (the filter stays installed for
# all later cells), then run the optimisation starting from the parameter and
# score tables read from ./input.
import warnings
warnings.simplefilter('ignore')  # installs the same filter entry as filterwarnings('ignore')
x_best, y_best, x, y, model = BO(
    xgboost2perf,
    parameter,
    score,
    T,
    **bo_parameters
)

best val is 0.1347155871452955 at iter 10
best val is 0.1347155871452955 at iter 20
best val is 0.1347155871452955 at iter 30
best val is 0.13127062769829775 at iter 40
best val is 0.12964176954962917 at iter 50
best val is 0.12689068175022358 at iter 60
best val is 0.12689068175022358 at iter 70
best val is 0.12689068175022358 at iter 80
best val is 0.12689068175022358 at iter 90
best val is 0.12689068175022358 at iter 100
best val is 0.12689068175022358 at iter 110
best val is 0.12689068175022358 at iter 120
best val is 0.12689068175022358 at iter 130
best val is 0.12437457562667197 at iter 140
best val is 0.12437457562667197 at iter 150
best val is 0.12437457562667197 at iter 160
best val is 0.12437457562667197 at iter 170
best val is 0.12437457562667197 at iter 180
best val is 0.12437457562667197 at iter 190
best val is 0.12437457562667197 at iter 200


In [13]:
# Show every evaluated configuration next to its score, lowest score first.
# `axis` must be passed by keyword: the positional form was deprecated in
# pandas 1.3 and raises TypeError from pandas 2.0 on.
pd.concat([x, y], axis=1).sort_values(by=['scores'])

Unnamed: 0,learning_rate,max_depth,min_child_weight,gamma,alpha,reg_lambda,subsample,colsample_bytree,objective,tree_method,scores
0,0.147024,4,70.555683,81.583557,97.686505,1.764679,0.985076,0.644647,count:poisson,hist,0.124375
0,0.176137,6,99.022384,54.493330,6.872106,62.703177,0.616812,0.316088,count:poisson,hist,0.126875
0,0.100881,2,76.159859,10.421766,95.815843,19.983223,0.801994,0.957193,count:poisson,hist,0.126891
0,0.297921,3,15.107730,40.495094,14.743448,41.523046,0.964602,0.425500,count:poisson,exact,0.127843
0,0.170058,5,40.726489,67.438718,87.992783,98.854587,0.603641,0.381499,count:poisson,hist,0.129025
...,...,...,...,...,...,...,...,...,...,...,...
0,0.587678,5,2.754033,97.491414,85.517532,38.658192,0.301558,0.686392,reg:absoluteerror,hist,0.401440
2,0.403225,3,38.475863,72.333209,41.717891,17.955389,0.535492,0.305701,reg:gamma,exact,0.402244
0,0.508482,7,39.145826,82.361468,30.970719,9.063373,0.863166,0.387076,reg:gamma,exact,0.404701
4,0.450057,5,7.628559,35.718665,33.163884,30.461418,0.627340,0.649008,reg:gamma,hist,0.404789


In [None]:
# Run the optimisation again, this time passing the `x` and `y` returned by the
# previous BO call in place of `parameter` and `score`.
# NOTE(review): `x` and `y` are both inputs and outputs of this cell, so its
# result presumably depends on how many times the cell has been run -- confirm
# and consider distinct output names.
x_best,y_best,x,y,model = BO(xgboost2perf,x,y,T,**bo_parameters)

best val is 0.12437457562667197 at iter 10
best val is 0.12437457562667197 at iter 20
best val is 0.12437457562667197 at iter 30
best val is 0.12437457562667197 at iter 40
best val is 0.12437457562667197 at iter 50
best val is 0.12437457562667197 at iter 60
best val is 0.12437457562667197 at iter 70
best val is 0.12437457562667197 at iter 80
best val is 0.12437457562667197 at iter 90
best val is 0.12437457562667197 at iter 100
best val is 0.12437457562667197 at iter 110
best val is 0.12437457562667197 at iter 120
best val is 0.12437457562667197 at iter 130


In [None]:
# Show every evaluated configuration next to its score, lowest score first.
# `axis` must be passed by keyword: the positional form was deprecated in
# pandas 1.3 and raises TypeError from pandas 2.0 on.
pd.concat([x, y], axis=1).sort_values(by=['scores'])

In [None]:
# Persist the evaluated configurations and their scores; to_csv's default
# settings are kept, so the index is written as the first column of each file.
x.to_csv('bo_x.csv')
y.to_csv('bo_y.csv')