In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

In [55]:
name = 'gbm'

In [3]:
submission = pd.read_csv('../Data/sample_submission.csv')

In [4]:
train_X = pd.read_csv('../Data/train_X_xgb.csv',index_col=0)
train_y = pd.read_csv('../Data/train_y_xgb.csv',index_col=0)
val_X = pd.read_csv('../Data/val_X_xgb.csv',index_col=0)
val_y = pd.read_csv('../Data/val_y_xgb.csv',index_col=0)
test_X = pd.read_csv('../Data/test_X_xgb.csv',index_col=0)

In [5]:
dtrain = xgb.DMatrix(train_X.values, label=train_y.values)
dval = xgb.DMatrix(val_X.values, label=val_y.values)
dtest = xgb.DMatrix(test_X.values)

In [6]:
# set margin
mean = train_y.values.mean()
dtrain.set_base_margin(np.ones(train_X.shape[0])*mean)
dval.set_base_margin(np.ones(val_X.shape[0])*mean)
dtest.set_base_margin(np.ones(test_X.shape[0])*mean)

Build model

In [10]:
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [63]:
def MAE_obj(preds, dtrain):
    labels = dtrain.get_label()
    grad = np.where(preds>labels,1.0,-1.0)
    hess = np.ones_like(grad)
    return grad, hess

def huber_approx_obj(preds, dtrain):
    d = preds - dtrain.get_label() #remove .get_labels() for sklearn
    h = 1  #h is delta in the graphic
    scale = 1 + (d / h) ** 2
    scale_sqrt = np.sqrt(scale)
    grad = d / scale_sqrt
    hess = 1 / scale / scale_sqrt
    return grad, hess

def fair_obj(preds, dtrain):
    """y = c * abs(x) - c**2 * np.log(abs(x)/c + 1)"""
    x = preds - dtrain.get_label()
    c = 1
    den = abs(x) + c
    grad = c*x / den
    hess = c*c / den ** 2
    return grad, hess

def log_cosh_obj(preds, dtrain):
    x = preds - dtrain.get_label()
    grad = np.tanh(x)
    hess = 1 / np.cosh(x)**2
    return grad, hess

In [64]:
iterations = 40

In [66]:
for i in range(iterations):
    param = {}
    results = {}

    param['gamma '] = np.random.rand()*10
    param['max_depth'] = np.random.randint(2,24)
    param['nthread'] = 4
    param['colsample_bylevel'] = 0.2 + np.random.rand()*0.6
    param['subsample'] = 0.2 + np.random.rand()*0.6
    param['lambda'] = np.random.rand()*4
    param['alpha'] = np.random.rand()*4
    param['grow_policy'] = np.random.choice(['depthwise','lossguide'])
    param['max_bin'] = np.random.choice([64,128,256,512,1024])
    obj = np.random.choice([MAE_obj,huber_approx_obj,fair_obj,log_cosh_obj])


    param['eta'] = 0.01
    param['eval_metric'] = 'mae'
    param['tree_method'] = 'hist'

    num_round = 1000

    GBM = xgb.train(param, dtrain, num_round, watchlist, 
                    obj=obj,verbose_eval=False,early_stopping_rounds=50,evals_result=results)

    min_loss = min(results['val']['mae'])
    if min_loss<2.35:
        GBM.save_model('../Model/'+name+str(min_loss)+'.model')
        submission.iloc[:,1] = GBM.predict(dtest,ntree_limit=GBM.best_ntree_limit)
        submission.to_csv('../Submission/gbms/'+name+str(min_loss)+'.csv',index=False)

    print(param,min_loss)

{'gamma ': 5.5560202627264825, 'max_depth': 21, 'nthread': 4, 'colsample_bylevel': 0.6747826472482675, 'subsample': 0.5193875584748104, 'lambda': 2.1257484892274148, 'alpha': 3.9321497386428255, 'grow_policy': 'lossguide', 'max_bin': 256, 'eta': 0.01, 'eval_metric': 'mae', 'tree_method': 'hist'} 2.365874
{'gamma ': 6.155418148873245, 'max_depth': 4, 'nthread': 4, 'colsample_bylevel': 0.547149204233072, 'subsample': 0.7597495081298145, 'lambda': 1.7366083739664502, 'alpha': 1.6097087078439256, 'grow_policy': 'depthwise', 'max_bin': 512, 'eta': 0.01, 'eval_metric': 'mae', 'tree_method': 'hist'} 2.424783
{'gamma ': 9.25832754428316, 'max_depth': 17, 'nthread': 4, 'colsample_bylevel': 0.3931182428028154, 'subsample': 0.5534521992475786, 'lambda': 3.883152451293924, 'alpha': 2.6223158628666923, 'grow_policy': 'lossguide', 'max_bin': 64, 'eta': 0.01, 'eval_metric': 'mae', 'tree_method': 'hist'} 2.348078
{'gamma ': 6.655617066678704, 'max_depth': 22, 'nthread': 4, 'colsample_bylevel': 0.57034

{'gamma ': 6.962103480365068, 'max_depth': 17, 'nthread': 4, 'colsample_bylevel': 0.45329249325626025, 'subsample': 0.265344952070706, 'lambda': 0.44514092255012816, 'alpha': 2.295797374450342, 'grow_policy': 'lossguide', 'max_bin': 128, 'eta': 0.01, 'eval_metric': 'mae', 'tree_method': 'hist'} 2.348826
{'gamma ': 6.023101313117167, 'max_depth': 16, 'nthread': 4, 'colsample_bylevel': 0.5140620009836314, 'subsample': 0.7668724957333979, 'lambda': 2.724784871540132, 'alpha': 3.4963603579315925, 'grow_policy': 'lossguide', 'max_bin': 512, 'eta': 0.01, 'eval_metric': 'mae', 'tree_method': 'hist'} 2.413561
{'gamma ': 5.4620332453149425, 'max_depth': 19, 'nthread': 4, 'colsample_bylevel': 0.26546742087005093, 'subsample': 0.33410637525284603, 'lambda': 2.2262224054074475, 'alpha': 1.8573249358718562, 'grow_policy': 'lossguide', 'max_bin': 512, 'eta': 0.01, 'eval_metric': 'mae', 'tree_method': 'hist'} 2.355312
{'gamma ': 8.618318890120966, 'max_depth': 10, 'nthread': 4, 'colsample_bylevel': 0