All code is from the following Kaggle notebook: https://www.kaggle.com/code/cocoyachi/safedriver-xgboost-musthave-ch08

In [1]:
%load_ext watermark

In [2]:
%watermark

Last updated: 2024-01-17T14:26:12.245463-05:00

Python implementation: CPython
Python version       : 3.11.6
IPython version      : 8.18.1

Compiler    : GCC 13.2.0
OS          : Linux
Release     : 6.5.0-14-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 16
Architecture: 64bit



In [3]:
%watermark --gpu

GPU Info: 
  GPU 0: Quadro RTX 5000



In [6]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from scipy.sparse import save_npz
from bayes_opt import BayesianOptimization

In [7]:
%watermark --iversions

xgboost: 2.0.3
pandas : 2.1.4
numpy  : 1.26.3



In [8]:
data_path = '../input/'

train = pd.read_csv(data_path + 'train.csv.zip', index_col='id')
test = pd.read_csv(data_path + 'test.csv.zip', index_col='id')
submission = pd.read_csv(data_path + 'sample_submission.csv.zip', index_col='id')

In [9]:
all_data = pd.concat([train, test], ignore_index=True)
all_data = all_data.drop('target', axis=1) 

all_features = all_data.columns

In [10]:
cat_features = [feature for feature in all_features if 'cat' in feature]

onehot_encoder = OneHotEncoder()
encoded_cat_matrix = onehot_encoder.fit_transform(all_data[cat_features])

In [11]:
all_data['num_missing'] = (all_data==-1).sum(axis=1)

In [12]:
remaining_features = [feature for feature in all_features
                      if ('cat' not in feature and 'calc' not in feature)]

remaining_features.append('num_missing')

In [13]:
ind_features = [feature for feature in all_features if 'ind' in feature]

is_first_feature = True
for ind_feature in ind_features:
    if is_first_feature:
        all_data['mix_ind'] = all_data[ind_feature].astype(str) + '_'
        is_first_feature = False
    else:
        all_data['mix_ind'] += all_data[ind_feature].astype(str) + '_'

In [14]:
all_data['mix_ind']

0          2_2_5_1_0_0_1_0_0_0_0_0_0_0_11_0_1_0_
1           1_1_7_0_0_0_0_1_0_0_0_0_0_0_3_0_0_1_
2          5_4_9_1_0_0_0_1_0_0_0_0_0_0_12_1_0_0_
3           0_1_2_0_0_1_0_0_0_0_0_0_0_0_8_1_0_0_
4           0_2_0_1_0_1_0_0_0_0_0_0_0_0_9_1_0_0_
                           ...                  
1488023     0_1_6_0_0_0_1_0_0_0_0_0_0_0_2_0_0_1_
1488024    5_3_5_1_0_0_0_1_0_0_0_0_0_0_11_1_0_0_
1488025     0_1_5_0_0_1_0_0_0_0_0_0_0_0_5_0_0_1_
1488026    6_1_5_1_0_0_0_0_1_0_0_0_0_0_13_1_0_0_
1488027    7_1_4_1_0_0_0_0_1_0_0_0_0_0_12_1_0_0_
Name: mix_ind, Length: 1488028, dtype: object

In [15]:
cat_count_features = []
for feature in cat_features+['mix_ind']:
    val_counts_dict = all_data[feature].value_counts().to_dict()
    all_data[f'{feature}_count'] = all_data[feature].apply(lambda x:
                                                           val_counts_dict[x])
    cat_count_features.append(f'{feature}_count')

In [16]:
from scipy import sparse

drop_features = ['ps_ind_14', 'ps_ind_10_bin', 'ps_ind_11_bin',
                 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_car_14']

all_data_remaining = all_data[remaining_features+cat_count_features].drop(drop_features, axis=1)

all_data_sprs = sparse.hstack([sparse.csr_matrix(all_data_remaining),
                               encoded_cat_matrix],
                              format='csr')

In [17]:
num_train = len(train) 


X = all_data_sprs[:num_train]
X_test = all_data_sprs[num_train:]

y = train['target'].values

In [18]:
X.shape

(595212, 217)

In [19]:
def eval_gini(y_true, y_pred):
    assert y_true.shape == y_pred.shape
    
    n_samples = y_true.shape[0]
    L_mid = np.linspace(1 / n_samples, 1, n_samples)

    pred_order = y_true[y_pred.argsort()]
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    G_pred = np.sum(L_mid - L_pred)

    true_order = y_true[y_true.argsort()]
    L_true = np.cumsum(true_order) / np.sum(true_order)
    G_true = np.sum(L_mid - L_true)
    
    return G_pred / G_true

In [20]:
def gini(preds, dtrain):
    labels = dtrain.get_label()
    return 'gini', eval_gini(labels, preds)

In [21]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      test_size=0.2,
                                                      random_state=0)

bayes_dtrain = xgb.DMatrix(X_train, y_train)
bayes_dvalid = xgb.DMatrix(X_valid, y_valid)

In [25]:
param_bounds = {'max_depth': (4,8),
                'subsample': (0.6, 0.9),
                'colsample_bytree': (0.7, 1.0),
                'min_child_weight': (5, 7),
                'gamma': (8, 11),
                'reg_alpha': (7, 9),
                'reg_lambda': (1.1, 1.5),
                'scale_pos_weight': (1.4, 1.6)}


fixed_params = {'objective' : 'binary:logistic',
                'learning_rate': 0.01,
                'tree_method' : 'hist',
                'device': 'cuda',
                'random_state': 1993}

In [26]:
def eval_function(max_depth, subsample, colsample_bytree, min_child_weight,
                 reg_alpha, gamma, reg_lambda, scale_pos_weight):

    params = {'max_depth': int(round(max_depth)),
              'subsample': subsample,
              'colsample_bytree': colsample_bytree,
              'min_child_weight': min_child_weight,
              'gamma': gamma,
              'reg_alpha':reg_alpha,
              'reg_lambda': reg_lambda,
              'scale_pos_weight': scale_pos_weight}

    params.update(fixed_params)
    
    print('hyperparameters :', params)    
        
    # XGBoost model training
    xgb_model = xgb.train(params=params, 
                          dtrain=bayes_dtrain,
                          num_boost_round=4000,
                          evals=[(bayes_dvalid, 'bayes_dvalid')],
                          maximize=True,
                          feval=gini,
                          early_stopping_rounds=300,
                          verbose_eval=False)
                           
    best_iter = xgb_model.best_iteration # optimal number of iterations
    # Make predictions with validation data
    preds = xgb_model.predict(bayes_dvalid, 
                              iteration_range=(0, best_iter))
    # Gini coefficient calculation
    gini_score = eval_gini(y_valid, preds)
    print(f'Gini coefficient: {gini_score}\n')
    
    return gini_score


In [27]:
%%time
optimizer = BayesianOptimization(f=eval_function, 
                                 pbounds=param_bounds, 
                                 random_state=0)


optimizer.maximize(init_points=3, n_iter=25)

|   iter    |  target   | colsam... |   gamma   | max_depth | min_ch... | reg_alpha | reg_la... | scale_... | subsample |
-------------------------------------------------------------------------------------------------------------------------
hyperparameters : {'max_depth': 6, 'subsample': 0.867531900234624, 'colsample_bytree': 0.8646440511781974, 'min_child_weight': 6.0897663659937935, 'gamma': 10.14556809911726, 'reg_alpha': 7.84730959867781, 'reg_lambda': 1.3583576452266626, 'scale_pos_weight': 1.4875174422525386, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}
Gini coefficient: 0.27725263638809877

| [0m1        [0m | [0m0.2773   [0m | [0m0.8646   [0m | [0m10.15    [0m | [0m6.411    [0m | [0m6.09     [0m | [0m7.847    [0m | [0m1.358    [0m | [0m1.488    [0m | [0m0.8675   [0m |
hyperparameters : {'max_depth': 7, 'subsample': 0.6261387899104622, 'colsample_bytree': 0.9890988281503088, 'min_chil



Gini coefficient: 0.2775577982453122

| [0m4        [0m | [0m0.2776   [0m | [0m0.8843   [0m | [0m10.45    [0m | [0m6.838    [0m | [0m6.494    [0m | [0m8.552    [0m | [0m1.381    [0m | [0m1.423    [0m | [0m0.7002   [0m |
hyperparameters : {'max_depth': 7, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 6.003622047114569, 'gamma': 8.743547016432192, 'reg_alpha': 8.168383775711607, 'reg_lambda': 1.5, 'scale_pos_weight': 1.4, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.28036355424782544

| [95m5        [0m | [95m0.2804   [0m | [95m1.0      [0m | [95m8.744    [0m | [95m7.402    [0m | [95m6.004    [0m | [95m8.168    [0m | [95m1.5      [0m | [95m1.4      [0m | [95m0.6      [0m |
hyperparameters : {'max_depth': 8, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 5.01875181438279, 'gamma': 8.014663076640137, 'reg_alpha': 8.324037944061198, 'reg_lambda': 1.5, 'scale_pos_weight': 1.4, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.28158294411933565

| [95m6        [0m | [95m0.2816   [0m | [95m1.0      [0m | [95m8.015    [0m | [95m7.654    [0m | [95m5.019    [0m | [95m8.324    [0m | [95m1.5      [0m | [95m1.4      [0m | [95m0.6      [0m |
hyperparameters : {'max_depth': 8, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 5.0, 'gamma': 8.0, 'reg_alpha': 7.0, 'reg_lambda': 1.1, 'scale_pos_weight': 1.4, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.2818897598059814

| [95m7        [0m | [95m0.2819   [0m | [95m1.0      [0m | [95m8.0      [0m | [95m8.0      [0m | [95m5.0      [0m | [95m7.0      [0m | [95m1.1      [0m | [95m1.4      [0m | [95m0.6      [0m |
hyperparameters : {'max_depth': 8, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 7.0, 'gamma': 8.0, 'reg_alpha': 7.0, 'reg_lambda': 1.1, 'scale_pos_weight': 1.4, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.2816689068574829

| [0m8        [0m | [0m0.2817   [0m | [0m1.0      [0m | [0m8.0      [0m | [0m8.0      [0m | [0m7.0      [0m | [0m7.0      [0m | [0m1.1      [0m | [0m1.4      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 4, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 5.0, 'gamma': 8.0, 'reg_alpha': 9.0, 'reg_lambda': 1.1, 'scale_pos_weight': 1.4, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.27953615430052314

| [0m9        [0m | [0m0.2795   [0m | [0m1.0      [0m | [0m8.0      [0m | [0m4.0      [0m | [0m5.0      [0m | [0m9.0      [0m | [0m1.1      [0m | [0m1.4      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 6, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 5.0, 'gamma': 8.0, 'reg_alpha': 7.0, 'reg_lambda': 1.5, 'scale_pos_weight': 1.4, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.281577169868897

| [0m10       [0m | [0m0.2816   [0m | [0m1.0      [0m | [0m8.0      [0m | [0m6.439    [0m | [0m5.0      [0m | [0m7.0      [0m | [0m1.5      [0m | [0m1.4      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 4, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 7.0, 'gamma': 8.0, 'reg_alpha': 7.0, 'reg_lambda': 1.1, 'scale_pos_weight': 1.4, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.2800018912691984

| [0m11       [0m | [0m0.28     [0m | [0m1.0      [0m | [0m8.0      [0m | [0m4.0      [0m | [0m7.0      [0m | [0m7.0      [0m | [0m1.1      [0m | [0m1.4      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 7, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 7.0, 'gamma': 8.0, 'reg_alpha': 7.0, 'reg_lambda': 1.5, 'scale_pos_weight': 1.4, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.2818194053358264

| [0m12       [0m | [0m0.2818   [0m | [0m1.0      [0m | [0m8.0      [0m | [0m6.508    [0m | [0m7.0      [0m | [0m7.0      [0m | [0m1.5      [0m | [0m1.4      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 6, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 7.0, 'gamma': 8.0, 'reg_alpha': 9.0, 'reg_lambda': 1.1, 'scale_pos_weight': 1.4, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.2812495377668045

| [0m13       [0m | [0m0.2812   [0m | [0m1.0      [0m | [0m8.0      [0m | [0m6.118    [0m | [0m7.0      [0m | [0m9.0      [0m | [0m1.1      [0m | [0m1.4      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 6, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 5.0, 'gamma': 8.0, 'reg_alpha': 9.0, 'reg_lambda': 1.1, 'scale_pos_weight': 1.4, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.2811856206962982

| [0m14       [0m | [0m0.2812   [0m | [0m1.0      [0m | [0m8.0      [0m | [0m6.424    [0m | [0m5.0      [0m | [0m9.0      [0m | [0m1.1      [0m | [0m1.4      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 7, 'subsample': 0.6, 'colsample_bytree': 0.7, 'min_child_weight': 5.999497105374086, 'gamma': 8.0, 'reg_alpha': 7.0, 'reg_lambda': 1.1, 'scale_pos_weight': 1.6, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.28210588983815477

| [95m15       [0m | [95m0.2821   [0m | [95m0.7      [0m | [95m8.0      [0m | [95m7.111    [0m | [95m5.999    [0m | [95m7.0      [0m | [95m1.1      [0m | [95m1.6      [0m | [95m0.6      [0m |
hyperparameters : {'max_depth': 7, 'subsample': 0.6, 'colsample_bytree': 0.7, 'min_child_weight': 7.0, 'gamma': 8.0, 'reg_alpha': 7.843941675661287, 'reg_lambda': 1.1, 'scale_pos_weight': 1.6, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.2819313036778345

| [0m16       [0m | [0m0.2819   [0m | [0m0.7      [0m | [0m8.0      [0m | [0m7.029    [0m | [0m7.0      [0m | [0m7.844    [0m | [0m1.1      [0m | [0m1.6      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 8, 'subsample': 0.9, 'colsample_bytree': 0.7, 'min_child_weight': 5.814928105868421, 'gamma': 8.0, 'reg_alpha': 7.0, 'reg_lambda': 1.5, 'scale_pos_weight': 1.6, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.2805937640699338

| [0m17       [0m | [0m0.2806   [0m | [0m0.7      [0m | [0m8.0      [0m | [0m8.0      [0m | [0m5.815    [0m | [0m7.0      [0m | [0m1.5      [0m | [0m1.6      [0m | [0m0.9      [0m |
hyperparameters : {'max_depth': 8, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 7.0, 'gamma': 8.0, 'reg_alpha': 9.0, 'reg_lambda': 1.1, 'scale_pos_weight': 1.4, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.2815285901456691

| [0m18       [0m | [0m0.2815   [0m | [0m1.0      [0m | [0m8.0      [0m | [0m8.0      [0m | [0m7.0      [0m | [0m9.0      [0m | [0m1.1      [0m | [0m1.4      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 6, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 6.151662298449365, 'gamma': 8.0, 'reg_alpha': 7.0, 'reg_lambda': 1.1, 'scale_pos_weight': 1.6, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.2820892747155691

| [0m19       [0m | [0m0.2821   [0m | [0m1.0      [0m | [0m8.0      [0m | [0m6.117    [0m | [0m6.152    [0m | [0m7.0      [0m | [0m1.1      [0m | [0m1.6      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 7, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 5.980023457599013, 'gamma': 8.0, 'reg_alpha': 7.67487105892614, 'reg_lambda': 1.1, 'scale_pos_weight': 1.4, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.281782038495278

| [0m20       [0m | [0m0.2818   [0m | [0m1.0      [0m | [0m8.0      [0m | [0m6.817    [0m | [0m5.98     [0m | [0m7.675    [0m | [0m1.1      [0m | [0m1.4      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 8, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 5.0, 'gamma': 8.0, 'reg_alpha': 9.0, 'reg_lambda': 1.1, 'scale_pos_weight': 1.6, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.282614104748607

| [95m21       [0m | [95m0.2826   [0m | [95m1.0      [0m | [95m8.0      [0m | [95m8.0      [0m | [95m5.0      [0m | [95m9.0      [0m | [95m1.1      [0m | [95m1.6      [0m | [95m0.6      [0m |
hyperparameters : {'max_depth': 8, 'subsample': 0.6, 'colsample_bytree': 0.7, 'min_child_weight': 5.0, 'gamma': 8.0, 'reg_alpha': 9.0, 'reg_lambda': 1.5, 'scale_pos_weight': 1.6, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.28223028288174223

| [0m22       [0m | [0m0.2822   [0m | [0m0.7      [0m | [0m8.0      [0m | [0m8.0      [0m | [0m5.0      [0m | [0m9.0      [0m | [0m1.5      [0m | [0m1.6      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 8, 'subsample': 0.9, 'colsample_bytree': 0.7, 'min_child_weight': 5.490047753415671, 'gamma': 8.0, 'reg_alpha': 9.0, 'reg_lambda': 1.1, 'scale_pos_weight': 1.6, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.27990298499067706

| [0m23       [0m | [0m0.2799   [0m | [0m0.7      [0m | [0m8.0      [0m | [0m8.0      [0m | [0m5.49     [0m | [0m9.0      [0m | [0m1.1      [0m | [0m1.6      [0m | [0m0.9      [0m |
hyperparameters : {'max_depth': 4, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 7.0, 'gamma': 11.0, 'reg_alpha': 9.0, 'reg_lambda': 1.1, 'scale_pos_weight': 1.4, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.27610385977667906

| [0m24       [0m | [0m0.2761   [0m | [0m1.0      [0m | [0m11.0     [0m | [0m4.0      [0m | [0m7.0      [0m | [0m9.0      [0m | [0m1.1      [0m | [0m1.4      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 8, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 5.0, 'gamma': 9.105269134894964, 'reg_alpha': 9.0, 'reg_lambda': 1.5, 'scale_pos_weight': 1.4, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.27996659879172764

| [0m25       [0m | [0m0.28     [0m | [0m1.0      [0m | [0m9.105    [0m | [0m8.0      [0m | [0m5.0      [0m | [0m9.0      [0m | [0m1.5      [0m | [0m1.4      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 4, 'subsample': 0.6, 'colsample_bytree': 0.7, 'min_child_weight': 5.0, 'gamma': 8.0, 'reg_alpha': 7.0, 'reg_lambda': 1.5, 'scale_pos_weight': 1.6, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.28011848826597513

| [0m26       [0m | [0m0.2801   [0m | [0m0.7      [0m | [0m8.0      [0m | [0m4.486    [0m | [0m5.0      [0m | [0m7.0      [0m | [0m1.5      [0m | [0m1.6      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 8, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 5.0, 'gamma': 11.0, 'reg_alpha': 7.0, 'reg_lambda': 1.1, 'scale_pos_weight': 1.4, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.2777872033924644

| [0m27       [0m | [0m0.2778   [0m | [0m1.0      [0m | [0m11.0     [0m | [0m8.0      [0m | [0m5.0      [0m | [0m7.0      [0m | [0m1.1      [0m | [0m1.4      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 7, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 7.0, 'gamma': 8.7485867447037, 'reg_alpha': 7.0, 'reg_lambda': 1.1, 'scale_pos_weight': 1.6, 'objective': 'binary:logistic', 'learning_rate': 0.01, 'tree_method': 'hist', 'device': 'cuda', 'random_state': 1993}




Gini coefficient: 0.2817580559467126

| [0m28       [0m | [0m0.2818   [0m | [0m1.0      [0m | [0m8.749    [0m | [0m7.138    [0m | [0m7.0      [0m | [0m7.0      [0m | [0m1.1      [0m | [0m1.6      [0m | [0m0.6      [0m |
CPU times: user 26min 47s, sys: 56.3 s, total: 27min 43s
Wall time: 26min 49s


In [28]:
max_params = optimizer.max['params']
max_params

{'colsample_bytree': 1.0,
 'gamma': 8.0,
 'max_depth': 8.0,
 'min_child_weight': 5.0,
 'reg_alpha': 9.0,
 'reg_lambda': 1.1,
 'scale_pos_weight': 1.6,
 'subsample': 0.6}

In [29]:
max_params['max_depth'] = int(round(max_params['max_depth']))

max_params.update(fixed_params)
max_params

{'colsample_bytree': 1.0,
 'gamma': 8.0,
 'max_depth': 8,
 'min_child_weight': 5.0,
 'reg_alpha': 9.0,
 'reg_lambda': 1.1,
 'scale_pos_weight': 1.6,
 'subsample': 0.6,
 'objective': 'binary:logistic',
 'learning_rate': 0.01,
 'tree_method': 'hist',
 'device': 'cuda',
 'random_state': 1993}

In [30]:
%%time

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1991)


oof_val_preds = np.zeros(X.shape[0]) 
oof_test_preds = np.zeros(X_test.shape[0]) 
dtest = xgb.DMatrix(X_test)

for idx, (train_idx, valid_idx) in enumerate(folds.split(X, y)):

    print('#'*40, f'Fold {idx+1} / {folds.n_splits}', '#'*40)
    
    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]


    dtrain = xgb.DMatrix(X_train, y_train)
    dvalid = xgb.DMatrix(X_valid, y_valid)
   

    xgb_model = xgb.train(params=max_params, 
                          dtrain=dtrain,
                          num_boost_round=5000,
                          evals=[(dvalid, 'valid')],
                          maximize=True,
                          feval=gini,
                          early_stopping_rounds=300,
                          verbose_eval=200)

    best_iter = xgb_model.best_iteration
    oof_test_preds += xgb_model.predict(dtest,
                                        iteration_range=(0, best_iter))/folds.n_splits
    
    oof_val_preds[valid_idx] += xgb_model.predict(dvalid, 
                                                  iteration_range=(0, best_iter))
    
    gini_score = eval_gini(y_valid, oof_val_preds[valid_idx])
    print(f'Fold {idx+1} gini score : {gini_score}\n')

######################################## Fold 1 / 5 ########################################
[0]	valid-logloss:0.22081	valid-gini:0.21636




[200]	valid-logloss:0.16259	valid-gini:0.27678
[400]	valid-logloss:0.15688	valid-gini:0.28882
[600]	valid-logloss:0.15607	valid-gini:0.29330
[800]	valid-logloss:0.15593	valid-gini:0.29487
[1000]	valid-logloss:0.15588	valid-gini:0.29533
[1200]	valid-logloss:0.15587	valid-gini:0.29588
[1400]	valid-logloss:0.15585	valid-gini:0.29634
[1600]	valid-logloss:0.15586	valid-gini:0.29687
[1800]	valid-logloss:0.15582	valid-gini:0.29723
[2000]	valid-logloss:0.15582	valid-gini:0.29722
[2200]	valid-logloss:0.15581	valid-gini:0.29739
[2400]	valid-logloss:0.15582	valid-gini:0.29756
[2600]	valid-logloss:0.15581	valid-gini:0.29767
[2800]	valid-logloss:0.15580	valid-gini:0.29789
[3000]	valid-logloss:0.15578	valid-gini:0.29796
[3200]	valid-logloss:0.15578	valid-gini:0.29827
[3400]	valid-logloss:0.15579	valid-gini:0.29837
[3600]	valid-logloss:0.15577	valid-gini:0.29841
[3800]	valid-logloss:0.15579	valid-gini:0.29841
[3883]	valid-logloss:0.15579	valid-gini:0.29840
Fold 1 gini score : 0.2984277239931949

####

In [31]:
print('OOF Verification data Gini coefficient:', eval_gini(y, oof_val_preds))


OOF Verification data Gini coefficient: 0.2882012500025105


In [32]:
submission['target'] = oof_test_preds
submission.to_csv('../submissions/submission_best_kaggle_xgb_gpu_hist_2.csv')

0.28663 Public, 0.28973 Private