All code is from the following Kaggle notebook: https://www.kaggle.com/code/cocoyachi/safedriver-xgboost-musthave-ch08

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from scipy.sparse import save_npz
from bayes_opt import BayesianOptimization

In [2]:
data_path = '../input/'

train = pd.read_csv(data_path + 'train.csv.zip', index_col='id')
test = pd.read_csv(data_path + 'test.csv.zip', index_col='id')
submission = pd.read_csv(data_path + 'sample_submission.csv.zip', index_col='id')

In [3]:
all_data = pd.concat([train, test], ignore_index=True)
all_data = all_data.drop('target', axis=1) 

all_features = all_data.columns

In [4]:
cat_features = [feature for feature in all_features if 'cat' in feature]

onehot_encoder = OneHotEncoder()
encoded_cat_matrix = onehot_encoder.fit_transform(all_data[cat_features])

In [5]:
all_data['num_missing'] = (all_data==-1).sum(axis=1)

In [6]:
remaining_features = [feature for feature in all_features
                      if ('cat' not in feature and 'calc' not in feature)]

remaining_features.append('num_missing')

In [7]:
ind_features = [feature for feature in all_features if 'ind' in feature]

is_first_feature = True
for ind_feature in ind_features:
    if is_first_feature:
        all_data['mix_ind'] = all_data[ind_feature].astype(str) + '_'
        is_first_feature = False
    else:
        all_data['mix_ind'] += all_data[ind_feature].astype(str) + '_'

In [8]:
all_data['mix_ind']

0          2_2_5_1_0_0_1_0_0_0_0_0_0_0_11_0_1_0_
1           1_1_7_0_0_0_0_1_0_0_0_0_0_0_3_0_0_1_
2          5_4_9_1_0_0_0_1_0_0_0_0_0_0_12_1_0_0_
3           0_1_2_0_0_1_0_0_0_0_0_0_0_0_8_1_0_0_
4           0_2_0_1_0_1_0_0_0_0_0_0_0_0_9_1_0_0_
                           ...                  
1488023     0_1_6_0_0_0_1_0_0_0_0_0_0_0_2_0_0_1_
1488024    5_3_5_1_0_0_0_1_0_0_0_0_0_0_11_1_0_0_
1488025     0_1_5_0_0_1_0_0_0_0_0_0_0_0_5_0_0_1_
1488026    6_1_5_1_0_0_0_0_1_0_0_0_0_0_13_1_0_0_
1488027    7_1_4_1_0_0_0_0_1_0_0_0_0_0_12_1_0_0_
Name: mix_ind, Length: 1488028, dtype: object

In [9]:
cat_count_features = []
for feature in cat_features+['mix_ind']:
    val_counts_dict = all_data[feature].value_counts().to_dict()
    all_data[f'{feature}_count'] = all_data[feature].apply(lambda x:
                                                           val_counts_dict[x])
    cat_count_features.append(f'{feature}_count')

In [10]:
from scipy import sparse

drop_features = ['ps_ind_14', 'ps_ind_10_bin', 'ps_ind_11_bin',
                 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_car_14']

all_data_remaining = all_data[remaining_features+cat_count_features].drop(drop_features, axis=1)

all_data_sprs = sparse.hstack([sparse.csr_matrix(all_data_remaining),
                               encoded_cat_matrix],
                              format='csr')

In [11]:
num_train = len(train) 


X = all_data_sprs[:num_train]
X_test = all_data_sprs[num_train:]

y = train['target'].values

In [12]:
X.shape

(595212, 217)

In [13]:
def eval_gini(y_true, y_pred):
    assert y_true.shape == y_pred.shape
    
    n_samples = y_true.shape[0]
    L_mid = np.linspace(1 / n_samples, 1, n_samples)

    pred_order = y_true[y_pred.argsort()]
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    G_pred = np.sum(L_mid - L_pred)

    true_order = y_true[y_true.argsort()]
    L_true = np.cumsum(true_order) / np.sum(true_order)
    G_true = np.sum(L_mid - L_true)
    
    return G_pred / G_true

In [14]:
def gini(preds, dtrain):
    labels = dtrain.get_label()
    return 'gini', eval_gini(labels, preds)

In [15]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      test_size=0.2,
                                                      random_state=0)

bayes_dtrain = xgb.DMatrix(X_train, y_train)
bayes_dvalid = xgb.DMatrix(X_valid, y_valid)

In [16]:
param_bounds = {'max_depth': (4,12),
                'subsample': (0.6, 1.0),
                'colsample_bytree': (0.5, 1.0),
                'min_child_weight': (5, 20),
                'gamma': (3, 11),
                'reg_alpha': (0, 9),
                'reg_lambda': (0.1, 1.5),
                'scale_pos_weight': (1.0, 2.0)}


fixed_params = {'objective' : 'binary:logistic',
                'learning_rate': 0.02,
                'random_state': 1993}

In [17]:
def eval_function(max_depth, subsample, colsample_bytree, min_child_weight,
                 reg_alpha, gamma, reg_lambda, scale_pos_weight):

    params = {'max_depth': int(round(max_depth)),
              'subsample': subsample,
              'colsample_bytree': colsample_bytree,
              'min_child_weight': min_child_weight,
              'gamma': gamma,
              'reg_alpha':reg_alpha,
              'reg_lambda': reg_lambda,
              'scale_pos_weight': scale_pos_weight}

    params.update(fixed_params)
    
    print('hyperparameters :', params)    
        
    # XGBoost model training
    xgb_model = xgb.train(params=params, 
                          dtrain=bayes_dtrain,
                          num_boost_round=2000,
                          evals=[(bayes_dvalid, 'bayes_dvalid')],
                          maximize=True,
                          feval=gini,
                          early_stopping_rounds=200,
                          verbose_eval=False)
                           
    best_iter = xgb_model.best_iteration # optimal number of iterations
    # Make predictions with validation data
    preds = xgb_model.predict(bayes_dvalid, 
                              iteration_range=(0, best_iter))
    # Gini coefficient calculation
    gini_score = eval_gini(y_valid, preds)
    print(f'Gini coefficient: {gini_score}\n')
    
    return gini_score


In [18]:
%%time
optimizer = BayesianOptimization(f=eval_function, 
                                 pbounds=param_bounds, 
                                 random_state=0)


optimizer.maximize(init_points=6, n_iter=50)

|   iter    |  target   | colsam... |   gamma   | max_depth | min_ch... | reg_alpha | reg_la... | scale_... | subsample |
-------------------------------------------------------------------------------------------------------------------------
hyperparameters : {'max_depth': 9, 'subsample': 0.9567092003128319, 'colsample_bytree': 0.7744067519636624, 'min_child_weight': 13.173247744953454, 'gamma': 8.721514930979357, 'reg_alpha': 3.8128931940501425, 'reg_lambda': 1.0042517582933186, 'scale_pos_weight': 1.4375872112626924, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2791697037080871

| [0m1        [0m | [0m0.2792   [0m | [0m0.7744   [0m | [0m8.722    [0m | [0m8.822    [0m | [0m13.17    [0m | [0m3.813    [0m | [0m1.004    [0m | [0m1.438    [0m | [0m0.9567   [0m |
hyperparameters : {'max_depth': 10, 'subsample': 0.6348517198806163, 'colsample_bytree': 0.9818313802505146, 'min_child_weight': 12.933423796293567, 'gamma': 6.067532150606222, 'reg_alpha': 5.112401049845391, 'reg_lambda': 1.3958352936097254, 'scale_pos_weight': 1.0710360581978868, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}
Gini coefficient: 0.28113448946604874

| [95m2        [0m | [95m0.2811   [0m | [95m0.9818   [0m | [95m6.068    [0m | [95m10.33    [0m | [95m12.93    [0m | [95m5.112    [0m | [95m1.396    [0m | [95m1.071    [0m | [95m0.6349   [0m |
hyperparameters : {'max_depth': 10, 'subsample': 0.9122116705145822, 'colsample_bytree': 0.5101091987201629, 'min_child_weight': 18.050182223702286, 



Gini coefficient: 0.2831538435080924

| [95m7        [0m | [95m0.2832   [0m | [95m1.0      [0m | [95m3.0      [0m | [95m7.963    [0m | [95m9.678    [0m | [95m5.987    [0m | [95m1.5      [0m | [95m1.165    [0m | [95m0.6      [0m |
hyperparameters : {'max_depth': 8, 'subsample': 0.8159658985370651, 'colsample_bytree': 0.6816097525697367, 'min_child_weight': 9.393673977936512, 'gamma': 3.0038029128494568, 'reg_alpha': 6.350105057926053, 'reg_lambda': 0.7622613329095277, 'scale_pos_weight': 1.3034592558542544, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.28276973454614274

| [0m8        [0m | [0m0.2828   [0m | [0m0.6816   [0m | [0m3.004    [0m | [0m7.932    [0m | [0m9.394    [0m | [0m6.35     [0m | [0m0.7623   [0m | [0m1.303    [0m | [0m0.816    [0m |
hyperparameters : {'max_depth': 4, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 6.6285882815682, 'gamma': 3.0, 'reg_alpha': 0.0, 'reg_lambda': 1.5, 'scale_pos_weight': 1.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2833091255566522

| [95m9        [0m | [95m0.2833   [0m | [95m1.0      [0m | [95m3.0      [0m | [95m4.0      [0m | [95m6.629    [0m | [95m0.0      [0m | [95m1.5      [0m | [95m1.0      [0m | [95m0.6      [0m |
hyperparameters : {'max_depth': 12, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 7.723227715999092, 'gamma': 3.0, 'reg_alpha': 0.0, 'reg_lambda': 1.5, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.27392718571533264

| [0m10       [0m | [0m0.2739   [0m | [0m1.0      [0m | [0m3.0      [0m | [0m12.0     [0m | [0m7.723    [0m | [0m0.0      [0m | [0m1.5      [0m | [0m2.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 4, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 6.279716641146567, 'gamma': 3.396433443354955, 'reg_alpha': 5.41529249801106, 'reg_lambda': 1.5, 'scale_pos_weight': 1.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2830366278416258

| [0m11       [0m | [0m0.283    [0m | [0m1.0      [0m | [0m3.396    [0m | [0m4.0      [0m | [0m6.28     [0m | [0m5.415    [0m | [0m1.5      [0m | [0m1.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 4, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 5.0, 'gamma': 9.106523325451503, 'reg_alpha': 1.5597998679729665, 'reg_lambda': 1.5, 'scale_pos_weight': 1.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.27613510461781293

| [0m12       [0m | [0m0.2761   [0m | [0m1.0      [0m | [0m9.107    [0m | [0m4.0      [0m | [0m5.0      [0m | [0m1.56     [0m | [0m1.5      [0m | [0m1.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 4, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 10.53422694745015, 'gamma': 3.0, 'reg_alpha': 2.9577930184415013, 'reg_lambda': 1.5, 'scale_pos_weight': 1.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2830756606423848

| [0m13       [0m | [0m0.2831   [0m | [0m1.0      [0m | [0m3.0      [0m | [0m4.0      [0m | [0m10.53    [0m | [0m2.958    [0m | [0m1.5      [0m | [0m1.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 4, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 10.634161637395266, 'gamma': 3.0, 'reg_alpha': 9.0, 'reg_lambda': 1.5, 'scale_pos_weight': 1.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.28311392920411516

| [0m14       [0m | [0m0.2831   [0m | [0m1.0      [0m | [0m3.0      [0m | [0m4.0      [0m | [0m10.63    [0m | [0m9.0      [0m | [0m1.5      [0m | [0m1.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 7, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 8.790316226204602, 'gamma': 6.88993033389942, 'reg_alpha': 9.0, 'reg_lambda': 1.5, 'scale_pos_weight': 1.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.27837652064258556

| [0m15       [0m | [0m0.2784   [0m | [0m1.0      [0m | [0m6.89     [0m | [0m6.813    [0m | [0m8.79     [0m | [0m9.0      [0m | [0m1.5      [0m | [0m1.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 6, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 13.711443080701658, 'gamma': 3.0, 'reg_alpha': 6.838398408945798, 'reg_lambda': 1.5, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.28375304734247286

| [95m16       [0m | [95m0.2838   [0m | [95m1.0      [0m | [95m3.0      [0m | [95m6.223    [0m | [95m13.71    [0m | [95m6.838    [0m | [95m1.5      [0m | [95m2.0      [0m | [95m0.6      [0m |
hyperparameters : {'max_depth': 12, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 12.943101253541867, 'gamma': 3.0, 'reg_alpha': 9.0, 'reg_lambda': 1.5, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2802530064657938

| [0m17       [0m | [0m0.2803   [0m | [0m1.0      [0m | [0m3.0      [0m | [0m12.0     [0m | [0m12.94    [0m | [0m9.0      [0m | [0m1.5      [0m | [0m2.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 4, 'subsample': 1.0, 'colsample_bytree': 1.0, 'min_child_weight': 15.628240556497023, 'gamma': 3.0, 'reg_alpha': 9.0, 'reg_lambda': 0.1, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.27876002306562353

| [0m18       [0m | [0m0.2788   [0m | [0m1.0      [0m | [0m3.0      [0m | [0m4.0      [0m | [0m15.63    [0m | [0m9.0      [0m | [0m0.1      [0m | [0m2.0      [0m | [0m1.0      [0m |
hyperparameters : {'max_depth': 4, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 10.424712496372111, 'gamma': 3.0, 'reg_alpha': 5.969674624624056, 'reg_lambda': 0.1, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.28577499332746553

| [95m19       [0m | [95m0.2858   [0m | [95m0.5      [0m | [95m3.0      [0m | [95m4.0      [0m | [95m10.42    [0m | [95m5.97     [0m | [95m0.1      [0m | [95m2.0      [0m | [95m0.6      [0m |
hyperparameters : {'max_depth': 6, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 12.30502040321713, 'gamma': 3.0, 'reg_alpha': 4.423740774635093, 'reg_lambda': 0.1, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.28571350645619836

| [0m20       [0m | [0m0.2857   [0m | [0m0.5      [0m | [0m3.0      [0m | [0m6.198    [0m | [0m12.31    [0m | [0m4.424    [0m | [0m0.1      [0m | [0m2.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 5, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 15.397519283900468, 'gamma': 3.0, 'reg_alpha': 0.0, 'reg_lambda': 0.1, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2839615430715453

| [0m21       [0m | [0m0.284    [0m | [0m0.5      [0m | [0m3.0      [0m | [0m5.427    [0m | [0m15.4     [0m | [0m0.0      [0m | [0m0.1      [0m | [0m2.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 11, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 19.164210253636615, 'gamma': 3.0, 'reg_alpha': 0.0, 'reg_lambda': 0.1, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2787788500334402

| [0m22       [0m | [0m0.2788   [0m | [0m0.5      [0m | [0m3.0      [0m | [0m10.85    [0m | [0m19.16    [0m | [0m0.0      [0m | [0m0.1      [0m | [0m2.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 4, 'subsample': 1.0, 'colsample_bytree': 0.5, 'min_child_weight': 14.014302416395104, 'gamma': 3.0, 'reg_alpha': 3.5159727605622404, 'reg_lambda': 0.1, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2785173994137907

| [0m23       [0m | [0m0.2785   [0m | [0m0.5      [0m | [0m3.0      [0m | [0m4.0      [0m | [0m14.01    [0m | [0m3.516    [0m | [0m0.1      [0m | [0m2.0      [0m | [0m1.0      [0m |
hyperparameters : {'max_depth': 6, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 9.537450791362264, 'gamma': 3.0, 'reg_alpha': 4.1695454964712075, 'reg_lambda': 0.1, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.28508238250069584

| [0m24       [0m | [0m0.2851   [0m | [0m0.5      [0m | [0m3.0      [0m | [0m5.829    [0m | [0m9.537    [0m | [0m4.17     [0m | [0m0.1      [0m | [0m2.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 6, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 11.024230096262, 'gamma': 3.0, 'reg_alpha': 5.757686418697249, 'reg_lambda': 1.5, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2848805586996512

| [0m25       [0m | [0m0.2849   [0m | [0m0.5      [0m | [0m3.0      [0m | [0m5.663    [0m | [0m11.02    [0m | [0m5.758    [0m | [0m1.5      [0m | [0m2.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 8, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 12.82230518582083, 'gamma': 3.0, 'reg_alpha': 2.6657373015027037, 'reg_lambda': 0.1, 'scale_pos_weight': 1.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2824012823752983

| [0m26       [0m | [0m0.2824   [0m | [0m1.0      [0m | [0m3.0      [0m | [0m8.276    [0m | [0m12.82    [0m | [0m2.666    [0m | [0m0.1      [0m | [0m1.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 6, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 11.556447992478173, 'gamma': 3.0, 'reg_alpha': 6.033803693946755, 'reg_lambda': 0.1, 'scale_pos_weight': 1.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.28432381663267675

| [0m27       [0m | [0m0.2843   [0m | [0m1.0      [0m | [0m3.0      [0m | [0m6.069    [0m | [0m11.56    [0m | [0m6.034    [0m | [0m0.1      [0m | [0m1.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 5, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 10.802149901238312, 'gamma': 3.0, 'reg_alpha': 0.0, 'reg_lambda': 0.1, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2837052722942011

| [0m28       [0m | [0m0.2837   [0m | [0m0.5      [0m | [0m3.0      [0m | [0m5.339    [0m | [0m10.8     [0m | [0m0.0      [0m | [0m0.1      [0m | [0m2.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 12, 'subsample': 1.0, 'colsample_bytree': 0.5, 'min_child_weight': 5.0, 'gamma': 11.0, 'reg_alpha': 9.0, 'reg_lambda': 0.1, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.27689771003498337

| [0m29       [0m | [0m0.2769   [0m | [0m0.5      [0m | [0m11.0     [0m | [0m12.0     [0m | [0m5.0      [0m | [0m9.0      [0m | [0m0.1      [0m | [0m2.0      [0m | [0m1.0      [0m |
hyperparameters : {'max_depth': 4, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 6.672358555118746, 'gamma': 3.0, 'reg_alpha': 9.0, 'reg_lambda': 0.1, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2848735107174982

| [0m30       [0m | [0m0.2849   [0m | [0m0.5      [0m | [0m3.0      [0m | [0m4.0      [0m | [0m6.672    [0m | [0m9.0      [0m | [0m0.1      [0m | [0m2.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 7, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 5.0, 'gamma': 3.0, 'reg_alpha': 9.0, 'reg_lambda': 0.1, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2836956242819137

| [0m31       [0m | [0m0.2837   [0m | [0m0.5      [0m | [0m3.0      [0m | [0m7.321    [0m | [0m5.0      [0m | [0m9.0      [0m | [0m0.1      [0m | [0m2.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 4, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 20.0, 'gamma': 3.0, 'reg_alpha': 0.0, 'reg_lambda': 1.5, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.28374934341151925

| [0m32       [0m | [0m0.2837   [0m | [0m1.0      [0m | [0m3.0      [0m | [0m4.0      [0m | [0m20.0     [0m | [0m0.0      [0m | [0m1.5      [0m | [0m2.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 4, 'subsample': 1.0, 'colsample_bytree': 1.0, 'min_child_weight': 8.334391434283368, 'gamma': 3.0, 'reg_alpha': 6.7703188115090445, 'reg_lambda': 0.1, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.27962709470133845

| [0m33       [0m | [0m0.2796   [0m | [0m1.0      [0m | [0m3.0      [0m | [0m4.0      [0m | [0m8.334    [0m | [0m6.77     [0m | [0m0.1      [0m | [0m2.0      [0m | [0m1.0      [0m |
hyperparameters : {'max_depth': 5, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 10.937057246176103, 'gamma': 4.591287100112677, 'reg_alpha': 4.733185258742242, 'reg_lambda': 0.1, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2848768102891774

| [0m34       [0m | [0m0.2849   [0m | [0m0.5      [0m | [0m4.591    [0m | [0m5.092    [0m | [0m10.94    [0m | [0m4.733    [0m | [0m0.1      [0m | [0m2.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 5, 'subsample': 0.6405999102570725, 'colsample_bytree': 0.6687685850574014, 'min_child_weight': 5.047525744860247, 'gamma': 3.8180267436183364, 'reg_alpha': 8.130707811139215, 'reg_lambda': 0.2776022825205654, 'scale_pos_weight': 1.5026213992980457, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.28508657974996415

| [0m35       [0m | [0m0.2851   [0m | [0m0.6688   [0m | [0m3.818    [0m | [0m4.58     [0m | [0m5.048    [0m | [0m8.131    [0m | [0m0.2776   [0m | [0m1.503    [0m | [0m0.6406   [0m |
hyperparameters : {'max_depth': 8, 'subsample': 1.0, 'colsample_bytree': 0.5, 'min_child_weight': 12.466901589607495, 'gamma': 3.0, 'reg_alpha': 5.660353347824085, 'reg_lambda': 0.1, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2786563413040715

| [0m36       [0m | [0m0.2787   [0m | [0m0.5      [0m | [0m3.0      [0m | [0m8.292    [0m | [0m12.47    [0m | [0m5.66     [0m | [0m0.1      [0m | [0m2.0      [0m | [0m1.0      [0m |
hyperparameters : {'max_depth': 6, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 11.096968327244278, 'gamma': 3.0, 'reg_alpha': 3.401595303852577, 'reg_lambda': 0.1, 'scale_pos_weight': 1.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2858380491127173

| [95m37       [0m | [95m0.2858   [0m | [95m0.5      [0m | [95m3.0      [0m | [95m5.963    [0m | [95m11.1     [0m | [95m3.402    [0m | [95m0.1      [0m | [95m1.0      [0m | [95m0.6      [0m |
hyperparameters : {'max_depth': 6, 'subsample': 0.6, 'colsample_bytree': 1.0, 'min_child_weight': 9.514272271745538, 'gamma': 4.625382598580858, 'reg_alpha': 2.057727545620671, 'reg_lambda': 0.1, 'scale_pos_weight': 1.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2835544098925107

| [0m38       [0m | [0m0.2836   [0m | [0m1.0      [0m | [0m4.625    [0m | [0m6.344    [0m | [0m9.514    [0m | [0m2.058    [0m | [0m0.1      [0m | [0m1.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 7, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 5.0, 'gamma': 3.0, 'reg_alpha': 5.754680072558848, 'reg_lambda': 0.1, 'scale_pos_weight': 1.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2862708186697498

| [95m39       [0m | [95m0.2863   [0m | [95m0.5      [0m | [95m3.0      [0m | [95m6.73     [0m | [95m5.0      [0m | [95m5.755    [0m | [95m0.1      [0m | [95m1.0      [0m | [95m0.6      [0m |
hyperparameters : {'max_depth': 6, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 5.0, 'gamma': 3.0, 'reg_alpha': 3.2402026593588538, 'reg_lambda': 0.1, 'scale_pos_weight': 1.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2862500993005289

| [0m40       [0m | [0m0.2863   [0m | [0m0.5      [0m | [0m3.0      [0m | [0m6.289    [0m | [0m5.0      [0m | [0m3.24     [0m | [0m0.1      [0m | [0m1.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 7, 'subsample': 0.8528017051971715, 'colsample_bytree': 0.7519103638793956, 'min_child_weight': 6.05909731722155, 'gamma': 3.5471295269482717, 'reg_alpha': 4.045176783558097, 'reg_lambda': 1.3322098905446553, 'scale_pos_weight': 1.5301111434515544, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2819015347480523

| [0m41       [0m | [0m0.2819   [0m | [0m0.7519   [0m | [0m3.547    [0m | [0m6.676    [0m | [0m6.059    [0m | [0m4.045    [0m | [0m1.332    [0m | [0m1.53     [0m | [0m0.8528   [0m |
hyperparameters : {'max_depth': 4, 'subsample': 0.9902272908232179, 'colsample_bytree': 0.7880588262205819, 'min_child_weight': 5.412310924492727, 'gamma': 4.02776044559181, 'reg_alpha': 2.7146121016809595, 'reg_lambda': 0.2618273606700073, 'scale_pos_weight': 1.6362868122474445, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.27910729894126707

| [0m42       [0m | [0m0.2791   [0m | [0m0.7881   [0m | [0m4.028    [0m | [0m4.103    [0m | [0m5.412    [0m | [0m2.715    [0m | [0m0.2618   [0m | [0m1.636    [0m | [0m0.9902   [0m |
hyperparameters : {'max_depth': 7, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 11.133206977060174, 'gamma': 3.0, 'reg_alpha': 2.817656794919924, 'reg_lambda': 1.5, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2832827411139968

| [0m43       [0m | [0m0.2833   [0m | [0m0.5      [0m | [0m3.0      [0m | [0m6.568    [0m | [0m11.13    [0m | [0m2.818    [0m | [0m1.5      [0m | [0m2.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 5, 'subsample': 1.0, 'colsample_bytree': 1.0, 'min_child_weight': 11.189794258890743, 'gamma': 3.0, 'reg_alpha': 4.619885455537425, 'reg_lambda': 0.1, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2781509043418345

| [0m44       [0m | [0m0.2782   [0m | [0m1.0      [0m | [0m3.0      [0m | [0m4.829    [0m | [0m11.19    [0m | [0m4.62     [0m | [0m0.1      [0m | [0m2.0      [0m | [0m1.0      [0m |
hyperparameters : {'max_depth': 7, 'subsample': 0.7407025126968992, 'colsample_bytree': 0.5500551133729729, 'min_child_weight': 11.331546522393722, 'gamma': 3.5845923506142423, 'reg_alpha': 4.2691655450581, 'reg_lambda': 0.5803399505547525, 'scale_pos_weight': 1.7213844180262907, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2832398911616873

| [0m45       [0m | [0m0.2832   [0m | [0m0.5501   [0m | [0m3.585    [0m | [0m6.855    [0m | [0m11.33    [0m | [0m4.269    [0m | [0m0.5803   [0m | [0m1.721    [0m | [0m0.7407   [0m |
hyperparameters : {'max_depth': 6, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 10.130947390319509, 'gamma': 3.846265260077911, 'reg_alpha': 3.541675428098197, 'reg_lambda': 0.19967771940430956, 'scale_pos_weight': 1.2043232767713237, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.28532928427365195

| [0m46       [0m | [0m0.2853   [0m | [0m0.5      [0m | [0m3.846    [0m | [0m5.883    [0m | [0m10.13    [0m | [0m3.542    [0m | [0m0.1997   [0m | [0m1.204    [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 6, 'subsample': 0.6495299434821125, 'colsample_bytree': 0.7666876174995412, 'min_child_weight': 5.2405416841369155, 'gamma': 3.077144814018438, 'reg_alpha': 4.424728698254653, 'reg_lambda': 0.3086821315170775, 'scale_pos_weight': 1.41133310585121, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.28349438275822414

| [0m47       [0m | [0m0.2835   [0m | [0m0.7667   [0m | [0m3.077    [0m | [0m5.752    [0m | [0m5.241    [0m | [0m4.425    [0m | [0m0.3087   [0m | [0m1.411    [0m | [0m0.6495   [0m |
hyperparameters : {'max_depth': 6, 'subsample': 0.6, 'colsample_bytree': 0.5266915139907162, 'min_child_weight': 5.048764455352355, 'gamma': 3.225634665662528, 'reg_alpha': 6.929661008157756, 'reg_lambda': 0.16234728592988912, 'scale_pos_weight': 1.2320261441287903, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2855241490515839

| [0m48       [0m | [0m0.2855   [0m | [0m0.5267   [0m | [0m3.226    [0m | [0m6.024    [0m | [0m5.049    [0m | [0m6.93     [0m | [0m0.1623   [0m | [0m1.232    [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 5, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 10.727250119134368, 'gamma': 3.921736611333804, 'reg_alpha': 6.558044898602449, 'reg_lambda': 0.5641053289123935, 'scale_pos_weight': 1.6568597330580395, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2856733535802493

| [0m49       [0m | [0m0.2857   [0m | [0m0.5      [0m | [0m3.922    [0m | [0m4.83     [0m | [0m10.73    [0m | [0m6.558    [0m | [0m0.5641   [0m | [0m1.657    [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 6, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 9.835328576678714, 'gamma': 3.9294823766233615, 'reg_alpha': 5.404910852391608, 'reg_lambda': 0.6965597571283834, 'scale_pos_weight': 1.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2842619496636916

| [0m50       [0m | [0m0.2843   [0m | [0m0.5      [0m | [0m3.929    [0m | [0m5.508    [0m | [0m9.835    [0m | [0m5.405    [0m | [0m0.6966   [0m | [0m1.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 8, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 5.0, 'gamma': 3.0, 'reg_alpha': 6.842972494186192, 'reg_lambda': 0.1, 'scale_pos_weight': 1.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.28475973614845684

| [0m51       [0m | [0m0.2848   [0m | [0m0.5      [0m | [0m3.0      [0m | [0m7.742    [0m | [0m5.0      [0m | [0m6.843    [0m | [0m0.1      [0m | [0m1.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 8, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 5.0, 'gamma': 3.0, 'reg_alpha': 4.077609047493253, 'reg_lambda': 0.1, 'scale_pos_weight': 1.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.28534960737078385

| [0m52       [0m | [0m0.2853   [0m | [0m0.5      [0m | [0m3.0      [0m | [0m7.648    [0m | [0m5.0      [0m | [0m4.078    [0m | [0m0.1      [0m | [0m1.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 7, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 5.0, 'gamma': 4.568221093956778, 'reg_alpha': 6.021894358340864, 'reg_lambda': 0.1, 'scale_pos_weight': 1.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2835239818571111

| [0m53       [0m | [0m0.2835   [0m | [0m0.5      [0m | [0m4.568    [0m | [0m6.802    [0m | [0m5.0      [0m | [0m6.022    [0m | [0m0.1      [0m | [0m1.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 6, 'subsample': 0.887224934893416, 'colsample_bytree': 0.899599677937608, 'min_child_weight': 6.238105589370054, 'gamma': 3.116010630263956, 'reg_alpha': 8.842753695990275, 'reg_lambda': 0.4662642090111643, 'scale_pos_weight': 1.0537510462066357, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.2817289784712896

| [0m54       [0m | [0m0.2817   [0m | [0m0.8996   [0m | [0m3.116    [0m | [0m5.553    [0m | [0m6.238    [0m | [0m8.843    [0m | [0m0.4663   [0m | [0m1.054    [0m | [0m0.8872   [0m |
hyperparameters : {'max_depth': 6, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 11.81519748448993, 'gamma': 4.2428371517554, 'reg_alpha': 2.615400423173235, 'reg_lambda': 0.1, 'scale_pos_weight': 1.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.28358669798057246

| [0m55       [0m | [0m0.2836   [0m | [0m0.5      [0m | [0m4.243    [0m | [0m5.952    [0m | [0m11.82    [0m | [0m2.615    [0m | [0m0.1      [0m | [0m1.0      [0m | [0m0.6      [0m |
hyperparameters : {'max_depth': 4, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 11.419413476751902, 'gamma': 3.0, 'reg_alpha': 7.224648542076648, 'reg_lambda': 0.2955600803585693, 'scale_pos_weight': 2.0, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1993}




Gini coefficient: 0.28562446250038004

| [0m56       [0m | [0m0.2856   [0m | [0m0.5      [0m | [0m3.0      [0m | [0m4.0      [0m | [0m11.42    [0m | [0m7.225    [0m | [0m0.2956   [0m | [0m2.0      [0m | [0m0.6      [0m |
CPU times: user 1d 12h 50min 9s, sys: 2min 59s, total: 1d 12h 53min 9s
Wall time: 2h 29min 16s


In [20]:
max_params = optimizer.max['params']
max_params

{'colsample_bytree': 0.5,
 'gamma': 3.0,
 'max_depth': 6.7298867719391895,
 'min_child_weight': 5.0,
 'reg_alpha': 5.754680072558848,
 'reg_lambda': 0.1,
 'scale_pos_weight': 1.0,
 'subsample': 0.6}

In [21]:
max_params['max_depth'] = int(round(max_params['max_depth']))

max_params.update(fixed_params)
max_params

{'colsample_bytree': 0.5,
 'gamma': 3.0,
 'max_depth': 7,
 'min_child_weight': 5.0,
 'reg_alpha': 5.754680072558848,
 'reg_lambda': 0.1,
 'scale_pos_weight': 1.0,
 'subsample': 0.6,
 'objective': 'binary:logistic',
 'learning_rate': 0.02,
 'random_state': 1993}

In [22]:
%%time

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1991)


oof_val_preds = np.zeros(X.shape[0]) 
oof_test_preds = np.zeros(X_test.shape[0]) 
dtest = xgb.DMatrix(X_test)

for idx, (train_idx, valid_idx) in enumerate(folds.split(X, y)):

    print('#'*40, f'Fold {idx+1} / {folds.n_splits}', '#'*40)
    
    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]


    dtrain = xgb.DMatrix(X_train, y_train)
    dvalid = xgb.DMatrix(X_valid, y_valid)
   

    xgb_model = xgb.train(params=max_params, 
                          dtrain=dtrain,
                          num_boost_round=2000,
                          evals=[(dvalid, 'valid')],
                          maximize=True,
                          feval=gini,
                          early_stopping_rounds=200,
                          verbose_eval=100)

    best_iter = xgb_model.best_iteration
    oof_test_preds += xgb_model.predict(dtest,
                                        iteration_range=(0, best_iter))/folds.n_splits
    
    oof_val_preds[valid_idx] += xgb_model.predict(dvalid, 
                                                  iteration_range=(0, best_iter))
    
    gini_score = eval_gini(y_valid, oof_val_preds[valid_idx])
    print(f'Fold {idx+1} gini score : {gini_score}\n')

######################################## Fold 1 / 5 ########################################




[0]	valid-logloss:0.21137	valid-gini:0.19231
[100]	valid-logloss:0.15529	valid-gini:0.27176
[200]	valid-logloss:0.15202	valid-gini:0.28596
[300]	valid-logloss:0.15163	valid-gini:0.29233
[400]	valid-logloss:0.15147	valid-gini:0.29643
[500]	valid-logloss:0.15143	valid-gini:0.29755
[600]	valid-logloss:0.15141	valid-gini:0.29791
[700]	valid-logloss:0.15140	valid-gini:0.29831
[800]	valid-logloss:0.15139	valid-gini:0.29865
[900]	valid-logloss:0.15138	valid-gini:0.29891
[1000]	valid-logloss:0.15138	valid-gini:0.29893
[1100]	valid-logloss:0.15137	valid-gini:0.29916
[1200]	valid-logloss:0.15138	valid-gini:0.29872
[1300]	valid-logloss:0.15139	valid-gini:0.29879
[1315]	valid-logloss:0.15138	valid-gini:0.29890
Fold 1 gini score : 0.2991940038752933

######################################## Fold 2 / 5 ########################################
[0]	valid-logloss:0.21137	valid-gini:0.18049
[100]	valid-logloss:0.15543	valid-gini:0.26108
[200]	valid-logloss:0.15227	valid-gini:0.27386
[300]	valid-logloss:

In [23]:
print('OOF Verification data Gini coefficient:', eval_gini(y, oof_val_preds))


OOF Verification data Gini coefficient: 0.28864889403014304


In [24]:
submission['target'] = oof_test_preds
submission.to_csv('../submissions/submission_best_kaggle_xgb_2.csv')

In [27]:
fixed_params = {'objective' : 'binary:logistic',
                'learning_rate': 0.005,
                'random_state': 1993}

max_params.update(fixed_params)
max_params

{'colsample_bytree': 0.5,
 'gamma': 3.0,
 'max_depth': 7,
 'min_child_weight': 5.0,
 'reg_alpha': 5.754680072558848,
 'reg_lambda': 0.1,
 'scale_pos_weight': 1.0,
 'subsample': 0.6,
 'objective': 'binary:logistic',
 'learning_rate': 0.005,
 'random_state': 1993}

In [28]:
%%time

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1991)


oof_val_preds = np.zeros(X.shape[0]) 
oof_test_preds = np.zeros(X_test.shape[0]) 
dtest = xgb.DMatrix(X_test)

for idx, (train_idx, valid_idx) in enumerate(folds.split(X, y)):

    print('#'*40, f'Fold {idx+1} / {folds.n_splits}', '#'*40)
    
    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]


    dtrain = xgb.DMatrix(X_train, y_train)
    dvalid = xgb.DMatrix(X_valid, y_valid)
   

    xgb_model = xgb.train(params=max_params, 
                          dtrain=dtrain,
                          num_boost_round=20000,
                          evals=[(dvalid, 'valid')],
                          maximize=True,
                          feval=gini,
                          early_stopping_rounds=1000,
                          verbose_eval=400)

    best_iter = xgb_model.best_iteration
    oof_test_preds += xgb_model.predict(dtest,
                                        iteration_range=(0, best_iter))/folds.n_splits
    
    oof_val_preds[valid_idx] += xgb_model.predict(dvalid, 
                                                  iteration_range=(0, best_iter))
    
    gini_score = eval_gini(y_valid, oof_val_preds[valid_idx])
    print(f'Fold {idx+1} gini score : {gini_score}\n')

######################################## Fold 1 / 5 ########################################




[0]	valid-logloss:0.21263	valid-gini:0.19231
[400]	valid-logloss:0.15541	valid-gini:0.27204
[800]	valid-logloss:0.15202	valid-gini:0.28597
[1200]	valid-logloss:0.15160	valid-gini:0.29323
[1600]	valid-logloss:0.15145	valid-gini:0.29684
[2000]	valid-logloss:0.15140	valid-gini:0.29823
[2400]	valid-logloss:0.15137	valid-gini:0.29896
[2800]	valid-logloss:0.15135	valid-gini:0.29966
[3200]	valid-logloss:0.15133	valid-gini:0.30010
[3600]	valid-logloss:0.15132	valid-gini:0.30029
[4000]	valid-logloss:0.15131	valid-gini:0.30054
[4400]	valid-logloss:0.15131	valid-gini:0.30071
[4800]	valid-logloss:0.15131	valid-gini:0.30086
[5200]	valid-logloss:0.15130	valid-gini:0.30087
[5600]	valid-logloss:0.15130	valid-gini:0.30094
[6000]	valid-logloss:0.15130	valid-gini:0.30096
[6400]	valid-logloss:0.15130	valid-gini:0.30099
[6800]	valid-logloss:0.15130	valid-gini:0.30096
[7200]	valid-logloss:0.15130	valid-gini:0.30100
[7600]	valid-logloss:0.15130	valid-gini:0.30104
[8000]	valid-logloss:0.15130	valid-gini:0.301

In [29]:
print('OOF Verification data Gini coefficient:', eval_gini(y, oof_val_preds))

OOF Verification data Gini coefficient: 0.2897355723149737


In [30]:
submission['target'] = oof_test_preds
submission.to_csv('../submissions/submission_best_kaggle_xgb_3.csv')

In [31]:
max_params_0 = {'colsample_bytree': 0.92975858050776,
 'gamma': 9.95563546750357,
 'max_depth': 7,
 'min_child_weight': 6.249564429359247,
 'reg_alpha': 8.411512219837842,
 'reg_lambda': 1.424460008293778,
 'scale_pos_weight': 1.5416807226581535,
 'subsample': 0.8535233675350644,
 'objective': 'binary:logistic',
 'learning_rate': 0.01,
 'random_state': 1993}

In [32]:
%%time

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1991)


oof_val_preds = np.zeros(X.shape[0]) 
oof_test_preds = np.zeros(X_test.shape[0]) 
dtest = xgb.DMatrix(X_test)

for idx, (train_idx, valid_idx) in enumerate(folds.split(X, y)):

    print('#'*40, f'Fold {idx+1} / {folds.n_splits}', '#'*40)
    
    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]


    dtrain = xgb.DMatrix(X_train, y_train)
    dvalid = xgb.DMatrix(X_valid, y_valid)
   

    xgb_model = xgb.train(params=max_params_0, 
                          dtrain=dtrain,
                          num_boost_round=10000,
                          evals=[(dvalid, 'valid')],
                          maximize=True,
                          feval=gini,
                          early_stopping_rounds=200,
                          verbose_eval=200)

    best_iter = xgb_model.best_iteration
    oof_test_preds += xgb_model.predict(dtest,
                                        iteration_range=(0, best_iter))/folds.n_splits
    
    oof_val_preds[valid_idx] += xgb_model.predict(dvalid, 
                                                  iteration_range=(0, best_iter))
    
    gini_score = eval_gini(y_valid, oof_val_preds[valid_idx])
    print(f'Fold {idx+1} gini score : {gini_score}\n')

######################################## Fold 1 / 5 ########################################




[0]	valid-logloss:0.21996	valid-gini:0.20901
[200]	valid-logloss:0.16179	valid-gini:0.27455
[400]	valid-logloss:0.15622	valid-gini:0.28722
[600]	valid-logloss:0.15546	valid-gini:0.29128
[800]	valid-logloss:0.15535	valid-gini:0.29186
[1000]	valid-logloss:0.15533	valid-gini:0.29202
[1200]	valid-logloss:0.15532	valid-gini:0.29232
[1400]	valid-logloss:0.15532	valid-gini:0.29240
[1600]	valid-logloss:0.15530	valid-gini:0.29252
[1800]	valid-logloss:0.15531	valid-gini:0.29254
[2000]	valid-logloss:0.15531	valid-gini:0.29265
[2200]	valid-logloss:0.15530	valid-gini:0.29272
[2366]	valid-logloss:0.15530	valid-gini:0.29273
Fold 1 gini score : 0.2927066786908154

######################################## Fold 2 / 5 ########################################
[0]	valid-logloss:0.21996	valid-gini:0.19909
[200]	valid-logloss:0.16203	valid-gini:0.26142
[400]	valid-logloss:0.15664	valid-gini:0.27163
[600]	valid-logloss:0.15591	valid-gini:0.27571
[800]	valid-logloss:0.15581	valid-gini:0.27639
[1000]	valid-logl

In [42]:
save_npz('../input/X_test_217.npz', X_test)

In [43]:
np.save('../input/y_217.npy', y)