In [1]:
%load_ext watermark

In [2]:
%watermark

Last updated: 2024-01-11T16:10:30.648920-05:00

Python implementation: CPython
Python version       : 3.9.6
IPython version      : 8.10.0

Compiler    : Clang 15.0.0 (clang-1500.0.40.1)
OS          : Darwin
Release     : 23.0.0
Machine     : arm64
Processor   : arm
CPU cores   : 20
Architecture: 64bit



In [3]:
%watermark --gpu

GPU Info: Install the gpu extra (pip install 'watermark[gpu]') to display GPU information for NVIDIA chipsets



In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import sklearn
import gc
import os
import xgboost

In [5]:
%watermark --iversions

pandas : 2.1.4
sklearn: 1.3.2
xgboost: 2.0.3
numpy  : 1.26.3



In [6]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Compute the unnormalized Gini score of `pred` as a ranking of `actual`.

    Rows are ordered by descending prediction, with ties broken by original
    position (earlier rows first). The score is the sum of the cumulative
    `actual` values along that ordering divided by their total, minus
    ``(n + 1) / 2``, all divided by ``n``.

    Parameters
    ----------
    actual : sequence of numbers
        True values.
    pred : sequence of numbers
        Predicted scores; must have the same length as `actual`.
    cmpcol, sortcol : int
        Not used by the body; kept so existing callers keep working.

    Returns
    -------
    float
        The Gini score (NaN when `actual` sums to zero or the input is empty).

    Raises
    ------
    AssertionError
        If `actual` and `pred` differ in length.
    """
    n_rows = len(actual)
    assert len(pred) == n_rows

    # Columns: 0 = actual, 1 = prediction, 2 = original row position.
    table = np.asarray(np.c_[actual, pred, np.arange(n_rows)], dtype=float)

    # np.lexsort uses the LAST key as the primary one: negated prediction
    # (descending score), then original position to break ties.
    ranking = np.lexsort((table[:, 2], -1 * table[:, 1]))
    ranked_actual = table[ranking, 0]

    total = ranked_actual.sum()
    score = ranked_actual.cumsum().sum() / total
    score -= (n_rows + 1) / 2.
    return score / n_rows
 
def gini_normalized(a, p):
    """Return the Gini score of predictions `p` relative to a perfect ranking.

    The denominator is ``gini(a, a)``: the score obtained when the true values
    `a` are used as their own predictions.
    """
    model_score = gini(a, p)
    perfect_score = gini(a, a)
    return model_score / perfect_score

In [7]:
# Competition data: the labelled training rows, the test rows to predict,
# and the submission template that the predictions are later written into.
train = pd.read_csv('../input/train.csv.zip')
test = pd.read_csv('../input/test.csv.zip')
sample_submission = pd.read_csv('../input/sample_submission.csv.zip')

In [8]:
# Trial log of a previous hyperparameter search (one row per trial: objective
# value, sampled XGBoost parameters, and trial state).
optuna_log = pd.read_csv('optuna_xgb_output_1.csv')
optuna_log.head()

Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_objective,params_subsample,params_tree_method,state
0,0,0.265787,0.147691,0.514698,logloss,1.31182,0.024202,35,251,binary:logistic,0.405729,gpu_hist,COMPLETE
1,1,0.250456,0.00207,0.918553,logloss,0.001095,0.024384,16,275,binary:logistic,0.729856,gpu_hist,COMPLETE
2,2,0.272139,0.007562,0.982536,logloss,0.439774,0.003817,22,245,binary:logistic,0.474967,gpu_hist,COMPLETE
3,3,0.217394,8.35118,0.582607,logloss,2.247182,0.042538,17,210,binary:logistic,0.964056,gpu_hist,COMPLETE
4,4,0.266944,0.010888,0.525752,logloss,5.060227,0.001376,22,14,binary:logistic,0.64877,gpu_hist,COMPLETE


In [9]:
optuna_log.value.max()

0.2835632739412906

In [10]:
# Trial(s) with the highest objective value; the boolean mask keeps every row
# that ties for the maximum, so this is a DataFrame rather than a single Series.
max_value_row = optuna_log[optuna_log['value'] == optuna_log['value'].max()]

In [11]:
max_value_row

Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_objective,params_subsample,params_tree_method,state
48,48,0.283563,0.018523,0.621896,logloss,0.002266,0.009911,8,292,binary:logistic,0.475931,gpu_hist,COMPLETE


In [12]:
# Feature columns = every column of `test` after the first one.
# NOTE(review): assumes the first column of `test` is the row identifier — confirm.
columns = test.columns[1:]

In [14]:
%%time
# Per-fold feature frames and target series; position k holds fold k.
train_folds, val_folds = [], []
train_ys, val_ys = [], []

for i in range(5):
    print(f'Loading fold {i}')
    train_fold = pd.read_csv(f'../input/xgtrain_fold_{i}_5X.csv.gz')
    val_fold = pd.read_csv(f'../input/xgval_fold_{i}_5X.csv.gz')

    # Set the labels aside before stripping them from the feature frames.
    train_y, val_y = train_fold['target'], val_fold['target']
    train_ys.append(train_y)
    val_ys.append(val_y)

    # Keep every column except the label. `Index.difference` returns the
    # remaining names sorted, so the stored frames do not keep the file's
    # column order; downstream cells re-select with `[columns]`.
    train_fold = train_fold[train_fold.columns.difference(['target'])]
    val_fold = val_fold[val_fold.columns.difference(['target'])]
    train_folds.append(train_fold)
    val_folds.append(val_fold)

Loading fold 0
Loading fold 1
Loading fold 2
Loading fold 3
Loading fold 4
CPU times: user 18.9 s, sys: 5.16 s, total: 24.1 s
Wall time: 24.1 s


In [15]:
# NumPy copies of the original train/test features (in `columns` order) and
# the training labels. In the cells visible here, `X` is only used for its row
# count and to generate the K-fold split positions; the models themselves are
# fit on the pre-built fold files.
X = train[columns].values
X_test = test[columns].values
Y = train.target.values

In [16]:
XGBClassifier()

In [17]:
max_value_row

Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_objective,params_subsample,params_tree_method,state
48,48,0.283563,0.018523,0.621896,logloss,0.002266,0.009911,8,292,binary:logistic,0.475931,gpu_hist,COMPLETE


In [34]:
%%time
# 5-fold cross-validated XGBoost fit on the pre-built fold files, using the
# parameters of the best logged trial shown above (trial 48), with
# tree_method='hist' instead of the logged 'gpu_hist'.

# Out-of-fold prediction for every row of `train`.
train_oof = np.zeros((X.shape[0], ))
# Running average of the per-fold test predictions (becomes an array in the loop).
test_preds = 0

n_splits = 5
# The splitter only supplies `val_index`: the row positions in `train` that
# each fold's validation predictions are written to. Training and validation
# features come from `train_folds` / `val_folds`, so `train_index` is unused.
# NOTE(review): this presumably reproduces the split that was used to build
# the fold files (same n_splits, seed and row order) — confirm against the
# script that wrote them, otherwise `train_oof` is scrambled.
kf = KFold(n_splits=n_splits, random_state=137, shuffle=True)

xgb_params = dict(
    n_estimators=1000,
    max_depth=8,
    random_state=42,
    subsample=0.475931,
    colsample_bytree=0.621896,
    learning_rate=0.009911,
    min_child_weight=292,
    reg_lambda=0.002266,
    reg_alpha=0.018523,
    tree_method='hist',
)

for jj, (train_index, val_index) in enumerate(kf.split(X)):
    print("Fitting fold", jj+1)
    train_features = train_folds[jj][columns]
    train_target = train_ys[jj]

    val_features = val_folds[jj][columns]
    val_target = val_ys[jj]

    # Fail before the expensive fit if the fold file does not line up with the
    # split; otherwise the mismatch only surfaces at the OOF assignment below.
    if len(val_features) != len(val_index):
        raise ValueError(
            f"Fold {jj}: validation file has {len(val_features)} rows but the "
            f"KFold split expects {len(val_index)}"
        )

    model = XGBClassifier(**xgb_params)
    model.fit(train_features, train_target)

    # Probability of the positive class for the held-out rows.
    val_pred = model.predict_proba(val_features)[:,1]
    train_oof[val_index] = val_pred
    print("Fold normalized:", gini_normalized(val_target, val_pred))

    # Each fold contributes 1/n_splits of the final test prediction.
    test_preds += model.predict_proba(X_test)[:,1]/n_splits

    # Release the per-fold frames before the next fold.
    del train_features, train_target, val_features, val_target
    gc.collect()

Fitting fold 1
Fold normalized: 0.2874925234058961
Fitting fold 2
Fold normalized: 0.2922564701232187
Fitting fold 3
Fold normalized: 0.2772175160796194
Fitting fold 4
Fold normalized: 0.28516547245026885
Fitting fold 5
Fold normalized: 0.28032196571779805
CPU times: user 40min 41s, sys: 46min 40s, total: 1h 27min 21s
Wall time: 4min 44s


In [35]:
gini_normalized(Y, train_oof)

0.28427929805044927

In [36]:
roc_auc_score(Y, train_oof)

0.6421396487439169

In [37]:
test[columns].columns

Index(['ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
       'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
       'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
       'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
       'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
       'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
       'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
       'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11',
       'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01',
       'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06',
       'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11',
       'ps_calc_12', 'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin',
       'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin',
       'ps_calc_20_bin'],
      dtype='object')

In [38]:
train_folds[jj][columns].columns

Index(['ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
       'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
       'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
       'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
       'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
       'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
       'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
       'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11',
       'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01',
       'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06',
       'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11',
       'ps_calc_12', 'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin',
       'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin',
       'ps_calc_20_bin'],
      dtype='object')

In [39]:
# Put the fold-averaged test predictions into the submission template.
sample_submission['target'] = test_preds

In [40]:
# Write the submission as a zip-compressed CSV without the DataFrame index.
sample_submission.to_csv('../submissions/XGB_5_fold_5X_augment_optuna_best.csv.zip', index=False, compression='zip')

On the leaderboard, this submission scores 0.28061 on the public test set and 0.28563 on the private test set.

In [41]:
%%time
# 5-fold cross-validated XGBoost fit on the pre-built fold files. Same settings
# as the best logged trial shown above (trial 48), except for a lower learning
# rate (0.005 instead of the logged 0.009911) with 2000 trees, and
# tree_method='hist' instead of the logged 'gpu_hist'.

# Out-of-fold prediction for every row of `train`.
train_oof = np.zeros((X.shape[0], ))
# Running average of the per-fold test predictions (becomes an array in the loop).
test_preds = 0

n_splits = 5
# The splitter only supplies `val_index`: the row positions in `train` that
# each fold's validation predictions are written to. Training and validation
# features come from `train_folds` / `val_folds`, so `train_index` is unused.
# NOTE(review): this presumably reproduces the split that was used to build
# the fold files (same n_splits, seed and row order) — confirm against the
# script that wrote them, otherwise `train_oof` is scrambled.
kf = KFold(n_splits=n_splits, random_state=137, shuffle=True)

xgb_params = dict(
    n_estimators=2000,
    max_depth=8,
    random_state=42,
    subsample=0.475931,
    colsample_bytree=0.621896,
    learning_rate=0.005,
    min_child_weight=292,
    reg_lambda=0.002266,
    reg_alpha=0.018523,
    tree_method='hist',
)

for jj, (train_index, val_index) in enumerate(kf.split(X)):
    print("Fitting fold", jj+1)
    train_features = train_folds[jj][columns]
    train_target = train_ys[jj]

    val_features = val_folds[jj][columns]
    val_target = val_ys[jj]

    # Fail before the expensive fit if the fold file does not line up with the
    # split; otherwise the mismatch only surfaces at the OOF assignment below.
    if len(val_features) != len(val_index):
        raise ValueError(
            f"Fold {jj}: validation file has {len(val_features)} rows but the "
            f"KFold split expects {len(val_index)}"
        )

    model = XGBClassifier(**xgb_params)
    model.fit(train_features, train_target)

    # Probability of the positive class for the held-out rows.
    val_pred = model.predict_proba(val_features)[:,1]
    train_oof[val_index] = val_pred
    print("Fold normalized:", gini_normalized(val_target, val_pred))

    # Each fold contributes 1/n_splits of the final test prediction.
    test_preds += model.predict_proba(X_test)[:,1]/n_splits

    # Release the per-fold frames before the next fold.
    del train_features, train_target, val_features, val_target
    gc.collect()

Fitting fold 1
Fold normalized: 0.2875500522136766
Fitting fold 2
Fold normalized: 0.2939535528124366
Fitting fold 3
Fold normalized: 0.27634291567571456
Fitting fold 4
Fold normalized: 0.2840482411294676
Fitting fold 5
Fold normalized: 0.2810101674987164
CPU times: user 1h 20min 35s, sys: 1h 34min 55s, total: 2h 55min 31s
Wall time: 9min 22s


In [42]:
gini_normalized(Y, train_oof)

0.28439204803696866

In [43]:
roc_auc_score(Y, train_oof)

0.6421960229334411

In [45]:
# Put the fold-averaged test predictions into the submission template and
# write it as a zip-compressed CSV without the DataFrame index.
sample_submission['target'] = test_preds
sample_submission.to_csv('../submissions/XGB_5_fold_5X_augment_optuna_best_2.csv.zip', index=False, compression='zip')

On the leaderboard, this submission scores 0.28067 on the public test set and 0.28591 on the private test set.

In [48]:
%%time
# 5-fold cross-validated XGBoost fit on the pre-built fold files. Same settings
# as the best logged trial shown above (trial 48), except for a lower learning
# rate (0.005 instead of the logged 0.009911) with 1900 trees, and
# tree_method='hist' instead of the logged 'gpu_hist'.

# Out-of-fold prediction for every row of `train`.
train_oof = np.zeros((X.shape[0], ))
# Running average of the per-fold test predictions (becomes an array in the loop).
test_preds = 0

n_splits = 5
# The splitter only supplies `val_index`: the row positions in `train` that
# each fold's validation predictions are written to. Training and validation
# features come from `train_folds` / `val_folds`, so `train_index` is unused.
# NOTE(review): this presumably reproduces the split that was used to build
# the fold files (same n_splits, seed and row order) — confirm against the
# script that wrote them, otherwise `train_oof` is scrambled.
kf = KFold(n_splits=n_splits, random_state=137, shuffle=True)

xgb_params = dict(
    n_estimators=1900,
    max_depth=8,
    random_state=42,
    subsample=0.475931,
    colsample_bytree=0.621896,
    learning_rate=0.005,
    min_child_weight=292,
    reg_lambda=0.002266,
    reg_alpha=0.018523,
    tree_method='hist',
)

for jj, (train_index, val_index) in enumerate(kf.split(X)):
    print("Fitting fold", jj+1)
    train_features = train_folds[jj][columns]
    train_target = train_ys[jj]

    val_features = val_folds[jj][columns]
    val_target = val_ys[jj]

    # Fail before the expensive fit if the fold file does not line up with the
    # split; otherwise the mismatch only surfaces at the OOF assignment below.
    if len(val_features) != len(val_index):
        raise ValueError(
            f"Fold {jj}: validation file has {len(val_features)} rows but the "
            f"KFold split expects {len(val_index)}"
        )

    model = XGBClassifier(**xgb_params)
    model.fit(train_features, train_target)

    # Probability of the positive class for the held-out rows.
    val_pred = model.predict_proba(val_features)[:,1]
    train_oof[val_index] = val_pred
    print("Fold normalized:", gini_normalized(val_target, val_pred))

    # Each fold contributes 1/n_splits of the final test prediction.
    test_preds += model.predict_proba(X_test)[:,1]/n_splits

    # Release the per-fold frames before the next fold.
    del train_features, train_target, val_features, val_target
    gc.collect()

Fitting fold 1
Fold normalized: 0.28776565280271915
Fitting fold 2
Fold normalized: 0.29390645707718416
Fitting fold 3
Fold normalized: 0.27637937140843605
Fitting fold 4
Fold normalized: 0.28419723118337376
Fitting fold 5
Fold normalized: 0.2810284607366598
CPU times: user 1h 17min 22s, sys: 1h 30min 46s, total: 2h 48min 8s
Wall time: 8min 57s


In [49]:
gini_normalized(Y, train_oof)

0.28446784948518056

In [50]:
# Put the fold-averaged test predictions into the submission template and
# write it as a zip-compressed CSV without the DataFrame index.
sample_submission['target'] = test_preds
sample_submission.to_csv('../submissions/XGB_5_fold_5X_augment_optuna_best_3.csv.zip', index=False, compression='zip')

On the leaderboard, this submission scores 0.28067 on the public test set and 0.28585 on the private test set.

In [54]:
%%time
# 5-fold cross-validated XGBoost fit on the pre-built fold files. Same settings
# as the best logged trial shown above (trial 48), except for a lower learning
# rate (0.002 instead of the logged 0.009911) with 4600 trees, and
# tree_method='hist' instead of the logged 'gpu_hist'.

# Out-of-fold prediction for every row of `train`.
train_oof = np.zeros((X.shape[0], ))
# Running average of the per-fold test predictions (becomes an array in the loop).
test_preds = 0

n_splits = 5
# The splitter only supplies `val_index`: the row positions in `train` that
# each fold's validation predictions are written to. Training and validation
# features come from `train_folds` / `val_folds`, so `train_index` is unused.
# NOTE(review): this presumably reproduces the split that was used to build
# the fold files (same n_splits, seed and row order) — confirm against the
# script that wrote them, otherwise `train_oof` is scrambled.
kf = KFold(n_splits=n_splits, random_state=137, shuffle=True)

xgb_params = dict(
    n_estimators=4600,
    max_depth=8,
    random_state=42,
    subsample=0.475931,
    colsample_bytree=0.621896,
    learning_rate=0.002,
    min_child_weight=292,
    reg_lambda=0.002266,
    reg_alpha=0.018523,
    tree_method='hist',
)

for jj, (train_index, val_index) in enumerate(kf.split(X)):
    print("Fitting fold", jj+1)
    train_features = train_folds[jj][columns]
    train_target = train_ys[jj]

    val_features = val_folds[jj][columns]
    val_target = val_ys[jj]

    # Fail before the expensive fit if the fold file does not line up with the
    # split; otherwise the mismatch only surfaces at the OOF assignment below.
    if len(val_features) != len(val_index):
        raise ValueError(
            f"Fold {jj}: validation file has {len(val_features)} rows but the "
            f"KFold split expects {len(val_index)}"
        )

    model = XGBClassifier(**xgb_params)
    model.fit(train_features, train_target)

    # Probability of the positive class for the held-out rows.
    val_pred = model.predict_proba(val_features)[:,1]
    train_oof[val_index] = val_pred
    print("Fold normalized:", gini_normalized(val_target, val_pred))

    # Each fold contributes 1/n_splits of the final test prediction.
    test_preds += model.predict_proba(X_test)[:,1]/n_splits

    # Release the per-fold frames before the next fold.
    del train_features, train_target, val_features, val_target
    gc.collect()

Fitting fold 1
Fold normalized: 0.28746895901990765
Fitting fold 2
Fold normalized: 0.2937320988802002
Fitting fold 3
Fold normalized: 0.2765507920278883
Fitting fold 4
Fold normalized: 0.28442596477749593
Fitting fold 5
Fold normalized: 0.2806863129380779
CPU times: user 2h 56min 41s, sys: 2h 53min 35s, total: 5h 50min 16s
Wall time: 34min 36s
