All code is from the following Kaggle notebook: https://www.kaggle.com/code/cocoyachi/safedriver-xgboost-musthave-ch08

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from scipy.sparse import save_npz
from bayes_opt import BayesianOptimization

In [2]:
# Directory that holds the zipped competition CSVs.
data_path = '../input/'

# Read the three competition files in one pass; each one is indexed by its 'id' column.
train, test, submission = (
    pd.read_csv(data_path + file_name, index_col='id')
    for file_name in ('train.csv.zip', 'test.csv.zip', 'sample_submission.csv.zip')
)

In [3]:
# Stack train on top of test (fresh 0..n-1 index) so feature engineering is
# applied to both at once, then remove the label column from the combined frame.
all_data = (
    pd.concat([train, test], ignore_index=True)
    .drop(columns='target')
)

# Snapshot of the raw feature names, taken before any engineered column is added.
all_features = all_data.columns

In [4]:
# Raw columns whose name contains 'cat'.
cat_features = [name for name in all_features if 'cat' in name]

# One-hot encode those columns over the combined train+test rows.
onehot_encoder = OneHotEncoder()
encoded_cat_matrix = onehot_encoder.fit_transform(all_data.loc[:, cat_features])

In [5]:
# Per-row count of cells equal to -1 across every column of all_data.
all_data['num_missing'] = all_data.eq(-1).sum(axis=1)

In [6]:
# Keep raw columns that are neither 'cat' nor 'calc' features, then add the
# engineered missing-value counter at the end of the list.
remaining_features = [
    name for name in all_features
    if not ('cat' in name or 'calc' in name)
] + ['num_missing']

In [7]:
# Raw columns whose name contains 'ind'.
ind_features = [name for name in all_features if 'ind' in name]

# Build 'mix_ind' by joining the string form of every 'ind' column, each value
# followed by '_'. The first column creates the series; later ones append to it.
for position, ind_feature in enumerate(ind_features):
    piece = all_data[ind_feature].astype(str) + '_'
    if position == 0:
        all_data['mix_ind'] = piece
    else:
        all_data['mix_ind'] += piece

In [8]:
# Inspect the '_'-joined signature of the 'ind' columns built in the previous cell.
all_data['mix_ind']

0          2_2_5_1_0_0_1_0_0_0_0_0_0_0_11_0_1_0_
1           1_1_7_0_0_0_0_1_0_0_0_0_0_0_3_0_0_1_
2          5_4_9_1_0_0_0_1_0_0_0_0_0_0_12_1_0_0_
3           0_1_2_0_0_1_0_0_0_0_0_0_0_0_8_1_0_0_
4           0_2_0_1_0_1_0_0_0_0_0_0_0_0_9_1_0_0_
                           ...                  
1488023     0_1_6_0_0_0_1_0_0_0_0_0_0_0_2_0_0_1_
1488024    5_3_5_1_0_0_0_1_0_0_0_0_0_0_11_1_0_0_
1488025     0_1_5_0_0_1_0_0_0_0_0_0_0_0_5_0_0_1_
1488026    6_1_5_1_0_0_0_0_1_0_0_0_0_0_13_1_0_0_
1488027    7_1_4_1_0_0_0_0_1_0_0_0_0_0_12_1_0_0_
Name: mix_ind, Length: 1488028, dtype: object

In [9]:
# Frequency encoding: for every categorical column (plus 'mix_ind'), add a
# '<feature>_count' column holding how often the row's value occurs in all_data.
cat_count_features = []
for feature in cat_features+['mix_ind']:
    count_column = f'{feature}_count'
    # Series.map against the value_counts Series does the lookup in one
    # vectorised pass instead of calling a Python lambda once per row.
    all_data[count_column] = all_data[feature].map(all_data[feature].value_counts())
    cat_count_features.append(count_column)

In [10]:
from scipy import sparse

# Columns excluded from the final feature matrix.
drop_features = ['ps_ind_14', 'ps_ind_10_bin', 'ps_ind_11_bin',
                 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_car_14']

# Non-categorical features plus the count encodings, minus the dropped columns.
selected_columns = remaining_features + cat_count_features
all_data_remaining = all_data[selected_columns].drop(columns=drop_features)

# Final design matrix in CSR format: the selected columns first, followed by
# the one-hot encoded categorical block.
all_data_sprs = sparse.hstack(
    [sparse.csr_matrix(all_data_remaining), encoded_cat_matrix],
    format='csr',
)

In [11]:
# all_data was built as train followed by test, so the first num_train rows
# of the combined matrix are the training rows and the rest are the test rows.
num_train = train.shape[0]
X, X_test = all_data_sprs[:num_train], all_data_sprs[num_train:]

# Training labels as a NumPy array.
y = train['target'].values

In [12]:
# (rows, columns) of the training feature matrix.
X.shape

(595212, 217)

In [13]:
# (rows, columns) of the test feature matrix; the column count must match X.
X_test.shape

(892816, 217)

In [14]:
# Display the training matrix as a dense array.
# NOTE(review): toarray() materialises the whole sparse matrix in memory just
# for this preview; a sliced preview such as X[:5].toarray() would be cheaper.
X.toarray()

array([[ 2.,  5.,  0., ...,  0.,  0.,  0.],
       [ 1.,  7.,  0., ...,  0.,  0.,  0.],
       [ 5.,  9.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 1., 10.,  1., ...,  0.,  0.,  0.],
       [ 5.,  3.,  0., ...,  0.,  0.,  0.],
       [ 0.,  8.,  1., ...,  0.,  0.,  0.]])

In [15]:
# Display the test matrix as a dense array.
# NOTE(review): toarray() materialises the whole sparse matrix in memory just
# for this preview; a sliced preview such as X_test[:5].toarray() would be cheaper.
X_test.toarray()

array([[0., 8., 0., ..., 0., 0., 0.],
       [4., 5., 0., ..., 0., 1., 0.],
       [5., 3., 0., ..., 0., 0., 0.],
       ...,
       [0., 5., 1., ..., 0., 0., 0.],
       [6., 5., 0., ..., 0., 0., 0.],
       [7., 4., 0., ..., 0., 0., 0.]])

In [16]:
# Dense DataFrame view of the training matrix with generic 'col_<i>' names.
# The number of names is taken from the matrix itself rather than hard-coded,
# so the cell keeps working if the feature engineering above changes the width.
X_df = pd.DataFrame(data=X.toarray(), columns=[f'col_{i}' for i in range(X.shape[1])])

In [17]:
# Preview the first rows of the dense training frame.
X_df.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_207,col_208,col_209,col_210,col_211,col_212,col_213,col_214,col_215,col_216
0,2.0,5.0,0.0,1.0,0.0,0.0,11.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,7.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,9.0,0.0,0.0,1.0,0.0,12.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,2.0,1.0,0.0,0.0,0.0,8.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,0.0,9.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Dense DataFrame view of the test matrix with generic 'col_<i>' names.
# The number of names is taken from the matrix itself rather than hard-coded,
# so the cell keeps working if the feature engineering above changes the width.
X_test_df = pd.DataFrame(data=X_test.toarray(), columns=[f'col_{i}' for i in range(X_test.shape[1])])

In [19]:
# Re-read the raw CSVs, this time without index_col, so 'id' is an ordinary column.
# Note: this rebinds `train` and `test`, replacing the id-indexed frames loaded earlier.
train, test = (
    pd.read_csv(data_path + file_name)
    for file_name in ('train.csv.zip', 'test.csv.zip')
)

In [20]:
# Attach the row identifiers (and, for train, the label) to the dense frames.
# Values are assigned positionally, so row order must match the source CSVs.
train_ids = train['id'].values
test_ids = test['id'].values

X_df['id'] = train_ids
X_df['target'] = y
X_test_df['id'] = test_ids

In [21]:
def eval_gini(y_true, y_pred):
    """Score how well ``y_pred`` ranks ``y_true``, relative to a perfect ranking.

    For a given ordering of the labels, the cumulative share of the label sum
    is compared against an evenly spaced reference line, and the differences
    are summed. The score is that sum under the ordering induced by ``y_pred``
    divided by the same sum under the ordering induced by ``y_true`` itself.

    Parameters
    ----------
    y_true : numpy.ndarray
        Ground-truth labels.
    y_pred : numpy.ndarray
        Predicted scores, same shape as ``y_true``.

    Returns
    -------
    The ratio described above (1.0 when the predictions order the labels
    exactly as the labels order themselves).

    Raises
    ------
    AssertionError
        If the two arrays differ in shape.
    """
    assert y_true.shape == y_pred.shape

    n_samples = y_true.shape[0]
    # Evenly spaced reference line from 1/n to 1.
    L_mid = np.linspace(1 / n_samples, 1, n_samples)

    def summed_gap(sort_key):
        # Labels rearranged by ascending sort_key, turned into a cumulative share.
        ordered_labels = y_true[sort_key.argsort()]
        cumulative_share = np.cumsum(ordered_labels) / np.sum(ordered_labels)
        return np.sum(L_mid - cumulative_share)

    return summed_gap(y_pred) / summed_gap(y_true)

In [22]:
def gini(preds, dtrain):
    """Evaluation callback in the ``(preds, dtrain) -> (name, value)`` form.

    Reads the labels from ``dtrain`` via ``get_label()`` and scores ``preds``
    against them with ``eval_gini``; the result is reported under the metric
    name ``'gini'``.
    """
    return 'gini', eval_gini(dtrain.get_label(), preds)

In [27]:
# Best hyper-parameters from an earlier Bayesian-optimisation run. They were
# originally read with `optimizer.max['params']`; no `optimizer` is defined in
# the visible notebook, so the recorded result is pinned here as a literal to
# keep `max_params` defined on a fresh kernel.
max_params = {'colsample_bytree': 0.92975858050776,
 'gamma': 9.95563546750357,
 'max_depth': 6.809274695878221,
 'min_child_weight': 6.249564429359247,
 'reg_alpha': 8.411512219837842,
 'reg_lambda': 1.424460008293778,
 'scale_pos_weight': 1.5416807226581535,
 'subsample': 0.8535233675350644}
max_params

{'colsample_bytree': 0.92975858050776,
 'gamma': 9.95563546750357,
 'max_depth': 6.809274695878221,
 'min_child_weight': 6.249564429359247,
 'reg_alpha': 8.411512219837842,
 'reg_lambda': 1.424460008293778,
 'scale_pos_weight': 1.5416807226581535,
 'subsample': 0.8535233675350644}

In [28]:
# Settings that are held fixed rather than tuned.
# NOTE(review): no `fixed_params` definition is visible in this notebook; the
# values below are reconstructed from the three keys that update() appended in
# this cell's recorded output. Confirm against the original tuning cell.
fixed_params = {'objective': 'binary:logistic',
                'learning_rate': 0.02,
                'random_state': 1993}

# The optimiser searches max_depth as a float; round it to the nearest integer.
max_params['max_depth'] = int(round(max_params['max_depth']))

max_params.update(fixed_params)
max_params

{'colsample_bytree': 0.92975858050776,
 'gamma': 9.95563546750357,
 'max_depth': 7,
 'min_child_weight': 6.249564429359247,
 'reg_alpha': 8.411512219837842,
 'reg_lambda': 1.424460008293778,
 'scale_pos_weight': 1.5416807226581535,
 'subsample': 0.8535233675350644,
 'objective': 'binary:logistic',
 'learning_rate': 0.02,
 'random_state': 1993}

In [23]:
# Parameter set passed to xgb.train in the cross-validation cell.
best_params = {
    # task and tree construction
    'objective': 'binary:logistic',
    'tree_method': 'exact',
    'device': 'cpu',
    # regularisation
    'lambda': 0.016329246014877414,
    'alpha': 3.33029065395022,
    # row / column sampling
    'colsample_bytree': 0.5456389940628341,
    'subsample': 0.7684743978358726,
    # boosting schedule and tree size
    'learning_rate': 0.014586087705844453,
    'max_depth': 7,
    'min_child_weight': 135,
    # built-in metric logged next to the custom gini metric
    'eval_metric': 'logloss',
    'random_state': 1777,
}

In [24]:
%%time

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1991)


oof_val_preds = np.zeros(X.shape[0]) 
oof_test_preds = np.zeros(X_test.shape[0]) 
dtest = xgb.DMatrix(X_test)

for idx, (train_idx, valid_idx) in enumerate(folds.split(X, y)):

    print('#'*40, f'Fold {idx+1} / {folds.n_splits}', '#'*40)
    
    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]


    dtrain = xgb.DMatrix(X_train, y_train)
    dvalid = xgb.DMatrix(X_valid, y_valid)
   

    xgb_model = xgb.train(params=best_params, 
                          dtrain=dtrain,
                          num_boost_round=1000,
                          evals=[(dvalid, 'valid')],
                          maximize=True,
                          feval=gini,
                          early_stopping_rounds=200,
                          verbose_eval=100)

    best_iter = xgb_model.best_iteration
    oof_test_preds += xgb_model.predict(dtest,
                                        iteration_range=(0, best_iter))/folds.n_splits
    
    oof_val_preds[valid_idx] += xgb_model.predict(dvalid, 
                                                  iteration_range=(0, best_iter))
    
    gini_score = eval_gini(y_valid, oof_val_preds[valid_idx])
    print(f'Fold {idx+1} gini score : {gini_score}\n')

######################################## Fold 1 / 5 ########################################




[0]	valid-logloss:0.21182	valid-gini:0.20499
[100]	valid-logloss:0.15918	valid-gini:0.27344
[200]	valid-logloss:0.15278	valid-gini:0.28206
[300]	valid-logloss:0.15188	valid-gini:0.28835
[400]	valid-logloss:0.15163	valid-gini:0.29270
[500]	valid-logloss:0.15151	valid-gini:0.29540
[600]	valid-logloss:0.15144	valid-gini:0.29728
[700]	valid-logloss:0.15140	valid-gini:0.29817
[800]	valid-logloss:0.15137	valid-gini:0.29913
[900]	valid-logloss:0.15135	valid-gini:0.29976
[999]	valid-logloss:0.15136	valid-gini:0.29963
Fold 1 gini score : 0.29976467621935393

######################################## Fold 2 / 5 ########################################
[0]	valid-logloss:0.21182	valid-gini:0.19873
[100]	valid-logloss:0.15933	valid-gini:0.26094
[200]	valid-logloss:0.15304	valid-gini:0.26889
[300]	valid-logloss:0.15223	valid-gini:0.27439
[400]	valid-logloss:0.15203	valid-gini:0.27816
[500]	valid-logloss:0.15192	valid-gini:0.28129
[600]	valid-logloss:0.15187	valid-gini:0.28252
[700]	valid-logloss:0.15

In [25]:
# Score the out-of-fold predictions over the full training set.
oof_gini = eval_gini(y, oof_val_preds)
print('OOF Verification data Gini coefficient:', oof_gini)


OOF Verification data Gini coefficient: 0.2890416750522966


In [26]:
# Fill the sample submission with the fold-averaged test predictions and write it out.
submission_path = '../submissions/submission_best_kaggle_xgb_optuna_hist.csv'

submission['target'] = oof_test_preds
submission.to_csv(submission_path)

In [33]:
# Sanity check before saving: (rows, columns) of the training matrix.
X.shape

(595212, 217)

In [36]:
# Sanity check before saving: (rows, columns) of the test matrix.
X_test.shape

(892816, 217)

In [37]:
# Training labels, taken earlier from train['target'].
y

array([0, 0, 0, ..., 0, 0, 0])

In [38]:
# Repr of the sparse training matrix (shape, dtype, number of stored elements).
X

<595212x217 sparse matrix of type '<class 'numpy.float64'>'
	with 24449411 stored elements in Compressed Sparse Row format>

In [39]:
# Display the training matrix as a dense array.
# NOTE(review): this densifies the full sparse matrix only to show a truncated
# preview; X[:5].toarray() would show the same kind of sample at a fraction of the cost.
X.toarray()

array([[ 2.,  5.,  0., ...,  0.,  0.,  0.],
       [ 1.,  7.,  0., ...,  0.,  0.,  0.],
       [ 5.,  9.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 1., 10.,  1., ...,  0.,  0.,  0.],
       [ 5.,  3.,  0., ...,  0.,  0.,  0.],
       [ 0.,  8.,  1., ...,  0.,  0.,  0.]])

In [41]:
# Persist the sparse training matrix in SciPy's .npz format for reuse elsewhere.
# NOTE(review): this writes into the input directory; confirm it is writable
# in the environment where the notebook runs.
save_npz('../input/X_217.npz', X)

In [42]:
# Persist the sparse test matrix in SciPy's .npz format for reuse elsewhere.
# NOTE(review): this writes into the input directory; confirm it is writable
# in the environment where the notebook runs.
save_npz('../input/X_test_217.npz', X_test)

In [43]:
# Persist the training labels as a NumPy .npy file next to the saved matrices.
# NOTE(review): this writes into the input directory; confirm it is writable
# in the environment where the notebook runs.
np.save('../input/y_217.npy', y)