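This notebook implements a complete, customized cross-validation routine using XGBoost. Besides the cross-validated AUC, it also produces the final test-set prediction, the per-fold feature importance, and the out-of-fold prediction for the training set.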

In [1]:
from tqdm import tnrange, tqdm, tqdm_notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.ion()
import xgboost as xgb
import warnings; warnings.simplefilter('ignore')
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import StratifiedKFold, KFold
import gc
import time

In [None]:
def cv(df, n_folds, params, stratified = True, save_train_prediction = False, n_estimators=10000):
    """Cross-validate an XGBoost model and write its predictions to disk.

    Rows of ``df`` with a non-null ``TARGET`` are used for training; rows with
    a null ``TARGET`` are treated as the test set. Every column other than
    ``TARGET`` is used as a feature.

    For each fold a booster is trained with early stopping (200 rounds) on the
    held-out part, and then:

    * the held-out rows receive their out-of-fold prediction,
    * the test-set prediction is accumulated as the mean over all folds,
    * the fold's feature importance (split counts, ``importance_type='weight'``)
      is written to ``Fold_<n>_feature_importance.csv``.

    Files written to the current working directory:
    ``Fold_<n>_feature_importance.csv`` (one per fold), ``test_prediction.csv``
    and, when requested, ``train_prediction.csv``.

    Args:
        df: DataFrame holding train and test rows together, with a ``TARGET``
            column.
        n_folds: Number of cross-validation folds.
        params: Parameter dict forwarded to ``xgb.train``.
        stratified: Use ``StratifiedKFold`` when true, plain ``KFold`` otherwise.
        save_train_prediction: When true, also save the out-of-fold training
            predictions next to the true ``TARGET``.
        n_estimators: Upper bound on the number of boosting rounds per fold.

    Returns:
        ROC AUC of the out-of-fold predictions over the whole training set.
    """
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print(f"Starting XGBoost. Train shape: {train_df.shape}, test shape: {test_df.shape}")

    if stratified:
        folds = StratifiedKFold(n_splits = n_folds, shuffle = True, random_state = 2018)
    else:
        folds = KFold(n_splits = n_folds, shuffle = True, random_state = 2018)

    # Out-of-fold predictions for the training rows (filled fold by fold).
    test_pred_proba = np.zeros(train_df.shape[0])

    # Test-set prediction, averaged over the folds.
    prediction = np.zeros(test_df.shape[0])

    feats = [f for f in train_df.columns if f not in ['TARGET']]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        print('Fold', n_fold, 'started at', time.ctime())
        valid_y = train_df['TARGET'].iloc[valid_idx]
        xgtrain = xgb.DMatrix(data = train_df[feats].iloc[train_idx], label=train_df['TARGET'].iloc[train_idx])
        evalset = xgb.DMatrix(data = train_df[feats].iloc[valid_idx], label=valid_y)
        bst = xgb.train(params = params, dtrain = xgtrain, num_boost_round = n_estimators, early_stopping_rounds = 200, evals = [(evalset, 'try')], maximize = True,verbose_eval = 100)
        del xgtrain
        gc.collect()

        # `best_iteration` is a 0-based round index while `ntree_limit` is a
        # tree count, so passing it directly would drop the best round (and
        # 0 would mean "use every tree"). Prefer the booster's own limit.
        best_ntree_limit = getattr(bst, 'best_ntree_limit', bst.best_iteration + 1)

        test_pred_proba[valid_idx] = bst.predict(evalset, ntree_limit = best_ntree_limit)
        testinput = xgb.DMatrix(data = test_df[feats])
        prediction += bst.predict(testinput, ntree_limit = best_ntree_limit) / folds.n_splits

        # `get_score` returns a {feature: score} dict; its `fmap` argument is
        # an input feature-map file, not an output path, so persist it here.
        importance = bst.get_score(importance_type='weight')
        df_importance = pd.DataFrame({'feature': list(importance.keys()),
                                      'weight': list(importance.values())})
        df_importance.to_csv(f'Fold_{n_fold}_feature_importance.csv', index=False)

        print(f'Fold {n_fold:2d} AUC : {roc_auc_score(valid_y, test_pred_proba[valid_idx]):.6f}')

        del evalset, testinput, bst, importance, df_importance
        gc.collect()

    roc_auc_test = roc_auc_score(train_df['TARGET'], test_pred_proba)

    print(f'Full AUC score {roc_auc_test:.6f}')

    if save_train_prediction:
        # Copy so the new column is added to an independent frame rather than
        # to a slice of `train_df`.
        df_prediction = train_df[['TARGET']].copy()
        df_prediction['Prediction'] = test_pred_proba
        df_prediction.to_csv('train_prediction.csv')
        del df_prediction
        gc.collect()

    # Same reason for the copy: overwrite TARGET on an independent frame.
    df_prediction = test_df[['TARGET']].copy()
    df_prediction['TARGET'] = prediction
    df_prediction.to_csv('test_prediction.csv')
    del df_prediction
    gc.collect()

    return roc_auc_test

In [2]:
# Parameters forwarded to xgb.train for every cross-validation fold.
# NOTE(review): the 'gpu:binary:logistic' objective name is presumably only
# accepted by older XGBoost releases - confirm against the installed version.
xgb_params = {
    # Boosting / sampling
    'learning_rate': 0.0100664435413599,
    'colsample_bytree': 0.8,
    'subsample': 1,
    # Tree shape and regularisation
    'max_depth': 7,
    'reg_alpha': 0.1,
    'reg_lambda': 1,
    'min_child_weight': 45,
    'gamma': 0,
    'max_bin': 256,
    # Training backend, objective and evaluation metric
    'tree_method': 'gpu_hist',
    'objective': 'gpu:binary:logistic',
    'eval_metric': 'auc',
    'predictor': 'cpu_predictor',
}

In [None]:
# Load the combined train/test table; the first CSV column is the index.
traintest = pd.read_csv('traintest.csv', index_col=0)

# Column names without the label, plus the label name itself.
cols = traintest.columns.tolist()
cols.remove('TARGET')
tar = 'TARGET'

# 5-fold stratified cross validation; also saves the training predictions.
score = cv(
    df=traintest,
    n_folds=5,
    params=xgb_params,
    stratified=True,
    save_train_prediction=True,
    n_estimators=10000,
)