This is a complete, customized cross-validation routine using LightGBM; it also produces the final test-set prediction, the per-fold feature importances, and the out-of-fold predictions for the training set.

In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
import warnings; warnings.simplefilter('ignore')
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import StratifiedKFold, KFold
import gc
import time

In [None]:
def _lgbm_fit_kwargs(early_stopping_rounds, log_period):
    """Return the early-stopping / logging keyword arguments for ``clf.fit``.

    LightGBM 4.0 removed the ``verbose`` and ``early_stopping_rounds``
    keywords from the scikit-learn ``fit()`` in favour of callbacks, so the
    callbacks are used whenever they can be imported.  Releases that lack the
    callback helpers fall back to the legacy keywords.

    A fresh set of callbacks is built on every call: callers must not share
    one callback object between several ``fit()`` runs, because some LightGBM
    releases keep the best-score state inside the callback.
    """
    try:
        from lightgbm import early_stopping, log_evaluation
    except ImportError:
        return {
            'verbose': log_period,
            'early_stopping_rounds': early_stopping_rounds,
        }
    return {
        'callbacks': [
            early_stopping(stopping_rounds=early_stopping_rounds),
            log_evaluation(period=log_period),
        ],
    }


def cv(df, n_folds, params, stratified=True, save_train_prediction=False,
       early_stopping_rounds=200, log_period=100):
    """Cross-validate an ``LGBMClassifier`` and write the predictions to CSV.

    Rows of ``df`` whose ``TARGET`` is not null are used for training; rows
    whose ``TARGET`` is null are treated as the test set.  Every column other
    than ``TARGET`` is used as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Combined train/test frame containing a ``TARGET`` column.
    n_folds : int
        Number of cross-validation folds.
    params : dict
        Keyword arguments forwarded to ``LGBMClassifier``.
    stratified : bool, default True
        Use ``StratifiedKFold`` when true, plain ``KFold`` otherwise.  Both
        are shuffled with ``random_state=2018``.
    save_train_prediction : bool, default False
        When true, also write the out-of-fold predictions for the training
        rows to ``train_prediction.csv``.
    early_stopping_rounds : int, default 200
        Early-stopping patience on the validation AUC.
    log_period : int, default 100
        Number of boosting rounds between evaluation log lines.

    Returns
    -------
    (pandas.DataFrame, float)
        The feature-importance table (one column per fold plus a ``mean``
        column, indexed by feature name) and the ROC AUC of the out-of-fold
        predictions over all training rows.

    Side effects
    ------------
    Prints progress, and always writes ``test_prediction.csv`` (the test rows'
    index with the fold-averaged probability in ``TARGET``).
    """
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print(f"Starting LightGBM. Train shape: {train_df.shape}, test shape: {test_df.shape}")

    splitter_cls = StratifiedKFold if stratified else KFold
    folds = splitter_cls(n_splits=n_folds, shuffle=True, random_state=2018)

    # Out-of-fold probabilities: each training row is predicted exactly once,
    # by the model of the fold in which that row was held out.
    oof_pred = np.zeros(train_df.shape[0])
    # Test probabilities, accumulated as the average over the fold models.
    prediction = np.zeros(test_df.shape[0])

    feats = [f for f in train_df.columns if f != 'TARGET']

    # Select the feature / target columns once instead of on every fold.
    train_features = train_df[feats]
    train_target = train_df['TARGET']
    has_test_rows = test_df.shape[0] > 0
    test_features = test_df[feats] if has_test_rows else None

    df_feature_importance = pd.DataFrame(index=feats)

    clf = LGBMClassifier(**params)

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_features, train_target)):
        print('Fold', n_fold, 'started at', time.ctime())
        train_x, train_y = train_features.iloc[train_idx], train_target.iloc[train_idx]
        valid_x, valid_y = train_features.iloc[valid_idx], train_target.iloc[valid_idx]

        clf.fit(
            train_x, train_y,
            eval_set=[(valid_x, valid_y)],
            eval_metric='auc',
            **_lgbm_fit_kwargs(early_stopping_rounds, log_period)
        )

        oof_pred[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        # Skip the test prediction when there are no unlabeled rows, so that
        # the function can also be used for pure cross-validation.
        if has_test_rows:
            fold_test_pred = clf.predict_proba(test_features, num_iteration=clf.best_iteration_)[:, 1]
            prediction += fold_test_pred / folds.n_splits

        df_feature_importance[n_fold] = pd.Series(clf.feature_importances_, index=feats)

        print(f'Fold {n_fold:2d} AUC : {roc_auc_score(valid_y, oof_pred[valid_idx]):.6f}')
        del train_x, train_y, valid_x, valid_y
        gc.collect()

    roc_auc_test = roc_auc_score(train_target, oof_pred)

    print(f'Full AUC score {roc_auc_test:.6f}')

    df_feature_importance.fillna(0, inplace=True)
    df_feature_importance['mean'] = df_feature_importance.mean(axis=1)

    if save_train_prediction:
        # Explicit copy: assigning a column into a slice of ``train_df`` is a
        # chained assignment that pandas only warns about.
        df_prediction = train_df[['TARGET']].copy()
        df_prediction['Prediction'] = oof_pred
        df_prediction.to_csv('train_prediction.csv')
        del df_prediction
        gc.collect()

    df_prediction = test_df[['TARGET']].copy()
    df_prediction['TARGET'] = prediction
    df_prediction.to_csv('test_prediction.csv')
    del df_prediction
    gc.collect()

    return df_feature_importance, roc_auc_test

In [None]:
# Keyword arguments for the LGBMClassifier constructor; the dict is passed
# to cv() as its `params` argument.
lgbm_params = {
    'n_estimators': 10000,
    'learning_rate': 0.0100664435413599,
    'num_leaves': 44,
    'colsample_bytree': 0.8,
    'subsample': 1,
    'max_depth': 8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.0192546149319087,
    'min_split_gain': 0.1,
    'min_child_weight': 48,
}

In [None]:
# Name of the label column; rows where it is null form the test set in cv().
tar = 'TARGET'

# Combined train/test table; the first CSV column is used as the index.
traintest = pd.read_csv('traintest.csv', index_col=0)

# Every column except the label.
cols = list(traintest.columns)
cols.remove(tar)

# 5-fold stratified cross-validation; the out-of-fold training predictions
# are saved as well as the test predictions.
feature_importance, score = cv(
    df=traintest,
    n_folds=5,
    params=lgbm_params,
    stratified=True,
    save_train_prediction=True,
)
feature_importance.to_csv('feature_importance.csv')