In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import xgboost as xgb

def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)

def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return [('gini', gini_score)]

train_path = "C:\\Users\\Rui\\source\\Kaggle\\Safe_Driver_Prediction\\train.csv"
test_path = "C:\\Users\\Rui\\source\\Kaggle\\Safe_Driver_Prediction\\test.csv"

df_train = pd.read_csv(train_path, sep = ',')
df_test = pd.read_csv(test_path, sep = ',')

target_train = df_train['target'].values
id_test = df_test['id'].values

train = np.array(df_train.drop(['target','id'], axis = 1))
test = np.array(df_test.drop(['id'], axis = 1))

from imblearn.over_sampling import SMOTE
sme = SMOTE()
train, target_train = sme.fit_sample(train, target_train)


xgb_preds = []

K = 5
kf = KFold(n_splits = K, random_state = 3228, shuffle = True)


for train_index, test_index in kf.split(train):
    train_X, valid_X = train[train_index], train[test_index]
    train_y, valid_y = target_train[train_index], target_train[test_index]

    # params configuration also from the1owl's kernel
    # https://www.kaggle.com/the1owl/forza-baseline
    xgb_params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 'objective': 'binary:logistic', 'eval_metric': 'auc', 'seed': 99, 'silent': True}

    d_train = xgb.DMatrix(train_X, train_y)
    d_valid = xgb.DMatrix(valid_X, valid_y)
    d_test = xgb.DMatrix(test)
    
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    model = xgb.train(xgb_params, d_train, 5000,  watchlist, feval=gini_xgb, maximize=True, verbose_eval=50, early_stopping_rounds=100)
                        
    xgb_pred = model.predict(d_test)
    xgb_preds.append(list(xgb_pred))
    
    
preds=[]
for i in range(len(xgb_preds[0])):
    sum=0
    for j in range(K):
        sum+=xgb_preds[j][i]
    preds.append(sum / K)

output = pd.DataFrame({'id': id_test, 'target': preds})
output.to_csv("{}-foldCV_avg_sub.csv".format(K), index=False)   

[0]	train-gini:0.649698	valid-gini:0.64746
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[50]	train-gini:0.960366	valid-gini:0.960736
[100]	train-gini:0.966308	valid-gini:0.966849
[150]	train-gini:0.967346	valid-gini:0.968046
[200]	train-gini:0.967901	valid-gini:0.96854
[250]	train-gini:0.968163	valid-gini:0.968721
[300]	train-gini:0.968607	valid-gini:0.969121
[350]	train-gini:0.96907	valid-gini:0.969507
[400]	train-gini:0.969605	valid-gini:0.969976
[450]	train-gini:0.97003	valid-gini:0.970342
[500]	train-gini:0.970202	valid-gini:0.970503
[550]	train-gini:0.970366	valid-gini:0.970641
[600]	train-gini:0.970518	valid-gini:0.970775
[650]	train-gini:0.970654	valid-gini:0.970889
[700]	train-gini:0.970769	valid-gini:0.970981
[750]	train-gini:0.970951	valid-gini:0.971101
[800]	train-gini:0.971253	valid-gini:0.971306
[850]	train-gini:0.971611	valid-gini:0.971582
[900]	train-gini:0.971918	valid-g

[650]	train-gini:0.970897	valid-gini:0.970536
[700]	train-gini:0.971012	valid-gini:0.970614
[750]	train-gini:0.971159	valid-gini:0.970701
[800]	train-gini:0.971356	valid-gini:0.970823
[850]	train-gini:0.971666	valid-gini:0.971064
[900]	train-gini:0.971974	valid-gini:0.971277
[950]	train-gini:0.972254	valid-gini:0.971463
[1000]	train-gini:0.972513	valid-gini:0.971617
[1050]	train-gini:0.972747	valid-gini:0.971762
[1100]	train-gini:0.972973	valid-gini:0.971893
[1150]	train-gini:0.973174	valid-gini:0.971985
[1200]	train-gini:0.973366	valid-gini:0.972084
[1250]	train-gini:0.97354	valid-gini:0.972156
[1300]	train-gini:0.973713	valid-gini:0.97223
[1350]	train-gini:0.973868	valid-gini:0.972281
[1400]	train-gini:0.974019	valid-gini:0.972337
[1450]	train-gini:0.97417	valid-gini:0.972375
[1500]	train-gini:0.974309	valid-gini:0.972409
[1550]	train-gini:0.974449	valid-gini:0.972443
[1600]	train-gini:0.974595	valid-gini:0.972465
[1650]	train-gini:0.974745	valid-gini:0.972491
[1700]	train-gini:0.974