In [62]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import xgboost as xgb

# Current working directory as a string with a trailing '/' appended.
# NOTE(review): not referenced in the visible cells — confirm it is still needed.
cwd=os.getcwd()+'/'

def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Return the (un-normalized) Gini score of `pred` against `actual`.

    Rows are ordered by `pred` descending (ties keep their original order),
    the cumulative share of `actual` captured along that ordering is summed,
    and the result is centred by ``(n + 1) / 2`` and divided by ``n``.

    Parameters
    ----------
    actual : sequence of numbers
        Observed values (e.g. 0/1 labels).
    pred : sequence of numbers
        Scores used to rank the observations; same length as `actual`.
    cmpcol, sortcol : int
        Unused; kept only so existing callers keep working.

    Returns
    -------
    float
        The Gini score. The result is NaN when ``actual`` sums to zero,
        because the cumulative share is then 0 / 0.

    Raises
    ------
    AssertionError
        If `actual` and `pred` differ in length.
    """
    n = len(actual)
    assert n == len(pred)

    # Columns: 0 = actual, 1 = pred, 2 = original position (tie-breaker).
    # Builtin `float` replaces the `np.float` alias removed in NumPy 1.24.
    table = np.asarray(np.c_[actual, pred, np.arange(n)], dtype=float)

    # lexsort uses the LAST key as primary: prediction descending, then position.
    order = np.lexsort((table[:, 2], -1 * table[:, 1]))
    ranked_actual = table[order, 0]

    total_losses = ranked_actual.sum()
    gini_sum = ranked_actual.cumsum().sum() / total_losses

    gini_sum -= (n + 1) / 2.
    return gini_sum / n

def gini_normalized(a, p):
    """Gini score of predictions `p` against actuals `a`, scaled by the
    score obtained when the actuals are ranked by themselves."""
    model_score = gini(a, p)
    perfect_score = gini(a, a)
    return model_score / perfect_score

def gini_xgb(preds, dtrain):
    """Custom evaluation callback passed to ``xgb.train`` as ``feval``.

    Reads the labels from `dtrain` via ``get_label()`` and returns the
    normalized Gini of `preds` as a one-element list of ``(name, value)``.
    """
    score = gini_normalized(dtrain.get_label(), preds)
    return [('gini', score)]

# Read both competition files (comma-separated) from the working directory.
df_train, df_test = (
    pd.read_csv(csv_name, sep = ',') for csv_name in ("train.csv", "test.csv")
)

# Keep the label vector and the test ids before those columns are removed.
target_train = df_train['target'].values
id_test = df_test['id'].values

# Feature-only frames; train rows come first in the stacked frame.
df_train = df_train.drop(['target', 'id'], axis=1)
df_test = df_test.drop(['id'], axis=1)
combine = pd.concat([df_train, df_test], axis=0)

In [63]:
# Performing one hot encoding
# (isinstance guard: after encoding, dummy columns carry integer labels, so a
#  re-run of this cell must not call .endswith on them)
cat_features = [a for a in combine.columns if isinstance(a, str) and a.endswith('cat')]
# those categorical features with no missing values
cat_no_na_list = ['ps_car_04_cat', 'ps_car_06_cat', 'ps_car_08_cat', 'ps_car_10_cat', 'ps_car_11_cat']

# Number of training rows, captured before df_train is rebound below.
n_train = df_train.shape[0]

dummy_frames = []
for column in cat_features:
    has_missing = column not in cat_no_na_list

    # Explicit dtypes: pandas >= 2.0 returns bool dummies by default, which
    # cannot hold NaN and turn the final matrix into object dtype when mixed
    # with float columns. float is needed where NaN is written; uint8 keeps
    # the remaining indicator columns compact.
    temp = pd.get_dummies(combine[column], dtype=float if has_missing else np.uint8)

    if has_missing:
        # assign the category with NA if the original data match with -1
        # (positional mask: the stacked frame has repeated index labels)
        missing_rows = (combine[column] == -1).values
        temp.loc[missing_rows] = np.nan  # np.NaN alias was removed in NumPy 2.0
        # drop -1 columns (tolerate a feature that happens to contain no -1)
        temp = temp.drop([-1], axis=1, errors='ignore')

    dummy_frames.append(temp)

# One concat instead of one per feature: same column order as appending the
# dummies feature by feature, without re-copying the whole frame each time.
combine = pd.concat([combine.drop(cat_features, axis=1)] + dummy_frames, axis=1)

df_train = combine[:n_train]
df_test = combine[n_train:]

train = np.array(df_train)
test = np.array(df_test)

print ("The train shape is:",train.shape)
print ('The test shape is:',test.shape)

The train shape is: (595212, 218)
The test shape is: (892816, 218)


In [68]:
xgb_preds = []


K = 5
kf = KFold(n_splits = K, random_state = 3228,shuffle=True)

# params configuration also from the1owl's kernel
# https://www.kaggle.com/the1owl/forza-baseline
# (identical for every fold, so defined once outside the loop)
xgb_params = {'eta': 0.02, 'max_depth': 5, 'subsample': 0.9, 'colsample_bytree': 0.9,
              'objective': 'binary:logistic', 'eval_metric': 'auc', 'seed': 99, 'silent': True}

# The test matrix does not depend on the fold; build it a single time.
d_test = xgb.DMatrix(test)

for train_index, valid_index in kf.split(train):
    train_X, valid_X = train[train_index], train[valid_index]
    train_y, valid_y = target_train[train_index], target_train[valid_index]

    d_train = xgb.DMatrix(train_X, train_y)
    d_valid = xgb.DMatrix(valid_X, valid_y)

    # The last watchlist entry ('valid') drives early stopping.
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    model = xgb.train(xgb_params, d_train, 5000,  watchlist, feval=gini_xgb, maximize=True, verbose_eval=50, early_stopping_rounds=100)

    # xgb.train returns the booster from the LAST round, i.e. up to 100 rounds
    # past the best validation score. Predict with the best round only.
    best_rounds = model.best_iteration + 1
    try:
        xgb_pred = model.predict(d_test, iteration_range=(0, best_rounds))
    except TypeError:
        # xgboost < 1.4 has no `iteration_range`; use the legacy keyword.
        xgb_pred = model.predict(d_test, ntree_limit=best_rounds)

    xgb_preds.append(xgb_pred)


# Average the K fold predictions per test row. Accumulate in float64 so the
# float32 booster outputs do not lose precision.
preds = np.mean(xgb_preds, axis=0, dtype=np.float64)

output = pd.DataFrame({'id': id_test, 'target': preds})
output.to_csv("{}-foldCV_avg_sub.csv".format(K), index=False)

[0]	train-gini:0.215373	valid-gini:0.206195
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[50]	train-gini:0.256997	valid-gini:0.238886
[100]	train-gini:0.264434	valid-gini:0.241977
[150]	train-gini:0.282669	valid-gini:0.251248
[200]	train-gini:0.300166	valid-gini:0.258201
[250]	train-gini:0.314741	valid-gini:0.263626
[300]	train-gini:0.327965	valid-gini:0.267989
[350]	train-gini:0.338129	valid-gini:0.270249
[400]	train-gini:0.348994	valid-gini:0.271313
[450]	train-gini:0.358933	valid-gini:0.271878
[500]	train-gini:0.36788	valid-gini:0.272034
[550]	train-gini:0.375769	valid-gini:0.272146
[600]	train-gini:0.38331	valid-gini:0.272327
[650]	train-gini:0.389864	valid-gini:0.27255
[700]	train-gini:0.396518	valid-gini:0.272859
[750]	train-gini:0.402896	valid-gini:0.272591
Stopping. Best iteration:
[683]	train-gini:0.3945	valid-gini:0.272912

[0]	train-gini:0.209423	valid-gini:0.211063
Multiple 