In [30]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import xgboost as xgb

# Current working directory with a trailing slash, usable as a path prefix.
# (Not referenced by the cells visible below, which use './' directly.)
cwd = f"{os.getcwd()}/"

def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Compute the (un-normalized) Gini coefficient of `pred` against `actual`.

    Rows are ranked by descending prediction; ties are broken by original
    position so the result is deterministic. The score is the mean cumulative
    share of `actual` captured along that ranking, minus the share a random
    ordering would capture.

    Parameters
    ----------
    actual : sequence of numbers
        Observed target values.
    pred : sequence of numbers
        Predicted scores, same length as `actual`.
    cmpcol, sortcol : int
        Unused; kept only so existing callers that pass them keep working.

    Returns
    -------
    float
        The Gini coefficient. The result is NaN when `actual` sums to zero,
        because the cumulative share is then undefined.

    Raises
    ------
    AssertionError
        If `actual` and `pred` differ in length.
    """
    assert( len(actual) == len(pred) )
    n_rows = len(actual)

    # Columns: 0 = actual, 1 = prediction, 2 = original row position.
    # The builtin `float` replaces `np.float`, which was removed from NumPy.
    table = np.asarray(np.c_[ actual, pred, np.arange(n_rows) ], dtype=float)

    # lexsort uses the LAST key as the primary one: sort by descending
    # prediction, then by ascending original position for ties.
    ranking = np.lexsort((table[:, 2], -1 * table[:, 1]))
    ranked_actual = table[ranking, 0]

    total_losses = ranked_actual.sum()
    gini_sum = ranked_actual.cumsum().sum() / total_losses

    # Subtract the value expected under a random ordering.
    gini_sum -= (n_rows + 1) / 2.
    return gini_sum / n_rows

def gini_normalized(a, p):
    """Return the Gini of predictions `p` scaled by the best achievable Gini.

    The denominator is the Gini obtained when the labels `a` are used as
    their own predictions, so a perfect ranking scores 1.0.
    """
    model_gini = gini(a, p)
    perfect_gini = gini(a, a)
    return model_gini / perfect_gini

def gini_xgb(preds, dtrain):
    """Custom evaluation callback: normalized Gini of `preds` on `dtrain`.

    Reads the labels from `dtrain` via `get_label()` and returns a
    one-element list holding the ('gini', score) pair.
    """
    return [('gini', gini_normalized(dtrain.get_label(), preds))]

In [31]:
# Load the raw training and test tables from the working directory.
df_train = pd.read_csv('./training.csv')
df_test = pd.read_csv('./test.csv')

# Keep the label vector and the test identifiers before they are dropped
# from the feature frames.
target_train = df_train['IsBadBuy'].values
id_test = df_test['RefId'].values

df_train = df_train.drop(['IsBadBuy', 'RefId'], axis=1)
df_test = df_test.drop(['RefId'], axis=1)

# Stack train on top of test so both receive identical feature engineering.
# ignore_index=True gives the stacked frame a fresh 0..n-1 index; without it
# the row labels shared by both source frames would appear twice, which makes
# any label-aligned operation on `combine` ambiguous.
combine = pd.concat([df_train, df_test], axis=0, ignore_index=True)

In [32]:
# Columns removed from the feature set once the derived features exist.
drop_list = ['PRIMEUNIT', 'AUCGUART', 'Model', 'Trim',
             'SubModel', 'WheelType', 'BYRNO', 'VNZIP1']

# Encode the purchase date as a coarse integer: nanoseconds since the epoch
# floor-divided by 10**15, i.e. buckets of roughly 11.6 days.
combine['PurchDate'] = (
    pd.to_datetime(combine['PurchDate'], format=r'%m/%d/%Y').astype(np.int64) // 10**15
)

# add engine feature
# Capture a decimal number that is preceded by whitespace and followed by 'L'
# (e.g. "2.4L") from SubModel. The pattern is a raw string so `\s`/`\d` are
# not treated as (invalid) string escapes, and the decimal point is escaped
# so it cannot match an arbitrary character. expand=False yields a Series,
# which is what a single-column assignment expects; rows without a match
# become NaN.
combine['Engine'] = (
    combine['SubModel'].str.extract(r'\s+(\d+\.\d+)L', expand=False).astype('float64')
)

# now we can start the drop list
# (SubModel has to be dropped after the Engine feature has been read from it)
combine = combine.drop(drop_list, axis=1)

# Partition the remaining columns by dtype for the encoding step.
num_features = combine.select_dtypes(include = ['float64', 'int64', 'bool']).columns.values
cat_features = combine.select_dtypes(include = ['object']).columns.values

In [33]:
# Display the names of the object-dtype columns that will be one-hot encoded.
cat_features

array(['Auction', 'Make', 'Color', 'Transmission', 'Nationality', 'Size',
       'TopThreeAmericanName', 'VNST'], dtype=object)

In [34]:
# Categorical columns that contain at least one missing value, in the same
# order as they appear in `cat_features`.
cat_na_features = [
    name for name in cat_features
    if combine[name].isnull().sum() > 0
]
cat_na_features

['Color', 'Transmission', 'Nationality', 'Size', 'TopThreeAmericanName']

In [35]:
# One-hot encode every categorical column in a single call.
# - `columns=` makes pandas prefix each indicator with its source column
#   name, so identical category values coming from different columns no
#   longer produce duplicate column labels.
# - The encoded columns are dropped and the indicators appended after the
#   remaining columns, in `cat_features` order, so the column layout of the
#   feature matrix is the same as with a per-column concat/drop loop.
# - dtype=np.uint8 keeps the indicators numeric; with boolean indicators,
#   converting the mixed frame to a NumPy array would yield object dtype.
# Missing values get all-zero indicators (no separate NaN column).
combine = pd.get_dummies(combine, columns=list(cat_features), dtype=np.uint8)

In [36]:
# The first `n_train_rows` rows of `combine` came from the training table;
# everything after them came from the test table.
n_train_rows = df_train.shape[0]
df_train = combine.iloc[:n_train_rows]
df_test = combine.iloc[n_train_rows:]

# Plain NumPy matrices for the model.
train = np.array(df_train)
test = np.array(df_test)

print("The train shape is:", train.shape)
print('The test shape is:', test.shape)

The train shape is: (72983, 131)
The test shape is: (48707, 131)


In [37]:
K = 5
kf = KFold(n_splits = K, random_state = 3228,shuffle=True)

# params configuration also from the1owl's kernel
# https://www.kaggle.com/the1owl/forza-baseline
# Defined once: the settings are the same for every fold.
xgb_params = {'eta': 0.02, 'max_depth': 5, 'subsample': 0.9, 'colsample_bytree': 0.9,
              'objective': 'binary:logistic', 'eval_metric': 'auc', 'seed': 99, 'silent': True}

# The test matrix does not depend on the fold, so build it a single time.
d_test = xgb.DMatrix(test)

# One array of test-set predictions per fold.
xgb_preds = []

for train_index, test_index in kf.split(train):
    train_X, valid_X = train[train_index], train[test_index]
    train_y, valid_y = target_train[train_index], target_train[test_index]

    d_train = xgb.DMatrix(train_X, train_y)
    d_valid = xgb.DMatrix(valid_X, valid_y)

    # The validation set is listed last; the training log reports that
    # 'valid-gini' is the metric used for early stopping. maximize=True
    # because a higher gini is better.
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    model = xgb.train(xgb_params, d_train, 5000,  watchlist, feval=gini_xgb, maximize=True, verbose_eval=50, early_stopping_rounds=100)

    # NOTE(review): whether predict() scores with the best iteration or with
    # every trained round depends on the installed xgboost version - confirm.
    xgb_preds.append(model.predict(d_test))

# Average the K fold models' predictions per test row. Vectorized, and does
# not rebind the builtin `sum` at notebook scope.
preds = np.mean(np.asarray(xgb_preds, dtype=np.float64), axis=0)

output = pd.DataFrame({'RefId': id_test, 'IsBadBuy': preds})
output.to_csv("{}-foldCV_avg_sub.csv".format(K), index=False)

[0]	train-gini:0.318951	valid-gini:0.311681
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[50]	train-gini:0.517047	valid-gini:0.514266
[100]	train-gini:0.527737	valid-gini:0.519109
[150]	train-gini:0.541602	valid-gini:0.525435
[200]	train-gini:0.563605	valid-gini:0.532111
[250]	train-gini:0.581184	valid-gini:0.537899
[300]	train-gini:0.595471	valid-gini:0.541909
[350]	train-gini:0.609043	valid-gini:0.545437
[400]	train-gini:0.620636	valid-gini:0.548431
[450]	train-gini:0.631493	valid-gini:0.550676
[500]	train-gini:0.642446	valid-gini:0.552954
[550]	train-gini:0.65254	valid-gini:0.553848
[600]	train-gini:0.662797	valid-gini:0.555657
[650]	train-gini:0.671854	valid-gini:0.556737
[700]	train-gini:0.68127	valid-gini:0.558364
[750]	train-gini:0.688926	valid-gini:0.559751
[800]	train-gini:0.696721	valid-gini:0.560519
[850]	train-gini:0.70452	valid-gini:0.561637
[900]	train-gini:0.713031	valid-