In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import gc

In [2]:
print('loading files...')
# Treat -1 as a missing value (NaN) while parsing both CSV files.
train, test = (
    pd.read_csv(csv_path, na_values=-1)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

# Discard every column whose name starts with 'ps_calc_' from both frames.
col_to_drop = train.columns[train.columns.str.startswith('ps_calc_')]
train = train.drop(columns=col_to_drop)
test = test.drop(columns=col_to_drop)

# Downcast to save memory: float64 -> float32 and int64 -> int8.
# The first two int64 columns of `train` are skipped by position and keep
# their int64 dtype.
float_cols = train.select_dtypes(include=['float64']).columns
small_int_cols = train.select_dtypes(include=['int64']).columns[2:]
compact_dtypes = {name: np.float32 for name in float_cols}
compact_dtypes.update({name: np.int8 for name in small_int_cols})
train = train.astype(compact_dtypes)
test = test.astype(compact_dtypes)

print(train.shape, test.shape)

loading files...
(595212, 39) (892816, 38)


In [3]:
# custom evaluation metric: Gini coefficient (closely tracks 2*AUC - 1), used as feval rather than as a training objective

def gini(y, pred):
    """Return the (un-normalized) Gini coefficient of `pred` against `y`.

    Samples are ranked by descending prediction; ties are broken by original
    position so the result is deterministic. Divide by ``gini(y, y)`` to get
    the normalized Gini.

    Parameters
    ----------
    y : array-like of numbers
        True labels / target values, one per sample.
    pred : array-like of numbers
        Predicted scores, same length as `y`.

    Returns
    -------
    float
        The Gini value. If `y` sums to zero the division below produces
        nan/inf (with a NumPy warning) rather than raising.
    """
    # `np.float` was removed from NumPy (1.24+); use the explicit float64 type.
    y = np.asarray(y, dtype=np.float64).ravel()
    pred = np.asarray(pred, dtype=np.float64).ravel()
    n = len(y)
    # lexsort uses the LAST key as primary: descending pred, then index.
    order = np.lexsort((np.arange(n), -pred))
    y_ranked = y[order]
    gs = y_ranked.cumsum().sum() / y_ranked.sum()
    gs -= (n + 1) / 2.
    return gs / n

def gini_xgb(pred, y):
    """Normalized-Gini evaluation callback in the (name, value) form.

    Parameters
    ----------
    pred : array-like
        Predicted scores.
    y : object exposing ``get_label()``
        Container holding the true labels for the scored rows.

    Returns
    -------
    tuple
        ``('gini', score)`` where score is ``gini(labels, pred)`` divided by
        the Gini of a perfect ranking, ``gini(labels, labels)``.
    """
    labels = y.get_label()
    perfect = gini(labels, labels)
    achieved = gini(labels, pred)
    return 'gini', achieved / perfect

def gini_lgb(preds, dtrain):
    """Normalized-Gini evaluation callback in the (name, value, flag) form.

    Parameters
    ----------
    preds : array-like
        Predicted scores.
    dtrain : object exposing ``get_label()``
        Container holding the true labels for the scored rows.

    Returns
    -------
    tuple
        ``('gini', score, True)``; the trailing ``True`` is presumably the
        "higher is better" flag expected by LightGBM — TODO confirm.
    """
    labels = list(dtrain.get_label())
    return 'gini', gini(labels, preds) / gini(labels, labels), True

In [6]:
# xgb
# Booster parameters for the binary classifier. 'auc' is logged as the
# built-in metric; the custom gini metric passed via `feval` below is the one
# used for early stopping.
# NOTE(review): 'silent', `feval`, `ntree_limit` and `best_ntree_limit` have
# been deprecated/removed in later xgboost releases — confirm against the
# installed version before upgrading.
params = {
    'eta': 0.02,
    'max_depth': 4,
    'subsample': 0.9,
    'colsample_bytree': 0.9, 
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'silent': True
}

# Feature matrix / labels as plain NumPy arrays; `features` keeps the column
# order so the test frame can be aligned to it.
X = train.drop(['id', 'target'], axis=1)
features = X.columns
X = X.values
y = train['target'].values

# Submission frame; 'target' accumulates the fold-averaged test predictions,
# so start it as a float rather than an int.
sub = test['id'].to_frame()
sub['target'] = 0.0

cv_scores = []

nrounds = 2000  # upper bound on boosting rounds; early stopping may end sooner
kfold = 5  # number of stratified CV folds

# `random_state` is only valid together with shuffle=True (newer scikit-learn
# raises ValueError otherwise; older versions silently ignored it). The folds
# were never shuffled, so drop the seed to keep the same deterministic splits.
# Use StratifiedKFold(n_splits=kfold, shuffle=True, random_state=0) instead if
# shuffled folds are wanted.
skf = StratifiedKFold(n_splits=kfold)

# The test matrix is identical for every fold, so build it once.
d_test = xgb.DMatrix(test[features].values)

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    xgb_model = xgb.train(params, d_train, nrounds, watchlist, early_stopping_rounds=100,
                          feval=gini_xgb, maximize=True, verbose_eval=100)
    # Each fold contributes 1/kfold of the final test prediction.
    sub['target'] += xgb_model.predict(d_test,
                        ntree_limit=xgb_model.best_ntree_limit) / kfold
    # Score the held-out fold with the best iteration; reuse d_valid instead
    # of building a second DMatrix from X_valid.
    y_eval_pred = xgb_model.predict(d_valid, ntree_limit=xgb_model.best_ntree_limit)
    cv_scores.append(gini(y_valid, y_eval_pred) / gini(y_valid, y_valid))

 xgb kfold: 1  of  5 : 
[0]	train-auc:0.601324	valid-auc:0.602617	train-gini:0.202082	valid-gini:0.205887
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-auc:0.623693	valid-auc:0.620085	train-gini:0.247387	valid-gini:0.240162
[200]	train-auc:0.638531	valid-auc:0.629181	train-gini:0.277062	valid-gini:0.258362
[300]	train-auc:0.648991	valid-auc:0.635315	train-gini:0.297981	valid-gini:0.27063
[400]	train-auc:0.655998	valid-auc:0.638341	train-gini:0.311997	valid-gini:0.276683
[500]	train-auc:0.661262	valid-auc:0.640102	train-gini:0.322523	valid-gini:0.280205
[600]	train-auc:0.665372	valid-auc:0.640794	train-gini:0.330743	valid-gini:0.281587
[700]	train-auc:0.668688	valid-auc:0.641223	train-gini:0.337375	valid-gini:0.282446
Stopping. Best iteration:
[691]	train-auc:0.668416	valid-auc:0.64127	train-gini:0.336833	valid-gini:0.282541

 xgb kfold: 2  of  5 : 
[0]	train-auc:0.600769	vali

In [7]:
# Average normalized Gini over the validation folds.
np.asarray(cv_scores).mean()

0.28416580012853843

In [8]:
# Save the fold-averaged test predictions (id, target) with 5 decimal places.
sub.to_csv(
    '../output/xgb1_284.csv',
    float_format='%.5f',
    index=False,
)