In [7]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
import lightgbm as lgb
import gc

cwd=os.getcwd()+'/'

def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)

def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return [('gini', gini_score)]

def gini_lgb(pred, dtrain):
    y = list(dtrain.get_label())
    score = gini(y, pred) / gini(y, y)
    return [('gini', score, True)]

df_train = pd.read_csv("train.csv", sep = ',')
df_test = pd.read_csv("test.csv", sep = ',')

In [8]:
# Performing one hot encoding
train=df_train.drop(['target','id'],axis=1)
test=df_test.drop(['id'], axis = 1)
combine= pd.concat([train, test],axis=0)

cat_features = [a for a in combine.columns if a.endswith('cat')]
# those categorical features with no missing values
cat_no_na_list = ['ps_car_04_cat', 'ps_car_06_cat', 'ps_car_08_cat', 'ps_car_10_cat', 'ps_car_11_cat']

for column in cat_features:
  
    temp=pd.get_dummies(pd.Series(combine[column]))
    # assign the category with NA if the original data match with -1
    if not column in cat_no_na_list:
        temp[combine[column] == -1] = np.NaN
        # drop -1 columns
        temp = temp.drop([-1], axis=1)
    combine=pd.concat([combine,temp],axis=1)
    combine=combine.drop([column],axis=1)

X = np.array(combine[:df_train.shape[0]])
X_test = np.array(combine[df_train.shape[0]:])
y = np.array(df_train['target'])
sub1=df_test['id'].to_frame()
sub2=df_test['id'].to_frame()

sub1['target']=0
sub2['target']=0


In [9]:
# xgb
params = {'eta': 0.02, 'max_depth': 5, 'subsample': 0.9, 'colsample_bytree': 0.9, 
          'objective': 'binary:logistic', 'eval_metric': 'auc', 'silent': True}

nrounds=2000  
kfold = 5 
skf = StratifiedKFold(n_splits=kfold, random_state=0)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    d_train = xgb.DMatrix(X_train, y_train) 
    d_valid = xgb.DMatrix(X_valid, y_valid) 
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    xgb_model = xgb.train(params, d_train, nrounds, watchlist, early_stopping_rounds=100, 
                          feval=gini_xgb, maximize=True, verbose_eval=100)
    sub1['target'] += xgb_model.predict(xgb.DMatrix(X_test), 
                        ntree_limit=xgb_model.best_ntree_limit+50) / (2*kfold)
sub1.to_csv('xgboost.csv', index=False, float_format='%.5f') 

gc.collect()
sub1.head(2)

 xgb kfold: 1  of  5 : 
[0]	train-gini:0.2043	valid-gini:0.204166
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.265652	valid-gini:0.249774
[200]	train-gini:0.30007	valid-gini:0.264314
[300]	train-gini:0.327677	valid-gini:0.274252
[400]	train-gini:0.348144	valid-gini:0.278185
[500]	train-gini:0.366465	valid-gini:0.280382
[600]	train-gini:0.382654	valid-gini:0.281398
[700]	train-gini:0.396848	valid-gini:0.281497
[800]	train-gini:0.410211	valid-gini:0.281204
Stopping. Best iteration:
[719]	train-gini:0.399662	valid-gini:0.281635

 xgb kfold: 2  of  5 : 
[0]	train-gini:0.209695	valid-gini:0.197873
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.265477	valid-gini:0.242058
[200]	train-gini:0.297938	valid-gini:0.263261
[300]	train-gini:0.325898	valid-gini:0.274

Unnamed: 0,id,target
0,0,0.014415
1,1,0.012693


In [10]:
def gini_lgb(pred, dtrain):
    y = list(dtrain.get_label())
    score = gini(y, pred) / gini(y, y)
    return [('gini', score, True)]

In [11]:
# lgb
params = {'metric': 'auc', 'learning_rate' : 0.01, 'max_depth':10, 'max_bin':10,  'objective': 'binary', 
          'feature_fraction': 0.8,'bagging_fraction':0.9,'bagging_freq':10,  'min_data': 500}

skf = StratifiedKFold(n_splits=kfold, random_state=1)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' lgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_eval = X[train_index], X[test_index]
    y_train, y_eval = y[train_index], y[test_index]
    lgb_model = lgb.train(params, lgb.Dataset(X_train, label=y_train), nrounds, 
                  lgb.Dataset(X_eval, label=y_eval), verbose_eval=100, 
                  feval=gini_lgb, early_stopping_rounds=100)
    sub2['target'] += lgb_model.predict(X_test, 
                        num_iteration=lgb_model.best_iteration) / (2*kfold)
    
sub2.to_csv('lightgbm.csv', index=False, float_format='%.5f') 

gc.collect()


 lgb kfold: 1  of  5 : 
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.626614	valid_0's gini: 0.253227
[200]	valid_0's auc: 0.628598	valid_0's gini: 0.257195
[300]	valid_0's auc: 0.631438	valid_0's gini: 0.262875
[400]	valid_0's auc: 0.634752	valid_0's gini: 0.269505
[500]	valid_0's auc: 0.637188	valid_0's gini: 0.274376
[600]	valid_0's auc: 0.639021	valid_0's gini: 0.278042
[700]	valid_0's auc: 0.639979	valid_0's gini: 0.279959
[800]	valid_0's auc: 0.640275	valid_0's gini: 0.280549
[900]	valid_0's auc: 0.640873	valid_0's gini: 0.281745
[1000]	valid_0's auc: 0.641082	valid_0's gini: 0.282164
[1100]	valid_0's auc: 0.641258	valid_0's gini: 0.282516
[1200]	valid_0's auc: 0.641145	valid_0's gini: 0.282291
Early stopping, best iteration is:
[1148]	valid_0's auc: 0.641284	valid_0's gini: 0.282569
 lgb kfold: 2  of  5 : 
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.624629	valid_0's gini: 0.249258
[200]	valid_0's a

NameError: name 'sub' is not defined

In [12]:
sub2.head(2)

Unnamed: 0,id,target
0,0,0.014444
1,1,0.014051
