In [89]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import gc

print('loading files...')
train = pd.read_csv('../input/train.csv', na_values=-1, nrows=10000)
test = pd.read_csv('../input/test.csv', na_values=-1, nrows=10000)
col_to_drop = train.columns[train.columns.str.startswith('ps_calc_')]
train = train.drop(col_to_drop, axis=1)  
test = test.drop(col_to_drop, axis=1)  

for c in train.select_dtypes(include=['float64']).columns:
    train[c]=train[c].astype(np.float32)
    test[c]=test[c].astype(np.float32)
for c in train.select_dtypes(include=['int64']).columns[2:]:
    train[c]=train[c].astype(np.int8)
    test[c]=test[c].astype(np.int8)    

print(train.shape, test.shape)

loading files...
(10000, 39) (10000, 38)


In [90]:
# custom objective function (similar to auc)

def gini(y, pred):
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def gini_xgb(pred, y):
    y = y.get_label()
    return 'gini', gini(y, pred) / gini(y, y)

def gini_lgb(preds, dtrain):
    y = list(dtrain.get_label())
    score = gini(y, preds) / gini(y, y)
    return 'gini', score, True

### XGB 자체 wrapper 사용

스태킹 하기 위해서는, 학습할 것과 submission이 필요  
imbalanced data이기 때문에 stratifiedkfold해줘야함  


In [91]:
X = train.drop(['id','target'], axis=1)

In [92]:
features = X.columns

In [93]:
X = X.values

In [94]:
y = train['target'].values

In [95]:
sub = test['id'].to_frame()

In [96]:
sub['target']=0

In [97]:
nrounds = 200
kfold = 2
skf = StratifiedKFold(n_splits = kfold, random_state=0)

In [98]:
for i, (train_index, valid_index) in enumerate(skf.split(X,y)):
    break

In [112]:
X_train, X_valid = X[train_index], X[test_index]
y_train, y_valid = y[train_index], y[test_index]
d_train = xgb.DMatrix(X_train, y_train) 
d_valid = xgb.DMatrix(X_valid, y_valid) 
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

In [102]:
import xgboost as xgb

In [110]:
params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 
          'objective': 'binary:logistic', 'eval_metric': 'auc', 'silent': True}


In [113]:
# nrounds 만큼의 weak learner를 계속 붙여나가게 됨(부스팅)
# 100개의 round에서도 성능이 좋아지지 않으면 early stop
xgb_model = xgb.train(params, d_train, nrounds, watchlist, early_stopping_rounds=100,feval=gini_xgb, maximize=True, verbose_eval=100)

[0]	train-auc:0.598547	valid-auc:0.598547	train-gini:0.147114	valid-gini:0.147114
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-auc:0.804865	valid-auc:0.804865	train-gini:0.609731	valid-gini:0.609731
[199]	train-auc:0.883369	valid-auc:0.883369	train-gini:0.766738	valid-gini:0.766738


In [114]:
sub['target'] += xgb_model.predict(xgb.DMatrix(test[features].values),
                                  ntree_limit = xgb_model.best_ntree_limit+50)/(2*kfold)

* xgboost를 2개의 fold로 나눠서 model 2개로 반을 채운 것

- lgbm를 2개의 fold로 나눠서 model 2개로 반을 채울 것

- total 1.0의 submission이 나오고 그것을 제출 ==> 스태킹