### [Kaggle Clone Coding] Home Credit Default Risk
- [Home Credit Default Risk](https://www.kaggle.com/c/home-credit-default-risk)
- [Stacking Test-Sklearn, XGBoost, CatBoost, LightGBM](https://www.kaggle.com/eliotbarr/stacking-test-sklearn-xgboost-catboost-lightgbm)  
<br>
- Task : Supervised Binary Classification 
    - 0 : 대출 상환 O
    - 1 : 대출 상환 X  
  
- CSV File
    - application_{train|test}.csv 
    - bureau.csv : 모든 고객의 이전 credit
    - bureau_balance.csv : 모든 고객의 이전 credit의 월별 잔액
    - POS_CASH_balance.csv : 신청자가 home credit으로 가지고 있던 이전 pos(판매 시점)과 현금 대출의 월별 잔액
    - credit_card_balance.csv : 신청자가 home credit으로 가지고 있던 이전 신용카드의 월별 잔액
    - previous_application.csv : 고객의 주택신용대출에 대한 이전 모든 신청
    - installments_payments.csv : 이전 home credit에 대한 상환 내역
    - HomeCredit_columns_description.csv : 데이터 변수 설명

In [4]:
import pandas as pd
import numpy as np
from scipy.stats import skew
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from math import sqrt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import gc

NFOLDS = 3
SEED = 0
NROWS = None

data = pd.read_csv('./Home_Credit_Default_Risk/application_train.csv')
test = pd.read_csv('./Home_Credit_Default_Risk/application_test.csv')
prev = pd.read_csv('./Home_Credit_Default_Risk/previous_application.csv')

In [5]:
categorical_feats = [
    f for f in data.columns if data[f].dtype == 'object'
]

for f_ in categorical_feats:
    data[f_], indexer = pd.factorize(data[f_])
    test[f_] = indexer.get_indexer(test[f_])
    
gc.enable()
y_train = data['TARGET']
del data['TARGET']

In [6]:
prev_cat_features = [
    f_ for f_ in prev.columns if prev[f_].dtype == 'object'
]
for f_ in prev_cat_features:
    prev[f_], _ = pd.factorize(prev[f_])
    
avg_prev = prev.groupby('SK_ID_CURR').mean()
cnt_prev = prev[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
avg_prev['nb_app'] = cnt_prev['SK_ID_PREV']
del avg_prev['SK_ID_PREV']

In [7]:
x_train = data.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')
x_test = test.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')

x_train = x_train.fillna(0)
x_test= x_test.fillna(0)

ntrain = x_train.shape[0]
ntest = x_test.shape[0]

In [8]:
excluded_feats = ['SK_ID_CURR']
features = [f_ for f_ in x_train.columns if f_ not in excluded_feats]

x_train = x_train[features]
x_test = x_test[features]

kf = KFold(n_splits = NFOLDS, shuffle=True, random_state=SEED)

In [9]:
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]

class CatboostWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]
        
class LightGBMWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['feature_fraction_seed'] = seed
        params['bagging_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]


class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))


In [10]:
def get_oof(clf):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train.loc[train_index]
        y_tr = y_train.loc[train_index]
        x_te = x_train.loc[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [11]:
et_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

rf_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.2,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'nrounds': 200
}

catboost_params = {
    'iterations': 200,
    'learning_rate': 0.5,
    'depth': 3,
    'l2_leaf_reg': 40,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.7,
    'scale_pos_weight': 5,
    'eval_metric': 'AUC',
    'od_type': 'Iter',
    'allow_writing_files': False
}

lightgbm_params = {
    'n_estimators':200,
    'learning_rate':0.1,
    'num_leaves':123,
    'colsample_bytree':0.8,
    'subsample':0.9,
    'max_depth':15,
    'reg_alpha':0.1,
    'reg_lambda':0.1,
    'min_split_gain':0.01,
    'min_child_weight':2    
}

In [12]:
xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
cb = CatboostWrapper(clf= CatBoostClassifier, seed = SEED, params=catboost_params)
lg = LightGBMWrapper(clf = LGBMClassifier, seed = SEED, params = lightgbm_params)

xg_oof_train, xg_oof_test = get_oof(xg)
et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
cb_oof_train, cb_oof_test = get_oof(cb)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


0:	total: 215ms	remaining: 42.7s
1:	total: 261ms	remaining: 25.8s
2:	total: 298ms	remaining: 19.6s
3:	total: 338ms	remaining: 16.6s
4:	total: 377ms	remain

182:	total: 7.7s	remaining: 715ms
183:	total: 7.75s	remaining: 674ms
184:	total: 7.8s	remaining: 632ms
185:	total: 7.84s	remaining: 590ms
186:	total: 7.88s	remaining: 548ms
187:	total: 7.92s	remaining: 506ms
188:	total: 7.96s	remaining: 463ms
189:	total: 7.99s	remaining: 421ms
190:	total: 8.03s	remaining: 378ms
191:	total: 8.07s	remaining: 336ms
192:	total: 8.12s	remaining: 295ms
193:	total: 8.16s	remaining: 252ms
194:	total: 8.2s	remaining: 210ms
195:	total: 8.24s	remaining: 168ms
196:	total: 8.28s	remaining: 126ms
197:	total: 8.32s	remaining: 84ms
198:	total: 8.36s	remaining: 42ms
199:	total: 8.4s	remaining: 0us
0:	total: 41.9ms	remaining: 8.33s
1:	total: 83.3ms	remaining: 8.25s
2:	total: 125ms	remaining: 8.18s
3:	total: 169ms	remaining: 8.27s
4:	total: 212ms	remaining: 8.26s
5:	total: 259ms	remaining: 8.39s
6:	total: 303ms	remaining: 8.36s
7:	total: 340ms	remaining: 8.16s
8:	total: 380ms	remaining: 8.07s
9:	total: 420ms	remaining: 7.98s
10:	total: 458ms	remaining: 7.88s
11:	total: 5

25:	total: 1.09s	remaining: 7.33s
26:	total: 1.13s	remaining: 7.25s
27:	total: 1.17s	remaining: 7.19s
28:	total: 1.21s	remaining: 7.14s
29:	total: 1.25s	remaining: 7.1s
30:	total: 1.29s	remaining: 7.06s
31:	total: 1.33s	remaining: 6.99s
32:	total: 1.37s	remaining: 6.92s
33:	total: 1.41s	remaining: 6.87s
34:	total: 1.45s	remaining: 6.83s
35:	total: 1.49s	remaining: 6.8s
36:	total: 1.54s	remaining: 6.78s
37:	total: 1.58s	remaining: 6.72s
38:	total: 1.62s	remaining: 6.67s
39:	total: 1.65s	remaining: 6.62s
40:	total: 1.7s	remaining: 6.59s
41:	total: 1.73s	remaining: 6.53s
42:	total: 1.77s	remaining: 6.48s
43:	total: 1.81s	remaining: 6.43s
44:	total: 1.85s	remaining: 6.36s
45:	total: 1.89s	remaining: 6.32s
46:	total: 1.93s	remaining: 6.28s
47:	total: 1.98s	remaining: 6.25s
48:	total: 2.02s	remaining: 6.21s
49:	total: 2.06s	remaining: 6.18s
50:	total: 2.1s	remaining: 6.15s
51:	total: 2.15s	remaining: 6.12s
52:	total: 2.2s	remaining: 6.1s
53:	total: 2.25s	remaining: 6.07s
54:	total: 2.29s	rem

In [15]:
print("XG-CV: {}".format(sqrt(mean_squared_error(y_train, xg_oof_train))))
print("ET-CV: {}".format(sqrt(mean_squared_error(y_train, et_oof_train))))
print("RF-CV: {}".format(sqrt(mean_squared_error(y_train, rf_oof_train))))
print("RF-CV: {}".format(sqrt(mean_squared_error(y_train, cb_oof_train))))

XG-CV: 0.25969546149690725
ET-CV: 0.26296502347137407
RF-CV: 0.2629856213006646
RF-CV: 0.3309876006961808


In [13]:
x_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, cb_oof_train), axis=1)
x_test = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, cb_oof_test), axis=1)

print("{},{}".format(x_train.shape, x_test.shape))

(307511, 4),(48744, 4)


In [14]:
logistic_regression = LogisticRegression()
logistic_regression.fit(x_train,y_train)

test['TARGET'] = logistic_regression.predict_proba(x_test)[:,1]

test[['SK_ID_CURR', 'TARGET']].to_csv('first_submission.csv', index=False, float_format='%.8f')