# Stacking Test-Sklearn, XGBoost, CatBoost, LightGBM
* 참고 Notebook : https://www.kaggle.com/eliotbarr/stacking-test-sklearn-xgboost-catboost-lightgbm

In [93]:
import pandas as pd
import numpy as np
import gc
import xgboost as xgb

from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from math import sqrt
from scipy.stats import skew

# Global Variable

In [94]:
NFOLDS = 3
SEED = 0
NROWS = None

# Data Load

In [95]:
train = pd.read_csv('home_credit_data/application_train.csv')
test = pd.read_csv('home_credit_data/application_test.csv')
prev = pd.read_csv('home_credit_data/previous_application.csv')

print(train.shape)
print(test.shape)
print(prev.shape)

(307511, 122)
(48744, 121)
(1670214, 37)


In [96]:
categorical_features = [col for col in train.columns if train[col].dtype == 'object']
categorical_features

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE',
 'EMERGENCYSTATE_MODE']

In [97]:
train['NAME_CONTRACT_TYPE'].head()

0         Cash loans
1         Cash loans
2    Revolving loans
3         Cash loans
4         Cash loans
Name: NAME_CONTRACT_TYPE, dtype: object

In [98]:
# 카테고리 컬럼을 숫자로 변환
for col in categorical_features:
    train[col], indexer = pd.factorize(train[col])
    test[col] = indexer.get_indexer(test[col])
    
gc.enable()

train['NAME_CONTRACT_TYPE'].head()

0    0
1    0
2    1
3    0
4    0
Name: NAME_CONTRACT_TYPE, dtype: int64

In [99]:
y_train = train['TARGET']
del train['TARGET']

print(train.shape)

(307511, 121)


In [100]:
prev_cat_features = [col for col in prev.columns if prev[col].dtype == 'object']
prev_cat_features

['NAME_CONTRACT_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'FLAG_LAST_APPL_PER_CONTRACT',
 'NAME_CASH_LOAN_PURPOSE',
 'NAME_CONTRACT_STATUS',
 'NAME_PAYMENT_TYPE',
 'CODE_REJECT_REASON',
 'NAME_TYPE_SUITE',
 'NAME_CLIENT_TYPE',
 'NAME_GOODS_CATEGORY',
 'NAME_PORTFOLIO',
 'NAME_PRODUCT_TYPE',
 'CHANNEL_TYPE',
 'NAME_SELLER_INDUSTRY',
 'NAME_YIELD_GROUP',
 'PRODUCT_COMBINATION']

In [101]:
for col in prev_cat_features:
    prev[col], _ = pd.factorize(prev[col])

prev['NAME_CONTRACT_TYPE'].head()

0    0
1    1
2    1
3    1
4    1
Name: NAME_CONTRACT_TYPE, dtype: int64

In [102]:
avg_prev = prev.groupby('SK_ID_CURR').mean()

# SK_ID_CURR로 그룹화한뒤 갯수 카운트
cnt_prev = prev[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()

avg_prev['nb_app'] = cnt_prev['SK_ID_PREV']

print(avg_prev.shape)
avg_prev.head()

(338857, 37)


Unnamed: 0_level_0,SK_ID_PREV,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,...,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL,nb_app
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,1369693.0,0.0,3951.0,24835.5,23787.0,2520.0,24835.5,4.0,13.0,0.0,...,8.0,2.0,0.0,365243.0,-1709.0,-1499.0,-1619.0,-1612.0,0.0,1
100002,1038818.0,0.0,9251.775,179055.0,179055.0,0.0,179055.0,0.0,9.0,0.0,...,24.0,3.0,8.0,365243.0,-565.0,125.0,-25.0,-17.0,0.0,1
100003,2281150.0,0.333333,56553.99,435436.5,484191.0,3442.5,435436.5,3.0,14.666667,0.0,...,10.0,1.0,6.666667,365243.0,-1274.333333,-1004.333333,-1054.333333,-1047.333333,0.666667,3
100004,1564014.0,0.0,5357.25,24282.0,20106.0,4860.0,24282.0,4.0,5.0,0.0,...,4.0,0.0,10.0,365243.0,-784.0,-694.0,-724.0,-714.0,0.0,1
100005,2176837.0,0.5,4813.2,22308.75,20076.75,4464.0,44617.5,2.5,10.5,0.0,...,12.0,3.0,2.5,365243.0,-706.0,-376.0,-466.0,-460.0,0.0,2


In [103]:
del avg_prev['SK_ID_PREV']
print(avg_prev.shape)

(338857, 36)


In [111]:
x_train = train.merge(on = 'SK_ID_CURR', right = avg_prev.reset_index(), how = 'left')
x_test = test.merge(on = 'SK_ID_CURR', right = avg_prev.reset_index(), how = 'left')

print(x_train.shape)
print(x_test.shape)

(307511, 157)
(48744, 157)


In [112]:
# na 0으로 채우기
x_train = x_train.fillna(0)
x_test = x_test.fillna(0)

n_train = x_train.shape[0]
n_test = x_test.shape[0]

In [113]:
excluded_features = ['SK_ID_CURR']
features = [col for col in x_train.columns if col not in excluded_features]

print(len(features))

156


In [114]:
x_train = x_train[features]
x_test = x_test[features]

print(x_train.shape)
print(x_test.shape)
x_train.head()

(307511, 156)
(48744, 156)


Unnamed: 0,NAME_CONTRACT_TYPE_x,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,AMT_ANNUITY_x,AMT_GOODS_PRICE_x,NAME_TYPE_SUITE_x,...,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL,nb_app
0,0,0,0,0,0,202500.0,406597.5,24700.5,351000.0,0,...,24.0,3.0,8.0,365243.0,-565.0,125.0,-25.0,-17.0,0.0,1.0
1,0,1,0,1,0,270000.0,1293502.5,35698.5,1129500.0,1,...,10.0,1.0,6.666667,365243.0,-1274.333333,-1004.333333,-1054.333333,-1047.333333,0.666667,3.0
2,1,0,1,0,0,67500.0,135000.0,6750.0,135000.0,0,...,4.0,0.0,10.0,365243.0,-784.0,-694.0,-724.0,-714.0,0.0,1.0
3,0,1,0,0,0,135000.0,312682.5,29686.5,297000.0,0,...,23.0,2.888889,5.888889,365243.0,91066.5,91584.0,182477.5,182481.75,0.0,9.0
4,0,0,0,0,0,121500.0,513000.0,21865.5,513000.0,0,...,20.666667,1.0,3.333333,365243.0,-1263.2,-837.2,72136.2,72143.8,0.6,6.0


In [115]:
kf = KFold(n_splits = NFOLDS, shuffle = True, random_state = SEED)

In [116]:
class SklearnWrapper(object):
    def __init__(self, clf, seed = 0, params = None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]

In [117]:
class CatboostWrapper(object):
    def __init__(self, clf, seed = 0, params = None):
        params['random_state'] = seed
        self.clf = clf(**params)
        
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
        
    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]

In [118]:
class LightGBMWrapper(object):
    def __init__(self, clf, seed = 0, params = None):
        params['feature_fraction_seed'] = seed
        params['bagging_seed'] = seed
        self.clf = clf(**params)
        
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
        
    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]

In [119]:
class XgbWrapper(object):
    def __init__(self, seed = 0, params = None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)
        
    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label = y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)
        
    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

In [120]:
def get_oof(clf):
    oof_train = np.zeros((n_train,))
    oof_test = np.zeros((n_test,))
    oof_test_skf = np.empty((NFOLDS, n_test))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train.loc[train_index]
        y_tr = y_train.loc[train_index]
        x_te = x_train.loc[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

# Params

In [121]:
et_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

In [122]:
rf_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.2,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

In [123]:
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'nrounds': 200
}

In [124]:
catboost_params = {
    'iterations': 200,
    'learning_rate': 0.5,
    'depth': 3,
    'l2_leaf_reg': 40,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.7,
    'scale_pos_weight': 5,
    'eval_metric': 'AUC',
    'od_type': 'Iter',
    'allow_writing_files': False
}

In [125]:
lightgbm_params = {
    'n_estimators':200,
    'learning_rate':0.1,
    'num_leaves':123,
    'colsample_bytree':0.8,
    'subsample':0.9,
    'max_depth':15,
    'reg_alpha':0.1,
    'reg_lambda':0.1,
    'min_split_gain':0.01,
    'min_child_weight':2    
}

In [126]:
xg = XgbWrapper(seed = SEED, params = xgb_params)
et = SKlearnWrapper(clf = ExtraTreesClassifier, seed = SEED, params = et_params)
rf = SklearnWrapper(clf = RandomForestClassifier, seed = SEED, params = rf_params)
cb = CatboostWrapper(clf = CatBoostClassifier, seed = SEED, params = catboost_params)
lg = LightGBMWrapper(clf = LGBMClassifier, seed = SEED, params = lightgbm_params)

In [128]:
xg_oof_train, xg_oof_test = get_oof(xg)
# et_oof_train, et_oof_test = get_oof(et)
# rf_oof_train, rf_oof_test = get_oof(rf)
# cb_oof_train, cb_oof_test = get_oof(cb)

print("XG-CV: {}".format(sqrt(mean_squared_error(y_train, xg_oof_train))))
# print("ET-CV: {}".format(sqrt(mean_squared_error(y_train, et_oof_train))))
# print("RF-CV: {}".format(sqrt(mean_squared_error(y_train, rf_oof_train))))
# print("CB-CV: {}".format(sqrt(mean_squared_error(y_train, cb_oof_train))))

XG-CV: 0.2597074114623466


In [None]:
x_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, cb_oof_train), axis = 1)
x_test = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, cb_oof_train), axis = 1)

print("{}, {}".format(x_train.shape, x_test.shape))

In [None]:
logistic_regression = LogisticRegression()
logistic_regression.fit(x_train, y_train)

test['TARGET'] = logistic_regression.predict_proba(x_test)[:,1]

In [None]:
test[['SK_ID_CURR', 'TARGET']].to_csv('first_submission.csv', index = False, float_format = '%.8f')