In [1]:
# Training configuration shared by the cells in this notebook.
MAX_ROUNDS = 400
OPTIMIZE_ROUNDS = False
LEARNING_RATE = 0.07
EARLY_STOPPING_ROUNDS = 50
# Backward-compatible alias: the constant was originally misspelled, so keep
# the old name working for any cell that still refers to it.
EARLY_SPOPPING_ROUNDS = EARLY_STOPPING_ROUNDS
# Note: EARLY_STOPPING_ROUNDS is deliberately set somewhat high for the case
#       where OPTIMIZE_ROUNDS is enabled.
#       These initial values were chosen by the author's intuition, so
#       reduce EARLY_STOPPING_ROUNDS if you want stopping to trigger earlier.

In [25]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from boruta import BorutaPy
from numba import jit

import time
import gc

In [3]:
# Compute the normalized Gini coefficient.

@jit
def eval_gini(y_true, y_prod):
    """Return the normalized Gini score of predictions against labels.

    The labels are reordered by ascending prediction and then scanned from
    the highest-scored sample down to the lowest. For 0/1 labels, ``delta``
    counts the negatives seen so far, so ``gini`` accumulates the number of
    (positive, negative) pairs in which the negative is ranked above the
    positive. The result is ``1 - 2 * misordered_pairs / all_pairs``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (the arithmetic assumes values of 0 and 1).
    y_prod : array-like
        Predicted scores, used only for ranking. The parameter name is kept
        as-is so existing callers are unaffected.

    Returns
    -------
    float
        The normalized Gini score. The denominator is
        ``ntrue * (n - ntrue)``, so the inputs must contain both classes.
    """
    y_true = np.asarray(y_true)
    # Sort the labels by predicted score (ascending).
    y_true = y_true[np.argsort(y_prod)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    # Walk from the highest-scored sample to the lowest.
    for i in range(n - 1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

In [4]:
def gini_xgb(preds, dtrain):
    """Evaluation callback that reports the negated Gini score.

    Reads the labels from ``dtrain.get_label()``, scores ``preds`` against
    them with ``eval_gini`` and returns a one-element list holding the pair
    ``('gini', -score)``.
    """
    return [('gini', -eval_gini(dtrain.get_label(), preds))]

def add_noise(series, noise_level):
    """Scale each value of ``series`` by ``1 + noise_level * z``.

    ``z`` is drawn per element from ``np.random.randn``; with
    ``noise_level=0`` every value is simply multiplied by 1.0.
    """
    return series * (1 + noise_level * np.random.randn(len(series)))


def _apply_target_means(series, lookup, prior, out_name):
    """Map every value of ``series`` to its smoothed target mean.

    ``lookup`` must hold the category values in a column named like
    ``series`` and the encoded value in a column named ``'average'``.
    Values missing from ``lookup`` are filled with ``prior``. The result is
    named ``out_name`` and carries the index of ``series``.
    """
    encoded = pd.merge(
        series.to_frame(series.name),
        lookup,
        on=series.name,
        how='left')['average'].rename(out_name).fillna(prior)
    # pd.merge does not keep the left frame's index, so restore it.
    encoded.index = series.index
    return encoded


def target_encode(trn_series=None,
                  val_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """Replace a categorical feature by a smoothed mean of the target.

    The per-category statistics are computed on ``trn_series`` / ``target``
    only and then applied to the train, validation and test series.

    Parameters
    ----------
    trn_series, val_series, tst_series : pd.Series
        The same categorical feature on the three splits; all three must
        share one name.
    target : pd.Series
        Target values, same length as ``trn_series``.
    min_samples_leaf : int
        Category count at which the category mean and the global mean are
        weighted equally (the sigmoid weight is 0.5 at this count).
    smoothing : float
        Divisor inside the sigmoid; larger values make the weight change
        more slowly with the category count.
    noise_level : float
        Passed to ``add_noise`` for each of the three encoded series.

    Returns
    -------
    tuple of pd.Series
        Encoded train, validation and test series, each named
        ``<feature name>_mean`` and indexed like its input.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    assert trn_series.name == val_series.name

    temp = pd.concat([trn_series, target], axis=1)
    # Per-category mean and count of the target.
    averages = temp.groupby(by=trn_series.name)[target.name].agg(['mean', 'count'])
    # Sigmoid weight in (0, 1) that grows with the category count.
    weight = 1 / (1 + np.exp(-(averages['count'] - min_samples_leaf) / smoothing))
    # Global target mean, also used as the fallback for unseen categories.
    prior = target.mean()
    # Blend: the larger the weight, the closer to the category's own mean.
    averages[target.name] = prior * (1 - weight) + averages['mean'] * weight
    averages.drop(['mean', 'count'], axis=1, inplace=True)

    # One lookup table (category value -> 'average') shared by all splits.
    lookup = averages.reset_index().rename(
        columns={'index': target.name, target.name: 'average'})
    out_name = trn_series.name + '_mean'

    ft_trn_series = _apply_target_means(trn_series, lookup, prior, out_name)
    ft_val_series = _apply_target_means(val_series, lookup, prior, out_name)
    ft_tst_series = _apply_target_means(tst_series, lookup, prior, out_name)

    return (add_noise(ft_trn_series, noise_level),
            add_noise(ft_val_series, noise_level),
            add_noise(ft_tst_series, noise_level))

In [100]:
# Read the train and test CSV files that sit one directory above the notebook.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../porto_train.csv', '../porto_test.csv')
)

아래 코드는 노트북 작성자가 수행한 Feature selection 방식과 비슷한 원리로 동작하는 라이브러리 BorutaPy의 사용 예시입니다.  
원 코드 작성자는 BorutaPy를 사용하지 않고 직접 코드를 작성했지만, 비슷한 원리로 구현했다고 밝혔습니다.

In [41]:
# Boruta feature selection on every column of train_df except the label.
target = train_df['target']
tr = train_df.drop(columns='target')

# LightGBM's random-forest mode only runs with bagging enabled (a bagging
# frequency > 0 and a row subsample < 1), so subsample_freq is set explicitly.
# subsample is 0.632 to match the estimator repr recorded below this cell.
# The former `n_leaves` argument was dropped: it is not a LightGBM parameter
# (the leaf count is controlled by `num_leaves`).
clf = LGBMClassifier(boosting_type='rf',
                     num_leaves=1024,
                     max_depth=6,
                     subsample=0.632,
                     subsample_freq=1,
                     colsample_bytree=.5)
fea_sal = BorutaPy(clf, n_estimators=400, verbose=2)
fea_sal.fit(tr.values, target.values)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	21
Tentative: 	5
Rejected: 	32
Iteration: 	9 / 100
Confirmed: 	21
Tentative: 	5
Rejected: 	32
Iteration: 	10 / 100
Confirmed: 	21
Tentative: 	5
Rejected: 	32
Iteration: 	11 / 100
Confirmed: 	21
Tentative: 	5
Rejected: 	32
Iteration: 	12 / 100
Confirmed: 	21
Tentative: 	3
Rejected: 	34
Iteration: 	13 / 100
Confirmed: 	21
Tentative: 	3
Rejected: 	34
Iteration: 	14 / 100
Confirmed: 	21
Tentative: 	3
Rejected: 	34
Iteration: 	15 / 100
Confirmed: 	21
Tentative: 	3
Rejected: 	34
Iteration: 	16 / 100
Confirmed: 	21
Tentative: 	3
Reject

BorutaPy(estimator=LGBMClassifier(bagging_freq=1, boosting_type='rf',
                                  colsample_bytree=0.5, max_depth=6,
                                  n_estimators=400, objective='binary',
                                  random_state=RandomState(MT19937) at 0x1B67B119040,
                                  subsample=0.632),
         n_estimators=400, random_state=RandomState(MT19937) at 0x1B67B119040,
         verbose=2)

In [73]:
# Print the names of the columns whose Boruta ranking equals 1.
print(tr.columns[np.flatnonzero(fea_sal.ranking_ == 1)])

Index(['ps_ind_01', 'ps_ind_03', 'ps_ind_04_cat', 'ps_ind_05_cat',
       'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_15', 'ps_ind_16_bin',
       'ps_ind_17_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03', 'ps_car_01_cat',
       'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_07_cat',
       'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15'],
      dtype='object')


In [101]:
# Hand-picked feature list used for training.
# The trailing numbers are kept from the original listing; presumably they are
# "<feature score> / shadow <shadow-feature score>" from the author's own
# selection run -- TODO confirm.
train_features = [
    "ps_car_13",      # 1571.65 / shadow  609.23
    "ps_reg_03",      # 1408.42 / shadow  511.15
    "ps_ind_05_cat",  # 1387.87 / shadow   84.72
    "ps_ind_03",      # 1219.47 / shadow  230.55
    "ps_ind_15",      #  922.18 / shadow  242.00
    "ps_reg_02",      #  920.65 / shadow  267.50
    "ps_car_14",      #  798.48 / shadow  549.58
    "ps_car_12",      #  731.93 / shadow  293.62
    "ps_car_01_cat",  #  698.07 / shadow  178.72
    "ps_car_07_cat",  #  694.53 / shadow   36.35
    "ps_ind_17_bin",  #  620.77 / shadow   23.15
    "ps_car_03_cat",  #  611.73 / shadow   50.67
    "ps_reg_01",      #  598.60 / shadow  178.57
    "ps_car_15",      #  593.35 / shadow  226.43
    "ps_ind_01",      #  547.32 / shadow  154.58
    "ps_ind_16_bin",  #  475.37 / shadow   34.17
    "ps_ind_07_bin",  #  435.28 / shadow   28.92
    "ps_car_06_cat",  #  398.02 / shadow  212.43
    "ps_car_04_cat",  #  376.87 / shadow   76.98
    "ps_ind_06_bin",  #  370.97 / shadow   36.13
    "ps_car_09_cat",  #  214.12 / shadow   81.38
    "ps_car_02_cat",  #  203.03 / shadow   26.67
    "ps_ind_02_cat",  #  189.47 / shadow   65.68
    "ps_car_11",      #  173.28 / shadow   76.45
    "ps_car_05_cat",  #  172.75 / shadow   62.92
    "ps_calc_09",     #  169.13 / shadow  129.72
    "ps_calc_05",     #  148.83 / shadow  120.68
    "ps_ind_08_bin",  #  140.73 / shadow   27.63
    "ps_car_08_cat",  #  120.87 / shadow   28.82
    "ps_ind_09_bin",  #  113.92 / shadow   27.05
    "ps_ind_04_cat",  #  107.27 / shadow   37.43
    "ps_ind_18_bin",  #   77.42 / shadow   25.97
    "ps_ind_12_bin",  #   39.67 / shadow   15.52
    "ps_ind_14",      #   37.37 / shadow   16.65
]

# Pairs of columns that are joined into one combined categorical feature.
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),
    ('ps_reg_01', 'ps_car_04_cat'),
]

In [102]:
# Process data
# Keep the ids and the label in their own variables; test_df is narrowed to
# the feature columns at the end of this cell.
y = train_df['target']
id_train = train_df['id'].values
id_test = test_df['id'].values

start = time.time()
for n_c, (f1, f2) in enumerate(combs):
    name1 = "_plus_".join((f1, f2))
    # Progress report followed by 75 carriage returns (presumably so the next
    # report overwrites this one on the same output line).
    print('current feature %60s %4d in %5.1f'
          % (name1, n_c + 1, (time.time() - start) / 60), end='')
    print('\r' * 75, end='')

    # Combined feature: the two source values as strings, joined by "_".
    train_df[name1] = train_df[f1].map(str) + "_" + train_df[f2].map(str)
    test_df[name1] = test_df[f1].map(str) + "_" + test_df[f2].map(str)

    # Label-encode with one encoder fitted on the train and test values
    # together, so both frames share the same integer codes.
    lbl = LabelEncoder()
    lbl.fit(train_df[name1].tolist() + test_df[name1].tolist())
    train_df[name1] = lbl.transform(train_df[name1].tolist())
    test_df[name1] = lbl.transform(test_df[name1].tolist())

    train_features.append(name1)

# Restrict both frames to the selected features plus the new combinations.
X = train_df[train_features]
test_df = test_df[train_features]

# Columns whose name contains "_cat".
f_cats = [col for col in X.columns if "_cat" in col]

current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.0

In [111]:
# Zero-initialised prediction holders: the test one starts as the scalar 0,
# the validation one is an all-zero object with the same shape as y.
v_test_pred = 0
v_valid_pred = 0 * y

In [113]:
# Cross-validation setup: K shuffled folds with a fixed splitter seed.
K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=1)
# Seed NumPy's global random generator as well.
np.random.seed(0)