In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import os
import gc
from tqdm import tqdm
import random

import warnings
warnings.filterwarnings('ignore')

SEED = 42
np.random.seed(SEED)
os.environ['PYTHONHASHSEED']=str(SEED)
random.seed(SEED)

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score

import xgboost as xgb
import lightgbm as lgb

from pystacknet.pystacknet import StackNetClassifier

from bayes_opt import BayesianOptimization

from sklearn.model_selection import cross_val_score

In [3]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
data, target = data['data'], data['target']

tr_X, val_X, tr_y, val_y = train_test_split(data, target, test_size=0.2, random_state=SEED)

In [4]:
# Check List
## check objective ~ reg/clf
## check loss

In [5]:
bounds = {
    'num_leaves': (100, 800), 
    'min_data_in_leaf': (0, 150),
    'bagging_fraction' : (0.3, 0.9),
    'feature_fraction' : (0.3, 0.9),
#     'learning_rate': (0.01, 1),
    'min_child_weight': (0.01, 3),   
    'reg_alpha': (0.1, 3), 
    'reg_lambda': (0.1, 3),
    'max_depth':(6, 23),
    'n_estimators': (64, 512)
}

def build_xgb(x, y, init_points=15, n_iter=0, param=True, verbose=2):
    train_X, test_X, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=12, shuffle=True)
    def XGB_bayesian(
        #learning_rate,
        num_leaves, 
        bagging_fraction,
        feature_fraction,
        min_child_weight, 
        min_data_in_leaf,
        max_depth,
        reg_alpha,
        reg_lambda,
        n_estimators
         ):
        # LightGBM expects next three parameters need to be integer. 
        num_leaves = int(num_leaves)
        min_data_in_leaf = int(min_data_in_leaf)
        max_depth = int(max_depth)

        assert type(num_leaves) == int
        assert type(min_data_in_leaf) == int
        assert type(max_depth) == int


        params = {
                  'num_leaves': num_leaves, 
                  'min_data_in_leaf': min_data_in_leaf,
                  'min_child_weight': min_child_weight,
                  'bagging_fraction' : bagging_fraction,
                  'feature_fraction' : feature_fraction,
                  'learning_rate' : 0.05,
                  'max_depth': max_depth,
                  'reg_alpha': reg_alpha,
                  'reg_lambda': reg_lambda,
                  'objective': 'binary:logistic',
                  'save_binary': True,
                  'seed': SEED,
                  'feature_fraction_seed': SEED,
                  'bagging_seed': SEED,
                  'drop_seed': SEED,
                  'data_random_seed': SEED,
                  'verbose': 1,
                  'is_unbalance': True,
                  'boost_from_average': True,
                  'metric':'auc',
                  'n_estimators': int(n_estimators),
                  'tree_method ': 'gpu_hist' # check gpu availability
        }    

        ## set clf options
        clf = xgb.XGBClassifier(**params).fit(train_X, train_y)
        score = roc_auc_score(test_y, clf.predict_proba(test_X)[:,1])
        return score
    
    optimizer = BayesianOptimization(XGB_bayesian, bounds, random_state=42, verbose=verbose)
    init_points = init_points
    n_iter = n_iter

    optimizer.maximize(init_points=init_points, n_iter=n_iter)
    
    param_xgb = {
        'min_data_in_leaf': int(optimizer.max['params']['min_data_in_leaf']), 
        'num_leaves': int(optimizer.max['params']['num_leaves']), 
        'learning_rate': 0.05,
        'min_child_weight': optimizer.max['params']['min_child_weight'],
        'bagging_fraction': optimizer.max['params']['bagging_fraction'], 
        'feature_fraction': optimizer.max['params']['feature_fraction'],
        'reg_lambda': optimizer.max['params']['reg_lambda'],
        'reg_alpha': optimizer.max['params']['reg_alpha'],
        'max_depth': int(optimizer.max['params']['max_depth']), 
        'objective': 'binary:logistic',
        'save_binary': True,
        'seed': SEED,
        'feature_fraction_seed': SEED,
        'bagging_seed': SEED,
        'drop_seed': SEED,
        'data_random_seed': SEED,
        'verbose': 1,
        'is_unbalance': True,
        'boost_from_average': True,
        'metric':'auc',
        'n_estimators': int(optimizer.max['params']['n_estimators']),
         'tree_method ': 'gpu_hist' # check gpu availability
    }

    params = param_xgb.copy()
    
    xgb_clf = xgb.XGBClassifier(**params)
    xgb_clf.fit(x, y)
    
    if param:
        return xgb_clf, params, optimizer.max['target']
    else:
        return xgb_clf, optimizer.max['target']

def build_lgb(x, y, init_points=15, n_iter=0, param=True, verbose=2):
    train_X, test_X, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=12, shuffle=True)
    def LGB_bayesian(
        #learning_rate,
        num_leaves, 
        bagging_fraction,
        feature_fraction,
        min_child_weight, 
        min_data_in_leaf,
        max_depth,
        reg_alpha,
        reg_lambda,
        n_estimators
         ):
        # LightGBM expects next three parameters need to be integer. 
        num_leaves = int(num_leaves)
        min_data_in_leaf = int(min_data_in_leaf)
        max_depth = int(max_depth)

        assert type(num_leaves) == int
        assert type(min_data_in_leaf) == int
        assert type(max_depth) == int


        params = {
                  'num_leaves': num_leaves, 
                  'min_data_in_leaf': min_data_in_leaf,
                  'min_child_weight': min_child_weight,
                  'bagging_fraction' : bagging_fraction,
                  'feature_fraction' : feature_fraction,
                  'learning_rate' : 0.05,
                  'max_depth': max_depth,
                  'reg_alpha': reg_alpha,
                  'reg_lambda': reg_lambda,
                  'objective': 'cross_entropy',
                  'save_binary': True,
                  'seed': SEED,
                  'feature_fraction_seed': SEED,
                  'bagging_seed': SEED,
                  'drop_seed': SEED,
                  'data_random_seed': SEED,
                  'boosting': 'gbdt', ## some get better result using 'dart'
                  'verbose': 1,
                  'is_unbalance': True,
                  'boost_from_average': True,
                  'metric':'auc',
                  'n_estimators': int(n_estimators),
                  'tree_learner ': 'voting'
        }    

        ## set clf options
        clf = lgb.LGBMClassifier(**params).fit(train_X, train_y)
        score = roc_auc_score(test_y, clf.predict_proba(test_X)[:,1])
        return score
    
    optimizer = BayesianOptimization(LGB_bayesian, bounds, random_state=42, verbose=verbose)
    init_points = init_points
    n_iter = n_iter

    optimizer.maximize(init_points=init_points, n_iter=n_iter)
    
    param_lgb = {
        'min_data_in_leaf': int(optimizer.max['params']['min_data_in_leaf']), 
        'num_leaves': int(optimizer.max['params']['num_leaves']), 
        'learning_rate': 0.05,
        'min_child_weight': optimizer.max['params']['min_child_weight'],
        'bagging_fraction': optimizer.max['params']['bagging_fraction'], 
        'feature_fraction': optimizer.max['params']['feature_fraction'],
        'reg_lambda': optimizer.max['params']['reg_lambda'],
        'reg_alpha': optimizer.max['params']['reg_alpha'],
        'max_depth': int(optimizer.max['params']['max_depth']), 
        'objective': 'cross_entropy',
        'save_binary': True,
        'seed': SEED,
        'feature_fraction_seed': SEED,
        'bagging_seed': SEED,
        'drop_seed': SEED,
        'data_random_seed': SEED,
        'boosting_type': 'gbdt',  # also consider 'dart'
        'verbose': 1,
        'is_unbalance': True,
        'boost_from_average': True,
        'metric':'auc',
        'n_estimators': int(optimizer.max['params']['n_estimators']),
        'tree_learner ': 'voting'
    }

    params = param_lgb.copy()
    
    lgb_clf = lgb.LGBMClassifier(**params)
    lgb_clf.fit(x, y)
    
    if param:
        return lgb_clf, params, optimizer.max['target']
    else:
        return lgb_clf, optimizer.max['target']

In [6]:
lgb_clf, score = build_xgb(tr_X, tr_y, param=False)
xgb_clf, score = build_lgb(tr_X, tr_y, param=False)

|   iter    |  target   | baggin... | featur... | max_depth | min_ch... | min_da... | n_esti... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.9963  [0m | [0m 0.5247  [0m | [0m 0.8704  [0m | [0m 18.44   [0m | [0m 1.8     [0m | [0m 23.4    [0m | [0m 133.9   [0m | [0m 140.7   [0m | [0m 2.612   [0m | [0m 1.843   [0m |
| [0m 2       [0m | [0m 0.9949  [0m | [0m 0.7248  [0m | [0m 0.3124  [0m | [0m 22.49   [0m | [0m 2.499   [0m | [0m 31.85   [0m | [0m 145.5   [0m | [0m 228.4   [0m | [0m 0.9823  [0m | [0m 1.622   [0m |
| [0m 3       [0m | [0m 0.9953  [0m | [0m 0.5592  [0m | [0m 0.4747  [0m | [0m 16.4    [0m | [0m 0.4271  [0m | [0m 43.82   [0m | [0m 228.1   [0m | [0m 419.2   [0m | [0m 2.377   [0m | [0m 0.6791  [0m |
| [0m 4       [0m | [0m 0.9956  [0m | [0m 0.6085  [0m | [0m

In [7]:
rf = RandomForestClassifier(max_depth=9, random_state=SEED)
pca = PCA(4)

models = [
    [lgb_clf, xgb_clf, pca], # pca 등도 이 단계에서 추가 가능
    [rf]
]

In [8]:
stk_clf = StackNetClassifier(models, 
                             metric='auc',
                            random_state=SEED, n_jobs=-1,
                            verbose=2)

stk_clf.fit(tr_X, tr_y)

Input Dimensionality 30 at Level 0 
3 models included in Level 0 
Fold 1/3 , model 0 , auc===0.985897 
Fold 1/3 , model 1 , auc===0.976749 
Fold 2/3 , model 0 , auc===0.992042 
Fold 2/3 , model 1 , auc===0.986737 
Fold 3/3 , model 0 , auc===0.988824 
Fold 3/3 , model 1 , auc===0.991961 
Level 0, model 0 , auc===0.988921 
Level 0, model 1 , auc===0.985149 
Output dimensionality of level 0 is 6 
 level 0 lasted 6.621325 seconds 
Input Dimensionality 6 at Level 1 
1 models included in Level 1 
Fold 1/3 , model 0 , auc===0.968363 
Fold 2/3 , model 0 , auc===0.977896 
Fold 3/3 , model 0 , auc===0.981275 
Level 1, model 0 , auc===0.975844 
Output dimensionality of level 1 is 1 
 level 1 lasted 1.366052 seconds 
 fit() lasted 7.989347 seconds 


In [9]:
roc_auc_score(val_y, stk_clf.predict_proba(val_X)[:,1])

1 estimators included in Level 0 
1 estimators included in Level 1 


0.9882083196855551