In [2]:
#source: https://www.kaggle.com/sz8416/simple-bayesian-optimization-for-lightgbm

In [2]:
import pandas as pd
import numpy as np
import warnings
import time
warnings.filterwarnings("ignore")
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

In [3]:
application_train = pd.read_csv('./application_train_part.csv')



In [4]:
input_df = application_train

categorical_feats = input_df.columns[input_df.dtypes == 'object']
for feat in categorical_feats:
    encoder = LabelEncoder()
    input_df[feat] = encoder.fit_transform(input_df[feat].fillna('NULL'))


In [5]:
input_df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,0,1,0,1,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,0,0,0,0,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,1,1,1,1,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,0,0,0,1,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,0,1,0,1,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
categorical_feats

Index(['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
       'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE',
       'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE',
       'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE'],
      dtype='object')

In [7]:
def label_encoder(input_df, encoder_dict=None):
    """ Process a dataframe into a form useable by LightGBM """
    # Label encode categoricals
    categorical_feats = input_df.columns[input_df.dtypes == 'object']
    for feat in categorical_feats:
        encoder = LabelEncoder()
        input_df[feat] = encoder.fit_transform(input_df[feat].fillna('NULL'))
    return input_df, categorical_feats.tolist(), encoder_dict
application_train, categorical_feats, encoder_dict = label_encoder(application_train)
X = application_train.drop('TARGET', axis=1)
y = application_train.TARGET

In [8]:

def bayes_parameter_opt_lgb(X, y, init_round=15, opt_round=25, n_folds=5, random_seed=6, n_estimators=10000, learning_rate=0.05, output_process=False):
    # prepare data
    train_data = lgb.Dataset(data=X, label=y, categorical_feature = categorical_feats,
                             free_raw_data=False)
    # parameters
    def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight):
        params = {'application':'binary','num_iterations': n_estimators, 'learning_rate':learning_rate, 'early_stopping_round':100, 'metric':'auc'}
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['lambda_l1'] = max(lambda_l1, 0)
        params['lambda_l2'] = max(lambda_l2, 0)
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=True, verbose_eval =200, metrics=['auc'])
        return min(-np.array(cv_result['auc-mean']))
    # range 
    lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (24, 45),
                                            'feature_fraction': (0.1, 0.9),
                                            'bagging_fraction': (0.8, 1),
                                            'max_depth': (5, 8.99),
                                            'lambda_l1': (0, 5),
                                            'lambda_l2': (0, 3),
                                            'min_split_gain': (0.001, 0.1),
                                            'min_child_weight': (5, 50)}, random_state=0)
    # optimize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)
    
    # output optimization process
    if output_process==True: lgbBO.points_to_csv("../out_data/bayes_opt_result.csv")
    
    # return best parameters
    return lgbBO



In [9]:
lgbBO = bayes_parameter_opt_lgb(X, y, init_round=5, opt_round=10, n_folds=3,\
                                     random_seed=6,\
                                     n_estimators=100, learning_rate=0.05,
                                    output_process=False)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.7478  [0m | [0m 0.9098  [0m | [0m 0.6722  [0m | [0m 3.014   [0m | [0m 1.635   [0m | [0m 6.69    [0m | [0m 34.07   [0m | [0m 0.04432 [0m | [0m 42.73   [0m |
| [0m 2       [0m | [0m-0.7487  [0m | [0m 0.9927  [0m | [0m 0.4068  [0m | [0m 3.959   [0m | [0m 1.587   [0m | [0m 7.266   [0m | [0m 46.65   [0m | [0m 0.008033[0m | [0m 25.83   [0m |
| [95m 3       [0m | [95m-0.7476  [0m | [95m 0.804   [0m | [95m 0.7661  [0m | [95m 3.891   [0m | [95m 2.61    [0m | [95m 8.905   [0m | [95m 40.96   [0m | [95m 0.04669 [0m | [95m 40.39   [0m |
| [95m 4       [0m | [95m-0.7463  [0m | [95m 0.8237  [0m | [95m 0.6119  [0m | [95m 0.7168  [0m | [95m 2.834   [0m | [95m 7.082 

In [10]:
lgbBO.res

[{'target': -0.7478467685946991,
  'params': {'bagging_fraction': 0.909762700785465,
   'feature_fraction': 0.6721514930979355,
   'lambda_l1': 3.0138168803582195,
   'lambda_l2': 1.6346495489906907,
   'max_depth': 6.690382649362229,
   'min_child_weight': 34.065235087999525,
   'min_split_gain': 0.04432113391500656,
   'num_leaves': 42.727233016423675}},
 {'target': -0.7486651657009764,
  'params': {'bagging_fraction': 0.9927325521002058,
   'feature_fraction': 0.40675321506062223,
   'lambda_l1': 3.958625190413323,
   'lambda_l2': 1.5866847592587134,
   'max_depth': 7.2664977987647905,
   'min_child_weight': 46.65184872316975,
   'min_split_gain': 0.008032569761590808,
   'num_leaves': 25.829715293732356}},
 {'target': -0.7475700548471299,
  'params': {'bagging_fraction': 0.8040436794880652,
   'feature_fraction': 0.7660958764383504,
   'lambda_l1': 3.8907837547492523,
   'lambda_l2': 2.6100364447404574,
   'max_depth': 8.904687185508728,
   'min_child_weight': 40.96213538975256,
  