In [1]:
import sys
import time

class Logger(object):
    """File-backed stand-in for ``sys.stdout``.

    Every string passed to :meth:`write` is appended to a log file named
    ``<logfilename>_<unix timestamp>.log``. The timestamp is taken once, when
    the instance is created, so all output of one instance lands in one file.

    Parameters
    ----------
    logtofile : bool
        When falsy, :meth:`write` discards the message.
    logfilename : str
        Prefix (may include a directory) of the log file name.
    """

    def __init__(self, logtofile=True, logfilename='log'):
        # sys.stdout as it was at construction time. write() never echoes to
        # it, so anything written through this object is not shown on screen.
        # NOTE(review): if echoing to the terminal was intended, write() would
        # need to call self.terminal.write(message) as well - confirm.
        self.terminal = sys.stdout
        self.logfile = "{}_{}.log".format(logfilename, int(time.time()))
        self.logtofile = logtofile

    def write(self, message):
        """Append ``message`` to the log file (no-op when ``logtofile`` is falsy)."""
        if not self.logtofile:
            return
        # Open/close on every call so the file is always complete on disk; the
        # context manager guarantees the handle is released even if the write
        # itself raises.
        with open(self.logfile, "a") as log:
            log.write(message)

    def flush(self):
        """Do nothing; present because callers of ``sys.stdout`` expect it.

        Nothing is buffered between calls: write() closes the file each time.
        """
        pass
# Replace the process-wide stdout with a Logger: from here on, everything
# written to sys.stdout (e.g. by print) is appended to
# 'logfile_<unix timestamp>.log' instead of being shown in the notebook.
sys.stdout = Logger(logfilename='logfile')

In [2]:
import pandas as pd
import numpy as np
import warnings
import time
warnings.filterwarnings("ignore")
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score

# Directory holding the pickled training frame.
PATH = '/home/kai/data/kaggle/homecredit/'
train = pd.read_pickle(PATH + 'train_bo0.pkl')

# Columns that are never used as model inputs; everything else is a feature.
ignored_col = ['ORGANIZATION_TYPE', 'TARGET', 'SK_ID_CURR']
feats = [column for column in train.columns if column not in ignored_col]

# Displayed as the cell output: (rows, columns) of the loaded frame.
train.shape

(100, 2395)

In [3]:
# Column names passed to lgb.Dataset as `categorical_feature`.
# Each name must be listed exactly once.
categorical_feats = [
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_TYPE_SUITE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
] + [
    # FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21
    'FLAG_DOCUMENT_{}'.format(doc_no) for doc_no in range(2, 22)
]

In [4]:
# Run settings: number of random warm-up points and guided iterations for the
# optimiser, CV fold count, RNG seed, and whether to dump the search history.
init_round = 5
opt_round = 15
n_folds = 5
random_seed = 1992
output_process = True

# Feature matrix and label column.
X = train[feats]
y = train['TARGET']

# prepare data
train_data = lgb.Dataset(
    data=X,
    label=y,
    categorical_feature=categorical_feats,
    free_raw_data=False,
)
# parameters

def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight, scale_pos_weight):
    """Objective for the Bayesian search: best mean CV AUC for one parameter set.

    The optimiser proposes continuous values, so they are coerced here:
    ``num_leaves`` and ``max_depth`` are rounded to integers, the two fraction
    parameters are clamped to ``[0, 1]`` and the two regularisation terms are
    floored at 0. ``min_split_gain``, ``min_child_weight`` and
    ``scale_pos_weight`` are passed through unchanged.

    Reads the module-level ``train_data``, ``n_folds`` and ``random_seed``.

    Returns
    -------
    float
        The highest value in the ``'auc-mean'`` series returned by ``lgb.cv``.
    """
    params = {
        'objective': 'binary',
        'num_iterations': 10000,
        'learning_rate': 0.01,
        'early_stopping_round': 150,
        'metric': 'auc',
        'num_threads': 10,
        'boosting_type': 'gbdt',
        'task': 'train',
        # LightGBM only subsamples rows when bagging_freq is non-zero; with the
        # default of 0 the tuned bagging_fraction would be silently ignored.
        'bagging_freq': 1,
        # Tree-shape parameters must be integers.
        'num_leaves': int(round(num_leaves)),
        'max_depth': int(round(max_depth)),
        # Keep fractions inside [0, 1].
        'feature_fraction': max(min(feature_fraction, 1), 0),
        'bagging_fraction': max(min(bagging_fraction, 1), 0),
        # Regularisation strengths cannot be negative.
        'lambda_l1': max(lambda_l1, 0),
        'lambda_l2': max(lambda_l2, 0),
        'min_split_gain': min_split_gain,
        'min_child_weight': min_child_weight,
        'scale_pos_weight': scale_pos_weight,
    }

    cv_result = lgb.cv(params,
                       train_data,
                       nfold=n_folds,
                       seed=random_seed,
                       stratified=True,
                       verbose_eval=200,
                       metrics=['auc'])
    # Score of the best boosting round seen during cross-validation.
    return max(cv_result['auc-mean'])


# Search bounds as (lower, upper), one entry per argument of lgb_eval.
search_space = {
    'num_leaves': (38, 60),
    'feature_fraction': (0.1, 0.3),
    'bagging_fraction': (0.8, 1),
    'max_depth': (5, 8.99),
    'lambda_l1': (0, 1),
    'lambda_l2': (0, 1),
    'min_split_gain': (0.001, 0.1),
    'scale_pos_weight': (1, 3),
    'min_child_weight': (1, 50),
}
lgbBO = BayesianOptimization(lgb_eval, search_space, random_state=0)

# Run the search: init_round starting points, then opt_round further iterations.
lgbBO.maximize(init_points=init_round, n_iter=opt_round)

# Optionally write the evaluated points to a CSV file.
if output_process == True:
    lgbBO.points_to_csv("bayes_opt_result.csv")