https://www.kaggle.com/sz8416/simple-bayesian-optimization-for-lightgbm を参照

In [1]:
import pandas as pd
import numpy as np
import warnings
import time
warnings.filterwarnings("ignore")
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score

訓練データを読み込み

In [2]:
# Load the training CSV into a DataFrame; the path is relative to the notebook's working directory.
application_train = pd.read_csv(filepath_or_buffer='../input/train.csv')

不要な列を削除

In [3]:
# Columns removed from the feature matrix (this includes the "Survived" label itself).
drop_column = ["PassengerId","Survived","Name","Cabin","Ticket","Embarked"]
# Drop them and make sure Pclass is stored as a 64-bit integer, in one chained expression.
X = application_train.drop(columns=drop_column).astype({'Pclass': np.int64})

カテゴリ変数をOne-hot表現に変換

In [4]:
# Replace the remaining non-numeric columns with one indicator column per category value.
X = pd.get_dummies(data=X)

In [5]:
# Target vector: the "Survived" column of the raw training frame.
y = application_train['Survived']

In [6]:
# Preview the first rows of the encoded feature matrix.
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male
0,3,22.0,1,0,7.25,0,1
1,1,38.0,1,0,71.2833,1,0
2,3,26.0,0,0,7.925,1,0
3,1,35.0,1,0,53.1,1,0
4,3,35.0,0,0,8.05,0,1


In [7]:
# Show the dtype of every feature column.
X.dtypes

Pclass          int64
Age           float64
SibSp           int64
Parch           int64
Fare          float64
Sex_female      uint8
Sex_male        uint8
dtype: object

In [8]:
# Names of the columns that still have dtype 'object', as a plain Python list.
# A list (not a pandas Index) is used because this value is handed to
# lgb.Dataset(categorical_feature=...), which is documented to take a list or 'auto';
# an Index compared against 'auto' yields an array whose truth value is ambiguous.
# With the dtypes displayed above there are no object columns, so the list is empty.
categorical_feats = X.columns[X.dtypes == 'object'].tolist()

チューニングするパラメータ

In [9]:
def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight,
             train_data=None, n_folds=5, random_seed=6):
    """Score one hyperparameter point by cross-validated AUC.

    The first eight arguments are the values proposed by the optimizer. They are
    sanitised before being passed to LightGBM: ``num_leaves`` and ``max_depth``
    are rounded to integers, the two fractions are clamped to [0, 1], and the
    two regularisation terms are clamped to be non-negative.

    Args:
        num_leaves: Proposed ``num_leaves`` (rounded to the nearest int).
        feature_fraction: Proposed ``feature_fraction`` (clamped to [0, 1]).
        bagging_fraction: Proposed ``bagging_fraction`` (clamped to [0, 1]).
        max_depth: Proposed ``max_depth`` (rounded to the nearest int).
        lambda_l1: Proposed ``lambda_l1`` (negative values become 0).
        lambda_l2: Proposed ``lambda_l2`` (negative values become 0).
        min_split_gain: Passed through unchanged.
        min_child_weight: Passed through unchanged.
        train_data: Dataset handed to ``lgb.cv``. When ``None`` it is built from
            the notebook-level ``X``, ``y`` and ``categorical_feats``.
        n_folds: Value forwarded to ``lgb.cv`` as ``nfold``.
        random_seed: Value forwarded to ``lgb.cv`` as ``seed``.

    Returns:
        The maximum of ``cv_result['auc-mean']``.
    """
    if train_data is None:
        # No dataset supplied: fall back to the notebook globals so the function
        # is callable on its own instead of failing with a NameError.
        train_data = lgb.Dataset(data=X, label=y, categorical_feature=list(categorical_feats), free_raw_data=False)
    params = {'application':'binary','num_iterations':4000, 'learning_rate':0.2, 'early_stopping_round':100, 'metric':'auc'}
    # int(...) guarantees a real Python int even when round() is applied to a NumPy float.
    params["num_leaves"] = int(round(num_leaves))
    params['feature_fraction'] = max(min(feature_fraction, 1), 0)
    params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
    params['max_depth'] = int(round(max_depth))
    params['lambda_l1'] = max(lambda_l1, 0)
    params['lambda_l2'] = max(lambda_l2, 0)
    params['min_split_gain'] = min_split_gain
    params['min_child_weight'] = min_child_weight
    cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=True, verbose_eval =200, metrics=['auc'])
    return max(cv_result['auc-mean'])

各パラメータの探索範囲を設定

In [10]:
# Module-level optimizer over the search box below (lower bound, upper bound per parameter).
# It is only constructed here; nothing in this cell starts the search.
lgbBO = BayesianOptimization(
    f=lgb_eval,
    pbounds=dict(
        num_leaves=(5, 45),
        feature_fraction=(0.1, 0.9),
        bagging_fraction=(0.2, 1),
        max_depth=(3, 9),
        lambda_l1=(0, 5),
        lambda_l2=(0, 3),
        min_split_gain=(0.001, 0.1),
        min_child_weight=(5, 50),
    ),
    random_state=0,
)

最大化されるパラメータを探す

最適値を返そうとするところでエラーになる(※下のセルの出力では `lgbBO.max['params']` により最適パラメータが正常に返っているため、このメモは修正前のコードの状態を指している可能性がある)

In [11]:
def bayes_parameter_opt_lgb(X, y, init_round=15, opt_round=25, n_folds=5, random_seed=6, n_estimators=100, learning_rate=0.2, output_process=False):
    """Search LightGBM hyperparameters with Bayesian optimization.

    Each candidate point is scored by the best mean AUC that ``lgb.cv`` reports
    for it; the optimizer maximizes that score.

    Args:
        X: Feature data, passed to ``lgb.Dataset`` as ``data``.
        y: Labels, passed to ``lgb.Dataset`` as ``label``.
        init_round: Forwarded to ``maximize`` as ``init_points``.
        opt_round: Forwarded to ``maximize`` as ``n_iter``.
        n_folds: Forwarded to ``lgb.cv`` as ``nfold``.
        random_seed: Forwarded to ``lgb.cv`` as ``seed``.
        n_estimators: Used as LightGBM ``num_iterations``.
        learning_rate: Used as LightGBM ``learning_rate``.
        output_process: When truthy, every evaluated point is written to
            ``bayes_opt_result.csv`` in the current working directory.

    Returns:
        ``lgbBO.max['params']``: the raw parameter values of the best point.
        ``num_leaves`` and ``max_depth`` are returned exactly as proposed by the
        optimizer (floats); they were rounded to integers only for evaluation.
    """
    # prepare data
    # `categorical_feats` comes from the notebook scope; list(...) hands LightGBM a
    # plain list whether that global is a pandas Index or already a list.
    train_data = lgb.Dataset(data=X, label=y, categorical_feature=list(categorical_feats), free_raw_data=False)

    # objective: one cross-validated evaluation per proposed point
    def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight):
        params = {'application':'binary','num_iterations': n_estimators, 'learning_rate':learning_rate, 'early_stopping_round':100, 'metric':'auc'}
        # Integer-valued parameters are rounded; fractions and penalties are clamped to valid ranges.
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        # NOTE(review): LightGBM documents bagging as active only when bagging_freq is
        # non-zero, and no bagging_freq is set here - confirm this parameter has any effect.
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['lambda_l1'] = max(lambda_l1, 0)
        params['lambda_l2'] = max(lambda_l2, 0)
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=True, verbose_eval =200, metrics=['auc'])
        return max(cv_result['auc-mean'])

    # search range: (lower bound, upper bound) per parameter
    lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (2, 20),
                                            'feature_fraction': (0.1, 0.9),
                                            'bagging_fraction': (0.5, 1),
                                            'max_depth': (2, 6),
                                            'lambda_l1': (0, 5),
                                            'lambda_l2': (0, 3),
                                            'min_split_gain': (0.001, 0.1),
                                            'min_child_weight': (5, 50)}, random_state=0)
    # optimize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)

    # output optimization process
    if output_process:
        if hasattr(lgbBO, "points_to_csv"):
            # Optimizer objects that still provide the helper keep using it.
            lgbBO.points_to_csv("bayes_opt_result.csv")
        else:
            # Otherwise flatten the recorded steps ({'target': ..., 'params': {...}})
            # into one row per evaluation and write them with pandas.
            rows = []
            for step in lgbBO.res:
                row = {'target': step['target']}
                row.update(step['params'])
                rows.append(row)
            pd.DataFrame(rows).to_csv("bayes_opt_result.csv", index=False)

    # return best parameters
    return lgbBO.max['params']

# Run the search with 5 initial points and 10 optimization steps, scoring each
# point with 3-fold CV over 200 boosting iterations.
opt_params = bayes_parameter_opt_lgb(
    X,
    y,
    init_round=5,
    opt_round=10,
    n_folds=3,
    random_seed=6,
    n_estimators=200,
    learning_rate=0.2,
)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.8434  [0m | [0m 0.7744  [0m | [0m 0.6722  [0m | [0m 3.014   [0m | [0m 1.635   [0m | [0m 3.695   [0m | [0m 34.07   [0m | [0m 0.04432 [0m | [0m 18.05   [0m |
| [0m 2       [0m | [0m 0.8402  [0m | [0m 0.9818  [0m | [0m 0.4068  [0m | [0m 3.959   [0m | [0m 1.587   [0m | [0m 4.272   [0m | [0m 46.65   [0m | [0m 0.008033[0m | [0m 3.568   [0m |
| [0m 3       [0m | [0m 0.8396  [0m | [0m 0.5101  [0m | [0m 0.7661  [0m | [0m 3.891   [0m | [0m 2.61    [0m | [0m 5.914   [0m | [0m 40.96   [0m | [0m 0.04669 [0m | [0m 16.05   [0m |
[200]	cv_agg's auc: 0.83605 + 0.0335219
| [0m 4       [0m | [0m 0.8363  [0m | [0m 0.5591  [0m | [0m 0.6119  [0m | [0m 0.7168  [0m | [0m 2.8

最適化されたパラメータを出力

In [12]:
# Print the best parameter dict found by the search (values are the raw, unrounded optimizer proposals).
print(opt_params)

{'bagging_fraction': 0.9944354056604661, 'feature_fraction': 0.7839676996980822, 'lambda_l1': 0.020467527107711492, 'lambda_l2': 2.6665861809691007, 'max_depth': 2.977451979882435, 'min_child_weight': 5.4812949013676695, 'min_split_gain': 0.0998931785794044, 'num_leaves': 19.803000400983258}
