In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import warnings

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Let wide frames render up to 100 columns before pandas elides them.
pd.options.display.max_columns = 100

In [2]:
# Directory that holds the HDF5 inputs. Set the HY_DATA_DIR environment variable
# to run the notebook on another machine; the fallback is the location the
# notebook was originally written against.
DATA_DIR = os.environ.get('HY_DATA_DIR', 'C:/Users/f3107/Desktop/hy_data')

train_label = pd.read_hdf(os.path.join(DATA_DIR, 'train_204.h5'))
test_label = pd.read_hdf(os.path.join(DATA_DIR, 'test_204.h5'))

In [3]:
# Label-encode the target column: each distinct `type` value gets an integer
# code 0..k-1 in order of first appearance; type_map_rev maps codes back.
# NOTE: this overwrites train_label['type'] in place, so re-running the cell
# without reloading the data re-encodes the already-encoded column.
type_values = train_label['type'].unique()
# Size the code range from the data. A fixed np.arange(3) combined with zip()
# silently drops any label beyond the third, which .map() then turns into NaN.
type_map = dict(zip(type_values, np.arange(len(type_values))))
type_map_rev = {code: label for label, code in type_map.items()}
train_label['type'] = train_label['type'].map(type_map)

In [4]:
# Name of the label column.
target = 'type'

# Columns of train_label that are NOT fed to the model (the label itself is
# among them); every remaining column is used as a feature, in frame order.
excluded_columns = {
    'ship', 'x', 'y', 'v', 'd', 'datetime', 'type', 't', 'd_d', 'd_t',
    'd_x', 'v_x', 'd_y', 'v_y', 'hour', 'date', 'diff_time',
}
features = [col for col in train_label.columns if col not in excluded_columns]

In [5]:
def LGB_bayesian(
    learning_rate,
    num_leaves,
    max_depth,
    min_data_in_leaf,
    min_sum_hessian_in_leaf,
    feature_fraction,
    min_gain_to_split
    ):
    """Cross-validated macro-F1 of a LightGBM multiclass model for one
    hyper-parameter setting.

    Intended as the objective of a Bayesian optimiser, which may pass every
    argument as a float. Reads the module-level ``train_label``, ``features``
    and ``target``.

    Parameters
    ----------
    learning_rate : float
        Passed through to LightGBM unchanged.
    num_leaves, max_depth, min_data_in_leaf : float or int
        Truncated with ``int()`` before use, because LightGBM requires
        integers for these three.
    min_sum_hessian_in_leaf : float
        Passed through unchanged (this one is a float, not an integer).
    feature_fraction : float
        Passed through unchanged.
    min_gain_to_split : float
        Passed through unchanged.

    Returns
    -------
    float
        Mean over 5 stratified folds of the macro-averaged F1 score computed
        on each held-out fold.
    """
    # The optimiser samples a continuous box; truncate the integer-valued ones.
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)

    X = train_label[features]
    y = train_label[target]

    param = {
        # Upper bound on boosting rounds; early stopping normally ends sooner.
        'n_estimators': 5000,

        'objective': 'multiclass',
        'boosting_type': 'gbdt',
        # Early stopping monitors log-loss, while the returned score is macro-F1.
        'metric': 'multi_logloss',
        # Derived from the target instead of a hard-coded 3 so the objective
        # keeps working if the number of encoded classes changes.
        'num_class': int(y.nunique()),

        'learning_rate': learning_rate,
        'num_leaves': num_leaves,
        'max_depth': max_depth,

        'min_data_in_leaf': min_data_in_leaf,
        'min_sum_hessian_in_leaf': min_sum_hessian_in_leaf,

        # With bagging_fraction at 1.0 no rows are dropped, so bagging_freq
        # has no subsampling effect here.
        'bagging_fraction': 1.0,
        'bagging_freq': 5,
        'feature_fraction': feature_fraction,
        'min_gain_to_split': min_gain_to_split,

        # NOTE(review): LightGBM documents save_binary as a CLI option; kept as-is.
        'save_binary': True,

        #'is_unbalance': True

    }

    # Fixed seed so every hyper-parameter setting is scored on the same splits.
    fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    score = []

    for train_idx, val_idx in fold.split(X, y):

        train_set = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
        val_set = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])

        # NOTE: verbose_eval / early_stopping_rounds are keyword arguments of
        # lgb.train only up to LightGBM 3.x; 4.0 removed them in favour of
        # callbacks (lgb.early_stopping / lgb.log_evaluation).
        model = lgb.train(param, train_set, valid_sets=[train_set, val_set], verbose_eval=False, early_stopping_rounds = 100)

        # predict() yields one probability column per class; take the arg-max
        # as the predicted label.
        val_pred = np.argmax(model.predict(X.iloc[val_idx]), axis=1)
        val_y = y.iloc[val_idx]

        score.append(metrics.f1_score(val_y, val_pred, average='macro'))

    return sum(score)/len(score)

In [6]:
# Search box for the optimiser: parameter name -> (lower bound, upper bound).
bounds_LGB = dict(
    # Learning rate. LightGBM default is 0.1.
    learning_rate=(0.01, 0.3),
    # Number of leaves in one tree. Default is 31.
    num_leaves=(5, 60),
    # Limits the maximum depth of the tree model. Default is -1.
    max_depth=(5, 35),

    # Minimum number of samples per leaf. An important parameter for handling
    # over-fitting of leaf-wise trees: a larger value avoids growing an overly
    # deep tree, but may also cause under-fitting.
    min_data_in_leaf=(5, 20),

    # A float: the minimum sum of hessians in one leaf (i.e. the minimum total
    # sample weight in a leaf). Default is 1e-3.
    min_sum_hessian_in_leaf=(0.00001, 0.01),

    # Feature sub-sampling ratio, valid range [0.0, 1.0]. Default is 1.0.
    feature_fraction=(0.5, 1),

    # Minimum gain required to perform a split. Default is 0.
    min_gain_to_split=(0, 1.0),
)

In [7]:
from bayes_opt import BayesianOptimization

# Optimiser that searches the bounds_LGB box for the setting that maximises
# LGB_bayesian; random_state pins the optimiser's random number generator.
LGB_BO = BayesianOptimization(
    f=LGB_bayesian,
    pbounds=bounds_LGB,
    random_state=6,
)

In [8]:
# Run the search with the library's default budget spelled out: 5 random
# starting points followed by 25 guided iterations. Every evaluation trains
# 5 LightGBM models, so this cell is slow.
LGB_BO.maximize(init_points=5, n_iter=25)

|   iter    |  target   | featur... | learni... | max_depth | min_da... | min_ga... | min_su... | num_le... |
-------------------------------------------------------------------------------------------------------------
|  1        |  0.8917   |  0.9464   |  0.1063   |  29.64    |  5.625    |  0.1077   |  0.005955 |  34.14    |
|  2        |  0.8791   |  0.7094   |  0.1073   |  23.68    |  11.57    |  0.7359   |  0.005185 |  36.84    |
|  3        |  0.8507   |  0.8227   |  0.2972   |  29.6     |  11.2     |  0.8763   |  0.008239 |  7.996    |
|  4        |  0.879    |  0.8593   |  0.2426   |  27.09    |  15.64    |  0.5409   |  0.001257 |  57.67    |
|  5        |  0.8873   |  0.7016   |  0.07292  |  26.52    |  19.91    |  0.2556   |  0.006716 |  37.95    |
|  6        |  0.8758   |  0.6283   |  0.2952   |  34.99    |  13.16    |  0.9723   |  0.00131  |  44.44    |
|  7        |  0.8839   |  1.0      |  0.01     |  5.0      |  20.0     |  0.0      |  0.01     |  60.0     |
|  8      

In [11]:
# Objective value of the optimiser's best recorded run, i.e. the value that
# LGB_bayesian returned for it (mean macro-F1 over the 5 folds).
LGB_BO.max['target']

0.8989593950022489

In [12]:
# Hyper-parameters of the best recorded run. All values come back as floats;
# LGB_bayesian truncates num_leaves, max_depth and min_data_in_leaf with int(),
# so apply the same conversion before training a final model with them.
LGB_BO.max['params']

{'feature_fraction': 0.6785634376053006,
 'learning_rate': 0.019383601030181666,
 'max_depth': 24.978386879068395,
 'min_data_in_leaf': 5.023455898071062,
 'min_gain_to_split': 0.05071488519451617,
 'min_sum_hessian_in_leaf': 0.005378853072855871,
 'num_leaves': 44.84874398334702}