In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from bayes_opt import BayesianOptimization
import warnings
import time
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error as mae
warnings.filterwarnings("ignore")

In [2]:
# Load the raw train and test tables.
train = pd.read_csv('input/train_dataset.csv')
test = pd.read_csv('input/test_dataset.csv')
# Give the test frame an empty '信用分' column so it can be stacked under train
# (presumably this is the label column, which the test file lacks — confirm).
test['信用分'] = None
# DataFrame.append was removed in pandas 2.0. pd.concat is the drop-in
# equivalent: like append, it keeps each frame's own row labels, so the test
# rows keep their original index values rather than continuing after train.
train = pd.concat([train, test])
# Replace the original headers with ASCII names (positional: 30 names for 30 columns).
train.columns = ['id','is_name_real','age','student','blacklist','is_4g_unhealth','internet_age','last_pay_until_now','last_pay_amount','amount_avg_6mon',
            'current_check_amount','remaining_amount','is_owe','sensitive','talk_number','shoping_usually','shoping_count','came_wandan','came_shanmu',
            'came_movie','came_tour','came_pe','buy_app_usage','logistics_app_usage','finance_app_usage','video_app_usage','air_app_usage',
                 'train_app_usage','news_app_usage','grades']
# Keep only the key and the target; later cells add feature columns to this
# frame, so take an explicit copy instead of a selection of `train`.
raw_df = train[['id','grades']].copy()

In [3]:
# Pickled feature tables to merge into raw_df, read from the features/ directory.
features = ['cross_features.pkl','regular.pkl','lda.pkl']
for f in features:
    # NOTE(review): unpickling can execute arbitrary code — only load trusted files.
    data = pd.read_pickle('features/%s'%f)
    # Columns are attached by position (as plain lists), not aligned on the
    # index, so each table must have exactly as many rows as raw_df.
    for c in data:
        raw_df[c] = data[c].tolist()
# Split back by position: the first 50000 rows go to train, the remainder to test
# (presumably 50000 is the size of the original training file — confirm).
train = raw_df.iloc[:50000]
test = raw_df.iloc[50000:]

In [4]:
# Candidate features: every column except the row id and the target.
# NOTE: the next cell overwrites `predictor` with a hand-picked list.
predictor = [col for col in train.columns if col not in ('id', 'grades')]

In [5]:
# Hand-picked feature list used for tuning: the raw columns (without
# 'blacklist') followed by eight additional feature columns.
predictor = [
    # raw columns
    'is_name_real',
    'age',
    'student',
    'is_4g_unhealth',
    'internet_age',
    'last_pay_until_now',
    'last_pay_amount',
    'amount_avg_6mon',
    'current_check_amount',
    'remaining_amount',
    'is_owe',
    'sensitive',
    'talk_number',
    'shoping_usually',
    'shoping_count',
    'came_wandan',
    'came_shanmu',
    'came_movie',
    'came_tour',
    'came_pe',
    'buy_app_usage',
    'logistics_app_usage',
    'finance_app_usage',
    'video_app_usage',
    'air_app_usage',
    'train_app_usage',
    'news_app_usage',
    # additional feature columns
    'first_online_time',
    'consume_change',
    'is_enough',
    'consume_plan',
    'consume_change_plan',
    'last_pay_amount_offline',
    'current_fee_stability',
    'use_left_rate',
]

In [None]:
def bayes_parameter_opt_lgb(X, y, init_round=15, opt_round=25, n_folds=5, random_seed=6, n_estimators=10000, learning_rate=0.001, output_process=False):
    """Search LightGBM hyper-parameters with Bayesian optimisation on out-of-fold MAE.

    Each trial trains one model per fold, predicts the held-out rows, and is
    scored by the mean absolute error over all out-of-fold predictions. The
    optimiser maximises the negated MAE.

    Args:
        X: pandas DataFrame of features. Rows are selected by position, so any
            index is accepted.
        y: pandas Series of targets, in the same row order as ``X``.
        init_round: number of random exploration points before optimisation.
        opt_round: number of Bayesian optimisation iterations.
        n_folds: number of cross-validation folds.
        random_seed: accepted for backward compatibility but not used. The fold
            shuffle seed (2019) and the optimiser seed (0) stay fixed so results
            are unchanged from earlier runs.
        n_estimators: upper bound on boosting iterations (early stopping after
            200 rounds without improvement on the validation fold).
        learning_rate: LightGBM learning rate.
        output_process: if truthy, write every evaluated point to
            ``bayes_opt_result.csv``.

    Returns:
        dict: the best parameter set exactly as reported by the optimiser. The
        values are raw floats; ``num_leaves`` and ``max_depth`` are NOT rounded
        here, although they are rounded to int when models are trained.
    """
    # Settings shared by every trial and every fold.
    fixed_params = {'objective':'mae','num_iterations': n_estimators, 'learning_rate':learning_rate,
                    'early_stopping_round':200, 'metric':'l1'}
    y_true = y.values

    def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight):
        """Return the negated out-of-fold MAE for one candidate parameter set."""
        # The optimiser proposes floats: round the integer-valued parameters
        # and clamp the others into their valid range.
        params = dict(fixed_params)
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        # NOTE(review): no 'bagging_freq' is set. According to the LightGBM
        # parameter docs bagging is only active when bagging_freq is non-zero,
        # so this value may currently have no effect — confirm.
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['lambda_l1'] = max(lambda_l1, 0)
        params['lambda_l2'] = max(lambda_l2, 0)
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight

        oof_pred = np.zeros(X.shape[0])
        kfolds = KFold(random_state=2019, n_splits=n_folds, shuffle=True)
        for train_index, valid_index in kfolds.split(X):
            # KFold yields positional indices, so select with .iloc; label-based
            # .loc is only correct when the index happens to be 0..n-1.
            x_train = X.iloc[train_index, :]
            y_train = y.iloc[train_index].values
            x_valid = X.iloc[valid_index, :]
            y_valid = y.iloc[valid_index].values
            d_train = lgb.Dataset(x_train, label=y_train)
            d_valid = lgb.Dataset(x_valid, label=y_valid)
            # Pass a fresh copy so nothing done to the dict during training can
            # leak into the next fold.
            bst = lgb.train(params=dict(params), train_set=d_train,valid_sets=[d_valid],verbose_eval=False,early_stopping_rounds=200)
            oof_pred[valid_index] = bst.predict(x_valid)
        cv_result = mae(y_true, oof_pred)
        print('cv %f'%cv_result)
        # Negate: the optimiser maximises, and a lower MAE is better.
        return -cv_result

    # Search ranges for each tuned parameter.
    lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (24, 96),
                                            'feature_fraction': (0.1, 0.9),
                                            'bagging_fraction': (0.8, 1),
                                            'max_depth': (4, 8.99),
                                            'lambda_l1': (0, 5),
                                            'lambda_l2': (0, 3),
                                            'min_split_gain': (0.001, 0.1),
                                            'min_child_weight': (5, 50)}, random_state=0)
    # Optimise.
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)

    # Optionally dump the optimisation history.
    if output_process:
        if hasattr(lgbBO, 'points_to_csv'):
            lgbBO.points_to_csv("bayes_opt_result.csv")
        else:
            # bayes_opt releases that expose `.max` no longer provide
            # points_to_csv; write the recorded trials out with pandas instead.
            rows = [dict(trial['params'], target=trial['target']) for trial in lgbBO.res]
            pd.DataFrame(rows).to_csv("bayes_opt_result.csv", index=False)

    # Return the best parameters.
    return lgbBO.max['params']

# Run the hyper-parameter search on the selected feature columns
# (5 random starting points followed by 30 optimisation steps).
opt_params = bayes_parameter_opt_lgb(
    train[predictor],
    train['grades'],
    init_round=5,
    opt_round=30,
    n_folds=5,
    random_seed=6,
    n_estimators=2000,
    learning_rate=0.05,
)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------------------
cv 14.817457
| [0m 1       [0m | [0m-14.82   [0m | [0m 0.9098  [0m | [0m 0.6722  [0m | [0m 3.014   [0m | [0m 1.635   [0m | [0m 6.114   [0m | [0m 34.07   [0m | [0m 0.04432 [0m | [0m 88.21   [0m |
cv 14.780767
| [95m 2       [0m | [95m-14.78   [0m | [95m 0.9927  [0m | [95m 0.4068  [0m | [95m 3.959   [0m | [95m 1.587   [0m | [95m 6.835   [0m | [95m 46.65   [0m | [95m 0.008033[0m | [95m 30.27   [0m |
cv 14.836806
| [0m 3       [0m | [0m-14.84   [0m | [0m 0.804   [0m | [0m 0.7661  [0m | [0m 3.891   [0m | [0m 2.61    [0m | [0m 8.883   [0m | [0m 40.96   [0m | [0m 0.04669 [0m | [0m 80.2    [0m |
cv 14.836535
| [0m 4       [0m | [0m-14.84   [0m | [0m 0.8237  [0m | [0m 0.6119  [0m | [0m 0

In [31]:
# Show the best parameters found. Values are the optimiser's raw floats, so
# num_leaves and max_depth still need int(round(...)) before training a final model.
opt_params

{'bagging_fraction': 1.0,
 'feature_fraction': 0.9,
 'lambda_l1': 5.0,
 'lambda_l2': 3.0,
 'max_depth': 4.0,
 'min_child_weight': 39.04831824235355,
 'min_split_gain': 0.1,
 'num_leaves': 58.2848541708953}