In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import sys, gc, warnings, random, math, time, datetime, os
from tqdm import tqdm_notebook
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

import lightgbm as lgb
from bayes_opt import BayesianOptimization

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [2]:
# read df_train and df_test
df_train = pd.read_pickle('../features/train_v2.pkl')
# df_test= pd.read_pickle('../features/test_v2.pkl')
TARGET = "open_channels"

In [3]:
BATCH_GROUP = [5,10]
df_train = df_train[df_train.batch.isin(BATCH_GROUP)]

In [4]:
def fe(df):
    
    for shift_val in range(1, 6):
    
        df[f'shift+{shift_val}'] = df.groupby(['batch']).shift(shift_val)['signal']
        df[f'shift-{shift_val}'] = df.groupby(['batch']).shift(-shift_val)['signal']
        df[f'diff+{shift_val}'] = df[f'shift+{shift_val}'] - df['signal']
        df[f'diff-{shift_val}'] = df[f'shift-{shift_val}'] - df['signal']

    return df

print("feature engineering on training data ...")
df_train = fe(df_train)
# print("feature engineering on testing data ...")
# df_test = fe(df_test)
print("feature engineering is done ...")

feature engineering on training data ...
feature engineering is done ...


In [5]:
# drop useless features
drop_features = [
    "time",
    "open_channels",
    "local_time",
    "batch",
    "mini_batch",
    "oof",
]
all_features = [col for col in df_train.columns if col not in drop_features]

# print("train/test shape is:", df_train.shape, df_test.shape)
print("features used # is", len(all_features))
df_train[all_features].head()

features used # is 21


Unnamed: 0,signal,shift+1,shift-1,diff+1,diff-1,shift+2,shift-2,diff+2,diff-2,shift+3,shift-3,diff+3,diff-3,shift+4,shift-4,diff+4,diff-4,shift+5,shift-5,diff+5,diff-5
2000000,9.8451,,9.4972,,-0.3479,,9.6538,,-0.1913,,10.0801,,0.235,,10.1454,,0.3003,,10.387,,0.5419
2000001,9.4972,9.8451,9.6538,0.3479,0.1566,,10.0801,,0.5829,,10.1454,,0.6482,,10.387,,0.8898,,9.2749,,-0.2223
2000002,9.6538,9.4972,10.0801,-0.1566,0.4263,9.8451,10.1454,0.1913,0.4916,,10.387,,0.7332,,9.2749,,-0.3789,,8.8249,,-0.8289
2000003,10.0801,9.6538,10.1454,-0.4263,0.0653,9.4972,10.387,-0.5829,0.3069,9.8451,9.2749,-0.235,-0.8052,,8.8249,,-1.2552,,9.3669,,-0.7132
2000004,10.1454,10.0801,10.387,-0.0653,0.2416,9.6538,9.2749,-0.4916,-0.8705,9.4972,8.8249,-0.6482,-1.3205,9.8451,9.3669,-0.3003,-0.7785,,9.4843,,-0.6611


In [6]:
def bayes_parameter_opt_lgb(X,
                            y,
                            init_round=15, # how many steps of random exploration
                            opt_round=25, # how many steps of bayes optimization
                            n_folds=5,
                            random_seed=6,
                            n_estimators=10000,
                            learning_rate=0.05,
                            output_process=False):
    # prepare data
    train_data = lgb.Dataset(data=X,
                             label=y,
#                              categorical_feature=categorical_features,
                             free_raw_data=False)

    # parameters
    def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth,
                 lambda_l1, lambda_l2, min_split_gain, min_child_weight):
        params = {
            'application': 'regression',
            'num_iterations': n_estimators,
            'learning_rate': learning_rate,
            'early_stopping_round': 200,
            'metric': 'rmse'
        }
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['lambda_l1'] = max(lambda_l1, 0)
        params['lambda_l2'] = max(lambda_l2, 0)
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight

        # modify here
        cv_result = lgb.cv(params,
                           train_data,
                           nfold=n_folds,
                           seed=random_seed,
                           stratified=True,
                           verbose_eval=200,
                           metrics=['rmse'])
        
        return -min(cv_result['rmse-mean'])

    # range
    lgbBO = BayesianOptimization(lgb_eval, {
        'num_leaves': (32, 256),
        'feature_fraction': (0.1, 0.9),
        'bagging_fraction': (0.1, 0.9),
        'max_depth': (5, 10),
        'lambda_l1': (0.1, 3),
        'lambda_l2': (0.1, 5),
        'min_split_gain': (0.001, 0.1),
        'min_child_weight': (4, 64)
    },
                                 random_state=42)
    # optimize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)

    # output optimization process
    #     if output_process==True: lgbBO.points_to_csv("bayes_opt_result.csv")

    # return best parameters
    return lgbBO.max

In [7]:
X = df_train[all_features]
y = df_train[TARGET]

opt_params = bayes_parameter_opt_lgb(X,
                                     y,
                                     init_round=10, # how many steps of random exploration
                                     opt_round=45, # how many steps of bayes optimization
                                     n_folds=5,
                                     random_seed=42,
                                     n_estimators=3000,
                                     learning_rate=0.1)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------------------
[200]	cv_agg's rmse: 0.288531 + 0.000733627
[400]	cv_agg's rmse: 0.288331 + 0.000774997
[600]	cv_agg's rmse: 0.288333 + 0.000768253
| [0m 1       [0m | [0m-0.2883  [0m | [0m 0.3996  [0m | [0m 0.8606  [0m | [0m 2.223   [0m | [0m 3.033   [0m | [0m 5.78    [0m | [0m 13.36   [0m | [0m 0.00675 [0m | [0m 226.0   [0m |
[200]	cv_agg's rmse: 0.288593 + 0.000780724
[400]	cv_agg's rmse: 0.288603 + 0.000801142
| [0m 2       [0m | [0m-0.2886  [0m | [0m 0.5809  [0m | [0m 0.6665  [0m | [0m 0.1597  [0m | [0m 4.853   [0m | [0m 9.162   [0m | [0m 16.74   [0m | [0m 0.019   [0m | [0m 73.08   [0m |
[200]	cv_agg's rmse: 0.289375 + 0.000775112
[400]	cv_agg's rmse: 0.289353 + 0.000788458
| [0m 3       [0m | [0m-0.2893  [0m | [

[200]	cv_agg's rmse: 0.288617 + 0.00080478
| [0m 23      [0m | [0m-0.2886  [0m | [0m 0.5812  [0m | [0m 0.5881  [0m | [0m 1.21    [0m | [0m 0.5925  [0m | [0m 8.061   [0m | [0m 4.129   [0m | [0m 0.004491[0m | [0m 254.5   [0m |
[200]	cv_agg's rmse: 0.288804 + 0.000749364
| [0m 24      [0m | [0m-0.2888  [0m | [0m 0.8867  [0m | [0m 0.4704  [0m | [0m 2.201   [0m | [0m 4.571   [0m | [0m 8.677   [0m | [0m 4.34    [0m | [0m 0.04053 [0m | [0m 252.5   [0m |
[200]	cv_agg's rmse: 0.288317 + 0.000812715
| [0m 25      [0m | [0m-0.2883  [0m | [0m 0.6083  [0m | [0m 0.6778  [0m | [0m 1.635   [0m | [0m 3.534   [0m | [0m 8.874   [0m | [0m 4.048   [0m | [0m 0.06942 [0m | [0m 255.8   [0m |
[200]	cv_agg's rmse: 0.288806 + 0.000748297
| [0m 26      [0m | [0m-0.2888  [0m | [0m 0.1832  [0m | [0m 0.5419  [0m | [0m 0.3147  [0m | [0m 4.22    [0m | [0m 8.852   [0m | [0m 4.306   [0m | [0m 0.09907 [0m | [0m 255.3   [0m |
[200]	cv_agg's rm

[600]	cv_agg's rmse: 0.291499 + 0.000726043
[800]	cv_agg's rmse: 0.291288 + 0.000742642
| [0m 50      [0m | [0m-0.2913  [0m | [0m 0.2046  [0m | [0m 0.3337  [0m | [0m 2.836   [0m | [0m 4.554   [0m | [0m 5.794   [0m | [0m 4.541   [0m | [0m 0.03767 [0m | [0m 255.0   [0m |
[200]	cv_agg's rmse: 0.288313 + 0.00082481
| [0m 51      [0m | [0m-0.2883  [0m | [0m 0.8905  [0m | [0m 0.8169  [0m | [0m 0.8719  [0m | [0m 2.587   [0m | [0m 9.898   [0m | [0m 4.151   [0m | [0m 0.08818 [0m | [0m 187.6   [0m |
[200]	cv_agg's rmse: 0.289143 + 0.000767589
[400]	cv_agg's rmse: 0.288757 + 0.000793537
[600]	cv_agg's rmse: 0.288641 + 0.000795012
[800]	cv_agg's rmse: 0.288618 + 0.000789155
| [0m 52      [0m | [0m-0.2886  [0m | [0m 0.6467  [0m | [0m 0.6489  [0m | [0m 0.969   [0m | [0m 4.002   [0m | [0m 9.023   [0m | [0m 4.704   [0m | [0m 0.02001 [0m | [0m 32.86   [0m |
[200]	cv_agg's rmse: 0.288096 + 0.000792125
| [0m 53      [0m | [0m-0.2881  [0m | 

In [8]:
opt_params

{'target': -0.288050205397324,
 'params': {'bagging_fraction': 0.6446328438944265,
  'feature_fraction': 0.8794830403891444,
  'lambda_l1': 1.140321128532213,
  'lambda_l2': 4.07807989577541,
  'max_depth': 9.897178623175707,
  'min_child_weight': 63.71813635352462,
  'min_split_gain': 0.049940148983824366,
  'num_leaves': 170.7920933377821}}