In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import sys, gc, warnings, random, math, time, datetime, os
from tqdm import tqdm_notebook
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

import lightgbm as lgb
from bayes_opt import BayesianOptimization

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [2]:
# chris clean dataset
df_train = pd.read_csv("../input/train_clean.csv")
df_test = pd.read_csv("../input/test_clean.csv")

df_train["local_time"] = df_train.time % 50
df_train.loc[df_train.local_time == 0.0000, "local_time"] = 50

BATCH_SIZE = 500000

# train
for batch_i in range(10):
    df_train.loc[BATCH_SIZE * batch_i:BATCH_SIZE * batch_i + 500000, 'batch'] = batch_i + 1

    df_train.loc[BATCH_SIZE * batch_i:BATCH_SIZE * batch_i + 100000, 'mini_batch'] = 1
    df_train.loc[BATCH_SIZE * batch_i + 100000:BATCH_SIZE * batch_i + 200000, 'mini_batch'] = 2
    df_train.loc[BATCH_SIZE * batch_i + 200000:BATCH_SIZE * batch_i + 300000, 'mini_batch'] = 3
    df_train.loc[BATCH_SIZE * batch_i + 300000:BATCH_SIZE * batch_i + 400000, 'mini_batch'] = 4
    df_train.loc[BATCH_SIZE * batch_i + 400000:BATCH_SIZE * batch_i + 500000, 'mini_batch'] = 5

df_train = df_train.drop(df_train[(df_train.batch.isin([8]))].index)
df_train.loc[(df_train.batch.isin([5,10])), "signal"] += 2.726

In [3]:
# df_train = pd.read_pickle('../features/train_v2.pkl')
TARGET = "open_channels"

print(df_train.shape)
df_train.head()

(4500000, 6)


Unnamed: 0,time,signal,open_channels,local_time,batch,mini_batch
0,0.0001,-2.76,0,0.0001,1.0,1.0
1,0.0002,-2.8557,0,0.0002,1.0,1.0
2,0.0003,-2.4074,0,0.0003,1.0,1.0
3,0.0004,-3.1404,0,0.0004,1.0,1.0
4,0.0005,-3.1525,0,0.0005,1.0,1.0


In [4]:
def fe(df):
    
    for shift_val in range(1, 6):
    
        df[f'shift+{shift_val}'] = df.groupby(['batch']).shift(shift_val)['signal']
        df[f'shift_{shift_val}'] = df.groupby(['batch']).shift(-shift_val)['signal']
        
        df[f'diff+{shift_val}'] = df[f'shift+{shift_val}'] - df['signal']
        df[f'diff_{shift_val}'] = df[f'shift_{shift_val}'] - df['signal']
        
        df[f'add+{shift_val}'] = df[f'shift+{shift_val}'] + df['signal']
        df[f'add_{shift_val}'] = df[f'shift_{shift_val}'] + df['signal']
        
        df[f'mul+{shift_val}'] = df[f'shift+{shift_val}'] * df['signal']
        df[f'mul_{shift_val}'] = df[f'shift_{shift_val}'] * df['signal']

    return df

print("feature engineering on training data ...")
df_train = fe(df_train)

df_train.dropna(inplace=True)

feature engineering on training data ...


In [5]:
df_train = df_train.drop(df_train[(df_train.batch.isin([5, 10]))
                                  & (df_train.open_channels.isin([0]))].index)

BATCH_GROUP = [6, 9]
df_train = df_train[df_train.batch.isin(BATCH_GROUP)]

print(df_train.shape)

(999980, 46)


In [6]:
color_list = [
    "b", "g", "r", "c", "m", "k", "y", '#0000FF', '#8A2BE2', '#A52A2A',
    '#DEB887', '#5F9EA0'
]

# drop useless features
drop_features = [
    "time",
    "open_channels",
    "local_time",
    "batch",
    "mini_batch",
    "oof",
]
all_features = [col for col in df_train.columns if col not in drop_features]

print("train/test shape is:", df_train.shape)
print("features used # is", len(all_features))
df_train[all_features].head()
df_train.head()

train/test shape is: (999980, 46)
features used # is 41


Unnamed: 0,time,signal,open_channels,local_time,batch,mini_batch,shift+1,shift_1,diff+1,diff_1,add+1,add_1,mul+1,mul_1,shift+2,shift_2,diff+2,diff_2,add+2,add_2,mul+2,mul_2,shift+3,shift_3,diff+3,diff_3,add+3,add_3,mul+3,mul_3,shift+4,shift_4,diff+4,diff_4,add+4,add_4,mul+4,mul_4,shift+5,shift_5,diff+5,diff_5,add+5,add_5,mul+5,mul_5
2500005,250.0006,3.4273,5,0.0006,6.0,1.0,3.3368,3.2246,-0.0905,-0.2027,6.7641,6.6519,11.436215,11.051672,3.9822,2.8491,0.5549,-0.5782,7.4095,6.2764,13.648194,9.76472,3.5277,3.4026,0.1004,-0.0247,6.955,6.8299,12.090486,11.661731,3.0907,3.7222,-0.3366,0.2949,6.518,7.1495,10.592756,12.757096,2.8555,3.4091,-0.5718,-0.0182,6.2828,6.8364,9.786655,11.684008
2500006,250.0007,3.2246,5,0.0007,6.0,1.0,3.4273,2.8491,0.2027,-0.3755,6.6519,6.0737,11.051672,9.187208,3.3368,3.4026,0.1122,0.178,6.5614,6.6272,10.759845,10.972024,3.9822,3.7222,0.7576,0.4976,7.2068,6.9468,12.841002,12.002606,3.5277,3.4091,0.3031,0.1845,6.7523,6.6337,11.375421,10.992984,3.0907,3.298,-0.1339,0.0734,6.3153,6.5226,9.966271,10.634731
2500007,250.0008,2.8491,5,0.0008,6.0,1.0,3.2246,3.4026,0.3755,0.5535,6.0737,6.2517,9.187208,9.694348,3.4273,3.7222,0.5782,0.8731,6.2764,6.5713,9.76472,10.60492,3.3368,3.4091,0.4877,0.56,6.1859,6.2582,9.506877,9.712867,3.9822,3.298,1.1331,0.4489,6.8313,6.1471,11.345686,9.396332,3.5277,3.1906,0.6786,0.3415,6.3768,6.0397,10.05077,9.090338
2500008,250.0009,3.4026,5,0.0009,6.0,1.0,2.8491,3.7222,-0.5535,0.3196,6.2517,7.1248,9.694348,12.665158,3.2246,3.4091,-0.178,0.0065,6.6272,6.8117,10.972024,11.599804,3.4273,3.298,0.0247,-0.1046,6.8299,6.7006,11.661731,11.221775,3.3368,3.1906,-0.0658,-0.212,6.7394,6.5932,11.353796,10.856336,3.9822,3.3192,0.5796,-0.0834,7.3848,6.7218,13.549834,11.29391
2500009,250.001,3.7222,5,0.001,6.0,1.0,3.4026,3.4091,-0.3196,-0.3131,7.1248,7.1313,12.665158,12.689352,2.8491,3.298,-0.8731,-0.4242,6.5713,7.0202,10.60492,12.275816,3.2246,3.1906,-0.4976,-0.5316,6.9468,6.9128,12.002606,11.876051,3.4273,3.3192,-0.2949,-0.403,7.1495,7.0414,12.757096,12.354726,3.3368,4.0096,-0.3854,0.2874,7.059,7.7318,12.420237,14.924533


In [7]:
def bayes_parameter_opt_lgb(X,
                            y,
                            init_round=15, # how many steps of random exploration
                            opt_round=25, # how many steps of bayes optimization
                            n_folds=5,
                            random_seed=6,
                            n_estimators=10000,
                            learning_rate=0.05,
                            output_process=False):
    # prepare data
    train_data = lgb.Dataset(data=X,
                             label=y,
                             params={'verbose': -1},
                             free_raw_data=False)

    # parameters
    def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth,
                 lambda_l1, lambda_l2, min_split_gain, min_child_weight):
        params = {
            'application': 'regression',
            'num_iterations': n_estimators,
            'learning_rate': learning_rate,
            'early_stopping_round': 200,
            'metric': 'rmse'
        }
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['lambda_l1'] = max(lambda_l1, 0)
        params['lambda_l2'] = max(lambda_l2, 0)
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight

        # modify here
        cv_result = lgb.cv(params,
                           train_data,
                           nfold=n_folds,
                           seed=random_seed,
                           stratified=True,
                           verbose_eval=200,
                           metrics=['rmse'])
        
        return -min(cv_result['rmse-mean'])

    # range
    lgbBO = BayesianOptimization(lgb_eval, {
        'num_leaves': (64, 512),
        'feature_fraction': (0.9, 1),
        'bagging_fraction': (0.5, 0.9),
        'max_depth': (6, 10),
        'lambda_l1': (0.1, 1),
        'lambda_l2': (0.1, 1),
        'min_split_gain': (0.01, 1),
        'min_child_weight': (0.001, 0.1)
    },
                                 random_state=42)
    # optimize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)

    # output optimization process
    #     if output_process==True: lgbBO.points_to_csv("bayes_opt_result.csv")

    # return best parameters
    return lgbBO.max

In [8]:
X = df_train[all_features]
y = df_train[TARGET]

opt_params = bayes_parameter_opt_lgb(X,
                                     y,
                                     init_round=5, # how many steps of random exploration
                                     opt_round=45, # how many steps of bayes optimization
                                     n_folds=3,
                                     random_seed=42,
                                     n_estimators=3000,
                                     learning_rate=0.1)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------------------
[200]	cv_agg's rmse: 0.131124 + 0.000286376
| [0m 1       [0m | [0m-0.1311  [0m | [0m 0.6498  [0m | [0m 0.9951  [0m | [0m 0.7588  [0m | [0m 0.6388  [0m | [0m 6.624   [0m | [0m 0.01644 [0m | [0m 0.0675  [0m | [0m 452.0   [0m |
[200]	cv_agg's rmse: 0.131061 + 0.000344998
| [95m 2       [0m | [95m-0.1311  [0m | [95m 0.7404  [0m | [95m 0.9708  [0m | [95m 0.1185  [0m | [95m 0.9729  [0m | [95m 9.33    [0m | [95m 0.02202 [0m | [95m 0.19    [0m | [95m 146.2   [0m |
[200]	cv_agg's rmse: 0.131444 + 0.000310184
| [0m 3       [0m | [0m-0.1314  [0m | [0m 0.6217  [0m | [0m 0.9525  [0m | [0m 0.4888  [0m | [0m 0.3621  [0m | [0m 8.447   [0m | [0m 0.01481 [0m | [0m 0.2992  [0m | [0m 228.1   [0m |
[200]	cv

[200]	cv_agg's rmse: 0.131208 + 0.000309423
| [0m 34      [0m | [0m-0.1312  [0m | [0m 0.8245  [0m | [0m 0.9467  [0m | [0m 0.4692  [0m | [0m 0.2567  [0m | [0m 9.742   [0m | [0m 0.03996 [0m | [0m 0.5275  [0m | [0m 65.78   [0m |
[200]	cv_agg's rmse: 0.131052 + 0.000312903
| [0m 35      [0m | [0m-0.1311  [0m | [0m 0.5355  [0m | [0m 0.902   [0m | [0m 0.4165  [0m | [0m 0.2832  [0m | [0m 9.338   [0m | [0m 0.04215 [0m | [0m 0.1986  [0m | [0m 64.5    [0m |
[200]	cv_agg's rmse: 0.130934 + 0.000326603
| [0m 36      [0m | [0m-0.1309  [0m | [0m 0.6527  [0m | [0m 0.9458  [0m | [0m 0.5996  [0m | [0m 0.2258  [0m | [0m 9.617   [0m | [0m 0.05128 [0m | [0m 0.1712  [0m | [0m 66.19   [0m |
[200]	cv_agg's rmse: 0.131532 + 0.000293041
| [0m 37      [0m | [0m-0.1315  [0m | [0m 0.7005  [0m | [0m 0.9847  [0m | [0m 0.5788  [0m | [0m 0.3368  [0m | [0m 7.64    [0m | [0m 0.03613 [0m | [0m 0.7903  [0m | [0m 65.1    [0m |
[200]	cv_agg's r

In [9]:
opt_params

{'target': -0.13089023806253963,
 'params': {'bagging_fraction': 0.8865613692803884,
  'feature_fraction': 0.9065201597280866,
  'lambda_l1': 0.4954355237596384,
  'lambda_l2': 0.6885879640923933,
  'max_depth': 9.114786058663146,
  'min_child_weight': 0.01793695260644467,
  'min_split_gain': 0.07283399213777113,
  'num_leaves': 64.49133271381203}}

In [10]:
# {'target': -0.288050205397324,
#  'params': {'bagging_fraction': 0.6446328438944265,
#   'feature_fraction': 0.8794830403891444,
#   'lambda_l1': 1.140321128532213,
#   'lambda_l2': 4.07807989577541,
#   'max_depth': 9.897178623175707,
#   'min_child_weight': 63.71813635352462,
#   'min_split_gain': 0.049940148983824366,
#   'num_leaves': 170.7920933377821}}