In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os, gc, random
from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score
from viterbi_utils import *
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import tqdm

import numpy.fft as fft
from scipy import signal as scisig
from viterbi_utils import *
from utils_mini import *
from fast_macro_f1_func import *
import lightgbm as lgb

import warnings
# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')

# Let wide / long frames render without truncation.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 500

# Exported before any model code runs.
os.environ['TF_DETERMINISTIC_OPS'] = '1'

In [2]:
# Name of the label column used throughout the notebook.
TARGET = "open_channels"

# Load the pre-cleaned train / test frames.
# NOTE(review): read_pickle executes arbitrary code from the file — only load trusted pickles.
df_train = pd.read_pickle('../features/train_clean.pkl')
df_test = pd.read_pickle('../features/test_clean.pkl')

# The test frame gets a placeholder label column filled with 0.
df_test[TARGET] = 0

# "<batch>_<mini_batch>" key identifying each contiguous chunk of signal.
df_train["group"] = df_train["batch"].astype("str").str.cat(df_train["mini_batch"].astype("str"), sep="_")
df_test["group"] = df_test["batch"].astype("str").str.cat(df_test["mini_batch"].astype("str"), sep="_")

# Keep an untouched copy of the signal; "signal" itself is overwritten later on.
df_train["signal_original"] = df_train["signal"].copy()
df_test["signal_original"] = df_test["signal"].copy()

print(f"train size:{df_train.shape}, test size:{df_test.shape}")
df_train.head()

train size:(4500000, 8), test size:(2000000, 9)


Unnamed: 0,time,signal,open_channels,local_time,batch,mini_batch,group,signal_original
0,0.0001,-2.76,0,0.0001,1,1,1_1,-2.76
1,0.0002,-2.8557,0,0.0002,1,1,1_1,-2.8557
2,0.0003,-2.4074,0,0.0003,1,1,1_1,-2.4074
3,0.0004,-3.1404,0,0.0004,1,1,1_1,-3.1404
4,0.0005,-3.1525,0,0.0005,1,1,1_1,-3.1525


In [3]:
# Configuration and main hyperparameters.

# Label offset: targets are shifted down by `shft`, leaving `class_num` classes.
shft = 1
class_num = 11 - shft

# Neural-network settings (not referenced by the LightGBM cells shown here).
nn_epochs = 100
nn_batch_size = 16
LR = 0.001

In [4]:
# Collapse the lowest labels of batches 5 & 10 so 11 classes become 10:
# every label below `shft` in those batches is raised to `shft`, then all labels
# are shifted down by `shft`.
# NOTE: this cell is not idempotent — re-running it subtracts `shft` again.
below_shift = df_train.batch.isin([5, 10]) & (df_train.open_channels < shft)
df_train.loc[below_shift, "open_channels"] = shft
df_train[TARGET] = df_train[TARGET].sub(shft)

# Mini model: train only on batches 5 and 10 ...
BATCH_GROUP = [5, 10]
keep_train = df_train.batch.isin(BATCH_GROUP)
df_train = df_train.loc[keep_train].reset_index(drop=True)

# ... and keep only the matching test groups.
TEST_GROUP = ["2_1", "2_3"]
keep_test = df_test.group.isin(TEST_GROUP)
df_test = df_test.loc[keep_test].reset_index(drop=True)

print(f"train size:{df_train.shape}, test size:{df_test.shape}")

train size:(1000000, 8), test size:(200000, 9)


In [5]:
# Values returned by get_mean for the current (not yet denoised) training signal.
sig_mean = get_mean(df_train)
print(np.round(np.array(sig_mean), 3))
# Reference values kept from an earlier run:
# sig_mean = [-4.255, -3.017, -1.779, -0.541, 0.697, 1.935, 3.173, 4.411, 5.649, 6.887]

[-4.218 -2.969 -1.768 -0.526  0.71   1.941  3.175  4.409  5.641  6.875]


In [6]:
# Remove the 50 Hz noise with a band-stop filter, one group at a time.
# NOTE(review): only df_train is filtered in this cell; confirm whether df_test
# needs the same treatment.
for group_i in df_train.group.unique():
    in_group = df_train.group.isin([group_i])
    # rm_noise receives the rows of this group plus the current per-class means.
    df_train.loc[in_group, "signal"] = rm_noise(df_train[in_group], sig_mean=sig_mean)

In [7]:
# Recompute get_mean now that "signal" has been overwritten with the filtered signal.
sig_mean = get_mean(df_train)
print(np.round(np.array(sig_mean), 3))

[-4.212 -2.959 -1.766 -0.523  0.71   1.941  3.175  4.409  5.641  6.875]


In [8]:
# feature engineering here
def fe(df, is_train):
    """Add lag / lead copies of ``signal`` to ``df`` and return it.

    For every offset k in 1..5 two columns are written:
    ``shift+k`` holds the signal k rows earlier and ``shift_k`` the signal
    k rows later, both computed within each ``group`` so values never leak
    across group boundaries. Positions without a neighbour are filled with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``group`` and ``signal`` columns. Modified in place.
    is_train : int or bool
        Accepted for interface compatibility; not used by the body.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ten shift columns added.
    """
    # Build the grouped view of the signal once and reuse it for every offset.
    signal_by_group = df.groupby(["group"])['signal']

    for offset in range(1, 6):
        lagged = signal_by_group.shift(offset).fillna(0)
        leading = signal_by_group.shift(-offset).fillna(0)
        df[f'shift+{offset}'] = lagged
        df[f'shift_{offset}'] = leading

    return df

# Add the shift features to both frames.
df_train = fe(df_train, is_train=1)
df_test = fe(df_test, is_train=0)

# Everything that is not an id / label / bookkeeping column is a model feature.
non_feature_cols = ["time", "local_time", "open_channels", "batch", "mini_batch", "group", "oof", "signal_original"]
use_cols = []
for col in df_train.columns:
    if col in non_feature_cols:
        continue
    use_cols.append(col)

print("Used columns is", use_cols)
df_train.head()

Used columns is ['signal', 'shift+1', 'shift_1', 'shift+2', 'shift_2', 'shift+3', 'shift_3', 'shift+4', 'shift_4', 'shift+5', 'shift_5']


Unnamed: 0,time,signal,open_channels,local_time,batch,mini_batch,group,signal_original,shift+1,shift_1,shift+2,shift_2,shift+3,shift_3,shift+4,shift_4,shift+5,shift_5
0,200.000107,7.1843,9,0.000107,5,1,5_1,7.1191,0.0,6.836219,0.0,6.992574,0.0,7.418564,0.0,7.48349,0.0,7.724653
1,200.000198,6.836219,9,0.000198,5,1,5_1,6.7712,7.1843,6.992574,0.0,7.418564,0.0,7.48349,0.0,7.724653,0.0,6.612053
2,200.000305,6.992574,9,0.000305,5,1,5_1,6.9278,6.836219,7.418564,7.1843,7.48349,0.0,7.724653,0.0,6.612053,0.0,6.16149
3,200.000397,7.418564,9,0.000397,5,1,5_1,7.3541,6.992574,7.48349,6.836219,7.724653,7.1843,6.612053,0.0,6.16149,0.0,6.702865
4,200.000504,7.48349,9,0.000504,5,1,5_1,7.4194,7.418564,7.724653,6.992574,6.612053,6.836219,6.16149,7.1843,6.702865,0.0,6.819578


In [9]:
# Seed passed to LightGBM.
rng = 42

# LightGBM regression settings for the batch 5 / 10 model.
# NOTE(review): per the LightGBM docs, `subsample` (bagging_fraction) only takes
# effect when `subsample_freq` > 0, which is not set here — confirm whether
# bagging was intended.
param_5 = dict(
    objective="regression",
    metric='rmse',
    num_leaves=128,
    learning_rate=0.1,
    n_estimators=5000,
    subsample=0.65,
    feature_fraction=1,
    lambda_l1=0.85,
    lambda_l2=0.78,
    max_depth=8,
    min_child_weight=0.06,
    min_split_gain=0.06,
    seed=rng,
    n_jobs=-1,
)

# Batches this model is trained and scored on.
BATCH_GROUP_5 = [5, 10]

# 5-fold CV that keeps every "group" entirely inside one fold.
gkf = GroupKFold(n_splits=5)

# Feature list fed to the model (same list object as use_cols).
all_features = use_cols

In [10]:
# Fold grouping key, one entry per training row.
groups = df_train["group"].values
# Out-of-fold predictions, filled fold by fold in the training loop.
oof_pred = np.zeros(len(df_train))
# Offset subtracted from labels and predictions when computing the F1 score.
adjust_num = 1

In [11]:
# Grouped cross-validation of the LightGBM regressor, followed by threshold
# optimisation of the rounded predictions and a feature-importance plot.

# Per-fold feature importances: one column per fold, indexed by feature name.
# (Previously never created, so the plot at the end raised NameError.)
fi_df = pd.DataFrame(index=all_features)

# Number of folds, used to average the test predictions.
n_folds = gkf.get_n_splits()

# Rows of df_train this model is scored on.
train_mask = df_train.batch.isin(BATCH_GROUP_5)

# Rows of df_test this model predicts: batch 2, mini-batches 1 and 3.
test_masks = [(df_test.batch == 2) & (df_test.mini_batch == mini_batch_i) for mini_batch_i in (1, 3)]

# Fold-averaged raw test predictions. They are kept in a separate float Series
# so the integer placeholder column in df_test is only written once, after rounding.
test_pred = pd.Series(0.0, index=df_test.index)

for fold, (train_index, valid_index) in enumerate(gkf.split(df_train, df_train[TARGET], groups)):

    print("Traning on folder", fold, "...")
    X_train, X_valid = df_train.iloc[train_index][all_features], df_train.iloc[valid_index][all_features]
    y_train, y_valid = df_train.iloc[train_index][TARGET], df_train.iloc[valid_index][TARGET]

    model_lgb = lgb.LGBMRegressor(**param_5)

    # NOTE(review): `early_stopping_rounds` / `verbose` are fit() keywords of
    # older LightGBM releases; newer releases expect
    # callbacks=[lgb.early_stopping(200), lgb.log_evaluation(200)] instead.
    model_lgb.fit(X_train, y_train,
                    eval_set=[(X_train, y_train), (X_valid, y_valid)],
                    early_stopping_rounds=200,
                    verbose=200)
    fi_df[f'importance_folder_{fold}'] = model_lgb.feature_importances_

    y_pred_valid = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration_)
    oof_pred[valid_index] = y_pred_valid

    # Accumulate this fold's share of the averaged test prediction.
    for test_mask in test_masks:
        test_pred.loc[test_mask] += model_lgb.predict(
            df_test.loc[test_mask, all_features],
            num_iteration=model_lgb.best_iteration_,
        ) / n_folds

    # NOTE(review): TARGET was already shifted by `shft` in an earlier cell;
    # confirm the extra `adjust_num` offset is intended for the metric.
    print("--- Vilid F1 score is", np.round(macro_f1_score_nb(df_train.iloc[valid_index][TARGET].astype(np.int32).values - adjust_num, y_pred_valid.round().astype(np.int32) - adjust_num, 11 - adjust_num), 4))
    print("")

# optimize the round prediction
df_train.loc[train_mask, "oof"] = oof_pred
optRf = OptimizedRounderF1_model5()
optRf.fit(df_train.loc[train_mask, "oof"], df_train.loc[train_mask, TARGET])
coefficientsf = optRf.coefficients()
print(coefficientsf)
df_train.loc[train_mask, "oof"] = optRf.predict(df_train.loc[train_mask, "oof"], coefficientsf)

# Round the averaged test predictions with the thresholds fitted on the OOF data.
# (Previously this rounded the all-zero placeholder column.)
for test_mask in test_masks:
    df_test.loc[test_mask, TARGET] = optRf.predict(test_pred.loc[test_mask], coefficientsf)

# print("-------------------------------------")

print("Overall F1 score is", np.round(macro_f1_score_nb(df_train.loc[train_mask, TARGET].astype(np.int32).values - adjust_num, df_train.loc[train_mask, "oof"].astype(np.int32).values - adjust_num, 11- adjust_num), 5))

# Total importance across folds; plot the 30 largest.
fi_df['importance'] = fi_df.sum(axis=1)
fi_df.sort_values('importance')['importance'][-30:].plot(
    kind='barh',
    figsize=(12, 5),
    title='- Feature Importance',
)
plt.show()

# 0.875 (9 classes)

Traning on folder 0 ...
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 0.276117	valid_1's rmse: 0.280589
Early stopping, best iteration is:
[63]	training's rmse: 0.277218	valid_1's rmse: 0.28052
--- Vilid F1 score is 0.744

Traning on folder 1 ...
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 0.275916	valid_1's rmse: 0.280405
Early stopping, best iteration is:
[66]	training's rmse: 0.277188	valid_1's rmse: 0.280334
--- Vilid F1 score is 0.7474

Traning on folder 2 ...
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 0.275644	valid_1's rmse: 0.281986
Early stopping, best iteration is:
[62]	training's rmse: 0.276874	valid_1's rmse: 0.281925
--- Vilid F1 score is 0.7426

Traning on folder 3 ...
Training until validation scores don't improve for 200 rounds


KeyboardInterrupt: 

In [15]:
# Distinct class values the last fold's rounded validation predictions take.
np.unique(np.round(y_pred_valid))

array([1., 2., 3., 4., 5., 6., 7., 8., 9.])