In [1]:
import numpy as np
import pandas as pd

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): both calls below install a catch-all "ignore" filter, so one of
# them is redundant; blanket suppression also hides pandas/sklearn deprecation
# notices that may matter for the cells further down.
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')
# Raise the DataFrame display limits so wide/long frames are not truncated.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 500)

from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the pre-cleaned train / test frames.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df_train = pd.read_pickle('../features/train_clean.pkl')
df_test = pd.read_pickle('../features/test_clean.pkl')

TARGET = "open_channels"

# Give the test frame a placeholder target column (all zeros).
df_test[TARGET] = 0

# Segment key of the form "<batch>_<mini_batch>".
test_batch_label = df_test["batch"].astype("str")
test_mini_batch_label = df_test["mini_batch"].astype("str")
df_test["group"] = test_batch_label + "_" + test_mini_batch_label

print(df_train.shape, df_test.shape)
df_test.head()

(4500000, 6) (2000000, 8)


Unnamed: 0,time,signal,local_time,mini_local_time,batch,mini_batch,open_channels,group
0,500.000092,-2.649832,9.2e-05,9.2e-05,1,1,0,1_1
1,500.000214,-2.849463,0.000214,0.000214,1,1,0,1_1
2,500.000305,-2.860094,0.000305,0.000305,1,1,0,1_1
3,500.000397,-2.435126,0.000397,0.000397,1,1,0,1_1
4,500.000488,-2.615657,0.000488,0.000488,1,1,0,1_1


In [3]:
# Keep only training batches 6 and 9, re-indexed from 0.
in_model_4_batches = df_train["batch"].isin([6, 9])
df_model_4 = df_train.loc[in_model_4_batches].reset_index(drop=True)

print(df_model_4.shape)
df_model_4.head()

(1000000, 6)


Unnamed: 0,time,signal,open_channels,local_time,batch,mini_batch
0,250.000107,2.8555,5,0.000107,6,1
1,250.000198,3.0907,5,0.000198,6,1
2,250.000305,3.5277,5,0.000305,6,1
3,250.000397,3.9822,5,0.000397,6,1
4,250.000504,3.3368,5,0.000504,6,1


In [4]:
# Keep only the test segments "1_3" and "2_2", re-indexed from 0.
in_model_4_groups = df_test["group"].isin(["1_3", "2_2"])
df_test_model_4 = df_test.loc[in_model_4_groups].reset_index(drop=True)

print(df_test_model_4.shape)
df_test_model_4.head()

(200000, 8)


Unnamed: 0,time,signal,local_time,mini_local_time,batch,mini_batch,open_channels,group
0,520.000122,3.5205,20.000122,0.000122,1,3,0,1_3
1,520.000183,3.0014,20.000183,0.000183,1,3,0,1_3
2,520.000305,3.223,20.000305,0.000305,1,3,0,1_3
3,520.000427,3.3751,20.000427,0.000427,1,3,0,1_3
4,520.000488,3.5275,20.000488,0.000488,1,3,0,1_3


In [5]:
# feature engineering here
def fe(df, is_train, n_shifts=10):
    """Add lagged and leading copies of ``signal`` as feature columns.

    For every ``k`` in ``1..n_shifts`` two columns are written, in this order:

    * ``shift+k`` -- the signal ``k`` rows earlier within the same segment
    * ``shift_k`` -- the signal ``k`` rows later within the same segment

    Rows whose neighbour would fall outside the segment are filled with ``0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``signal``, ``batch`` and ``mini_batch`` columns.
        It is modified in place: a ``group`` column and the shift columns
        are added (or overwritten if they already exist).
    is_train : bool or int
        Truthy -> shifts are computed within each ``batch``.
        Falsy  -> shifts are computed within each ``group``
        (``"<batch>_<mini_batch>"``).
    n_shifts : int, default 10
        Number of lag/lead steps to generate. The default reproduces the
        previously hard-coded window of 10.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    df["group"] = df["batch"].astype("str") + "_" + df["mini_batch"].astype("str")

    # Neither the grouping key nor the grouped signal depends on the shift
    # size, so build them once instead of re-grouping twice per iteration.
    group_on = "batch" if is_train else "group"
    grouped_signal = df.groupby([group_on])["signal"]

    # shift features (lag, lead, lag, lead, ... -- column order is part of the
    # model input, so it is kept exactly as before)
    for shift_val in range(1, n_shifts + 1):
        df[f'shift+{shift_val}'] = grouped_signal.shift(shift_val).fillna(0)
        df[f'shift_{shift_val}'] = grouped_signal.shift(-shift_val).fillna(0)

    return df


# Build the shift features. Training rows are shifted within each batch,
# test rows within each "<batch>_<mini_batch>" group (see `is_train` in fe).
# fe adds its columns to the frame it is given and returns that same frame.
df_model_4 = fe(df_model_4, is_train=1)
df_test_model_4 = fe(df_test_model_4, is_train=0)

In [6]:
# Sanity check after feature engineering: print both frame shapes and
# preview the first training rows with the new shift columns.
print(df_model_4.shape, df_test_model_4.shape)
df_model_4.head()

(1000000, 27) (200000, 28)


Unnamed: 0,time,signal,open_channels,local_time,batch,mini_batch,group,shift+1,shift_1,shift+2,shift_2,shift+3,shift_3,shift+4,shift_4,shift+5,shift_5,shift+6,shift_6,shift+7,shift_7,shift+8,shift_8,shift+9,shift_9,shift+10,shift_10
0,250.000107,2.8555,5,0.000107,6,1,6_1,0.0,3.0907,0.0,3.5277,0.0,3.9822,0.0,3.3368,0.0,3.4273,0.0,3.2246,0.0,2.8491,0.0,3.4026,0.0,3.7222,0.0,3.4091
1,250.000198,3.0907,5,0.000198,6,1,6_1,2.8555,3.5277,0.0,3.9822,0.0,3.3368,0.0,3.4273,0.0,3.2246,0.0,2.8491,0.0,3.4026,0.0,3.7222,0.0,3.4091,0.0,3.298
2,250.000305,3.5277,5,0.000305,6,1,6_1,3.0907,3.9822,2.8555,3.3368,0.0,3.4273,0.0,3.2246,0.0,2.8491,0.0,3.4026,0.0,3.7222,0.0,3.4091,0.0,3.298,0.0,3.1906
3,250.000397,3.9822,5,0.000397,6,1,6_1,3.5277,3.3368,3.0907,3.4273,2.8555,3.2246,0.0,2.8491,0.0,3.4026,0.0,3.7222,0.0,3.4091,0.0,3.298,0.0,3.1906,0.0,3.3192
4,250.000504,3.3368,5,0.000504,6,1,6_1,3.9822,3.4273,3.5277,3.2246,3.0907,2.8491,2.8555,3.4026,0.0,3.7222,0.0,3.4091,0.0,3.298,0.0,3.1906,0.0,3.3192,0.0,4.0096


In [7]:
# Columns that must never be fed to the model: time stamps / segment ids, the
# target itself, and the prediction columns that later cells write into
# df_model_4 ("oof" and "proba_*"). Excluding the prediction columns keeps this
# cell safe to re-run after training; otherwise the model's own out-of-fold
# outputs would silently become input features.
non_feature_cols = {
    "time", "local_time", "open_channels", "batch", "mini_batch", "group",
    "oof",
}
use_cols = [
    col for col in df_model_4.columns
    if col not in non_feature_cols and not col.startswith("proba_")
]
print("Used columns is", use_cols)

Used columns is ['signal', 'shift+1', 'shift_1', 'shift+2', 'shift_2', 'shift+3', 'shift_3', 'shift+4', 'shift_4', 'shift+5', 'shift_5', 'shift+6', 'shift_6', 'shift+7', 'shift_7', 'shift+8', 'shift_8', 'shift+9', 'shift_9', 'shift+10', 'shift_10']


In [8]:
# One output column per predicted class index 0..5.
proba_cols = ["proba_" + str(i) for i in range(6)]
# Initialise with a float so the columns are float64 from the start: they are
# later filled with predict_proba output, and writing floats into an int64
# column is a deprecated implicit upcast in recent pandas releases.
for col in proba_cols:
    df_model_4[col] = 0.0

In [9]:
# Grouped 5-fold cross-validation: rows sharing a "group" value never appear
# on both sides of a split. Out-of-fold class probabilities and labels are
# written back into df_model_4.
gkf = GroupKFold(n_splits=5)
df_model_4["oof"] = 0

for index, (tr_idx, val_idx) in enumerate(
        gkf.split(df_model_4[use_cols], df_model_4[TARGET],
                  df_model_4["group"])):
    # gkf.split yields positional indices; using them with .loc is only valid
    # because df_model_4 carries a default 0..n-1 index (reset_index above).
    # Select rows and columns in one .loc call so only the needed columns are
    # copied, instead of materialising every column of the fold first.
    X_train = df_model_4.loc[tr_idx, use_cols]
    y_train = df_model_4.loc[tr_idx, TARGET]
    X_val = df_model_4.loc[val_idx, use_cols]

    print(X_train.shape)
    print("Running folder", index , ": Evaluate on", np.unique(df_model_4.loc[val_idx, "group"]))
    clf = RandomForestClassifier(n_estimators=200,
                                 max_depth=19,
                                 max_features=10,
                                 random_state=42,
                                 n_jobs=-1,
                                 verbose=0)
    clf.fit(X_train, y_train)

    # Run the forest once. predict_proba columns follow clf.classes_, so map
    # each column to its class label instead of assuming positions 0..5.
    val_proba = clf.predict_proba(X_val)
    fold_proba_cols = ["proba_" + str(int(label)) for label in clf.classes_]
    df_model_4.loc[val_idx, fold_proba_cols] = val_proba
    # Same result as clf.predict(X_val): the class with the highest mean
    # probability, without a second pass over all trees.
    df_model_4.loc[val_idx, "oof"] = clf.classes_[val_proba.argmax(axis=1)]

(800000, 21)
Running folder 0 : Evaluate on ['6_5' '9_5']
(800000, 21)
Running folder 1 : Evaluate on ['6_4' '9_4']
(800000, 21)
Running folder 2 : Evaluate on ['6_3' '9_3']
(800000, 21)
Running folder 3 : Evaluate on ['6_2' '9_2']
(800000, 21)
Running folder 4 : Evaluate on ['6_1' '9_1']


In [10]:
# f1_score expects (y_true, y_pred): ground truth first, predictions second.
print("oof F1 score is", f1_score(df_model_4[TARGET], df_model_4["oof"], average = 'macro'))

oof F1 score is 0.9725057408759262


In [11]:
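# Experiment log -- presumably OOF macro-F1 by number of shift steps
# (hyper-parameters in parentheses where they were recorded):
# shift3:  0.9683
# shift5:  0.9708 (max_features=10), 0.9668
# shift10: 0.9715 (max_features=10), 0.8957, 0.9712 (max_features=16), 0.9717 (max_depth=19)
# shift15: 0.9708 (max_features=10), 0.9715
# shift20: 0.9508 (depth=10), 0.9698 (depth=20)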