In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', None)

In [2]:
# chris clean dataset
# Read the cleaned train/test CSVs, forcing 32-bit dtypes on the listed columns.
# Only the train file is given a dtype for `open_channels`.
df_train = pd.read_csv(
    '../input/train_clean.csv',
    dtype={
        'time': np.float32,
        'signal': np.float32,
        'open_channels': np.int32,
    },
)
df_test = pd.read_csv(
    '../input/test_clean.csv',
    dtype={
        'time': np.float32,
        'signal': np.float32,
    },
)


def add_wrapped_time(df, column, period):
    """Add ``df[column]``: ``df.time`` modulo ``period``, reported in (0, period].

    A time that is an exact multiple of ``period`` has remainder 0; it is
    stored as ``period`` instead, so that sample sits at the end of its
    window rather than at position 0.

    Args:
        df: DataFrame with a numeric ``time`` column; modified in place.
        column: Name of the column to create (or overwrite).
        period: Window length, in the same units as ``time``.
    """
    df[column] = df["time"] % period
    df.loc[df[column] == 0, column] = period


# Position of each sample inside its 50-unit window of `time`.
add_wrapped_time(df_train, "local_time", 50)
add_wrapped_time(df_test, "local_time", 50)

# Position inside a 10-unit window (test set only).
# The zero check must look at `mini_local_time` itself: `local_time` has
# already had its zeros replaced by 50, so testing it would never match and
# exact multiples of 10 would be left at 0 instead of 10.
add_wrapped_time(df_test, "mini_local_time", 10)

BATCH_SIZE = 500000
# Each batch is split into BATCH_SIZE // MINI_BATCH_SIZE = 5 mini-batches.
MINI_BATCH_SIZE = 100000


def add_batch_labels(df, batch_size=BATCH_SIZE, mini_batch_size=MINI_BATCH_SIZE):
    """Label every row with its 1-based ``batch`` and ``mini_batch`` number.

    Rows are numbered by position: rows ``0 .. batch_size - 1`` form batch 1,
    the next ``batch_size`` rows form batch 2, and so on. Inside each batch,
    consecutive runs of ``mini_batch_size`` rows are mini-batches 1, 2, ...

    The labels are computed with integer arithmetic on the row position
    rather than with ``.loc`` label slices: a ``.loc`` slice includes its end
    label, so slice-based labelling writes one row too many per assignment
    and is only correct when a later assignment overwrites that row.

    Args:
        df: DataFrame to label; ``batch`` and ``mini_batch`` columns are
            added (or overwritten) in place as integer columns.
        batch_size: Number of rows per batch.
        mini_batch_size: Number of rows per mini-batch.
    """
    row_number = np.arange(len(df))
    df["batch"] = row_number // batch_size + 1
    df["mini_batch"] = (row_number % batch_size) // mini_batch_size + 1


# train
add_batch_labels(df_train)
# test
add_batch_labels(df_test)

In [3]:
# channel 0 - batch 1
# Cap the signal of the batch-1 rows with open_channels == 0 at the 99.999th
# percentile of those same rows.
batch_1_channel_0_rows = (df_train.batch == 1) & (df_train.open_channels == 0)
# Explicit copy: the subset is modified below and must not alias df_train.
channel_0_batch_1 = df_train[batch_1_channel_0_rows].copy()
channel_0_batch_1_threshold = channel_0_batch_1.signal.quantile(0.99999)
# clip() replaces every value above the threshold with the threshold; the
# cast back keeps the column in the dtype it had before clipping.
channel_0_batch_1["signal"] = (
    channel_0_batch_1.signal
    .clip(upper=channel_0_batch_1_threshold)
    .astype(channel_0_batch_1.signal.dtype)
)
df_train.loc[batch_1_channel_0_rows, "signal"] = channel_0_batch_1.signal

In [4]:
# channel 0 - batch 2
# Cap the signal of the batch-2 rows with open_channels == 0 at the 99.999th
# percentile of those same rows.
batch_2_channel_0_rows = (df_train.batch == 2) & (df_train.open_channels == 0)
# Explicit copy: the subset is modified below and must not alias df_train.
channel_0_batch_2 = df_train[batch_2_channel_0_rows].copy()
channel_0_batch_2_threshold = channel_0_batch_2.signal.quantile(0.99999)
# clip() replaces every value above the threshold with the threshold; the
# cast back keeps the column in the dtype it had before clipping.
channel_0_batch_2["signal"] = (
    channel_0_batch_2.signal
    .clip(upper=channel_0_batch_2_threshold)
    .astype(channel_0_batch_2.signal.dtype)
)
df_train.loc[batch_2_channel_0_rows, "signal"] = channel_0_batch_2.signal

In [5]:
# delete the batch 8 samples
# Keep only the rows whose batch is not 8, then renumber the index from 0.
df_train = df_train[~df_train.batch.isin([8])].reset_index(drop=True)

In [6]:
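# Disabled (kept for reference): would add a constant offset to the signal of
# train batches 5 and 10 and of test batch 2, mini-batches 1 and 3.
# shift_val = 2.73
# df_train.loc[(df_train.batch.isin([5, 10])), "signal"] += shift_val
# df_test.loc[(df_test.batch.isin([2])) & (df_test.mini_batch.isin([1, 3])),
#             "signal"] += shift_val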

In [7]:
# Store the batch / mini-batch labels of both frames as 32-bit integers.
for frame in (df_train, df_test):
    for label_column in ("batch", "mini_batch"):
        frame[label_column] = frame[label_column].astype("int32")

In [8]:
# Print the (rows, columns) shape of both frames, then preview the train rows.
print(*(frame.shape for frame in (df_train, df_test)))
df_train.head(n=5)

(4500000, 6) (2000000, 6)


Unnamed: 0,time,signal,open_channels,local_time,batch,mini_batch
0,0.0001,-2.76,0,0.0001,1,1
1,0.0002,-2.8557,0,0.0002,1,1
2,0.0003,-2.4074,0,0.0003,1,1
3,0.0004,-3.1404,0,0.0004,1,1
4,0.0005,-3.1525,0,0.0005,1,1


In [9]:
# Preview the first five test rows.
df_test.head(n=5)

Unnamed: 0,time,signal,local_time,mini_local_time,batch,mini_batch
0,500.000092,-2.649832,9.2e-05,9.2e-05,1,1
1,500.000214,-2.849463,0.000214,0.000214,1,1
2,500.000305,-2.860094,0.000305,0.000305,1,1
3,500.000397,-2.435126,0.000397,0.000397,1,1
4,500.000488,-2.615657,0.000488,0.000488,1,1


In [10]:
# Disabled: would save the prepared frames as pickles under ../features/.
# df_train.to_pickle("../features/train_clean.pkl")
# df_test.to_pickle("../features/test_clean.pkl")