In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, time
pd.set_option('display.max_columns', 500)
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold, GroupKFold

from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorch_toolbelt import losses as L
from nn_utils import *
from feature_engineering import *
from network import *

LOG_DATE = "04_17_"
NOTE = "cnn_rnn_baseline"
FILE_NAME = LOG_DATE + NOTE
FILE = "./logs/" + FILE_NAME + ".log"
if not os.path.exists(FILE):
    os.mknod(FILE)

logger = get_logger(FILE)
set_seeds(42)

df_train_raw = pd.read_pickle('../features/train_clean.pkl')
df_test_raw = pd.read_pickle('../features/test_clean.pkl')
TARGET = "open_channels"
df_test_raw[TARGET] = 0

print("Raw feature shape is", df_train_raw.shape, df_test_raw.shape)

Raw feature shape is (4500000, 6) (2000000, 7)


In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold, GroupKFold

from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorch_toolbelt import losses as L
from nn_utils import *
from feature_engineering import *
from network import *
from wavenet import *

LOG_DATE = "04_25_"
NOTE = "crnn_wavelet_1feature"
FILE_NAME = LOG_DATE + NOTE
FILE = "./logs/" + FILE_NAME + ".log"
if not os.path.exists(FILE):
    os.mknod(FILE)

logger = get_logger(FILE)
set_seeds(42)

df_train_raw = pd.read_pickle('../features/train_clean.pkl')
df_test_raw = pd.read_pickle('../features/test_clean.pkl')
TARGET = "open_channels"
df_test_raw[TARGET] = 0

print("Raw feature shape is", df_train_raw.shape, df_test_raw.shape)

# RFC features
Y_train_proba = np.load("../features/Y_train_proba.npy")
Y_test_proba = np.load("../features/Y_test_proba.npy")
Y_train_proba = np.delete(Y_train_proba, list(range(3500000,4000000)), 0)

for i in range(11):
    df_train_raw[f"proba_{i}"] = Y_train_proba[:, i]
    df_test_raw[f"proba_{i}"] = Y_test_proba[:, i]

Raw feature shape is (4500000, 6) (2000000, 7)


ValueError: Length of values does not match length of index

In [36]:
Y_test_proba.shape

(2000000, 11)

In [2]:
# # feature engineering
# df_train_raw = fe(df_train_raw, 1) # 1 is train
# df_test_raw = fe(df_test_raw, 0) # 0 is test

# print(df_train_raw.shape, df_test_raw.shape)

In [3]:
use_cols = [
    col for col in df_train_raw.columns if col not in
    ["time", "local_time", "open_channels", "batch", "mini_batch"]
]
print("feature # is", len(use_cols))
print("Used columns is", use_cols)

feature # is 1
Used columns is ['signal']


In [4]:
print(df_train_raw.signal.min(), df_test_raw.signal.min())
print(df_train_raw.signal.max(), df_test_raw.signal.max())

-3.9107 -4.032600849280485
11.3431 11.276


In [5]:
NUM_BINS = 200
signal_bins = np.linspace(-3.8, 11.2, NUM_BINS - 1).reshape([-1])

signal_dig = np.digitize(df_train_raw[use_cols].values.reshape([-1]), bins=signal_bins)
signal_dig_test = np.digitize(df_test_raw[use_cols].values.reshape([-1]), bins=signal_bins)

df_signal_dig = pd.get_dummies(signal_dig)
df_signal_dig_test = pd.get_dummies(signal_dig_test)
print("digital shape", df_signal_dig.shape, df_signal_dig_test.shape)
df_train_raw = pd.concat([df_train_raw, df_signal_dig], axis=1)
df_test_raw = pd.concat([df_test_raw, df_signal_dig_test], axis=1)


digital shape (4500000, 200) (2000000, 200)


In [6]:
df_train_raw.head(1)

Unnamed: 0,time,signal,open_channels,local_time,batch,mini_batch,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
0,0.0001,-2.76,0,0.0001,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
df_test_raw.head(1)

Unnamed: 0,time,signal,local_time,mini_local_time,batch,mini_batch,open_channels,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
0,500.0001,-2.649831,0.0001,0.0001,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
use_cols = [
    col for col in df_train_raw.columns if col not in
    ["time", "local_time", "open_channels", "batch", "mini_batch", "signal"]
]
print("Used columns is", len(use_cols))
print(use_cols)

Used columns is 200
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199]


In [9]:

def chop_seq(df_batch_i):

    df_batch_i_features = []
    df_batch_i_y = []

    for i in range(200):

        # (2500, 5)
        tmp = df_batch_i[(2500 * i):(2500 * (i + 1))]
        df_batch_i_features.append(tmp[use_cols].values)
        df_batch_i_y.append(tmp[TARGET].values)

    return df_batch_i_features, df_batch_i_y

# TRAIN
df_train = []
df_train_y = []

for batch_i in [1, 2, 3, 4, 5, 6, 7, 9, 10]:
    df_batch_i = df_train_raw[df_train_raw.batch == batch_i]
    df_batch_i_features, df_batch_i_y = chop_seq(df_batch_i)
    df_train.append(df_batch_i_features)
    df_train_y.append(df_batch_i_y)

df_train = np.array(df_train).reshape([-1, 2500, np.array(df_train).shape[-1]]).transpose([0, 2, 1])
df_train_y = np.array(df_train_y).reshape([-1, 2500])

print("TRAIN:", df_train.shape, df_train_y.shape)

# TEST
df_test = []
df_test_y = []

for batch_i in [1, 2, 3, 4]:
    df_batch_i = df_test_raw[df_test_raw.batch == batch_i]
    df_batch_i_features, df_batch_i_y = chop_seq(df_batch_i)
    df_test.append(df_batch_i_features)
    df_test_y.append(df_batch_i_y)

df_test = np.array(df_test).reshape([-1, 2500, np.array(df_test).shape[-1]]).transpose([0, 2, 1])
df_test_y = np.array(df_test_y).reshape([-1, 2500])

print("TEST:", df_test.shape, df_test_y.shape)

TRAIN: (1800, 200, 2500) (1800, 2500)
TEST: (800, 200, 2500) (800, 2500)


In [10]:
df_train_raw.head()

Unnamed: 0,time,signal,open_channels,local_time,batch,mini_batch,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
0,0.0001,-2.76,0,0.0001,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0.0002,-2.8557,0,0.0002,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.0003,-2.4074,0,0.0003,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0.0004,-3.1404,0,0.0004,1.0,1.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0.0005,-3.1525,0,0.0005,1.0,1.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
