In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.metrics import confusion_matrix, f1_score
from scipy.stats import norm

import warnings
warnings.filterwarnings("ignore")

In [2]:
# chris clean dataset
df_train = pd.read_csv("../input/train_clean.csv")
df_test = pd.read_csv("../input/test_clean.csv")

TARGET = "open_channels"
df_test[TARGET] = 0

df_train["local_time"] = df_train.time % 50
df_train.loc[df_train.local_time == 0.0000, "local_time"] = 50

df_test["local_time"] = df_test.time % 50
df_test.loc[df_test.local_time == 0.0000, "local_time"] = 50

df_test["mini_local_time"] = df_test.time % 10
df_test.loc[df_test.local_time == 0.0000, "mini_local_time"] = 10

BATCH_SIZE = 500000

# train
for batch_i in range(10):
    df_train.loc[BATCH_SIZE * batch_i:BATCH_SIZE * batch_i + 500000, 'batch'] = batch_i + 1

    df_train.loc[BATCH_SIZE * batch_i:BATCH_SIZE * batch_i + 100000, 'mini_batch'] = 1
    df_train.loc[BATCH_SIZE * batch_i + 100000:BATCH_SIZE * batch_i + 200000, 'mini_batch'] = 2
    df_train.loc[BATCH_SIZE * batch_i + 200000:BATCH_SIZE * batch_i + 300000, 'mini_batch'] = 3
    df_train.loc[BATCH_SIZE * batch_i + 300000:BATCH_SIZE * batch_i + 400000, 'mini_batch'] = 4
    df_train.loc[BATCH_SIZE * batch_i + 400000:BATCH_SIZE * batch_i + 500000, 'mini_batch'] = 5
# test
for batch_i in range(4):
    df_test.loc[BATCH_SIZE * batch_i:BATCH_SIZE * batch_i + 500000, 'batch'] = batch_i + 1

    df_test.loc[BATCH_SIZE * batch_i:BATCH_SIZE * batch_i + 100000, 'mini_batch'] = 1
    df_test.loc[BATCH_SIZE * batch_i + 100000:BATCH_SIZE * batch_i + 200000, 'mini_batch'] = 2
    df_test.loc[BATCH_SIZE * batch_i + 200000:BATCH_SIZE * batch_i + 300000, 'mini_batch'] = 3
    df_test.loc[BATCH_SIZE * batch_i + 300000:BATCH_SIZE * batch_i + 400000, 'mini_batch'] = 4
    df_test.loc[BATCH_SIZE * batch_i + 400000:BATCH_SIZE * batch_i + 500000, 'mini_batch'] = 5
    
df_train = df_train.drop(df_train[(df_train.batch.isin([8]))].index)


In [3]:
# channel 0 - batch 1
channel_0_batch_1 = df_train[(df_train.batch == 1) & (df_train.open_channels == 0)]
channel_0_batch_1_threshold = channel_0_batch_1.signal.quantile(0.99999)
channel_0_batch_1.loc[channel_0_batch_1.signal > channel_0_batch_1_threshold, "signal"] = channel_0_batch_1_threshold
df_train.loc[(df_train.batch == 1) & (df_train.open_channels == 0), "signal"] = channel_0_batch_1.signal

In [4]:
# channel 0 - batch 2
channel_0_batch_2 = df_train[(df_train.batch == 2) & (df_train.open_channels == 0)]
channel_0_batch_2_threshold = channel_0_batch_2.signal.quantile(0.99999)
channel_0_batch_2.loc[channel_0_batch_2.signal > channel_0_batch_2_threshold, "signal"] = channel_0_batch_2_threshold
df_train.loc[(df_train.batch == 2) & (df_train.open_channels == 0), "signal"] = channel_0_batch_2.signal

In [5]:
sub = pd.read_csv("../input/sample_submission.csv")
sub['time'] = [format(sub.time.values[x], '.4f') for x in range(2000000)]

print(df_train.shape, df_test.shape, sub.shape)
df_train.head()

(4500000, 6) (2000000, 7) (2000000, 2)


Unnamed: 0,time,signal,open_channels,local_time,batch,mini_batch
0,0.0001,-2.76,0,0.0001,1.0,1.0
1,0.0002,-2.8557,0,0.0002,1.0,1.0
2,0.0003,-2.4074,0,0.0003,1.0,1.0
3,0.0004,-3.1404,0,0.0004,1.0,1.0
4,0.0005,-3.1525,0,0.0005,1.0,1.0


In [6]:
color_list = [
    "b", "g", "r", "c", "m", "k", "y", '#0000FF', '#8A2BE2', '#A52A2A',
    '#DEB887', '#5F9EA0'
]

# drop useless features
drop_features = [
    "time",
    "open_channels",
    "local_time",
    "batch",
    "mini_batch",
]
all_features = [col for col in df_train.columns if col not in drop_features]

print("train/test shape is:", df_train.shape, df_test.shape)
print("features used # is", len(all_features))
df_train[all_features].head()

train/test shape is: (4500000, 6) (2000000, 7)
features used # is 1


Unnamed: 0,signal
0,-2.76
1,-2.8557
2,-2.4074
3,-3.1404
4,-3.1525


In [7]:
class ViterbiClassifier:
    def __init__(self):
        self._p_trans = None
        self._p_signal = None
    
    def fit(self, x, y):
        self._states = np.unique(y)
        self._n_states = len(self._states)
        
        self._p_trans = self.markov_p_trans(y)
        
        self._dists = []
        for s in np.arange(y.min(), y.max() + 1):
            self._dists.append((np.mean(x[y == s]), np.std(x[y == s])))
        
        return self
        
    def predict(self, x, p_signal=None, proba=False):
        if p_signal is None:
            p_signal = self.markov_p_signal(x)

        preds, probs = self.viterbi(self._p_trans, p_signal[self._states], x)
        
        if proba:
            return probs
        else:
            return preds
    
    def markov_p_signal(self, signal):
        p_signal = np.zeros((self._n_states, len(signal)))
        for k, dist in enumerate(self._dists):
            p_signal[k, :] = norm.pdf(signal, *dist)
            
        return p_signal
    
    def markov_p_trans(self, states):
        # https://www.kaggle.com/friedchips/the-viterbi-algorithm-a-complete-solution
        max_state = np.max(states)
        states_next = np.roll(states, -1)
        matrix = []
        for i in range(max_state + 1):
            current_row = np.histogram(states_next[states == i], bins=np.arange(max_state + 2))[0]
            if np.sum(current_row) == 0: # if a state doesn't appear in states...
                current_row = np.ones(max_state + 1) / (max_state + 1) # ...use uniform probability
            else:
                current_row = current_row / np.sum(current_row) # normalize to 1
            matrix.append(current_row)
        return np.array(matrix)
    
    def viterbi(self, p_trans, p_signal, signal):
        # https://www.kaggle.com/friedchips/the-viterbi-algorithm-a-complete-solution
        offset = 10**(-20) # added to values to avoid problems with log2(0)

        p_trans_tlog  = np.transpose(np.log2(p_trans  + offset)) # p_trans, logarithm + transposed
        p_signal_tlog = np.transpose(np.log2(p_signal + offset)) # p_signal, logarithm + transposed
        
        T1 = np.zeros(p_signal.shape)
        T2 = np.zeros(p_signal.shape)

        T1[:, 0] = p_signal_tlog[0, :]
        T2[:, 0] = 0

        for j in range(1, p_signal.shape[1]):
            for i in range(len(p_trans)):
                T1[i, j] = np.max(T1[:, j - 1] + p_trans_tlog[:, i] + p_signal_tlog[j, i])
                T2[i, j] = np.argmax(T1[:, j - 1] + p_trans_tlog[:, i] + p_signal_tlog[j, i])
        
        x = np.empty(p_signal.shape[1], 'B')
        x[-1] = np.argmax(T1[:, p_signal.shape[1] - 1])
        for i in reversed(range(1, p_signal.shape[1])):
            x[i - 1] = T2[x[i], i]
    
        return x, T1
    
class PosteriorDecoder:
    def __init__(self):
        self._p_trans = None
        self._p_signal = None
    
    def fit(self, x, y):
        self._states = np.unique(y)
        self._n_states = len(self._states)
        
        self._dists = []
        for s in np.arange(y.min(), y.max() + 1):
            self._dists.append((np.mean(x[y == s]), np.std(x[y == s])))
        
        self._p_trans = self.markov_p_trans(y)
        
        return self
        
    def predict(self, x, p_signal=None, proba=False):
        if p_signal is None:
            p_signal = self.markov_p_signal(x)
        preds = self.posterior_decoding(self._p_trans, p_signal[self._states])
        
        if proba:
            return probs
        else:
            return preds
    
    def markov_p_signal(self, signal):
        p_signal = np.zeros((self._n_states, len(signal)))
        for k, dist in enumerate(self._dists):
            p_signal[k, :] = norm.pdf(signal, *dist)
            
        return p_signal
    
    def markov_p_trans(self, states):
        # https://www.kaggle.com/friedchips/the-viterbi-algorithm-a-complete-solution
        max_state = np.max(states)
        states_next = np.roll(states, -1)
        matrix = []
        for i in range(max_state + 1):
            current_row = np.histogram(states_next[states == i], bins=np.arange(max_state + 2))[0]
            if np.sum(current_row) == 0: # if a state doesn't appear in states...
                current_row = np.ones(max_state + 1) / (max_state + 1) # ...use uniform probability
            else:
                current_row = current_row / np.sum(current_row) # normalize to 1
            matrix.append(current_row)
        return np.array(matrix)
    
    def forward(self, p_trans, p_signal):
        """Calculate the probability of being in state `k` at time `t`, 
           given all previous observations `x_1 ... x_t`"""
        T1 = np.zeros(p_signal.shape)
        T1[:, 0] = p_signal[:, 0]
        T1[:, 0] /= np.sum(T1[:, 0])

        for j in range(1, p_signal.shape[1]):
            for i in range(len(p_trans)):
                T1[i, j] = p_signal[i, j] * np.sum(T1[:, j - 1] * p_trans[i, :])
            T1[:, j] /= np.sum(T1[:, j])

        return T1

    def backward(self, p_trans, p_signal):
        """Calculate the probability of observing `x_{t + 1} ... x_n` if we 
           start in state `k` at time `t`."""
        T1 = np.zeros(p_signal.shape)
        T1[:, -1] = p_signal[:, -1]
        T1[:, -1] /= np.sum(T1[:, -1])

        for j in range(p_signal.shape[1] - 2, -1, -1):
            for i in range(len(p_trans)):
                T1[i, j] = np.sum(T1[:, j + 1] * p_trans[:, i] * p_signal[:, j + 1])
            T1[:, j] /= np.sum(T1[:, j])

        return T1
    
    def posterior_decoding(self, p_trans, p_signal):
        fwd = self.forward(p_trans, p_signal)
        bwd = self.backward(p_trans, p_signal)

        x = np.empty(p_signal.shape[1], 'B')
        for i in range(p_signal.shape[1]):
            x[i] = np.argmax(fwd[:, i] * bwd[:, i])

        return x
    

### model 5 - batch 5&10

In [8]:
# model 5: batch 5&10
BATCH_GROUP_5 = [5,10]
df_train_5 = df_train[df_train.batch.isin(BATCH_GROUP_5)]
df_train_5.loc[df_train_5.open_channels==0, "open_channels"] = 1

oof_pred = np.zeros(df_train_5.shape[0])

df_train_5["group"] = df_train_5["batch"].astype("str") + df_train_5["mini_batch"].astype("str")
df_train_5 = df_train_5.reset_index(drop=True)

In [9]:
group_kfold = GroupKFold(n_splits=10)
for train_index, valid_index in group_kfold.split(df_train_5, df_train_5[TARGET], df_train_5["group"]):
    valid_pred = pd.DataFrame()

    for col in df_train_5.loc[train_index]["group"].unique():

        tmp = df_train_5[df_train_5.group == col]

        viterbi = ViterbiClassifier().fit(tmp["signal"], tmp[TARGET] - 1)
        valid_pred[col] = viterbi.predict(df_train_5.loc[valid_index]["signal"])

#         print(col, "viterbi F1 is", f1_score(y_pred=valid_pred[col], y_true=df_train_5.loc[valid_index][TARGET] - 1, average='macro'))
    valid_pred["avg"] = (valid_pred.sum(axis=1)/9).astype("int")
    
    print("Valid viterbi F1 is", f1_score(y_pred=valid_pred["avg"], y_true=df_train_5.loc[valid_index][TARGET] - 1, average='macro'))

Valid viterbi F1 is 0.8581854159277574
Valid viterbi F1 is 0.8603880916175708
Valid viterbi F1 is 0.853435986000602
Valid viterbi F1 is 0.8560073081305587
Valid viterbi F1 is 0.8562576978136631
Valid viterbi F1 is 0.8504030989665043
Valid viterbi F1 is 0.8420419526835901
Valid viterbi F1 is 0.8584857603035709
Valid viterbi F1 is 0.85221220156568
Valid viterbi F1 is 0.8398913201530519
