In [1]:
import os
import pickle
import warnings
warnings.filterwarnings('ignore')
import gc

from collections import defaultdict

import numpy as np 
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import signal

pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_rows', 10000)

from sklearn.linear_model import LinearRegression
from scipy.signal import hilbert
from scipy.signal import hann
from scipy.signal import convolve
from scipy.signal import welch, find_peaks
from scipy import stats
from scipy.special import entr
from scipy.stats import entropy
from scipy.stats import percentileofscore
from tsfresh.feature_extraction import feature_calculators

In [2]:
# Cleaned train / test tables, read in that order.
pdf_trn, pdf_tst = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train_clean.csv', '../input/test_clean.csv')
)

# Per-batch row selectors for the two tables (later cells index them by batch number).
# NOTE(review): pickle.load can execute arbitrary code from the file -- only load
# artifacts produced by a trusted step of this pipeline.
with open('../input/batch_ids_trn.pkl', 'rb') as trn_ids_file:
    batch_id_trn = pickle.load(trn_ids_file)
with open('../input/batch_ids_tst.pkl', 'rb') as tst_ids_file:
    batch_id_tst = pickle.load(tst_ids_file)

---

In [3]:
# Raw numpy views of the training signal and its open-channel labels.
# The name `trb_lbl` (sic) is kept as-is because later cells reference it.
trn_dat, trb_lbl = (pdf_trn[column].values for column in ('signal', 'open_channels'))

In [5]:
# Training batches pooled into each group; the position in this tuple is the
# group number (g0 .. g4).
# NOTE(review): presumably each pair shares a signal regime -- confirm against the data.
_group_batches = ((0, 1), (2, 6), (3, 7), (5, 8), (4, 9))


def _pool_batches(values, batch_nums):
    '''Concatenate the entries of `values` selected by each listed training batch.'''
    return np.concatenate([values[batch_id_trn[num]] for num in batch_nums], axis=0)


# Signal and label arrays per group, built from the same batch table so they stay aligned.
g0_dat, g1_dat, g2_dat, g3_dat, g4_dat = [_pool_batches(trn_dat, nums) for nums in _group_batches]
g0_lbl, g1_lbl, g2_lbl, g3_lbl, g4_lbl = [_pool_batches(trb_lbl, nums) for nums in _group_batches]

In [19]:
# For every group, split its signal by label value.
# Keys are "<group number>_<label>"; values are the signal samples carrying that label.
target_encode_dict = {
    '_'.join((str(grp_no), str(lbl))): grp_signal[grp_labels == lbl]
    for grp_no, (grp_signal, grp_labels) in enumerate(
        zip(
            [g0_dat, g1_dat, g2_dat, g3_dat, g4_dat],
            [g0_lbl, g1_lbl, g2_lbl, g3_lbl, g4_lbl],
        )
    )
    for lbl in np.unique(grp_labels)
}

In [21]:
# Persist the per-group / per-label signal samples for later use.
with open('../input/target_codes.pkl', 'wb') as codes_file:
    pickle.dump(target_encode_dict, codes_file)

---

In [39]:
# Standalone copy of the rows of training batch 0, re-indexed from zero.
g_pdf = pdf_trn.iloc[batch_id_trn[0]].copy()
g_pdf = g_pdf.reset_index(drop=True)

In [40]:
# Bucket the signal into 100 quantile bins; labels=False stores the integer bin
# number instead of an interval label.
signal_bins = pd.qcut(g_pdf['signal'], q=100, labels=False)
g_pdf['100cuts'] = signal_bins

In [54]:
def group_adjacent_stat(data, group, adjacent):
    '''
    Percentile rank of each row's neighbouring signal value within its group.

    For every row, the value `adjacent` positions away (``data.shift(adjacent)``)
    is scored with ``scipy.stats.percentileofscore`` against the shifted values
    of all rows that carry the same group label.

    data: pandas series of raw signal.
    group: group info for each signal, i.e. same length as data.
    adjacent: how adjacent to take stat, i.e. the number of positions handed to
        ``shift`` (positive uses earlier rows, negative uses later rows).

    Returns a float pandas series with a fresh 0..n-1 index. Rows whose shifted
    value is missing (the gap opened by the shift, or NaN in `data`) and rows
    whose group label has no reference sample are NaN.
    '''
    grp_wip = group.reset_index(drop=True)
    # shift() is positional, so the caller's index does not matter. Do it once
    # here instead of once per group.
    shifted = data.reset_index(drop=True).shift(adjacent)

    # Reference sample per group, with missing values removed. The NaN created
    # by the shift is not an observation: left in, it makes newer SciPy return
    # NaN for the whole group and makes older SciPy count it in the denominator.
    has_value = shifted.notna()
    skip_collection = {
        label: shifted.loc[has_value & (grp_wip == label)].values
        for label in np.unique(grp_wip)
    }

    def _percentile(score, label):
        # NaN when there is nothing to score or nothing to score against.
        reference = skip_collection.get(label)
        if reference is None or len(reference) == 0 or pd.isna(score):
            return np.nan
        return stats.percentileofscore(reference, score)

    return pd.Series(
        [_percentile(k, v) for k, v in zip(shifted.values, grp_wip.values)],
        dtype='float64',
    )

In [55]:
# Percentile of the previous sample's signal within each row's quantile bin.
a = group_adjacent_stat(
    data=g_pdf['signal'],
    group=g_pdf['100cuts'],
    adjacent=1,
)