In [1]:
import os
import joblib
import math
import warnings
import gc
warnings.filterwarnings('ignore')
from tqdm import tqdm
import pickle

import bloscpack as bp

import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import NuSVC
from sklearn.model_selection import GroupKFold, StratifiedKFold, train_test_split, RepeatedStratifiedKFold
from sklearn import metrics


# Let pandas render up to 10000 rows / columns before truncating the display.
pd.options.display.max_columns = 10000
pd.options.display.max_rows = 10000

In [2]:
def _read_pickle(path):
    """Return the object unpickled from the file at `path`."""
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Batch id objects for the train and test sets (their layout is not visible here).
batch_id_trn = _read_pickle('../input/batch_ids_trn.pkl')
batch_id_tst = _read_pickle('../input/batch_ids_tst.pkl')

In [3]:
# Training features come from two sources: a bloscpack array of w500 features
# and a pickled DataFrame of the original features.
w500_part = bp.unpack_ndarray_from_file('../input/feats_tblr/trn_dat_all_w500_fixed.bp')
orig_part = pd.read_pickle('../input/feats_tblr/trn_dat_orig_v2_all.pkl')

# Keep every DataFrame column except the three named here.
kept_cols = []
for col_name in orig_part.columns:
    if col_name not in ('time', 'batch', 'open_channels'):
        kept_cols.append(col_name)
orig_part = orig_part.loc[:, kept_cols]

# Column-wise join: original features first, then the w500 features.
trn = np.concatenate([orig_part.values, w500_part], axis=1)

# Release the intermediates; only `trn` is needed from here on.
del orig_part, w500_part, kept_cols, col_name

# Training labels as a plain ndarray.
lbl = pd.read_pickle('../input/feats_tblr/trn_lbl_orig_v2_all.pkl').values

In [4]:
# Test features: original tabular features joined with the w500 features.
feats_dir = '../input/feats_tblr'

# This notebook later writes its processed test matrix into the same directory
# under a name that also contains 'tst' and 'w500'. Skip that file so a re-run
# never reads back already-processed data as if it were the raw input.
own_output = 'tst_dat_all_origv2_w500.bp'

# os.listdir returns names in arbitrary order; sort so the pick is deterministic.
tst_w500_files = sorted(
    f for f in os.listdir(feats_dir)
    if ('tst' in f) and ('w500' in f) and f != own_output
)
if not tst_w500_files:
    raise FileNotFoundError(
        "no file containing 'tst' and 'w500' found in {}".format(feats_dir)
    )

tst_w500 = bp.unpack_ndarray_from_file(os.path.join(feats_dir, tst_w500_files[0]))
tst_orig = pd.read_pickle('../input/feats_tblr/tst_dat_orig_v2_all.pkl')
# Keep every DataFrame column except the three named here.
tst_orig = tst_orig.loc[:, [c for c in tst_orig.columns if c not in ('time', 'batch', 'open_channels')]]

# Column-wise join: original features first, then the w500 features.
tst = np.concatenate([tst_orig.values, tst_w500], axis=1)
del tst_orig, tst_w500

In [38]:
# Display the shape of the combined training matrix as a sanity check.
trn.shape

(500000, 66)

In [None]:
# Display the shape of the combined test matrix as a sanity check.
tst.shape

In [36]:
# Clean and standardise every column of `trn` in place:
#   1. clamp +/-inf to the column's largest / smallest finite value,
#   2. replace NaN with the column mean,
#   3. z-score the column with its own mean and std.
# Columns that cannot be scaled (no finite values, or zero std) are reported
# and left unscaled.
#
# NOTE(review): the matching transform for `tst` is not applied in this cell,
# yet `tst` is written to disk further down without inf/NaN handling or
# scaling. If the test matrix is meant to be preprocessed like `trn`, apply the
# same clamp / impute / scale to tst[:, c] using this column's c_avg and c_std.
for c in range(trn.shape[1]):
    col = trn[:, c]  # basic slice -> view, so the writes below modify trn

    finite = np.isfinite(col)
    if not finite.any():
        # No finite value to take max/min/mean/std from; reducing the empty
        # selection would raise, so report and move on.
        print('{} column has no finite values.'.format(c))
        continue

    # process infinite value
    if not finite.all():
        finite_vals = col[finite]
        col[col == np.inf] = finite_vals.max()
        col[col == -np.inf] = finite_vals.min()

    c_avg = np.nanmean(col)
    c_std = np.nanstd(col)

    # skip useless column
    # Checked after the inf clamp on purpose: a column that is constant apart
    # from +/-inf only becomes zero-variance here, and scaling it would be 0/0.
    if c_std == 0:
        print('{} column has std of zero.'.format(c))
        continue

    # process nan
    col[np.isnan(col)] = c_avg

    # finally scale
    col[:] = (col - c_avg) / c_std

    # show progress
    print('{:03d} / {:03d}'.format(c+1, trn.shape[1]), end='\r')

55 column has std of zero.
067 / 067

In [8]:
# For each listed column index, print its std in train and in test side by side.
cols_to_check = [79, 89, 99, 109, 119, 129, 139, 149, 159, 169, 179, 189, 199, 209, 219, 229, 700, 759, 768, 1046, 1105, 1114]
for col_idx in cols_to_check:
    trn_std = np.nanstd(trn[:, col_idx])
    tst_std = np.nanstd(tst[:, col_idx])
    print(trn_std, tst_std)

0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0


In [9]:
# Remove the same hard-coded column indices from both matrices so that their
# column layouts stay aligned.
cols_to_drop = [79, 89, 99, 109, 119, 129, 139, 149, 159, 169, 179, 189, 199, 209, 219, 229, 700, 759, 768, 1046, 1105, 1114]
trn = np.delete(trn, cols_to_drop, axis=1)
tst = np.delete(tst, cols_to_drop, axis=1)

In [12]:
# Persist the processed training matrix as a bloscpack file.
bp.pack_ndarray_to_file(trn, '../input/feats_tblr/trn_dat_all_origv2_w500.bp')

In [13]:
# Persist the test matrix (after the column drop) as a bloscpack file.
bp.pack_ndarray_to_file(tst, '../input/feats_tblr/tst_dat_all_origv2_w500.bp')

---

In [5]:
# Files this notebook itself writes into '../input' at the end of this section.
# Their names satisfy the filters below, so they must be excluded; otherwise a
# re-run would concatenate already-processed data into the raw matrices and
# shift the positional indices used for the test reorder.
own_outputs = ('trn_dat_v2_w100_welch.bp', 'tst_dat_v2_w100_welch.bp')

# Train chunks: every matching file, in sorted name order, stacked row-wise.
trn_f_v2 = sorted([
    f for f in os.listdir('../input')
    if ('trn' in f) and ('v2' in f) and ('dat' in f) and ('w100' in f)
    and f not in own_outputs
])
trn_v2 = np.concatenate([bp.unpack_ndarray_from_file(os.path.join('../input/', f)) for f in trn_f_v2], axis=0)

tst_f_v2 = sorted([
    f for f in os.listdir('../input')
    if ('tst' in f) and ('v2' in f) and ('dat' in f) and ('w100' in f)
    and f not in own_outputs
])
# Manual reorder of the sorted names: first entry, then entries 11 onward,
# then entries 1-10.
# NOTE(review): presumably this restores the intended chunk order that plain
# string sorting breaks - confirm against the actual file names.
tst_f_v2 = tst_f_v2[:1] + tst_f_v2[11:] + tst_f_v2[1:11]
tst_v2 = np.concatenate([bp.unpack_ndarray_from_file(os.path.join('../input/', f)) for f in tst_f_v2], axis=0)

In [6]:
# Clean and standardise every column of `trn_v2` and `tst_v2` in place:
#   1. clamp +/-inf to finite extremes (each matrix uses its own column's
#      finite max / min),
#   2. replace NaN with the *training* column mean,
#   3. z-score both columns with the *training* mean and std.
# Columns that cannot be scaled (no finite training values, or zero training
# std) are reported and left unscaled in both matrices.
for c in range(trn_v2.shape[1]):
    # Basic slices are views, so the writes below modify the matrices.
    trn_col = trn_v2[:, c]
    tst_col = tst_v2[:, c]

    trn_finite = np.isfinite(trn_col)
    if not trn_finite.any():
        # No finite training value to take max/min/mean/std from; reducing
        # the empty selection would raise, so report and move on.
        print('{} column has no finite values.'.format(c))
        continue

    # process infinite value (train)
    if not trn_finite.all():
        finite_vals = trn_col[trn_finite]
        trn_col[trn_col == np.inf] = finite_vals.max()
        trn_col[trn_col == -np.inf] = finite_vals.min()

    c_avg = np.nanmean(trn_col)
    c_std = np.nanstd(trn_col)

    # skip useless column
    # Checked after the inf clamp on purpose: a column that is constant apart
    # from +/-inf only becomes zero-variance here, and scaling it would be 0/0.
    if c_std == 0:
        print('{} column has std of zero.'.format(c))
        print(np.nanstd(tst_col))
        continue

    # process nan (train)
    trn_col[np.isnan(trn_col)] = c_avg

    # process infinite value (test)
    tst_finite = np.isfinite(tst_col)
    if not tst_finite.all():
        if tst_finite.any():
            tst_vals = tst_col[tst_finite]
            upper, lower = tst_vals.max(), tst_vals.min()
        else:
            # The test column has no finite value of its own; fall back to
            # the extremes of the (by now fully finite) training column.
            upper, lower = trn_col.max(), trn_col.min()
        tst_col[tst_col == np.inf] = upper
        tst_col[tst_col == -np.inf] = lower

    # process nan (test) - imputed with the training mean
    tst_col[np.isnan(tst_col)] = c_avg

    # finally scale
    trn_col[:] = (trn_col - c_avg) / c_std
    tst_col[:] = (tst_col - c_avg) / c_std

    # show progress
    print('{:03d} / {:03d}'.format(c+1, trn_v2.shape[1]), end='\r')

78 column has std of zero.
0.0


In [7]:
# Remove column index 78 from both matrices so that their column layouts stay
# aligned.
v2_cols_to_drop = [78]
trn_v2 = np.delete(trn_v2, v2_cols_to_drop, axis=1)
tst_v2 = np.delete(tst_v2, v2_cols_to_drop, axis=1)

In [8]:
# Persist the processed train and test matrices, train first, as bloscpack files.
v2_outputs = (
    (trn_v2, '../input/trn_dat_v2_w100_welch.bp'),
    (tst_v2, '../input/tst_dat_v2_w100_welch.bp'),
)
for matrix, out_path in v2_outputs:
    bp.pack_ndarray_to_file(matrix, out_path)