In [1]:
import os
import gc
import random

import warnings
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt

import bloscpack as bp

from tsfresh.feature_extraction import feature_calculators

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR

from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, GroupKFold
from sklearn.metrics import f1_score

from NNs import Wave_Classifier, WaveTRSFM_Classifier

In [2]:
DEVICE = 'cuda:0'
EPOCHS = 128
BATCHSIZE = 32
SEED = 19550423
LR = 0.001
SPLITS = 5

torch.manual_seed(SEED)

<torch._C.Generator at 0x7f98f1c7acb0>

In [4]:
series_trn = pd.read_pickle('../input/train_clean_encoded.pkl')

series_lbl = bp.unpack_ndarray_from_file(os.path.join('../input/feats_srs', 'trn_srs_lbl_all_s500_w500_feat_target_encoded.bp'))

series_grp = np.concatenate(
    [np.ones(shape=(series_lbl.shape[0] // 10,)) * i for i in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]],
    axis=0
).astype(int)

In [6]:
series_trn

Unnamed: 0,time,signal,open_channels,signal_0,signal_1,signal_2,signal_3,signal_4,signal_5,signal_6,...,signal_39,signal_40,signal_41,signal_42,signal_43,signal_44,signal_45,signal_46,signal_47,signal_48
0,0.0001,-2.760000,0,0.435937,0.0,0.453637,0.000000,0.462303,0.000000,0.470284,...,1.0,0.673797,0.008540,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.0002,-2.855700,0,0.292407,0.0,0.308404,0.000000,0.315577,0.000000,0.335668,...,1.0,0.566845,0.003701,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.0003,-2.407400,0,0.897187,0.0,0.902944,0.000259,0.908428,0.000139,0.885295,...,1.0,0.898396,0.063194,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.0004,-3.140400,0,0.045574,0.0,0.050236,0.000000,0.051367,0.000000,0.066310,...,1.0,0.310160,0.000569,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.0005,-3.152500,0,0.041195,0.0,0.045377,0.000000,0.046395,0.000000,0.061386,...,1.0,0.301248,0.000569,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4999995,499.9996,2.919274,7,0.999996,1.0,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.0,1.000000,1.000000,1.0,1.0,0.990843,0.267996,0.000131,0.000000,0.000000
4999996,499.9997,2.697906,7,0.999996,1.0,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.0,1.000000,1.000000,1.0,1.0,0.966391,0.124179,0.000008,0.000000,0.000000
4999997,499.9998,4.516337,8,0.999998,1.0,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.0,1.000000,1.000000,1.0,1.0,1.000000,0.999254,0.604478,0.003535,0.000000
4999998,499.9999,5.639669,9,1.000000,1.0,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.0,1.000000,1.000000,1.0,1.0,1.000000,1.000000,0.998614,0.500105,0.001428


In [19]:
np.unique(series_grp, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000]))

In [10]:
a

[(500,),
 (500,),
 (500,),
 (500,),
 (500,),
 (500,),
 (500,),
 (500,),
 (500,),
 (500,)]

In [7]:
series_grp

array([0, 0, 0, ..., 9, 9, 9])

In [6]:
lbl_skewness = [feature_calculators.binned_entropy(lst, max_bins=20) for lst in series_trn[:, :, 0].tolist()]

lbl_skewness = pd.qcut(pd.Series(lbl_skewness), q=10, duplicates='drop')

In [8]:
skf_grp = [str(a) + '_' + str(b) for a, b in zip(series_grp, lbl_skewness)]
us = np.unique(skf_grp)
umap = {u: i for u, i in zip(us, range(len(us)))}
skf_grp = [umap[u] for u in skf_grp]

In [9]:
class Waveset(Dataset):
    def __init__(self, data, labels=None):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data = self.data[idx]
        
        if self.labels is None:
            return data.astype(np.float32)
        else:
            labels = self.labels[idx]
            return (data.astype(np.float32), labels.astype(int))

In [10]:
def fold_train_validate(model, optimizer, criterion, scheduler, training_loader, validation_loader, fold_number, save_path='./saved_models/wavenet_model_fold{:03d}_checkpoint.pth'):

    trn_losses = [np.nan]
    vld_losses = [np.nan]
    vld_f1s = [np.nan]

    for epc in range(EPOCHS):
        print('===========================================================')

        epoch_trn_losses = []
        epoch_trn_lbls = []
        epoch_trn_prds = []
        epoch_vld_losses = []
        epoch_vld_lbls = []
        epoch_vld_prds = []

        # ------ training ------
        model.train()
        for i, (trn_batch_dat, trn_batch_lbl) in enumerate(training_loader):
            trn_batch_dat, trn_batch_lbl = trn_batch_dat.to(DEVICE), trn_batch_lbl.to(DEVICE)

            optimizer.zero_grad()
            trn_batch_prd = model(trn_batch_dat)
            trn_batch_prd = trn_batch_prd.view(-1, trn_batch_prd.size(-1))
            trn_batch_lbl = trn_batch_lbl.view(-1)
            loss = criterion(trn_batch_prd, trn_batch_lbl)
            loss.backward()
            optimizer.step()

            epoch_trn_losses.append(loss.item())
            epoch_trn_lbls.append(trn_batch_lbl.detach().cpu().numpy())
            epoch_trn_prds.append(trn_batch_prd.detach().cpu().numpy())

#             print(
#                 'Epoch {:03d}/{:03d} - Training batch {:04d}/{:04d}: Training loss {:.6f};'.format(
#                     epc + 1, EPOCHS, i + 1, len(training_loader), epoch_trn_losses[-1],
#                 ), 
#                 end='\r'
#             )

        # ------ validation ------
        model.eval()
        with torch.no_grad():
            for i, (vld_batch_dat, vld_batch_lbl) in enumerate(validation_loader):
                vld_batch_dat, vld_batch_lbl = vld_batch_dat.to(DEVICE), vld_batch_lbl.to(DEVICE)

                vld_batch_prd = model(vld_batch_dat)
                vld_batch_prd = vld_batch_prd.view(-1, vld_batch_prd.size(-1))
                vld_batch_lbl = vld_batch_lbl.view(-1)
                loss = criterion(trn_batch_prd, trn_batch_lbl)

                epoch_vld_losses.append(loss.item())
                epoch_vld_lbls.append(vld_batch_lbl.detach().cpu().numpy())
                epoch_vld_prds.append(vld_batch_prd.detach().cpu().numpy())

#                 print(
#                     'Epoch {:03d}/{:03d} - Validation batch {:04d}/{:04d}: Validation loss {:.6f};'.format(
#                         epc + 1, EPOCHS, i + 1, len(validation_loader), epoch_vld_losses[-1],
#                     ), 
#                     end='\r'
#                 )

        # ------ epoch end ------
        f1_trn = f1_score(
            np.concatenate(epoch_trn_lbls, axis=0), 
            np.concatenate(epoch_trn_prds, axis=0).argmax(1),
            labels=list(range(11)), 
            average='macro'
        )
        f1_vld = f1_score(
            np.concatenate(epoch_vld_lbls, axis=0), 
            np.concatenate(epoch_vld_prds, axis=0).argmax(1),
            labels=list(range(11)), 
            average='macro'
        )


        print(
            'Epoch {:03d}/{:03d} - Mean training loss {:.6f}; Mean training F1 {:.6f}; Mean validation loss {:.6f}; Mean validation F1 {:.6f}; Learning rate {:.6f};'.format(
                epc + 1, EPOCHS, np.mean(epoch_trn_losses), f1_trn, np.mean(epoch_vld_losses), f1_vld, scheduler.get_lr()[0],
            )
        )

        if f1_vld > np.nanmax(vld_f1s):
            torch.save(
                {
                    'epoch': epc + 1,
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'f1': f1_vld,
                    'loss': np.mean(epoch_vld_losses),
                }, 
                save_path.format(fold_number)
            )

        vld_f1s.append(f1_vld)

        scheduler.step()

In [11]:
N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, random_state=SEED)

In [12]:
for fld, (ndcs_trn, ndcs_vld) in enumerate(skf.split(series_trn, skf_grp)):
    print('################################################################')
    print('Training/validation for fold {:d}/{:d};'.format(fld+1, N_FOLDS))
    
    # setup fold data
    dat_trn, lbl_trn = series_trn[ndcs_trn], series_lbl[ndcs_trn]
    dat_vld, lbl_vld = series_trn[ndcs_vld], series_lbl[ndcs_vld]
    
    waveset_trn = Waveset(dat_trn, lbl_trn)
    waveset_vld = Waveset(dat_vld, lbl_vld)

    loader_trn = DataLoader(waveset_trn, BATCHSIZE, shuffle=True, num_workers=2, pin_memory=True)
    loader_vld = DataLoader(waveset_vld, BATCHSIZE, shuffle=False, num_workers=2, pin_memory=True)
    
    # setup fold model
    mdl = Wave_Classifier(series_trn.shape[-1]).to(DEVICE)
    critrn = nn.CrossEntropyLoss()
    optimzr = torch.optim.AdamW(mdl.parameters(), lr=LR)
    schdlr = torch.optim.lr_scheduler.CosineAnnealingLR(optimzr, T_max=EPOCHS, eta_min=LR/100)
    
    # run
    fold_train_validate(
        model=mdl, optimizer=optimzr, criterion=critrn, scheduler=schdlr, 
        training_loader=loader_trn, validation_loader=loader_vld, 
        fold_number=fld,
        save_path='./saved_models/wavenet_model_backtobasic_s500w500_fold{:03d}_checkpoint.pth',
    )

################################################################
Training/validation for fold 1/5;
Epoch 001/128 - Mean training loss 0.454704; Mean training F1 0.752384; Mean validation loss 0.235655; Mean validation F1 0.868708; Learning rate 0.001000;
Epoch 002/128 - Mean training loss 0.183747; Mean training F1 0.904458; Mean validation loss 0.114920; Mean validation F1 0.910588; Learning rate 0.001000;
Epoch 003/128 - Mean training loss 0.142105; Mean training F1 0.920413; Mean validation loss 0.116121; Mean validation F1 0.930021; Learning rate 0.000999;
Epoch 004/128 - Mean training loss 0.127167; Mean training F1 0.925275; Mean validation loss 0.108196; Mean validation F1 0.922306; Learning rate 0.000998;
Epoch 005/128 - Mean training loss 0.120072; Mean training F1 0.927495; Mean validation loss 0.114903; Mean validation F1 0.883737; Learning rate 0.000997;
Epoch 006/128 - Mean training loss 0.118646; Mean training F1 0.927288; Mean validation loss 0.110621; Mean validation F1

KeyboardInterrupt: 

In [12]:
submission = pd.read_csv('../input/sample_submission.csv', dtype={'time': str, 'open_channels': 'Int64'})

In [13]:
submission_pred = np.zeros(shape=(submission.shape[0], 11))

waveset_tst = Waveset(series_tst)
loader_tst = DataLoader(waveset_tst, BATCHSIZE, shuffle=False, num_workers=2, pin_memory=True)

for fld in range(5):
    print('-------- fold {:d} --------'.format(fld))
    fld_weight = torch.load('./saved_models/wavenet_model_backtobasic_s500w500_fold{:03d}_checkpoint.pth'.format(fld))
    print('model validation loss: {:.3f}; validation f1: {:.3f};'.format(fld_weight['loss'], fld_weight['f1']))
#     mdl = Wave_Classifier(series_tst.shape[-1]).to(DEVICE)
#     mdl.load_state_dict(fld_weight['model'])
#     with torch.no_grad():
#         tst_fold_prd = []
#         for tst_batch_dat in loader_tst:
#             tst_batch_prd = mdl(tst_batch_dat.to(DEVICE))
#             tst_batch_prd = tst_batch_prd.view(-1, tst_batch_prd.size(-1)).detach().cpu().numpy()
#             tst_fold_prd.append(tst_batch_prd)
            
#         submission_pred += np.concatenate(tst_fold_prd, 0)

-------- fold 0 --------
model validation loss: 0.077; validation f1: 0.939;
-------- fold 1 --------
model validation loss: 0.144; validation f1: 0.938;
-------- fold 2 --------
model validation loss: 0.093; validation f1: 0.937;
-------- fold 3 --------
model validation loss: 0.096; validation f1: 0.939;
-------- fold 4 --------
model validation loss: 0.058; validation f1: 0.939;


---