In [1]:
import os
import gc
import random

import warnings
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt

import bloscpack as bp

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR

from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, GroupKFold
from sklearn.metrics import f1_score

from NNs import Wave_Classifier

In [2]:
DEVICE = 'cuda:0'
EPOCHS = 96
BATCHSIZE = 32
SEED = 19550423
LR = 0.001
SPLITS = 5

torch.manual_seed(SEED)

<torch._C.Generator at 0x7fb0279e67b0>

In [3]:
trn_fs = sorted([f for f in os.listdir('../input/') if (('trn_srs_dat' in f) and ('s1000' in f) and ('w4000' in f))])
lbl_fs = sorted([f for f in os.listdir('../input/') if ('trn_srs_lbl' in f) and ('s1000' in f) and ('w4000' in f)])

# tst_fs = sorted([f for f in os.listdir('../input/') if (('tst_srs_dat' in f) and ('s1000' in f) and ('w1000' in f))])
# tst_fs = [tst_fs[i] for i in [0, 11, 12, 13, 14, 15, 16, 17, 18, 19]] + tst_fs[1:11]

In [4]:
series_trn = np.concatenate(
    [bp.unpack_ndarray_from_file(os.path.join('../input/', f)) for f in trn_fs],
    axis=0
)

series_lbl = [bp.unpack_ndarray_from_file(os.path.join('../input/', f)) for f in lbl_fs]

series_grp = np.concatenate(
    [np.ones(shape=(arr.shape[0],)) * i for i, arr in zip([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], series_lbl)],
    axis=0
).astype(int)

series_lbl = np.concatenate(
    series_lbl,
    axis=0
)[:, :, None]

# series_tst = np.concatenate(
#     [bp.unpack_ndarray_from_file(os.path.join('../input/', f)) for f in tst_fs],
#     axis=0
# )


In [6]:
series_trn.shape

(4970, 4000, 49)

In [5]:
for i in range(series_trn.shape[-1]):
    high = series_trn[:, :, i].max()
    low = series_trn[:, :, i].min()
#     high= max(series_trn[:, :, i].max(), series_tst[:, :, i].max())
#     low = min(series_trn[:, :, i].min(), series_tst[:, :, i].min())
    series_trn[:, :, i] = 2 * (series_trn[:, :, i] - low) / (high - low) - 1
#     series_tst[:, :, i] = 2 * (series_tst[:, :, i] - low) / (high - low) - 1
#     print('---------')
#     print('{:d} - max {:.3f}; min {:.3f};'.format(i, series_trn[:, :, i].max(), series_trn[:, :, i].min()))
#     print('{:d} - max {:.3f}; min {:.3f};'.format(i, series_tst[:, :, i].max(), series_tst[:, :, i].min()))

In [5]:
# def skewness_class(skw):
#     if skw < -.7:
#         return 0
#     elif -.7 <= skw < -.42:
#         return 1
#     elif -.42 <= skw < -.27:
#         return 2
#     elif -.27 <= skw:
#         return 3
    
# lbl_skewness = [skewness_class(skew(lst)) for lst in series_lbl.squeeze(-1).tolist()]

lbl_skewness = [np.unique(lst).shape[0] for lst in series_lbl.squeeze(-1).tolist()]

In [6]:
skf_grp = [str(a) + '_' + str(b) for a, b in zip(series_grp, lbl_skewness)]
us = np.unique(skf_grp)
umap = {u: i for u, i in zip(us, range(len(us)))}
skf_grp = [umap[u] for u in skf_grp]

In [7]:
class Waveset(Dataset):
    def __init__(self, data, labels=None):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data = self.data[idx]
        
        if self.labels is None:
            return data.astype(np.float32)
        else:
            labels = self.labels[idx]
            return (data.astype(np.float32), labels.astype(int))

In [9]:
def fold_train_validate(model, optimizer, criterion, scheduler, training_loader, validation_loader, fold_number):

    trn_losses = [np.nan]
    vld_losses = [np.nan]
    vld_f1s = [np.nan]

    for epc in range(EPOCHS):
        print('===========================================================')

        epoch_trn_losses = []
        epoch_trn_lbls = []
        epoch_trn_prds = []
        epoch_vld_losses = []
        epoch_vld_lbls = []
        epoch_vld_prds = []

        # ------ training ------
        model.train()
        for i, (trn_batch_dat, trn_batch_lbl) in enumerate(training_loader):
            trn_batch_dat, trn_batch_lbl = trn_batch_dat.to(DEVICE), trn_batch_lbl.to(DEVICE)

            optimizer.zero_grad()
            trn_batch_prd = model(trn_batch_dat)
            trn_batch_prd = trn_batch_prd.view(-1, trn_batch_prd.size(-1))
            trn_batch_lbl = trn_batch_lbl.view(-1)
            loss = criterion(trn_batch_prd, trn_batch_lbl)
            loss.backward()
            optimizer.step()

            epoch_trn_losses.append(loss.item())
            epoch_trn_lbls.append(trn_batch_lbl.detach().cpu().numpy())
            epoch_trn_prds.append(trn_batch_prd.detach().cpu().numpy())

            print(
                'Epoch {:03d}/{:03d} - Training batch {:04d}/{:04d}: Training loss {:.6f};'.format(
                    epc + 1, EPOCHS, i + 1, len(training_loader), epoch_trn_losses[-1],
                ), 
                end='\r'
            )

        # ------ validation ------
        model.eval()
        with torch.no_grad():
            for i, (vld_batch_dat, vld_batch_lbl) in enumerate(validation_loader):
                vld_batch_dat, vld_batch_lbl = vld_batch_dat.to(DEVICE), vld_batch_lbl.to(DEVICE)

                vld_batch_prd = model(vld_batch_dat)
                vld_batch_prd = vld_batch_prd.view(-1, vld_batch_prd.size(-1))
                vld_batch_lbl = vld_batch_lbl.view(-1)
                loss = criterion(trn_batch_prd, trn_batch_lbl)

                epoch_vld_losses.append(loss.item())
                epoch_vld_lbls.append(vld_batch_lbl.detach().cpu().numpy())
                epoch_vld_prds.append(vld_batch_prd.detach().cpu().numpy())

                print(
                    'Epoch {:03d}/{:03d} - Validation batch {:04d}/{:04d}: Validation loss {:.6f};'.format(
                        epc + 1, EPOCHS, i + 1, len(validation_loader), epoch_vld_losses[-1],
                    ), 
                    end='\r'
                )

        # ------ epoch end ------
        f1_trn = f1_score(
            np.concatenate(epoch_trn_lbls, axis=0), 
            np.concatenate(epoch_trn_prds, axis=0).argmax(1),
            labels=list(range(11)), 
            average='macro'
        )
        f1_vld = f1_score(
            np.concatenate(epoch_vld_lbls, axis=0), 
            np.concatenate(epoch_vld_prds, axis=0).argmax(1),
            labels=list(range(11)), 
            average='macro'
        )


        print(
            'Epoch {:03d}/{:03d} - Mean training loss {:.6f}; Mean training F1 {:.6f}; Mean validation loss {:.6f}; Mean validation F1 {:.6f}; Learning rate {:.6f};'.format(
                epc + 1, EPOCHS, np.mean(epoch_trn_losses), f1_trn, np.mean(epoch_vld_losses), f1_vld, scheduler.get_lr()[0],
            )
        )

        if f1_vld > np.nanmax(vld_f1s):
            torch.save(
                {
                    'epoch': epc + 1,
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'f1': f1_vld,
                    'loss': np.mean(epoch_vld_losses),
                }, 
                './saved_models/wavenet_model_fold{:03d}_checkpoint.pth'.format(fold_number)
            )

        vld_f1s.append(f1_vld)

        scheduler.step()

In [9]:
N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, random_state=SEED)

In [11]:
for fld, (ndcs_trn, ndcs_vld) in enumerate(skf.split(series_trn, skf_grp)):
    print('################################################################')
    print('Training/validation for fold {:d}/{:d};'.format(fld+1, N_FOLDS))
    
    # setup fold data
    dat_trn, lbl_trn = series_trn[ndcs_trn], series_lbl[ndcs_trn]
    dat_vld, lbl_vld = series_trn[ndcs_vld], series_lbl[ndcs_vld]
    
    waveset_trn = Waveset(dat_trn, lbl_trn)
    waveset_vld = Waveset(dat_vld, lbl_vld)

    loader_trn = DataLoader(waveset_trn, BATCHSIZE, shuffle=True, num_workers=2, pin_memory=True)
    loader_vld = DataLoader(waveset_vld, BATCHSIZE, shuffle=False, num_workers=2, pin_memory=True)
    
    # setup fold model
    mdl = Wave_Classifier(series_trn.shape[-1]).to(DEVICE)
    critrn = nn.CrossEntropyLoss()
    optimzr = torch.optim.AdamW(mdl.parameters(), lr=LR)
    schdlr = torch.optim.lr_scheduler.CosineAnnealingLR(optimzr, T_max=EPOCHS, eta_min=LR/100)
    
    # run
    fold_train_validate(model=mdl, optimizer=optimzr, criterion=critrn, scheduler=schdlr, training_loader=loader_trn, validation_loader=loader_vld, fold_number=fld)

################################################################
Training/validation for fold 1/5;
Epoch 001/096 - Training batch 0065/0125: Training loss nan;7980;

KeyboardInterrupt: 

In [34]:
submission = pd.read_csv('../input/sample_submission.csv', dtype={'time': str, 'open_channels': 'Int64'})

In [17]:
submission_pred = np.zeros(shape=(submission.shape[0], 11))

waveset_tst = Waveset(series_tst)
loader_tst = DataLoader(waveset_tst, BATCHSIZE, shuffle=False, num_workers=2, pin_memory=True)

for fld in range(5):
    print('-------- fold {:d} --------'.format(fld))
    fld_weight = torch.load('./saved_models/wavenet_model_fold{:03d}_checkpoint.pth'.format(fld))
    print('model validation loss: {:.3f}; validation f1: {:.3f};'.format(fld_weight['loss'], fld_weight['f1']))
    mdl = Classifier(series_tst.shape[-1]).to(DEVICE)
    mdl.load_state_dict(fld_weight['model'])
    with torch.no_grad():
        tst_fold_prd = []
        for tst_batch_dat in loader_tst:
            tst_batch_prd = mdl(tst_batch_dat.to(DEVICE))
            tst_batch_prd = tst_batch_prd.view(-1, tst_batch_prd.size(-1)).detach().cpu().numpy()
            tst_fold_prd.append(tst_batch_prd)
            
        submission_pred += np.concatenate(tst_fold_prd, 0)

-------- fold 0 --------
model validation loss: 0.107; validation f1: 0.941;
-------- fold 1 --------
model validation loss: 0.049; validation f1: 0.934;
-------- fold 2 --------
model validation loss: 0.091; validation f1: 0.938;
-------- fold 3 --------
model validation loss: 0.073; validation f1: 0.940;
-------- fold 4 --------
model validation loss: 0.075; validation f1: 0.939;


In [36]:
submission['open_channels'] = submission_pred.argmax(1)
submission.to_csv("../submissions/sub0_wavenet_myfeats.csv", index=False)

In [10]:
for ndcs_trn, ndcs_vld in skf.split(series_trn, skf_grp):
    dat_trn, lbl_trn = series_trn[ndcs_trn], series_lbl[ndcs_trn]
    dat_vld, lbl_vld = series_trn[ndcs_vld], series_lbl[ndcs_vld]
    break

In [11]:
mdl = Wave_Classifier(49).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(mdl.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS, eta_min=LR/100)

In [12]:
waveset_trn = Waveset(dat_trn, lbl_trn)
waveset_vld = Waveset(dat_vld, lbl_vld)

loader_trn = DataLoader(waveset_trn, BATCHSIZE, shuffle=True, num_workers=2, pin_memory=True)
loader_vld = DataLoader(waveset_vld, BATCHSIZE, shuffle=False, num_workers=2, pin_memory=True)

In [14]:
trn_batch_lbl

tensor([2, 2, 1,  ..., 1, 1, 1], device='cuda:0')

In [13]:
trn_losses = [np.nan]
vld_losses = [np.nan]
vld_f1s = [np.nan]

for epc in range(EPOCHS):
    print('===========================================================')
    
    epoch_trn_losses = []
    epoch_trn_lbls = []
    epoch_trn_prds = []
    epoch_vld_losses = []
    epoch_vld_lbls = []
    epoch_vld_prds = []
    
    # ------ training ------
    mdl.train()
    for i, (trn_batch_dat, trn_batch_lbl) in enumerate(loader_trn):
        trn_batch_dat, trn_batch_lbl = trn_batch_dat.to(DEVICE), trn_batch_lbl.to(DEVICE)
        
        optimizer.zero_grad()
        trn_batch_prd = mdl(trn_batch_dat)
        trn_batch_prd = trn_batch_prd.view(-1, trn_batch_prd.size(-1))
        trn_batch_lbl = trn_batch_lbl.view(-1)
        loss = criterion(trn_batch_prd, trn_batch_lbl)
        loss.backward()
        optimizer.step()
        
        epoch_trn_losses.append(loss.item())
        epoch_trn_lbls.append(trn_batch_lbl.detach().cpu().numpy())
        epoch_trn_prds.append(trn_batch_prd.detach().cpu().numpy())
        
        print(
            'Epoch {:03d}/{:03d} - Training batch {:04d}/{:04d}: Training loss {:.6f};'.format(
                epc + 1, EPOCHS, i + 1, len(loader_trn), epoch_trn_losses[-1],
            ), 
            end='\r'
        )
    
    # ------ validation ------
    mdl.eval()
    with torch.no_grad():
        for i, (vld_batch_dat, vld_batch_lbl) in enumerate(loader_vld):
            vld_batch_dat, vld_batch_lbl = vld_batch_dat.to(DEVICE), vld_batch_lbl.to(DEVICE)
            
            vld_batch_prd = mdl(vld_batch_dat)
            vld_batch_prd = vld_batch_prd.view(-1, vld_batch_prd.size(-1))
            vld_batch_lbl = vld_batch_lbl.view(-1)
            loss = criterion(trn_batch_prd, trn_batch_lbl)
            
            epoch_vld_losses.append(loss.item())
            epoch_vld_lbls.append(vld_batch_lbl.detach().cpu().numpy())
            epoch_vld_prds.append(vld_batch_prd.detach().cpu().numpy())
            
            print(
                'Epoch {:03d}/{:03d} - Validation batch {:04d}/{:04d}: Validation loss {:.6f};'.format(
                    epc + 1, EPOCHS, i + 1, len(loader_vld), epoch_vld_losses[-1],
                ), 
                end='\r'
            )
    
    # ------ epoch end ------
    f1_trn = f1_score(
        np.concatenate(epoch_trn_lbls, axis=0), 
        np.concatenate(epoch_trn_prds, axis=0).argmax(1),
        labels=list(range(11)), 
        average='macro'
    )
    f1_vld = f1_score(
        np.concatenate(epoch_vld_lbls, axis=0), 
        np.concatenate(epoch_vld_prds, axis=0).argmax(1),
        labels=list(range(11)), 
        average='macro'
    )
    
    
    print(
        'Epoch {:03d}/{:03d} - Mean training loss {:.6f}; Mean training F1 {:.6f}; Mean validation loss {:.6f}; Mean validation F1 {:.6f}; Learning rate {:.6f};'.format(
            epc + 1, EPOCHS, np.mean(epoch_trn_losses), f1_trn, np.mean(epoch_vld_losses), f1_vld, scheduler.get_lr()[0],
        )
    )
    
#     if f1_vld > np.max(vld_f1s):
#         torch.save(
#             {
#                 'epoch': epc + 1,
#                 'model': mdl.state_dict(),
#                 'optimizer': optimizer.state_dict(),
#                 'f1': f1_vld,
#                 'loss': np.mean(epoch_vld_losses),
#             }, 
#             './checkpoints/wavenet_model_fold{:03d}_checkpoint.pth'.format(fld)
#         )
    
#     vld_f1s.append(f1_vld)
    
    scheduler.step()

Epoch 001/096 - Training batch 0055/0125: Training loss nan;7175;

KeyboardInterrupt: 