# Train

## Library Import

In [150]:
# library import
import numpy as np
import pandas as pd
import os
import tqdm
import random
import time
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import Adam, AdamW
from torchvision.models import resnet18, resnet34, resnet50
from torchvision import datasets, transforms
import torch.utils.data as torchdata
from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from contextlib import contextmanager
from typing import Optional
import logging
from numpy.random import beta
from pathlib import Path

from conformer import ConformerConvModule
from conformer import ConformerBlock


## Configuration

In [137]:
# use GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [138]:
# sample data
sample = torch.from_numpy(np.load('melspec.npy'))
print(sample.shape)
# channel = sample.unsqueeze(0)
# batch = channel.unsqueeze(0)
# print(batch.shape)

torch.Size([2813, 128])


In [157]:
class config:
    SEED = 42
    INPUT = Path("../input/rfcx-species-audio-detection/train")
    TRAIN_AUDIO_ROOT = Path("../input/rfcx-species-audio-detection/train_mel")
    DIM = sample.shape[1]
    SEQ_LEN = sample.shape[0]
    CLASS_NUM = 23
    KERNEL_SIZE = 3
    POOL_SIZE = 2
    POOL_STRIDE = 2
    NUM_BIRDS = 24

In [146]:
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore
set_seed(config.SEED)

## Audio Transform

In [125]:
# transform all audio to 2d
# run audio_transformer

## Preprocessing

In [None]:
# Spec Augment








In [None]:
# any other augment here...









## Modeling

In [151]:
# Conformer
# https://arxiv.org/abs/2005.08100
class RainforestTransformer(nn.Module):
    def __init__(self):
        super(RainforestTransformer, self).__init__()         

        self.conv = nn.Conv2d(1, 1, config.KERNEL_SIZE)
        self.linear = nn.Linear(int(
                                    (
                                        ((config.DIM - config.KERNEL_SIZE + 1) - config.POOL_SIZE) / config.POOL_STRIDE
                                    ) + 1
                                ), config.DIM)
        self.dropout = nn.Dropout(0.2)
        
        self.conformerblock = ConformerBlock(
            dim = config.DIM,
            dim_head = 64,
            heads = 8,
            ff_mult = 4,
            conv_expansion_factor = 2,
            conv_kernel_size = 31,
            attn_dropout = 0.,
            ff_dropout = 0.,
            conv_dropout = 0.
        )
        self.decoder = nn.Linear(1 * int((((config.SEQ_LEN - config.KERNEL_SIZE + 1) -  config.POOL_SIZE) / config.POOL_STRIDE) + 1) * config.DIM, config.CLASS_NUM)

        # devided by stride
        
    def forward(self, x):
        h = F.relu(self.conv(x))
        h = F.max_pool2d(h, config.POOL_SIZE, stride=config.POOL_STRIDE)
        h = self.linear(h)
        h = h.transpose(0, 1)[0] # transpose batch and channel to delet channel dimension
        h = self.conformerblock(h)
        h = h.view(-1, 1 * int((((config.SEQ_LEN - config.KERNEL_SIZE + 1) -  config.POOL_SIZE) / config.POOL_STRIDE) + 1) * config.DIM)
        out = self.decoder(h)
        return out

## Transforms

In [155]:
# transforms
train_transform = transforms.Compose([
    # transforms.RandomCrop((128, 313), pad_if_needed=True, padding_mode="constant"),
    transforms.ToTensor(),
])
valid_transform = transforms.Compose([
    # transforms.CenterCrop((128, 313)),
    transforms.ToTensor()
])
label_transform = transforms.Compose([
    transforms.ToTensor()
])

## Dataset

In [160]:
# Data load
df_train = pd.DataFrame({
    "recording_id": [path.stem for path in Path(config.TRAIN_AUDIO_ROOT).glob("*.npy")],
})

In [161]:
df_train

Unnamed: 0,recording_id
0,00204008d
1,003b04435
2,003bec244
3,005f1f9a5
4,006ab765f
...,...
4722,ffc6031f8
4723,ffd88cd84
4724,ffebe7313
4725,fff163132


In [152]:
class RainforestDatasets(torch.utils.data.Dataset):
    def __init__(self, path, tt = None, vt = None, train = True):
        self.transform1 = tt
        self.transform2 = vt
        self.train = train

        # data load
        self.labelset = torchvision.datasets.CIFAR10(root = path, train = self.train, download = True)
        self.dataset = torchvision.datasets.CIFAR10(root = path, train = self.train, download = True)

        self.datanum = len(dataset)

    def __len__(self):
        return self.datanum

    def __getitem__(self, idx):
        # get data
        out_label = self.labelset[idx]
        out_data = self.dataset[idx]
        
        # transform label
        out_data = label_transform(out_data)

        # transform data
        if self.tt:
            out_data = self.tt(out_data)
        if self.vt:
            out_data = self.vt(out_data)

        return out_data, out_label

In [None]:
# def get_dataloder():
#     # transform
#     train_transform = transforms.Compose([
#         # transforms.RandomCrop((128, 313), pad_if_needed=True, padding_mode="constant"),
#         transforms.ToTensor(),
#     ])
#     valid_transform = transforms.Compose([
#         # transforms.CenterCrop((128, 313)),
#         transforms.ToTensor()
#     ])
    
#     # dataset
#     train_datasets = datasets.ImageFolder(root=config.INPUT, transform=train_transform)
#     valid_datasets = datasets.ImageFolder(root=config.INPUT, transform=valid_transform)

#     # k-fold
#     skf = StratifiedKFold(n_splits=config.N_FOLDS, shuffle=True, random_state=config.SEED)

#     # target
#     _t = train_datasets.targets
#     trn_idx, val_idx = [(trn_idx, val_idx) for trn_idx, val_idx in skf.split(_t, _t)][config.FOLD]

#     # subset
#     train_datasets = torch.utils.data.Subset(train_datasets, trn_idx)
#     valid_datasets = torch.utils.data.Subset(valid_datasets, val_idx)

#     # dataloader
#     train_data_loader = torch.utils.data.DataLoader(train_datasets, batch_size=config.TRAIN_BS, shuffle=True, num_workers=config.TRAIN_WORKS)
#     valid_data_loader = torch.utils.data.DataLoader(valid_datasets, batch_size=config.VALID_BS, shuffle=False, num_workers=config.VALID_WORKS)
    
#     return train_data_loader, valid_data_loader

# # dataloader
# train_data_loader, valid_data_loader = get_dataloder()

# # ?
# for d in train_data_loader:
#     break

# # ?
# img = d[0][0]
# plt.imshow(np.rollaxis(img.numpy(), 0, 3))

In [4]:
x = torch.randn(100, 1024, 5600)
# x = x.to(device)

In [5]:
layer = ConformerConvModule(
    dim = 5600,
    causal = False,             # auto-regressive or not - 1d conv will be made causal with padding if so
    expansion_factor = 2,       # what multiple of the dimension to expand for the depthwise convolution
    kernel_size = 31,           # kernel size, 17 - 31 was said to be optimal
    dropout = 0.                # dropout at the very end
)
x = layer(x) + x

In [6]:
x.shape

torch.Size([100, 1024, 5600])

In [8]:
block = ConformerBlock(
    dim = 5600,
    dim_head = 64,
    heads = 8,
    ff_mult = 4,
    conv_expansion_factor = 2,
    conv_kernel_size = 31,
    attn_dropout = 0.,
    ff_dropout = 0.,
    conv_dropout = 0.
)

# x = torch.randn(1, 1024, 512)
x = block(x) # (1, 1024, 512)

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:73] data. DefaultCPUAllocator: not enough memory: you tried to allocate 9175040000 bytes. Buy new RAM!

In [24]:
x.shape

tensor([[[-0.3272, -1.4688,  0.6977,  ..., -1.3553, -0.6360, -0.1129],
         [ 0.7752,  0.6278, -0.9865,  ...,  1.3034, -0.6864, -0.1707],
         [ 0.7214,  2.0796,  1.9614,  ..., -0.3620, -0.7741,  0.0137],
         ...,
         [ 0.1327,  0.9225,  0.5310,  ...,  0.1975, -0.6152,  0.8490],
         [ 0.3753,  0.7627,  1.0196,  ...,  1.2932,  0.3024,  0.0706],
         [ 0.4095,  0.5860,  0.3746,  ..., -0.6731, -0.2048,  0.6988]]],
       grad_fn=<NativeLayerNormBackward>)

## Modeling