# Train

## Library Import

In [6]:
# library import
import numpy as np
import pandas as pd
import os
import tqdm
import random
import time
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import Adam, AdamW
from torchvision.models import resnet18, resnet34, resnet50
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from contextlib import contextmanager
from typing import Optional
import logging
from numpy.random import beta
from pathlib import Path

from conformer import ConformerConvModule
from conformer import ConformerBlock


In [7]:
# use GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [8]:
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore

In [9]:
class config:
    SEED = 416
    INPUT = Path("../input/rfcx-species-audio-detection/train")
    OUTPUT = "/kaggle/working"

## Audio Transform

In [18]:
# transform all audio to 2d
# run audio_transformer

## Modeling

In [45]:
class RainforestTransformer(nn.Module):
    def __init__(self):
        super(RainforestTransformer, self).__init__()         
        
        self.conformerblock = ConformerBlock(
            dim = 2813,
            dim_head = 64,
            heads = 8,
            ff_mult = 4,
            conv_expansion_factor = 2,
            conv_kernel_size = 31,
            attn_dropout = 0.,
            ff_dropout = 0.,
            conv_dropout = 0.
        )
        self.decoder = nn.Linear(1 * 128 * 2813, 23)
        
        # embed = 512
        # self.conv = nn.Conv2d(in_channels=3, out_channels=embed, kernel_size=(128, 16))
        # #self.pe = PositionalEncoding(embed)  # not good work for me
        # encoder_layer = nn.TransformerEncoderLayer(d_model=embed, nhead=4)
        # self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
        # self.decoder = nn.Linear(embed, config.N_LABEL)
        
        # self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)
        
    def forward(self, x):
        h = self.conformerblock(x)
        h = h.view(-1, 1 * 128 * 2813)
        out = self.decoder(h)
        # h = self.pe(h)  # not good work for me
        # h = self.transformer_encoder(h)
        # logits, _ = self.decoder(h).max(1)
        return out

In [46]:
model = RainforestTransformer()
model.to(device)
print(model)

RainforestTransformer(
  (conformerblock): ConformerBlock(
    (ff1): Scale(
      (fn): PreNorm(
        (fn): FeedForward(
          (net): Sequential(
            (0): Linear(in_features=2813, out_features=11252, bias=True)
            (1): Swish()
            (2): Dropout(p=0.0, inplace=False)
            (3): Linear(in_features=11252, out_features=2813, bias=True)
            (4): Dropout(p=0.0, inplace=False)
          )
        )
        (norm): LayerNorm((2813,), eps=1e-05, elementwise_affine=True)
      )
    )
    (attn): PreNorm(
      (fn): Attention(
        (to_q): Linear(in_features=2813, out_features=512, bias=False)
        (to_kv): Linear(in_features=2813, out_features=1024, bias=False)
        (to_out): Linear(in_features=512, out_features=2813, bias=True)
        (rel_pos_emb): Embedding(1025, 64)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (norm): LayerNorm((2813,), eps=1e-05, elementwise_affine=True)
    )
    (conv): ConformerConvModule(
      

In [47]:
# 訓練モードの変更（Dropoutなど）
model.train()

# データよみこみ
sample = torch.from_numpy(np.load('melspec.npy')).unsqueeze(0)
print(sample.shape)

# forward
y_pred = model(sample.to(device))

torch.Size([1, 128, 2813])


## Test Transformer

In [48]:
y_pred.shape

torch.Size([1, 23])

## Preprocessing

## Data Loader

In [None]:
# def get_dataloder():
#     # transform
#     train_transform = transforms.Compose([
#         # transforms.RandomCrop((128, 313), pad_if_needed=True, padding_mode="constant"),
#         transforms.ToTensor(),
#     ])
#     valid_transform = transforms.Compose([
#         # transforms.CenterCrop((128, 313)),
#         transforms.ToTensor()
#     ])
    
#     # dataset
#     train_datasets = datasets.ImageFolder(root=config.INPUT, transform=train_transform)
#     valid_datasets = datasets.ImageFolder(root=config.INPUT, transform=valid_transform)

#     # k-fold
#     skf = StratifiedKFold(n_splits=config.N_FOLDS, shuffle=True, random_state=config.SEED)

#     # target
#     _t = train_datasets.targets
#     trn_idx, val_idx = [(trn_idx, val_idx) for trn_idx, val_idx in skf.split(_t, _t)][config.FOLD]

#     # subset
#     train_datasets = torch.utils.data.Subset(train_datasets, trn_idx)
#     valid_datasets = torch.utils.data.Subset(valid_datasets, val_idx)

#     # dataloader
#     train_data_loader = torch.utils.data.DataLoader(train_datasets, batch_size=config.TRAIN_BS, shuffle=True, num_workers=config.TRAIN_WORKS)
#     valid_data_loader = torch.utils.data.DataLoader(valid_datasets, batch_size=config.VALID_BS, shuffle=False, num_workers=config.VALID_WORKS)
    
#     return train_data_loader, valid_data_loader

# # dataloader
# train_data_loader, valid_data_loader = get_dataloder()

# # ?
# for d in train_data_loader:
#     break

# # ?
# img = d[0][0]
# plt.imshow(np.rollaxis(img.numpy(), 0, 3))

In [4]:
x = torch.randn(100, 1024, 5600)
# x = x.to(device)

In [5]:
layer = ConformerConvModule(
    dim = 5600,
    causal = False,             # auto-regressive or not - 1d conv will be made causal with padding if so
    expansion_factor = 2,       # what multiple of the dimension to expand for the depthwise convolution
    kernel_size = 31,           # kernel size, 17 - 31 was said to be optimal
    dropout = 0.                # dropout at the very end
)
x = layer(x) + x

In [6]:
x.shape

torch.Size([100, 1024, 5600])

In [8]:
block = ConformerBlock(
    dim = 5600,
    dim_head = 64,
    heads = 8,
    ff_mult = 4,
    conv_expansion_factor = 2,
    conv_kernel_size = 31,
    attn_dropout = 0.,
    ff_dropout = 0.,
    conv_dropout = 0.
)

# x = torch.randn(1, 1024, 512)
x = block(x) # (1, 1024, 512)

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:73] data. DefaultCPUAllocator: not enough memory: you tried to allocate 9175040000 bytes. Buy new RAM!

In [24]:
x.shape

tensor([[[-0.3272, -1.4688,  0.6977,  ..., -1.3553, -0.6360, -0.1129],
         [ 0.7752,  0.6278, -0.9865,  ...,  1.3034, -0.6864, -0.1707],
         [ 0.7214,  2.0796,  1.9614,  ..., -0.3620, -0.7741,  0.0137],
         ...,
         [ 0.1327,  0.9225,  0.5310,  ...,  0.1975, -0.6152,  0.8490],
         [ 0.3753,  0.7627,  1.0196,  ...,  1.2932,  0.3024,  0.0706],
         [ 0.4095,  0.5860,  0.3746,  ..., -0.6731, -0.2048,  0.6988]]],
       grad_fn=<NativeLayerNormBackward>)

## Modeling