# Train

## Library Import

In [303]:
# library import
import numpy as np
import pandas as pd
import os
import tqdm
import random
import time
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import Adam, AdamW
from torchvision.models import resnet18, resnet34, resnet50
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import Subset
import torch.utils.data as torchdata
from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from contextlib import contextmanager
from typing import Optional
import logging
from numpy.random import beta
from pathlib import Path

from conformer import ConformerConvModule
from conformer import ConformerBlock

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

# Widen pandas' display limits so wide/long frames are not truncated in cell output.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 50)



## Configuration

In [137]:
# use GPU
# Take the first CUDA device when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cuda:0


In [138]:
# sample data
# A single precomputed array is loaded here; its shape is printed as a sanity check.
sample = torch.from_numpy(
    np.load('melspec.npy')
)
print(sample.shape)
# Kept for reference: adding a channel axis and a batch axis by hand.
# channel = sample.unsqueeze(0)
# batch = channel.unsqueeze(0)
# print(batch.shape)

torch.Size([2813, 128])


In [310]:
class config:
    """Notebook-wide constants: data locations, input geometry and hyper-parameters."""

    # Seed passed to `set_seed` and to the fold splitter.
    SEED = 42

    # Data locations (relative to the notebook's working directory).
    INPUT = Path("../input/rfcx-species-audio-detection/train")
    TRAIN_AUDIO_ROOT = Path("../input/rfcx-species-audio-detection/train_mel")
    TEST_AUDIO_ROOT = Path("../input/rfcx-species-audio-detection/test_mel")
    TRAIN_TP = Path("../input/rfcx-species-audio-detection/train_tp.csv")

    # Input geometry, read from the reference `sample` tensor loaded in an earlier cell:
    # DIM is the size of its second axis, SEQ_LEN the size of its first axis.
    DIM = sample.shape[1]
    SEQ_LEN = sample.shape[0]

    # Number of species; the label vectors carry one indicator column per species id.
    NUM_BIRDS = 24
    # Width of the model's output layer. This was hard-coded to 23, which does not
    # match the 24-column label vectors and makes the loss reject the shapes, so it
    # is now derived from NUM_BIRDS.
    CLASS_NUM = NUM_BIRDS

    # Convolution / max-pool geometry used by the model's front-end.
    KERNEL_SIZE = 3
    POOL_SIZE = 2
    POOL_STRIDE = 2

    # Number of splits requested from the fold splitter.
    N_FOLDS = 5
    # NOTE(review): BTCH_NUM and EPOCH_NUM are not referenced in the cells shown here;
    # presumably batch size and number of epochs - confirm against the training loop.
    BTCH_NUM = 50
    EPOCH_NUM = 100
    # Learning rate handed to the optimizer.
    lr = 0.001

In [146]:
def set_seed(seed: int = 42):
    """Seed every random number generator the notebook relies on.

    Seeds Python's `random`, NumPy's legacy global RNG and PyTorch (CPU and all
    CUDA devices), exports PYTHONHASHSEED, and puts cuDNN into deterministic mode.

    Args:
        seed: Value used for every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Safe on CPU-only hosts: the CUDA seeding calls are ignored when CUDA is unavailable.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune its algorithm choice, which can differ from
    # run to run; it has to be off for the deterministic flag above to be meaningful.
    torch.backends.cudnn.benchmark = False
# Seed all RNGs once, using the notebook-wide seed from `config`.
set_seed(config.SEED)

## Audio Transform

In [125]:
# transform all audio to 2d
# run audio_transformer

## Preprocessing

In [None]:
# Spec Augment








In [None]:
# any other augment here...









## Modeling

In [151]:
# Conformer
# https://arxiv.org/abs/2005.08100
class RainforestTransformer(nn.Module):
    """Convolutional front-end, one Conformer block and a linear classifier.

    Pipeline in `forward`: 2-D convolution -> ReLU -> max-pool -> linear projection
    of the last axis back to `config.DIM` -> Conformer block -> flatten -> linear
    decoder producing `config.CLASS_NUM` raw scores (no activation is applied).

    The layer sizes are derived from `config.SEQ_LEN` and `config.DIM`, so the input
    is assumed to be shaped (batch, 1, config.SEQ_LEN, config.DIM) - TODO confirm
    against the dataset.
    """

    @staticmethod
    def _conv_pool_len(size):
        """Return the length of an axis after the unpadded conv and the max-pool."""
        conv_len = size - config.KERNEL_SIZE + 1
        # Integer arithmetic instead of float division + int(): same value, no rounding risk.
        return (conv_len - config.POOL_SIZE) // config.POOL_STRIDE + 1

    def __init__(self):
        super().__init__()

        # The pooled sizes used to be recomputed inline in three places; derive them once.
        pooled_dim = self._conv_pool_len(config.DIM)
        pooled_seq = self._conv_pool_len(config.SEQ_LEN)

        self.conv = nn.Conv2d(1, 1, config.KERNEL_SIZE)
        # Projects the pooled last axis back to the width the Conformer block expects.
        self.linear = nn.Linear(pooled_dim, config.DIM)
        # NOTE(review): this dropout layer is registered but never applied in `forward`.
        self.dropout = nn.Dropout(0.2)

        self.conformerblock = ConformerBlock(
            dim = config.DIM,
            dim_head = 64,
            heads = 8,
            ff_mult = 4,
            conv_expansion_factor = 2,
            conv_kernel_size = 31,
            attn_dropout = 0.,
            ff_dropout = 0.,
            conv_dropout = 0.
        )
        # The decoder sees every pooled time step concatenated: pooled_seq * DIM features.
        self.decoder = nn.Linear(pooled_seq * config.DIM, config.CLASS_NUM)

    def forward(self, x):
        """Return the decoder's raw scores for the batch `x`, one row per sample."""
        h = F.relu(self.conv(x))
        h = F.max_pool2d(h, config.POOL_SIZE, stride=config.POOL_STRIDE)
        h = self.linear(h)
        # The conv has a single output channel; select it to drop the channel axis.
        h = h[:, 0]
        h = self.conformerblock(h)
        # Flatten everything except the batch axis. Unlike view(-1, N), this can never
        # fold samples into each other: a size mismatch surfaces as an error in the decoder.
        h = h.flatten(start_dim=1)
        out = self.decoder(h)
        return out

## Metric

In [307]:
# LRAP. Instance-level average
# Assume float preds [BxC], labels [BxC] of 0 or 1
def LRAP(preds, labels):
    """Label ranking average precision, averaged over instances.

    For every positive label the precision at its rank is computed (number of
    positives ranked at or above it, divided by its rank). These are averaged per
    row, and the row averages are averaged over the batch.

    Args:
        preds: Float scores of shape [B, C]; higher means more confident.
        labels: 0/1 indicators of shape [B, C].

    Returns:
        The score as a Python float. A row without any positive label divides by
        zero and makes the result NaN.
    """
    # Work on the predictions' device; the position vector used to be created on the
    # CPU unconditionally, which failed for CUDA inputs.
    labels = labels.to(preds.device)
    if not labels.is_floating_point():
        labels = labels.float()

    # Ranks of the predictions
    ranked_classes = torch.argsort(preds, dim=-1, descending=True)
    # ranked_classes is a permutation per row; its argsort is the inverse permutation,
    # i.e. the 0-based rank of every class. Replaces the per-element Python loop.
    class_ranks = torch.argsort(ranked_classes, dim=-1) + 1
    # Mask out to only use the ranks of relevant GT labels
    ground_truth_ranks = class_ranks * labels + (1e6) * (1 - labels)
    # All the GT ranks are in front now
    sorted_ground_truth_ranks, _ = torch.sort(ground_truth_ranks, dim=-1, descending=False)
    # The k-th smallest positive rank has exactly k positives at or above it.
    pos_matrix = torch.arange(1, labels.size(-1) + 1, device=preds.device).unsqueeze(0)
    score_matrix = pos_matrix / sorted_ground_truth_ranks
    # Keeps only the first num_positive columns of each row.
    score_mask_matrix, _ = torch.sort(labels, dim=-1, descending=True)
    scores = score_matrix * score_mask_matrix
    score = (scores.sum(-1) / labels.sum(-1)).mean()
    return score.item()

# label-level average
# Assume float preds [BxC], labels [BxC] of 0 or 1
def LWLRAP(preds, labels):
    """Label ranking average precision, averaged over all positive labels.

    The per-label precision is the same as in the instance-level variant, but the
    precisions of all positive labels in the batch are pooled and divided by the
    total number of positive labels.

    Args:
        preds: Float scores of shape [B, C]; higher means more confident.
        labels: 0/1 indicators of shape [B, C].

    Returns:
        The score as a Python float. NaN when the batch has no positive label at all.
    """
    # Work on the predictions' device; the position vector used to be created on the
    # CPU unconditionally, which failed for CUDA inputs.
    labels = labels.to(preds.device)
    if not labels.is_floating_point():
        labels = labels.float()

    # Ranks of the predictions
    ranked_classes = torch.argsort(preds, dim=-1, descending=True)
    # ranked_classes is a permutation per row; its argsort is the inverse permutation,
    # i.e. the 0-based rank of every class. Replaces the per-element Python loop.
    class_ranks = torch.argsort(ranked_classes, dim=-1) + 1
    # Mask out to only use the ranks of relevant GT labels
    ground_truth_ranks = class_ranks * labels + (1e6) * (1 - labels)
    # All the GT ranks are in front now
    sorted_ground_truth_ranks, _ = torch.sort(ground_truth_ranks, dim=-1, descending=False)
    # The k-th smallest positive rank has exactly k positives at or above it.
    pos_matrix = torch.arange(1, labels.size(-1) + 1, device=preds.device).unsqueeze(0)
    score_matrix = pos_matrix / sorted_ground_truth_ranks
    # Keeps only the first num_positive columns of each row.
    score_mask_matrix, _ = torch.sort(labels, dim=-1, descending=True)
    scores = score_matrix * score_mask_matrix
    score = scores.sum() / labels.sum()
    return score.item()

## Transforms

In [163]:
# transforms
# Each pipeline currently consists of a single ToTensor step; the cropping steps
# are left in as disabled alternatives.
train_transform = transforms.Compose(
    [
        # transforms.RandomCrop((128, 313), pad_if_needed=True, padding_mode="constant"),
        transforms.ToTensor(),
    ]
)
valid_transform = transforms.Compose(
    [
        # transforms.CenterCrop((128, 313)),
        transforms.ToTensor(),
    ]
)
label_transform = transforms.Compose([transforms.ToTensor()])

## Dataset

In [225]:
# Data load
df_train_tp = pd.read_csv(config.TRAIN_TP)

# One 0/1 indicator column per species id, named by the integer id itself.
# A vectorised comparison replaces the nested row-by-row loop, which also used
# index labels as positions in `iloc`.
for col in range(config.NUM_BIRDS):
    df_train_tp[col] = (df_train_tp["species_id"].astype(int) == col).astype("int64")

# grouping
# Collapse to one row per recording; taking the column-wise max turns the indicator
# columns into a multi-hot vector of every species annotated in that recording.
df_train_tp = df_train_tp.groupby("recording_id", as_index=False).max()

# check
df_train_tp[df_train_tp["recording_id"] == "77299bde7"].head(100)


Unnamed: 0,recording_id,species_id,songtype_id,t_min,f_min,t_max,f_max,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
538,77299bde7,21,1,42.3787,3750.0,43.472,5531.25,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [254]:
# load data
# Pick the indicator columns by name instead of slicing row.values[7:31]: the
# positional slice silently breaks if a column is added or reordered, and
# iterating rows of a mixed-type frame yields object-dtype arrays.
label_columns = list(range(config.NUM_BIRDS))
record_ids = df_train_tp["recording_id"].tolist()
# Kept as a list with one integer vector per recording, aligned with record_ids.
labels = list(df_train_tp[label_columns].to_numpy())

print(record_ids[0])
print(labels[0])


003bec244
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]


In [268]:
class RainforestDatasets(torch.utils.data.Dataset):
    """In-memory dataset of precomputed spectrogram arrays and their label vectors.

    Reads the notebook-level `record_ids` and `labels` lists: item `i` is the array
    stored as `<record_ids[i]>.npy` under `config.TRAIN_AUDIO_ROOT`, paired with
    `labels[i]`. All arrays are loaded eagerly in the constructor.

    Args:
        transform: Optional callable applied to each 2-D float32 ndarray; it must
            return the tensor handed to the model (torchvision's `ToTensor` adds the
            leading channel axis itself). When None, the array is converted to a
            tensor and a channel axis is prepended.
        train: Stored on the instance; not read anywhere in this class.
    """

    def __init__(self, transform = None, train = True):
        self.transform = transform
        self.train = train

        # data load
        self.labelset = labels
        self.dataset = []
        for rid in record_ids:
            # read npy
            melspec = np.load(os.path.join(config.TRAIN_AUDIO_ROOT, rid + ".npy"))
            # Kept as an ndarray: `ToTensor` accepts ndarrays/PIL images but raises
            # TypeError on an existing torch.Tensor. float32 matches the model weights.
            self.dataset.append(melspec.astype(np.float32, copy=False))

        self.datanum = len(self.dataset)

    def __len__(self):
        """Return the number of recordings held in memory."""
        return self.datanum

    def __getitem__(self, idx):
        """Return `(data, label)` for position `idx`, both as float tensors."""
        # get data
        melspec = self.dataset[idx]
        if self.transform is None:
            # No pipeline supplied: add the channel axis for the first convolution here.
            out_data = torch.from_numpy(melspec).unsqueeze(0)
        else:
            out_data = self.transform(melspec)

        # Labels are converted directly: `ToTensor` rejects 1-D arrays, and the loss
        # expects float targets.
        out_label = torch.from_numpy(np.asarray(self.labelset[idx], dtype=np.float32))

        return out_data, out_label

In [293]:
# skf
# The single-label stratified K-fold below is disabled in favour of a multilabel
# stratified shuffle split.
# skf = StratifiedKFold(n_splits=config.N_FOLDS, shuffle=True, random_state=config.SEED)
msss = MultilabelStratifiedShuffleSplit(
    n_splits=config.N_FOLDS,
    test_size=0.2,
    random_state=config.SEED,
)

In [294]:
# transform
# Both objects load the full set of recordings; the fold indices select the
# members of each split later on.
train_datasets = RainforestDatasets(transform=train_transform, train=True)
# The validation dataset used to be built with the default train=True flag.
valid_datasets = RainforestDatasets(transform=valid_transform, train=False)

In [304]:
# Build a training DataLoader for each split produced by the splitter.
# NOTE(review): `test_index` is never used here, so no validation loader is built.
# NOTE(review): `train_dataloader` is rebound on every iteration; after the loop it
# refers to the last split only.
for kfoldidx, (train_index, test_index) in enumerate(msss.split(labels, labels)):
    # Restrict the full training dataset to this split's training indices.
    X = Subset(train_datasets, train_index)
    # 20 is the batch size (second positional argument of DataLoader).
    train_dataloader = DataLoader(X, 20, shuffle=True)



(902,)
(230,)
(905,)
(227,)
(906,)
(226,)
(906,)
(226,)
(903,)
(229,)


In [305]:
# Display the loader left bound by the fold loop (the one from its final iteration).
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x2d9af310370>

## Train

In [311]:
# Multi-label loss on raw model scores. Moved with `.to(device)` rather than
# `.cuda()`, so the cell also runs when `device` has fallen back to the CPU.
criterion = nn.BCEWithLogitsLoss().to(device)
# NOTE(review): `model` is not created in any cell shown above; it has to exist
# (and already be on `device`) before this line runs on a fresh kernel.
optimizer = Adam(params=model.parameters(), lr=config.lr, amsgrad=False)