# RFCX

## 準備

In [None]:
# Print the NVIDIA driver / GPU status for this runtime before doing anything else.
!nvidia-smi

In [None]:
%%time

# Install the Kaggle CLI and stage the API credentials copied from Drive.
# NOTE(review): this path uses "My Drive" while DATA_ROOT further down uses "MyDrive" — confirm both resolve.
!pip install -q kaggle
!mkdir -p .kaggle
!cp "./drive/My Drive/Study/config/kaggle.json" .kaggle/
# Restrict the credentials file to owner read/write.
!chmod 600 .kaggle/kaggle.json
!mv .kaggle /root

# Download the spectrogram dataset, unpack it silently into the working directory,
# then delete the archive.
!kaggle datasets download "theoviel/rcfx-spectrograms-32-khz"
!unzip rcfx-spectrograms-32-khz.zip > /dev/null
!rm -rf rcfx-spectrograms-32-khz.zip 

In [None]:
# Needed for `iterstrat.ml_stratifiers.MultilabelStratifiedKFold`, imported below.
!pip install iterative-stratification

In [None]:
!pip install --upgrade wandb
!wandb login e0792bb688a0d18e359df7438c45da90f8794091

## 実行コード

### ライブラリとデータの読み込み

In [None]:
import os
import tqdm
import random

from matplotlib import pyplot as plt
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import pandas as pd
import numpy as np
from numpy.random import beta

import torch
from torchvision.models import resnet18
import torch.nn as nn
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from torchvision import transforms
from torch.nn import functional as F

import wandb

# Use the GPU when one is available; fall back to CPU so the notebook can still
# be imported/debugged on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Folder holding the competition CSV files (presumably on the mounted Google Drive — confirm).
DATA_ROOT = "./drive/MyDrive/Study/RFCX/input"

# sample_submission: submission template (not referenced again in the visible cells).
# train_tp / train_fp: annotation tables; both are grouped by `recording_id` below,
# and train_tp additionally supplies `species_id`, `t_min` and `t_max`.
sample_submission = pd.read_csv(f"{DATA_ROOT}/sample_submission.csv")
train_fp = pd.read_csv(f"{DATA_ROOT}/train_fp.csv")
train_tp = pd.read_csv(f"{DATA_ROOT}/train_tp.csv")

### 関数群

In [None]:
# Build label_dict: recording_id -> float array of shape (24, 119), one row per species.
#   column 0       : clip-level multi-hot label (1 if the species is annotated in the clip)
#   columns 1..118 : frame-level presence mask over 118 time frames
# The time axis is mapped with t / 60 * 118, i.e. this assumes t_min / t_max are
# seconds inside a 60-second recording — TODO confirm against the data description.
label_dict = {}
for recording_id, df in train_tp.groupby("recording_id"):
    species_ids = df["species_id"].values
    # clip-level label
    ohe_label = np.zeros(24)
    ohe_label[np.unique(species_ids)] = 1
    # frame-level label
    pos = (df[["t_min", "t_max"]].values/60*118).astype(int)
    position_label = np.zeros((24, 118))
    for species_id, (h, t) in zip(species_ids, pos):
        # A call that starts and ends inside the same frame truncates to h == t,
        # which used to produce an empty slice (no frame marked at all).
        # Always mark at least the frame the call starts in.
        position_label[species_id, h:max(t, h + 1)] = 1
    # make dict
    label_dict[recording_id] = np.hstack([ohe_label.reshape(24, 1), position_label])

# Recordings that appear only in train_fp get an all-zero label; recordings that
# already have a train_tp label are left untouched.
for recording_id, _ in train_fp.groupby("recording_id"):
    label_dict.setdefault(recording_id, np.zeros((24, 119)))

In [None]:
def set_seed(seed: int = 42):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and CUDA), exports
    ``PYTHONHASHSEED`` and configures cuDNN for deterministic behaviour.

    Args:
        seed: Seed value applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # benchmark mode lets cuDNN auto-select kernels per input shape, and the
    # selection can differ between runs; it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False  # type: ignore

In [None]:
# Side length for the (currently commented-out) Resize transform in SpectrogramFromNpz;
# not used by any active code in this cell.
SIZE = 224

class TimeMask:
    """Augmentation that blanks out random contiguous runs of time steps.

    Args:
        T: Exclusive upper bound of the width drawn for each mask.
        num_masks: Number of masks attempted per call.
        replace_with_zero: Fill masked steps with 0 when True, otherwise with
            the mean of the whole spectrogram.
    """

    def __init__(self, T=40, num_masks=1, replace_with_zero=True):
        self.T = T
        self.num_masks = num_masks
        self.replace_with_zero = replace_with_zero

    def __call__(self, spec):
        """Return a masked copy of ``spec``; time is taken to be axis 2.

        The input tensor itself is never modified.
        """
        cloned = spec.clone()
        len_spectro = cloned.shape[2]

        # Cap the width by the spectrogram length: without the cap a width
        # >= len_spectro makes the start-position draw below raise ValueError.
        max_width = min(self.T, len_spectro)
        if max_width <= 0:
            return cloned

        for _ in range(self.num_masks):
            t = random.randrange(0, max_width)
            t_zero = random.randrange(0, len_spectro - t)

            # Zero width: nothing to mask in this round (and randrange below
            # would get an empty range). Skip it but keep applying later masks.
            if t == 0:
                continue

            mask_end = random.randrange(t_zero, t_zero + t)
            if self.replace_with_zero:
                cloned[:, :, t_zero:mask_end] = 0
            else:
                cloned[:, :, t_zero:mask_end] = cloned.mean()
        return cloned

class FreqMask:
    """Augmentation that blanks out random contiguous bands of frequency bins.

    Args:
        F: Exclusive upper bound of the band height drawn for each mask.
        num_masks: Number of masks attempted per call.
        replace_with_zero: Fill masked bins with 0 when True, otherwise with
            the mean of the whole spectrogram.
    """

    def __init__(self, F=30, num_masks=1, replace_with_zero=True):
        self.F = F
        self.num_masks = num_masks
        self.replace_with_zero = replace_with_zero

    def __call__(self, spec):
        """Return a masked copy of ``spec``; frequency is taken to be axis 1.

        The input tensor itself is never modified.
        """
        cloned = spec.clone()
        num_mel_channels = cloned.shape[1]

        # Cap the band height by the number of bins: without the cap a height
        # >= num_mel_channels makes the start-position draw raise ValueError.
        max_height = min(self.F, num_mel_channels)
        if max_height <= 0:
            return cloned

        for _ in range(self.num_masks):
            f = random.randrange(0, max_height)
            f_zero = random.randrange(0, num_mel_channels - f)

            # Zero height: nothing to mask in this round (and randrange below
            # would get an empty range). Skip it but keep applying later masks.
            if f == 0:
                continue

            mask_end = random.randrange(f_zero, f_zero + f)
            if self.replace_with_zero:
                cloned[:, f_zero:mask_end] = 0
            else:
                cloned[:, f_zero:mask_end] = cloned.mean()

        return cloned

def mono_to_color(
    X: np.ndarray, mean=None, std=None,
    norm_max=None, norm_min=None, eps=1e-6
):
    """Convert a single-channel array into a 3-channel uint8 image.

    The input is replicated along a new last axis, standardised, clipped to
    ``[norm_min, norm_max]`` and rescaled to ``[0, 255]``.

    Args:
        X: Input array (e.g. a 2-D spectrogram).
        mean: Value subtracted before scaling; defaults to the mean of ``X``.
        std: Divisor used for scaling; defaults to the std of the centred data.
        norm_max: Upper clip bound in standardised units; defaults to the data max.
        norm_min: Lower clip bound in standardised units; defaults to the data min.
        eps: Small constant guarding against division by zero and used as the
            threshold below which the data is treated as constant.

    Returns:
        ``np.uint8`` array of shape ``X.shape + (3,)``. All zeros when the
        standardised data (or the requested clip range) has no spread.
    """
    # Stack X as [X,X,X]
    X = np.stack([X, X, X], axis=-1)

    # Standardize. Use `is None` rather than `or`: an explicit 0 (mean=0.0,
    # norm_min=0.0, ...) is a legitimate value and must not be replaced.
    mean = X.mean() if mean is None else mean
    X = X - mean
    std = X.std() if std is None else std
    Xstd = X / (std + eps)
    _min, _max = Xstd.min(), Xstd.max()
    norm_max = _max if norm_max is None else norm_max
    norm_min = _min if norm_min is None else norm_min
    if (_max - _min) > eps and (norm_max - norm_min) > eps:
        # Normalize to [0, 255]
        V = np.clip(Xstd, norm_min, norm_max)
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Just zero
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V

class SpectrogramFromNpz(torch.utils.data.Dataset):
    """Dataset that loads one pre-computed spectrogram (``.npy``) per recording.

    Args:
        X: Sequence of recording ids (file names without extension).
        y: Sequence of labels aligned with ``X``.
        mode: ``"train"`` or ``"valid"`` (files read from ``./train``) or
            ``"test"`` (files read from ``./test``). Augmentation is applied
            only in ``"train"`` mode.

    Raises:
        ValueError: If ``mode`` is not one of the three supported values.
    """

    # mode -> directory the .npy files are read from
    _MODE_DIRS = {"train": "train", "valid": "train", "test": "test"}

    def __init__(self, X, y, mode):
        if mode not in self._MODE_DIRS:
            # Previously an unknown mode only failed later, inside __getitem__,
            # with an UnboundLocalError on `path`.
            raise ValueError(f"unknown mode: {mode!r}")
        self.X = X
        self.y = y
        self.mode = mode
        self.to_tensor = transforms.ToTensor()
        self.norm = transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        # A single transform applied with probability 0.5. (A stray trailing
        # comma used to wrap this in a 1-tuple that __getitem__ iterated over.)
        self.augument_funcs = transforms.RandomApply([
            TimeMask(),
        ], p=0.5)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``."""
        fname = self.X[idx]
        label = self.y[idx]

        if self.mode in ["train", "valid"]:
            path = f"./train/{fname}.npy"
        else:  # "test" — validated in __init__
            path = f"./test/{fname}.npy"
        mel = np.load(path)

        # mono -> 3-channel uint8 image -> float tensor -> channel normalisation
        image = mono_to_color(mel)
        image = self.to_tensor(image)
        image = self.norm(image)
        if self.mode == "train":
            image = self.augument_funcs(image)

        return image, label

In [None]:
# https://www.kaggle.com/c/rfcx-species-audio-detection/discussion/198418

# LRAP. Instance-level average
# Assume float preds [BxC], labels [BxC] of 0 or 1
def LRAP(preds, labels):
    """Label ranking average precision, averaged over instances.

    Args:
        preds: Float tensor ``[B, C]`` of scores (higher = more confident).
        labels: Tensor ``[B, C]`` of 0/1 ground-truth labels.

    Returns:
        Python float: the mean per-row average precision over the rows that
        have at least one positive label. Rows without any positive label are
        skipped (they used to contribute 0/0 = NaN and poison the mean);
        ``0.0`` is returned when no row has a positive label.
    """
    num_classes = labels.size(-1)
    # Ranks of the predictions
    ranked_classes = torch.argsort(preds, dim=-1, descending=True)
    # class_ranks[i, c] = 1-based rank of class c in row i. ranked_classes is a
    # permutation per row, so its argsort is the inverse permutation.
    class_ranks = torch.argsort(ranked_classes, dim=-1) + 1
    # Mask out to only use the ranks of relevant GT labels
    ground_truth_ranks = class_ranks * labels + (1e6) * (1 - labels)
    # All the GT ranks are in front now
    sorted_ground_truth_ranks, _ = torch.sort(ground_truth_ranks, dim=-1, descending=False)
    pos_matrix = torch.arange(1, num_classes + 1, device=labels.device).unsqueeze(0)
    score_matrix = pos_matrix / sorted_ground_truth_ranks
    score_mask_matrix, _ = torch.sort(labels, dim=-1, descending=True)
    scores = score_matrix * score_mask_matrix

    num_labels = labels.sum(-1)
    has_labels = num_labels > 0
    if not bool(has_labels.any()):
        return 0.0
    score = (scores.sum(-1)[has_labels] / num_labels[has_labels]).mean()
    return score.item()

# label-level average
# Assume float preds [BxC], labels [BxC] of 0 or 1
def LWLRAP(preds, labels):
    """Label ranking average precision, averaged over positive labels.

    Unlike ``LRAP`` the per-label precisions of the whole batch are summed and
    divided by the total number of positive labels, so rows with more labels
    weigh more.

    Args:
        preds: Float tensor ``[B, C]`` of scores (higher = more confident).
        labels: Tensor ``[B, C]`` of 0/1 ground-truth labels.

    Returns:
        Python float; ``0.0`` when the batch contains no positive label
        (previously 0/0 = NaN).
    """
    num_classes = labels.size(-1)
    # Ranks of the predictions
    ranked_classes = torch.argsort(preds, dim=-1, descending=True)
    # class_ranks[i, c] = 1-based rank of class c in row i. ranked_classes is a
    # permutation per row, so its argsort is the inverse permutation.
    class_ranks = torch.argsort(ranked_classes, dim=-1) + 1
    # Mask out to only use the ranks of relevant GT labels
    ground_truth_ranks = class_ranks * labels + (1e6) * (1 - labels)
    # All the GT ranks are in front now
    sorted_ground_truth_ranks, _ = torch.sort(ground_truth_ranks, dim=-1, descending=False)
    pos_matrix = torch.arange(1, num_classes + 1, device=labels.device).unsqueeze(0)
    score_matrix = pos_matrix / sorted_ground_truth_ranks
    score_mask_matrix, _ = torch.sort(labels, dim=-1, descending=True)
    scores = score_matrix * score_mask_matrix

    total_labels = labels.sum()
    if float(total_labels) == 0:
        return 0.0
    score = scores.sum() / total_labels
    return score.item()

In [None]:
def train_loop(train_data_loader, model, optimizer, scheduler):
    """Train ``model`` for one epoch with mixup.

    Args:
        train_data_loader: Yields ``(X, y)``; ``y`` has the clip-level label in
            ``y[:, :, 0]`` and the frame-level labels in ``y[:, :, 1:]``.
        model: Network returning ``(pseudo_label, clipwise_preds, attention_preds)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Optional LR scheduler stepped once per batch; may be ``None``.

    Returns:
        ``(mean training loss, mean learning rate)`` over the epoch.
    """
    model.train()
    criterion = nn.BCEWithLogitsLoss()
    losses, lrs = [], []
    for X, y in tqdm.tqdm_notebook(train_data_loader):
        # Labels are built from float64 NumPy arrays; cast to float32 to match
        # the model outputs.
        X, y = X.to(device), y.to(device).float()

        # Mix the *whole* label tensor so the frame-level targets are blended
        # with the same permutation and ratio as the inputs and clip labels.
        # (Previously only the clip-level column went through mixup, leaving
        # loss3 with targets that belonged to the un-mixed clip.)
        b = beta(config.alpha, config.alpha)
        X, y = mixup(X, y, b)
        pos_y = y[:, :, 1:]
        y = y[:, :, 0]

        pseudo_label, clipwise_preds, attention_preds = model(X)
        # NOTE(review): BirdcallNet returns softmax-normalised weights as
        # attention_preds, yet they are fed to a with-logits loss here, and
        # loss3 needs their time axis to match pos_y — confirm both are intended.
        loss1 = criterion(clipwise_preds, y)
        loss2 = criterion(attention_preds, pseudo_label)
        loss3 = criterion(attention_preds, pos_y)
        #loss = (loss1 + loss2 + loss3) / 3
        loss = loss1 + loss2*0.5 + loss3*0.5

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if scheduler is not None:
            scheduler.step()

        losses.append(loss.item())
        lrs.append(np.array([param_group["lr"] for param_group in optimizer.param_groups]).mean())
    t_loss = np.array(losses).mean()
    lr =  np.array(lrs).mean()
    return t_loss, lr

def valid_loop(valid_data_loader, model):
    """Evaluate ``model`` on the validation set.

    Args:
        valid_data_loader: Yields ``(X, y)``; the clip-level label is ``y[:, :, 0]``.
        model: Network returning ``(pseudo_label, clipwise_preds, attention_preds)``;
            only the clip-wise logits are used.

    Returns:
        ``(mean per-batch LRAP, mean per-batch BCE-with-logits loss)``.
    """
    model.eval()
    criterion = nn.BCEWithLogitsLoss()
    v_scores, v_losses = [], []
    for X, y in tqdm.tqdm_notebook(valid_data_loader):
        X, y = X.to(device), y.to(device)
        # Labels are built from float64 NumPy arrays; cast to match the logits.
        y = y[:,:,0].float()

        with torch.no_grad():
            _, logits, _ = model(X)
            # The loss applies the sigmoid itself, so it must see raw logits.
            # (It used to receive already-sigmoided values, which made the
            # reported validation loss wrong.)
            loss = criterion(logits, y)
        v_losses.append(loss.item())

        # One sigmoid is enough for the ranking metric.
        score = LRAP(logits.sigmoid().cpu(), y.cpu())
        v_scores.append(score)
    valid_score, valid_loss = np.array(v_scores).mean(), np.array(v_losses).mean()
    return valid_score, valid_loss

In [None]:
# Number of cross-validation folds used by get_data_loader().
N_FOLD = 5
# Worker processes per DataLoader.
WORKS = 4

def parse_labels(recording_id):
    """Look up the label array for ``recording_id`` in ``label_dict``.

    Unknown ids yield a zero vector of length 24.
    """
    # NOTE(review): entries stored in label_dict are (24, 119) arrays while this
    # fallback is a flat (24,) vector — confirm callers can handle both shapes.
    return label_dict.get(recording_id, np.zeros(24))

def get_data_loader():
    """Build the train/validation DataLoaders for fold ``config.fold``.

    All recordings in ``label_dict`` are split into ``N_FOLD`` folds with
    multilabel stratification on the clip-level labels (column 0), seeded by
    ``config.seed``.

    Returns:
        ``(train_data_loader, valid_data_loader)``.

    Raises:
        ValueError: If ``config.fold`` does not match any fold index. (This
            used to surface as an UnboundLocalError at the return statement.)
    """
    labels = np.array(list(label_dict.values()))
    fnames = np.array(list(label_dict.keys()))

    mskf = MultilabelStratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=config.seed)
    for fold, (train_index, valid_index) in enumerate(mskf.split(fnames, labels[:, :, 0])):
        if config.fold != fold:
            continue
        train_X, train_y = fnames[train_index], labels[train_index]
        valid_X, valid_y = fnames[valid_index], labels[valid_index]

        train_datasets = SpectrogramFromNpz(train_X, train_y, "train")
        valid_datasets = SpectrogramFromNpz(valid_X, valid_y, "valid")

        train_data_loader = torch.utils.data.DataLoader(train_datasets, batch_size=config.batch_size, shuffle=True, num_workers=WORKS)
        valid_data_loader = torch.utils.data.DataLoader(valid_datasets, batch_size=config.batch_size, shuffle=False, num_workers=WORKS)

        # The requested fold is the only one needed; no point iterating further.
        return train_data_loader, valid_data_loader

    raise ValueError(f"config.fold must be an integer in [0, {N_FOLD}), got {config.fold!r}")

In [None]:
def mixup(input, target, gamma):
    """Blend every sample of a batch with a randomly chosen partner.

    Computes ``gamma * a + (1 - gamma) * a[perm]`` for both ``input`` and
    ``target`` with one shared random permutation of the batch axis.

    Note:
        Both tensors are modified **in place** and also returned. ``target``
        must be a float tensor (one-hot / multi-hot), not class indices.

    Args:
        input: Batch tensor, batch on axis 0.
        target: Float label tensor with the same batch size.
        gamma: Mixing ratio in [0, 1] (Python or NumPy scalar).

    Returns:
        ``(mixed_input, mixed_target)``.
    """
    gamma = float(gamma)
    perm = torch.randperm(input.size(0))
    # Advanced indexing copies, so the partners are captured before the
    # in-place update below.
    perm_input = input[perm]
    perm_target = target[perm]
    # `add_(other, alpha=...)` replaces the deprecated positional
    # `add_(alpha, other)` overload.
    mixed_input = input.mul_(gamma).add_(perm_input, alpha=1 - gamma)
    mixed_target = target.mul_(gamma).add_(perm_target, alpha=1 - gamma)
    return mixed_input, mixed_target

### モデル

In [None]:
class BirdcallNet(nn.Module):
    """ResNet-18 feature extractor with two 1x1 Conv1d heads over time.

    ``l8_a`` produces per-frame class scores and ``l8_b`` produces per-class
    weights that are softmax-normalised along time; the clip-level prediction
    is the weighted sum of the former by the latter.
    """

    def __init__(self):
        super().__init__()
        self.n_label = 24
        backbone = resnet18(pretrained=True)
        # Keep every child module except the last two.
        feature_layers = list(backbone.children())[:-2]
        self.resnet_head = nn.Sequential(*feature_layers)
        self.l8_a = nn.Conv1d(512, self.n_label, 1, bias=False)
        self.l8_b = nn.Conv1d(512, self.n_label, 1, bias=False)

    def forward(self, x):  # input x: (batch, channel, Hz, time)
        """Return ``(pseudo_label, clipwise_preds, attention_preds)``.

        * ``pseudo_label``: per-frame 0/1 tensor, 1 where sigmoid(score) >= 0.5.
        * ``clipwise_preds``: (batch, n_class) weighted sum over time.
        * ``attention_preds``: (batch, n_class, time) softmax weights over time.
        """
        # (batch, channel, Hz, time) -> (batch, channel, time, Hz)
        x = x.transpose(3, 2)

        # (batch, unit, time, Hz), then average the Hz axis away
        features = F.relu(self.resnet_head(x))
        features = features.mean(dim=3)  # (batch, unit, time)

        xa = self.l8_a(features)  # (batch, n_class, time)
        xb = torch.softmax(self.l8_b(features), dim=2)  # (batch, n_class, time)

        pseudo_label = (xa.sigmoid() >= 0.5).float()
        clipwise_preds = (xa * xb).sum(dim=2)
        attention_preds = xb

        return pseudo_label, clipwise_preds, attention_preds

### 実行パラメーター

In [None]:
# Experiment name and the Drive folder checkpoints are written to.
EXP_NAME = "exp0001_first_sample"
OUTPUT = f"./drive/MyDrive/Study/RFCX/output/{EXP_NAME}"
# Create the output folder from the OUTPUT variable. (The previous shell line
# interpolated `config.OUTPUT`: `config` did not exist yet at that point and has
# no OUTPUT attribute, so the folder was never created.)
os.makedirs(OUTPUT, exist_ok=True)

# The run must be initialised first: recent wandb releases raise
# "You must call wandb.init() before wandb.config.<key>" otherwise, and
# wandb.config only refers to the run's real config object after init.
wandb.init(project="rfcx")
config = wandb.config

# Hyper-parameters (logged to W&B through the config object).
config.seed = 416
config.learning_rate = 1e-3
config.batch_size = 64
config.num_epochs = 50
#config.t_max = 10
config.fold = 0
config.alpha = 0.1  # Beta(alpha, alpha) parameter for the mixup ratio

### 実行

In [None]:
# Seed all RNGs before the loaders and the model are created.
set_seed(config.seed)

train_data_loader, valid_data_loader = get_data_loader()

model = BirdcallNet()
model.to(device)
optimizer = Adam(model.parameters(), lr=config.learning_rate)
# No LR schedule in this experiment; train_loop skips scheduler.step() when it is None.
#scheduler = CosineAnnealingLR(optimizer, T_max=len(train_data_loader)*config.t_max, eta_min=0.0)
scheduler = None

wandb.watch(model)

# Track the highest validation score seen so far; the checkpoint file is only
# overwritten when the score improves (despite "latest" in its file name).
best_score = 0
for epoch in range(config.num_epochs):
    print(f"### epoch {epoch} ###")
    t_loss, lr = train_loop(train_data_loader, model, optimizer, scheduler)
    v_score, v_loss = valid_loop(valid_data_loader, model)

    if best_score < v_score:
        print("best model update !!!")
        torch.save(model.state_dict(), f"{OUTPUT}/birdcallnet_f{config.fold}_latest_model.bin")
        best_score = v_score
    
    # One W&B log entry per epoch.
    wandb.log({"train loss": t_loss, "lr": lr, "valid loss": v_loss, "valid score": v_score, "best score": best_score})

### 画像確認

In [None]:
# Visual sanity check of the preprocessing on a single training spectrogram.
# 0.7393  (NOTE(review): the meaning of this number is not recorded here)
path = "./train/64e252faf.npy"
mel = np.load(path)

image = mono_to_color(mel)
image = image[:,:200,:]
print(image.shape)

# Draw the two stages side by side; two imshow calls on the same axes would
# leave only the second image visible.
fig, (ax_uint8, ax_tensor) = plt.subplots(1, 2, figsize=(12, 4))
ax_uint8.imshow(image)
ax_uint8.set_title("mono_to_color output")

image = transforms.ToTensor()(image)
ax_tensor.imshow(np.moveaxis((image).numpy(), 0, 2))
ax_tensor.set_title("after ToTensor")

### モデルの挙動確認

In [None]:
# Pull a single training batch (X, y) for interactive inspection.
train_data_loader, valid_data_loader = get_data_loader()
# next(iter(...)) fetches exactly one batch; the previous for/break loop did the
# same but left a progress bar stuck at its first step.
X, y = next(iter(train_data_loader))