In [None]:
# Show which GPU (model, driver, memory use) is attached to this runtime.
!nvidia-smi

In [None]:
# Mount Google Drive at /content/drive. The Kaggle token and the checkpoint
# directories referenced later in this notebook live under "My Drive".
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the Kaggle CLI and copy the API token from Drive into /root/.kaggle
# (presumably the CLI's default credentials location for this runtime).
!pip install -q kaggle
!mkdir -p .kaggle
!cp "./drive/My Drive/Study/config/kaggle.json" .kaggle/
# Restrict the token file to owner read/write.
!chmod 600 .kaggle/kaggle.json
!mv .kaggle /root

# Disabled alternative: a different spectrogram dataset plus an extra
# "0_nocall" class folder copied from Drive.
#!kaggle datasets download "birdcall-spectrogram-images-cut-multi"
#!unzip birdcall-spectrogram-images-cut-multi.zip > /dev/null
#!rm -rf birdcall-spectrogram-images-cut-multi.zip
#!cp -r "./drive/My Drive/Study/Bird/input/nocall_20200824" train_img_2/0_nocall

# Download the spectrogram image dataset, unpack it quietly, and delete the archive.
!kaggle datasets download "birdcall-spectrogram-images"
!unzip birdcall-spectrogram-images.zip > /dev/null
!rm -rf birdcall-spectrogram-images.zip

# Provides the panns_inference helpers imported in the next cell.
!pip install panns-inference

In [None]:
import numpy as np
import pandas as pd
import os
import tqdm
import random
import time

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import Adam, AdamW, SGD
from torchvision.models import densenet161
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts, CyclicLR

import matplotlib.pyplot as plt

from sklearn.metrics import f1_score, average_precision_score
from sklearn.model_selection import StratifiedKFold

from contextlib import contextmanager
from typing import Optional
import logging
from numpy.random import beta
from PIL import Image

from panns_inference.pytorch_utils import interpolate, pad_framewise_output

# Every model and batch below is moved to this device, so a CUDA GPU is required.
device = torch.device('cuda')

In [None]:
def set_seed(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED``, and puts cuDNN into its reproducible
    configuration.

    Args:
        seed: Value used for every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # The cuDNN auto-tuner may pick a different convolution algorithm on each
    # run, which defeats seeding; it has to be off for reproducible results.
    torch.backends.cudnn.benchmark = False  # type: ignore

class config:
    # Run-wide settings read by the data, model and training cells.

    # -- reproducibility / cross-validation --
    SEED = 416        # base seed; also the StratifiedKFold random_state
    N_FOLDS = 5       # number of stratified folds
    FOLD = 0          # fold used for validation in this run

    # -- locations --
    INPUT = "./train_img_8"                                            # ImageFolder root
    OUTPUT = "./drive/My Drive/Study/Bird/output/from_densenet161_12"  # checkpoints + score log

    # -- model --
    PRETRAINED = True  # passed to densenet161(pretrained=...)
    N_LABEL = 264      # output channels of the classification heads

    # -- data loading --
    BS = 64            # batch size (256 // 4)
    WORKS = 0          # DataLoader num_workers

    # -- training schedule --
    INITIAL_EPOCH = 21  # first epoch of the loop; non-zero resumes from the "latest" checkpoint
    EPOCHS = 55         # exclusive upper bound of the epoch loop
    T_MAX = 10          # cosine-annealing period, in epochs
    ALPHA = 0.2         # Beta(ALPHA, ALPHA) parameter used to draw the mixup weight

# Create the output directory (including parents) if it does not exist yet.
!mkdir -p "{config.OUTPUT}"

In [None]:
class FreqMask:
    """Random band masking along dimension 1 of a spectrogram tensor.

    Each call works on a clone of the input and overwrites up to
    ``num_masks`` randomly placed bands ``[:, start:end]`` either with zero
    or with the current mean of the (partially masked) clone.

    Args:
        F: Exclusive upper bound for the drawn band-width limit.
        num_masks: Number of bands to attempt per call.
        replace_with_zero: Fill with 0 when True, otherwise with the mean.
    """

    def __init__(self, F=30, num_masks=1, replace_with_zero=True):
        self.F = F
        self.num_masks = num_masks
        self.replace_with_zero = replace_with_zero

    def __call__(self, spec):
        """Return a masked copy of ``spec``; the input itself is not modified."""
        cloned = spec.clone()
        num_mel_channels = cloned.shape[1]

        # Cap the width limit at the number of rows so the start-index draw
        # below never receives an empty range (it used to raise ValueError
        # whenever the drawn width reached num_mel_channels).
        max_width = min(self.F, num_mel_channels)
        if max_width <= 0:
            return cloned

        for _ in range(self.num_masks):
            f = random.randrange(0, max_width)
            f_zero = random.randrange(0, num_mel_channels - f)

            # A zero width would give randrange an empty range. Skip only this
            # mask; returning here would silently drop the remaining masks.
            if f == 0:
                continue

            mask_end = random.randrange(f_zero, f_zero + f)
            if self.replace_with_zero:
                cloned[:, f_zero:mask_end] = 0
            else:
                cloned[:, f_zero:mask_end] = cloned.mean()

        return cloned

def get_dataloder():
    """Build the training and validation loaders for fold ``config.FOLD``.

    Two ``ImageFolder`` views of ``config.INPUT`` are created (one with the
    augmenting transform, one with the plain transform) and each is
    restricted to its side of a stratified K-fold split.

    Returns:
        (train_data_loader, valid_data_loader)
    """
    crop_size = (128, 313)
    # Standard ImageNet channel statistics.
    norm_mean = (0.485, 0.456, 0.406)
    norm_std = (0.229, 0.224, 0.225)

    def _darken(img):
        return transforms.functional.adjust_gamma(img, gamma=2, gain=1)

    train_transform = transforms.Compose([
        transforms.RandomCrop(crop_size, pad_if_needed=True, padding_mode="constant"),
        transforms.RandomApply([transforms.Lambda(_darken)], p=0.5),
        transforms.ToTensor(),
        transforms.Normalize(norm_mean, norm_std),
        transforms.RandomApply([FreqMask(replace_with_zero=False)], p=0.5),
    ])
    valid_transform = transforms.Compose([
        transforms.CenterCrop(crop_size),
        transforms.ToTensor(),
        transforms.Normalize(norm_mean, norm_std),
    ])

    train_folder = datasets.ImageFolder(root=config.INPUT, transform=train_transform)
    valid_folder = datasets.ImageFolder(root=config.INPUT, transform=valid_transform)

    # Stratify on the class index; the same seed always yields the same folds.
    splitter = StratifiedKFold(n_splits=config.N_FOLDS, shuffle=True, random_state=config.SEED)
    labels = train_folder.targets
    all_folds = list(splitter.split(labels, labels))
    trn_idx, val_idx = all_folds[config.FOLD]

    train_subset = torch.utils.data.Subset(train_folder, trn_idx)
    valid_subset = torch.utils.data.Subset(valid_folder, val_idx)

    train_data_loader = torch.utils.data.DataLoader(
        train_subset, batch_size=config.BS, shuffle=True, num_workers=config.WORKS)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_subset, batch_size=config.BS, shuffle=False, num_workers=config.WORKS)

    return train_data_loader, valid_data_loader

# Pull one training batch and display its first image as a sanity check.
data_loader, _ = get_dataloder()
d = next(iter(data_loader))
img = d[0][0]
# Channels-first -> channels-last, which is what matplotlib expects.
plt.imshow(np.transpose(img.numpy(), (1, 2, 0)))

In [None]:
class FocalLoss(nn.Module):
    """Binary focal loss computed from raw logits.

    The element-wise binary cross-entropy is scaled by
    ``exp(gamma * logsigmoid(-logit * (2 * target - 1)))``. For 2-D input the
    loss is summed over dimension 1 before the mean is taken.
    """

    def __init__(self, gamma=2):
        super().__init__()
        self.gamma = gamma

    def forward(self, logit, target):
        target = target.float()

        # Numerically stable binary cross-entropy with logits.
        neg_part = (-logit).clamp(min=0)
        log_term = ((-neg_part).exp() + (-logit - neg_part).exp()).log()
        bce = logit - logit * target + neg_part + log_term

        # Modulating factor: small for confidently correct elements.
        signed_target = target * 2.0 - 1.0
        log_weight = F.logsigmoid(-logit * signed_target)
        focal = (log_weight * self.gamma).exp() * bce

        if focal.dim() == 2:
            focal = focal.sum(dim=1)
        return focal.mean()


def mixup(input, target, gamma):
    """Blend each sample with a randomly chosen partner from the same batch.

    Args:
        input: Batch tensor; dimension 0 indexes samples.
        target: One-hot (or soft) label tensor aligned with ``input``.
        gamma: Weight of the original sample; the partner gets ``1 - gamma``.

    Returns:
        ``(mixed_input, mixed_target)`` as new tensors. The arguments are
        left untouched (the earlier in-place version overwrote the caller's
        batch and labels).
    """
    perm = torch.randperm(input.size(0))
    weight = float(gamma)
    # `add(other, alpha=...)` replaces the deprecated `add_(scalar, tensor)`
    # positional overload.
    mixed_input = input.mul(weight).add(input[perm], alpha=1.0 - weight)
    mixed_target = target.mul(weight).add(target[perm], alpha=1.0 - weight)
    return mixed_input, mixed_target


class BirdcallNet(nn.Module):
    """DenseNet161 feature extractor with a classification head and three
    averaged attention heads over the time axis."""

    def __init__(self):
        super().__init__()
        backbone = densenet161(pretrained=config.PRETRAINED)
        self.features = backbone.features

        # 1x1 convolutions over time: one classification head ...
        self.l8_a = nn.Conv1d(2208, config.N_LABEL, 1, bias=False)
        # ... and three attention heads whose softmax outputs are averaged.
        self.l8_bs = nn.ModuleList(
            [nn.Conv1d(2208, config.N_LABEL, 1, bias=False) for _ in range(3)]
        )

    def forward(self, x, perm=None, gamma=None):
        """Run the network on a batch.

        Args:
            x: Input of shape (batch, channel, Hz, time).
            perm, gamma: Accepted but not used.

        Returns:
            clipwise_preds: Attention-weighted sum of the classification
                head over time, one value per class.
            attention_preds: The averaged attention weights.
            pseudo_label: 0/1 tensor marking where the sigmoid of the
                classification head is at least 0.5.
        """
        swapped = x.transpose(3, 2)            # (batch, channel, time, Hz)
        feat = self.features(swapped)          # (batch, unit, time, Hz)
        feat = F.relu(feat, inplace=True)
        feat = feat.mean(dim=3)                # (batch, unit, time)

        xa = self.l8_a(feat)                   # (batch, n_class, time)

        # Softmax over time for every attention head, then average the heads.
        head_weights = [torch.softmax(head(feat), dim=2) for head in self.l8_bs]
        xb = torch.stack(head_weights).sum(0) / len(self.l8_bs)

        pseudo_label = (xa.sigmoid() >= 0.5).float()
        clipwise_preds = (xa * xb).sum(dim=2)

        return clipwise_preds, xb, pseudo_label

In [None]:
def train_fn(train_data_loader, model, optimizer, scheduler=None):
    """Train ``model`` for one pass over ``train_data_loader``.

    Every batch is one-hot encoded (column 0 is dropped), mixed with
    ``mixup`` using a weight drawn from Beta(ALPHA, ALPHA), and optimised on
    the sum of a clip-level loss and an attention loss against the model's
    own pseudo labels. The scheduler, when given, is stepped once per batch.

    Args:
        train_data_loader: Iterable of ``(X, y)`` batches with integer labels.
        model: Network returning ``(clipwise, attention, pseudo_label)``.
        optimizer: Optimizer over the model parameters.
        scheduler: Optional per-step learning-rate scheduler.

    Returns:
        ``(mean_loss, lrs)`` where ``lrs`` holds, for every step, the
        learning rate averaged over the optimizer's parameter groups.
    """
    # A bare `import tqdm` does not load the `notebook` submodule, so
    # `tqdm.notebook.tqdm` only works if something else imported it first.
    from tqdm.notebook import tqdm as notebook_tqdm

    # The loss module is stateless; build it once instead of once per step.
    criterion = nn.BCEWithLogitsLoss()

    losses, lrs = [], []
    model.train()
    progress = notebook_tqdm(train_data_loader, total=len(train_data_loader))
    for X, y in progress:
        # Label k maps to column k-1; label 0 becomes an all-zero row.
        y_onehot = torch.eye(config.N_LABEL+1)[y][:, 1:]

        b = beta(config.ALPHA, config.ALPHA)
        _X, _y = mixup(X, y_onehot, b)

        clipwise_preds, attention_preds, pseudo_label = model(_X.to(device))
        loss1 = criterion(clipwise_preds, _y.to(device))
        # NOTE(review): attention_preds is a softmax output, yet it is fed to
        # a with-logits loss. Kept as in the original experiment; confirm intent.
        loss2 = criterion(attention_preds, pseudo_label)
        loss = loss1 + loss2

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        losses.append(loss.item())
        lrs.append(np.array([group["lr"] for group in optimizer.param_groups]).mean())

    return sum(losses)/len(losses), lrs


def get_single_label_from_multi_predict(y, y_pred, threshould):
    """Collapse multi-label scores into one predicted class per sample.

    For each row the prediction is the true label when at least two classes
    reach ``threshould`` and the true label is one of them; otherwise it is
    the arg-max class.

    Args:
        y: 1-D tensor of true class indices (values outside the class range,
            e.g. -1, never match).
        y_pred: 2-D CPU tensor of per-class scores, one row per sample.
        threshould: Score a class needs to count as predicted.

    Returns:
        1-D int64 NumPy array with one class index per row of ``y_pred``.
    """
    picked = []
    for idx, scores in enumerate(y_pred):
        above = scores >= threshould
        truth = int(y[idx])
        # Tensor reduction instead of Python's sum(), which iterated the
        # score vector element by element for every sample.
        several_above = int(above.sum()) >= 2
        # Range check first: a negative index would otherwise wrap around.
        truth_above = 0 <= truth < above.shape[0] and bool(above[truth])
        if several_above and truth_above:
            picked.append(truth)
        else:
            picked.append(int(scores.argmax()))
    return np.array(picked, dtype=np.int64)


def valid_fn(valid_data_loader, model, threshould=0.5):
    """Evaluate ``model`` on ``valid_data_loader``.

    Args:
        valid_data_loader: Iterable of ``(X, y)`` batches with integer labels.
        model: Network whose first output is the clip-level logits.
        threshould: Threshold used for the "score a" label selection.

    Returns:
        ``(f1_a, f1_b, loss, mAP)``:
        f1_a is the batch-averaged micro F1 of the labels chosen by
        ``get_single_label_from_multi_predict``; f1_b is the batch-averaged
        micro F1 of the arg-max class; loss is the batch-averaged BCE loss;
        mAP is the mean of the per-class average precision over the whole
        set, with undefined (NaN) classes counted as 0.
    """
    # A bare `import tqdm` does not load the `notebook` submodule, so
    # `tqdm.notebook.tqdm` only works if something else imported it first.
    from tqdm.notebook import tqdm as notebook_tqdm

    criterion = nn.BCEWithLogitsLoss()
    losses, f1_lst_a, f1_lst_b, mAP_lst = [], [], [], []
    model.eval()
    progress = notebook_tqdm(valid_data_loader, total=len(valid_data_loader))
    for X, y in progress:
        with torch.no_grad():
            y_pred, _, _ = model(X.to(device))

        # Label k maps to column k-1; label 0 becomes an all-zero row.
        _y = torch.eye(config.N_LABEL+1)[y][:, 1:]
        loss = criterion(y_pred, _y.to(device))
        losses.append(loss.item())

        # Computed once and reused for both the label selection and the mAP.
        probs = y_pred.sigmoid().cpu()
        # The model's class indices are shifted by one relative to the labels.
        y_pred_a = get_single_label_from_multi_predict(y-1, probs, threshould)
        y_pred_b = y_pred.argmax(1).cpu()

        f1_lst_a.append(f1_score(y-1, y_pred_a, average="micro"))
        f1_lst_b.append(f1_score(y-1, y_pred_b, average="micro"))
        mAP_lst.append((probs.numpy(), _y.numpy()))

    all_true = np.vstack([pair[1] for pair in mAP_lst])
    all_prob = np.vstack([pair[0] for pair in mAP_lst])
    mAP = average_precision_score(all_true, all_prob, average=None)
    mAP = np.nan_to_num(mAP).mean()

    return sum(f1_lst_a)/len(f1_lst_a), sum(f1_lst_b)/len(f1_lst_b), sum(losses)/len(losses), mAP

In [None]:
# Train (or resume training of) the model for the configured fold.
print(f"### Fold-{config.FOLD} ###")

# Use a different seed for every fold.
set_seed(config.SEED+config.FOLD)

train_data_loader, valid_data_loader = get_dataloder()
model = BirdcallNet()
model.to(device)

optimizer = Adam(model.parameters(), lr=1e-3)
# train_fn steps the scheduler once per batch, so T_max is expressed in
# iterations: config.T_MAX epochs' worth of batches.
scheduler = CosineAnnealingLR(optimizer, T_max=len(train_data_loader)*config.T_MAX, eta_min=0.0)

if config.INITIAL_EPOCH == 0:
    # Fresh run: empty history and worst-possible "best" values.
    best_loss, best_score_a, best_score_b, best_mAP = 9999, 0, 0, 0
    trn_losses, trn_lrs, val_losses, val_scores_a, val_scores_b, mAP_scores = [], [], [], [], [], []
else:
    # Resume: restore the latest model/optimizer/scheduler state and the score log.
    model.load_state_dict(torch.load(f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_latest_model.bin"))
    optimizer.load_state_dict(torch.load(f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_latest_optimizer.bin"))
    scheduler.load_state_dict(torch.load(f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_latest_scheduler.bin"))
    log_df = pd.read_csv(f"{config.OUTPUT}/valid_f1_fold{config.FOLD}_score.csv")
    # The CSV is written with index=True below, so the first column is the
    # row index and is discarded here.
    _, trn_losses, val_losses, val_scores_a, val_scores_b, mAP_scores = log_df.values.T.tolist()
    best_loss, best_score_a, best_score_b, best_mAP = min(val_losses), max(val_scores_a), max(val_scores_b), max(mAP_scores)
    # Learning rates are not stored in the log, so their history restarts empty.
    trn_lrs = []

for epoch in range(config.INITIAL_EPOCH, config.EPOCHS):
    print(f"{epoch} epoch")
    
    tloss, lrs = train_fn(train_data_loader, model, optimizer, scheduler)
    val_f1_a, val_f1_b, vloss, mAP = valid_fn(valid_data_loader, model)

    # save best score model
    # (ties also overwrite the checkpoint because the comparison is inclusive)
    if best_score_a <= val_f1_a:
        best_score_a = val_f1_a
        torch.save(model.state_dict(), f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_best_score_a.bin")
        print(f"Best Score A Update!!! -> {best_score_a}")

    # save best score model
    if best_score_b <= val_f1_b:
        best_score_b = val_f1_b
        torch.save(model.state_dict(), f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_best_score_b.bin")
        print(f"Best Score B Update!!! -> {best_score_b}")

    # save best loss model
    if best_loss >= vloss:
        best_loss = vloss
        torch.save(model.state_dict(), f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_best_loss.bin")
        print(f"Best Loss Update!!! -> {best_loss}")

    # save best mAP model
    if best_mAP <= mAP:
        best_mAP = mAP
        torch.save(model.state_dict(), f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_best_mAP.bin")
        print(f"Best mAP Update!!! -> {best_mAP}")

    # Always refresh the "latest" checkpoint trio; this is what the resume
    # branch above reads.
    torch.save(model.state_dict(), f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_latest_model.bin")
    torch.save(optimizer.state_dict(), f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_latest_optimizer.bin")
    torch.save(scheduler.state_dict(), f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_latest_scheduler.bin")

    # save training logs
    trn_losses.append(tloss)
    val_losses.append(vloss)
    val_scores_a.append(val_f1_a)
    val_scores_b.append(val_f1_b)
    mAP_scores.append(mAP)
    trn_lrs.extend(lrs)

    # Rewrite the whole score log every epoch (one row per epoch so far).
    log_df = pd.DataFrame(zip(trn_losses, val_losses, val_scores_a, val_scores_b, mAP_scores), columns=["train loss", "valid loss", "score_a", "score_b", "mAP"])
    log_df.to_csv(f"{config.OUTPUT}/valid_f1_fold{config.FOLD}_score.csv", index=True)

In [None]:
# Report the best validation results, then plot the recorded curves.
print(f"Best Score　A: {best_score_a} / Best Score　B: {best_score_b} / Best Loss: {best_loss} / Best mAP: {best_mAP}")

# One figure each: learning rate per step, score A per epoch, score B per epoch.
for curve in (trn_lrs, val_scores_a, val_scores_b):
    plt.plot(curve)
    plt.show()

# Training and validation loss share one figure.
plt.plot(trn_losses)
plt.plot(val_losses)
plt.show()

## Check Attention

In [None]:
def interpolate(x: torch.Tensor, ratio: int):
    """Upsample along the time axis by repeating every time step.

    Each frame is duplicated ``ratio`` times in place, which compensates for
    the time resolution lost to CNN downsampling.

    Args:
      x: (batch_size, time_steps, classes_num)
      ratio: int, number of copies made of each time step
    Returns:
      upsampled: (batch_size, time_steps * ratio, classes_num)
    """
    batch_size, time_steps, classes_num = x.shape

    # Insert a repeat axis right after time, tile it, then fold it into time.
    tiled = x.unsqueeze(2).repeat(1, 1, ratio, 1)
    return tiled.view(batch_size, time_steps * ratio, classes_num)

def pad_framewise_output(framewise_output: torch.Tensor, frames_num: int):
    """Extend a frame-wise output to ``frames_num`` frames.

    The missing frames are filled with copies of the last frame.

    Args:
      framewise_output: (batch_size, frames, classes_num)
      frames_num: int, number of frames the result should have
    Outputs:
      output: (batch_size, frames_num, classes_num)
    """
    missing = frames_num - framewise_output.shape[1]

    # Copies of the final frame, one per missing position.
    tail = framewise_output[:, -1:, :].repeat(1, missing, 1)

    # (batch_size, frames_num, classes_num)
    return torch.cat((framewise_output, tail), dim=1)

# Uncomment to rebuild the network when this cell is run without the
# training cell above.
#model = BirdcallNet()
#model.to(device)
model.load_state_dict(torch.load(f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_best_score_a.bin"))

# Inspection only: force eval mode and skip autograd so the forward pass
# neither updates batch-norm statistics nor keeps a graph alive.
model.eval()
with torch.no_grad():
    # Only the first batch of the preview loader is needed.
    for X, y in data_loader:
        outout = model(X.to(device))
        logits = outout[0].sigmoid().detach().cpu()   # clip-level probabilities
        seq_preds = outout[1].detach().cpu()          # attention weights over time
        break

In [None]:
# Inspect a single sample of the batch fetched in the previous cell.
idx = 2
img = X[idx]
print("target:", y[idx] - 1, "/ pred:", logits[idx].argmax())
plt.imshow(np.transpose(img.numpy(), (1, 2, 0)))
plt.show()

# Stretch the attention map back to the 313-frame input width: repeat every
# time step 34 times, then pad with the last frame.
# NOTE(review): 34 presumably approximates the backbone's time downsampling - confirm.
_seq_preds = pad_framewise_output(
    interpolate(seq_preds.transpose(2, 1), 34),
    313,
).transpose(2, 1)
plt.imshow(_seq_preds[idx], aspect=0.25)
plt.show()

print("target")
plt.plot(seq_preds[idx, y[idx] - 1, :])
plt.show()

print("predict")
plt.plot(seq_preds[idx, logits[idx].argmax() , :])

## SWA

In [None]:
from torch.optim.swa_utils import AveragedModel, SWALR

class SWADataSet(torch.utils.data.Dataset):
    """Dataset over ``(image_path, class_index)`` pairs that yields tensors
    already placed on ``device``.

    Args:
        samples: Sequence of ``(path, class_index)`` tuples.
        transform: Callable applied to the loaded RGB PIL image.
    """

    def __init__(self, samples, transform):
        self.samples = samples
        self.transform = transform

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(image_tensor, onehot_label)`` for sample ``idx``."""
        # Read from the instance's own sample list. The previous version
        # indexed the notebook-level `train_samples`, ignoring whatever was
        # passed to the constructor.
        im, t = self.samples[idx]
        image = Image.open(im).convert("RGB")
        # Label k maps to position k-1; label 0 becomes an all-zero vector.
        onehot = torch.eye(config.N_LABEL+1)[t][1:]
        tensor_image = self.transform(image)
        return tensor_image.to(device), onehot.to(device)

# Directory the SWA cell loads its starting checkpoint from and writes the
# averaged model to. Note that it is a different run folder than config.OUTPUT.
BEST_MODEL = "./drive/My Drive/Study/Bird/output/from_densenet161_00"

In [None]:
# Stochastic weight averaging: fine-tune the best checkpoint with SGD and
# average the weights reached at the end of every epoch.
set_seed(config.SEED+config.FOLD)

train_data_loader, valid_data_loader = get_dataloder()
model = BirdcallNet()
model.to(device)

model.load_state_dict(torch.load(f"{BEST_MODEL}/birdcallnet_f{config.FOLD}_best_score_a.bin"))

optimizer = SGD(model.parameters(), lr=1e-3)

swa_model = AveragedModel(model)
#swa_scheduler = SWALR(optimizer, swa_lr=1e-4)
#swa_scheduler = SWALR(optimizer, anneal_strategy="linear", anneal_epochs=5, swa_lr=1e-4)

#scheduler = None
# train_fn steps the scheduler once per batch, so the learning rate restarts
# every half epoch and anneals from 1e-3 down to 5e-4.
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=len(train_data_loader)//2, T_mult=1, eta_min=5e-4)


trn_losses, trn_lrs, val_log = [], [], []
for epoch in range(10):
    tloss, lrs = train_fn(train_data_loader, model, optimizer, scheduler)
    # Fold this epoch's weights into the running average.
    swa_model.update_parameters(model)
    #swa_scheduler.step()

    # These metrics describe the current (non-averaged) model.
    val_f1_a, val_f1_b, vloss, mAP = valid_fn(valid_data_loader, model)
    val_log.append([val_f1_a, val_f1_b, vloss, mAP])
    print(f"{epoch} epoch: score_a={val_f1_a}, score_b={val_f1_b}, valid_loss={vloss}, mAP={mAP}")

    trn_losses.append(tloss)
    trn_lrs.extend(lrs)

plt.plot(trn_losses);plt.show()
plt.plot(trn_lrs);plt.show()
display(pd.DataFrame(val_log, columns=["val_f1_a", "val_f1_b", "vloss", "mAP"]))

# Recover the training fold's (path, label) pairs and its transform from the
# Subset-wrapped ImageFolder.
train_samples = [train_data_loader.dataset.dataset.samples[i] for i in train_data_loader.dataset.indices]
train_transform = train_data_loader.dataset.dataset.transform

# SWADataSet returns tensors that are already on `device`; update_bn is called
# without a device argument below, so it presumably relies on that - confirm.
swa_dataset = SWADataSet(train_samples, train_transform)
swa_data_loader = torch.utils.data.DataLoader(swa_dataset, batch_size=config.BS, shuffle=True, num_workers=config.WORKS)

# Averaged weights need freshly computed batch-norm statistics.
print("update bn")
torch.optim.swa_utils.update_bn(swa_data_loader, swa_model)

val_f1_a, val_f1_b, vloss, mAP = valid_fn(valid_data_loader, swa_model)
print(f"SWA: score_a={val_f1_a}, score_b={val_f1_b}, valid_loss={vloss}, mAP={mAP}")

# NOTE(review): this saves the AveragedModel wrapper's state_dict, not the
# inner BirdcallNet's - confirm that whatever loads this file expects that layout.
torch.save(swa_model.state_dict(), f"{BEST_MODEL}/birdcallnet_f{config.FOLD}_swa_model_cos_harf_lr.bin")