In [None]:
# Report the NVIDIA GPU(s) visible to this runtime before doing any work.
!nvidia-smi

In [None]:
# Mount Google Drive: later cells copy the Kaggle token and extra images from
# "./drive/My Drive/..." and write checkpoints/logs back to it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the Kaggle CLI and move the API token copied from Drive to /root/.kaggle
# (owner-only permissions).
!pip install -q kaggle
!mkdir -p .kaggle
!cp "./drive/My Drive/Study/config/kaggle.json" .kaggle/
!chmod 600 .kaggle/kaggle.json
!mv .kaggle /root

# Download the spectrogram image dataset, unpack it quietly, and delete the archive.
!kaggle datasets download "birdcall-spectrogram-images-cut-multi"
!unzip birdcall-spectrogram-images-cut-multi.zip > /dev/null
!rm -rf birdcall-spectrogram-images-cut-multi.zip

# Provides `panns_inference.pytorch_utils`, which the imports cell pulls in.
!pip install panns-inference

# Add the "nocall" images from Drive to the dataset as the `0_nocall` class folder.
# NOTE(review): presumably the `0_` prefix makes this folder sort first so it gets
# class index 0 (the training code drops index 0 from the one-hot targets) — confirm.
!cp -r "./drive/My Drive/Study/Bird/input/nocall_20200824" train_img_2/0_nocall

In [None]:
import numpy as np
import pandas as pd
import os
import tqdm
import random
import time

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import Adam, AdamW, SGD
from torchvision.models import resnet18, resnet50, densenet121, densenet161
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import CosineAnnealingLR

import matplotlib.pyplot as plt

from sklearn.metrics import f1_score, average_precision_score
from sklearn.model_selection import StratifiedKFold

from contextlib import contextmanager
from typing import Optional
import logging
from numpy.random import beta

from panns_inference.pytorch_utils import interpolate, pad_framewise_output

# Prefer the GPU, but fall back to the CPU so the notebook still runs (slowly)
# on a runtime without CUDA instead of failing at the first `.to(device)`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def set_seed(seed: int = 42):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and puts cuDNN into deterministic mode.

    Args:
        seed: Value used for all generators.

    Note:
        ``PYTHONHASHSEED`` is exported for child processes; it cannot change
        string hashing in the interpreter that is already running.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Silently ignored when CUDA is unavailable; covers every visible GPU.
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # The cuDNN auto-tuner may pick a different convolution algorithm on each
    # run, which defeats the deterministic flag above, so it must be off.
    torch.backends.cudnn.benchmark = False  # type: ignore

class config:
    """Paths and hyper-parameters shared by every cell of the notebook."""

    # --- reproducibility / cross-validation ---
    SEED = 416
    N_FOLDS = 5
    FOLD = 0              # which stratified split is held out for validation

    # --- data locations ---
    INPUT = "./train_img_2"
    OUTPUT = "./drive/My Drive/Study/Bird/output/from_resnet18_57"

    # --- model ---
    PRETRAINED = True     # forwarded to the torchvision backbone constructor
    N_LABEL = 264         # number of output classes of the network

    # --- data loading ---
    BS = 64               # batch size
    WORKS = 0             # DataLoader num_workers

    # --- optimisation schedule ---
    INITIAL_EPOCH = 0     # non-zero resumes from the "latest" checkpoint files
    EPOCHS = 55
    ALPHA = 0.2           # both shape parameters of the Beta draw used for mixup
    T_MAX = 10            # cosine-annealing half period, expressed in epochs

# Create the checkpoint/log directory (config.OUTPUT) if it does not exist yet.
!mkdir -p "{config.OUTPUT}"

In [None]:
"""def make_weights_for_balanced_classes(images, nclasses):                        
    count = [0] * nclasses                                                      
    for item in images:                                                         
        count[item[1]] += 1                                                     
    weight_per_class = [0.] * nclasses                                      
    N = float(sum(count))                                                   
    for i in range(nclasses):                                                   
        weight_per_class[i] = N/float(count[i])                                 
    weight = [0] * len(images)                                              
    for idx, val in enumerate(images):                                          
        weight[idx] = weight_per_class[val[1]]                                  
    return weight  """

class FreqMask:
    """Frequency-masking augmentation for a spectrogram tensor.

    Each application overwrites up to ``num_masks`` randomly placed bands
    along dimension 1 of the input with either zero or the tensor's mean.

    Args:
        F: Exclusive upper bound on the randomly drawn band width.
        num_masks: Number of bands to attempt per call.
        replace_with_zero: Fill with 0 when True, otherwise with the mean of
            the (possibly already masked) tensor.
    """

    def __init__(self, F=30, num_masks=1, replace_with_zero=True):
        self.F = F
        self.num_masks = num_masks
        self.replace_with_zero = replace_with_zero

    def __call__(self, spec):
        """Return a masked copy of ``spec``; the input itself is not modified."""
        cloned = spec.clone()
        num_mel_channels = cloned.shape[1]

        # Never draw a width the input cannot hold: without the cap,
        # `randrange(0, num_mel_channels - f)` raises for F > num_mel_channels.
        max_width = min(self.F, num_mel_channels)
        if max_width <= 0:
            return cloned

        for _ in range(self.num_masks):
            f = random.randrange(0, max_width)
            f_zero = random.randrange(0, num_mel_channels - f)

            # A zero-width draw would make the next randrange fail on an empty
            # range. Skip only this mask, so the remaining ones still apply.
            if f == 0:
                continue

            mask_end = random.randrange(f_zero, f_zero + f)
            if self.replace_with_zero:
                cloned[:, f_zero:mask_end] = 0
            else:
                cloned[:, f_zero:mask_end] = cloned.mean()

        return cloned

def get_dataloder():
    """Build the training and validation loaders for fold ``config.FOLD``.

    The image folder at ``config.INPUT`` is opened twice, once with the
    augmenting training transform and once with the deterministic validation
    transform. Both views are then restricted to the index sets of one
    stratified K-fold split.

    Returns:
        ``(train_data_loader, valid_data_loader)``; only the training loader
        shuffles.

    Note:
        A class-balanced ``WeightedRandomSampler`` and multi-label targets were
        tried earlier and are currently disabled.
    """
    crop_size = (128, 313)
    norm_mean = (0.485, 0.456, 0.406)
    norm_std = (0.229, 0.224, 0.225)

    def _darken(img):
        # Gamma correction, applied to the image before tensor conversion.
        return transforms.functional.adjust_gamma(img, gamma=2, gain=1)

    train_transform = transforms.Compose([
        transforms.RandomCrop(crop_size, pad_if_needed=True, padding_mode="constant"),
        transforms.RandomApply([transforms.Lambda(_darken)], p=0.5),
        transforms.ToTensor(),
        transforms.Normalize(norm_mean, norm_std),
        transforms.RandomApply([FreqMask(replace_with_zero=False)], p=0.5),
    ])
    valid_transform = transforms.Compose([
        transforms.CenterCrop(crop_size),
        transforms.ToTensor(),
        transforms.Normalize(norm_mean, norm_std),
    ])

    # Same files, two different preprocessing pipelines.
    full_train = datasets.ImageFolder(root=config.INPUT, transform=train_transform)
    full_valid = datasets.ImageFolder(root=config.INPUT, transform=valid_transform)

    # Stratify on the class index; the labels double as the dummy feature array.
    labels = full_train.targets
    splitter = StratifiedKFold(n_splits=config.N_FOLDS, shuffle=True, random_state=config.SEED)
    trn_idx, val_idx = list(splitter.split(labels, labels))[config.FOLD]

    train_subset = torch.utils.data.Subset(full_train, trn_idx)
    valid_subset = torch.utils.data.Subset(full_valid, val_idx)

    shared = dict(batch_size=config.BS, num_workers=config.WORKS)
    train_data_loader = torch.utils.data.DataLoader(train_subset, shuffle=True, **shared)
    valid_data_loader = torch.utils.data.DataLoader(valid_subset, shuffle=False, **shared)

    return train_data_loader, valid_data_loader

# Sanity check: pull one augmented training batch and display its first image.
# The tensor is already normalised, so the colours shown are not the raw pixels.
data_loader, _ = get_dataloder()
d = next(iter(data_loader))
img = d[0][0]
plt.imshow(np.transpose(img.numpy(), (1, 2, 0)))  # channels-first -> channels-last

In [None]:
"""def loss_fn(output, target, onehot):
    loss1 = nn.BCEWithLogitsLoss()(output["clipwise_output"], target)
    pool, _ = output["framewise_output"].max(1)
    #onehot = onehot * (1 - 0.2) + (0.2/config.N_LABEL)
    loss2 = nn.BCEWithLogitsLoss()(pool, onehot)
    loss = loss1 + loss2*0.5
    return loss"""

def loss_fn(output, target):
    """Mean binary cross-entropy between raw logits and multi-hot targets.

    Args:
        output: Unnormalised scores (logits).
        target: Float tensor of the same shape holding the target probabilities.

    Returns:
        Scalar loss tensor.
    """
    criterion = nn.BCEWithLogitsLoss()
    return criterion(output, target)

def mixup(input, target, gamma):
    """Blend every sample (and its target) with a random partner from the batch.

    Computes ``gamma * x + (1 - gamma) * x[perm]`` for inputs and targets using
    the same random permutation ``perm`` of the batch dimension.

    Args:
        input: Batch tensor; dimension 0 is the batch dimension.
        target: Float targets in one-hot (or multi-hot) format, same batch size.
        gamma: Mixing weight given to the original sample.

    Returns:
        ``(mixed_input, mixed_target)``.

    Warning:
        Both tensors are modified in place and the very same objects are
        returned; pass clones if the originals are still needed.
    """
    perm = torch.randperm(input.size(0))
    # Advanced indexing copies, so the partners are unaffected by the in-place
    # scaling of `input` / `target` below.
    perm_input = input[perm]
    perm_target = target[perm]
    # `add_(other, alpha=...)` is the supported signature; the positional
    # `add_(alpha, other)` overload is deprecated.
    mixed_input = input.mul_(gamma).add_(perm_input, alpha=1 - gamma)
    mixed_target = target.mul_(gamma).add_(perm_target, alpha=1 - gamma)
    return mixed_input, mixed_target

def init_layer(layer):
    """Xavier-uniform initialise ``layer.weight`` and zero its bias, if it has one."""
    nn.init.xavier_uniform_(layer.weight)

    bias = getattr(layer, "bias", None)
    if bias is not None:
        bias.data.fill_(0.)

def init_bn(bn):
    """Reset a batch-norm layer's affine parameters to scale 1 and shift 0."""
    bn.weight.data.fill_(1.0)
    bn.bias.data.fill_(0.)

class AttBlock(nn.Module):
    """Attention pooling head over the time axis.

    Two 1x1 convolutions map the input features to per-class, per-time-step
    attention scores (``att``) and class scores (``cla``). The attention
    scores are batch-normalised, squashed, and softmax-normalised over time,
    then used as weights to sum the class scores into one value per class.

    Args:
        in_features: Number of input channels.
        out_features: Number of output classes.
        activation: ``"linear"`` or ``"sigmoid"``; applied to the class scores.
        temperature: Stored on the module but not used by ``forward``.
    """

    def __init__(self,
                 in_features: int,
                 out_features: int,
                 activation="linear",
                 temperature=1.0):
        super().__init__()

        self.activation = activation
        self.temperature = temperature
        self.att = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)
        self.cla = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)

        self.bn_att = nn.BatchNorm1d(out_features)
        self.init_weights()

    def init_weights(self):
        """Xavier-initialise both convolutions and reset the batch norm."""
        init_layer(self.att)
        init_layer(self.cla)
        init_bn(self.bn_att)

    def forward(self, x):
        """Pool ``x`` of shape (n_samples, n_in, n_time) over time.

        Returns:
            ``(pooled, norm_att, cla)``: the attention-weighted sum over
            dimension 2, the attention weights, and the per-step class scores.
        """
        # Squash twice: tanh bounds the normalised scores to (-1, 1); the
        # second, scaled tanh is a soft clamp applied before the softmax.
        norm_att = torch.tanh(self.bn_att(self.att(x)))
        norm_att = torch.tanh(norm_att/10)*10
        norm_att = torch.softmax(norm_att, dim=-1)

        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla

    def nonlinear_transform(self, x):
        """Apply the configured activation to the class scores.

        Raises:
            ValueError: If ``self.activation`` is not ``"linear"`` or
                ``"sigmoid"``. Previously this case returned ``None`` and only
                failed later with an unrelated ``TypeError`` in ``forward``.
        """
        if self.activation == 'linear':
            return x
        if self.activation == 'sigmoid':
            return torch.sigmoid(x)
        raise ValueError(
            f"Unsupported activation {self.activation!r}; expected 'linear' or 'sigmoid'")

class ROI1(nn.Module):
    """Collapse the last axis by averaging, then smooth along the remaining one.

    The output is the sum of a max-pool and an average-pool (window 3,
    stride 1, padding 1) over dimension 2, so its length is unchanged.
    """

    def forward(self, x):
        pooled = x.mean(dim=3)
        window = dict(kernel_size=3, stride=1, padding=1)
        return F.max_pool1d(pooled, **window) + F.avg_pool1d(pooled, **window)

class BirdcallNet(nn.Module):
    """DenseNet-161 feature extractor with a per-time-step attention head.

    Two bias-free 1x1 convolutions turn the backbone features into per-class
    scores (``l8_a``) and per-class attention over time (``l8_b``). The
    clip-level logit of a class is the attention-weighted sum of its scores.

    Earlier variants (ROI pooling, ``AttBlock``, a plain linear classifier)
    were tried and are no longer part of this class.
    """

    def __init__(self):
        super().__init__()
        backbone = densenet161(pretrained=config.PRETRAINED)
        self.features = backbone.features

        n_features = 2208  # channel count the two heads expect from the backbone
        self.l8_a = nn.Conv1d(n_features, config.N_LABEL, 1, bias=False)
        self.l8_b = nn.Conv1d(n_features, config.N_LABEL, 1, bias=False)

    def forward(self, x, perm=None, gamma=None):
        """Score a batch of spectrogram images.

        Args:
            x: Input of shape (batch, channel, Hz, time).
            perm: Unused; accepted for interface compatibility.
            gamma: Unused; accepted for interface compatibility.

        Returns:
            ``(clipwise_preds, attention_preds, pseudo_label)``:
            clip-level logits (batch, n_class), softmax attention over the
            time axis (batch, n_class, time), and the 0/1 mask of time steps
            whose class score has sigmoid >= 0.5.
        """
        # Swap the last two axes so time comes before frequency.
        feats = self.features(x.transpose(3, 2))  # (batch, unit, time, Hz)
        feats = F.relu(feats, inplace=True)
        feats = feats.mean(dim=3)                 # (batch, unit, time)

        class_scores = self.l8_a(feats)           # (batch, n_class, time)
        attention_preds = torch.softmax(self.l8_b(feats), dim=2)

        pseudo_label = (class_scores.sigmoid() >= 0.5).float()
        clipwise_preds = (class_scores * attention_preds).sum(dim=2)

        return clipwise_preds, attention_preds, pseudo_label
"""
class BirdcallNet(nn.Module):
    def __init__(self):
        super(BirdcallNet, self).__init__()
        self.densenet = densenet161(pretrained=config.PRETRAINED)
        self.densenet.classifier = nn.Linear(2208, config.N_LABEL)
        #self.densenet.classifier = nn.Sequential(
        #                            nn.Linear(2208, 1024), nn.ReLU(), nn.Dropout(p=0.2),
        #                            #nn.Linear(1024, 1024), nn.ReLU(), nn.Dropout(p=0.2),
        #                            nn.Linear(1024, config.N_LABEL))

    def forward(self, x):
        return self.densenet(x)"""

In [None]:
def train_fn(train_data_loader, model, optimizer, scheduler=None):
    """Train ``model`` for one epoch with mixup.

    Args:
        train_data_loader: Yields ``(X, y)`` with integer class indices ``y``.
        model: Returns ``(clipwise_preds, attention_preds, pseudo_label)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Optional LR scheduler, also stepped once per batch.

    Returns:
        ``(mean_loss, lrs)``: the average batch loss and the learning rate
        (averaged over parameter groups) recorded after every step.
    """
    # Import the submodule explicitly: a bare `import tqdm` does not load
    # `tqdm.notebook`, so `tqdm.notebook.tqdm` only works when some other
    # package happens to have imported it already.
    from tqdm.notebook import tqdm as notebook_tqdm

    losses, lrs = [], []
    model.train()
    criterion = nn.BCEWithLogitsLoss()

    # Row k is the one-hot encoding of class k with the index-0 column removed,
    # so class 0 maps to an all-zero target. Built once instead of per batch.
    label_lookup = torch.eye(config.N_LABEL + 1)[:, 1:]

    progress = notebook_tqdm(train_data_loader, total=len(train_data_loader))
    for (X, y) in progress:
        y_onehot = label_lookup[y]  # indexing copies, so mixup may edit it in place

        b = beta(config.ALPHA, config.ALPHA)
        _X, _y = mixup(X, y_onehot, b)

        clipwise_preds, attention_preds, pseudo_label = model(_X.to(device))
        loss1 = criterion(clipwise_preds, _y.to(device))
        # NOTE(review): attention_preds is already softmax-normalised by the
        # model, yet it is fed to a logits-based loss — confirm this is intended.
        loss2 = criterion(attention_preds, pseudo_label)
        loss = loss1 + loss2

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        losses.append(loss.item())
        lrs.append(np.array([param_group["lr"] for param_group in optimizer.param_groups]).mean())

    return sum(losses)/len(losses), lrs


def get_single_label_from_multi_predict(y, y_pred, threshould):
    """Reduce multi-label probabilities to one predicted class per sample.

    For each row of ``y_pred``: if at least two classes reach ``threshould``
    and the true label ``y[idx]`` is among them, the true label is returned;
    in every other case the arg-max class is returned.

    Args:
        y: 1-D tensor of true class indices.
        y_pred: 2-D tensor of per-class probabilities (CPU).
        threshould: Probability cut-off for a class to count as predicted.

    Returns:
        NumPy array with one class index per row of ``y_pred``.
    """
    chosen = []
    for idx, probs in enumerate(y_pred):
        above = probs >= threshould
        pick = probs.argmax().numpy()
        if int(above.sum()) >= 2:
            candidates = np.flatnonzero(above.numpy())
            truth = y[idx].numpy()
            if truth in candidates:
                pick = truth
        chosen.append(pick)
    return np.array(chosen)


def valid_fn(valid_data_loader, model, threshould=0.5):
    """Evaluate ``model`` on the validation loader.

    Args:
        valid_data_loader: Yields ``(X, y)`` with integer class indices ``y``.
        model: Returns a tuple whose first element is the clip-level logits.
        threshould: Probability cut-off used for the label-aware score A.

    Returns:
        ``(score_a, score_b, mean_loss, mAP)``: micro-F1 of the label-aware
        prediction and of the plain arg-max (both averaged over batches), the
        mean batch loss, and the mean per-class average precision over the
        whole validation set (classes with an undefined AP count as 0).
    """
    # Import the submodule explicitly: a bare `import tqdm` does not load
    # `tqdm.notebook`, so `tqdm.notebook.tqdm` only works when some other
    # package happens to have imported it already.
    from tqdm.notebook import tqdm as notebook_tqdm

    losses, f1_lst_a, f1_lst_b, mAP_lst = [], [], [], []
    model.eval()

    # Row k is the one-hot encoding of class k with the index-0 column removed,
    # so class 0 maps to an all-zero target. Built once instead of per batch.
    label_lookup = torch.eye(config.N_LABEL + 1)[:, 1:]

    progress = notebook_tqdm(valid_data_loader, total=len(valid_data_loader))
    for (X, y) in progress:
        with torch.no_grad():
            y_pred, _, _ = model(X.to(device))

        _y = label_lookup[y]
        loss = loss_fn(y_pred, _y.to(device))
        losses.append(loss.item())

        probs = y_pred.sigmoid().cpu()
        # Shift labels by one to line up with the logits, which have no column
        # for class 0; class 0 becomes -1 and can never match a prediction.
        shifted_y = y - 1
        y_pred_a = get_single_label_from_multi_predict(shifted_y, probs, threshould)
        y_pred_b = y_pred.argmax(1).cpu()

        f1_lst_a.append(f1_score(shifted_y, y_pred_a, average="micro"))
        f1_lst_b.append(f1_score(shifted_y, y_pred_b, average="micro"))
        mAP_lst.append((probs.numpy(), _y.numpy()))

    mAP = average_precision_score(np.vstack([m[1] for m in mAP_lst]), np.vstack([m[0] for m in mAP_lst]), average=None)
    mAP = np.nan_to_num(mAP).mean()

    return sum(f1_lst_a)/len(f1_lst_a), sum(f1_lst_b)/len(f1_lst_b), sum(losses)/len(losses), mAP

In [None]:
"""print(f"### Fold-{config.FOLD} ###")

set_seed(config.SEED+config.FOLD)

train_data_loader, valid_data_loader = get_dataloder()

model = BirdcallNet()
model.to(device)

for name, param in model.densenet.named_parameters():
    if name.split(".")[0] == "features":
        param.requires_grad = False
    else:
        param.requires_grad = True

optimizer = Adam(model.parameters(), lr=1e-2)

for epoch in range(5):
    print(f"warmup {epoch} epoch")
    tloss, lrs = train_fn(train_data_loader, model, optimizer)
    val_f1_a, val_f1_b, vloss, mAP = valid_fn(valid_data_loader, model)
    print(f"epoch-{epoch}: train loss={tloss}, valid loss={vloss}, score_a={val_f1_a}, score_b={val_f1_b}, mAP={mAP}")

for name, param in model.densenet.named_parameters():
    param.requires_grad = True"""

In [None]:
# Main training run for the fold selected by config.FOLD.
print(f"### Fold-{config.FOLD} ###")

# Use a different (but reproducible) seed for every fold.
set_seed(config.SEED+config.FOLD)

train_data_loader, valid_data_loader = get_dataloder()
model = BirdcallNet()
model.to(device)

optimizer = Adam(model.parameters(), lr=1e-3)
# The scheduler is stepped once per batch inside train_fn, so T_max is given in
# optimizer steps: config.T_MAX epochs' worth of batches.
scheduler = CosineAnnealingLR(optimizer, T_max=len(train_data_loader)*config.T_MAX, eta_min=0.0)

if config.INITIAL_EPOCH == 0:
    # Fresh run: start with empty histories and sentinel "best" values.
    best_loss, best_score_a, best_score_b, best_mAP = 9999, 0, 0, 0
    trn_losses, trn_lrs, val_losses, val_scores_a, val_scores_b, mAP_scores = [], [], [], [], [], []
else:
    # Resume: restore the "latest" model/optimizer/scheduler state and rebuild
    # the histories from the CSV log written at the end of every epoch.
    model.load_state_dict(torch.load(f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_latest_model.bin"))
    optimizer.load_state_dict(torch.load(f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_latest_optimizer.bin"))
    scheduler.load_state_dict(torch.load(f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_latest_scheduler.bin"))
    log_df = pd.read_csv(f"{config.OUTPUT}/valid_f1_fold{config.FOLD}_score.csv")
    # The first column is the index that to_csv(index=True) wrote; discard it.
    _, trn_losses, val_losses, val_scores_a, val_scores_b, mAP_scores = log_df.values.T.tolist()
    best_loss, best_score_a, best_score_b, best_mAP = min(val_losses), max(val_scores_a), max(val_scores_b), max(mAP_scores)
    # Per-step learning rates are not stored in the log, so this history restarts.
    trn_lrs = []

for epoch in range(config.INITIAL_EPOCH, config.EPOCHS):
    print(f"{epoch} epoch")
    tloss, lrs = train_fn(train_data_loader, model, optimizer, scheduler)
    val_f1_a, val_f1_b, vloss, mAP = valid_fn(valid_data_loader, model)

    # Checkpoint on the best label-aware F1 (score A); ties overwrite the file.
    if best_score_a <= val_f1_a:
        best_score_a = val_f1_a
        torch.save(model.state_dict(), f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_best_score_a.bin")
        print(f"Best Score A Update!!! -> {best_score_a}")

    # Checkpoint on the best arg-max F1 (score B); ties overwrite the file.
    if best_score_b <= val_f1_b:
        best_score_b = val_f1_b
        torch.save(model.state_dict(), f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_best_score_b.bin")
        print(f"Best Score B Update!!! -> {best_score_b}")

    # Checkpoint on the lowest validation loss.
    if best_loss >= vloss:
        best_loss = vloss
        torch.save(model.state_dict(), f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_best_loss.bin")
        print(f"Best Loss Update!!! -> {best_loss}")

    # Checkpoint on the best mean average precision.
    if best_mAP <= mAP:
        best_mAP = mAP
        torch.save(model.state_dict(), f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_best_mAP.bin")
        print(f"Best mAP Update!!! -> {best_mAP}")

    # Always refresh the "latest" state; the resume branch above reads these files.
    torch.save(model.state_dict(), f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_latest_model.bin")
    torch.save(optimizer.state_dict(), f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_latest_optimizer.bin")
    torch.save(scheduler.state_dict(), f"{config.OUTPUT}/birdcallnet_f{config.FOLD}_latest_scheduler.bin")

    # Extend the per-epoch histories (learning rates are per step).
    trn_losses.append(tloss)
    val_losses.append(vloss)
    val_scores_a.append(val_f1_a)
    val_scores_b.append(val_f1_b)
    mAP_scores.append(mAP)
    trn_lrs.extend(lrs)

    # Rewrite the full log every epoch so an interrupted run can be resumed.
    log_df = pd.DataFrame(zip(trn_losses, val_losses, val_scores_a, val_scores_b, mAP_scores), columns=["train loss", "valid loss", "score_a", "score_b", "mAP"])
    log_df.to_csv(f"{config.OUTPUT}/valid_f1_fold{config.FOLD}_score.csv", index=True)

In [None]:
# Summarise the run, then plot the learning-rate schedule and both validation
# scores in separate figures, followed by train and validation loss together.
print(f"Best Score　A: {best_score_a} / Best Score　B: {best_score_b} / Best Loss: {best_loss} / Best mAP: {best_mAP}")
for curve in (trn_lrs, val_scores_a, val_scores_b):
    plt.plot(curve)
    plt.show()
for curve in (trn_losses, val_losses):
    plt.plot(curve)
plt.show()

In [None]:
# Take the first batch from the preview loader for the inspection cells that follow.
X, y = next(iter(data_loader))

In [None]:
# Only the values are needed here, so run the forward pass without recording an
# autograd graph; this avoids holding the graph's activations in GPU memory.
with torch.no_grad():
    outout = model(X.to(device))
logits = outout[0].sigmoid().detach()  # clip-level probabilities
seq_preds = outout[1].detach()         # attention over time per class

In [None]:
# Inspect one sample of the batch: its label vs. the arg-max prediction, the
# input image, and the attention curve over time for the labelled class.
idx = 4
img = X[idx]
target_idx = y[idx] - 1  # shifted to line up with the logits (no column for class 0)
print("target:", target_idx, "/ pred:", logits[idx].cpu().argmax())
plt.imshow(np.transpose(img.numpy(), (1, 2, 0)))
plt.show()
plt.plot(seq_preds.cpu()[idx, target_idx, :])