In [None]:
# Show which GPU (if any) is attached to this runtime before doing anything expensive.
!nvidia-smi

In [None]:
# Mount Google Drive; later cells read credentials and input CSVs from it and
# write checkpoints / predictions back to it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%%time
# 大体10分くらい

!pip install -q kaggle
!mkdir -p .kaggle
!cp "./drive/My Drive/Study/config/kaggle.json" .kaggle/
!chmod 600 .kaggle/kaggle.json
!mv .kaggle /root

!kaggle datasets download "theoviel/rcfx-spectrograms-32-khz"
!unzip rcfx-spectrograms-32-khz.zip > /dev/null
!rm -rf rcfx-spectrograms-32-khz.zip 

!pip install -U iterative-stratification albumentations wandb  > /dev/null
!wandb login e0792bb688a0d18e359df7438c45da90f8794091

In [None]:
import os
import tqdm
import random

from matplotlib import pyplot as plt
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from datetime import datetime

import pandas as pd
import numpy as np
from numpy.random import beta

import torch
from torchvision.models import resnet18
import torch.nn as nn
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau
from torchvision import transforms
from torch.nn import functional as F

import albumentations as A

import wandb

# Device that tensors and the model are moved to in the cells below.
# The notebook assumes a CUDA GPU runtime (no CPU fallback).
device = torch.device("cuda")

In [None]:
# Directory on Google Drive holding the competition CSV files.
DATA_ROOT = "./drive/MyDrive/Study/RFCX/input"

# train_tp / train_fp: annotated events (presumably true / false positives --
# confirm against the competition data description); sample_submission lists
# the recordings to predict.
train_tp = pd.read_csv(f"{DATA_ROOT}/train_tp.csv")
train_fp = pd.read_csv(f"{DATA_ROOT}/train_fp.csv")
sample_submission = pd.read_csv(f"{DATA_ROOT}/sample_submission.csv")

# Highest and lowest annotated frequency rescaled by 128 / 16000
# (presumably 128 spectrogram bins spanning 0-16 kHz -- TODO confirm).
mel_bin_of_max_freq = (train_tp["f_max"].max()/16000)*128
mel_bin_of_min_freq = (train_tp["f_min"].min()/16000)*128
mel_bin_of_max_freq, mel_bin_of_min_freq

In [None]:
# Per-recording targets built from train_tp:
#   label_dict[recording_id] -> (24, 3751) array, 1 where a species is annotated
#   pos_dict[recording_id]   -> list with the centre frame of every annotated event
# Annotation times are mapped onto 3751 frames per 60 seconds.
label_dict = {}
pos_dict = {}
for recording_id, events in train_tp.groupby("recording_id"):
    frame_targets = np.zeros((24, 3751))
    centres = []
    # Columns 1, 3 and 5 are unpacked as species id, start time and end time.
    for row in events.values[:, [1, 3, 5]]:
        species_id, t_min, t_max = row
        start = int(3751*(t_min/60))
        stop = int(3751*(t_max/60))
        frame_targets[species_id, start:stop] = 1
        centres.append((stop + start)//2)
    label_dict[recording_id], pos_dict[recording_id] = frame_targets, centres

# Class weights: the most frequent species gets weight 1, rarer ones proportionally more.
counts_df = train_tp["species_id"].value_counts()
max_counts = counts_df.max()
label_weight = max_counts / counts_df
pos_weights = torch.Tensor(label_weight.sort_index().values).to(device)

# Recording ids plus, per recording and species, the number of annotated frames
# (used as the stratification target by the fold split).
fnames = np.array(list(label_dict))
labels = np.stack(list(label_dict.values())).sum(axis=2)

test_fnames = sample_submission["recording_id"].values

In [None]:
#train_tp["species_id"].value_counts()

In [None]:
# Same tables for the recordings in train_fp. Their targets stay all-zero;
# only the event centres are kept so training windows can be cut around them.
fp_label_dict = {}
fp_pos_dict = {}
for recording_id, events in train_fp.groupby("recording_id"):
    centres = []
    for _species_id, t_min, t_max in events.values[:, [1, 3, 5]]:
        start, stop = int(3751*(t_min/60)), int(3751*(t_max/60))
        centres.append((stop + start)//2)
    fp_label_dict[recording_id] = np.zeros((24, 3751))
    fp_pos_dict[recording_id] = centres

In [None]:
# https://www.kaggle.com/c/rfcx-species-audio-detection/discussion/198418

def LRAP(preds, labels):
    """Label ranking average precision, averaged per instance.

    Args:
        preds: float tensor [B x C] of scores (higher = more confident).
        labels: tensor [B x C] of 0/1 ground-truth indicators.

    Returns:
        Python float: the mean over instances of each instance's average
        precision over its positive labels. Instances without any positive
        label are left out of the mean (they used to turn the whole score
        into NaN); if no instance has a positive label the result is NaN.
    """
    num_classes = labels.size(-1)
    # Classes ordered from highest to lowest score, per row.
    ranked_classes = torch.argsort(preds, dim=-1, descending=True)
    # Inverting that permutation gives the 1-based rank of every class.
    class_ranks = torch.argsort(ranked_classes, dim=-1) + 1
    # Keep the ranks of the positive labels, push everything else to the back.
    ground_truth_ranks = class_ranks * labels + (1e6) * (1 - labels)
    sorted_ground_truth_ranks, _ = torch.sort(ground_truth_ranks, dim=-1, descending=False)
    # k-th positive (1-based) divided by its rank = precision at that positive.
    pos_matrix = torch.arange(
        1, num_classes + 1, device=sorted_ground_truth_ranks.device
    ).unsqueeze(0)
    score_matrix = pos_matrix / sorted_ground_truth_ranks
    score_mask_matrix, _ = torch.sort(labels, dim=-1, descending=True)
    scores = score_matrix * score_mask_matrix
    num_labels = labels.sum(-1)
    has_labels = num_labels > 0
    score = (scores.sum(-1)[has_labels] / num_labels[has_labels]).mean()
    return score.item()

def LWLRAP(preds, labels):
    """Label-weighted label ranking average precision (label-level average).

    Args:
        preds: float tensor [B x C] of scores (higher = more confident).
        labels: tensor [B x C] of 0/1 ground-truth indicators.

    Returns:
        Python float: the precision at every positive label, summed over the
        whole batch and divided by the total number of positive labels.
        NaN when the batch contains no positive label at all.
    """
    num_classes = labels.size(-1)
    # Classes ordered from highest to lowest score, per row.
    ranked_classes = torch.argsort(preds, dim=-1, descending=True)
    # Inverting that permutation gives the 1-based rank of every class.
    class_ranks = torch.argsort(ranked_classes, dim=-1) + 1
    # Keep the ranks of the positive labels, push everything else to the back.
    ground_truth_ranks = class_ranks * labels + (1e6) * (1 - labels)
    sorted_ground_truth_ranks, _ = torch.sort(ground_truth_ranks, dim=-1, descending=False)
    # k-th positive (1-based) divided by its rank = precision at that positive.
    pos_matrix = torch.arange(
        1, num_classes + 1, device=sorted_ground_truth_ranks.device
    ).unsqueeze(0)
    score_matrix = pos_matrix / sorted_ground_truth_ranks
    score_mask_matrix, _ = torch.sort(labels, dim=-1, descending=True)
    scores = score_matrix * score_mask_matrix
    score = scores.sum() / labels.sum()
    return score.item()

def set_seed(seed: int = 42):
    """Seed Python, NumPy and PyTorch and switch cuDNN to deterministic mode.

    Args:
        seed: value used for every random number generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.cuda.manual_seed_all(seed)  # type: ignore  # also covers multi-GPU runtimes
    torch.backends.cudnn.deterministic = True  # type: ignore
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run to
    # run, which defeats the deterministic flag above, so it must be off.
    torch.backends.cudnn.benchmark = False  # type: ignore

def parse_labels(recording_id):
    """Return the entry of `label_dict` for `recording_id`, or 24 zeros if it has none."""
    if recording_id in label_dict:
        return label_dict[recording_id]
    return np.zeros(24)

def mixup(input, target, gamma):
    """Blend every sample of a batch with a randomly chosen partner sample.

    Args:
        input: batch tensor, first dimension is the batch.
        target: matching targets in one-hot / multi-hot format.
        gamma: weight of the original sample; the partner gets 1 - gamma.

    Returns:
        (mixed_input, mixed_target) as new tensors. The arguments are left
        untouched (the previous version overwrote them in place).
    """
    perm = torch.randperm(input.size(0))
    # `alpha=` is the supported spelling; the positional add_(alpha, tensor)
    # overload used before is deprecated.
    mixed_input = input.mul(gamma).add(input[perm], alpha=1 - gamma)
    mixed_target = target.mul(gamma).add(target[perm], alpha=1 - gamma)
    return mixed_input, mixed_target

def last_layer_mixup(target, gamma):
    """Mix the targets of a batch with a random permutation of themselves.

    Args:
        target: batch of targets, first dimension is the batch.
        gamma: weight of the original target; the permuted one gets 1 - gamma.

    Returns:
        (perm, mixed_target): the permutation that was drawn, so the caller can
        apply the same pairing elsewhere, and the blended targets as a new
        tensor. `target` itself is not modified (the previous version
        overwrote it in place).
    """
    perm = torch.randperm(target.size(0))
    # `alpha=` replaces the deprecated positional add_(alpha, tensor) overload.
    mixed_target = target.mul(gamma).add(target[perm], alpha=1 - gamma)
    return perm, mixed_target

In [None]:
class TimeMask:
    """Blank out random spans along the last (time) axis of a 3-D spectrogram tensor.

    Args:
        T: exclusive upper bound of the span width drawn for each mask.
        num_masks: number of spans to draw.
        replace_with_zero: fill with 0 when True, otherwise with the current
            mean of the tensor.
    """

    def __init__(self, T=40, num_masks=1, replace_with_zero=True):
        self.T = T
        self.num_masks = num_masks
        self.replace_with_zero = replace_with_zero

    def __call__(self, spec):
        """Return a masked copy of `spec`; the input tensor is not modified."""
        cloned = spec.clone()
        len_spectro = cloned.shape[2]
        if len_spectro == 0:
            # Nothing to mask on an empty axis.
            return cloned

        for _ in range(self.num_masks):
            # Clamp the width so inputs narrower than T no longer raise
            # ValueError in the randrange call below.
            t = min(random.randrange(0, self.T), len_spectro - 1)
            t_zero = random.randrange(0, len_spectro - t)

            # A zero-width draw only skips this mask; it used to return early
            # and silently drop all remaining masks.
            if t == 0:
                continue

            mask_end = random.randrange(t_zero, t_zero + t)
            fill = 0 if self.replace_with_zero else cloned.mean()
            cloned[:, :, t_zero:mask_end] = fill
        return cloned

class FreqMask:
    """Blank out random bands along axis 1 (frequency) of a spectrogram tensor.

    Args:
        F: exclusive upper bound of the band height drawn for each mask.
        num_masks: number of bands to draw.
        replace_with_zero: fill with 0 when True, otherwise with the current
            mean of the tensor.
    """

    def __init__(self, F=30, num_masks=1, replace_with_zero=True):
        self.F = F
        self.num_masks = num_masks
        self.replace_with_zero = replace_with_zero

    def __call__(self, spec):
        """Return a masked copy of `spec`; the input tensor is not modified."""
        cloned = spec.clone()
        num_mel_channels = cloned.shape[1]
        if num_mel_channels == 0:
            # Nothing to mask on an empty axis.
            return cloned

        for _ in range(self.num_masks):
            # Clamp the height so inputs with fewer than F channels no longer
            # raise ValueError in the randrange call below.
            f = min(random.randrange(0, self.F), num_mel_channels - 1)
            f_zero = random.randrange(0, num_mel_channels - f)

            # A zero-height draw only skips this mask; it used to return early
            # and silently drop all remaining masks.
            if f == 0:
                continue

            mask_end = random.randrange(f_zero, f_zero + f)
            fill = 0 if self.replace_with_zero else cloned.mean()
            cloned[:, f_zero:mask_end] = fill

        return cloned

def mono_to_color(
    X: np.ndarray, mean=None, std=None,
    norm_max=None, norm_min=None, eps=1e-6
):
    """Turn a single-channel array into a 3-channel uint8 image.

    The input is repeated on a new last axis, standardised, clipped to
    [norm_min, norm_max] and rescaled to 0..255.

    Args:
        X: input array; the result has shape X.shape + (3,).
        mean, std: statistics used for standardisation; computed from the data
            when None.
        norm_max, norm_min: clipping range of the standardised values; the
            data maximum / minimum when None.
        eps: added to `std` before dividing, and the minimum value range below
            which the image is treated as constant.

    Returns:
        uint8 array; all zeros when the standardised data is (almost) constant.

    Note:
        Explicit zeros are honoured. The previous `value or default` form
        silently replaced mean=0, std=0, norm_min=0 or norm_max=0 by the
        data-derived default.
    """
    # Stack X as [X,X,X]
    X = np.stack([X, X, X], axis=-1)

    # Standardize
    mean = X.mean() if mean is None else mean
    X = X - mean
    std = X.std() if std is None else std
    Xstd = X / (std + eps)
    _min, _max = Xstd.min(), Xstd.max()
    norm_max = _max if norm_max is None else norm_max
    norm_min = _min if norm_min is None else norm_min
    if (_max - _min) > eps:
        # Normalize to [0, 255]
        V = np.clip(Xstd, norm_min, norm_max)
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Just zero
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V

# Candidate distances (in frames) between the window start and the event centre.
H_POS = [32, 64, 128, 256]
# Width of a training / evaluation window in frames.
WINDOW = 512
#H_POS = [32, 64, 128]
#WINDOW = 256
# Largest frame index used as a window end.
MAX_SEQ = 3751 - 1
def extract_ht_pos(pos):
    """Pick a WINDOW-wide frame range (head, tail) that contains frame `pos`.

    The window starts a randomly chosen H_POS offset before `pos`; only
    offsets that keep the start above frame 0 are eligible, and when none is
    the window starts at frame 0. A window running past MAX_SEQ is replaced
    by the last WINDOW frames ending at MAX_SEQ.
    """
    candidates = [offset for offset in H_POS if pos > offset]
    lead = random.choice(candidates) if candidates else pos
    start = pos - lead
    stop = start + WINDOW
    if stop > MAX_SEQ:
        return MAX_SEQ-WINDOW, MAX_SEQ
    return start, stop

class SpectrogramFromNpz(torch.utils.data.Dataset):
    """Dataset of precomputed spectrograms stored as ``.npy`` files.

    Args:
        fname: indexable collection of recording ids.
        mode: "train" and "valid" read from ./train, "test" reads from ./test.
            Only "train" applies augmentation and crops image and label to a
            window around an annotated event.

    Each item is ``(image, label)``: a normalised 3-channel tensor and the
    frame-level label array of the recording (None when it has no entry in
    either label table).
    """

    def __init__(self, fname, mode):
        self.fname = fname
        self.mode = mode
        self.to_tensor = transforms.ToTensor()
        # Channel statistics commonly used with ImageNet-pretrained backbones.
        self.norm = transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        # Augmentation on the uint8 image (before tensor conversion).
        self.augument_funcs_a = A.Compose([
            A.RandomBrightnessContrast(p=0.5),
            #A.RandomCrop(height=100, width=3751, p=0.5),
        ])
        # Augmentation on the tensor: applied as a whole with probability 0.5.
        gamma_shift = transforms.Lambda(
            lambda img: transforms.functional.adjust_gamma(img, gamma=2, gain=1)
        )
        self.augument_funcs_b = transforms.RandomApply(
            [gamma_shift, TimeMask(), FreqMask()], p=0.5
        )

    def __len__(self):
        return len(self.fname)

    @staticmethod
    def _crop_bounds(fname):
        """Window (head, tail) around a random event of `fname`, or (None, None)."""
        if fname in pos_dict:
            centres = pos_dict[fname]
        elif fname in fp_pos_dict:
            centres = fp_pos_dict[fname]
        else:
            return None, None
        return extract_ht_pos(random.choice(centres))

    @staticmethod
    def _lookup_label(fname):
        """Label array from `label_dict`, else `fp_label_dict`, else None."""
        if fname in label_dict:
            return label_dict[fname]
        if fname in fp_label_dict:
            return fp_label_dict[fname]
        return None

    def __getitem__(self, idx):
        fname = self.fname[idx]
        is_train = self.mode == "train"

        # The window is drawn first, before any augmentation consumes randomness.
        h, t = self._crop_bounds(fname)

        # load images
        if self.mode in ["train", "valid"]:
            path = f"./train/{fname}.npy"
        elif self.mode == "test":
            path = f"./test/{fname}.npy"
        image = mono_to_color(np.load(path))
        #image = image[-110:, :, :]  # low pass filter

        # Augment on the full recording, then cut the training window.
        if is_train:
            image = self.augument_funcs_a(image=image)["image"]
        image = self.to_tensor(image)
        if is_train:
            image = self.augument_funcs_b(image)[:, :, h:t]
        image = self.norm(image)

        label = self._lookup_label(fname)
        if is_train:
            label = label[:, h:t]

        return image, label

In [None]:
# Small constant; not referenced anywhere inside train_loop.
EPS = 1e-7
def train_loop(train_data_loader, model, optimizer, scheduler):
    """Run one training epoch and return ``(mean loss, mean learning rate)``.

    Reads the module-level ``device``, ``pos_weights`` and ``config``
    (``config.alpha`` parametrises the Beta distribution of the mixing weight).

    Args:
        train_data_loader: yields ``(X, y)`` batches; ``y`` holds frame-level
            labels and is reduced over dimension 2 to clip-level targets.
        model: called as ``model(X, perm, b)`` and expected to return
            ``(pseudo_label, clipwise_preds, attention_preds, clipwise_preds_max)``.
        optimizer: stepped once per batch.
        scheduler: stepped once per batch when not None.
    """
    model.train()
    losses, lrs = [], []
    for X, y in train_data_loader:
        X = X.to(device)
        # Clip-level target: 1 for every class that is active in at least one frame.
        y = (y.sum(2) > 0).int().float().to(device)

        #LABEL_SMOOTHING = 0.2
        #y = y * (1 - LABEL_SMOOTHING) + (LABEL_SMOOTHING/24)  # label smoothing

        # Mixing weight for this batch.
        b = beta(config.alpha, config.alpha)

        # Targets are mixed here; the same permutation and weight are handed to
        # the model, which mixes its intermediate features accordingly.
        perm, _y = last_layer_mixup(y, b)
        pseudo_label, clipwise_preds, attention_preds, clipwise_preds_max = model(X, perm, b)

        # loss1: attention-pooled clip prediction vs. mixed targets.
        loss1 = nn.BCEWithLogitsLoss(pos_weight=pos_weights)(clipwise_preds, _y)
        # loss2: attention output vs. the model's own thresholded frame predictions,
        # averaged over dimension 2 and weighted per class.
        # NOTE(review): attention_preds is a softmax output in RFCXNet.forward, yet it
        # is fed to a loss that expects logits -- confirm this is intended.
        loss_none = nn.BCEWithLogitsLoss(reduction="none")(attention_preds, pseudo_label)
        loss2 = (loss_none.mean(2) * pos_weights).mean()
        # loss3: max-pooled clip prediction vs. mixed targets, at half weight.
        loss3 = nn.BCEWithLogitsLoss(pos_weight=pos_weights)(clipwise_preds_max, _y)
        loss_a = loss1 + loss2 + loss3*0.5


        # Disabled experiment (input-level mixup), kept as a bare string literal
        # that has no effect at run time.
        """
        _X, y = mixup(X, y, b)
        _pseudo_label, _clipwise_preds, _attention_preds, _clipwise_preds_max = model(_X)

        _loss1 = nn.BCEWithLogitsLoss(pos_weight=pos_weights)(_clipwise_preds, y)
        _loss_none = nn.BCEWithLogitsLoss(reduction="none")(_attention_preds, _pseudo_label)
        _loss2 = (_loss_none.mean(2) * pos_weights).mean()
        _loss3 = nn.BCEWithLogitsLoss(pos_weight=pos_weights)(_clipwise_preds_max, y)
        loss_b = _loss1 + _loss2 + _loss3*0.5


        loss = (loss_a + loss_b)/2
        """
        loss = loss_a

        #_loss1 = nn.BCEWithLogitsLoss(reduction="none")(clipwise_preds, y)
        #_loss1 = (_loss1 * (y != 1).int()).mean()
        #loss1 += _loss1

        #loss2 = nn.BCEWithLogitsLoss()(attention_preds, pseudo_label)
        
        #_loss3 = nn.BCEWithLogitsLoss(reduction="none")(clipwise_preds_max, y)
        #_loss3 = (_loss3 * (y != 1).int()).mean()
        #loss3 += _loss3

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # The scheduler advances once per batch, not once per epoch.
        if scheduler is not None:
            scheduler.step()

        losses.append(loss.item())
        # Learning rate averaged over all parameter groups after this step.
        lrs.append(np.array([param_group["lr"] for param_group in optimizer.param_groups]).mean())
    t_loss = np.array(losses).mean()
    lr =  np.array(lrs).mean()
    return t_loss, lr

# Evaluation windows: 8 frame ranges of width WINDOW, each starting 51 frames
# before the end of the previous one, beginning at frame 0.
# (Alternative tried by the author: 17 windows with an overlap of 50 frames.)
eval_img_pos = [[0, WINDOW]]
for idx in range(1, 8):
    h = eval_img_pos[-1][1] - 51
    t = h + WINDOW
    eval_img_pos.append([h, t])
    
def valid_loop(valid_data_loader, model):
    """Evaluate `model` on the validation loader.

    Every recording is scored on each window in ``eval_img_pos``. Per window
    the two clip-level heads are averaged after a sigmoid, and the per-class
    maximum over the windows is the final prediction.

    Args:
        valid_data_loader: yields ``(X, y)`` with full-length spectrograms and
            frame-level labels (reduced over dimension 2 to clip-level targets).
        model: returns ``(_, clipwise_logits, _, clipwise_max_logits)``.

    Returns:
        ``(valid_score, valid_loss)``: LRAP and binary cross-entropy, both
        computed once over the whole validation set so a smaller last batch
        does not get the same weight as a full one. ``(nan, nan)`` when the
        loader yields no batch.
    """
    model.eval()
    all_probs, all_targets = [], []
    for X, y in valid_data_loader:
        window_probs = []
        for h, t in eval_img_pos:
            with torch.no_grad():
                _, pred, _, pred_max = model(X[:,:,:,h:t].to(device))
            window_probs.append((pred.sigmoid() + pred_max.sigmoid())/2)
        max_pred, _ = torch.max(torch.stack(window_probs), dim=0)

        all_probs.append(max_pred.cpu())
        all_targets.append(y.sum(2) > 0)

    if not all_probs:
        return float("nan"), float("nan")

    probs = torch.cat(all_probs)
    targets = torch.cat(all_targets)

    valid_score = LRAP(probs, targets.int())
    # `probs` are already probabilities, so plain BCE is the matching loss.
    # BCEWithLogitsLoss would apply a second sigmoid and distort the value.
    valid_loss = nn.BCELoss()(probs, targets.float()).item()
    return valid_score, valid_loss

In [None]:
class RFCXNet(nn.Module):
    """ResNet-18 feature extractor followed by two 1x1 Conv1d heads over time.

    ``l8_a`` produces per-frame class logits, ``l8_b`` produces per-class
    attention weights (softmax over time) used to pool those logits.
    """

    def __init__(self):
        super().__init__()
        self.n_label = 24
        backbone = resnet18(pretrained=True)
        # Drop the last two children of the ResNet to keep spatial feature maps.
        self.resnet_head = nn.Sequential(*list(backbone.children())[:-2])
        self.l8_a = nn.Conv1d(512, self.n_label, 1, bias=False)
        self.l8_b = nn.Conv1d(512, self.n_label, 1, bias=False)

    def forward(self, x, perm=None, gamma=None):
        """Compute frame-level and clip-level predictions.

        Args:
            x: input of shape (batch, channel, Hz, time).
            perm: optional batch permutation; when given, the backbone
                features are mixed with their permuted copy.
            gamma: weight of the un-permuted features (only used with `perm`).

        Returns:
            Tuple ``(pseudo_label, clipwise_preds, attention_preds,
            clipwise_preds_max)``: frame logits thresholded at sigmoid >= 0.7,
            attention-pooled clip logits, the attention weights, and the
            max-over-time clip logits.
        """
        # (batch, channel, Hz, time) -> (batch, channel, time, Hz) for the backbone.
        feats = self.resnet_head(x.transpose(3, 2))  # (batch, unit, time, Hz)
        if perm is not None:
            feats = gamma * feats + (1 - gamma) * feats[perm]

        # Average out the frequency axis -> (batch, unit, time).
        feats = F.relu(feats).mean(dim=3)

        frame_logits = self.l8_a(feats)  # (batch, n_class, time)
        attention = torch.softmax(self.l8_b(feats), dim=2)  # (batch, n_class, time)

        pseudo_label = (frame_logits.sigmoid() >= 0.7).float()
        clipwise_preds = (frame_logits * attention).sum(dim=2)
        clipwise_preds_max = frame_logits.max(dim=2).values

        return pseudo_label, clipwise_preds, attention, clipwise_preds_max

In [None]:
# Experiment-level settings shared by the training and ensembling cells.
SEED = 416  # seed handed to set_seed() at the start of every fold
N_FOLD = 5  # number of cross-validation folds
WORKS = 0  # num_workers for the DataLoaders
EXP_NAME = "exp0040_with_last_mixup"  # used in W&B run names and output file names
OUTPUT = f"./drive/MyDrive/Study/RFCX/output/{EXP_NAME}"  # checkpoints and per-fold predictions go here

# Create the output directory on Drive (no error if it already exists).
!mkdir -p {OUTPUT}

In [None]:
def build_train_loader(tp_fnames, batch_size, n_fp=50):
    """Training loader over the given recordings plus `n_fp` randomly drawn train_fp recordings."""
    fp_fnames = random.sample(list(fp_label_dict.keys()), n_fp)
    dataset = SpectrogramFromNpz(tp_fnames.tolist()+fp_fnames, "train")
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=WORKS)


# One full train / validate / predict cycle per fold. The fold count and the
# seed come from the configuration cell instead of being repeated as literals.
mskf = MultilabelStratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=0)
for fold, (train_index, valid_index) in enumerate(mskf.split(fnames, labels)):
    #if fold in [0, 1, 2]:
    #    continue
    print(datetime.now(), f"\t: ### FOLD-{fold} ###")
    set_seed(SEED)
    wandb.init(project="rfcx", name=f"{EXP_NAME}_f{fold}")

    # hyper parameter (`config` stays a module-level name: train_loop reads config.alpha)
    config = wandb.config
    config.seed = SEED
    config.learning_rate = 1e-3
    config.batch_size = 64
    config.num_epochs = 50
    config.alpha = 0.1
    config.t_max = 10
    config.factor = 0.5

    config.exp_name = EXP_NAME
    config.fold = fold

    train_fname = fnames[train_index]
    valid_fname = fnames[valid_index]

    # A throw-away loader is built only to learn the number of batches per
    # epoch, which the per-batch scheduler needs for its period.
    steps_per_epoch = len(build_train_loader(train_fname, config.batch_size))
    valid_datasets = SpectrogramFromNpz(valid_fname, "valid")
    valid_data_loader = torch.utils.data.DataLoader(valid_datasets, batch_size=config.batch_size, shuffle=False, num_workers=WORKS)

    model = RFCXNet()
    model.to(device)
    optimizer = Adam(model.parameters(), lr=config.learning_rate)

    # Stepped once per batch in train_loop, so T_max is expressed in batches.
    scheduler = CosineAnnealingLR(optimizer, T_max=steps_per_epoch*config.t_max, eta_min=0.0)

    wandb.watch(model)

    print(datetime.now(), "\t: start train")
    best_score, best_loss = 0, 9999
    for epoch in range(config.num_epochs):
        # A fresh sample of train_fp recordings is mixed in every epoch.
        train_data_loader = build_train_loader(train_fname, config.batch_size)

        t_loss, lr = train_loop(train_data_loader, model, optimizer, scheduler)
        v_score, v_loss = valid_loop(valid_data_loader, model)

        if best_score < v_score:
            print(f"epoch {epoch}: best score update !!!")
            torch.save(model.state_dict(), f"{OUTPUT}/rfcxnet_f{config.fold}_best_score_model.bin")
            best_score = v_score
        if best_loss > v_loss:
            print(f"epoch {epoch}: best loss update !!!")
            torch.save(model.state_dict(), f"{OUTPUT}/rfcxnet_f{config.fold}_best_loss_model.bin")
            best_loss = v_loss

        wandb.log({"train loss": t_loss, "lr": lr, "valid loss": v_loss, "valid score": v_score, "best score": best_score, "best loss": best_loss})
    print(datetime.now(), "\t: finish train")

    # predict test data with the checkpoint that reached the best validation score
    model.load_state_dict(torch.load(f"{OUTPUT}/rfcxnet_f{config.fold}_best_score_model.bin"))
    # Set inference mode explicitly instead of relying on the state left behind
    # by the last valid_loop call.
    model.eval()

    test_datasets = SpectrogramFromNpz(test_fnames, "test")

    lst = []
    # The progress-bar total follows the dataset size instead of a hard-coded count.
    for idx, (X, _) in tqdm.tqdm_notebook(enumerate(test_datasets), total=len(test_datasets)):
        preds = []
        for h, t in eval_img_pos:
            with torch.no_grad():
                #_, pred, _,  = model(X[:,:,h:t].unsqueeze(0).to(device))
                _, pred, _, pred_max = model(X[:,:,h:t].unsqueeze(0).to(device))
            pred = pred.sigmoid()
            pred_max = pred_max.sigmoid()
            pred = (pred + pred_max)/2
            preds.append(pred)
        # Per-class maximum over all evaluation windows.
        max_pred, _  = torch.max(torch.stack(preds), dim=0)
        pred = max_pred.cpu().numpy()[0].tolist()

        row = [test_fnames[idx]] + pred
        lst.append(row)

    fold_sub = pd.DataFrame(lst, columns=["recording_id"]+[f"s{i}" for i in range(24)])
    fold_sub.to_csv(f"{OUTPUT}/rfcxnet_f{config.fold}_predict.csv", index=False)

    # Close this fold's W&B run so the next fold starts a clean one.
    wandb.finish()

In [None]:
#import pandas as pd
#import torch
#!ls ./drive/MyDrive/Study/RFCX/output/

In [None]:
#EXP_NAME = "exp0036_add_fp_data_max_posw_frame0.3"
#OUTPUT = f"./drive/MyDrive/Study/RFCX/output/{EXP_NAME}"

In [None]:
# Average the per-fold test predictions into one submission file.
# Every fold file is expected to list the recordings in the same order; the
# ids and column names are taken from the last file read.
fold_preds = []
for fold in range(5):
    df = pd.read_csv(f"{OUTPUT}/rfcxnet_f{fold}_predict.csv")
    # All columns except the first (recording_id) hold the class scores.
    fold_preds.append(torch.Tensor(df.values[:, 1:].astype(float)))

# (rows, classes, folds) -> mean over the fold axis.
all_preds = torch.stack(fold_preds, dim=2).mean(2)
#all_preds, _ = torch.stack(fold_preds, dim=2).max(2)
sub = pd.concat(
    [df[["recording_id"]], pd.DataFrame(all_preds.tolist(), columns=df.columns[1:])],
    axis=1,
)
sub.to_csv(f"./submission_{EXP_NAME}.csv", index=None)
#sub.to_csv(f"./submission_{EXP_NAME}_max.csv", index=None)

In [None]:
#exp0021 = pd.read_csv("submission_exp0021_cut_and_sum.csv")
#exp0025 = pd.read_csv("submission_exp0025_window256.csv")
#df = pd.DataFrame((exp0021.values[:, 1:] + exp0025.values[:, 1:])/2, columns=exp0025.columns[1:])
#pd.concat([exp0021[["recording_id"]], df], axis=1).to_csv("exp0025_mix256_512.csv", index=None)