In [None]:
# Print the NVIDIA driver / GPU status of the current runtime.
!nvidia-smi

In [None]:
# Mount Google Drive at /content/drive. Later cells read the Kaggle token, the CSV
# files and the model checkpoints through relative "./drive/..." paths
# (assumes the working directory is /content -- TODO confirm).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%%time
# 大体10分くらい

!pip install -q kaggle
!mkdir -p .kaggle
!cp "./drive/My Drive/Study/config/kaggle.json" .kaggle/
!chmod 600 .kaggle/kaggle.json
!mv .kaggle /root

!kaggle datasets download "theoviel/rcfx-spectrograms-32-khz"
!unzip rcfx-spectrograms-32-khz.zip > /dev/null
!rm -rf rcfx-spectrograms-32-khz.zip 

!pip install -U iterative-stratification albumentations wandb  > /dev/null
!wandb login e0792bb688a0d18e359df7438c45da90f8794091

!pip install timm
!pip install imbalanced-learn

In [None]:
import gc
import os
import tqdm
import random
import pickle

from matplotlib import pyplot as plt
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from datetime import datetime

import pandas as pd
import numpy as np
from numpy.random import beta

import torch
from torchvision.models import resnet18, densenet121
import torch.nn as nn
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau, CosineAnnealingWarmRestarts
from torchvision import transforms
from torch.nn import functional as F

import albumentations as A

from sklearn.metrics import classification_report, roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
import wandb
import timm

# Use the GPU when one is visible, otherwise fall back to the CPU instead of
# failing on the first `.to(device)` call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Experiment tag; it is the last path component of BEST_MODEL.
EXP = "exp0127_resnet18_second_st_only_label_loss"
# Directory the per-fold checkpoints (rfcxnet_f{fold}_best_score_model.bin) are loaded from.
BEST_MODEL = f"./drive/MyDrive/Study/RFCX/output/{EXP}"
# Directory holding train_tp.csv, train_fp.csv and sample_submission.csv.
DATA_ROOT = "./drive/MyDrive/Study/RFCX/input"

# NOTE(review): the three values below are not referenced in any visible cell.
PSEUDO_THR_P = 0.5
PSEUDO_THR_N = 0.01
SEED = 416

MODEL_NAME = "resnet18"  # passed to RFCXNet; must be a key of MODEL_HEADER_INFO
N_LABEL = 24  # number of output classes (size of the multi-hot label vectors)
N_SPLIT_IMG = 8  # number of sliding windows cut from every spectrogram
WINDOW = 512  # width of one window along the last axis of the image tensor
COVER = 49  # overlap between two consecutive windows

In [None]:
def mono_to_color(
    X: np.ndarray, mean=None, std=None,
    norm_max=None, norm_min=None, eps=1e-6
):
    """Convert a single-channel array into a 3-channel uint8 image.

    The input is replicated three times along a new last axis, standardized,
    clipped to ``[norm_min, norm_max]`` and rescaled to ``[0, 255]``.

    Args:
        X: input array (e.g. a 2-D spectrogram).
        mean: value subtracted from the stacked array; defaults to its mean.
        std: value the centered array is divided by (plus ``eps``); defaults
            to the standard deviation of the centered array.
        norm_max: upper clipping bound; defaults to the standardized maximum.
        norm_min: lower clipping bound; defaults to the standardized minimum.
        eps: added to ``std`` to avoid division by zero; also the threshold
            below which the input is treated as constant.

    Returns:
        ``np.uint8`` array of shape ``X.shape + (3,)``. It is all zeros when
        the standardized range ``max - min`` does not exceed ``eps``.
    """
    # Stack X as [X, X, X]
    X = np.stack([X, X, X], axis=-1)

    # Standardize. `is None` is used instead of `value or default` so that an
    # explicitly supplied 0 / 0.0 is honored rather than silently replaced.
    if mean is None:
        mean = X.mean()
    X = X - mean
    if std is None:
        std = X.std()
    Xstd = X / (std + eps)

    _min, _max = Xstd.min(), Xstd.max()
    if norm_max is None:
        norm_max = _max
    if norm_min is None:
        norm_min = _min

    if (_max - _min) > eps:
        # Normalize to [0, 255]
        V = np.clip(Xstd, norm_min, norm_max)
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Constant input: just zero
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V

class SpectrogramFromNpz(torch.utils.data.Dataset):
    """Dataset that loads stored spectrogram arrays as normalized image tensors.

    Each item is read from ``./{mode}/{name}.npy`` where ``name`` is the entry
    of ``fname`` with any ``_posi`` / ``_nega`` marker removed. The array is
    turned into a 3-channel uint8 image by ``mono_to_color``, converted to a
    tensor and normalized per channel (the constants are the commonly used
    ImageNet mean / std values).
    """

    def __init__(self, fname, mode):
        # fname: indexable collection of recording names; mode: sub-directory name
        self.fname, self.mode = fname, mode
        self.to_tensor = transforms.ToTensor()
        self.norm = transforms.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225),
        )

    def __len__(self):
        return len(self.fname)

    def __getitem__(self, idx):
        # Strip the positive / negative marker to recover the name on disk.
        recording = self.fname[idx]
        for marker in ("_posi", "_nega"):
            recording = recording.replace(marker, "")

        mel = np.load(f"./{self.mode}/{recording}.npy")
        return self.norm(self.to_tensor(mono_to_color(mel)))

In [None]:
# Per-backbone settings consumed by RFCXNet.__init__: model name -> (h_idx, n_dense).
#   h_idx   : the backbone's children()[:h_idx] are kept as the feature extractor
#   n_dense : in_channels of the two 1x1 Conv1d heads applied to the extracted features
MODEL_HEADER_INFO = {
    "resnet18": (-2, 512),
    "densenet121": (-2, 1024),
    "efficientnet_b0": (-5, 320),
    "resnest50d": (-2, 2048),
    "mobilenetv2_100": (-2, 1280),
}

def interpolate(x: torch.Tensor, ratio: int):
    """Stretch the last axis of a 3-D tensor by repeating each step ``ratio`` times.

    Args:
        x: tensor of shape (batch_size, classes_num, time_steps).
        ratio: number of consecutive copies made of every time step.

    Returns:
        Tensor of shape (batch_size, classes_num, time_steps * ratio) in which
        step ``t`` of the input fills positions ``t * ratio`` to
        ``(t + 1) * ratio - 1``.
    """
    steps_first = x.transpose(1, 2)  # (batch, time, classes)
    n_batch, n_steps, n_classes = steps_first.shape
    # Insert a repeat axis right after time, tile it, then fold it into time.
    tiled = steps_first.unsqueeze(2).repeat(1, 1, ratio, 1)
    stretched = tiled.reshape(n_batch, n_steps * ratio, n_classes)
    return stretched.transpose(1, 2)

class RFCXNet(nn.Module):
    """timm backbone followed by two 1x1 Conv1d heads with attention pooling over time.

    Args:
        model_name: timm model name; must be a key of ``MODEL_HEADER_INFO``.
        pretrained: forwarded to ``timm.create_model``. Keep the default when
            training from the backbone's pretrained weights; pass ``False``
            when the weights are loaded from a checkpoint right afterwards.
    """

    def __init__(self, model_name, pretrained=True):
        super().__init__()
        self.n_label = N_LABEL

        # Resolve the configuration first so an unsupported name raises
        # KeyError before the backbone is built.
        h_idx, n_dense = MODEL_HEADER_INFO[model_name]
        base_model = timm.create_model(model_name, pretrained=pretrained)

        # Keep only the leading children of the backbone as feature extractor.
        self.resnet_head = nn.Sequential(*list(base_model.children())[:h_idx])

        # fc_a produces per-step class scores, fc_b the attention logits.
        self.fc_a = nn.Conv1d(n_dense, self.n_label, 1, bias=False)
        self.fc_b = nn.Conv1d(n_dense, self.n_label, 1, bias=False)

    def forward(self, x, perm=None, gamma=None):  # input x: (batch, channel, Hz, time)
        """Run the network on a batch of spectrogram images.

        Args:
            x: input of shape (batch, channel, Hz, time).
            perm: optional batch index; when given, each feature map is mixed
                with the feature map at ``perm``:
                ``gamma * h + (1 - gamma) * h[perm]``.
            gamma: mixing weight; required whenever ``perm`` is given.

        Returns:
            dict with
            ``clipwise_preds_att_ti``: (batch, n_class) attention-weighted sum
            over time of the per-step scores (no sigmoid applied), and
            ``segmentwise_output_ti``: the per-step scores with every time
            step repeated 32 times.
        """
        x = x.transpose(3, 2)  # (batch, channel, time, Hz)

        h = self.resnet_head(x)  # (batch, unit, time, Hz)

        if perm is not None:
            h = gamma * h + (1 - gamma) * h[perm]

        h = F.relu(h)
        ti_pool = torch.mean(h, dim=3)  # (batch, unit, time)

        xa = self.fc_a(ti_pool)  # (batch, n_class, time)
        xb = self.fc_b(ti_pool)  # (batch, n_class, time)
        xb = torch.softmax(xb, dim=2)  # attention weights sum to 1 over time

        # time pool
        clipwise_preds_att_ti = torch.sum(xa * xb, dim=2)
        segmentwise_output_ti = interpolate(xa, 32)

        return {
            "clipwise_preds_att_ti": clipwise_preds_att_ti,
            "segmentwise_output_ti": segmentwise_output_ti,
        }

In [None]:
def LWLRAP(preds, labels):
    """Label-weighted label-ranking average precision.

    For every positive label, the precision at its rank is the number of
    positive labels ranked at or above it divided by that rank. The result is
    the mean of these precisions over all positive labels in the batch.

    Args:
        preds: (n_samples, n_classes) tensor of scores; higher ranks first.
        labels: (n_samples, n_classes) tensor of 0/1 targets.

    Returns:
        The score as a Python float. It is NaN when ``labels`` contains no
        positive entry (division by zero).
    """
    ranked_classes = torch.argsort(preds, dim=-1, descending=True)
    # The 1-based rank of each class is the inverse permutation of the
    # ranking, i.e. the argsort of the argsort (replaces a Python double loop).
    class_ranks = torch.argsort(ranked_classes, dim=-1) + 1
    # Negatives get a huge rank so they sort behind every positive.
    ground_truth_ranks = class_ranks * labels + (1e6) * (1 - labels)
    sorted_ground_truth_ranks, _ = torch.sort(ground_truth_ranks, dim=-1, descending=False)
    # k-th smallest positive rank -> k positives are ranked at or above it.
    pos_matrix = torch.arange(1, labels.size(-1) + 1, device=labels.device).unsqueeze(0)
    score_matrix = pos_matrix / sorted_ground_truth_ranks
    # Mask that keeps only the first `number of positives` columns per row.
    score_mask_matrix, _ = torch.sort(labels, dim=-1, descending=True)
    scores = score_matrix * score_mask_matrix
    score = scores.sum() / labels.sum()
    return score.item()

In [None]:
# Annotation tables; both are grouped by `recording_id` and read for `species_id`.
train_fp = pd.read_csv(f"{DATA_ROOT}/train_fp.csv")
train_tp = pd.read_csv(f"{DATA_ROOT}/train_tp.csv")


def _multi_hot_per_recording(annotations, suffix):
    """Return (names, labels) with one entry per recording_id of `annotations`.

    `names` are the recording ids with `suffix` appended. `labels` are
    N_LABEL-long 0/1 lists in which a class is 1 only when it is annotated
    exactly once in the recording (the `== 1` rule is kept as-is).
    """
    one_hot = np.eye(N_LABEL)
    names, targets = [], []
    for rec_id, rows in annotations.groupby("recording_id"):
        hits = one_hot[rows["species_id"].tolist()].sum(axis=0)
        names.append(rec_id + suffix)
        targets.append((hits == 1).astype(int).tolist())
    return names, targets


tp_fnames, tp_labels = _multi_hot_per_recording(train_tp, "_posi")
fp_fnames, fp_labels = _multi_hot_per_recording(train_fp, "_nega")

model = RFCXNet(MODEL_NAME)
model.to(device)

# Sliding windows [start, end) along the time axis: the first starts at 0 and
# every following window starts COVER columns before the previous one ends.
stride = WINDOW - COVER
slide_img_pos = [[0, WINDOW]] + [
    [k * stride, k * stride + WINDOW] for k in range(1, N_SPLIT_IMG)
]

print(slide_img_pos)

# OOF

## Positive

In [None]:
# Out-of-fold inference on the TP recordings: each fold's checkpoint predicts
# only the recordings of its own validation split, one row per (recording, window).
valid_preds_dfs = []
scores = []  # (LWLRAP, mean per-class AUC) for every fold
tp_oof = np.zeros((len(tp_fnames), N_SPLIT_IMG, N_LABEL))  # never filled in the visible cells
score_cols = [f"s{i}" for i in range(N_LABEL)]
tp_labels_arr = np.array(tp_labels)
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for fold, (train_index, valid_index) in enumerate(mskf.split(tp_fnames, tp_labels)):
    valid_fnames = np.array(tp_fnames)[valid_index]
    valid_labels = tp_labels_arr[valid_index]

    print(f"### {fold} ###")
    # map_location keeps the load working when the checkpoint was saved on a
    # different device than the one in use now.
    model.load_state_dict(
        torch.load(f"{BEST_MODEL}/rfcxnet_f{fold}_best_score_model.bin", map_location=device)
    )
    model.eval()

    valid_dataset = SpectrogramFromNpz(valid_fnames, "train")
    lst = []
    for idx, X in tqdm.tqdm_notebook(enumerate(valid_dataset), total=len(valid_dataset)):
        for patch, (h, t) in enumerate(slide_img_pos):
            with torch.no_grad():
                outputs = model(X[:, :, h:t].unsqueeze(0).to(device))
            pred = outputs["clipwise_preds_att_ti"].sigmoid().cpu().numpy()[0]
            lst.append([valid_fnames[idx], patch] + pred.tolist())

    valid_preds_df = pd.DataFrame(lst, columns=["recording_id", "patch"] + score_cols)
    # Clip-level prediction = max over windows. groupby sorts its keys, so the
    # rows are re-aligned explicitly with the order of valid_labels.
    preds = valid_preds_df.groupby("recording_id").max().drop("patch", axis=1).loc[valid_fnames]
    s = LWLRAP(torch.tensor(preds.values), torch.tensor(valid_labels))
    auc_lst = [roc_auc_score(t, p) for p, t in zip(preds.values.T, valid_labels.T)]
    a = sum(auc_lst) / len(auc_lst)
    scores.append((s, a))
    valid_preds_dfs.append(valid_preds_df)

# The following cells read `tp_result_df`; build it here so the notebook
# survives Restart & Run All instead of raising NameError.
tp_result_df = pd.concat(valid_preds_dfs).reset_index(drop=True)

In [None]:
# Overall LWLRAP on the TP recordings: take the max over windows for every
# recording and order the rows like tp_fnames so they line up with tp_labels.
tp_clip_preds = (
    tp_result_df
    .groupby("recording_id")
    .max()
    .drop("patch", axis=1)
    .loc[tp_fnames]
)
LWLRAP(torch.tensor(tp_clip_preds.values), torch.tensor(np.array(tp_labels)))

## Negative

In [None]:
# Predict every FP recording with each of the 5 fold checkpoints; one frame per
# fold with one row per (recording, window).
fp_preds_dfs = []
fp_dataset = SpectrogramFromNpz(fp_fnames, "train")  # identical for every fold
fp_score_cols = [f"s{i}" for i in range(N_LABEL)]
for fold in range(5):
    print(f"### {fold} ###")
    # map_location keeps the load working when the checkpoint was saved on a
    # different device than the one in use now.
    model.load_state_dict(
        torch.load(f"{BEST_MODEL}/rfcxnet_f{fold}_best_score_model.bin", map_location=device)
    )
    model.eval()

    lst = []
    for idx, X in tqdm.tqdm_notebook(enumerate(fp_dataset), total=len(fp_dataset)):
        for patch, (h, t) in enumerate(slide_img_pos):
            with torch.no_grad():
                outputs = model(X[:, :, h:t].unsqueeze(0).to(device))
            pred = outputs["clipwise_preds_att_ti"].sigmoid().cpu().numpy()[0]
            lst.append([fp_fnames[idx], patch] + pred.tolist())
    fp_preds_df = pd.DataFrame(lst, columns=["recording_id", "patch"] + fp_score_cols)
    fp_preds_dfs.append(fp_preds_df)

In [None]:
# Average the per-fold FP predictions. The score columns are averaged as a
# float array; going through `.values` of the mixed-type frame would produce
# object-dtype columns. All fold frames share the same row order because they
# are built by the same loop over fp_fnames and slide_img_pos.
fp_avg_cols = [f"s{i}" for i in range(N_LABEL)]
fp_mean_scores = np.mean(
    [fold_df[fp_avg_cols].to_numpy(dtype=float) for fold_df in fp_preds_dfs],
    axis=0,
)
fp_result_df = pd.concat(
    [
        fp_preds_dfs[0][["recording_id", "patch"]].reset_index(drop=True),
        pd.DataFrame(fp_mean_scores, columns=fp_avg_cols),
    ],
    axis=1,
)

## Merge

In [None]:
# Stack the TP and FP prediction rows and recover the bare recording id, i.e.
# the part of "recording_id" in front of the first underscore.
all_oof_result_df = pd.concat([tp_result_df, fp_result_df], ignore_index=True)
all_oof_result_df["org_recording_id"] = [
    rec_id.split("_")[0] for rec_id in all_oof_result_df["recording_id"]
]
all_oof_result_df

In [None]:
def _prefer_positive_rows(group):
    """Keep only the "_posi" rows of a recording that has more rows than windows.

    A recording normally contributes N_SPLIT_IMG rows (one per window). More
    rows than that means it is present under more than one name, in which case
    only the rows whose name contains "_posi" are kept.
    """
    if len(group) > N_SPLIT_IMG:
        return group[group["recording_id"].str.contains("_posi", regex=False)]
    return group


dfs = [
    _prefer_positive_rows(df)
    for _, df in all_oof_result_df.groupby("org_recording_id")
]
# Replace the suffixed name by the bare recording id; assigning into the
# existing column keeps "recording_id" as the first column of the output.
all_oof_result_df = (
    pd.concat(dfs)
    .reset_index(drop=True)
    .assign(recording_id=lambda frame: frame["org_recording_id"])
    .drop("org_recording_id", axis=1)
)

In [None]:
# Write the merged per-window predictions to the working directory; index=None is
# falsy, so the row index is not written.
all_oof_result_df.to_csv("oof_toda_v1.csv", index=None)

# Test

In [None]:
# The recordings to predict are the recording_id values of the sample submission;
# with mode "test" the dataset reads each one from ./test/{recording_id}.npy.
sample_submission = pd.read_csv(f"{DATA_ROOT}/sample_submission.csv")
test_fnames = sample_submission["recording_id"].values
test_datasets = SpectrogramFromNpz(test_fnames, "test")

In [None]:
# Predict every test recording with each of the 5 fold checkpoints; one frame
# per fold with one row per (recording, window).
test_dfs = []
test_score_cols = [f"s{i}" for i in range(N_LABEL)]
for fold in range(5):
    print(f"### {fold} ###")
    # map_location keeps the load working when the checkpoint was saved on a
    # different device than the one in use now.
    model.load_state_dict(
        torch.load(f"{BEST_MODEL}/rfcxnet_f{fold}_best_score_model.bin", map_location=device)
    )
    model.eval()

    lst = []
    for idx, X in tqdm.tqdm_notebook(enumerate(test_datasets), total=len(test_datasets)):
        for patch, (h, t) in enumerate(slide_img_pos):
            with torch.no_grad():
                outputs = model(X[:, :, h:t].unsqueeze(0).to(device))
            pred = outputs["clipwise_preds_att_ti"].sigmoid().cpu().numpy()[0]
            lst.append([test_fnames[idx], patch] + pred.tolist())
    test_df = pd.DataFrame(lst, columns=["recording_id", "patch"] + test_score_cols)
    test_dfs.append(test_df)

In [None]:
# Average the per-fold test predictions. The score columns are averaged as a
# float array; going through `.values` of the mixed-type frame would produce
# object-dtype columns. All fold frames share the same row order because they
# are built by the same loop over test_datasets and slide_img_pos.
test_avg_cols = [f"s{i}" for i in range(N_LABEL)]
test_mean_scores = np.mean(
    [fold_df[test_avg_cols].to_numpy(dtype=float) for fold_df in test_dfs],
    axis=0,
)
oof_test_pred_avg = pd.concat(
    [
        test_dfs[0][["recording_id", "patch"]].reset_index(drop=True),
        pd.DataFrame(test_mean_scores, columns=test_avg_cols),
    ],
    axis=1,
)

In [None]:
# Write the fold-averaged per-window test predictions to the working directory;
# index=None is falsy, so the row index is not written.
oof_test_pred_avg.to_csv("test_toda_v1.csv", index=None)