In [None]:
# Print the NVIDIA driver / GPU status of this runtime.
!nvidia-smi

In [None]:
# Mount Google Drive; later cells read the Kaggle token and the competition CSVs
# from paths under ./drive and write model outputs there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%%time
# 大体10分くらい

!pip install -q kaggle
!mkdir -p .kaggle
!cp "./drive/My Drive/Study/config/kaggle.json" .kaggle/
!chmod 600 .kaggle/kaggle.json
!mv .kaggle /root

!kaggle datasets download "theoviel/rcfx-spectrograms-32-khz"
!unzip rcfx-spectrograms-32-khz.zip > /dev/null
!rm -rf rcfx-spectrograms-32-khz.zip 

!pip install -U iterative-stratification albumentations wandb  > /dev/null
!wandb login e0792bb688a0d18e359df7438c45da90f8794091

In [None]:
import os
import tqdm
import random

from matplotlib import pyplot as plt
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from datetime import datetime

import pandas as pd
import numpy as np
from numpy.random import beta

import torch
from torchvision.models import resnet18
import torch.nn as nn
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau
from torchvision import transforms
from torch.nn import functional as F

import albumentations as A

import wandb

# Prefer the GPU, but fall back to the CPU so that every later `.to(device)`
# still works on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Folder on the mounted Drive that holds the competition CSV files.
DATA_ROOT = "./drive/MyDrive/Study/RFCX/input"

# sample_submission supplies the test recording ids; train_tp / train_fp hold the
# annotation rows that are grouped per recording_id further down.
sample_submission = pd.read_csv(f"{DATA_ROOT}/sample_submission.csv")
train_fp = pd.read_csv(f"{DATA_ROOT}/train_fp.csv")
train_tp = pd.read_csv(f"{DATA_ROOT}/train_tp.csv")

In [None]:
def get_label_and_middle_posi(_recording_id):
    """Build the frame-level label map of a recording listed in ``tp_dict``.

    Each annotation row is unpacked as (species_id, t_min, f_min, t_max, f_max).
    Its time span is scaled by 3751 / 60 to frame indices and the matching
    frames of that species are set to 1. The frequency bounds are not used.

    Args:
        _recording_id: key into the module-level ``tp_dict``.

    Returns:
        (label, middle): ``label`` is a (24, 3751) float array and ``middle``
        lists the centre frame of every annotation, in row order.

    Raises:
        KeyError: if the recording is not present in ``tp_dict``.
    """
    n_frames = 3751
    frame_labels = np.zeros((24, n_frames))
    centres = []
    for row in tp_dict[_recording_id]:
        species_id, t_min, _f_min, t_max, _f_max = row
        start = int(n_frames * (t_min / 60))
        stop = int(n_frames * (t_max / 60))
        frame_labels[species_id, start:stop] = 1
        centres.append((stop + start) // 2)
    return frame_labels, centres

def get_label_and_middle_nega(_recording_id):
    """Return an all-zero label map and the annotation centres of a ``fp_dict`` entry.

    The rows are only used to locate the centre frame of each annotation
    (times scaled by 3751 / 60); no species is ever marked in the label map.

    Args:
        _recording_id: key into the module-level ``fp_dict``.

    Returns:
        (label, middle): ``label`` is a (24, 3751) array of zeros and
        ``middle`` lists the centre frame of every annotation, in row order.

    Raises:
        KeyError: if the recording is not present in ``fp_dict``.
    """
    n_frames = 3751
    centres = [
        (int(n_frames * (t_max / 60)) + int(n_frames * (t_min / 60))) // 2
        for _species_id, t_min, _f_min, t_max, _f_max in fp_dict[_recording_id]
    ]
    return np.zeros((24, n_frames)), centres

# Per-species loss weights: the most frequent species gets 1.0 and rarer species
# get proportionally larger values. sort_index() orders them by species id.
counts_df = train_tp["species_id"].value_counts()
max_counts = counts_df.max()
label_weight = max_counts / counts_df
pos_weights = torch.Tensor(label_weight.sort_index().values).to(device)

# Annotation rows per recording. Columns are selected by position and are later
# unpacked as (species_id, t_min, f_min, t_max, f_max).
tp_dict = {}
for recording_id, df in train_tp.groupby("recording_id"):
    tp_dict[recording_id] = df.values[:, [1,3,4,5,6]]

# train_fp recordings get a "_nega" suffix so their keys never collide with tp_dict.
fp_dict = {}
for recording_id, df in train_fp.groupby("recording_id"):
    fp_dict[recording_id+"_nega"] = df.values[:, [1,3,4,5,6]]

# Clip-level multi-hot labels, used to stratify the folds.
fnames, labels = [], []
for recording_id, df in train_tp.groupby("recording_id"):
    # Mark presence rather than "exactly one annotation": a species annotated
    # twice in the same recording must still be labelled 1.
    v = np.zeros(24, dtype=int)
    v[df["species_id"].unique()] = 1
    fnames.append(recording_id)
    labels.append(v.tolist())

fnames = np.array(fnames)
labels = np.array(labels)
fp_id_list = [lab+"_nega" for lab in train_fp["recording_id"].unique()]
test_fnames = sample_submission["recording_id"].values

In [None]:
# https://www.kaggle.com/c/rfcx-species-audio-detection/discussion/198418

# LRAP. Instance-level average
# Assume float preds [BxC], labels [BxC] of 0 or 1
def LRAP(preds, labels):
    """Label ranking average precision, averaged over instances.

    Args:
        preds: float tensor [B x C] of scores (higher = more confident).
        labels: tensor [B x C] holding 0 or 1.

    Returns:
        Python float: the mean over rows of the per-row average precision.
        Rows without any positive label are left out of the mean, because they
        would otherwise divide by zero and turn the result into NaN. 0.0 is
        returned when no row has a positive label.
    """
    # Class indices ordered from best to worst score.
    ranked_classes = torch.argsort(preds, dim=-1, descending=True)
    # Each row of ranked_classes is a permutation; its argsort is the inverse
    # permutation, i.e. the 0-based rank of every class.
    class_ranks = torch.argsort(ranked_classes, dim=-1) + 1
    # Keep the ranks of the positive labels and push everything else far back.
    ground_truth_ranks = class_ranks * labels + (1e6) * (1 - labels)
    # After sorting, the ranks of the positive labels come first.
    sorted_ground_truth_ranks, _ = torch.sort(ground_truth_ranks, dim=-1, descending=False)
    pos_matrix = torch.arange(1, labels.size(-1) + 1, device=labels.device).unsqueeze(0)
    # k-th positive label (1-based) divided by its rank = precision at that label.
    score_matrix = pos_matrix / sorted_ground_truth_ranks
    score_mask_matrix, _ = torch.sort(labels, dim=-1, descending=True)
    scores = score_matrix * score_mask_matrix

    num_labels = labels.sum(-1)
    has_labels = num_labels > 0
    if not bool(has_labels.any()):
        return 0.0
    score = (scores.sum(-1)[has_labels] / num_labels[has_labels]).mean()
    return score.item()

# label-level average
# Assume float preds [BxC], labels [BxC] of 0 or 1
def LWLRAP(preds, labels):
    """Ranking average precision pooled over every positive label in the batch.

    Args:
        preds: float tensor [B x C] of scores (higher = more confident).
        labels: tensor [B x C] holding 0 or 1.

    Returns:
        Python float: the sum of the precision at every positive label divided
        by the total number of positive labels. 0.0 is returned when the batch
        has no positive label at all (the division would otherwise give NaN).
    """
    # Class indices ordered from best to worst score.
    ranked_classes = torch.argsort(preds, dim=-1, descending=True)
    # Each row of ranked_classes is a permutation; its argsort is the inverse
    # permutation, i.e. the 0-based rank of every class.
    class_ranks = torch.argsort(ranked_classes, dim=-1) + 1
    # Keep the ranks of the positive labels and push everything else far back.
    ground_truth_ranks = class_ranks * labels + (1e6) * (1 - labels)
    # After sorting, the ranks of the positive labels come first.
    sorted_ground_truth_ranks, _ = torch.sort(ground_truth_ranks, dim=-1, descending=False)
    pos_matrix = torch.arange(1, labels.size(-1) + 1, device=labels.device).unsqueeze(0)
    # k-th positive label (1-based) divided by its rank = precision at that label.
    score_matrix = pos_matrix / sorted_ground_truth_ranks
    score_mask_matrix, _ = torch.sort(labels, dim=-1, descending=True)
    scores = score_matrix * score_mask_matrix

    total_labels = labels.sum()
    if float(total_labels) == 0:
        return 0.0
    score = scores.sum() / total_labels
    return score.item()

def set_seed(seed: int = 42):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN in deterministic mode.

    Args:
        seed: value used for every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # The cuDNN auto-tuner may pick a different algorithm on every run, which
    # defeats `deterministic = True`; it has to be off for reproducible results.
    torch.backends.cudnn.benchmark = False  # type: ignore

def parse_labels(recording_id):
    """Look up the label of a recording, defaulting to 24 zeros.

    NOTE(review): ``label_dict`` is not defined in the visible part of the
    notebook and nothing visible calls this helper; confirm it is still needed.

    Args:
        recording_id: key into the module-level ``label_dict``.

    Returns:
        The stored label, or ``np.zeros(24)`` when the key is missing.
    """
    try:
        return label_dict[recording_id]
    except KeyError:
        return np.zeros(24)

def mixup(input, gamma, perm):
    """Blend a batch with a permuted copy of itself, in place.

    Computes ``gamma * input + (1 - gamma) * input[perm]`` and stores the result
    in ``input``.

    Args:
        input: tensor whose first dimension is indexed by ``perm``; it is modified.
        gamma: mixing weight applied to the original batch.
        perm: index tensor selecting the partner of every row.

    Returns:
        ``input`` itself, holding the blended values.
    """
    # Indexing with a tensor copies, so the in-place update below cannot corrupt it.
    perm_input = input[perm]
    # The `alpha=` keyword replaces the deprecated add_(Number, Tensor) overload.
    return input.mul_(gamma).add_(perm_input, alpha=1 - gamma)

def interpolate(x: torch.Tensor, ratio: int):
    """Stretch the last axis of a 3-D tensor by repeating every step ``ratio`` times.

    Args:
        x: tensor of shape (batch, classes, steps).
        ratio: number of copies made of each step.

    Returns:
        Tensor of shape (batch, classes, steps * ratio) in which every original
        step appears ``ratio`` times in a row.
    """
    steps_first = x.transpose(1, 2)
    n_batch, n_steps, n_classes = steps_first.shape
    # Insert a length-1 axis after the step axis and tile it `ratio` times.
    repeated = steps_first.unsqueeze(2).repeat(1, 1, ratio, 1)
    stretched = repeated.reshape(n_batch, n_steps * ratio, n_classes)
    return stretched.transpose(1, 2)

In [None]:
class TimeMask:
    """Blank out random spans along the last axis of a 3-D spectrogram tensor.

    Args:
        T: exclusive upper bound of the sampled span length.
        num_masks: number of spans drawn per call.
        replace_with_zero: fill with 0 when True, otherwise with the mean of the
            (already partially masked) tensor.
    """

    def __init__(self, T=40, num_masks=1, replace_with_zero=True):
        self.T = T
        self.num_masks = num_masks
        self.replace_with_zero = replace_with_zero

    def __call__(self, spec):
        """Return a masked copy of ``spec``; the input tensor is left untouched."""
        cloned = spec.clone()
        len_spectro = cloned.shape[2]

        # Cap the span so `len_spectro - t` stays positive even when the input
        # is shorter than T (randrange would raise on an empty range).
        max_width = min(self.T, len_spectro)
        if max_width <= 0:
            return cloned

        for _ in range(self.num_masks):
            t = random.randrange(0, max_width)
            t_zero = random.randrange(0, len_spectro - t)

            # A zero-length span masks nothing; move on to the next mask
            # instead of giving up on the remaining ones.
            if t == 0:
                continue

            mask_end = random.randrange(t_zero, t_zero + t)
            if self.replace_with_zero:
                cloned[:, :, t_zero:mask_end] = 0
            else:
                cloned[:, :, t_zero:mask_end] = cloned.mean()
        return cloned

class FreqMask:
    """Blank out random bands along axis 1 of a spectrogram tensor.

    Args:
        F: exclusive upper bound of the sampled band height.
        num_masks: number of bands drawn per call.
        replace_with_zero: fill with 0 when True, otherwise with the mean of the
            (already partially masked) tensor.
    """

    def __init__(self, F=30, num_masks=1, replace_with_zero=True):
        self.F = F
        self.num_masks = num_masks
        self.replace_with_zero = replace_with_zero

    def __call__(self, spec):
        """Return a masked copy of ``spec``; the input tensor is left untouched."""
        cloned = spec.clone()
        num_mel_channels = cloned.shape[1]

        # Cap the band so `num_mel_channels - f` stays positive even when the
        # input has fewer channels than F (randrange would raise otherwise).
        max_height = min(self.F, num_mel_channels)
        if max_height <= 0:
            return cloned

        for _ in range(self.num_masks):
            f = random.randrange(0, max_height)
            f_zero = random.randrange(0, num_mel_channels - f)

            # A zero-height band masks nothing; move on to the next mask
            # instead of giving up on the remaining ones.
            if f == 0:
                continue

            mask_end = random.randrange(f_zero, f_zero + f)
            if self.replace_with_zero:
                cloned[:, f_zero:mask_end] = 0
            else:
                cloned[:, f_zero:mask_end] = cloned.mean()

        return cloned

def mono_to_color(
    X: np.ndarray, mean=None, std=None,
    norm_max=None, norm_min=None, eps=1e-6
):
    """Turn a single-channel array into a 3-channel uint8 image.

    The input is replicated on a new last axis, standardised, clipped to
    ``[norm_min, norm_max]`` and rescaled to ``[0, 255]``.

    Args:
        X: input array; the output gains a trailing axis of size 3.
        mean: value subtracted before scaling; computed from ``X`` when None.
        std: divisor used for scaling; computed from the centred data when None.
        norm_max: upper clipping bound; the standardised maximum when None.
        norm_min: lower clipping bound; the standardised minimum when None.
        eps: guards the division by ``std`` and detects constant inputs.

    Returns:
        uint8 array of shape ``X.shape + (3,)``; all zeros when the standardised
        data spans no more than ``eps``.
    """
    # Stack X as [X,X,X]
    X = np.stack([X, X, X], axis=-1)

    # Standardize. Compare against None explicitly: a caller-supplied 0 / 0.0 is
    # a legitimate value and must not be replaced by the computed statistic.
    if mean is None:
        mean = X.mean()
    X = X - mean
    if std is None:
        std = X.std()
    Xstd = X / (std + eps)
    _min, _max = Xstd.min(), Xstd.max()
    if norm_max is None:
        norm_max = _max
    if norm_min is None:
        norm_min = _min
    if (_max - _min) > eps:
        # Normalize to [0, 255]
        V = np.clip(Xstd, norm_min, norm_max)
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Just zero
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V

# Candidate distances (in frames) between the start of a crop and the annotation centre.
H_POS = [32, 64, 128, 256]
# Width of every crop, in frames.
WINDOW = 512
# Alternative setting tried earlier: H_POS = [32, 64, 128] with WINDOW = 256.
# Largest frame index a crop may end at.
MAX_SEQ = 3751 - 1


def extract_ht_pos(pos):
    """Pick a WINDOW-wide crop around the frame ``pos``.

    The crop starts a randomly chosen ``H_POS`` distance before ``pos``; only
    distances strictly smaller than ``pos`` are eligible, and when none is the
    crop starts at frame 0. A crop that would end past ``MAX_SEQ`` is replaced
    by the last full window.

    Args:
        pos: frame index the crop has to contain.

    Returns:
        (h, t): start and end frame of the crop, with ``t - h == WINDOW``.
    """
    candidates = [offset for offset in H_POS if pos > offset]
    lead = random.choice(candidates) if candidates else pos
    start = pos - lead
    stop = start + WINDOW
    if stop > MAX_SEQ:
        return MAX_SEQ - WINDOW, MAX_SEQ
    return start, stop

class SpectrogramFromNpz(torch.utils.data.Dataset):
    """Dataset serving spectrograms stored as ``.npy`` files.

    Modes:
        "train": returns a WINDOW-frame crop around a randomly chosen annotation
            together with the matching slice of the frame-level label map, with
            augmentation applied.
        "valid": returns the whole spectrogram and the whole label map.
        "test": returns the whole spectrogram; the label is None for recordings
            that are in neither ``tp_dict`` nor ``fp_dict``.

    Args:
        fname: sequence of recording ids; ids ending in "_nega" refer to
            ``fp_dict`` entries and are loaded from the file without the suffix.
        mode: one of "train", "valid" or "test".
    """

    MODES = ("train", "valid", "test")

    def __init__(self, fname, mode):
        self.fname = fname
        self.mode = mode
        self.to_tensor = transforms.ToTensor()
        self.norm = transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        self.augument_funcs_a = A.Compose([
            A.RandomBrightnessContrast(p=0.5)
        ])
        self.augument_funcs_b = transforms.RandomApply([
            transforms.Lambda(lambda img: transforms.functional.adjust_gamma(img, gamma=2, gain=1)),
            TimeMask(),
            FreqMask(),
        ], p=0.5)

    def __len__(self):
        return len(self.fname)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the recording at position ``idx``."""
        fname = self.fname[idx]
        if self.mode not in self.MODES:
            # Previously this surfaced as a NameError on the undefined `path`.
            raise ValueError(f"unknown mode {self.mode!r}; expected one of {self.MODES}")

        # load labels
        if fname in tp_dict:
            label, middle = get_label_and_middle_posi(fname)
        elif fname in fp_dict:
            label, middle = get_label_and_middle_nega(fname)
        else:
            label, middle = None, None

        h = t = None
        if self.mode == "train":
            if label is None:
                raise KeyError(f"no annotation found for training recording {fname!r}")
            # Crop window around one randomly chosen annotation.
            h, t = extract_ht_pos(random.choice(middle))
            label = label[:, h:t]

        # load images
        if self.mode == "test":
            path = f"./test/{fname}.npy"
        else:
            # "_nega" only tags the label source; the file on disk has no suffix.
            _fname = fname.replace("_nega", "")
            path = f"./train/{_fname}.npy"
        # assumes the stored array is (frequency, time) - TODO confirm
        mel = np.load(path)
        image = mono_to_color(mel)

        # augment
        if self.mode == "train":
            # Crop first: masking the full clip and cropping afterwards meant the
            # time mask usually fell outside the window the model trains on.
            image = np.ascontiguousarray(image[:, h:t])
            image = self.augument_funcs_a(image=image)["image"]
            image = self.to_tensor(image)
            image = self.augument_funcs_b(image)
        else:
            image = self.to_tensor(image)
        image = self.norm(image)

        return image, label

In [None]:
# Blending weights for four prediction heads.
# NOTE(review): every use of pred_w in the visible code is commented out, so
# these values currently have no effect.
pred_w = [0.4, 0.4, 0.1, 0.1]

def rfcx_criterion(outputs, targets):
    """Combine the four time-pooled losses of the model into one scalar.

    Only the time-pooled ("_ti") heads are used; the frequency-pooled variant
    that earlier experiments added is disabled.

    Args:
        outputs: dict produced by the model with the keys "pseudo_label_ti",
            "clipwise_preds_att_ti", "attention_preds_ti",
            "clipwise_preds_max_ti" and "segmentwise_output_ti".
        targets: pair ``(clip_y, seq_y)`` of clip-level and frame-level targets.

    Returns:
        Scalar tensor: clip-attention loss + 0.5 * clip-max loss + pseudo-label
        loss + 0.5 * frame-level loss. Every term is weighted per class by the
        module-level ``pos_weights``.
    """
    clip_y, seq_y = targets

    clip_bce = nn.BCEWithLogitsLoss(pos_weight=pos_weights)
    elementwise_bce = nn.BCEWithLogitsLoss(reduction="none")

    def class_weighted(per_element):
        # Average over the last axis, weight every class, then average the rest.
        return (per_element.mean(2) * pos_weights).mean()

    # Clip-level heads: attention-pooled and max-pooled logits.
    att_loss = clip_bce(outputs["clipwise_preds_att_ti"], clip_y)
    max_loss = clip_bce(outputs["clipwise_preds_max_ti"], clip_y)
    # Attention map against the pseudo labels produced by the model itself.
    pseudo_loss = class_weighted(
        elementwise_bce(outputs["attention_preds_ti"], outputs["pseudo_label_ti"])
    )
    # Frame-level output against the frame-level targets.
    frame_loss = class_weighted(
        elementwise_bce(outputs["segmentwise_output_ti"], seq_y)
    )

    return att_loss + max_loss*0.5 + pseudo_loss + frame_loss*0.5


def train_loop(train_data_loader, model, optimizer, scheduler):
    """Train the model for one pass over ``train_data_loader``.

    Every batch is blended with a shuffled copy of itself (mixup with a
    Beta(0.1, 0.1) weight shared by the inputs and both targets) before the
    forward pass. The scheduler, when given, is stepped after every batch.

    Args:
        train_data_loader: yields ``(X, y)`` batches; ``y`` is summed over axis 2
            to obtain the clip-level target.
        model: network whose output dict is understood by ``rfcx_criterion``.
        optimizer: optimizer updating the model parameters.
        scheduler: learning-rate scheduler or None.

    Returns:
        (mean batch loss, mean learning rate) over the pass.
    """
    model.train()
    batch_losses, batch_lrs = [], []
    for images, frame_labels in train_data_loader:
        images = images.to(device)
        # A class is present in the clip when any of its frames is labelled.
        clip_y = (frame_labels.sum(2) != 0).float().to(device)
        seq_y = frame_labels.to(device)

        # mixup: one weight and one permutation shared by inputs and targets
        weight = beta(0.1, 0.1)
        perm = torch.randperm(images.size(0))
        images = mixup(images, weight, perm)
        clip_y = mixup(clip_y, weight, perm)
        seq_y = mixup(seq_y, weight, perm)

        loss = rfcx_criterion(model(images), (clip_y, seq_y))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        batch_losses.append(loss.item())
        current_lrs = [group["lr"] for group in optimizer.param_groups]
        batch_lrs.append(np.array(current_lrs).mean())

    return np.array(batch_losses).mean(), np.array(batch_lrs).mean()

# Evaluation windows: 8 crops of WINDOW frames, each starting 51 frames before
# the end of the previous one (i.e. a stride of WINDOW - 51 frames).
eval_img_pos = [
    [k * (WINDOW - 51), k * (WINDOW - 51) + WINDOW]
    for k in range(8)
]
    
def valid_loop(valid_data_loader, model):
    """Score the model on the validation set with sliding-window inference.

    Each batch is evaluated on every window of ``eval_img_pos``; the two
    clip-level heads are averaged after a sigmoid and the maximum over the
    windows is used as the clip prediction, as in test-time inference.

    Args:
        valid_data_loader: yields ``(X, y)`` batches of full-length
            spectrograms; ``y`` is summed over axis 2 to get clip-level targets.
        model: network returning "clipwise_preds_att_ti" and
            "clipwise_preds_max_ti" logits.

    Returns:
        (mean LRAP, mean class-weighted binary cross-entropy) over the batches.
    """
    model.eval()
    v_scores, v_losses = [], []
    eps = 1e-7  # keeps log() finite for saturated probabilities
    for X, y in valid_data_loader:
        X = X.to(device)
        clip_y = (y.sum(2) != 0).float().to(device)

        preds = []
        with torch.no_grad():
            for h, t in eval_img_pos:
                # Slice the time axis; without it every window saw the same full clip.
                outputs = model(X[:, :, :, h:t])
                p1 = outputs["clipwise_preds_att_ti"].sigmoid()
                p2 = outputs["clipwise_preds_max_ti"].sigmoid()
                preds.append((p1 + p2)/2)
        max_pred, _  = torch.max(torch.stack(preds), dim=0)

        score = LRAP(max_pred.cpu(), clip_y.cpu())
        # max_pred holds probabilities, so a logits-based loss would apply the
        # sigmoid twice; compute the pos_weight-ed cross-entropy directly.
        prob = max_pred.clamp(eps, 1 - eps)
        loss = -(pos_weights * clip_y * torch.log(prob)
                 + (1 - clip_y) * torch.log(1 - prob)).mean()
        v_scores.append(score)
        v_losses.append(loss.item())

    valid_score, valid_loss = np.array(v_scores).mean(), np.array(v_losses).mean()
    return valid_score, valid_loss

In [None]:
class RFCXNet(nn.Module):
    """ResNet-18 feature extractor with two 1x1 convolution heads over time.

    ``fc_a`` produces per-frame class logits and ``fc_b`` produces attention
    weights (softmax over the frame axis). Only the time-pooled branch is
    active; the frequency-pooled branch of earlier experiments is disabled.
    """

    def __init__(self):
        super().__init__()
        self.n_label = 24
        backbone = resnet18(pretrained=True)
        # Drop the last two children of the ResNet and keep the rest as the feature extractor.
        self.resnet_head = nn.Sequential(*list(backbone.children())[:-2])

        self.fc_a = nn.Conv1d(512, self.n_label, 1, bias=False)
        self.fc_b = nn.Conv1d(512, self.n_label, 1, bias=False)

    def forward(self, x, perm=None, gamma=None):  # input x: (batch, channel, Hz, time)
        """Run the network.

        Args:
            x: input batch laid out as (batch, channel, Hz, time).
            perm: optional index tensor; when given, the backbone features are
                blended with their permuted copy.
            gamma: blending weight of the un-permuted features; only used
                together with ``perm``.

        Returns:
            dict with "pseudo_label_ti" (frame logits thresholded at a sigmoid
            of 0.7), "clipwise_preds_att_ti" (attention-weighted sum of the
            frame logits), "attention_preds_ti" (the attention weights),
            "clipwise_preds_max_ti" (max of the frame logits over frames) and
            "segmentwise_output_ti" (frame logits repeated 32 times per frame).
        """
        features = self.resnet_head(x.transpose(3, 2))  # (batch, unit, time, Hz)
        if perm is not None:
            features = gamma * features + (1 - gamma) * features[perm]

        # Average over the frequency axis: (batch, unit, time)
        ti_pool = F.relu(features).mean(dim=3)

        frame_logits = self.fc_a(ti_pool)  # (batch, n_class, time)
        attention = torch.softmax(self.fc_b(ti_pool), dim=2)  # (batch, n_class, time)

        clip_max, _ = torch.max(frame_logits, dim=2)
        return {
            "pseudo_label_ti": (frame_logits.sigmoid() >= 0.7).float(),
            "clipwise_preds_att_ti": torch.sum(frame_logits * attention, dim=2),
            "attention_preds_ti": attention,
            "clipwise_preds_max_ti": clip_max,
            "segmentwise_output_ti": interpolate(frame_logits, 32),
        }

In [None]:
# Disabled smoke test, kept as a bare string so that nothing is executed.
# NOTE(review): it still passes three targets (clip_y, seq_y, hz_y) while
# rfcx_criterion currently unpacks two, so update it before re-enabling.
"""model = RFCXNet()
model.to(device)

train_datasets = SpectrogramFromNpz(fnames, "train")
train_data_loader = torch.utils.data.DataLoader(train_datasets, batch_size=4, shuffle=True, num_workers=0)

for X, y in train_data_loader:
    clip_y = (y.sum((2, 3)) != 0).float().to(device)
    seq_y = (y.sum(2) != 0).float().to(device)
    hz_y = (y.sum(3) != 0).float().to(device)

    output = model(X.to(device))
    loss = rfcx_criterion(output, (clip_y, seq_y, hz_y))
    break"""

In [None]:
# Experiment configuration.
SEED = 416    # seed handed to set_seed() at the start of every fold
N_FOLD = 5    # number of cross-validation folds
WORKS = 0     # DataLoader worker processes
EXP_NAME = "exp0048_fix_nega_label"
OUTPUT = f"./drive/MyDrive/Study/RFCX/output/{EXP_NAME}"

# Create the output folder (and any missing parents); a no-op when it already exists.
os.makedirs(OUTPUT, exist_ok=True)

In [None]:
# Cross-validated training: one model per fold, followed by test-set inference
# with the checkpoint that reached the best validation score.
mskf = MultilabelStratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=0)
test_datasets = SpectrogramFromNpz(test_fnames, "test")

for fold, (train_index, valid_index) in enumerate(mskf.split(fnames, labels)):
    # To resume an interrupted run, `continue` here for the folds already done.
    print(datetime.now(), f"\t: ### FOLD-{fold} ###")
    set_seed(SEED)
    wandb.init(project="rfcx", name=f"{EXP_NAME}_f{fold}")

    # hyper parameter
    config = wandb.config
    config.seed = SEED
    config.learning_rate = 1e-3
    config.batch_size = 64
    config.num_epochs = 50
    config.alpha = 0.1
    config.t_max = 10
    config.factor = 0.5

    config.exp_name = EXP_NAME
    config.fold = fold

    train_fname = fnames[train_index]
    valid_fname = fnames[valid_index]

    # This first loader only provides the number of batches per epoch for the
    # scheduler; the training loader is rebuilt every epoch below.
    fp_fnames = random.sample(fp_id_list, 50)
    train_datasets = SpectrogramFromNpz(train_fname.tolist()+fp_fnames, "train")
    train_data_loader = torch.utils.data.DataLoader(train_datasets, batch_size=config.batch_size, shuffle=True, num_workers=WORKS)
    valid_datasets = SpectrogramFromNpz(valid_fname, "valid")
    valid_data_loader = torch.utils.data.DataLoader(valid_datasets, batch_size=config.batch_size, shuffle=False, num_workers=WORKS)

    model = RFCXNet()
    model.to(device)
    optimizer = Adam(model.parameters(), lr=config.learning_rate)

    # The scheduler is stepped once per batch, so T_max is counted in batches.
    scheduler = CosineAnnealingLR(optimizer, T_max=len(train_data_loader)*config.t_max, eta_min=0.0)
    del train_datasets, train_data_loader

    wandb.watch(model)

    print(datetime.now(), "\t: start train")
    best_score, best_loss = 0, 9999
    for epoch in range(config.num_epochs):
        # Draw a fresh set of 50 "_nega" recordings for every epoch.
        fp_fnames = random.sample(fp_id_list, 50)
        train_datasets = SpectrogramFromNpz(train_fname.tolist()+fp_fnames, "train")
        train_data_loader = torch.utils.data.DataLoader(train_datasets, batch_size=config.batch_size, shuffle=True, num_workers=WORKS)

        t_loss, lr = train_loop(train_data_loader, model, optimizer, scheduler)
        v_score, v_loss = valid_loop(valid_data_loader, model)

        if best_score < v_score:
            print(f"epoch {epoch}: best score update !!!")
            torch.save(model.state_dict(), f"{OUTPUT}/rfcxnet_f{config.fold}_best_score_model.bin")
            best_score = v_score
        if best_loss > v_loss:
            print(f"epoch {epoch}: best loss update !!!")
            torch.save(model.state_dict(), f"{OUTPUT}/rfcxnet_f{config.fold}_best_loss_model.bin")
            best_loss = v_loss

        wandb.log({"train loss": t_loss, "lr": lr, "valid loss": v_loss, "valid score": v_score, "best score": best_score, "best loss": best_loss})
    print(datetime.now(), "\t: finish train")

    # predict test data
    model.load_state_dict(torch.load(f"{OUTPUT}/rfcxnet_f{config.fold}_best_score_model.bin", map_location=device))
    # Do not rely on valid_loop having left the model in eval mode.
    model.eval()

    lst = []
    for idx, (X, _) in tqdm.tqdm_notebook(enumerate(test_datasets), total=len(test_datasets)):
        preds = []
        for h, t in eval_img_pos:
            with torch.no_grad():
                outputs = model(X[:,:,h:t].unsqueeze(0).to(device))
            p1 = outputs["clipwise_preds_att_ti"].sigmoid()
            p2 = outputs["clipwise_preds_max_ti"].sigmoid()
            pred = (p1 + p2)/2
            preds.append(pred)
        # Clip prediction = best window for every class.
        max_pred, _  = torch.max(torch.stack(preds), dim=0)
        pred = max_pred.cpu().numpy()[0].tolist()

        row = [test_fnames[idx]] + pred
        lst.append(row)

    fold_sub = pd.DataFrame(lst, columns=["recording_id"]+[f"s{i}" for i in range(24)])
    fold_sub.to_csv(f"{OUTPUT}/rfcxnet_f{config.fold}_predict.csv", index=None)

    # Close this fold's W&B run explicitly instead of opening an extra empty
    # run after the loop.
    wandb.finish()

In [None]:
#import pandas as pd
#import torch
#!ls ./drive/MyDrive/Study/RFCX/output/

In [None]:
#EXP_NAME = "exp0043_segmentwise_axb"
#OUTPUT = f"./drive/MyDrive/Study/RFCX/output/{EXP_NAME}"

In [None]:
# Average the per-fold test predictions into the final submission.
all_v_lst = []
recording_ids = None
for fold in range(N_FOLD):
    df = pd.read_csv(f"{OUTPUT}/rfcxnet_f{fold}_predict.csv")
    # The element-wise mean below is only valid when every fold lists the
    # recordings in the same order.
    if recording_ids is None:
        recording_ids = df["recording_id"]
    elif not recording_ids.equals(df["recording_id"]):
        raise ValueError(f"fold {fold} lists different recording ids than fold 0")
    # All columns after recording_id are class scores; convert the whole table at once.
    all_v_lst.append(torch.Tensor(df.iloc[:, 1:].values.astype(float)))

# Stack to (recordings, classes, folds) and average over the folds.
all_preds = torch.stack(all_v_lst, dim=2).mean(2)
sub = pd.DataFrame(all_preds.tolist(), columns=df.columns[1:])
sub = pd.concat([df[["recording_id"]], sub], axis=1)
sub.to_csv(f"./submission_{EXP_NAME}.csv", index=None)