In [1]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')

In [2]:
import os
import gc
import ast
import cv2
import copy
import time
import yaml
import random
import shutil
import warnings
import torchaudio
import subprocess
import numpy as np
import pandas as pd
import IPython.display as ipd
import matplotlib.pyplot as plt
from glob import glob as glob_file
from tqdm import tqdm
from PIL import Image, ImageDraw
from shutil import copyfile
from IPython.core.display import Video, display
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, KFold, GroupKFold, StratifiedKFold

import timm
from torch import nn
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import Dataset,DataLoader
from torch.utils.data.sampler import SequentialSampler, RandomSampler
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.nn.modules.loss import _WeightedLoss
from torchvision import models, transforms
from fastai.vision.all import *
from huggingface_hub import snapshot_download
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2FeatureExtractor

from albumentations.pytorch import ToTensorV2
from albumentations import (
    Compose, OneOf, Normalize, Resize,
    Flip, HorizontalFlip, VerticalFlip, CenterCrop, RandomResizedCrop,
    Rotate, ShiftScaleRotate, RandomRotate90, Transpose,
    RGBShift, ChannelShuffle, HueSaturationValue, RandomBrightnessContrast,
    Blur, MotionBlur, MedianBlur, GaussNoise, Cutout, CoarseDropout
)

warnings.simplefilter('ignore')
pd.set_option("max_columns", 150)
pd.set_option('display.max_rows', 150)

In [3]:
CFG = {
    "save_prev"     : [False, ["*.pth"]],
    "seed"          : 42,
    'device'        : "cuda:0" if torch.cuda.is_available() else "cpu",
    "base_path"     : "../input/birdclef-2022/",
    "img_sc_path"   : "../input/birdclef-spectrograms-full-dataset-10sec/",
    "img_ot_path"   : "../input/birdclef-spectrograms-full-dataset-10sec/",
    "output_path"   : './',
    "n_sample"      : [200],
    "pretrain"      : "../input/birdclef-models/birdclef_swin_wav2vec.pth",
    "save_name"     : "birdclef_swin_inf_second",
    "class_num"     : 1,
    "model"         : "swin_large_patch4_window7_224",
    "size"          : 224,
    "fold"          : 4,
    "test_max"      : 250,
    "batch_size"    : 8,
    "epochs"        : 5,
    "lr"            : 1e-4,
    "early_stopping": 3,
    "num_workers"   : 4
}

CFG

{'save_prev': [False, ['*.pth']],
 'seed': 42,
 'device': 'cuda:0',
 'base_path': '../input/birdclef-2022/',
 'img_sc_path': '../input/birdclef-spectrograms-full-dataset-10sec/',
 'img_ot_path': '../input/birdclef-spectrograms-full-dataset-10sec/',
 'output_path': './',
 'n_sample': [200],
 'pretrain': '../input/birdclef-models/birdclef_swin_wav2vec.pth',
 'save_name': 'birdclef_swin_inf_second',
 'class_num': 1,
 'model': 'swin_large_patch4_window7_224',
 'size': 224,
 'fold': 4,
 'test_max': 250,
 'batch_size': 8,
 'epochs': 5,
 'lr': 0.0001,
 'early_stopping': 3,
 'num_workers': 4}

In [4]:
def get_img(path):
    im_bgr = cv2.imread(path)
    im_rgb = im_bgr[:, :, ::-1]
    return im_rgb

def seed_everything(seed = 42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
seed_everything(CFG["seed"])

In [5]:
if CFG['save_prev'][0]:
    for file_pattern in CFG['save_prev'][1]:
        for f in glob_file(f"../input/birdclef-trained-models-with-scored-bird/models/{file_pattern}"):
            filename = os.path.basename(f)
            print(filename)
            os.makedirs("./models", exist_ok=True)
            !cp {f} ./models/{filename.replace("n300_fold0","fold0")}
    print(os.listdir("./models"))

# Define functions

In [6]:
# Scored bird names
scored_birds = pd.read_json(f"{CFG['base_path']}scored_birds.json").values.flatten().tolist()
bird2label   = {}
for i, b in enumerate(scored_birds):
    bird2label[b] = i
    
# Define labels for other cases
label_other_birds = max(bird2label.values()) + 1
label_no_birds    = label_other_birds + 1

label_other_birds, label_no_birds

(21, 22)

In [7]:
def trans2label(bird):
    if bird in bird2label.keys():
        return bird2label[bird]
    return label_other_birds

# Load data

In [8]:
df_meta = pd.read_csv("../input/birdclef-2022/train_metadata.csv")
df_meta = df_meta[["primary_label","secondary_labels","rating","filename"]]
df_meta["filename"] = df_meta.filename.apply(lambda x: x[x.find("/")+1:])

print(df_meta.shape)
df_meta.head(2)

(14852, 4)


Unnamed: 0,primary_label,secondary_labels,rating,filename
0,afrsil1,[],2.5,XC125458.ogg
1,afrsil1,"['houspa', 'redava', 'zebdov']",3.5,XC175522.ogg


In [9]:
df_train = pd.DataFrame(glob.glob(CFG["img_sc_path"]+"*/*"), columns=["img_path"])
df_train["filename"]   = df_train.img_path.apply(lambda x: x[x.rfind("/")+1:x.rfind(".ogg")+4])
df_train = df_train.merge(df_meta, on="filename")
df_train["audio_path"] = CFG["base_path"] + "train_audio/" + df_train.primary_label + "/" + df_train.filename
df_train["p_label"]    = df_train.primary_label.apply(lambda x: trans2label(x))
df_train["s_labels"]   = df_train.secondary_labels.apply(lambda x: [trans2label(l) for l in eval(x)])
df_train["dupli_sec"]  = df_train[["p_label","s_labels"]].apply(lambda x: [l for l in x[1] if l != x[0]], axis=1)
df_train["label"]      = df_train.dupli_sec.apply(lambda x: 0 if len(x) == 0 else 1)
df_train["fold_key"]   = df_train.primary_label + df_train.label.astype(str)

In [10]:
df_train.sample(50)

Unnamed: 0,img_path,filename,primary_label,secondary_labels,rating,audio_path,p_label,s_labels,dupli_sec,label,fold_key
13745,../input/birdclef-spectrograms-full-dataset-10sec/norcar/XC303678.ogg.000.jpg,XC303678.ogg,norcar,[],3.0,../input/birdclef-2022/train_audio/norcar/XC303678.ogg,21,[],[],0,norcar0
56702,../input/birdclef-spectrograms-full-dataset-10sec/wiltur/XC586564.ogg.004.jpg,XC586564.ogg,wiltur,[],5.0,../input/birdclef-2022/train_audio/wiltur/XC586564.ogg,21,[],[],0,wiltur0
47576,../input/birdclef-spectrograms-full-dataset-10sec/houfin/XC163050.ogg.047.jpg,XC163050.ogg,houfin,[],3.0,../input/birdclef-2022/train_audio/houfin/XC163050.ogg,12,[],[],0,houfin0
58670,../input/birdclef-spectrograms-full-dataset-10sec/eurwig/XC431085.ogg.000.jpg,XC431085.ogg,eurwig,[],4.0,../input/birdclef-2022/train_audio/eurwig/XC431085.ogg,21,[],[],0,eurwig0
6280,../input/birdclef-spectrograms-full-dataset-10sec/masboo/XC431820.ogg.004.jpg,XC431820.ogg,masboo,[],5.0,../input/birdclef-2022/train_audio/masboo/XC431820.ogg,21,[],[],0,masboo0
15486,../input/birdclef-spectrograms-full-dataset-10sec/norcar/XC489169.ogg.005.jpg,XC489169.ogg,norcar,[],5.0,../input/birdclef-2022/train_audio/norcar/XC489169.ogg,21,[],[],0,norcar0
24988,../input/birdclef-spectrograms-full-dataset-10sec/mallar3/XC615311.ogg.002.jpg,XC615311.ogg,mallar3,[],5.0,../input/birdclef-2022/train_audio/mallar3/XC615311.ogg,21,[],[],0,mallar30
45718,../input/birdclef-spectrograms-full-dataset-10sec/wesmea/XC503278.ogg.022.jpg,XC503278.ogg,wesmea,"['redjun', 'zebdov']",4.5,../input/birdclef-2022/train_audio/wesmea/XC503278.ogg,21,"[21, 21]",[],0,wesmea0
3171,../input/birdclef-spectrograms-full-dataset-10sec/commyn/XC400123.ogg.003.jpg,XC400123.ogg,commyn,[],4.5,../input/birdclef-2022/train_audio/commyn/XC400123.ogg,21,[],[],0,commyn0
39372,../input/birdclef-spectrograms-full-dataset-10sec/gamqua/XC636281.ogg.002.jpg,XC636281.ogg,gamqua,[],3.5,../input/birdclef-2022/train_audio/gamqua/XC636281.ogg,21,[],[],0,gamqua0


In [11]:
df_train = pd.concat([df_train[df_train.p_label != 21],
                      df_train[df_train.p_label == 21].sample(2500)]).reset_index(drop=True)
df_train.p_label.value_counts()

18    2555
21    2500
12    1712
14     432
19     272
2      268
8      244
20     232
13     221
16     122
3       98
7       79
0       78
5       71
1       39
10      28
15      25
9       23
11      16
6        8
17       4
4        2
Name: p_label, dtype: int64

In [12]:
df_train["fold"] = 0
skf = StratifiedKFold(n_splits=CFG["fold"])
for i, (train_index, test_index) in enumerate(skf.split(df_train, df_train.fold_key)):
    df_train.loc[test_index, "fold"] = i
    
df_train.groupby(["label","fold"]).img_path.count()

label  fold
0      0       1878
       1       1876
       2       1875
       3       1877
1      0        380
       1        381
       2        382
       3        380
Name: img_path, dtype: int64

In [13]:
print(df_train.shape)
display(df_train.sample(5))
df_train.label.value_counts()

(9029, 12)


Unnamed: 0,img_path,filename,primary_label,secondary_labels,rating,audio_path,p_label,s_labels,dupli_sec,label,fold_key,fold
4282,../input/birdclef-spectrograms-full-dataset-10sec/skylar/XC380395.ogg.005.jpg,XC380395.ogg,skylar,"['mallar3', 'skylar']",4.5,../input/birdclef-2022/train_audio/skylar/XC380395.ogg,18,"[21, 18]",[21],1,skylar1,2
5583,../input/birdclef-spectrograms-full-dataset-10sec/skylar/XC572104.ogg.005.jpg,XC572104.ogg,skylar,[],4.0,../input/birdclef-2022/train_audio/skylar/XC572104.ogg,18,[],[],0,skylar0,2
5984,../input/birdclef-spectrograms-full-dataset-10sec/skylar/XC55105.ogg.005.jpg,XC55105.ogg,skylar,[],2.0,../input/birdclef-2022/train_audio/skylar/XC55105.ogg,18,[],[],0,skylar0,3
4472,../input/birdclef-spectrograms-full-dataset-10sec/skylar/XC146521.ogg.005.jpg,XC146521.ogg,skylar,[],3.0,../input/birdclef-2022/train_audio/skylar/XC146521.ogg,18,[],[],0,skylar0,0
6927,../input/birdclef-spectrograms-full-dataset-10sec/sander/XC558525.ogg.001.jpg,XC558525.ogg,sander,[],2.5,../input/birdclef-2022/train_audio/sander/XC558525.ogg,21,[],[],0,sander0,0


0    7506
1    1523
Name: label, dtype: int64

# Define model

In [14]:
# Save model files for inference
download_path = snapshot_download(repo_id="facebook/wav2vec2-base-960h")
shutil.move(download_path, "./wav2vec/")

Downloading:   0%|          | 0.00/736 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/158 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/159 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/378M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/378M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/163 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/291 [00:00<?, ?B/s]

'./wav2vec/'

Dataloader

In [15]:
class BirdCLEF_Dataset(Dataset):
    
    def __init__(self, df, transforms=None, output_label=True, is_train=True):
        super().__init__()
        self.df = df.reset_index(drop=True).copy()
        self.transforms   = transforms
        self.output_label = output_label
        self.is_train     = is_train
        self.processor    = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, index: int):
        length   = 10
        img_path = self.df.loc[index].img_path
        img_num  = int(img_path[-7:-4])  # Extract XXX from XXX.jpg
        img      = get_img(img_path).copy()
        waveform, sr = torchaudio.load(self.df.loc[index].audio_path)
        waveform = waveform[0]
        if len(waveform) < sr*length:
            rep      = round(float(sr*length) / len(waveform))
            waveform = waveform.repeat(int(rep)+1)
            waveform = waveform[:sr*length]
        else:
            sta = sr*length*img_num
            end = sr*length*(img_num+1)
            if len(waveform[sta:end]) == sr*length:
                waveform = waveform[sta:end]
            else:
                waveform = waveform[-sr*length:]
        waveform = waveform[::2]
        waveform = self.processor(waveform, sampling_rate=16000, return_tensors="pt").input_values
        pl = torch.from_numpy(np.array(self.df.loc[index, "p_label"]))
        y  = torch.from_numpy(np.array(self.df.loc[index, "label"], dtype=np.float16))
        if self.transforms:
            if self.is_train:
                # Horizontal flip
                p = np.random.rand(1)
                if p < 0.5:
                    img      = img[:, ::-1]
                    waveform = waveform.flip(0)
                # Shift to right/left (range is -0.3 ~ 0.3 without 0)
                p = np.random.rand(1)
                if p < 0.5:
                    s = random.sample(list(np.arange(-3,0,0.5))+list(np.arange(0.5,3.1,0.5)), k=1)[0]/10 
                    img      = np.roll(img, int(img.shape[1]*s), axis=1)
                    waveform = torch.from_numpy(np.roll(waveform, int(len(waveform)*s)))
            img = self.transforms(image=img)['image']
        if self.output_label:
            return img, waveform.reshape(-1), pl, y
        return img, waveform.reshape(-1), pl
    
def get_train_transforms():
    return Compose([
        Resize(CFG['size'], CFG['size'], p=1.0),
        OneOf([
            Cutout(max_h_size=5, max_w_size=16),
            CoarseDropout(max_holes=4),
            ], p=0.5),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], max_pixel_value=255.0, p=1.0),
        ToTensorV2(p=1.0)], p=1.0)

def get_valid_transforms():
    return Compose([
        Resize(CFG['size'], CFG['size'], p=1.0),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], max_pixel_value=255.0, p=1.0),
        ToTensorV2(p=1.0)], p=1.0)

def prepare_dataloader(df):
    df = df.copy()
    train      = df[df.is_valid==0].reset_index(drop=True).copy()
    valid      = df[df.is_valid==1].reset_index(drop=True).copy()
    train_ds   = BirdCLEF_Dataset(train, transforms=get_train_transforms())
    valid_ds   = BirdCLEF_Dataset(valid, transforms=get_valid_transforms(), is_train=False)
    dataloader = DataLoaders.from_dsets(
        train_ds,
        valid_ds,
        seed=CFG["seed"],
        bs=CFG['batch_size'],
        shuffle=True,
        num_workers=CFG['num_workers']
    )
    if -1 < CFG["device"].find("cuda"):
        dataloader = dataloader.cuda()
    return dataloader

Metrics and loss

In [16]:
def rmse(input, target):
    return torch.sqrt(F.mse_loss(F.sigmoid(input.flatten()), target))

def self_f1_score(input, target):
    input  = input.cpu().detach().numpy()
    target = target.cpu().detach().numpy()
    return f1_score(target, input>0.3, average="micro")

# https://www.kaggle.com/c/rfcx-species-audio-detection/discussion/213075
class BCEFocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, preds, targets):
        bce_loss = nn.BCEWithLogitsLoss(reduction='none')(preds, targets)
        probas = torch.sigmoid(preds)
        loss = targets * self.alpha * \
            (1. - probas)**self.gamma * bce_loss + \
            (1. - targets) * probas**self.gamma * bce_loss
        loss = loss.mean()
        return loss

Models

In [17]:
class ImageAndAudioModel(nn.Module):
    def __init__(self, model_name, pretrained=True):
        super().__init__()
        self.img_model   = timm.create_model(model_name, pretrained=pretrained, num_classes=0, in_chans=3)
        self.audio_model = Wav2Vec2ForCTC.from_pretrained("./wav2vec/") #.to(CFG["device"])
        num_features = self.img_model.num_features
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(499*32, 1024)
        self.linear2 = nn.Linear(num_features+1024, 127)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, w):
        x1 = self.img_model(x)
        x2 = self.audio_model(w).logits
        x2 = self.flatten(x2)
        x2 = self.linear1(x2)
        x  = torch.cat((x1, x2), 1)
        x  = self.dropout(x)
        out = self.linear2(x)
        return out

class ImageAndAudioModelForScoredBird(nn.Module):
    def __init__(self):
        super().__init__()
        self.pre_model = ImageAndAudioModel(CFG["model"], pretrained=False)
        self.pre_model.load_state_dict(torch.load(CFG['pretrain']))
        self.linear    = nn.Linear(127+1, CFG["class_num"])

    def forward(self, x, w, p):
        x = self.pre_model(x, w)
        p = p.view(-1, 1)
        x = torch.cat((x, p), 1)
        out = self.linear(x)
        return out
    
def get_learner(df):
    dataloader = prepare_dataloader(df)
    model   = ImageAndAudioModelForScoredBird()
    learner = Learner(
        dataloader,
        model,
        loss_func=BCEWithLogitsLossFlat(),
        metrics=rmse).to_fp16()
    return learner

# Train model

In [18]:
for i in range(CFG["fold"]):
    df_train_aug = df_train.copy()
    df_train_aug["is_valid"] = 0
    df_train_aug.loc[df_train_aug.fold==i, "is_valid"] = 1
    
    label_0_num = df_train_aug[df_train_aug.label==0].shape[0]
    label_1_num = df_train_aug[df_train_aug.label==1].shape[0]
    aug_num = round(label_0_num / label_1_num)
    df_label_1 = df_train_aug[df_train_aug.label==1].copy()
    df_aug = pd.DataFrame()
    for i in range(aug_num):
        df_aug = df_aug.append(df_label_1)
    df_train_aug = pd.concat([df_aug, df_train_aug[df_train_aug.label==0]]).reset_index(drop=True)
    print(df_train_aug.label.value_counts())
    print(df_train_aug[df_train_aug.is_valid==0].shape, df_train_aug[df_train_aug.is_valid==1].shape)
    
    learn = get_learner(df_train_aug)
    learn.fit_one_cycle(
        CFG["epochs"],
        CFG["lr"],
        cbs=[SaveModelCallback(monitor='valid_loss',
                               comp=np.less),
             EarlyStoppingCallback(monitor='valid_loss',
                                   comp=np.less,
                                   patience=CFG['early_stopping'])]
    )
    shutil.move("./models/model.pth", f"./models/{CFG['save_name']}_fold{i}.pth")
    
    del learn
    torch.cuda.empty_cache()
    gc.collect()
    
    break

1    7615
0    7506
Name: label, dtype: int64
(11343, 13) (3778, 13)


Downloading:   0%|          | 0.00/159 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/163 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at ./wav2vec/ and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


epoch,train_loss,valid_loss,rmse,time
0,0.34014,0.479793,0.367081,54:29
1,0.199641,0.69357,0.410724,54:29
2,0.084663,1.360805,0.457778,54:33
3,0.014272,2.238046,0.480798,53:34


Better model found at epoch 0 with valid_loss value: 0.4797931909561157.
No improvement since epoch 0: early stopping
