In [1]:
# Make the timm sources shipped in a local input directory importable.
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')

# Install noisereduce from a local wheel file instead of a package index.
!pip install ../input/noisereduce-2-0-0/noisereduce-2.0.0-py3-none-any.whl

Processing /kaggle/input/noisereduce-2-0-0/noisereduce-2.0.0-py3-none-any.whl
Installing collected packages: noisereduce
Successfully installed noisereduce-2.0.0


In [2]:
import os
import gc
import cv2
import copy
import time
import yaml
import random
import shutil
import warnings
import torchaudio
import subprocess
import numpy as np
import pandas as pd
import noisereduce as nr
import IPython.display as ipd
import matplotlib.pyplot as plt
from math import ceil
from tqdm import tqdm
from shutil import copyfile
from PIL import Image, ImageDraw
from glob import glob as glob_file
from joblib import Parallel, delayed
from IPython.core.display import Video, display
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, KFold, GroupKFold, StratifiedKFold

import timm
from torch import nn
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import Dataset,DataLoader
from torch.utils.data.sampler import SequentialSampler, RandomSampler
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.nn.modules.loss import _WeightedLoss
from torchvision import models, transforms
from fastai.vision.all import *
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2FeatureExtractor

from albumentations.pytorch import ToTensorV2
from albumentations import (
    Compose, OneOf, Normalize, Resize,
    Flip, HorizontalFlip, VerticalFlip, CenterCrop, RandomResizedCrop,
    Rotate, ShiftScaleRotate, RandomRotate90, Transpose,
    RGBShift, ChannelShuffle, HueSaturationValue, RandomBrightnessContrast,
    Blur, MotionBlur, MedianBlur, GaussNoise, Cutout, CoarseDropout
)

# Silence every warning category for the rest of the notebook.
warnings.simplefilter('ignore')
# Use fully-qualified option names: the bare "max_columns" pattern can match
# more than one pandas option, in which case set_option raises OptionError.
pd.set_option("display.max_columns", 150)
pd.set_option('display.max_rows', 150)

In [3]:
# Central configuration for this inference notebook.
CFG = {
    "seed"        : 42,  # passed to seed_everything()
    'device'      : "cuda:0" if torch.cuda.is_available() else "cpu",  # falls back to CPU when CUDA is unavailable
    "base_path"   : "../input/birdclef-2022/",  # root folder read for scored_birds.json, test.csv and test_soundscapes/
    "img_path"    : "./test_images",  # prefix of the spectrogram folders; "_short/" or "_long/" is appended when img_path columns are built
    "output_path" : './',  # not read by any code visible in this notebook
    "thres_long"  : -1,  # only referenced from commented-out code in the submission cell
    "thres_short" : 0.1,  # a bird is flagged when its blended probability is greater than this value
    # Each model entry is [model name, image size, number of classes, checkpoint path].
    "long_model"  : [
        "swin_large_patch4_window7_224",
        224,
        21,
        "../input/birdclef-trained-multi-label-long-models/models/birdclef_swin_long_fold0.pth"
    ],
    # Only used by the commented-out "primary label" inference cell.
    "classifier"  : [
        "swin_large_patch4_window7_224",
        224,
        22,
        "../input/birdclef-trained-multi-class-models-10sec/models/birdclef_swin_multiclass_n300_fold0.pth"
    ],
    # List of model entries; their predictions are averaged in the ensemble cell.
    "binary_model": [
        ["swin_large_patch4_window7_224",
         224,
         21,
         "../input/birdclef-trained-multi-label-models-10sec/models/birdclef_swin_10sec_fold0.pth"]
    ],
    "wav2vec"     : "../input/birdclef-trained-multi-class-models-10sec/wav2vec/",  # loaded with Wav2Vec2Processor / Wav2Vec2ForCTC.from_pretrained
    "valid_pct"   : 0.3,  # not read by any code visible in this notebook
    "batch_size"  : 12,  # batch size handed to DataLoaders.from_dsets
    "lr"          : 1e-4,  # not read by any code visible in this notebook
    "tta"         : 5,  # not read by any code visible in this notebook
    "num_workers" : 4  # not read by any code visible in this notebook
}

CFG

{'seed': 42,
 'device': 'cuda:0',
 'base_path': '../input/birdclef-2022/',
 'img_path': './test_images',
 'output_path': './',
 'thres_long': -1,
 'thres_short': 0.1,
 'long_model': ['swin_large_patch4_window7_224',
  224,
  21,
  '../input/birdclef-trained-multi-label-long-models/models/birdclef_swin_long_fold0.pth'],
 'classifier': ['swin_large_patch4_window7_224',
  224,
  22,
  '../input/birdclef-trained-multi-class-models-10sec/models/birdclef_swin_multiclass_n300_fold0.pth'],
 'binary_model': [['swin_large_patch4_window7_224',
   224,
   21,
   '../input/birdclef-trained-multi-label-models-10sec/models/birdclef_swin_10sec_fold0.pth']],
 'wav2vec': '../input/birdclef-trained-multi-class-models-10sec/wav2vec/',
 'valid_pct': 0.3,
 'batch_size': 12,
 'lr': 0.0001,
 'tta': 5,
 'num_workers': 4}

In [4]:
def get_img(path):
    """Read an image with OpenCV and return it with the channel axis reversed.

    OpenCV loads colour images in BGR order; reversing the last axis yields RGB.
    The result is a negatively-strided view of the loaded array, so callers
    that need a contiguous array should copy it.

    Args:
        path: Path of the image file to read.

    Returns:
        The image array with its last axis reversed.

    Raises:
        FileNotFoundError: If OpenCV could not read an image from ``path``
            (``cv2.imread`` signals this by returning ``None`` rather than raising).
    """
    im_bgr = cv2.imread(path)
    if im_bgr is None:
        # Fail with an actionable message instead of "'NoneType' is not subscriptable".
        raise FileNotFoundError(f"cv2.imread could not read an image from {path!r}")
    return im_bgr[:, :, ::-1]

def seed_everything(seed = 42):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and current CUDA device),
    switches cuDNN to deterministic mode with benchmarking disabled, and
    exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Exported as a string because environment variables are text.
    os.environ['PYTHONHASHSEED'] = str(seed)

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # cuDNN: forbid autotuned kernel selection and request deterministic algorithms.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
seed_everything(CFG["seed"])

In [5]:
# Names of the scored species, flattened from scored_birds.json into a plain list
scored_birds = pd.read_json(f"{CFG['base_path']}scored_birds.json").values.flatten().tolist()
# Species name -> integer label (its position in the list)
bird2label   = {name: idx for idx, name in enumerate(scored_birds)}

# Two extra label ids placed right after the scored species
label_other_birds = max(bird2label.values()) + 1
label_no_birds    = label_other_birds + 1

label_other_birds, label_no_birds

(21, 22)

# Make Spectrograms

In [6]:
# Load the test metadata and preview its first rows.
# NOTE(review): test_meta is not referenced again in the cells visible here.
test_meta = pd.read_csv(CFG["base_path"] + "test.csv")
test_meta.head(3)

Unnamed: 0,row_id,file_id,bird,end_time
0,soundscape_1000170626_akiapo_5,soundscape_1000170626,akiapo,5
1,soundscape_1000170626_akiapo_10,soundscape_1000170626,akiapo,10
2,soundscape_1000170626_akiapo_15,soundscape_1000170626,akiapo,15


In [7]:
#@torch.no_grad()
def create_spectrogram(
    fname,
    reduce_noise: bool = False,
    frame_length: int = 5,
    channel: int = 0,
    device = "cpu"
) -> list:
    """Split an audio file into fixed-length frames and return their log-spectrograms.

    Args:
        fname: Path of the audio file, loaded with ``torchaudio.load``.
        reduce_noise: When True, the waveform is first passed through
            ``noisereduce.reduce_noise``.
        frame_length: Frame duration in seconds.
        channel: Index of the audio channel to analyse.
        device: Device on which the spectrogram transform runs.

    Returns:
        A list with one NumPy array per frame (log of the spectrogram, with
        NaN/inf values replaced by ``np.nan_to_num``).
    """
    waveform, sample_rate = torchaudio.load(fname)
    transform = torchaudio.transforms.Spectrogram(n_fft=1800, win_length=512).to(device)
    if reduce_noise:
        waveform = torch.tensor(nr.reduce_noise(
            y=waveform,
            sr=sample_rate,
            win_length=transform.win_length,
            use_tqdm=False,
            n_jobs=2,
        ))
    # Number of samples in one frame
    nb = int(frame_length * sample_rate)
    spectrograms = []
    for i in range(ceil(waveform.size()[-1] / nb)):
        frame = waveform[channel][i * nb:(i + 1) * nb]
        if len(frame) < nb:
            if i == 0:
                # The whole recording is shorter than one frame: tile it and cut
                # to exactly nb samples. (Rounding the repeat count could leave
                # the frame shorter than nb, e.g. a 7 s clip with a 10 s frame.)
                frame = frame.repeat(ceil(nb / len(frame)))[:nb]
            else:
                # Trailing partial frame: use the last nb samples of the
                # recording instead, overlapping the previous frame.
                frame = waveform[channel][-nb:]
        # log(0) yields -inf; nan_to_num turns NaN/inf into finite numbers.
        sg = torch.log(transform(frame.to(device))).cpu()
        spectrograms.append(np.nan_to_num(sg.numpy()))
    return spectrograms

In [8]:
img_extension = ".jpg"

def convert_and_export(
    fn,
    path_in,
    path_out,
    reduce_noise = False,
    frame_length: int = 5,
    device = "cpu"
) -> list:
    """Render one audio file as spectrogram images and describe each image.

    Args:
        fn: File name of the recording, relative to ``path_in``.
        path_in: Folder that contains the recording.
        path_out: Folder that receives the images (created when missing).
        reduce_noise: Forwarded to ``create_spectrogram``.
        frame_length: Frame duration in seconds, forwarded to ``create_spectrogram``.
        device: Forwarded to ``create_spectrogram``.

    Returns:
        One dict per saved image with keys ``img_name``, ``end_time``
        (frame index + 1, times ``frame_length``) and ``file_id``
        (``fn`` without its extension).
    """
    spectrograms = create_spectrogram(
        os.path.join(path_in, fn),
        reduce_noise=reduce_noise,
        frame_length=frame_length,
        device=device
    )
    file_id = os.path.splitext(fn)[0]

    records = []
    for idx, spectrogram in enumerate(spectrograms):
        # Image name: "<fn>.<3-digit frame index><extension>"
        target = os.path.join(path_out, f"{fn}.{idx:03}{img_extension}")
        os.makedirs(os.path.dirname(target), exist_ok=True)
        # Fixed colour range so every image uses the same value-to-colour mapping
        plt.imsave(target, spectrogram, vmin=-70, vmax=20)
        records.append({
            "img_name": os.path.basename(target),
            "end_time": (idx + 1) * frame_length,
            "file_id": file_id,
        })
    return records

In [9]:
# Render every test soundscape as 10-second spectrogram images.
_convert_and_export = partial(
    convert_and_export,
    path_in=CFG["base_path"] + "test_soundscapes",
    # Derived from CFG["img_path"] so the folder written here is always the
    # folder the img_path column below points at.
    path_out=CFG["img_path"] + "_short",
    reduce_noise=True,
    frame_length=10,
    device=CFG["device"]
)

soundscapes = glob.glob(os.path.join(CFG["base_path"], "test_soundscapes", "*.ogg"))
soundscapes = list(map(os.path.basename, soundscapes))
converted = []
# Files are converted in parallel; each job returns the records of one file.
for batch in Parallel(n_jobs=3)(delayed(_convert_and_export)(fn) for fn in tqdm(soundscapes)):
    converted.extend(batch)

df_test_short = pd.DataFrame(converted)
df_test_short["img_path"]   = CFG["img_path"] + "_short/" + df_test_short.img_name
df_test_short["audio_path"] = CFG["base_path"] + "test_soundscapes/" + df_test_short.file_id + ".ogg"
df_test_short.head()

100%|██████████| 1/1 [00:00<00:00, 38.17it/s]
100%|██████████| 4/4 [00:01<00:00,  2.33it/s]

Unnamed: 0,img_name,end_time,file_id,img_path,audio_path
0,soundscape_453028782.ogg.000.jpg,10,soundscape_453028782,./test_images_short/soundscape_453028782.ogg.000.jpg,../input/birdclef-2022/test_soundscapes/soundscape_453028782.ogg
1,soundscape_453028782.ogg.001.jpg,20,soundscape_453028782,./test_images_short/soundscape_453028782.ogg.001.jpg,../input/birdclef-2022/test_soundscapes/soundscape_453028782.ogg
2,soundscape_453028782.ogg.002.jpg,30,soundscape_453028782,./test_images_short/soundscape_453028782.ogg.002.jpg,../input/birdclef-2022/test_soundscapes/soundscape_453028782.ogg
3,soundscape_453028782.ogg.003.jpg,40,soundscape_453028782,./test_images_short/soundscape_453028782.ogg.003.jpg,../input/birdclef-2022/test_soundscapes/soundscape_453028782.ogg
4,soundscape_453028782.ogg.004.jpg,50,soundscape_453028782,./test_images_short/soundscape_453028782.ogg.004.jpg,../input/birdclef-2022/test_soundscapes/soundscape_453028782.ogg


# Render every test soundscape as 60-second spectrogram images.
_convert_and_export = partial(
    convert_and_export,
    path_in=CFG["base_path"] + "test_soundscapes",
    # Derived from CFG["img_path"] so the folder written here is always the
    # folder the img_path column below points at.
    path_out=CFG["img_path"] + "_long",
    reduce_noise=True,
    frame_length=60,
    device=CFG["device"]
)

soundscapes = glob.glob(os.path.join(CFG["base_path"], "test_soundscapes", "*.ogg"))
soundscapes = list(map(os.path.basename, soundscapes))
converted = []
# Files are converted in parallel; each job returns the records of one file.
for batch in Parallel(n_jobs=3)(delayed(_convert_and_export)(fn) for fn in tqdm(soundscapes)):
    converted.extend(batch)

df_test_long = pd.DataFrame(converted)
df_test_long["img_path"]   = CFG["img_path"] + "_long/" + df_test_long.img_name
df_test_long["audio_path"] = CFG["base_path"] + "test_soundscapes/" + df_test_long.file_id + ".ogg"
df_test_long.head()

# Define model

In [10]:
class BirdCLEF_Dataset(ImageDataLoaders):
    """Dataset that pairs a spectrogram image with the matching audio window.

    Each item is built from one row of ``df`` (columns ``img_path``,
    ``audio_path``, ``end_time`` and, when ``get_primary`` is set, ``label``).

    Args:
        df: Frame describing the items; its index is reset and the frame copied.
        transforms: Optional callable invoked as ``transforms(image=img)['image']``.
        length: Length of the audio window in seconds.
        class_num: Size of the dummy target vector used when ``is_binary`` is True.
        output_label: When True (and ``get_primary`` is False) a dummy target is returned.
        get_primary: When True the row's ``label`` is returned as an extra element.
        is_binary: Selects which dummy target is produced.
    """

    def __init__(
        self,
        df,
        transforms=None,
        length=10,
        class_num=22,
        output_label=True,
        get_primary=False,
        is_binary=False
    ):
        super().__init__(device="cuda")
        self.df = df.reset_index(drop=True).copy()
        self.processor    = Wav2Vec2Processor.from_pretrained(CFG["wav2vec"])
        self.transforms   = transforms
        self.length       = length
        self.class_num    = class_num
        self.output_label = output_label
        self.get_primary  = get_primary
        self.is_binary    = is_binary

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index: int):
        row = self.df.loc[index]
        # get_img returns a reversed view; copy it into a regular array
        img = get_img(row.img_path).copy()
        end_time = row.end_time

        waveform, sr = torchaudio.load(row.audio_path)
        waveform = waveform[0]  # first channel only
        window_size = sr * self.length
        if len(waveform) < window_size:
            # Recording shorter than one window: tile it, then cut to size
            rep = round(float(window_size) / len(waveform))
            waveform = waveform.repeat(int(rep) + 1)[:window_size]
        else:
            sta = sr * (end_time - self.length)
            end = sr * end_time
            window = waveform[sta:end]
            # Fall back to the last full window when the slice comes out short
            waveform = window if len(window) == window_size else waveform[-window_size:]

        # Keep every 2nd / 100th sample depending on the window length.
        # NOTE(review): presumably this targets the 16 kHz rate declared to the
        # processor below for the 10 s case - confirm against training code.
        if self.length == 10:
            waveform = waveform[::2]
        elif self.length == 60:
            waveform = waveform[::100]
        audio = self.processor(waveform, sampling_rate=16000, return_tensors="pt").input_values
        audio = audio.reshape(-1)

        if self.transforms:
            img = self.transforms(image=img)['image']

        # Dummy targets: a zero vector for multi-label models, a single 1 otherwise
        if self.is_binary:
            y = torch.from_numpy(np.zeros(self.class_num, dtype=np.float16))
        else:
            y = torch.from_numpy(np.array([1]))

        if self.get_primary:
            primary_label = torch.from_numpy(np.array(row.label))
            return img, audio, primary_label, y
        if self.output_label:
            return img, audio, y
        return img, audio
        
def get_inference_transforms(size):
    """Build the image pipeline used at inference time.

    The pipeline resizes to ``size`` x ``size``, normalises with the given
    per-channel mean/std, and converts the image to a tensor.

    Args:
        size: Target height and width in pixels.

    Returns:
        The composed albumentations pipeline.
    """
    steps = [
        Resize(size, size, p=1.0),
        Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            p=1.0,
        ),
        ToTensorV2(p=1.0),
    ]
    return Compose(steps, p=1.0)

def prepare_dataloader(df, size, length, class_num, get_primary=False, is_binary=False):
    """Wrap ``df`` in a fastai ``DataLoaders`` for inference.

    Two identically configured datasets are built from the same frame and
    passed to ``DataLoaders.from_dsets`` as its first and second dataset.

    Args:
        df: Frame describing the items (copied before use).
        size: Image side length passed to ``get_inference_transforms``.
        length: Audio window length in seconds.
        class_num: Number of classes passed to the dataset.
        get_primary: Forwarded to the dataset.
        is_binary: Forwarded to the dataset.

    Returns:
        The ``DataLoaders`` object, moved to CUDA when ``CFG["device"]``
        contains ``"cuda"``.
    """
    frame = df.copy()

    def _build_dataset():
        return BirdCLEF_Dataset(
            frame,
            transforms=get_inference_transforms(size),
            length=length,
            class_num=class_num,
            output_label=True,
            get_primary=get_primary,
            is_binary=is_binary
        )

    dummy_ds = _build_dataset()
    test_ds = _build_dataset()
    loaders = DataLoaders.from_dsets(dummy_ds, test_ds, bs=CFG["batch_size"])
    return loaders.cuda() if "cuda" in CFG["device"] else loaders

In [11]:
def rmse(input, target):
    """Root-mean-squared error, scaled by 100, between sigmoid(input) and target.

    Args:
        input: Raw model outputs; flattened before the sigmoid is applied.
        target: Reference values compared against the probabilities.

    Returns:
        A scalar tensor equal to ``100 * sqrt(mse)``.
    """
    probabilities = torch.sigmoid(input.flatten())
    mse = F.mse_loss(probabilities, target)
    return 100 * torch.sqrt(mse)

def self_f1_score(input, target):
    """Micro-averaged F1 score of ``input`` thresholded at 0.3.

    Args:
        input: Tensor of scores; values greater than 0.3 count as positive.
        target: Tensor of reference labels.

    Returns:
        The value returned by ``f1_score(..., average="micro")``.
    """
    y_true = target.cpu().detach().numpy()
    y_pred = input.cpu().detach().numpy() > 0.3
    return f1_score(y_true, y_pred, average="micro")

class BCEFocalLoss(nn.Module):
    """Focal variant of binary cross-entropy computed on logits.

    The element-wise BCE is weighted by ``alpha * (1 - p) ** gamma`` where the
    target is 1 and by ``p ** gamma`` where it is 0 (``p`` being the sigmoid
    of the logits); the mean over all elements is returned.

    Args:
        alpha: Weight applied to the positive-target term.
        gamma: Exponent of the modulating factor.
    """

    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, preds, targets):
        criterion = nn.BCEWithLogitsLoss(reduction='none')
        bce_loss = criterion(preds, targets)
        probas = torch.sigmoid(preds)
        # Positive targets: down-weight examples that are already well predicted
        positive_term = targets * self.alpha * (1. - probas)**self.gamma * bce_loss
        # Negative targets: same idea, without the alpha factor
        negative_term = (1. - targets) * probas**self.gamma * bce_loss
        return (positive_term + negative_term).mean()

In [12]:
class ImageAndAudioModel(nn.Module):
    """Two-branch network that fuses image features with audio features.

    The image branch is a timm model created with ``num_classes=0``; the audio
    branch is a ``Wav2Vec2ForCTC`` model whose flattened logits are projected
    to 1024 features. Both feature vectors are concatenated, passed through
    dropout and mapped to ``out`` values.

    Args:
        model_name: Name handed to ``timm.create_model``.
        pretrained: Whether timm should load pretrained weights.
        hidden: Size of the flattened audio logits fed to the first linear layer.
        out: Number of output values.
    """

    def __init__(self, model_name, pretrained=False, hidden=499*32, out=127):
        super().__init__()
        self.img_model   = timm.create_model(model_name, pretrained=pretrained, num_classes=0, in_chans=3)
        self.audio_model = Wav2Vec2ForCTC.from_pretrained(CFG["wav2vec"])
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(hidden, 1024)
        # Input width = image feature width + the 1024 projected audio features
        self.linear2 = nn.Linear(self.img_model.num_features + 1024, out)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, w):
        image_features = self.img_model(x)
        audio_logits = self.audio_model(w).logits
        audio_features = self.linear1(self.flatten(audio_logits))
        fused = torch.cat((image_features, audio_features), 1)
        return self.linear2(self.dropout(fused))

class ImageAndAudioModelForScoredBird(nn.Module):
    """``ImageAndAudioModel`` followed by a linear head with ``class_num`` outputs.

    Args:
        model_name: Name of the image model, passed to ``ImageAndAudioModel``.
        out: Width of the ``ImageAndAudioModel`` output, which is also the
            input width of the head.
        class_num: Number of outputs of the head.
    """

    def __init__(self, model_name, out=127, class_num=22):
        super().__init__()
        # Forward `out` so the inner model's output width always matches the
        # head's input width (previously only the default of 127 worked).
        self.pre_model = ImageAndAudioModel(model_name, pretrained=False, out=out)
        self.linear    = nn.Linear(out, class_num)

    def forward(self, x, w):
        x = self.pre_model(x, w)
        out = self.linear(x)
        return out

class ImageAndAudioModelForScoredBird_Secondaly(nn.Module):
    """``ImageAndAudioModel`` whose output is joined with one extra value per item.

    The extra input ``p`` is reshaped to one column and concatenated to the
    inner model's output before the linear head.

    Args:
        model_name: Name of the image model, passed to ``ImageAndAudioModel``.
        out: Width of the ``ImageAndAudioModel`` output; the head takes
            ``out + 1`` inputs.
        class_num: Number of outputs of the head.
    """

    def __init__(self, model_name, out=127, class_num=22):
        super().__init__()
        # Forward `out` so the inner model's output width always matches what
        # the head expects (previously only the default of 127 worked).
        self.pre_model = ImageAndAudioModel(model_name, pretrained=False, out=out)
        self.linear    = nn.Linear(out+1, class_num)

    def forward(self, x, w, p):
        x = self.pre_model(x, w)
        # One scalar per item, appended as an additional feature column
        p = p.view(-1, 1)
        x = torch.cat((x, p), 1)
        out = self.linear(x)
        return out
        
def get_learner(
    df,
    model_name,
    size,
    path,
    length,
    class_num,
    get_primary=False,
    is_binary=False,
    is_long=False
):
    """Build a fastai ``Learner`` around a model restored from a checkpoint.

    Args:
        df: Frame describing the items to predict on.
        model_name: Name of the image model.
        size: Image side length used by the inference transforms.
        path: Path of the checkpoint holding the model's state dict.
        length: Audio window length in seconds.
        class_num: Number of model outputs.
        get_primary: Selects the variant that takes an extra label input
            (ignored when ``is_long`` is True).
        is_binary: Forwarded to the dataset (selects the dummy target).
        is_long: Selects the model variant built with ``hidden=59*32``.

    Returns:
        A ``(learner, dataloader)`` tuple; the learner is in fp16 mode.
    """
    dataloader = prepare_dataloader(
        df,
        size,
        length,
        class_num,
        get_primary=get_primary,
        is_binary=is_binary
    )
    if is_long:
        model  = ImageAndAudioModel(model_name, hidden=59*32, out=class_num)
        loss   = BCEFocalLoss()
        metric = AccumMetric(self_f1_score)
    elif get_primary:
        model  = ImageAndAudioModelForScoredBird_Secondaly(model_name, class_num=class_num)
        loss   = BCEWithLogitsLossFlat()
        metric = rmse
    else:
        model  = ImageAndAudioModel(model_name, out=class_num)
        loss   = BCEFocalLoss()
        metric = AccumMetric(self_f1_score)

    # Deserialize onto the CPU so a checkpoint saved from a GPU also loads on
    # a machine without CUDA; load_state_dict copies the values into the model.
    state_dict = torch.load(path, map_location="cpu")
    model.load_state_dict(state_dict)
    learner = Learner(
        dataloader,
        model,
        loss_func=loss,
        metrics=metric).to_fp16()
    return learner, dataloader

# Inference
Inference to identify a primary label

In [13]:
#model_name = CFG["classifier"][0]
#size       = CFG["classifier"][1]
#class_num  = CFG["classifier"][2]
#path       = CFG["classifier"][3]
#learn, data_loader = get_learner(df_test_short, model_name, size, path, 10, class_num, get_primary=False)
#res, _ = learn.get_preds()
#res    = res.detach().numpy()

#del learn, data_loader
#gc.collect()

In [14]:
# Add predicted primary label
#df_test_short["label"] = np.argmax(res,1)
#df_test_short.head()

Inference to identify secondary labels with long audio

# Unpack the long-window model entry: [model name, image size, class count, checkpoint path]
model_name, size, class_num, path = CFG["long_model"]
learn, data_loader = get_learner(df_test_long, model_name, size, path, 60, class_num, is_long=True, is_binary=True)
res, _ = learn.get_preds()
# Logits -> probabilities, as a NumPy array
res = torch.sigmoid(res).detach().numpy()

# Release the learner before the next model is loaded
del learn, data_loader
gc.collect()

# Attach one probability column per scored bird to the 60-second frames
df_long_labeled = pd.DataFrame(res, columns=scored_birds)
df_test_long = df_test_long.join(df_long_labeled)
df_test_long.head()

Inference to identify secondary labels

In [15]:
# Collect one probability array per configured short-window model
all_res = []
for model_name, size, class_num, path in CFG["binary_model"]:
    print(path)
    learn, data_loader = get_learner(
        df_test_short, model_name, size, path, 10, class_num,
        get_primary=False, is_binary=True,
    )
    preds, _ = learn.get_preds()
    res = torch.sigmoid(preds).detach().numpy()
    all_res.append(res)

    # Release the learner before the next model is loaded
    del learn, data_loader
    gc.collect()

../input/birdclef-trained-multi-label-models-10sec/models/birdclef_swin_10sec_fold0.pth


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at ../input/birdclef-trained-multi-class-models-10sec/wav2vec/ and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Ensemble: unweighted average of the per-model probability arrays
res = np.asarray(all_res).mean(axis=0)

# Post processing

In [17]:
def post_processing(df):
    """Fill isolated one-step gaps in each (file, bird) detection sequence.

    For every file and bird the rows are ordered by ``end_time`` and each
    ``True, False, True`` run in ``target`` becomes ``True, True, True``.

    Args:
        df: Frame with columns ``file_name``, ``bird_name``, ``end_time``
            (convertible to int) and boolean ``target``. It is not modified.

    Returns:
        A new frame with the same columns, grouped by file then bird (both in
        order of first appearance), sorted by ``end_time`` inside each group,
        with ``end_time`` converted to strings and a fresh index.
    """
    df = df.copy()
    df.end_time = df.end_time.astype(int)

    bird_names = df.bird_name.unique()
    pieces = []
    for f in df.file_name.unique():
        df_f = df[df.file_name == f]
        for b in bird_names:
            df_post = df_f[df_f.bird_name == b].sort_values("end_time").reset_index(drop=True)
            if df_post.empty:
                # This bird has no rows for this file; nothing to smooth.
                continue
            df_post["target"] = _fill_single_gaps(df_post["target"].tolist())
            pieces.append(df_post)

    # Concatenate once: DataFrame.append was removed in pandas 2.0 and
    # re-copied the accumulated frame on every iteration.
    submission_post = pd.concat(pieces) if pieces else df.iloc[:0].copy()
    submission_post["end_time"] = submission_post["end_time"].astype(str)
    return submission_post.reset_index(drop=True)


def _fill_single_gaps(flags):
    """Return a copy of ``flags`` with each ``True, False, True`` run set to all True.

    Runs are consumed left to right without overlap, i.e. after a match the
    scan resumes behind its third element. This is the same result the
    previous string-replacement implementation produced.
    """
    filled = [bool(flag) for flag in flags]
    i = 0
    while i + 2 < len(filled):
        if filled[i] and not filled[i + 1] and filled[i + 2]:
            filled[i + 1] = True
            i += 3
        else:
            i += 1
    return filled

In [18]:
# Integer label -> species name (the position of each name in scored_birds.json)
scored_birds = pd.read_json(f"{CFG['base_path']}scored_birds.json").values.flatten().tolist()
label2bird = dict(enumerate(scored_birds))

In [19]:
# Each 10-second prediction is written as two 5-second submission rows
# (ending at end_time - 5 and at end_time) that share the same per-bird flags.
bird_names = list(label2bird.values())
submission_parts = []
for fi, et, r in zip(df_test_short.file_id, df_test_short.end_time, res):
    # Labels whose blended probability is greater than the short-window threshold
    bird_candidates = set(np.where(r > CFG['thres_short'])[0].tolist())
    bird_label = [label in bird_candidates for label in label2bird]
    for end_time in (et - 5, et):
        submission_parts.append(pd.DataFrame({"file_name": fi,
                                              "end_time" : end_time,
                                              "bird_name": bird_names,
                                              "target"   : bird_label}))

# Concatenate once: DataFrame.append was removed in pandas 2.0 and re-copied
# the accumulated frame on every iteration.
submission = pd.concat(submission_parts).reset_index(drop=True)
# NOTE: filtering by the long-window model (CFG['thres_long']) and the
# post_processing() step were disabled in this notebook and remain disabled.
submission["row_id"] = submission.file_name + "_" + submission.bird_name + "_" + submission.end_time.astype(str)
submission = submission[["row_id", "target"]]

print(submission.shape)
submission.head()

(252, 2)


Unnamed: 0,row_id,target
0,soundscape_453028782_akiapo_5,False
1,soundscape_453028782_aniani_5,False
2,soundscape_453028782_apapan_5,True
3,soundscape_453028782_barpet_5,False
4,soundscape_453028782_crehon_5,False


In [20]:
# Delete everything matching ./test_images* (the generated spectrogram folders) from the working directory.
!rm -rf ./test_images*

In [21]:
# Write the submission frame to submission.csv without the index column.
submission.to_csv("submission.csv", index=False)