# First

This is an inference notebook.<br>
The corresponding training and dataset-creation notebooks exist separately.

#### Import 

In [None]:
# Install the extra inference dependencies from local wheel files.
# Every install uses --no-index, so pip only reads the wheels stored under
# /kaggle/input/birdclef2024-openvino-onnxruntime and never contacts a package index.
# t1/t2 measure how long the installation takes.
from time import time
t1 = time()
# onnxsim-0.4.36 (disabled)
# !pip install --no-index /kaggle/input/birdclef2024-openvino-onnxruntime/onnxsim-0.4.36/onnxsim-0.4.36-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

# onnxruntime-1.17.3 (disabled)
# !pip install --no-index /kaggle/input/birdclef2024-openvino-onnxruntime/onnxruntime-1.17.3/humanfriendly-10.0-py2.py3-none-any.whl
# !pip install --no-index /kaggle/input/birdclef2024-openvino-onnxruntime/onnxruntime-1.17.3/coloredlogs-15.0.1-py2.py3-none-any.whl
# !pip install --no-index /kaggle/input/birdclef2024-openvino-onnxruntime/onnxruntime-1.17.3/onnxruntime-1.17.3-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

# onnxconverter-common-1.14.0 (plus protobuf and fastjsonschema wheels)
!pip install --no-index /kaggle/input/birdclef2024-openvino-onnxruntime/onnxconverter-common-1.14.0/protobuf-3.20.2-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
!pip install --no-index /kaggle/input/birdclef2024-openvino-onnxruntime/onnxconverter-common-1.14.0/onnxconverter_common-1.14.0-py2.py3-none-any.whl
!pip install --no-index /kaggle/input/birdclef2024-openvino-onnxruntime/fastjsonschema-2.17.1/fastjsonschema-2.17.1-py3-none-any.whl

# openvino-dev-2024.0.0 (plus onnx, networkx, openvino-telemetry and openvino wheels)
!pip install --no-index /kaggle/input/birdclef2024-openvino-onnxruntime/onnx-1.15.0/onnx-1.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install --no-index /kaggle/input/birdclef2024-openvino-onnxruntime/openvino-dev-2024.0.0/networkx-3.1-py3-none-any.whl
!pip install --no-index /kaggle/input/birdclef2024-openvino-onnxruntime/openvino-dev-2024.0.0/openvino_telemetry-2024.1.0-py3-none-any.whl
!pip install --no-index /kaggle/input/birdclef2024-openvino-onnxruntime/openvino-dev-2024.0.0/openvino-2024.0.0-14509-cp310-cp310-manylinux2014_x86_64.whl
!pip install --no-index /kaggle/input/birdclef2024-openvino-onnxruntime/openvino-dev-2024.0.0/openvino_dev-2024.0.0-14509-py3-none-any.whl

t2 = time()
# NOTE: the label says "Import time", but the measured interval is the wheel installation above.
print('Import time: ', f"{(t2-t1)/60}m")

In [None]:
# Basic
import sys
import os
import gc
import copy
import yaml
import random
import shutil
from time import time
import gzip
import bz2

# Python
import numpy as np
import pandas as pd
import pandas.api.types
import pickle
import pywt
import librosa
import librosa.display
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import StratifiedGroupKFold
import sklearn.metrics
import matplotlib.pyplot as plt 
import plotly.express as px
import typing as tp
import cv2
from scipy.special import softmax
from glob import glob

# Notebook
from IPython.display import Audio
from tqdm.notebook import tqdm

# PyTorch
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch import optim
from torch.optim import lr_scheduler
from torch.cuda import amp
import timm
import albumentations as A
from albumentations.pytorch import ToTensorV2

# For faster inference
import onnx
# import onnxruntime as rt
from onnxconverter_common import float16
import openvino
import openvino as ov




# Expose only GPU 0 to CUDA-aware libraries
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Show every column when displaying a DataFrame (no column truncation)
pd.set_option('display.max_columns', None)

#### Config

In [None]:
# Directory that holds the test soundscape audio files.
# NOTE(review): this name is reassigned from CFG.TEST_SOUNDSCAPE in the preprocessing cell,
# so the value set here is only a provisional default.
TEST_SOUNDSCAPE = Path('/kaggle/input/birdclef-2024/test_soundscapes')

In [None]:
class Config:
    """Central configuration for the notebook.

    Most settings are class attributes that are evaluated once, when the class body runs.
    The soundscape directory is chosen per instance in ``__init__``.
    """

    def __init__(self):
        """Pick the audio directory used for inference.

        Uses the competition test-soundscape directory. When that directory contains at
        most one entry, falls back to one training-audio species folder and sets
        ``NO_SOUNDSCAPE`` so that downstream code can adapt (offsets, file naming).
        """
        self.TEST_SOUNDSCAPE = Path('/kaggle/input/birdclef-2024/test_soundscapes')
        self.NO_SOUNDSCAPE = False
        if 1 >= len(os.listdir(self.TEST_SOUNDSCAPE)):
            self.TEST_SOUNDSCAPE = Path('/kaggle/input/birdclef-2024/train_audio/asbfly')
            self.NO_SOUNDSCAPE = True

    model_name = "efficientnet_b0.ra_in1k"  # timm model name (224 input)
#     model_name = "efficientvit_b0.r224_in1k"  # model 224
#     model_name = "efficientnetv2_rw_s.ra2_in1k"  # model 288
    img_size = 224            # side length (pixels) of the square model input
    n_folds = 5               # number of fold models that are converted and averaged
    interpolation = cv2.INTER_AREA  # interpolation flag passed to A.Resize in get_test_transforms
    max_epoch = 9                   # maximum number of training epochs (one epoch = one pass over the training set)
#     batch_size = 32               # train batch size. Number of samples passed to the network in one training step
    batch_size = 1                  # test batch size; the OpenVINO model is converted with the fixed INPUT_SHAPE below (batch 1)
    lr = 1.0e-03                    # learning rate: step size used when updating the model weights
    weight_decay = 1.0e-02          # weight decay: regularization term to limit overfitting
    es_patience = 5                 # early-stopping patience
    seed = 1086                     # random seed
    deterministic = True            # whether to enable deterministic behaviour
    enable_amp = False              # whether to enable Automatic Mixed Precision
#     device = "cuda"               # Device to use training. "cuda" is NVIDIA GPU
    device = "cpu"                  # device name handed to torch.device(...)
    
    # test related
    simple_training = True      # train on a small subset only (training-side switch)
    simple_inferring = False    # infer on at most n_simple files only
    n_simple = 100              # number of samples used by the "simple" modes
    test = True                 # True when running inference; default of preprocessing(test=...)
    show = False                # show a few samples at the end of the notebook
    
    # Data Loading related
    MELSPEC_H = 128     # number of mel bands (passed as n_mels), i.e. the spectrogram height
    TOP_DB = 100        # maximum decibel to clip audio to (not referenced in the visible code)
    MIN_RATING = 0.0    # minimum rating (not referenced in the visible code)
    SR = 32000          # sample rate used when loading audio
    N_FFT = 2000        # FFT window length in samples
    HOP_LENGTH = 500    # hop between successive frames in samples
    fmin = 20           # lowest mel-filter frequency
    fmax = 16000        # highest mel-filter frequency
    
    # faster inference related
    # Fixed input shape used for the ONNX export and the OpenVINO conversion.
    INPUT_SHAPE: list[int] = [1, 1, img_size, img_size]
    # Random dummy inputs of that shape (torch tensor, float32 array, float16 array).
    DUMMY_INPUT_TENSOR: torch.Tensor = torch.randn(*INPUT_SHAPE)
    DUMMY_INPUT_NUMPY_FP32: np.ndarray = DUMMY_INPUT_TENSOR.numpy()
    DUMMY_INPUT_NUMPY_FP16: np.ndarray = DUMMY_INPUT_NUMPY_FP32.astype(np.float16)
    # Directories the converted models are written to.
    OUTPUT_DIR_ONNX: Path = Path('./model/onnx')
    OUTPUT_DIR_OV: Path = Path('./model/ov')




# Single shared configuration instance used throughout the notebook.
CFG = Config()

In [None]:
# Create the output directories for the exported ONNX and OpenVINO models
# (parents are created as needed; existing directories are left untouched).
CFG.OUTPUT_DIR_ONNX.mkdir(parents=True, exist_ok=True)
CFG.OUTPUT_DIR_OV.mkdir(parents=True, exist_ok=True)

# Type aliases used in annotations further down
FilePath = tp.Union[str, Path]
Label = tp.Union[int, float, np.ndarray]

#### Preprocessing

In [None]:
# Audio source directories
KAGGLE_TRAIN = '/kaggle/input/birdclef-2024/train_audio'
ADDED_TRAIN = '/kaggle/input/birdclef2024-additional-mp3/additional_audio'
ADDED_TRAIN_1 = '/kaggle/input/birdclef2024-additional-wav-1/additional_audio-1'
ADDED_TRAIN_2 = '/kaggle/input/birdclef2024-additional-wav-2/additional_audio-2'
# Soundscape directory chosen by Config (may be the train-audio fallback)
TEST_SOUNDSCAPE = CFG.TEST_SOUNDSCAPE
# Creates KAGGLE_TRAIN if it is missing; a no-op when the directory already exists
os.makedirs(KAGGLE_TRAIN, exist_ok=True)

# Directories where the images obtained from audio are saved
SAVE_TRAIN = '/kaggle/working/train_image'
SAVE_TEST = '/kaggle/working/test_image'

# Input image directories (mel-spectrogram arrays)
TRAIN_IMAGE = Path('/kaggle/input/bird2024-melspec-v6/train_image/melspec')
TEST_IMAGE = Path('/kaggle/working/test_image/melspec')

# Directory holding the trained model weights (best_model_fold{N}.pth)
TRAINED_MODEL = Path('/kaggle/input/birdcref-2024-introduction-withtraining-train-v2')

class preprocessing():
    """Turn audio files into normalized feature arrays stored as ``.npy`` files.

    Each ``func_*`` method computes one feature (waveform, spectrogram, mel
    spectrogram, ...) from the clip currently held in ``self.y``. ``apply_func``
    loads every clip, applies one such method, rescales the result to ``uint8``
    and saves it below ``SAVE_DIRECTORY/<feature name>``.

    Args:
        AUDIO_DIRECTORY: directory with the audio (``str`` or ``Path``). In train
            mode it contains one sub-directory per species; in test mode it
            directly contains ``*.ogg`` files.
        SAVE_DIRECTORY: root output directory (``str`` or ``Path``).
        view: when True, every ``func_*`` call also prints the shape and plots.
        test: True for the soundscape (inference) layout, False for the train layout.
    """

    def __init__(self, AUDIO_DIRECTORY, SAVE_DIRECTORY, view=False, test=CFG.test):
        # config
        self.AUDIO_DIRECTORY = AUDIO_DIRECTORY
        self.SAVE_DIRECTORY = SAVE_DIRECTORY
        self.view = view
        self.test = test

        # Create one output sub-directory per available func_* feature.
        func_names = [method for method in dir(self) if callable(getattr(self, method)) and method.startswith("func")]
        print(func_names)
        os.makedirs(self.SAVE_DIRECTORY, exist_ok=True)
        for func_name in func_names:
            func = func_name.split('_')[-1]
            # Path(...) so that both str and Path save directories are accepted.
            os.makedirs(Path(self.SAVE_DIRECTORY) / func, exist_ok=True)

    # load data
    def load_wave(self, audio_filepath, offset=0):
        """Load up to 5 seconds of audio at ``CFG.SR`` into ``self.y``.

        ``offset`` is the start position in seconds; it is forced to 0 when
        ``CFG.NO_SOUNDSCAPE`` is set.
        """
        # pick up 5 seconds
        if CFG.NO_SOUNDSCAPE:
            offset = 0
        self.y, _ = librosa.load(audio_filepath, sr=CFG.SR , offset=offset, duration=5)
        self.sr = CFG.SR

    def normalize(self, data: np.ndarray):
        """Min-max scale ``data`` to the range 0..255 and return it as ``uint8``.

        A constant input (max equals min, e.g. a silent clip) yields an all-zero
        array instead of dividing by zero.
        """
        data = data.astype(np.single)   # to single precision
        data = data - data.min()        # shift so that the minimum becomes 0
        peak = data.max()
        if peak > 0:
            data = data / peak * 255    # scale so that the maximum becomes 255
        return data.astype(np.uint8)

    # apply and save
    def apply_func(self, function):
        """Apply one ``func_*`` method to every clip and save the normalized output.

        Train mode writes ``<save>/<feature>/<species>/<file stem>.npy``.
        Test mode cuts every ``*.ogg`` file into 5-second segments and writes
        ``<save>/<feature>/<id>_<segment end time>.npy``.
        """
        feature_name = function.__name__.split('_')[-1]
        audio_root = Path(self.AUDIO_DIRECTORY)
        feature_dir = Path(self.SAVE_DIRECTORY) / feature_name

        if not self.test:
            for species in os.listdir(audio_root):
                species_dir = audio_root / species
                save_dir = feature_dir / species
                save_dir.mkdir(exist_ok=True)
                for audio_file in os.listdir(species_dir):
                    self.load_wave(str(species_dir / audio_file))  # load audio
                    output = self.normalize(function())             # apply function
                    np.save(save_dir / f"{audio_file.split('.')[0]}.npy", output)
                    del output
        else:
            audio_length = int(4*60)  # seconds of audio processed per file
            audio_offset_unit_max = int(audio_length / 5)  # number of 5-second segments
            # Prefix stripped from the file stem to obtain the id used in the file name.
            id_prefix = 'XC' if CFG.NO_SOUNDSCAPE else 'soundscape_'
            feature_dir.mkdir(exist_ok=True)
            for audio_file_path in audio_root.glob('*.ogg'):
                file_id = audio_file_path.stem.replace(id_prefix, '')
                for audio_offset_unit in range(audio_offset_unit_max):
                    audio_offset = audio_offset_unit * 5
                    self.load_wave(str(audio_file_path), audio_offset)  # load audio
                    output = self.normalize(function())                  # apply function
                    # [soundscape_id]_[end_time].npy
                    np.save(feature_dir / f"{file_id}_{audio_offset+5}.npy", output)
                    del output

    def save_as_picke_gzip(self, data, filepath):
        """Pickle ``data`` into a gzip-compressed file at ``filepath``."""
        with gzip.open(filepath, 'wb') as f:
            pickle.dump(data, f)

    def func_waveform(self):
        """Return the raw waveform ``self.y`` (optionally playing and plotting it)."""
        if self.view:
            print('waveform shape: ', self.y.shape)
            display(Audio(self.y, rate=self.sr))
            plt.figure(figsize=(10, 4))
            librosa.display.waveshow(self.y, sr=self.sr)
            plt.title('Waveform')
            plt.xlabel('Time (s)')
            plt.ylabel('Amplitude')
            plt.show()
        return self.y

    def func_spec(self):
        """Return the STFT magnitude of ``self.y`` in dB relative to its maximum."""
        spec = librosa.amplitude_to_db(
            np.abs(librosa.stft(self.y)),
            ref=np.max,
        )

        if self.view:
            print('spec shape: ', spec.shape)
            plt.figure(figsize=(10, 4))
            librosa.display.specshow(spec, sr=self.sr, x_axis='time', y_axis='log')
            plt.colorbar(format='%+2.0f dB')
            plt.title('Spectrogram')
            plt.show()
        return spec

    def func_melspec(self):
        """Return the mel spectrogram of ``self.y`` in dB relative to its maximum."""
        melspec = librosa.feature.melspectrogram(
            y=self.y,
            sr=CFG.SR,                  # sample rate
            n_fft=CFG.N_FFT,            # number of samples in window
            hop_length=CFG.HOP_LENGTH,  # step size of window
            n_mels=CFG.MELSPEC_H,       # number of mel bands between fmin and fmax
            fmin=CFG.fmin,              # minimum frequency
            fmax=CFG.fmax,              # maximum frequency
            power=2.0,                  # exponent of the magnitude (2.0 = power spectrogram)
        )
        melspec = librosa.power_to_db(melspec, ref=np.max)

        if self.view:
            print('melspec shape: ', melspec.shape)
            plt.figure(figsize=(10, 4))
            librosa.display.specshow(melspec, sr=self.sr, x_axis='time', y_axis='mel')
            plt.colorbar(format='%+2.0f dB')
            plt.title('Mel Spectrogram')
            plt.show()
        return melspec

    def func_scalogram(self):
        """Return the magnitude of the continuous wavelet transform of ``self.y``."""
        scales = pywt.central_frequency('cmor') / np.linspace(1, 100, 100) * self.sr
        cwtmatr, freqs = pywt.cwt(self.y, scales, 'cmor', sampling_period=1/self.sr)

        if self.view:
            print('scarogram shape: ', cwtmatr.shape)
            plt.figure(figsize=(10, 4))
            plt.imshow(abs(cwtmatr), aspect='auto', extent=[0, len(self.y) / self.sr, 1, 100], cmap='jet', origin='lower')
            plt.colorbar()
            plt.title('Scalogram')
            plt.xlabel('Time (s)')
            plt.ylabel('Scale')
            plt.show()

        # to real value
        return abs(cwtmatr)

    def func_chromagram(self):
        """Return the constant-Q chromagram of ``self.y``."""
        C = librosa.feature.chroma_cqt(y=self.y, sr=self.sr)

        if self.view:
            print('chromagram shape: ', C.shape)
            plt.figure(figsize=(10, 4))
            librosa.display.specshow(C, sr=self.sr, x_axis='time', y_axis='chroma', cmap='coolwarm')
            plt.colorbar()
            plt.title('Chromagram')
            plt.show()
        return C

    def func_mfcc(self):
        """Return the MFCCs of ``self.y``."""
        mfcc = librosa.feature.mfcc(y=self.y, sr=self.sr)

        if self.view:
            print('mfcc shape: ', mfcc.shape)
            plt.figure(figsize=(10, 4))
            librosa.display.specshow(mfcc, sr=self.sr, x_axis='time')
            plt.ylabel('MFCC coeffs')
            plt.colorbar()
            plt.title('MFCC')
            plt.show()
        return mfcc

    def func_spectralcontrast(self):
        """Return the spectral contrast of ``self.y``."""
        contrast = librosa.feature.spectral_contrast(y=self.y, sr=self.sr)

        if self.view:
            print('contrast shape: ', contrast.shape)
            plt.figure(figsize=(10, 4))
            librosa.display.specshow(contrast, x_axis='time')
            plt.colorbar()
            plt.ylabel('Frequency bands')
            plt.title('Spectral Contrast')
            plt.show()
        return contrast

    def execute(self):
        """Run ``apply_func`` for every enabled feature (currently only the mel spectrogram)."""
        func_list = [
            # self.func_waveform,
            # self.func_spec,
            self.func_melspec,
            # self.func_scalogram,
            # self.func_chromagram,
            # self.func_mfcc,
            # self.func_spectralcontrast,
        ]
        for func in func_list:
            self.apply_func(func)

# Build the preprocessing objects.
# Only the test-soundscape object is active; the training variants are kept disabled.
# preprocessing_kaggle = preprocessing(KAGGLE_TRAIN, SAVE_TRAIN, view=True)
# preprocessing_added_train = preprocessing(ADDED_TRAIN, SAVE_TRAIN)
# preprocessing_added_train_1 = preprocessing(ADDED_TRAIN_1, SAVE_TRAIN)
# preprocessing_added_train_2 = preprocessing(ADDED_TRAIN_2, SAVE_TRAIN)
preprocessing_test = preprocessing(TEST_SOUNDSCAPE, SAVE_TEST, test=True)

t1 = time()
# Run the preprocessing and measure how long it takes.
# preprocessing_kaggle.execute()
# preprocessing_added_train.execute()
# preprocessing_added_train_1.execute()
# preprocessing_added_train_2.execute()
preprocessing_test.execute()
t2 = time()


# Elapsed preprocessing time in minutes
print('Preprocessing time: ', f"{(t2-t1)/60}m")


#### Utils

In [None]:
# setting seed in each env
def set_random_seed(seed: int = 42, deterministic: bool = False):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) with the same value.

    Also exports the seed as ``PYTHONHASHSEED`` and sets the cuDNN
    ``deterministic`` flag to ``deterministic``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,  # type: ignore
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = deterministic  # type: ignore

# function to set tensor to device
def to_device(
    tensors: tp.Union[tp.Tuple[torch.Tensor], tp.Dict[str, torch.Tensor]],
    device: torch.device, *args, **kwargs
):
    """Move a tensor, or a container of tensors, to ``device``.

    Args:
        tensors: a single tensor, a tuple/list of tensors, or a dict of tensors.
        device: target device.
        *args, **kwargs: forwarded unchanged to ``Tensor.to``.

    Returns:
        The same kind of container that was passed in (tuple, list or dict) holding
        the moved tensors, or the moved tensor itself for a single tensor. A tuple
        input yields a real tuple, so the result can be indexed and iterated
        more than once.
    """
    if isinstance(tensors, (tuple, list)):
        moved = [t.to(device, *args, **kwargs) for t in tensors]
        return tuple(moved) if isinstance(tensors, tuple) else moved
    if isinstance(tensors, dict):
        return {k: t.to(device, *args, **kwargs) for k, t in tensors.items()}
    return tensors.to(device, *args, **kwargs)

#### Dataset

In [None]:
class Bird2024Dataset(Dataset):
    """Dataset that loads ``.npy`` arrays from disk and pairs them with labels.

    Args:
        image_paths: paths of the ``.npy`` files, one per sample.
        labels: one label per path (dummy values at inference time).
        transform: albumentations pipeline applied to every loaded array.
    """

    def __init__(
        self,
        image_paths: tp.Sequence[FilePath],
        labels: tp.Sequence[Label],
        transform: A.Compose,
    ):
        self.train_path_list, self.label_list = image_paths, labels
        self.transform = transform

    def __len__(self):
        """Number of samples."""
        return len(self.train_path_list)

    def __getitem__(self, index:int):
        """Return ``(transformed array, label)`` for ``index``; the label is ignored in test."""
        image = self._apply_transform(np.load(self.train_path_list[index]))
        target = self.label_list[index]
        return (image, target)

    def _apply_transform(self, img:np.ndarray):
        """Run the transform on ``img`` and return its ``"image"`` entry cast to float."""
        return self.transform(image=img)["image"].float()
    


### Define Model

In [None]:
import timm
import torch
from torch import nn

class BirdCLEF2024SpecModel(nn.Module):
    """Thin wrapper around a ``timm`` backbone used as the classifier.

    Args:
        model_name: timm model identifier.
        pretrained: whether timm should load pretrained weights.
        in_channels: number of input channels (forwarded as ``in_chans``).
        num_classes: number of output units.
    """

    def __init__(
            self,
            model_name: str,
            pretrained: bool,
            in_channels: int,
            num_classes: int,
        ):
        super().__init__()
        backbone_kwargs = dict(
            model_name=model_name,
            pretrained=pretrained,
            num_classes=num_classes,
            in_chans=in_channels,
        )
        self.model = timm.create_model(**backbone_kwargs)

    def forward(self, x):
        """Return the backbone output for ``x``."""
        return self.model(x)

### Training

In [None]:
class FocalLossBCE(torch.nn.Module):
    """Weighted sum of BCE-with-logits loss and sigmoid focal loss.

    The focal term is computed with plain ``torch`` operations, so the class
    does not depend on ``torchvision`` (which this notebook never imports).
    It follows the usual sigmoid focal loss definition:
    ``alpha_t * (1 - p_t) ** gamma * BCE(inputs, targets)``.

    Args:
        alpha: class-balancing factor of the focal term; a negative value disables it.
        gamma: focusing exponent of the focal term.
        reduction: ``"mean"``, ``"sum"`` or ``"none"``; applied to both terms.
        bce_weight: multiplier of the BCE term.
        focal_weight: multiplier of the focal term.
    """

    def __init__(
        self,
        alpha: float = 0.25,
        gamma: float = 2,
        reduction: str = "mean",
        bce_weight: float = 1.0,
        focal_weight: float = 1.0,
    ):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        self.bce = torch.nn.BCEWithLogitsLoss(reduction=reduction)
        self.bce_weight = bce_weight
        self.focal_weight = focal_weight

    def _sigmoid_focal_loss(self, inputs, targets):
        """Sigmoid focal loss of raw logits ``inputs`` against ``targets``."""
        prob = torch.sigmoid(inputs)
        ce_loss = torch.nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, reduction="none"
        )
        # p_t is the probability the model assigns to the true class.
        p_t = prob * targets + (1 - prob) * (1 - targets)
        loss = ce_loss * (1 - p_t) ** self.gamma
        if self.alpha >= 0:
            alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)
            loss = alpha_t * loss

        if self.reduction == "mean":
            return loss.mean()
        if self.reduction == "sum":
            return loss.sum()
        if self.reduction == "none":
            return loss
        raise ValueError(
            f"Invalid Value for arg 'reduction': '{self.reduction}'. "
            "Supported reduction modes: 'none', 'mean', 'sum'"
        )

    def forward(self, inputs, targets):
        """Return ``bce_weight * BCE + focal_weight * focal`` for logits ``inputs``."""
        focal_loss = self._sigmoid_focal_loss(inputs, targets)
        bce_loss = self.bce(inputs, targets)
        return self.bce_weight * bce_loss + self.focal_weight * focal_loss

### Inference

##### Function for inference

In [None]:
def get_test_path_label(img_paths: list):
    """Bundle the image paths with dummy labels for the test dataset.

    Every path gets a float32 label row of length ``N_CLASSES`` filled with -1.
    NOTE(review): ``N_CLASSES`` is a module-level name that is not defined in the
    visible part of this notebook; it must exist before this function is called.

    Returns:
        dict with the keys ``"image_paths"`` and ``"labels"`` (a list of label rows).
    """
    dummy_labels = np.full((len(img_paths), N_CLASSES), -1, dtype="float32")
    return {
        "image_paths": img_paths,
        "labels": list(dummy_labels),
    }

def get_test_transforms(CFG):
    """Build the test-time pipeline: resize to ``CFG.img_size`` square, then convert to a tensor."""
    resize_step = A.Resize(
        p=1.0,
        height=CFG.img_size,
        width=CFG.img_size,
        interpolation=CFG.interpolation,
    )
    tensor_step = ToTensorV2(p=1.0)
    return A.Compose([resize_step, tensor_step])

#### Inference loop

In [None]:
def run_inference_loop(model, loader, device):
    """Run ``model`` over ``loader`` on ``device`` and return softmax probabilities.

    Only the first element of each batch (the inputs) is used. The per-batch
    results are concatenated into one NumPy array along the first axis.
    """
    model.to(device)
    model.eval()
    with torch.no_grad():
        batch_probs = [
            model(to_device(batch[0], device)).softmax(dim=1).detach().cpu().numpy()
            for batch in tqdm(loader)
        ]
    return np.concatenate(batch_probs)

#### ONNX and OpenVINO

##### Functions that perform inference

In [None]:
# Functions that perform inference
def infernce_loop_openvino(
    infer_request: openvino.runtime.ie_api.InferRequest,
    loader: torch.utils.data.DataLoader,
    device
):
    """Run an OpenVINO infer request over ``loader`` and return softmax probabilities.

    Only the first element of each batch (the inputs) is used. It is converted
    to a NumPy array, wrapped in an ``ov.Tensor`` that shares its memory, and
    fed to ``infer_request``. The ``"output"`` result of every batch is turned
    into probabilities with softmax along axis 1, and all batches are
    concatenated along the first axis.
    """
    batch_probs = []
    with torch.no_grad():
        for batch in tqdm(loader):
            # Keep the array bound to a local name: the ov.Tensor shares its memory.
            batch_array = to_device(batch[0], device).detach().cpu().numpy()
            shared_tensor = ov.Tensor(array=batch_array, shared_memory=True)
            infer_request.set_input_tensor(shared_tensor)
            logits = infer_request.infer()["output"]
            batch_probs.append(softmax(logits, axis=1))

    # If asynchronous processing is used effectively, there is a potential to further speed up the inference workflow.
    # infer_request.start_async()
    # infer_request.wait()
    # output_numpy = infer_request.get_output_tensor().data
    return np.concatenate(batch_probs)

#### Converting models to openvino

In [None]:
# converting models to openvino
def convert_pytorch_to_openvino(device):
    """Convert every fold's PyTorch checkpoint to ONNX and then to OpenVINO IR.

    For each fold the weights ``best_model_fold{N}.pth`` are loaded from
    ``TRAINED_MODEL``, exported to ``fp32_fold{N}.onnx`` in ``CFG.OUTPUT_DIR_ONNX``
    using the fixed dummy input, converted with the fixed ``CFG.INPUT_SHAPE``
    and saved as ``fp32_fold{N}.xml`` in ``CFG.OUTPUT_DIR_OV`` without FP16
    compression.
    """
    for fold_id in range(CFG.n_folds):
        weights_path = TRAINED_MODEL / f"best_model_fold{fold_id}.pth"
        onnx_path = CFG.OUTPUT_DIR_ONNX / f"fp32_fold{fold_id}.onnx"
        ov_path = CFG.OUTPUT_DIR_OV / f"fp32_fold{fold_id}.xml"

        # Rebuild the network and restore the trained weights.
        model = BirdCLEF2024SpecModel(
            model_name=CFG.model_name, pretrained=False, num_classes=N_CLASSES, in_channels=1
        )
        state_dict = torch.load(weights_path, map_location=device)
        model.load_state_dict(state_dict)
        model.eval()

        # PyTorch -> ONNX
        torch.onnx.export(
            model,
            CFG.DUMMY_INPUT_TENSOR,
            onnx_path,
            opset_version=15,
            input_names=['input'],
            output_names=['output'],
        )

        # ONNX -> OpenVINO, then write the IR to disk
        ov_model = ov.convert_model(onnx_path, input=[('input', CFG.INPUT_SHAPE)])
        ov.save_model(ov_model, ov_path, compress_to_fp16=False)

convert_pytorch_to_openvino(device=torch.device(CFG.device))

#### Executing inference

In [None]:
def execute_inference():
    """Run every fold's OpenVINO model over the test images.

    Returns:
        test_preds_arr: array of shape ``(CFG.n_folds, n_images, N_CLASSES)`` with
            the per-fold probabilities.
        IDs: one row id (``'soundscape_' + file stem``) per image, in the same
            order as the second axis of ``test_preds_arr``.
    """
    img_paths = list(TEST_IMAGE.iterdir())
    if CFG.simple_inferring:
        img_paths = img_paths[:CFG.n_simple]

    # Derive the ids from the very list that is fed to the dataset, so that ids and
    # predictions always agree in length and order (also when the list is truncated).
    IDs = ['soundscape_' + filepath.stem for filepath in img_paths]
    test_preds_arr = np.zeros((CFG.n_folds, len(img_paths), N_CLASSES))

    test_path_label = get_test_path_label(img_paths)
    test_transform = get_test_transforms(CFG)
    test_dataset = Bird2024Dataset(**test_path_label, transform=test_transform)
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=CFG.batch_size, num_workers=4, shuffle=False, drop_last=False)

    device = torch.device(CFG.device)

    # Make an openvino core object
    core = ov.Core()
    for fold_id in range(CFG.n_folds):
        print(f"\n[fold {fold_id}]")

        # Compile the converted model for CPU and create an inference request
        compiled_model = core.compile_model(CFG.OUTPUT_DIR_OV / f"fp32_fold{fold_id}.xml", device_name='CPU')
        infer_request = compiled_model.create_infer_request()

        # Run inference for this fold.
        # (PyTorch alternative: load best_model_fold{fold_id}.pth into
        #  BirdCLEF2024SpecModel and call run_inference_loop(model, test_loader, device).)
        test_pred = infernce_loop_openvino(infer_request, test_loader, device)
        test_preds_arr[fold_id] = test_pred

        # Release per-fold objects before the next model is compiled
        del compiled_model, infer_request, test_pred
        torch.cuda.empty_cache()
        gc.collect()

    return test_preds_arr, IDs

# execute
t1 = time()
test_preds_arr, IDs = execute_inference()
t2 = time()
print('inference time: ', f"{(t2-t1)/60}m")

##### Creating sample submission

In [None]:
def make_submission(test_preds_arr, IDs):
    """Average the fold predictions and write ``submission.csv``.

    The fold axis (axis 0) of ``test_preds_arr`` is averaged, the result is joined
    with the row ids, the rows are sorted by the number in the middle of the row id
    and then by the trailing number, rows containing NaN are dropped, and the
    table is saved. The class columns come from the module-level ``CLASSES``.
    NOTE(review): ``CLASSES`` is not defined in the visible part of this notebook.

    Returns:
        The submission DataFrame.
    """
    sort_keys = ['middle_number', 'end_number']

    # One row id column plus one probability column per class
    id_frame = pd.DataFrame(IDs, columns=['row_id'])
    pred_frame = pd.DataFrame(test_preds_arr.mean(axis=0), columns=CLASSES)
    sub = pd.concat([id_frame, pred_frame], axis=1)

    # Temporary sort keys parsed from the row id: "..._<middle>_<end>"
    sub['middle_number'] = sub['row_id'].str.extract(r'_([0-9]+)_')
    sub['end_number'] = sub['row_id'].str.extract(r'_(\d+)$').astype(int)

    sub = (
        sub.sort_values(by=sort_keys, ignore_index=True)
        .drop(columns=sort_keys)
        .dropna(axis=0, how='any')
    )

    # Write the file and show a short summary
    sub.to_csv("submission.csv", index=False)
    display(sub.head())
    print(sub.shape)
    print(sub.iloc[0][CLASSES].sum())

    return sub

sub = make_submission(test_preds_arr, IDs)


#### check the batch data

In [None]:
# Optional visual check: plot a few randomly chosen preprocessed test images.
if CFG.show:
    img_paths = [filepath for filepath in TEST_IMAGE.iterdir()]
    if CFG.simple_inferring:
        if len(img_paths) > CFG.n_simple:
            img_paths = img_paths[:CFG.n_simple]

    # The global inference results (IDs, test_preds_arr) are deliberately not
    # reassigned here: this cell only displays images.
    test_path_label = get_test_path_label(img_paths)
    test_transform = get_test_transforms(CFG)
    test_dataset = Bird2024Dataset(**test_path_label, transform=test_transform)
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=CFG.batch_size, num_workers=4, shuffle=False, drop_last=False)

    def show_batch(ds, row=3, col=3):
        """Plot ``row * col`` randomly drawn samples (with replacement) from ``ds``."""
        fig = plt.figure(figsize=(10, 10))
        # The upper bound of randint is exclusive, so len(ds) makes every index drawable.
        img_index = np.random.randint(0, len(ds), row*col)

        for i in range(len(img_index)):
            img, label = ds[img_index[i]]

            if isinstance(img, torch.Tensor):
                img = img.detach().numpy()
                img = np.squeeze(img)

            ax = fig.add_subplot(row, col, i + 1, xticks=[], yticks=[])
            ax.imshow(img, cmap='jet')
        plt.tight_layout()
        plt.show()

    show_batch(test_dataset)

END