## Install

In [None]:
# Basic
import sys
import os
import gc
import copy
import yaml
import bz2
import pickle
import ast
import gzip
import random
import shutil
from time import time
import typing as tp
import cv2

# Python
from pathlib import Path
import numpy as np
import pandas as pd
import pandas.api.types
import scipy
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import GroupKFold
import sklearn.metrics
import pywt
import matplotlib.pyplot as plt
import plotly.express as px
import librosa
import librosa.display

# Notebook
# from tqdm.notebook import tqdm
from tqdm import tqdm
from IPython.display import Audio

# Pytorch
import torch
import torchvision
from torch import nn
from torch import optim
from torch.optim import lr_scheduler
from torch.cuda import amp
from torch.utils.data import Dataset
import timm
import albumentations as A
from albumentations.pytorch import ToTensorV2


# Expose only the first GPU to CUDA.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

# Show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

# Type aliases used in the signatures below.
FilePath = tp.Union[str, Path]
Label = tp.Union[int, float, np.ndarray]

In [None]:
# Global paths
KAGGLE_TRAIN = '/kaggle/input/birdclef-2024/train_audio'
ADDED_TRAIN_1 = '/kaggle/input/birdclef2024-additional-wav-1/additional_audio-1'
ADDED_TRAIN_2 = '/kaggle/input/birdclef2024-additional-wav-2/additional_audio-2'
SAVE_TRAIN = '/kaggle/working/train_image'
# Create the OUTPUT directory. KAGGLE_TRAIN is an input dataset that is only
# read from, so it is not something this notebook should try to create.
os.makedirs(SAVE_TRAIN, exist_ok=True)

make_dataset = False  # when True, the pre-processing cell (section 4) is executed
only_over4 = False  # when True, section 2.4 drops recordings rated below 4 for classes with more than 10 files

# 0. 目標
目標: **鳥の鳴き声の音声ファイルから、鳥の種類を判別/分類すること。**

- 提出は182の鳥種を列名に持つcsvとして行うため、各行につき182要素の予測分布を作成する必要がある。  
- テストデータには約1100個の4分間の音声データを使用する。
- 音声データを区切った、各row_idが示す5秒間に対して予測を行う。


### 0.1 評価指標
マクロ平均ROC-AUCスコアを使用します。  
[公式Notebook](https://www.kaggle.com/code/metric/birdclef-roc-auc/notebook)による以下のscore関数によって計算されます。

・引数  
solution: 教師データ  
submission: 推論データ  
row_id_column_name: 両データに共通する識別id列の列名

In [None]:
# kaggle_metric_utilities is not available in this notebook environment, so it is
# imported lazily inside score(); plain scikit-learn is used when it is missing.

class ParticipantVisibleError(Exception):
    """Raised by score() when the submission contains non-numeric columns."""


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    '''
    Version of macro-averaged ROC-AUC score that ignores all classes that have no true positive labels.

    Args:
        solution: ground-truth labels, one column per class plus the id column.
        submission: predictions with the same columns as ``solution``.
        row_id_column_name: name of the id column shared by both frames.

    Returns:
        The macro-averaged ROC-AUC over the classes that have at least one positive.

    Raises:
        ParticipantVisibleError: if the submission holds non-numeric data.
    '''
    # Drop the id column on new frames so the caller's DataFrames stay intact
    # (the previous `del` removed the column from the caller's objects).
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])

    if not pandas.api.types.is_numeric_dtype(submission.values):
        bad_dtypes = {x: submission[x].dtype  for x in submission.columns if not pandas.api.types.is_numeric_dtype(submission[x])}
        raise ParticipantVisibleError(f'Invalid submission data types found: {bad_dtypes}')

    # Only classes with at least one positive label can be scored with ROC-AUC.
    solution_sums = solution.sum(axis=0)
    scored_columns = list(solution_sums[solution_sums > 0].index.values)
    assert len(scored_columns) > 0

    y_true = solution[scored_columns].values
    y_score = submission[scored_columns].values

    try:
        import kaggle_metric_utilities
    except ImportError:
        # Outside Kaggle's scoring environment: call scikit-learn directly.
        return sklearn.metrics.roc_auc_score(y_true, y_score, average='macro')

    return kaggle_metric_utilities.safe_call_score(sklearn.metrics.roc_auc_score, y_true, y_score, average='macro')

・試しにsample_submissionを提出

In [None]:
sample_submission = pd.read_csv('/kaggle/input/birdclef-2024/sample_submission.csv')
display(sample_submission)
# index=False: otherwise pandas writes the row index as an extra unnamed first
# column, and the file no longer has the same columns as sample_submission.csv.
sample_submission.to_csv('submission.csv', index=False)
# Score 0.5

# 1. データ確認
与えられたデータは以下の通りです。  
・train_audio: 学習用データ    
・test_soundscapes: テストデータが格納される  
・unlabeled_soundscapes: テストデータと同じ場所で録音された、ラベルのない音声データ   
・train_metadata.csv: 学習用データのメタデータ   
・sample_submission.csv  
・eBird_Taxonomy_v2021.csv: 鳥種間の関係性についてのデータ

それぞれについて解説します。
- train_audio  
ディレクトリ構成は`train_audio/'bird_species_name'/'file_name'.ogg`で、テストデータに合わせて32kHzにダウンサンプリングされた音声データがogg形式で保存されています。  
- test_soundscapes  
Notebookが提出された時に、このディレクトリに4分間の約1100個の音声データが配置されます。ファイル名はランダムですが、`soundscape_xxxxxx.ogg`という一般的な名前です。  
  全データのダウンロードに約5分かかります。  
- unlabeled_soundscapes  
テストデータと同じ場所で録音された、ラベルのついていない音声データです。  
これ以上の説明はありませんでした。事前学習に使えるかもしれません。  
- train_metadata.csv  
`train_audio`に関するメタデータが記述されています。主要な列の意味は以下の通りです。  
・primary_label: 鳥種を識別するコード    
例: `amecro`など。`https://ebird.org/species/amecro`のように、URLの末尾に指定するとより詳細な鳥種の情報を確認できます(幾つかのページは壊れています)。  
・latitude & longitude: 録音した場所  
一部の鳥種には'方言'があるため、地理的な多様性を確保することが推奨されます。  
・author: 録音を行ったユーザー名  
・filename: このメタデータが紐づけられたファイルの名前  
- sample_submission.csv  
有効な提出ファイルの例。列の意味は以下の通りです  
・row_id: 予測に使用するid  
row_idは`soundscape_[soundscape_id]_[end_time]`の形式で指定される  
・[bird_id]: 鳥種コードです。182個存在し、それぞれの鳥の確率を予測する必要があります。 
- eBird_Taxonomy_v2021.csv
鳥種間の関係性についてのデータです。

<br>  

以下のより詳細な確認は次で行います。    
・train_metadata<br>
・train_audio<br>
・eBird_Taxonomy_v2021.csv<br>   


### 1.1 train_metadata 
学習用データのメタデータについて、詳細を確認します。

In [None]:
# Metadata of the training recordings
metadata = pd.read_csv(filepath_or_buffer='/kaggle/input/birdclef-2024/train_metadata.csv')
metadata_preview = metadata.head(n=5)
display(metadata_preview)

各列について解説します。  
・primary_label: 鳥種を識別するコード。例: `amecro`など  
・secondary_label: ファイルに含まれる、中心的でない鳥種(紛れ込んだ鳥種)  
・type: データの種類。[call, song, ]  
・latitude: 録音した場所の緯度(南北)  
・longitude: 録音した場所の経度(東西)  
・scientific_name: 学名  
・common_name: 一般名  
・author: 録音を行ったユーザー名  
・license: ライセンス  
・rating: xeno-cantoによる録音品質の評価。 範囲は[1,5]で、0は未評価を示す  
・url: xeno-canto.orgのデータurl  
・filename: このメタデータが紐づけられたファイルの名前  

次は統計データを確認します。

In [None]:
# Summary statistics of the metadata
banner = '-' * 15

print(f'\n{banner} shape {banner}')
print(metadata.shape)
print(f'\n{banner} columns {banner}')
print(metadata.columns)
print(f'\n{banner} info {banner}')
# info() prints its report itself and returns None, so it is not passed to
# display() (that only rendered an extra "None").
metadata.info()
print(f'\n{banner} null {banner}')
display(metadata.isnull().sum())
print(f'\n{banner} statistics {banner}')
display(metadata.describe())
print(f'\n{banner} value_counts {banner}')

count_column = ['primary_label', 'type', 'author', 'rating']
for column in count_column:
    print('\n')
    display(metadata[column].value_counts())

複数のtypeが存在するため、この情報を使用する場合は取り扱いに注意する必要があります。

In [None]:
display(metadata['type'].value_counts())

# Each 'type' cell is the string form of a Python list; parse it and collect
# every distinct element in order of first appearance.
unique_types = []
for type_list in metadata['type'].map(ast.literal_eval):
    for type_elem in type_list:
        if type_elem in unique_types:
            continue
        unique_types.append(type_elem)

print('\n unique_types: ',unique_types)

**また上記の確認から分かるように、各データ数が500で打ち止めとなっているため、正しいデータを取得する必要があります。**  
Shiroさんが追加のデータセットを公開してくれています。

### 1.2 train_audio

##### 1.2.1 check data (include additional data)

In [None]:
# --- Kaggle recording (.ogg) ---
metadata_df = pd.read_csv('/kaggle/input/birdclef-2024/train_metadata.csv')
train_path = '/kaggle/input/birdclef-2024/train_audio/'
first_file = train_path + metadata_df.filename[0]
# NOTE(review): librosa.load is called without `sr`, so the printed rate is
# whatever librosa resamples to by default — confirm this is intended.
data, rate = librosa.load(first_file)
print('data shape: ', data.shape)
print('sampling rate: ', rate)
print(type(data))

# Play and plot the first 5 seconds (rate = number of samples per second).
clip = data[:rate*5]
display(Audio(clip, rate=rate))
display(px.line(y=clip, title=metadata_df.common_name[0]))

# --- Recording from the additional dataset (.wav) ---
metadata_adwav = pd.read_csv('/kaggle/input/birdclef2024-additional-wav-1/BirdClef2024_additional.csv')
train_path_adwav = '/kaggle/input/birdclef2024-additional-wav-1/additional_audio-1'
data, rate = librosa.load(train_path_adwav + '/asbfly/XC155673.wav')
clip = data[:rate*5]
display(Audio(clip, rate=rate))
display(px.line(y=clip))

##### 1.2.2 check MFCC

In [None]:
import librosa
import librosa.display
import matplotlib.pyplot as plt

# Load one Kaggle .ogg recording and one .wav recording from the additional dataset
y_ogg, sr_ogg = librosa.load('/kaggle/input/birdclef-2024/train_audio/asbfly/XC134896.ogg')
y_wav, sr_wav = librosa.load('/kaggle/input/birdclef2024-additional-wav-1/additional_audio-1/asbfly/XC155673.wav')

# Extract MFCC
mfcc_ogg = librosa.feature.mfcc(y=y_ogg, sr=sr_ogg)
mfcc_wav = librosa.feature.mfcc(y=y_wav, sr=sr_wav)
mfccs = {'ogg':mfcc_ogg, 'wav':mfcc_wav}

# Plot the MFCCs, one panel per file format.
# (nrows=2, ncols=1 already yields a 1-D array of Axes, so no flatten is needed.)
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))

for ax, (mfcc_key, mfcc) in zip(axes, mfccs.items()):
    img = librosa.display.specshow(mfcc, x_axis='time', ax=ax)
    cbar = fig.colorbar(img, ax=ax)
    cbar.set_label('Value')
    ax.set_title(f'MFCC_{mfcc_key}')

plt.tight_layout()
plt.show()


### 1.3 eBird_Taxonomy_v2021.csv
このファイルは分類学的情報を持ちます。

In [None]:
taxonomy = pd.read_csv(filepath_or_buffer='/kaggle/input/birdclef-2024/eBird_Taxonomy_v2021.csv')

display(taxonomy.head())
print(f"\n{'-' * 15} statistics {'-' * 15}")
# Numeric columns first, then the object (string) columns.
numeric_summary = taxonomy.describe()
display(numeric_summary)
object_summary = taxonomy.describe(include=['O'])
display(object_summary)


各列について解説します。<br><br>
TAXON_ORDER: 鳥種の分類番号   
CATEGORY: 鳥種  
SPECIES_CODE: 鳥種コード  
PRIMARY_COM_NAME: 鳥種の一般名
SCI_NAME: 鳥種の学名  
ORDER1: 何目か(イヌ目など)  
FAMILY: 分類学上の科  
SPECIES_GROUP: 種族グループ  
REPORT_AS: 鳥種に関する追加情報  



# 2. Create data and Save

### 2.1 Load kaggle train metadata

In [None]:
# Read the Kaggle training metadata and keep the filename column at hand.
TRAIN_CSV = '/kaggle/input/birdclef-2024/train_metadata.csv'
train_csv_df = pd.read_csv(filepath_or_buffer=TRAIN_CSV)
print(train_csv_df.columns)

# Note: despite its name this is a Series (a single column).
filename_df = train_csv_df.loc[:, 'filename']

### 2.2 Load metadata added

In [None]:
# Read the metadata shipped with the additional dataset.
TRAIN_CSV_ADD = '/kaggle/input/birdclef2024-additional-wav-1/BirdClef2024_additional.csv'
train_csv_add_df = pd.read_csv(filepath_or_buffer=TRAIN_CSV_ADD)

added_columns = train_csv_add_df.columns
print(added_columns)
display(train_csv_add_df)

### 2.3 Align added metadata

In [None]:
# Aligne added metadata 
pd.set_option('future.no_silent_downcasting', True)
def Aligne_metadata_added(df:pd.DataFrame):
    """Bring the additional dataset's metadata into the Kaggle metadata layout.

    A frame without a 'lat' column is treated as already aligned and is
    returned unchanged. Otherwise a NEW frame is returned (the input is left
    untouched, so re-running the cell always starts from the same data) with:

    * columns lat/lng/file/q renamed to latitude/longitude/filename/rating,
    * filename rewritten as '<primary_label>/<file>.wav',
    * the letter quality grade mapped to an integer rating (A=5 ... E=1,
      'no score'=0),
    * type rewritten from 'a, b' to the list-literal string "['a', 'b']",
    * a boolean column 'added' set to True.

    Args:
        df: metadata of the additional dataset, or an already aligned frame.

    Returns:
        The aligned DataFrame.
    """
    if 'lat' not in df.columns:
        return df

    # additional-dataset column name -> Kaggle column name
    rename_map = {'lat': 'latitude', 'lng': 'longitude', 'file': 'filename', 'q': 'rating'}
    aligned = df.rename(columns=rename_map)

    # modify filename
    aligned['filename'] = aligned['primary_label'] + '/' + aligned['filename'] + '.wav'

    # modify q: grades found in the dict are converted, anything else is passed
    # through unchanged (and then has to be convertible to int64).
    dict_q_to_rating = {'A':5, 'B':4, 'C':3, 'D':2, 'E':1, 'no score':0}
    aligned['rating'] = aligned['rating'].map(lambda q: dict_q_to_rating.get(q, q)).astype('int64')

    # modify type: "call, song" -> "['call', 'song']"
    aligned['type'] = "['" + aligned['type'].str.replace(", ", "', '", regex=False) + "']"
    aligned['added'] = True

    return aligned

# Align the additional metadata and preview the first rows.
train_csv_add_df_aligned = Aligne_metadata_added(df=train_csv_add_df)
train_csv_add_df_aligned.head(n=10)

### 2.4 Concat Kaggle metadata and added metadata, dropping files that do not exist

In [None]:
from pathlib import Path

# Shapes before merging
print('before: ')
for frame in (train_csv_df, train_csv_add_df_aligned):
    print(frame.shape)

# Stack the Kaggle rows on top of the additional rows with a fresh 0..n-1 index
train_csv_all_df = pd.concat([train_csv_df, train_csv_add_df_aligned], axis=0).reset_index(drop=True)
# Column 'stem': file name without directory and extension (ex. XC134896)
train_csv_all_df['stem'] = train_csv_all_df['filename'].map(lambda name: Path(name).stem)

# Recordings listed in the metadata whose audio is missing (dead links on the website)
not_exist_list = [
    f'{missing}.wav'
    for missing in (
        'aspfly1/XC775312',
        'comior1/XC881009',
        'hoopoe/XC891005',
        'hoopoe/XC891004',
        'hoopoe/XC798809',
        'hoopoe/XC798808',
        'hoopoe/XC798807',
        'hoopoe/XC798806',
        'hoopoe/XC798805',
        'eaywag1/XC835367',
        'orihob2/XC762524',
    )
]

# Keep only the rows whose file is not in the missing list
is_missing = train_csv_all_df['filename'].isin(not_exist_list)
train_csv_all_df = train_csv_all_df.loc[~is_missing]

# Optional rating filter: for classes with more than 10 recordings,
# drop the recordings rated below the threshold
if only_over4:
    label_counts = train_csv_all_df['primary_label'].value_counts()
    CLASSES_OVER10 = label_counts[label_counts > 10].index.tolist()
    threshold = 4
    low_rated = train_csv_all_df['primary_label'].isin(CLASSES_OVER10) & (train_csv_all_df['rating'] < threshold)
    train_csv_all_df = train_csv_all_df.drop(index=train_csv_all_df.index[low_rated])

# Renumber the rows; drop=False keeps the previous index as an 'index' column
train_csv_all_df = train_csv_all_df.reset_index(drop=False)

# Shape and content after merging
print('after concat: ')
print(train_csv_all_df.shape)
train_csv_all_df

### 2.5 Check statistics

In [None]:
# Summary statistics of the merged metadata
banner = '-' * 15

print(f'\n{banner} shape {banner}')
print(train_csv_all_df.shape)
print(f'\n{banner} columns {banner}')
print(train_csv_all_df.columns)
print(f'\n{banner} info {banner}')
# info() prints its report itself and returns None, so it is not passed to
# display() (that only rendered an extra "None").
train_csv_all_df.info()
print(f'\n{banner} null {banner}')
display(train_csv_all_df.isnull().sum())
print(f'\n{banner} nunique of stem {banner}')
print(train_csv_all_df['stem'].nunique())
print(f'\n{banner} statistics {banner}')
display(train_csv_all_df.describe())
print(f'\n{banner} value_counts {banner}')

count_column = ['primary_label', 'type', 'author', 'rating']
for column in count_column:
    print('\n')
    display(train_csv_all_df[column].value_counts())

# 3. Split Fold

### 3.1 split fold

In [None]:
from sklearn.model_selection import StratifiedKFold

def split_fold(df:pd.DataFrame, n_folds:int=5, seed:int=42):
    """Assign a stratified cross-validation fold number to every row.

    Adds (or overwrites) an integer column 'fold' on ``df`` in place; the
    stratification target is the 'primary_label' column.

    Args:
        df: metadata with a 'primary_label' column.
        n_folds: number of folds (default 5, the previously hard-coded value).
        seed: random seed of the shuffled split (default 42, as before).

    Returns:
        The same DataFrame object, with the 'fold' column filled in.
    """
    df['fold'] = -1

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

    # split() yields POSITIONS, not index labels, so the assignment goes through
    # iloc. (The previous df.loc[...] was only correct for a 0..n-1 index.)
    fold_col = df.columns.get_loc('fold')
    for fold, (_, test_pos) in enumerate(skf.split(df, df['primary_label'])):
        df.iloc[test_pos, fold_col] = fold

    return df
        
# Assign the folds, preview the result and persist it next to the notebook.
train_csv_all_df = split_fold(train_csv_all_df)

display(train_csv_all_df.head())
train_csv_all_df.to_csv('train_csv_all.csv', index=False)

# Optional check: label counts per fold (switch view_data on to print them)
view_data = False
if view_data:
    with pd.option_context('display.max_rows', 300):
        fold_label_counts = train_csv_all_df.groupby('fold')['primary_label'].value_counts()
        print(fold_label_counts.head(300))

# 4. Pre-Processing
##### Build the dataset here.
This step is needed so that the data can be used at inference time.

In [None]:
if make_dataset:
    class preprocessing():
        """Turn every audio file under a directory into a feature and pickle the results.

        Feature extractors are the methods whose name starts with "func"; each
        one reads the waveform most recently loaded by load_wave() (self.y,
        self.sr) and returns an array. execute() runs the extractors listed in
        its func_list over the whole directory.
        """

        def __init__(self, AUDIO_DIRECTORY, SAVE_DIRECTORY, view=False):
            """Store the configuration and create the output directories.

            Args:
                AUDIO_DIRECTORY: directory that contains one sub-directory per species.
                SAVE_DIRECTORY: directory the pickled features are written to.
                view: when True, the extractors also print/plot what they compute.
            """
            # config
            self.AUDIO_DIRECTORY = AUDIO_DIRECTORY
            self.SAVE_DIRECTORY = SAVE_DIRECTORY
            self.view = view

            # make directory: one sub-directory per "func*" method, named after the
            # text following the last underscore (func_melspec -> "melspec")
            func_names = [method for method in dir(self) if callable(getattr(self, method)) and method.startswith("func")]
            print(func_names)
            os.makedirs(self.SAVE_DIRECTORY, exist_ok=True)
            for func_name in func_names:
                func = func_name.split('_')[-1]
                os.makedirs(self.SAVE_DIRECTORY + '/' + func, exist_ok=True)

        # apply and save
        def apply_func(self, function):
            """Apply one extractor to every audio file and save all results in one pickle.

            Args:
                function: a bound "func_*" method of this object.
            """
            train_dict = {}
            species_list = os.listdir(self.AUDIO_DIRECTORY)
            for species in species_list:
                species_path = self.AUDIO_DIRECTORY + '/' + species
                audio_file_list = os.listdir(species_path)
                for audio_file in audio_file_list:
                    audio_filepath = species_path +'/' + audio_file
                    # load audio
                    self.load_wave(audio_filepath) 
                    # apply function (the variable is called mfcc, but it holds
                    # whatever feature `function` returns)
                    mfcc = function() 
                    # register to dict, keyed by the file name up to its first "."
                    train_dict[audio_file.split(".")[0]] = mfcc 

            # set function name as filepath name
            # NOTE(review): the path depends only on SAVE_DIRECTORY and the function
            # name. The three instances created below share SAVE_TRAIN, so each
            # execute() writes the same file and the later ones replace the
            # earlier ones — confirm whether that is intended.
            SAVE_PATH = self.SAVE_DIRECTORY + '/' + function.__name__.split('_')[-1] + f'/train.pickle.gz'
            self.save_as_picke_gzip(train_dict, SAVE_PATH)

        def save_as_picke_gzip(self, data, filepath):       
            """Pickle `data` into the gzip-compressed file `filepath`."""
            with gzip.open(filepath, 'wb') as f:
                pickle.dump(data, f)

        def load_wave(self, audio_filepath):
            """Load the first 5 seconds of `audio_filepath` into self.y / self.sr."""
            # pick up first 5 seconds
            # NOTE(review): no `sr` is passed, so librosa chooses the sampling rate — confirm.
            self.y, self.sr = librosa.load(audio_filepath, offset=0, duration=5)

        def func_waveform(self):        
            """Return the raw waveform (optionally playing and plotting it)."""
            if self.view:
                print('waveform shape: ', self.y.shape)
                display(Audio(self.y, rate=self.sr))
                plt.figure(figsize=(10, 4))
                librosa.display.waveshow(self.y, sr=self.sr)
                plt.title('Waveform')
                plt.xlabel('Time (s)')
                plt.ylabel('Amplitude')
                plt.show()
            return self.y

        def func_spec(self):
            """Return the STFT magnitude in dB, relative to its maximum."""
            spec = librosa.amplitude_to_db(np.abs(librosa.stft(self.y)), ref=np.max)

            if self.view:
                print('spec shape: ', spec.shape)
                plt.figure(figsize=(10, 4))
                librosa.display.specshow(spec, sr=self.sr, x_axis='time', y_axis='log')
                plt.colorbar(format='%+2.0f dB')
                plt.title('Spectrogram')
                plt.show()
            return spec

        def func_melspec(self):
            """Return the 128-band mel spectrogram in dB, relative to its maximum."""
            melspec = librosa.feature.melspectrogram(y=self.y, sr=self.sr, n_mels=128)
            melspec_DB = librosa.power_to_db(melspec, ref=np.max)

            if self.view:
                print('melspec shape: ', melspec_DB.shape)
                plt.figure(figsize=(10, 4))
                librosa.display.specshow(melspec_DB, sr=self.sr, x_axis='time', y_axis='mel')
                plt.colorbar(format='%+2.0f dB')
                plt.title('Mel Spectrogram')
                plt.show()
            return melspec_DB

        def func_scalogram(self):
            """Return the magnitude of a continuous wavelet transform ('cmor' wavelet, 100 scales)."""
            scales = pywt.central_frequency('cmor') / np.linspace(1, 100, 100) * self.sr
            cwtmatr, freqs = pywt.cwt(self.y, scales, 'cmor', sampling_period=1/self.sr)

            if self.view:
                print('scarogram shape: ', cwtmatr.shape)
                plt.figure(figsize=(10, 4))
                plt.imshow(abs(cwtmatr), aspect='auto', extent=[0, len(self.y) / self.sr, 1, 100], cmap='jet', origin='lower')
                plt.colorbar()
                plt.title('Scalogram')
                plt.xlabel('Time (s)')
                plt.ylabel('Scale')
                plt.show()
            # to real value
            return abs(cwtmatr)

        def func_chromagram(self):
            """Return the constant-Q chromagram."""
            C = librosa.feature.chroma_cqt(y=self.y, sr=self.sr)

            if self.view:
                print('chromagram shape: ', C.shape)
                plt.figure(figsize=(10, 4))
                librosa.display.specshow(C, sr=self.sr, x_axis='time', y_axis='chroma', cmap='coolwarm')
                plt.colorbar()
                plt.title('Chromagram')
                plt.show()
            return C


        def func_mfcc(self): 
            """Return the MFCCs (librosa defaults)."""
            mfcc = librosa.feature.mfcc(y=self.y, sr=self.sr)

            if self.view:
                print('mfcc shape: ', mfcc.shape)
                plt.figure(figsize=(10, 4))
                librosa.display.specshow(mfcc, sr=self.sr, x_axis='time')
                plt.ylabel('MFCC coeffs')
                plt.colorbar()
                plt.title('MFCC')
                plt.show()
            return mfcc

        def func_spectralcontrast(self):
            """Return the spectral contrast (librosa defaults)."""
            contrast = librosa.feature.spectral_contrast(y=self.y, sr=self.sr)

            if self.view:
                print('contrast shape: ', contrast.shape)
                plt.figure(figsize=(10, 4))
                librosa.display.specshow(contrast, x_axis='time')
                plt.colorbar()
                plt.ylabel('Frequency bands')
                plt.title('Spectral Contrast')
                plt.show()
            return contrast


        def execute(self):
            """Run every extractor enabled in func_list over the audio directory."""
            # Only the uncommented entries are computed; currently just the scalogram.
            func_list = [
    #             self.func_waveform,
    #             self.func_spec,
    #             self.func_melspec,
                self.func_scalogram,
    #             self.func_chromagram,
    #             self.func_mfcc,
    #             self.func_spectralcontrast,
            ]
            for func in func_list:
                self.apply_func(func)

    # Define one preprocessing object per audio source (all share SAVE_TRAIN)
    preprocessing_kaggle = preprocessing(KAGGLE_TRAIN, SAVE_TRAIN)
    preprocessing_added_train_1 = preprocessing(ADDED_TRAIN_1, SAVE_TRAIN)
    preprocessing_added_train_2 = preprocessing(ADDED_TRAIN_2, SAVE_TRAIN)

    # Execute preprocessing
    preprocessing_kaggle.execute()
    preprocessing_added_train_1.execute()
    preprocessing_added_train_2.execute()



    print(os.listdir(SAVE_TRAIN))


# 5. Create Dataset

### 5.1 Configurations

In [None]:
class CFG:
    """Experiment configuration, read as class attributes (CFG.<name>)."""
    # Alternative backbones, kept for reference:
    #     model_name = "efficientnetv2_rw_s.ra2_in1k"  # input size: 288
    #     model_name = "efficientvit_b0.r224_in1k"   # input size: 224
    model_name = "efficientnet_b0.ra_in1k"   # model name passed to timm; input size: 224

    img_size = 224                  # height and width the images are resized to
    folds = 5                       # number of folds
    interpolation = cv2.INTER_AREA  # interpolation method used when resizing (the default would be cv2.INTER_LINEAR)
    max_epoch = 9                   # maximum number of epochs; one epoch is one pass over the training dataset
    batch_size = 32                 # number of samples passed to the network in one training step
    lr = 1.0e-03                    # learning rate: step size used when updating the model's weights
    weight_decay = 1.0e-02          # weight decay: regularization term that helps prevent overfitting
    es_patience = 5                 # early-stopping patience in epochs (presumably; its use is not visible here)
    seed = 1086                     # random seed
    deterministic = True            # value assigned to torch.backends.cudnn.deterministic
    enable_amp = False              # Automatic Mixed Precision on/off
    device = "cuda"                 # device for the model; "cuda" is an NVIDIA GPU
    simple_training = False         # when True, train on only the first n_simple rows
    n_simple = 100                  # number of rows kept when simple_training is True
    ten_tweny_sec = False           # use the 10 or 20 second data instead of the 5 second data

##### 5.1.1 Install

In [None]:
import pandas as pd

# The class list is taken from the submission header: every column except the first (row_id).
sample_submission = pd.read_csv('/kaggle/input/birdclef-2024/sample_submission.csv')
TRAIN_IMAGE = Path('/kaggle/input/bird2024-melspec-v6/train_image/melspec')
CLASSES = sample_submission.columns.values[1:]
N_CLASSES = len(CLASSES)

# Quick experiments: keep only the first n_simple rows of the metadata.
if CFG.simple_training:
    train_csv_all_df = train_csv_all_df.iloc[:CFG.n_simple]

print(CLASSES)
print(N_CLASSES)

### 5.2 Utility Functions

In [None]:
import albumentations as A
from albumentations.pytorch import ToTensorV2
import cv2

# for dataset
def get_path_label(val_fold, train_csv_all_df: pd.DataFrame):
    """
    Get file paths and labels of the training and validation split.

    Rows whose 'fold' equals ``val_fold`` form the validation split, all other
    rows the training split. The image path of a row is TRAIN_IMAGE joined with
    its 'filename', with the audio extension replaced by ".npy".

    When CFG.ten_tweny_sec is set, the "_10" variant of each file is used and
    rows whose file does not exist are left out of BOTH the data and the
    returned indices, so paths, labels and indices always stay aligned.

    Args:
        val_fold: fold number used for validation.
        train_csv_all_df: metadata with 'filename', 'primary_label' and 'fold'.

    Returns:
        (train_data, val_data, train_idx, val_idx): the two data dicts have the
        keys "image_paths" and "labels"; the idx arrays hold the DataFrame
        index values of the rows in each split.
    """
    filenames = train_csv_all_df["filename"].values
    labels = train_csv_all_df['primary_label'].values
    is_val = train_csv_all_df["fold"].values == val_fold

    img_paths = []
    keep = np.ones(len(filenames), dtype=bool)
    for pos, filename in enumerate(filenames):
        if CFG.ten_tweny_sec:
            # NOTE(review): the suffix is appended BEFORE the extension is replaced,
            # so "a.ogg" becomes "a.npy_10" — confirm this matches the file names on disk.
            img_path = (TRAIN_IMAGE / (filename + '_10').replace(".ogg", ".npy").replace(".wav", ".npy"))
            keep[pos] = img_path.exists()
        else:
            img_path = TRAIN_IMAGE / filename.replace(".ogg", ".npy").replace(".wav", ".npy")
        img_paths.append(img_path)

    # Work with row POSITIONS: the lists above are positional, and the frame's
    # index is not guaranteed to be 0..n-1.
    train_pos = np.flatnonzero(keep & ~is_val)
    val_pos = np.flatnonzero(keep & is_val)

    train_data = {
        "image_paths": [img_paths[pos] for pos in train_pos],
        "labels": [labels[pos] for pos in train_pos]}

    val_data = {
        "image_paths": [img_paths[pos] for pos in val_pos],
        "labels": [labels[pos] for pos in val_pos]}

    index_values = train_csv_all_df.index.values
    return train_data, val_data, index_values[train_pos], index_values[val_pos]

# make correspondence list of label and number
def make_corresponding_list():
    """
    Build the two lookup tables between class label and class number.

    The number of a label is its position in the global CLASSES
    (e.g. asbfly -> 0, ashdro1 -> 1, ...).

    return: (LABEL2NUM, NUM2LABEL)
        LABEL2NUM: dict mapping label (str) -> number (int)
        NUM2LABEL: dict mapping number (int) -> label (str)
    """
    # The previous version also walked TRAIN_IMAGE and built path/ID lists and an
    # ID -> label dict, but none of them was returned or used; that dead work is
    # removed. NUM2LABEL is now built once after the loop, so it also exists
    # when CLASSES is empty.
    LABEL2NUM = {str(_class): i for i, _class in enumerate(CLASSES)}
    NUM2LABEL = {number: label for label, number in LABEL2NUM.items()}

    return LABEL2NUM, NUM2LABEL

# set seed
def set_random_seed(seed: int = 42, deterministic: bool = False):
    """Seed Python, NumPy and PyTorch, and set cuDNN's deterministic flag.

    Args:
        seed: value handed to every random number generator.
        deterministic: value assigned to torch.backends.cudnn.deterministic.
    """
    seeders = (
        random.seed,
        np.random.seed,
    )
    for seed_fn in seeders:
        seed_fn(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    for torch_seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
        torch_seed_fn(seed)  # type: ignore
    torch.backends.cudnn.deterministic = deterministic  # type: ignore

# set tensor to device
def to_device(tensors: tp.Union[tp.Tuple[torch.Tensor], tp.Dict[str, torch.Tensor]], device: torch.device, *args, **kwargs):
    """Move a tensor, or a tuple/list/dict of tensors, to ``device``.

    Args:
        tensors: a single tensor, a tuple or list of tensors, or a dict whose
            values are tensors.
        device: target device.
        *args, **kwargs: forwarded to ``Tensor.to`` (e.g. ``non_blocking=True``).

    Returns:
        A tuple for tuple/list input (so it can be unpacked AND reused, unlike
        the one-shot generator returned before), a dict with the same keys for
        dict input, otherwise the moved tensor.
    """
    # Lists are accepted as well as tuples; passing a list used to fall through
    # to the last branch and fail on `list.to`.
    if isinstance(tensors, (tuple, list)):
        return tuple(t.to(device, *args, **kwargs) for t in tensors)
    if isinstance(tensors, dict):
        return {
            k: t.to(device, *args, **kwargs) for k, t in tensors.items()}
    return tensors.to(device, *args, **kwargs)


# global
LABEL2NUM, NUM2LABEL = make_corresponding_list()

### 5.3 Augmentations

In [None]:
def get_transforms_mixup(CFG):
    """Build the transforms applied after the mixup step.

    Training: two CoarseDropout steps (each applied with probability 0.5, one
    hole at most) followed by conversion to a tensor. The first hole may span
    the full image height and up to a fifth of the width, the second the full
    width and up to a seventh of the height.
    Validation: resize to CFG.img_size, then conversion to a tensor.

    Returns:
        (train_transform, val_transform)
    """
    size = CFG.img_size
    # keyword arguments shared by both dropout steps
    shared_dropout_args = dict(
        max_holes=1, min_holes=None, min_height=None, min_width=None,
        fill_value=0, mask_fill_value=None, p=0.5)

    tall_hole = A.CoarseDropout(max_height=size, max_width=int(size/5), **shared_dropout_args)
    wide_hole = A.CoarseDropout(max_height=int(size/7), max_width=size, **shared_dropout_args)
    train_transform = A.Compose([tall_hole, wide_hole, ToTensorV2(p=1.0)])

    resize = A.Resize(height=size, width=size, interpolation=CFG.interpolation, p=1.0)
    val_transform = A.Compose([resize, ToTensorV2(p=1.0)])

    return train_transform, val_transform

    
def get_transforms(CFG):
    """Build the base transforms: both splits are only resized to CFG.img_size.

    No tensor conversion happens here, so the images stay NumPy arrays.

    Returns:
        (train_transform, val_transform) — two separate but identical pipelines.
    """
    def _resize_only():
        # one fresh pipeline per call, so the two splits do not share an object
        return A.Compose([
            A.Resize(height=CFG.img_size, width=CFG.img_size, interpolation=CFG.interpolation, p=1.0),
        ])

    train_transform = _resize_only()
    val_transform = _resize_only()
    return train_transform, val_transform

### 5.4 Dataset

In [None]:
class Bird2024Dataset(Dataset):
    """Dataset of pre-computed images stored as .npy files, with one-hot targets.

    Each item is ``(image, one_hot_label)``: the array loaded from the path and
    passed through ``transform``, and a vector of length N_CLASSES that is 1 at
    the position of the sample's label (looked up in LABEL2NUM) and 0 elsewhere.
    """

    def __init__(
        self,
        image_paths: tp.Sequence[FilePath],
        labels: tp.Sequence[Label],
        transform: A.Compose,
    ):
        self.transform = transform
        self.train_path_list = image_paths
        self.label_list = labels

    def __len__(self):
        """Number of samples."""
        return len(self.train_path_list)

    def __getitem__(self, index:int):
        """Load, transform and label the sample at position ``index``."""
        image = self._apply_transform(np.load(self.train_path_list[index]))

        # label name -> class number -> one-hot vector
        class_number = LABEL2NUM[self.label_list[index]]
        one_hot_y = np.zeros(N_CLASSES)
        one_hot_y[class_number] = 1

        return (image, one_hot_y)

    def _apply_transform(self, img:np.ndarray):
        """Run the albumentations pipeline and return its "image" entry."""
        return self.transform(image=img)["image"]


### 5.5 Dataset with Mixup

In [None]:
def mixup(image1, image2, label1, label2, alpha=0.5):
    """Blend two samples half-and-half and rescale the image to 0..255.

    Args:
        image1, image2: arrays of the same shape.
        label1, label2: label vectors of the same shape.
        alpha: accepted for interface compatibility; the blend ratio is fixed
            at 0.5 and this argument is not used.

    Returns:
        (mixed_image, mixed_label): the blended image as uint8, min-max scaled
        to the range 0..255, and the average of the two labels.
    """
    p = 0.5
    mixed_image = ((p * image1) + ((1 - p) * image2)).astype(np.single)
    mixed_label = (p * label1) + ((1 - p) * label2)

    # Shift so that the minimum is 0 ...
    mixed_image = mixed_image - mixed_image.min()
    # ... then stretch to 0..255. A constant image has peak 0 after the shift;
    # it is left as all zeros instead of being divided by zero.
    peak = mixed_image.max()
    if peak > 0:
        mixed_image = mixed_image / peak * 255
    mixed_image = mixed_image.astype(np.uint8)

    return mixed_image, mixed_label

class MixupDataset(torch.utils.data.Dataset):
    """Wrap a dataset and blend an item with a random partner some of the time.

    With probability ``alpha`` the requested item is mixed (via ``mixup``) with
    another item drawn uniformly from the wrapped dataset; otherwise it is
    returned as is. When a ``transform`` is given it is applied to the final
    image, and the result is converted to float.
    """

    def __init__(self, dataset, alpha=0.5, transform=None):
        self.transform = transform
        self.alpha = alpha
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        image, label = self.dataset[idx]

        # draw once; below alpha means "mix this item"
        if np.random.rand() < self.alpha:
            partner_idx = np.random.randint(0, len(self.dataset))
            partner_image, partner_label = self.dataset[partner_idx]
            image, label = mixup(image, partner_image, label, partner_label, self.alpha)

        if self.transform:
            image = self.transform(image=image)["image"].float()

        return (image, label)

# 6. Model

In [None]:
import timm
import torch
from torch import nn

class BirdCLEF2024SpecModel(nn.Module):
    """Thin wrapper around a timm model; forward() returns the model's output unchanged."""

    def __init__(
            self,
            model_name: str,
            pretrained: bool,
            in_channels: int,
            num_classes: int,
        ):
        super().__init__()
        # timm names the input-channel argument `in_chans`
        backbone_args = dict(
            model_name=model_name,
            pretrained=pretrained,
            num_classes=num_classes,
            in_chans=in_channels,
        )
        self.model = timm.create_model(**backbone_args)

    def forward(self, x):
        return self.model(x)

model = BirdCLEF2024SpecModel(model_name=CFG.model_name, pretrained=True, num_classes=N_CLASSES, in_channels=1)

### Visualize

In [None]:
def show_batch(ds, row=3, col=3):
    """Plot ``row * col`` randomly chosen samples of a dataset in a grid.

    Args:
        ds: dataset whose items are ``(image, label)`` pairs.
        row: number of grid rows.
        col: number of grid columns.
    """
    fig = plt.figure(figsize=(10, 10))
    # randint's upper bound is exclusive, so len(ds) makes every sample eligible
    # (len(ds)-1 never picked the last one and failed for a one-item dataset).
    img_index = np.random.randint(0, len(ds), row*col)

    for i, sample_idx in enumerate(img_index):
        img, label = ds[sample_idx]

        if isinstance(img, torch.Tensor):
            # .cpu() so that tensors living on the GPU can be converted as well
            img = np.squeeze(img.detach().cpu().numpy())

        ax = fig.add_subplot(row, col, i + 1, xticks=[], yticks=[])
        ax.imshow(img, cmap='jet')
        ax.set_title(f'ID: {sample_idx}; Target: {label}')

    plt.tight_layout()
    plt.show()
    



# Build the fold-0 training dataset once and look at a few samples.
_train_path_label, _val_path_label, _, _ = get_path_label(0, train_csv_all_df)
_train_transform, _val_transform = get_transforms(CFG)
_train_transform_mixup, _val_transform_mixup = get_transforms_mixup(CFG)

# Base dataset (resize only) wrapped by the mixup dataset.
# To inspect the plain dataset instead, use _base_dataset as _train_dataset.
_base_dataset = Bird2024Dataset(**_train_path_label, transform=_train_transform)
_train_dataset = MixupDataset(_base_dataset, transform=_train_transform_mixup)

# Fetch one item to make sure the pipeline runs end to end.
test_input, test_target = _train_dataset[0]

show_batch(_train_dataset)

# 7. Training

In [None]:
class FocalLossBCE(torch.nn.Module):
    """Weighted sum of BCE-with-logits loss and sigmoid focal loss.

    Args:
        alpha: ``alpha`` passed to the sigmoid focal loss.
        gamma: ``gamma`` passed to the sigmoid focal loss.
        reduction: reduction used by both loss terms.
        bce_weight: multiplier of the BCE term.
        focal_weight: multiplier of the focal term.
    """

    def __init__(
        self,
        alpha: float = 0.25,
        gamma: float = 2,
        reduction: str = "mean",
        bce_weight: float = 1.0,
        focal_weight: float = 1.0,
    ):
        super().__init__()
        self.alpha, self.gamma = alpha, gamma
        self.reduction = reduction
        self.bce = torch.nn.BCEWithLogitsLoss(reduction=reduction)
        self.bce_weight, self.focal_weight = bce_weight, focal_weight

    def forward(self, inputs, targets):
        """Return ``bce_weight * BCE + focal_weight * focal`` for logits ``inputs``."""
        focal_kwargs = dict(alpha=self.alpha, gamma=self.gamma, reduction=self.reduction)
        focal_term = torchvision.ops.focal_loss.sigmoid_focal_loss(
            inputs=inputs, targets=targets, **focal_kwargs)
        bce_term = self.bce(inputs, targets)
        return self.bce_weight * bce_term + self.focal_weight * focal_term

In [None]:
def train_one_fold(CFG, val_fold, train_all, output_path):
    """Train one cross-validation fold and snapshot the model at its best epochs.

    Args:
        CFG: Config object. This function reads ``seed``, ``deterministic``,
            ``device``, ``batch_size``, ``model_name``, ``lr``, ``weight_decay``,
            ``max_epoch``, ``enable_amp`` and ``es_patience``; it is also passed
            to ``get_transforms`` / ``get_transforms_mixup``.
        val_fold: Fold id handed to ``get_path_label`` to select the split.
        train_all: Full training table handed to ``get_path_label``.
        output_path: ``pathlib.Path`` directory that receives
            ``snapshot_epoch_{epoch}.pth`` whenever the validation loss improves.

    Returns:
        Tuple ``(val_fold, best_epoch, best_val_loss)``.
    """
    # If True, forces cuDNN to benchmark multiple convolution algorithms and choose the fastest one
    torch.backends.cudnn.benchmark = True
    set_random_seed(CFG.seed, deterministic=CFG.deterministic)
    # set device with pytorch env
    device = torch.device(CFG.device)

    # Split the table given by the caller (not a notebook-level global).
    train_path_label, val_path_label, _, _ = get_path_label(val_fold, train_all)
    train_transform, val_transform = get_transforms(CFG)
    train_transform_mixup, val_transform_mixup = get_transforms_mixup(CFG)

    # Both datasets are built from THIS fold's split and transforms. Using the
    # underscore-prefixed notebook globals here would train every fold on the
    # fold-0 demo split and leak validation samples into training.
    train_dataset = MixupDataset(
        Bird2024Dataset(**train_path_label, transform=train_transform),
        transform=train_transform_mixup,
    )
    # NOTE(review): validation is also wrapped in MixupDataset; confirm that
    # val_transform_mixup does not actually mix samples.
    val_dataset = MixupDataset(
        Bird2024Dataset(**val_path_label, transform=val_transform),
        transform=val_transform_mixup,
    )

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=CFG.batch_size, num_workers=4, shuffle=True, drop_last=True)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=CFG.batch_size, num_workers=4, shuffle=False, drop_last=False)

    model = BirdCLEF2024SpecModel(
        model_name=CFG.model_name,
        pretrained=True,
        num_classes=N_CLASSES,
        in_channels=1
    )
    model.to(device)

    optimizer = optim.AdamW(params=model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)
    # The scheduler is stepped once per batch, hence steps_per_epoch=len(train_loader).
    scheduler = lr_scheduler.OneCycleLR(
        optimizer=optimizer, epochs=CFG.max_epoch,
        pct_start=0.0, steps_per_epoch=len(train_loader),
        max_lr=CFG.lr, div_factor=25, final_div_factor=4.0e-01
    )

    # Training loss (alternatives kept in the notebook history:
    # KLDivLossWithLogits, nn.CrossEntropyLoss, FocalLossBCE).
    loss_func = nn.BCEWithLogitsLoss()
    loss_func.to(device)

    # Validation loss; evaluated on CPU tensors, so it is not moved to `device`.
    loss_func_val = nn.BCEWithLogitsLoss()

    use_amp = CFG.enable_amp
    scaler = amp.GradScaler(enabled=use_amp)

    best_val_loss = 1.0e+09
    best_epoch = 0

    for epoch in range(1, CFG.max_epoch + 1):
        epoch_start = time()
        # Per-epoch accumulators, averaged over the number of batches below.
        train_loss = 0
        val_loss = 0

        model.train()
        for batch in train_loader:
            x, t = batch
            x = to_device(x, device)
            t = to_device(t, device)

            optimizer.zero_grad()
            with amp.autocast(use_amp):
                y = model(x)
                loss = loss_func(y, t)

            # Scaled backward pass / optimizer step for mixed-precision training.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            train_loss += loss.item()
            scheduler.step()

        train_loss /= len(train_loader)

        model.eval()
        for batch in val_loader:
            x, t = batch
            x = to_device(x, device)
            with torch.no_grad(), amp.autocast(use_amp):
                y = model(x)
            # Back to float32 on CPU so the loss is computed against the CPU targets.
            y = y.detach().cpu().to(torch.float32)
            loss = loss_func_val(y, t)
            val_loss += loss.item()
        val_loss /= len(val_loader)

        # Snapshot only when the validation loss improves.
        if val_loss < best_val_loss:
            best_epoch = epoch
            best_val_loss = val_loss
            torch.save(model.state_dict(), str(output_path / f'snapshot_epoch_{epoch}.pth'))

        elapsed_time = time() - epoch_start
        print(
            f"[epoch {epoch}] train loss: {train_loss: .6f}, val loss: {val_loss: .6f}, elapsed_time: {elapsed_time: .3f}")

        # Stop once more than es_patience epochs have passed without improvement.
        if epoch - best_epoch > CFG.es_patience:
            print("Early Stopping!")
            break

    return val_fold, best_epoch, best_val_loss

In [None]:
# Train each fold in turn; score_list collects the tuple returned by
# train_one_fold for every fold.
score_list = []
for fold_id in range(CFG.folds):
    # Snapshots of this fold are written to ./fold{fold_id}/
    fold_dir = Path(f"fold{fold_id}")
    fold_dir.mkdir(exist_ok=True)
    print(f"[fold{fold_id}]")
    fold_result = train_one_fold(CFG, fold_id, train_csv_all_df, fold_dir)
    score_list.append(fold_result)

# 8. Validation

In [None]:
# Per-fold training results: one (fold id, best epoch, best validation loss)
# tuple per fold, as returned by train_one_fold.
print(score_list)

##### Select the best model and delete the others

In [None]:
# Promote each fold's best snapshot to ./best_model_fold{fold_id}.pth and then
# delete the per-epoch snapshots left in that fold's directory.
for (fold_id, best_epoch, _) in score_list:

    # select the best model
    exp_dir_path = Path(f"fold{fold_id}")
    best_model_path = exp_dir_path / f"snapshot_epoch_{best_epoch}.pth"
    copy_to = f"./best_model_fold{fold_id}.pth"

    if best_model_path.exists():
        # copy to new place
        shutil.copy(best_model_path, copy_to)
    elif not Path(copy_to).exists():
        # Neither the snapshot nor an earlier promoted copy is available.
        raise FileNotFoundError(f"{best_model_path} not found and {copy_to} does not exist")
    # Otherwise the snapshot was already promoted and removed by a previous run
    # of this cell, so re-running is a no-op instead of an error.

    for p in exp_dir_path.glob("*.pth"):
        # delete
        p.unlink()

##### Function for Inference

In [None]:
# Function for inference
def run_inference_loop(model, loader, device):
    """Run ``model`` over ``loader`` without gradients and stack the outputs.

    Only the first element of each batch is used as model input. The model
    output is passed through a softmax over dim 1, and the per-batch results are
    concatenated along axis 0 into a single numpy array, which is returned.
    """
    model.to(device)
    model.eval()

    batch_probs = []
    with torch.no_grad():
        for batch in tqdm(loader):
            inputs = to_device(batch[0], device)
            logits = model(inputs)
            # NOTE(review): softmax is applied here although train_one_fold
            # optimises BCEWithLogitsLoss; switch to torch.sigmoid(logits) if
            # independent per-class probabilities are wanted.
            probs = torch.softmax(logits, dim=1)
            batch_probs.append(probs.detach().cpu().numpy())

    # Stack the per-batch arrays vertically (one row per sample).
    return np.concatenate(batch_probs)

##### Inference for validation data (out-of-fold predictions)

In [None]:
# Out-of-fold (OOF) inference: predict each fold's validation split with that
# fold's best saved model.
after_runned = True
if after_runned:

    # One row per row of train_csv_all_df, one column per class.
    # presumably val_idx holds positional row indices into train_csv_all_df;
    # verify against get_path_label.
    oof_pred_arr = np.zeros((len(train_csv_all_df), N_CLASSES))
    # score_list is deliberately NOT reset here: it still holds the per-fold
    # training results and this cell never fills it.
    device = torch.device(CFG.device)

    for fold_id in range(CFG.folds):
        print(f"\n[fold {fold_id}]")

        # Validation split and transforms of this fold
        _, val_path_label, _, val_idx = get_path_label(fold_id, train_csv_all_df)
        _, val_transform = get_transforms(CFG)
        _, val_transform_mixup = get_transforms_mixup(CFG)

        # NOTE(review): validation data is wrapped in MixupDataset; confirm that
        # val_transform_mixup does not mix samples, otherwise the predictions
        # will not correspond to the single-label targets used for the CV score.
        val_dataset = MixupDataset(
            Bird2024Dataset(**val_path_label, transform=val_transform),
            transform=val_transform_mixup,
        )
        val_loader = torch.utils.data.DataLoader(
            val_dataset, batch_size=CFG.batch_size, num_workers=4, shuffle=False, drop_last=False)

        # get saved model
        model_path = f"./best_model_fold{fold_id}.pth"
        model = BirdCLEF2024SpecModel(
            model_name=CFG.model_name,
            pretrained=False,
            num_classes=N_CLASSES,
            in_channels=1,
        )
        model.load_state_dict(torch.load(model_path, map_location=device))

        # inference
        val_pred = run_inference_loop(model, val_loader, device)
        oof_pred_arr[val_idx] = val_pred

        # Release per-fold objects before loading the next model.
        del val_idx, val_path_label
        del model, val_loader
        torch.cuda.empty_cache()
        gc.collect()

##### Calculate CV score

In [None]:
from sklearn.metrics import roc_auc_score

# Integer class index of every training row's primary label.
label_arr = train_csv_all_df['primary_label'].apply(lambda name: LABEL2NUM[name]).to_numpy()

# One-hot ground truth: row i of the identity matrix is the one-hot vector of class i.
ture_arr = pd.DataFrame(np.eye(N_CLASSES)[label_arr], columns=CLASSES)

# Out-of-fold predictions with the same column layout as the ground truth.
oof = pd.DataFrame(oof_pred_arr, columns=CLASSES)

# NOTE(review): this reports the micro average, while the metric described at
# the top of the notebook is the macro-averaged ROC-AUC; confirm this is intended.
micro_roc_auc_ovr = roc_auc_score(ture_arr, oof, multi_class="ovr", average="micro")

print(f"CV: Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.10f}")

In [None]:
# Sanity check: show the sum of the first OOF row. It should be close to 1,
# since run_inference_loop applies a softmax over the class dimension.
display(oof.loc[0].sum())

In [None]:
# Eyeball the predictions next to the one-hot ground truth, first at the top
# and then at the bottom of the tables.
display(
    oof.head(),
    ture_arr.head(),
    oof.tail(),
    ture_arr.tail(),
)

#### END