In [1]:
%matplotlib inline
import matplotlib_inline   # setup output image format
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')
import matplotlib.pyplot as plt
import matplotlib
from PIL import Image
import numpy as np
from sklearn import *
from scipy import stats
np.random.seed(100)
import csv
from scipy import io
import pickle
from IPython.display import Audio, display, Image
import os.path
from pathlib import Path
from tqdm.notebook import tqdm

In [2]:
import librosa
import pandas as pd

In [3]:
import torchaudio
import torchvision
import torch
print('PyTorch version:{}, Device:{}'.format(torch.__version__, torch.device('cuda' if torch.cuda.is_available() else 'cpu')))

PyTorch version:1.13.0+cpu, Device:cpu


In [4]:
class Config:
    seed = 46
    n_fft = 2048
    duration = 5
    sr = 32000
    hop_length = 512
    fmin = 20
    fmax = 12000
    n_mels = 128
    n_mfcc = 20

    n_fold = 5
    n_class = 264
    train_df_path = '/kaggle/input/produce-data-and-training/dfdata/train_df.csv'
    test_df_path = '/kaggle/input/produce-data-and-training/dfdata/test_df.csv'
    test_dir_path = '/kaggle/input/birdclef-2023/test_soundscapes'
    bestmodel = '/kaggle/input/produce-data-and-training/effmodel/bestmodel.pth'
    
    batch_size = 32
    num_workers = 2
    device = torch.device('cpu')
    epochs = 12
    PRECISION = 16    
    PATIENCE = 8    
    pretrained = True            
    weight_decay = 1e-3
    lr = 1e-3
    mix_up_alpha = 0.2

In [5]:
train_df = pd.read_csv(Config.train_df_path)
Config.n_class = len(train_df['primary_label'].unique())
class_name = train_df['primary_label'].unique().tolist()

In [6]:
df_test = pd.DataFrame(
     [(path.stem, *path.stem.split("_"), path) for path in Path(Config.test_dir_path).glob("*.ogg")],
    columns = ["filename", "name" ,"id", "path"]
)

In [7]:
def pad4mel(y, length, pad_mode='constant'):
    repeat = length // len(y) + 1
    y = np.tile(y, repeat)
    if len(y) < length:
        y = np.pad(y, (0, length - len(y)), pad_mode)
    elif len(y) > length:
        y = y[:length]

    return y

class BirdDataset(torch.utils.data.Dataset):

    def __init__(self, df:pd.DataFrame, sr = Config.sr, duration = Config.duration, hop_length=Config.hop_length, fmin=Config.fmin, fmax=Config.fmax, n_fft=Config.n_fft, n_mels=Config.n_mels):

        self.df = df
        self.sr = sr 
        self.duration = duration
        self.hop_length = hop_length
        self.fmin = fmin
        self.fmax = fmax
        self.n_mels = n_mels
        self.n_fft = n_fft

    def __len__(self):
        return len(self.df)
    
    @staticmethod
    def normalize(mel_spectrogram):
        mel_spectrogram = torch.nn.functional.normalize(mel_spectrogram, p=2, dim=0)
        return mel_spectrogram
    
    @staticmethod
    def audio2mel(wav, sr, n_fft, hop_length, n_mels, fmin, fmax):
        S = librosa.feature.melspectrogram(y=wav, sr=sr,n_fft=n_fft, hop_length=hop_length, n_mels=n_mels, fmin=fmin, fmax=fmax)
        S_dB = librosa.power_to_db(S, ref=np.max)
        return S_dB

    def __getitem__(self, idx):
    
        row = self.df.iloc[idx]
        audio_path = row.path
        wav, sr = librosa.load(audio_path)
        if sr!=self.sr:
            wav = librosa.resample(wav, orig_sr=sr, target_sr=self.sr)
        audios = []
        for i in range(0, len(wav), self.duration*self.sr):
            print(i)
            start = i
            end = start + self.duration*self.sr
            audios.append(wav[start:end])
        
        if len(audios[-1])<self.duration*self.sr:
            audios[-1] = pad4mel(audios[-1], self.duration*self.sr)
        mel_data = [self.normalize(torch.tensor(self.audio2mel(audio,self.sr, self.n_fft, self.hop_length, self.n_mels, self.fmin, self.fmax)).float()) for audio in audios]
        mel_data = torch.stack(mel_data)
        return mel_data
            

In [8]:
def trans_data():
    transform = torchvision.transforms.Compose([
    torchvision.transforms.ToPILImage(),
    torchvision.transforms.Resize((224, 224)),
    torchvision.transforms.ToTensor(),
])
    return transform


def inference_model(model, test_ds, channel_num=1, transform=None):
        
    model.eval()    
    predictions = []
    for en in tqdm(range(len(test_ds))):
        inputs = test_ds[en]
        inputs = torch.stack([transform(x.repeat(3, 1, 1)) for x in inputs])
        with torch.no_grad():
            outputs = model(inputs).detach().cpu().numpy()
            print(outputs.shape)
        predictions.append(outputs)
            
    
    return predictions


In [9]:
test_ds = BirdDataset(
    df_test, 
    sr = Config.sr,
    duration = Config.duration,
)

In [10]:
class MelClassifierBiLSTM(torch.nn.Module):

    def __init__(self, n_class, model_name, hidden_size):
        super().__init__()
        if model_name == 'efficientnet_b3':
            self.model = torchvision.models.efficientnet_b3(weights='IMAGENET1K_V1')
        elif model_name == 'efficientnet_b0':
            self.model = torchvision.models.efficientnet_b0(weights='IMAGENET1K_V1')
        elif model_name == 'mobilenet_v2':
            self.model = torchvision.models.mobilenet_v2(weights='IMAGENET1K_V1')
        self.features = self.model.features
        self.n_features = self.model.classifier[1].in_features
        self.lstm = torch.nn.LSTM(self.n_features, hidden_size, batch_first=True, bidirectional=True)
        self.fc1 = torch.nn.Linear(hidden_size*2, hidden_size)
        self.fc2 = torch.nn.Linear(hidden_size, n_class)
        self.dropout = torch.nn.Dropout(0.4)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        x = self.features(x)
        x = x.mean([2, 3])
        x = x.unsqueeze(1)
        x, _ = self.lstm(x)
        x = x.mean(dim=1)
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.dropout(x)
        x = self.sigmoid(x)
        return x

In [11]:
transform = trans_data()
model = torch.load(Config.bestmodel,map_location=torch.device('cpu'))
model.to(Config.device)
print('begin')
prediction = inference_model(model, test_ds, 3, transform)

begin


  0%|          | 0/1 [00:00<?, ?it/s]

0
160000
320000
480000
640000
800000
960000
1120000
1280000
1440000
1600000
1760000
1920000
2080000
2240000
2400000
2560000
2720000
2880000
3040000
3200000
3360000
3520000
3680000
3840000
4000000
4160000
4320000
4480000
4640000
4800000
4960000
5120000
5280000
5440000
5600000
5760000
5920000
6080000
6240000
6400000
6560000
6720000
6880000
7040000
7200000
7360000
7520000
7680000
7840000
8000000
8160000
8320000
8480000
8640000
8800000
8960000
9120000
9280000
9440000
9600000
9760000
9920000
10080000
10240000
10400000
10560000
10720000
10880000
11040000
11200000
11360000
11520000
11680000
11840000
12000000
12160000
12320000
12480000
12640000
12800000
12960000
13120000
13280000
13440000
13600000
13760000
13920000
14080000
14240000
14400000
14560000
14720000
14880000
15040000
15200000
15360000
15520000
15680000
15840000
16000000
16160000
16320000
16480000
16640000
16800000
16960000
17120000
17280000
17440000
17600000
17760000
17920000
18080000
18240000
18400000
18560000
18720000
18880000
1904

In [12]:
filenames = df_test.filename.values.tolist()

bird_cols = list(pd.get_dummies(train_df['primary_label']).columns)
sub_df = pd.DataFrame(columns=['row_id']+bird_cols)



In [13]:
prediction

[array([[3.2452587e-03, 1.4413803e-04, 5.4777003e-05, ..., 3.4488574e-05,
         3.4261293e-06, 5.0824382e-03],
        [1.3851183e-03, 6.4900868e-02, 3.3512020e-03, ..., 7.9497257e-03,
         8.9736581e-03, 9.9733979e-02],
        [4.9505714e-02, 8.0797918e-02, 1.9211065e-02, ..., 3.7812535e-04,
         1.4490727e-03, 1.3117665e-02],
        ...,
        [2.5018054e-04, 1.4725573e-04, 2.6868307e-04, ..., 2.1035408e-05,
         1.7236146e-03, 1.0455451e-02],
        [3.4646393e-04, 2.3493146e-04, 5.3098076e-04, ..., 8.6857917e-06,
         3.1236259e-04, 2.5320848e-02],
        [3.1768624e-04, 2.1734303e-02, 5.9318845e-03, ..., 4.2193140e-05,
         1.4842728e-03, 2.8163530e-02]], dtype=float32)]

In [14]:
for i, file in enumerate(filenames):
    pred = prediction[i]
    num_rows = len(pred)
    row_ids = [f'{file}_{(i+1)*5}' for i in range(num_rows)]
    df = pd.DataFrame(columns=['row_id']+bird_cols)
    
    df['row_id'] = row_ids
    df[bird_cols] = pred
    
    sub_df = pd.concat([sub_df,df]).reset_index(drop=True)
    

In [15]:
sub_df

Unnamed: 0,row_id,abethr1,abhori1,abythr1,afbfly1,afdfly1,afecuc1,affeag1,afgfly1,afghor1,...,yebsto1,yeccan1,yefcan,yelbis1,yenspu1,yertin1,yesbar1,yespet1,yetgre1,yewgre1
0,soundscape_29201_5,0.003245,0.000144,0.000055,1.880557e-07,0.000020,0.001184,0.000109,2.525306e-07,0.000101,...,0.000005,0.000021,0.001845,0.000001,0.000002,0.000007,0.000021,0.000034,0.000003,0.005082
1,soundscape_29201_10,0.001385,0.064901,0.003351,1.492601e-02,0.006959,0.005190,0.039473,1.192670e-02,0.005551,...,0.000228,0.000456,0.037604,0.003086,0.002343,0.006282,0.002592,0.007950,0.008974,0.099734
2,soundscape_29201_15,0.049506,0.080798,0.019211,1.612537e-02,0.004967,0.020676,0.040244,5.504800e-03,0.004813,...,0.000136,0.006166,0.403179,0.020476,0.003436,0.007697,0.003803,0.000378,0.001449,0.013118
3,soundscape_29201_20,0.000188,0.000014,0.000248,1.316954e-04,0.000060,0.000033,0.000203,8.881603e-04,0.000082,...,0.000011,0.000348,0.003556,0.020630,0.000007,0.000249,0.000101,0.000027,0.000057,0.006428
4,soundscape_29201_25,0.000037,0.000030,0.000289,2.847510e-04,0.000415,0.000181,0.000304,7.803887e-03,0.000128,...,0.000007,0.000033,0.002581,0.026353,0.000004,0.000385,0.000287,0.000015,0.000146,0.016664
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,soundscape_29201_580,0.000301,0.001598,0.001242,4.512926e-03,0.002353,0.002324,0.002597,3.243234e-03,0.000897,...,0.000012,0.000612,0.033500,0.009465,0.000018,0.001242,0.001788,0.000301,0.000349,0.020207
116,soundscape_29201_585,0.000313,0.002126,0.000026,5.040270e-05,0.000314,0.000066,0.090556,1.030302e-04,0.017521,...,0.000018,0.000009,0.000682,0.000189,0.008113,0.000208,0.000269,0.000103,0.000597,0.014579
117,soundscape_29201_590,0.000250,0.000147,0.000269,1.259234e-04,0.000555,0.000070,0.000725,4.599689e-03,0.000064,...,0.000008,0.000126,0.003964,0.008142,0.000008,0.000183,0.000254,0.000021,0.001724,0.010455
118,soundscape_29201_595,0.000346,0.000235,0.000531,3.507524e-03,0.003018,0.000148,0.002491,7.473086e-03,0.001344,...,0.000041,0.000376,0.005012,0.010481,0.000014,0.000748,0.000106,0.000009,0.000312,0.025321


In [16]:
sub_df.to_csv('submission.csv',index=False)