In [14]:
import numpy as np
import pandas as pd
import os
import warnings
import joblib
import torch
from glob import glob

In [15]:
class Config:
    class_names = sorted(os.listdir("/root/birdclef2024/inputs/train_audios/"))
    num_classes = len(class_names)
    debug = False
 
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')    

    data_root = "/root/birdclef2024/inputs/"
    train_path = "/root/birdclef2024/inputs/train_audios/train_metadata.csv"
    test_path = '/root/birdclef2024/inputs/test_audios/unlabeled_soundscapes'
    if len(glob(f'{test_path}/*.ogg'))==0:
        test_path = '/kaggle/input/birdclef-2024/unlabeled_soundscapes'

    SR = 32000
    DURATION = 5
    
    infer_duration=10
    
    train_duration=10
    
    # Sed model
#     model_ckpt = [
#         '/kaggle/input/birdclef-openvino-comp/sed_v2s_final_30s_finetune/sed3_120.xml', #v2s
#         '/kaggle/input/birdclef-openvino-comp/sed_se_half_ce/sed_se_120.xml', #seresnext26t
#         '/kaggle/input/birdclef-openvino-comp/sed_b3ns_30s_finetune/sed3_b3ns_120.xml', #b3ns
#     ]
    
    model_ckpt = [
        # '/root/birdclef2024/outputs/sed_b3ns/openvino/exp002_rmv_dupfiles/sed_b3ns.xml'
#         '/kaggle/input/birdclef2024-openvino-models/sed_v2s.xml', #v2s
    ]    
    
    # CNN model
    re_model_ckpt = [
        '/root/birdclef2024/outputs/cnn_b3ns/openvino/exp002_rmv_dupfiles/cnn_b3ns.xml'
        # '/kaggle/input/birdclef2024-openvino-models/cnn_b0ns.xml'
#         '/kaggle/input/birdclef2024-openvino-models/cnn_v2s.xml'
#         '/kaggle/input/birdclef-openvino-comp/openvino_models_comp_half/re_120.xml', #resnet34d
#         '/kaggle/input/birdclef-openvino-comp/re_b3ns_ce/re_b3ns_120.xml', #b3ns
#         '/kaggle/input/birdclef-openvino-comp/re_v2s_30s_finetune/re_v2s_120.xml', #v2s
#         '/kaggle/input/birdclef-openvino-comp/re_b0ns_final/re_b0ns_120.xml', #b0ns
    ]

In [16]:
n_model = len(Config.model_ckpt)+len(Config.re_model_ckpt)

In [17]:
df_train = pd.read_csv(Config.train_path)
Config.num_classes = len(df_train.primary_label.unique())

In [18]:
def pred(df_test,num_workers=1,sleep=0,batch_size=1):
    import openvino.runtime as ov
    core = ov.Core()
    
    import numpy as np
    import pandas as pd
    import torch
    import os
    from torch.utils.data import Dataset, DataLoader
    import warnings

    warnings.filterwarnings('ignore')
    import torch.nn as nn
    import timm
    import librosa as lb
    import soundfile as sf
    from  soundfile import SoundFile 
    import torchaudio

    import torch.nn as nn
    import time
    from torch.nn import functional as F
    from torch.distributions import Beta
    from torch.nn.parameter import Parameter
    from joblib.externals.loky.backend.context import get_context
    #torch.jit.enable_onednn_fusion(True)


    class BirdDatasetSED(torch.utils.data.Dataset):

        def __init__(self, df, sr = Config.SR,n_mels=128, fmin=0, fmax=None, step=None, res_type="kaiser_fast",resample=True, duration = Config.DURATION, train = True):

            self.df = df
            self.sr = sr 
            self.n_mels = n_mels
            self.fmin = fmin
            self.fmax = fmax or self.sr//2

            self.train = train
            self.duration = duration

            self.audio_length = self.duration*self.sr
            self.step = step or self.audio_length

            self.res_type = res_type
            self.resample = resample   

        def __len__(self):
            return len(self.df)

        def read_file(self, filepath):
            #audio, orig_sr = torchaudio.load(filepath)
            #if orig_sr != self.sr:
            #    # sinc_interpolation
            #    resample_transform = torchaudio.transforms.Resample(orig_sr, self.sr, resampling_method="kaiser_window")
            #    audio = resample_transform(audio)

            audio, orig_sr = sf.read(filepath, dtype="float32")

            if self.resample and orig_sr != self.sr:
                audio = lb.resample(audio, orig_sr, self.sr, res_type=self.res_type)

            seconds = []
            for i in range(self.audio_length, len(audio) + self.step, self.step):
                start = max(0, i - self.audio_length)
                end = start + self.audio_length
                if end > len(audio):
                    pass
                else:
                    seconds.append(int(end/self.sr))

            # 端のデータをattentionかけれるようにデータを連結しておく
            audio = np.concatenate([audio,audio,audio])
            audios = []
            for i,second in enumerate(seconds):
                end_seconds = int(second)
                start_seconds = int(end_seconds - Config.DURATION)

                end_index = int(self.sr * (end_seconds + (Config.train_duration - Config.DURATION) / 2) ) + len(audio) // 3
                start_index = int(self.sr * (start_seconds - (Config.train_duration - Config.DURATION) / 2) ) + len(audio) // 3
                end_pad = int(self.sr * (Config.train_duration - Config.DURATION) / 2) 
                start_pad = int(self.sr * (Config.train_duration - Config.DURATION) / 2) 
                y = audio[start_index:end_index].astype(np.float32)
                if i==0:
                    y[:start_pad] = 0
                elif i==(len(seconds)-1):
                    y[-end_pad:] = 0
                audios.append(y)
            audios = np.stack(audios)
            audios = torch.tensor(audios).float().unsqueeze(1)
            spec384,spec256,spec300_another,spec_rev2s=transform_to_spec(audios,train=False)
            return spec384,spec256,spec300_another,spec_rev2s

        def __getitem__(self, idx):

            return self.read_file(self.df.loc[idx, "path"])
        
    # 色んな時間解像度でmelspec_transformを作成
    # 出力は1channel
    hop_length384 = Config.infer_duration*Config.SR // (384-1)
    melspec_transform = torchaudio.transforms.MelSpectrogram(sample_rate=Config.SR, hop_length=hop_length384, n_mels=128, f_min=0, f_max=Config.SR//2, n_fft=2048, center=True, pad_mode='constant',norm='slaney',onesided=True,mel_scale='slaney')
    hop_length256 = Config.infer_duration*Config.SR // (256-1)
    melspec_transform256 = torchaudio.transforms.MelSpectrogram(sample_rate=Config.SR, hop_length=hop_length256, n_mels=128, f_min=0, f_max=Config.SR//2, n_fft=2048, center=True, pad_mode='constant',norm='slaney',onesided=True,mel_scale='slaney')
    #hop_length224 = Config.infer_duration*Config.SR // (224-1)
    #melspec_transform224 = torchaudio.transforms.MelSpectrogram(sample_rate=Config.SR, hop_length=hop_length224, n_mels=128, f_min=0, f_max=Config.SR//2, n_fft=2048, center=True, pad_mode='constant',norm='slaney',onesided=True,mel_scale='slaney')
    hop_length300 = Config.infer_duration*Config.SR // (300-1)
    melspec_transform300 = torchaudio.transforms.MelSpectrogram(sample_rate=Config.SR, hop_length=hop_length300, n_mels=128, f_min=50, f_max=14000, n_fft=1024, center=True, pad_mode='constant',norm='slaney',onesided=True,mel_scale='slaney')
    melspec_transform_rev2s = torchaudio.transforms.MelSpectrogram(sample_rate=Config.SR, hop_length=320, n_mels=64, f_min=50, f_max=14000, n_fft=1024, center=True, pad_mode='constant',norm='slaney',onesided=True,mel_scale='slaney')
    
    db_transform = torchaudio.transforms.AmplitudeToDB(stype='power',top_db=80)

    def transform_to_spec(audio,train=True):
        import math
        amin=1e-10
        ref_value=1.0
        db_multiplier = math.log10(max(amin, ref_value))
        spec = melspec_transform(audio)     
        #spec = torchaudio.functional.amplitude_to_DB(spec,multiplier=10,amin=amin,db_multiplier=db_multiplier,top_db=80)
        spec = db_transform(spec)
        spec256 = melspec_transform256(audio)
        spec256 = db_transform(spec256)
        
        #spec224 = melspec_transform224(audio)
        #spec224 = db_transform(spec224)
        
        spec300_another = melspec_transform300(audio)
        spec300_another = db_transform(spec300_another)
        
        spec_rev2s = melspec_transform_rev2s(audio)
        spec_rev2s = db_transform(spec_rev2s)
        
        spec384 = (spec+80)/80
        spec256 = spec256/255
        #spec224 = spec224/255
        spec300_another = spec300_another/255
        spec_rev2s = (spec_rev2s+80)/80
        return spec384,spec256,spec300_another,spec_rev2s

    
    
    def openvino_infer(model,data,tta):
        outputs = model.infer(inputs=[data,tta])
        outputs = torch.tensor(outputs[list(outputs.keys())[0]])
        return outputs
    
    def openvino_infer_re(model,data):
        outputs = model.infer(inputs=[data])
        outputs = torch.tensor(outputs[list(outputs.keys())[0]])
        return outputs
    
    def compute_deltas(
            specgram: torch.Tensor,
            win_length: int = 5,
            mode: str = "replicate"
    ) -> torch.Tensor:
        r"""Compute delta coefficients of a tensor, usually a spectrogram:

        .. math::
           d_t = \frac{\sum_{n=1}^{\text{N}} n (c_{t+n} - c_{t-n})}{2 \sum_{n=1}^{\text{N}} n^2}

        where :math:`d_t` is the deltas at time :math:`t`,
        :math:`c_t` is the spectrogram coeffcients at time :math:`t`,
        :math:`N` is ``(win_length-1)//2``.

        Args:
            specgram (Tensor): Tensor of audio of dimension (..., freq, time)
            win_length (int, optional): The window length used for computing delta (Default: ``5``)
            mode (str, optional): Mode parameter passed to padding (Default: ``"replicate"``)

        Returns:
            Tensor: Tensor of deltas of dimension (..., freq, time)

        Example
            >>> specgram = torch.randn(1, 40, 1000)
            >>> delta = compute_deltas(specgram)
            >>> delta2 = compute_deltas(delta)
        """
        device = specgram.device
        dtype = specgram.dtype

        # pack batch
        shape = specgram.size()
        specgram = specgram.reshape(1, -1, shape[-1])

        assert win_length >= 3

        n = (win_length - 1) // 2

        # twice sum of integer squared
        denom = n * (n + 1) * (2 * n + 1) / 3

        specgram = torch.nn.functional.pad(specgram, (n, n), mode=mode)

        kernel = torch.arange(-n, n + 1, 1, device=device, dtype=dtype).repeat(specgram.shape[1], 1, 1)

        output = torch.nn.functional.conv1d(specgram, kernel, groups=specgram.shape[1]) / denom

        # unpack batch
        output = output.reshape(shape)

        return output
    
    
    def make_delta(
        input_tensor: torch.Tensor
    ):
        input_tensor = input_tensor.transpose(3,2)
        input_tensor = compute_deltas(input_tensor)
        input_tensor = input_tensor.transpose(3,2)
        return input_tensor

    
    # 3channelに変更
    def image_delta(x):
        delta_1 = make_delta(x)
        delta_2 = make_delta(delta_1)
        x = torch.cat([x,delta_1,delta_2], dim=1)
        return x
    
    def reshp(images):
        # 4min*60/10inference duration=24batch size?
        bs,clip_len,channel_num,mel_num,time_len = images.size()
        images=images.reshape((bs*clip_len,channel_num,mel_num,time_len))
        return images
    
    def predict(data_loader, models,re_models):   
        predictions = []
        pred_binary = []
        dl_test = DataLoader(ds_test, batch_size=batch_size,num_workers = num_workers, multiprocessing_context=get_context('loky'))
        
        for spec384,spec256,spec300_another,spec_rev2s in dl_test:
            spec384 = reshp(spec384)
            spec256 = reshp(spec256)
            spec300_another = reshp(spec300_another)
            spec300_80 = (spec300_another*255+80)/80
            spec_rev2s = reshp(spec_rev2s)
            
            out = []
            for i,model in enumerate(models):
                model_name = Config.model_ckpt[0].split('/')[-1].split('.')[0]
                if model_name=='sed_v2s':
                    images2_3chan = image_delta(spec384).numpy()

                    if images2_3chan.shape[0]>120:
                        output1 = openvino_infer(model,images2_3chan[:120,:,:,:],3)
                        output2 = openvino_infer(model,images2_3chan[120:240,:,:,:],3)
                        outputs = torch.cat([output1,output2],dim=0)
                    else:
                        outputs = openvino_infer(model,images2_3chan,3)
                elif model_name=='sed_b3ns':
                    images_3chan = image_delta(spec300_another).numpy()
                    print(spec300_another.shape)
                    if images_3chan.shape[0]>120:
                        output1 = openvino_infer(model,images_3chan[:120,:,:,:],2)
                        output2 = openvino_infer(model,images_3chan[120:240,:,:,:],2)
                        outputs = torch.cat([output1,output2],dim=0)
                    else:
                        outputs = openvino_infer(model,images_3chan,2)
                else:
                    image_res = spec256.numpy()

                    if image_res.shape[0]>120:
                        output1 = openvino_infer(model,image_res[:120,:,:,:],2)
                        output2 = openvino_infer(model,image_res[120:240,:,:,:],2)
                        outputs = torch.cat([output1,output2],dim=0)
                    else:
                        outputs = openvino_infer(model,image_res,2)

                out.append(outputs)
            for i,model in enumerate(re_models):
                model_name = Config.re_model_ckpt[0].split('/')[-1].split('.')[0]
                if (model_name=='cnn_b0ns'):
                    image_b0ns = spec256[:,:,:,:].numpy()
                    if image_b0ns.shape[0]>120:
                        output1 = openvino_infer_re(model,image_b0ns[:120,:,:,:])
                        output2 = openvino_infer_re(model,image_b0ns[120:240,:,:,:])
                        outputs = torch.cat([output1,output2],dim=0)
                    else:
                        outputs = openvino_infer_re(model,image_b0ns)
                elif (i==1):
                    images_center_resize2 = image_delta(spec300_80)[:,:,:,150:450].numpy()
                    if images_center_resize2.shape[0]>120:
                        output1 = openvino_infer_re(model,images_center_resize2[:120,:,:,:])
                        output2 = openvino_infer_re(model,images_center_resize2[120:240,:,:,:])
                        outputs = torch.cat([output1,output2],dim=0)
                    else:
                        outputs = openvino_infer_re(model,images_center_resize2)
                elif (model_name=='cnn_v2s'):
                    images_re_v2s = image_delta(spec_rev2s)[:,:,:,250:750].numpy()
                    if images_re_v2s.shape[0]>120:
                        output1 = openvino_infer_re(model,images_re_v2s[:120,:,:,:])
                        output2 = openvino_infer_re(model,images_re_v2s[120:240,:,:,:])
                        outputs = torch.cat([output1,output2],dim=0)
                    else:
                        outputs = openvino_infer_re(model,images_re_v2s)
                elif (i==3):
                    images_center_resize1 = image_delta(spec256)[:,:,:,128:384].numpy()
                    if images_center_resize1.shape[0]>120:
                        output1 = openvino_infer_re(model,images_center_resize1[:120,:,:,:])
                        output2 = openvino_infer_re(model,images_center_resize1[120:240,:,:,:])
                        outputs = torch.cat([output1,output2],dim=0)
                    else:
                        outputs = openvino_infer_re(model,images_center_resize1)
                else:
                    outputs = model(images_center_resize1)
    
                out.append(outputs)
                
            predictions.append(out)
        return predictions

    import gc

    print(f"Create Dataloader...")

    ds_test = BirdDatasetSED(
        df_test, 
        sr = Config.SR,
        duration = Config.DURATION,
        train = False
    )

    
    #print("Model Creation")
    models = []
    for i,ckpt in enumerate(Config.model_ckpt):
        #if i==0:
        #    model = load_mdl(name,ckpt,size,sed_3chan=True)
        #else:
        #    model = load_mdl(name,ckpt,size)

        model = core.read_model(model=ckpt)
        model = core.compile_model(model, device_name="CPU")
        model = model.create_infer_request()
        models.append(model)
        
    re_models = []
    for i,ckpt in enumerate(Config.re_model_ckpt):

        model = core.read_model(model=ckpt)
        model = core.compile_model(model, device_name="CPU")
        model = model.create_infer_request()
        re_models.append(model)

    print("Running Inference..")
    time.sleep(sleep)
    preds = predict(ds_test, models,re_models)   

    return preds

In [19]:
import pandas as pd
from pathlib import Path

df_test = pd.DataFrame(
     [(path.stem, path) for path in Path(Config.test_path).glob("*.ogg")],
    columns = ["filename", "path"]
)
# if not submission, use only 5 files out of unlabeled dataset
if len(df_test)==8444:
    df_test = df_test.sample(5)
print(df_test.shape)
df_test.head()

(5, 2)


Unnamed: 0,filename,path
8255,960500408,/root/birdclef2024/inputs/test_audios/unlabele...
4912,2133074275,/root/birdclef2024/inputs/test_audios/unlabele...
8024,906788264,/root/birdclef2024/inputs/test_audios/unlabele...
5737,387939110,/root/birdclef2024/inputs/test_audios/unlabele...
4539,2051341574,/root/birdclef2024/inputs/test_audios/unlabele...


In [20]:
cpu_num=2
num_job = min([cpu_num,len(df_test)])
split = len(df_test)//num_job
num_job,split

(2, 2)

In [21]:
dfs_test = []
df_test_left = None
for i in range(num_job):
    df_test_split = df_test.iloc[i*split:(i+1)*split].reset_index(drop=True)
    dfs_test.append(df_test_split)
    if i==num_job-1:
        df_test_left = df_test.iloc[(i+1)*split:].reset_index(drop=True)
len(dfs_test),len(df_test_left)

(2, 1)

In [22]:
for i,df_test in enumerate(dfs_test):
    print(f"Running Job {i}")
    pred(df_test,2,0,1)

Running Job 0


Create Dataloader...
Running Inference..


NameError: name 'images_center_resize3' is not defined

In [None]:
import time
t1=time.time()
# delayedによってpred関数を遅延実行している
#results1 = joblib.Parallel(n_jobs=num_job, backend='loky')(joblib.delayed(pred)(df_test) for df_test in dfs_test)
# sl引数は，time.sleep(sleep)で実行を遅らせて，CPU負荷を一気にかけるのを避けるため
results1 = joblib.Parallel(n_jobs=num_job, backend='loky')(joblib.delayed(pred)(df_test,num_workers,sl,batch_size) for df_test,num_workers,sl,batch_size in zip(dfs_test,[2,2],[0,5],[1,1]))
t2=time.time()
print(t2-t1)

Create Dataloader...
Create Dataloader...
Running Inference..
Running Inference..


RuntimeError: Check 'inputs.size() > idx' failed at src/inference/src/cpp/ie_infer_request.cpp:296:
Input port for index 1 was not found! The model has only 1 inputs.
