# Bird Project Data Preprocessing (In Kaggle Env)

In [None]:
import gc
import glob
import logging
import os
import random
import re
import sys
import time
import warnings
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import cv2
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import torchaudio.transforms as AT
from scipy import signal
from tqdm.auto import tqdm

# Suppress every Python warning and restrict the root logger to ERROR and
# above for the rest of the session.
warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.ERROR)

# Hide all CUDA devices from this process -- presumably to keep preprocessing
# CPU-only (TODO confirm intent).
# NOTE(review): this is set after `import torch`; it only has an effect if
# CUDA has not been initialised yet at this point -- verify.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
# Limit PyTorch's intra-op CPU parallelism to 4 threads.
torch.set_num_threads(4)

In [None]:
class MyAudioConfig:
    """Settings consumed by ``MyAudioPipeline``.

    All arguments are keyword-only and default to the values this notebook
    was originally run with, so ``MyAudioConfig()`` is unchanged; individual
    settings can be overridden, e.g. ``MyAudioConfig(n_mels=64)``.

    Attributes:
        sample_rate: Sampling rate in Hz used when loading audio and when
            computing the mel-spectrogram.
        window_size: Length in seconds of the clip each spectrogram covers.
        n_fft: FFT size passed to ``librosa.feature.melspectrogram``.
        hop_length: Hop between STFT frames, in samples.
        n_mels: Number of mel bands.
        fmin: Lowest mel filter frequency in Hz.
        fmax: Highest mel filter frequency in Hz.
        power: Exponent of the magnitude spectrogram (2.0 = power).
        target_shape: ``dsize`` handed to ``cv2.resize``, i.e.
            ``(width, height)``; a falsy value (``None``/``()``) skips the
            resize step.
        noise_reduction_strength: Blend weight of the median-filtered signal
            in ``reduce_noise`` (0 = untouched, 1 = fully filtered).
        contrast_factor: Extra gain applied around the spectrogram mean in
            ``enhance_contrast``.
        freq_mask_param: Exclusive upper bound on a frequency mask's height.
        time_mask_param: Exclusive upper bound on a time mask's width.
        freq_mask_count: Number of frequency masks per spectrogram.
        time_mask_count: Number of time masks per spectrogram.
        eps: Small constant guarding the min-max normalisation against a
            zero denominator.
    """

    def __init__(
        self,
        *,
        sample_rate: int = 32000,
        window_size: int = 5,
        n_fft: int = 2048,
        hop_length: int = 512,
        n_mels: int = 128,
        fmin: int = 20,
        fmax: int = 16000,
        power: float = 2.0,
        target_shape=(256, 128),
        noise_reduction_strength: float = 0.1,
        contrast_factor: float = 0.15,
        freq_mask_param: int = 20,
        time_mask_param: int = 30,
        freq_mask_count: int = 1,
        time_mask_count: int = 1,
        eps: float = 1e-6,
    ):
        # sampling
        self.sample_rate = sample_rate
        self.window_size = window_size  # seconds

        # mel-spec
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax
        self.power = power
        self.target_shape = target_shape

        # processing
        self.noise_reduction_strength = noise_reduction_strength
        self.contrast_factor = contrast_factor

        # spec augment
        self.freq_mask_param = freq_mask_param
        self.time_mask_param = time_mask_param
        self.freq_mask_count = freq_mask_count
        self.time_mask_count = time_mask_count

        self.eps = eps


class MyAudioPipeline:
    """Convert a mono waveform into a normalised mel-spectrogram image.

    The pipeline pads/trims the waveform to one window, lightly denoises and
    peak-normalises it, computes a dB-scaled mel-spectrogram rescaled to
    [0, 1], boosts contrast, optionally applies SpecAugment-style masking,
    and finally resizes the result.

    Args:
        cfg: Object exposing the attributes of ``MyAudioConfig``.
    """

    def __init__(self, cfg: "MyAudioConfig"):
        self.cfg = cfg

    def reduce_noise(self, audio: np.ndarray) -> np.ndarray:
        """Blend *audio* with a 5-tap median-filtered copy of itself.

        ``cfg.noise_reduction_strength`` is the weight of the filtered copy.
        """
        denoised = signal.medfilt(audio, 5)
        a = self.cfg.noise_reduction_strength
        return (1 - a) * audio + a * denoised

    def normalize_audio(self, audio: np.ndarray) -> np.ndarray:
        """Remove the DC offset and scale *audio* to a peak magnitude of 1.

        Empty input is returned unchanged, and an all-constant (silent after
        centring) signal is returned centred but unscaled, so no division by
        zero can occur.
        """
        if np.size(audio) == 0:
            # np.max() raises on an empty array; nothing to normalise anyway.
            return audio
        audio = audio - np.mean(audio)
        peak = np.max(np.abs(audio))
        return audio / peak if peak > 0 else audio

    def enhance_contrast(self, spec: np.ndarray) -> np.ndarray:
        """Stretch *spec* around its mean by ``1 + contrast_factor``.

        The result is clipped back into [0, 1].
        """
        mu = np.mean(spec)
        return np.clip(mu + (spec - mu) * (1 + self.cfg.contrast_factor), 0, 1)

    @staticmethod
    def _random_band(axis_len: int, mask_param: int):
        """Draw a random ``(start, width)`` band lying inside ``[0, axis_len)``.

        ``width`` is uniform on ``[0, min(mask_param, axis_len))`` and
        ``start`` is uniform over the positions where the band still fits.
        When no non-degenerate draw is possible (``mask_param <= 0`` or an
        empty axis) the empty band ``(0, 0)`` is returned instead of letting
        ``np.random.randint`` raise.
        """
        max_width = min(mask_param, axis_len)
        if max_width <= 0:
            return 0, 0
        width = np.random.randint(0, max_width)
        # width < axis_len here, so the upper bound below is always >= 1.
        start = np.random.randint(0, axis_len - width)
        return start, width

    def spec_augment(self, spec: np.ndarray) -> np.ndarray:
        """Return a copy of *spec* with random frequency/time bands zeroed.

        Axis 0 is masked ``cfg.freq_mask_count`` times and axis 1
        ``cfg.time_mask_count`` times. Randomness comes from NumPy's global
        RNG, so seed ``np.random`` for reproducible masks. The input array is
        not modified.
        """
        out = spec.copy()
        # freq mask
        for _ in range(self.cfg.freq_mask_count):
            f0, f = self._random_band(out.shape[0], self.cfg.freq_mask_param)
            out[f0:f0 + f, :] = 0
        # time mask
        for _ in range(self.cfg.time_mask_count):
            t0, t = self._random_band(out.shape[1], self.cfg.time_mask_param)
            out[:, t0:t0 + t] = 0
        return out

    def audio_to_melspec(self, audio: np.ndarray, augment: bool = True) -> np.ndarray:
        """Turn a waveform into a float32 mel-spectrogram in [0, 1].

        Args:
            audio: 1-D waveform sampled at ``cfg.sample_rate``. Only the first
                ``cfg.window_size`` seconds are used; shorter input is
                zero-padded at the end.
            augment: Apply the random ``spec_augment`` masking. Defaults to
                ``True`` (the historical behaviour); pass ``False`` to get a
                deterministic, unmasked spectrogram, e.g. for validation or
                inference data.

        Returns:
            The spectrogram as ``float32``. When ``cfg.target_shape`` is set
            it is passed to ``cv2.resize`` as ``dsize``, which is
            ``(width, height)``, so the returned array has shape
            ``(target_shape[1], target_shape[0])``.
        """
        # pad/trim; int() keeps np.pad/slicing valid for fractional windows
        n_samples = int(self.cfg.sample_rate * self.cfg.window_size)
        if len(audio) < n_samples:
            audio = np.pad(audio, (0, n_samples - len(audio)))
        else:
            audio = audio[:n_samples]

        audio = self.normalize_audio(self.reduce_noise(audio))

        mel = librosa.feature.melspectrogram(
            y=audio,
            sr=self.cfg.sample_rate,
            n_fft=self.cfg.n_fft,
            hop_length=self.cfg.hop_length,
            n_mels=self.cfg.n_mels,
            fmin=self.cfg.fmin,
            fmax=self.cfg.fmax,
            power=self.cfg.power,
        )
        mel_db = librosa.power_to_db(mel, ref=np.max)
        # Min-max rescale to [0, 1]; eps guards a constant spectrogram.
        mel_norm = (mel_db - mel_db.min()) / (mel_db.max() - mel_db.min() + self.cfg.eps)

        mel_norm = self.enhance_contrast(mel_norm)
        if augment:
            mel_norm = self.spec_augment(mel_norm)

        if self.cfg.target_shape:
            mel_norm = cv2.resize(mel_norm, self.cfg.target_shape, interpolation=cv2.INTER_LINEAR)

        return mel_norm.astype(np.float32)

In [None]:
# Convert every training recording into a mel-spectrogram and store all of
# them, keyed by "<label>_<filename>", in a single .npy file.
cfg = MyAudioConfig()
pipeline = MyAudioPipeline(cfg)

train_audio_dir = '/kaggle/input/birdclef-2025/train_audio'
output_file = 'train_data.npy'

# The pipeline draws its SpecAugment masks from NumPy's global RNG; seeding it
# (together with the sorted file order below) makes the saved file
# reproducible from run to run.
np.random.seed(42)

data_dict = {}

# glob returns files in arbitrary filesystem order; sort for a stable order.
ogg_files = sorted(glob.glob(os.path.join(train_audio_dir, '**', '*.ogg'), recursive=True))
print(f"Found {len(ogg_files)} ogg files under {train_audio_dir}")

for oggfile in tqdm(ogg_files):
    # The class label is the name of the directory that holds the recording.
    label = os.path.basename(os.path.dirname(oggfile))
    filename = os.path.basename(oggfile)

    # Resample to the pipeline's rate and down-mix to mono while loading.
    y, sr = librosa.load(oggfile, sr=cfg.sample_rate, mono=True)

    # Only the first cfg.window_size seconds of each file are converted.
    mel_spec = pipeline.audio_to_melspec(y)

    file_id = f"{label}_{filename}"
    data_dict[file_id] = {
        "data": mel_spec,
        "label": label
    }

# A dict is stored as a pickled 0-d object array; read it back with
# np.load(output_file, allow_pickle=True).item().
np.save(output_file, data_dict, allow_pickle=True)
print(f"Saved processed data to {output_file}")

Found 28564 ogg files under /kaggle/input/birdclef-2025/train_audio


100%|██████████| 28564/28564 [38:08<00:00, 12.48it/s]


Saved processed data to train_data.npy
