# Bird Project

In [1]:
import time
from concurrent.futures import ThreadPoolExecutor
import glob
import librosa
import numpy as np
import os
import pandas as pd
import re
import sys
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
import timm
import torchaudio
import torchaudio.transforms as AT
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from scipy import signal
import gc
import warnings
import logging
from pathlib import Path
import random

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.ERROR)

os.environ['CUDA_VISIBLE_DEVICES'] = ''
torch.set_num_threads(4) 

class MyAudioConfig:
    def __init__(self):        
        self.sample_rate = 32000
        # 音频窗口长度，单位秒
        self.window_size = 5

        # Mel 频谱图参数
        self.n_fft = 2048               # FFT 窗口大小（频域分辨率）
        self.hop_length = 512           # 每隔多少点进行一次窗口（时间分辨率）
        self.n_mels = 128               # 输出的 Mel bin 数量，越大频率分辨率越高
        self.fmin = 20                  # 最低频率范围
        self.fmax = 16000               # 最高频率范围（采样率一半）

        self.power = 2.0                # 功率谱指数，2.0 表示功率谱，1.0 表示幅度谱

        # 输出图片尺寸（输入到模型前 resize 成统一大小）
        self.target_shape = (256, 128)  # 宽度 256，高度 128

        # 噪声处理参数
        self.apply_noise_reduction = True  # 是否开启降噪
        self.noise_reduction_strength = 0.1  # 降噪强度：原始信号 + 降噪信号混合比例

        # 音频归一化，去除直流偏移并归一化振幅
        self.apply_normalization = True

        # 频谱图对比度增强
        self.apply_spec_contrast = True
        self.contrast_factor = 0.15  # 增强力度

        # SpecAugment（训练增强，防止过拟合）
        self.use_spec_augment = False
        self.freq_mask_param = 20
        self.time_mask_param = 30
        self.freq_mask_count = 1
        self.time_mask_count = 1

        # 数值稳定性的小常数
        self.eps = 1e-6


class MyAudioPipeline:
    """
    A pipeline class that converts raw audio waveform into
    a final Mel-spectrogram, mirroring the structure of the 
    original big code. This includes:
    - Noise reduction
    - Normalization
    - Mel-spectrogram
    - dB scaling + minmax normalization
    - (Optional) Contrast enhancement
    - (Optional) SpecAugment
    - Resizing to target shape
    """
    def __init__(self, cfg: MyAudioConfig):
        self.cfg = cfg

    def reduce_noise(self, audio_data: np.ndarray) -> np.ndarray:
        """
        Apply simple median-filter-based denoising, then mix
        original signal and denoised signal by noise_reduction_strength.
        """
        if not self.cfg.apply_noise_reduction:
            return audio_data
        
        # Median filter
        window_size = 5
        audio_denoised = signal.medfilt(audio_data, window_size)
        
        # Mix
        alpha = self.cfg.noise_reduction_strength
        return (1 - alpha) * audio_data + alpha * audio_denoised

    def normalize_audio(self, audio_data: np.ndarray) -> np.ndarray:
        """
        Remove DC offset and scale to [-1, 1].
        """
        if not self.cfg.apply_normalization:
            return audio_data
        
        mean_val = np.mean(audio_data)
        audio_data = audio_data - mean_val
        
        max_amp = np.max(np.abs(audio_data))
        if max_amp > 0:
            audio_data = audio_data / max_amp
        
        return audio_data

    def enhance_spectrogram_contrast(self, spec: np.ndarray, factor: float = 0.15) -> np.ndarray:
        """
        Enhance spectrogram contrast. We shift values away from the mean,
        then clip to [0,1].
        """
        mean_val = np.mean(spec)
        enhanced = mean_val + (spec - mean_val) * (1 + factor)
        return np.clip(enhanced, 0, 1)

    def apply_spec_augment(self, spec: np.ndarray) -> np.ndarray:
        """
        SpecAugment: randomly mask along frequency & time axes.
        """
        if not self.cfg.use_spec_augment:
            return spec
        
        augmented = spec.copy()
        
        # Frequency mask
        for _ in range(self.cfg.freq_mask_count):
            f = np.random.randint(0, self.cfg.freq_mask_param)
            f0 = np.random.randint(0, augmented.shape[0] - f)
            augmented[f0:f0+f, :] = 0
        
        # Time mask
        for _ in range(self.cfg.time_mask_count):
            t = np.random.randint(0, self.cfg.time_mask_param)
            t0 = np.random.randint(0, augmented.shape[1] - t)
            augmented[:, t0:t0+t] = 0
        
        return augmented
    
    def audio_to_melspec(self, audio_data: np.ndarray) -> np.ndarray:
        """
        Core function that:
         1) Pad or trim to self.cfg.window_size
         2) reduce_noise -> normalize_audio
         3) librosa.feature.melspectrogram
         4) power_to_db -> [0,1] minmax
         5) (Optional) enhance contrast
         6) (Optional) spec augment
         7) resize
        """
        # 1) Pad or trim to exactly (sample_rate * window_size) samples
        required_samples = self.cfg.sample_rate * self.cfg.window_size
        if len(audio_data) < required_samples:
            audio_data = np.pad(audio_data, (0, required_samples - len(audio_data)), mode='constant')
        elif len(audio_data) > required_samples:
            audio_data = audio_data[:required_samples]
        
        # 2) Denoise & Normalize
        audio_data = self.reduce_noise(audio_data)
        audio_data = self.normalize_audio(audio_data)
        
        # 3) Mel-spectrogram
        mel = librosa.feature.melspectrogram(
            y=audio_data,
            sr=self.cfg.sample_rate,
            n_fft=self.cfg.n_fft,
            hop_length=self.cfg.hop_length,
            n_mels=self.cfg.n_mels,
            fmin=self.cfg.fmin,
            fmax=self.cfg.fmax,
            power=self.cfg.power
        )
        
        # 4) Convert to dB, then min-max to [0,1]
        mel_db = librosa.power_to_db(mel, ref=np.max)
        db_min, db_max = mel_db.min(), mel_db.max()
        mel_norm = (mel_db - db_min) / (db_max - db_min + self.cfg.eps)
        
        # 5) Enhance contrast if needed
        if self.cfg.apply_spec_contrast:
            mel_norm = self.enhance_spectrogram_contrast(mel_norm, self.cfg.contrast_factor)
        
        # 6) SpecAugment (time/freq mask)
        mel_aug = self.apply_spec_augment(mel_norm)
        
        # 7) Resize
        # if target_shape=(256,128) means (width=256, height=128)
        # but mel shape is (n_mels, time_frames) -> (128, ???)
        # So we do: cv2.resize(mel_aug, (width, height))
        if self.cfg.target_shape is not None:
            mel_aug = cv2.resize(mel_aug, self.cfg.target_shape, interpolation=cv2.INTER_LINEAR)
        
        # Return float32 array
        return mel_aug.astype(np.float32)

In [2]:
import os
import glob
import librosa
import numpy as np
from tqdm import tqdm

cfg = MyAudioConfig()
pipeline = MyAudioPipeline(cfg)

train_audio_dir = '/kaggle/input/birdclef-2025/train_audio'
output_file = 'train_data.npy'

data_dict = {}

ogg_files = glob.glob(os.path.join(train_audio_dir, '**', '*.ogg'), recursive=True)
print(f"Found {len(ogg_files)} ogg files under {train_audio_dir}")

for oggfile in tqdm(ogg_files):
    label_dir = os.path.basename(os.path.dirname(oggfile))
    label = label_dir 
    filename = os.path.basename(oggfile)
    
    y, sr = librosa.load(oggfile, sr=cfg.sample_rate, mono=True)
    
    mel_spec = pipeline.audio_to_melspec(y)
    
    file_id = f"{label}_{filename}"
    data_dict[file_id] = {
        "data": mel_spec,
        "label": label
    }

np.save(output_file, data_dict, allow_pickle=True)
print(f"Saved processed data to {output_file}")

Found 28564 ogg files under /kaggle/input/birdclef-2025/train_audio


100%|██████████| 28564/28564 [38:08<00:00, 12.48it/s]


Saved processed data to train_data.npy
