# Bird Project Final (In Kaggle Env)

In [6]:
import gc
import glob
import logging
import os
import pickle
import random
import re
import sys
import time
import warnings
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import cv2
import librosa
import matplotlib.pyplot as plt
import numpy
import numpy as np
import pandas
import pandas as pd
import tensorflow as tf
import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import torchaudio.transforms as AT
from scipy import signal
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import load_model
from tqdm import tqdm

# Suppress every Python warning so the notebook output stays readable.
warnings.filterwarnings("ignore")
# Configure the root logger so that only ERROR-and-above records are emitted.
logging.basicConfig(level=logging.ERROR)

# An empty CUDA_VISIBLE_DEVICES hides all CUDA devices from this process.
# NOTE(review): this is assigned after tensorflow/torch have already been
# imported — presumably still early enough because CUDA is initialised
# lazily; confirm the run really is CPU-only.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
# Limit PyTorch's intra-op CPU parallelism to 4 threads.
torch.set_num_threads(4)

## Preprocessing

In [8]:
class MyAudioConfig:
    """
    Plain settings holder for the audio -> Mel-spectrogram pipeline.

    Every value is a fixed default assigned in ``__init__``; callers may
    overwrite any attribute on the instance after construction.
    """

    def __init__(self):
        # --- Framing -------------------------------------------------------
        # sample_rate is in Hz, window_size is the clip length in seconds.
        self.sample_rate, self.window_size = 32000, 5

        # --- Mel-spectrogram -----------------------------------------------
        # n_fft: FFT window size in samples (frequency resolution).
        # hop_length: samples between successive frames (time resolution).
        self.n_fft, self.hop_length = 2048, 512
        # Number of Mel bins; more bins means finer frequency resolution.
        self.n_mels = 128
        # Analysed band in Hz; the upper edge is half of sample_rate.
        self.fmin, self.fmax = 20, 16000
        # Spectrogram exponent: 2.0 -> power spectrum, 1.0 -> amplitude.
        self.power = 2.0

        # Final size handed to the resize step of the pipeline.
        self.target_shape = (256, 128)

        # --- Waveform clean-up ---------------------------------------------
        # Toggle for the denoising stage.
        self.apply_noise_reduction = True
        # Blend weight of the denoised signal (the original signal gets
        # 1 - this value).
        self.noise_reduction_strength = 0.1
        # Toggle for DC-offset removal plus amplitude scaling.
        self.apply_normalization = True

        # --- Spectrogram contrast ------------------------------------------
        self.apply_spec_contrast = True
        self.contrast_factor = 0.15

        # --- SpecAugment (off by default) ----------------------------------
        self.use_spec_augment = False
        # Upper bounds on the width of a single frequency / time mask.
        self.freq_mask_param, self.time_mask_param = 20, 30
        # How many masks of each kind to draw.
        self.freq_mask_count, self.time_mask_count = 1, 1

        # Tiny constant guarding divisions against a zero denominator.
        self.eps = 1e-6


class MyAudioPipeline:
    """
    Convert a raw mono waveform into a fixed-size, normalised
    Mel-spectrogram. Stages, in the order applied by ``audio_to_melspec``:
    - Pad / trim to the configured window length
    - Noise reduction (median-filter blend)
    - Normalization (DC removal + peak scaling)
    - Mel-spectrogram
    - dB scaling + minmax normalization
    - (Optional) Contrast enhancement
    - (Optional) SpecAugment
    - Resizing to target shape

    All behaviour is driven by the attributes of the config object passed
    to the constructor.
    """

    def __init__(self, cfg: "MyAudioConfig"):
        # The annotation is a string so the class can be defined (and
        # imported) even when MyAudioConfig is not yet in scope; any object
        # exposing the same attributes works.
        self.cfg = cfg

    def reduce_noise(self, audio_data: np.ndarray) -> np.ndarray:
        """
        Apply simple median-filter-based denoising, then mix
        original signal and denoised signal by noise_reduction_strength.

        Returns the input unchanged when cfg.apply_noise_reduction is False.
        """
        if not self.cfg.apply_noise_reduction:
            return audio_data

        # Median filter with a fixed 5-sample kernel
        window_size = 5
        audio_denoised = signal.medfilt(audio_data, window_size)

        # Mix: alpha weights the denoised signal, (1 - alpha) the original
        alpha = self.cfg.noise_reduction_strength
        return (1 - alpha) * audio_data + alpha * audio_denoised

    def normalize_audio(self, audio_data: np.ndarray) -> np.ndarray:
        """
        Remove DC offset and scale to [-1, 1].

        Returns the input unchanged when cfg.apply_normalization is False
        or when the input is empty. An all-constant signal becomes all
        zeros (no division is performed when the peak is 0).
        """
        if not self.cfg.apply_normalization:
            return audio_data

        # np.max raises on an empty array; there is nothing to normalise.
        if np.size(audio_data) == 0:
            return audio_data

        mean_val = np.mean(audio_data)
        audio_data = audio_data - mean_val

        max_amp = np.max(np.abs(audio_data))
        if max_amp > 0:
            audio_data = audio_data / max_amp

        return audio_data

    def enhance_spectrogram_contrast(
        self, spec: np.ndarray, factor: float = 0.15
    ) -> np.ndarray:
        """
        Enhance spectrogram contrast. We shift values away from the mean
        by a factor of (1 + factor), then clip to [0,1].
        """
        mean_val = np.mean(spec)
        enhanced = mean_val + (spec - mean_val) * (1 + factor)
        return np.clip(enhanced, 0, 1)

    def apply_spec_augment(self, spec: np.ndarray) -> np.ndarray:
        """
        SpecAugment: randomly mask along frequency & time axes.

        Axis 0 is treated as frequency and axis 1 as time. The input is
        never modified; a masked copy is returned. When
        cfg.use_spec_augment is False the input object itself is returned.

        Mask widths are drawn from [0, param) and additionally capped by
        the axis length, so a mask parameter of 0 or one larger than the
        spectrogram no longer makes np.random.randint raise ValueError.
        """
        if not self.cfg.use_spec_augment:
            return spec

        augmented = spec.copy()
        n_freq, n_time = augmented.shape[0], augmented.shape[1]

        # Frequency mask
        for _ in range(self.cfg.freq_mask_count):
            # Cap the exclusive upper bound by the axis length so that the
            # start-offset draw below always has a non-empty range.
            max_width = min(self.cfg.freq_mask_param, n_freq)
            if max_width <= 0:
                continue
            f = np.random.randint(0, max_width)
            f0 = np.random.randint(0, n_freq - f)
            augmented[f0 : f0 + f, :] = 0

        # Time mask
        for _ in range(self.cfg.time_mask_count):
            max_width = min(self.cfg.time_mask_param, n_time)
            if max_width <= 0:
                continue
            t = np.random.randint(0, max_width)
            t0 = np.random.randint(0, n_time - t)
            augmented[:, t0 : t0 + t] = 0

        return augmented

    def audio_to_melspec(self, audio_data: np.ndarray) -> np.ndarray:
        """
        Core function that:
         1) Pad or trim to self.cfg.window_size
         2) reduce_noise -> normalize_audio
         3) librosa.feature.melspectrogram
         4) power_to_db -> [0,1] minmax
         5) (Optional) enhance contrast
         6) (Optional) spec augment
         7) resize

        Returns a float32 2-D array. When cfg.target_shape is set it is
        passed to cv2.resize as dsize, i.e. (width, height), so the
        returned array has shape (height, width).
        """
        # 1) Pad or trim to exactly (sample_rate * window_size) samples.
        #    int() keeps the count usable for np.pad / slicing even if
        #    window_size is given as a float.
        required_samples = int(self.cfg.sample_rate * self.cfg.window_size)
        if len(audio_data) < required_samples:
            audio_data = np.pad(
                audio_data, (0, required_samples - len(audio_data)), mode="constant"
            )
        elif len(audio_data) > required_samples:
            audio_data = audio_data[:required_samples]

        # 2) Denoise & Normalize
        audio_data = self.reduce_noise(audio_data)
        audio_data = self.normalize_audio(audio_data)

        # 3) Mel-spectrogram
        mel = librosa.feature.melspectrogram(
            y=audio_data,
            sr=self.cfg.sample_rate,
            n_fft=self.cfg.n_fft,
            hop_length=self.cfg.hop_length,
            n_mels=self.cfg.n_mels,
            fmin=self.cfg.fmin,
            fmax=self.cfg.fmax,
            power=self.cfg.power,
        )

        # 4) Convert to dB, then min-max to [0,1]; eps keeps the division
        #    defined when the spectrogram is constant.
        mel_db = librosa.power_to_db(mel, ref=np.max)
        db_min, db_max = mel_db.min(), mel_db.max()
        mel_norm = (mel_db - db_min) / (db_max - db_min + self.cfg.eps)

        # 5) Enhance contrast if needed
        if self.cfg.apply_spec_contrast:
            mel_norm = self.enhance_spectrogram_contrast(
                mel_norm, self.cfg.contrast_factor
            )

        # 6) SpecAugment (time/freq mask)
        mel_aug = self.apply_spec_augment(mel_norm)

        # 7) Resize
        # if target_shape=(256,128) means (width=256, height=128)
        # but mel shape is (n_mels, time_frames) -> (128, ???)
        # So we do: cv2.resize(mel_aug, (width, height))
        if self.cfg.target_shape is not None:
            mel_aug = cv2.resize(
                mel_aug, self.cfg.target_shape, interpolation=cv2.INTER_LINEAR
            )

        # Return float32 array
        return mel_aug.astype(np.float32)

In [9]:
# Read sample_submission.csv to get the column order
sample = pd.read_csv("/kaggle/input/birdclef-2025/sample_submission.csv")
# Every column after the first (the row id) is one species, in submission order.
species_columns = list(sample.columns[1:])
NUM_SPECIES = len(species_columns)

model = tf.keras.models.load_model("/kaggle/input/my_resnet_model_improved/keras/default/1/my_resnet_model_improved.h5")

cfg = MyAudioConfig()
pipeline = MyAudioPipeline(cfg)
TEST_DIR = "/kaggle/input/birdclef-2025/test_soundscapes"
WIN_SEC = cfg.window_size
WIN_SAMPLES = cfg.sample_rate * WIN_SEC

rows = []
ogg_list = sorted(glob.glob(os.path.join(TEST_DIR, "*.ogg")))
print("Total number of test audio files found：", len(ogg_list))

for ogg in tqdm(ogg_list):
    y, _ = librosa.load(ogg, sr=cfg.sample_rate, mono=True)
    # Only complete windows are scored; a trailing partial window is dropped.
    n_windows = len(y) // WIN_SAMPLES
    if n_windows == 0:
        # Shorter than one window: nothing to score for this file.
        continue

    base = os.path.splitext(os.path.basename(ogg))[0]

    # Build one spectrogram per window and stack them so the model is called
    # once per file rather than once per window; the per-call overhead of
    # model.predict otherwise dominates the runtime.
    mels = np.stack(
        [
            pipeline.audio_to_melspec(y[k * WIN_SAMPLES : (k + 1) * WIN_SAMPLES])
            for k in range(n_windows)
        ]
    )
    # Append the trailing channel axis -> (n_windows, height, width, 1)
    mels = mels[..., np.newaxis]

    probs = model.predict(mels, verbose=0)  # one row of scores per window

    # Fail fast on a model/submission mismatch instead of only discovering
    # it when the DataFrame is assembled after the whole run.
    if probs.shape[-1] != NUM_SPECIES:
        raise ValueError(
            f"Model returned {probs.shape[-1]} scores per window, "
            f"but the submission expects {NUM_SPECIES} species columns"
        )

    # row_id is "<file stem>_<window end time in seconds>"
    rows.extend(
        [f"{base}_{(k + 1) * WIN_SEC}", *window_probs]
        for k, window_probs in enumerate(probs)
    )

sub_df = pd.DataFrame(rows, columns=["row_id"] + species_columns)
sub_df.to_csv("submission.csv", index=False)
print("submission.csv saved", sub_df.shape)

2025-04-17 12:49:36.406321: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Total number of test audio files found： 9726


  0%|          | 0/9726 [00:00<?, ?it/s]

KeyboardInterrupt: 