In [1]:
import os
import math
import logging
import numpy as np
import pandas as pd
import librosa
import jax
import jax.numpy as jnp
from flax import nnx
import orbax.checkpoint as ocp
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tqdm.auto import tqdm
from pathlib import Path

2025-04-28 00:39:37.528962: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745800777.799332      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745800777.878277      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
import kagglehub

In [3]:
class CFG:
    """
    Configuration for BirdCLEF-2025 inference pipeline.
    """
    test_soundscapes = '/kaggle/input/birdclef-2025/test_soundscapes'
    submission_csv    = '/kaggle/input/birdclef-2025/sample_submission.csv'
    taxonomy_csv      = '/kaggle/input/birdclef-2025/taxonomy.csv'
    model_path        = '/kaggle/input/birdclef-cnn-baseline/flax/default/1'

    FS          = 32000       # Sampling rate
    WINDOW_SIZE = 5           # Segment duration (seconds)
    N_MELS      = 128         # Mel bands
    HOP_LENGTH  = 512         # STFT hop length
    N_FRAMES    = math.ceil((WINDOW_SIZE * FS) / HOP_LENGTH)  # Time frames per segment

    BATCH_SIZE  = 32          # Inference batch size

# print("Downloading model via kagglehub...")
# model_path = kagglehub.model_download("nikhilpaleti/birdclef-cnn-baseline")
# print(f"Model downloaded to: {model_path}")
# CFG.model_path = model_path

# Check if all data is available
print(f"Test soundscapes directory exists: {os.path.exists(CFG.test_soundscapes)}")
print(f"Taxonomy CSV exists: {os.path.exists(CFG.taxonomy_csv)}")
print(f"Submission CSV exists: {os.path.exists(CFG.submission_csv)}")
print(f"Model path exists: {os.path.exists(CFG.model_path)}")

Test soundscapes directory exists: True
Taxonomy CSV exists: True
Submission CSV exists: True
Model path exists: True


In [4]:
def setup_logging():
    logging.basicConfig(
        format='%(asctime)s %(levelname)s: %(message)s',
        level=logging.INFO
    )
    
def build_label_encoder(taxonomy_csv: str):
    logging.info(f'Loading taxonomy from {taxonomy_csv}')
    df = pd.read_csv(taxonomy_csv)
    labels = sorted(df['primary_label'].astype(str).unique())
    le = LabelEncoder().fit(labels)
    num_classes = len(le.classes_)
    logging.info(f'Found {num_classes} classes')
    return le, num_classes

In [5]:
def process_audio_segment(y: np.ndarray, cfg: CFG) -> np.ndarray:
    """
    Compute log-mel spectrogram for a fixed-length audio segment.
    """
    S = librosa.feature.melspectrogram(
        y=y, sr=cfg.FS, n_mels=cfg.N_MELS,
        hop_length=cfg.HOP_LENGTH, fmax=cfg.FS // 2
    )
    logS = librosa.power_to_db(S, ref=np.max)
    if logS.shape[1] < cfg.N_FRAMES:
        pad = cfg.N_FRAMES - logS.shape[1]
        logS = np.pad(
            logS,
            ((0,0),(0,pad)),
            mode='constant',
            constant_values=logS.min()
        )
    else:
        logS = logS[:, :cfg.N_FRAMES]
    return logS.astype(np.float32)

In [6]:
class AudioCNN(nnx.Module):
    """Basic CNN for log-mel spectrograms."""
    def __init__(self, num_classes: int, rngs: nnx.Rngs):
        self.conv1 = nnx.Conv(1, 32, kernel_size=(3,3), rngs=rngs)
        self.conv2 = nnx.Conv(32, 64, kernel_size=(3,3), rngs=rngs)
        self.pool  = lambda x: nnx.avg_pool(x, window_shape=(2,2), strides=(2,2))
        self.dense = nnx.Linear(159744, 128, rngs=rngs)
        self.out   = nnx.Linear(128, num_classes, rngs=rngs)

    def __call__(self, x: jnp.ndarray) -> jnp.ndarray:
        x = self.pool(nnx.relu(self.conv1(x)))
        x = self.pool(nnx.relu(self.conv2(x)))
        x = x.reshape(x.shape[0], -1)
        x = nnx.sigmoid(self.dense(x))
        return self.out(x)

In [7]:
def load_model(cfg: CFG, num_classes: int) -> nnx.Module:
    logging.info('Restoring model checkpoint')
    # Create abstract model to get graphdef and state spec
    abstract = nnx.eval_shape(lambda: AudioCNN(num_classes, rngs=nnx.Rngs(0)))
    graphdef, abstract_state = nnx.split(abstract)
    ckpt = ocp.StandardCheckpointer()
    restored_state = ckpt.restore(
        os.path.join(cfg.model_path, 'model_state'),
        abstract_state
    )
    model = nnx.merge(graphdef, restored_state)
    return model

In [8]:
def to_tf_inference_dataset(audio_paths, cfg: CFG):
    """
    Build a tf.data.Dataset yielding (spectrogram, row_id) batches.
    """
    def gen():
        for path in audio_paths:
            y, _ = librosa.load(str(path), sr=cfg.FS)
            seg_len = cfg.FS * cfg.WINDOW_SIZE
            n_segs = math.ceil(len(y) / seg_len)
            soundscape = path.stem
            for i in range(n_segs):
                start = i * seg_len
                end = start + seg_len
                seg = y[start:end]
                if len(seg) < seg_len:
                    seg = np.pad(seg, (0, seg_len - len(seg)), mode='constant')
                logS = process_audio_segment(seg, cfg)
                row_id = f"{soundscape}_{(i+1)*cfg.WINDOW_SIZE}"
                yield logS[..., None], row_id

    output_signature = (
        tf.TensorSpec((cfg.N_MELS, cfg.N_FRAMES, 1), tf.float32),
        tf.TensorSpec((), tf.string)
    )
    ds = tf.data.Dataset.from_generator(
        gen,
        output_signature=output_signature
    )
    return ds.batch(cfg.BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


In [9]:
def inference(cfg: CFG) -> pd.DataFrame:
    setup_logging()
    le, num_classes = build_label_encoder(cfg.taxonomy_csv)
    classes = le.classes_.tolist()

    model = load_model(cfg, num_classes)

    # Prepare test files and dataset
    audio_paths = sorted(Path(cfg.test_soundscapes).glob('*.ogg'))
    logging.info(f'Found {len(audio_paths)} test soundscape files')
    ds_inf = to_tf_inference_dataset(audio_paths, cfg)

    all_row_ids = []
    all_preds   = []

    # Iterate batches
    for specs_batch, ids_batch in tqdm(ds_inf.as_numpy_iterator(), desc='Inference'):  # specs_batch: (B,H,W,1), ids_batch: (B,)
        # Run model
        logits = model(jnp.array(specs_batch))
        probs = jax.nn.sigmoid(logits)
        probs_np = np.array(probs)
        # Collect
        for rid, p in zip(ids_batch, probs_np):
            # rid from tf may be bytes
            if isinstance(rid, bytes):
                rid = rid.decode('utf-8')
            all_row_ids.append(rid)
            all_preds.append(p)

    preds_arr = np.stack(all_preds, axis=0)
    # Build DataFrame
    df = pd.DataFrame(preds_arr, columns=classes)
    df.insert(0, 'row_id', all_row_ids)

    # Align with submission template
    template = pd.read_csv(cfg.submission_csv)
    df = (
        df.set_index('row_id')
          .reindex(template['row_id'])
          .fillna(0.0)
          .reset_index()
    )
    # Ensure all species columns
    for col in template.columns[1:]:
        if col not in df.columns:
            df[col] = 0.0
    df = df[template.columns]

    # Save
    df.to_csv('submission.csv', index=False)
    logging.info('Saved submission.csv')
    return df

In [10]:
if __name__ == '__main__':
    cfg = CFG()
    submission_df = inference(cfg)
    print(submission_df.head())

2025-04-28 00:39:52.102666: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Inference: 0it [00:00, ?it/s]

ValueError: need at least one array to stack