In [39]:
import os
import gc
import warnings
import logging
import time
import math
import cv2
from pathlib import Path

import numpy as np
import pandas as pd
import librosa
import torch
import torch.nn as nn
import torch.nn.functional as F
import timm
from tqdm.auto import tqdm
import torchvision
warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.ERROR)

import re
from torch.quantization import quantize_dynamic

In [40]:
class CFG:
    """Static configuration for the inference notebook (plain class attributes)."""

    # --- mel-spectrogram parameters (forwarded to librosa.feature.melspectrogram) ---
    FS = 32000                  # sample rate used when loading audio
    WINDOW_SIZE = 5             # segment length in seconds (FS * WINDOW_SIZE samples per segment)
    N_FFT = 2048
    HOP_LENGTH = 128
    N_MELS = 512
    FMIN = 20
    FMAX = 16000
    TARGET_SHAPE = (256, 256)   # spectrogram size fed to the network

    # --- model ---
    model_path = '/kaggle/input/effnet_v2_focal/pytorch/default/1'   # searched recursively for *.pt
    model_name = "tf_efficientnetv2_s.in21k_ft_in1k"                 # timm architecture name
    in_channels = 1
    device = 'cpu'
    use_specific_folds = False  # when True, only checkpoints whose path contains "fold<k>" are kept
    folds = [0, 1, 2, 3]

    # --- competition data ---
    test_soundscapes = '/kaggle/input/birdclef-2025/test_soundscapes'
    submission_csv = '/kaggle/input/birdclef-2025/sample_submission.csv'
    taxonomy_csv = '/kaggle/input/birdclef-2025/taxonomy.csv'

    # --- inference options ---
    batch_size = 16             # not referenced by the code visible in this notebook
    use_tta = False
    tta_count = 3               # number of TTA variants evaluated when use_tta is True
    threshold = 0.5             # not referenced by the code visible in this notebook

    # --- debugging ---
    debug = False               # when True, only the first `debug_count` soundscapes are processed
    debug_count = 3


cfg = CFG()

In [41]:
# Report the execution device, then derive the class list from the taxonomy file.
print(f"Using device: {cfg.device}")
print("Loading taxonomy data...")

taxonomy_df = pd.read_csv(cfg.taxonomy_csv)

# The order of `primary_label` defines the class index order used everywhere below.
species_ids = taxonomy_df.loc[:, 'primary_label'].tolist()
num_classes = len(species_ids)

print(f"Number of classes: {num_classes}")

Using device: cpu
Loading taxonomy data...
Number of classes: 206


In [42]:
class BirdCLEFModel(nn.Module):
    """timm backbone with its own classifier removed, followed by a linear head.

    Args:
        cfg: configuration object; `model_name` and `in_channels` are read here.
        num_classes: number of output logits.
    """

    def __init__(self, cfg, num_classes):
        super().__init__()
        self.cfg = cfg

        # No pretrained download and no stochastic regularisation: weights are
        # expected to come from a checkpoint loaded by the caller.
        backbone_kwargs = dict(
            pretrained=False,
            in_chans=cfg.in_channels,
            drop_rate=0.0,
            drop_path_rate=0.0,
        )
        self.backbone = timm.create_model(cfg.model_name, **backbone_kwargs)

        # Remember the width of the backbone's own head before replacing it.
        embed_dim = self.backbone.classifier.in_features
        self.backbone.classifier = nn.Identity()

        self.pooling = nn.AdaptiveAvgPool2d(1)
        self.feat_dim = embed_dim
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return raw logits (no sigmoid applied) for input batch `x`."""
        feats = self.backbone(x)

        # Some backbones hand back a dict; the tensor of interest is under 'features'.
        if isinstance(feats, dict):
            feats = feats['features']

        # A 4-D feature map still needs global pooling; anything else is used as is.
        if feats.dim() == 4:
            feats = self.pooling(feats).flatten(1)

        return self.classifier(feats)

In [43]:
def audio2melspec(audio_data, cfg):
    """Convert a 1-D audio buffer to a min-max normalised log-mel spectrogram.

    Args:
        audio_data: array-like of audio samples.
        cfg: configuration providing FS, N_FFT, HOP_LENGTH, N_MELS, FMIN, FMAX.

    Returns:
        2-D array with `cfg.N_MELS` rows, scaled to the range [0, 1].
    """
    audio_data = np.asarray(audio_data)

    # Replace every non-finite sample (NaN and +/-inf) with the mean of the
    # finite ones. Falling back to 0.0 covers the all-non-finite case, where a
    # plain nanmean would itself be NaN and leave the buffer unrepaired.
    finite_mask = np.isfinite(audio_data)
    if not finite_mask.all():
        fill_value = float(audio_data[finite_mask].mean()) if finite_mask.any() else 0.0
        audio_data = np.where(finite_mask, audio_data, fill_value).astype(
            audio_data.dtype, copy=False
        )

    mel_spec = librosa.feature.melspectrogram(
        y=audio_data,
        sr=cfg.FS,
        n_fft=cfg.N_FFT,
        hop_length=cfg.HOP_LENGTH,
        n_mels=cfg.N_MELS,
        fmin=cfg.FMIN,
        fmax=cfg.FMAX,
        power=2.0,
        pad_mode="reflect",
        norm='slaney',
        htk=True,
        center=True,
    )

    # dB relative to the loudest bin of this segment, then rescale to [0, 1];
    # the epsilon keeps a constant spectrogram from dividing by zero.
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    db_min = mel_spec_db.min()
    db_max = mel_spec_db.max()
    mel_spec_norm = (mel_spec_db - db_min) / (db_max - db_min + 1e-8)

    return mel_spec_norm

def process_audio_segment(audio_data, cfg):
    """Turn one audio segment into a float32 mel spectrogram of `cfg.TARGET_SHAPE`.

    Args:
        audio_data: 1-D array of samples; zero-padded at the end when shorter
            than `cfg.FS * cfg.WINDOW_SIZE`.
        cfg: configuration providing FS, WINDOW_SIZE, TARGET_SHAPE and the
            mel parameters used by `audio2melspec`.

    Returns:
        float32 array whose `.shape` equals `tuple(cfg.TARGET_SHAPE)`.
    """
    window_samples = cfg.FS * cfg.WINDOW_SIZE
    shortfall = window_samples - len(audio_data)
    if shortfall > 0:
        audio_data = np.pad(audio_data, (0, shortfall), mode='constant')

    mel_spec = audio2melspec(audio_data, cfg)

    # TARGET_SHAPE is an array shape, i.e. (rows, cols). cv2.resize expects
    # dsize as (width, height), so the two values must be swapped; passing the
    # shape through unchanged only works for square targets.
    target_rows, target_cols = cfg.TARGET_SHAPE
    if mel_spec.shape != (target_rows, target_cols):
        mel_spec = cv2.resize(
            mel_spec, (target_cols, target_rows), interpolation=cv2.INTER_LINEAR
        )

    return mel_spec.astype(np.float32)

In [44]:
def find_model_files(cfg):
    """Return all `*.pt` files under `cfg.model_path` (recursive), as sorted strings.

    Sorting matters because `Path.glob` yields entries in arbitrary order;
    a fixed order keeps the ensemble (and any logs) reproducible between runs.
    """
    model_dir = Path(cfg.model_path)
    return sorted(str(p) for p in model_dir.glob("**/*.pt"))


def load_models(cfg, num_classes):
    """Load every checkpoint under `cfg.model_path` as a traced, frozen model.

    Args:
        cfg: configuration providing model_path, device, in_channels,
            TARGET_SHAPE, use_specific_folds and folds.
        num_classes: number of output logits of each network.

    Returns:
        List of `(model, weight)` tuples. `weight` is a score parsed from the
        checkpoint file name (first number with 2-4 decimals), or 1.0 when the
        name contains none. Checkpoints that fail to load are reported and
        skipped, so the list may be shorter than the number of files found.
    """
    models = []
    # Sorted so the ensemble order does not depend on filesystem listing order.
    model_files = sorted(find_model_files(cfg))

    if not model_files:
        print(f"[WARN] No .pt found under {cfg.model_path}")
        return models

    print(f"Found {len(model_files)} model files.")

    # ---------- optional: only keep specified folds ----------
    if cfg.use_specific_folds:
        # `(?!\d)` stops "fold1" from also selecting "fold10"; filtering in a
        # single pass also guarantees no checkpoint is listed twice.
        fold_pats = [re.compile(rf"fold{f}(?!\d)") for f in cfg.folds]
        model_files = [m for m in model_files if any(p.search(m) for p in fold_pats)]
        print(f"Using {len(model_files)} files for folds {cfg.folds}")

    # ---------- load each ckpt ----------
    score_pat = re.compile(r"(\d+\.\d{2,4})")      # number with two to four decimals
    device = torch.device(cfg.device)

    for mp in model_files:
        try:
            print(f"Loading {mp}")
            ckpt = torch.load(mp, map_location=device)

            net = BirdCLEFModel(cfg, num_classes)
            if isinstance(ckpt, dict) and "model_state_dict" in ckpt:
                net.load_state_dict(ckpt["model_state_dict"])
            else:
                # Non-strict load is kept for compatibility, but mismatches are
                # surfaced: silently missing keys would mean random weights.
                load_result = net.load_state_dict(ckpt, strict=False)
                if load_result.missing_keys or load_result.unexpected_keys:
                    print(f"[WARN] {mp}: {len(load_result.missing_keys)} missing / "
                          f"{len(load_result.unexpected_keys)} unexpected state-dict keys")
            net = net.to(device).eval()
            print(f'net loaded')

            # ---- TorchScript + freeze ----
            with torch.inference_mode():
                # Dummy input lives on the same device as the network.
                dummy = torch.rand(1, cfg.in_channels, *cfg.TARGET_SHAPE, device=device)
                scripted = torch.jit.trace(net, dummy)
                scripted = torch.jit.freeze(scripted)

            # ---- fp16 dynamic quantisation (Linear layers) ----
            # NOTE(review): quantize_dynamic is the eager-mode API that swaps
            # nn.Linear submodules; on an already traced + frozen module it
            # probably finds nothing to replace. Confirm, and if quantisation
            # is really wanted apply it to `net` before tracing.
            scripted = quantize_dynamic(
                scripted,
                {torch.nn.Linear},
                dtype=torch.float16
            )

            # ---- parse ckpt score as weight ----
            # Only the file name is searched, so numbers in directory names
            # (dataset versions etc.) cannot be mistaken for a score.
            m = score_pat.search(Path(mp).name)
            weight = float(m.group(1)) if m else 1.0   # fallback=1.0

            models.append((scripted, weight))

        except Exception as e:
            print(f"[ERR] {mp}: {e}")

    return models

In [45]:
def predict_on_spectrogram(audio_path, models, cfg, species_ids):
    """Run inference on one soundscape file; supports TTA and a weighted ensemble.

    The audio is cut into consecutive windows of `cfg.FS * cfg.WINDOW_SIZE`
    samples (a trailing partial window is ignored). For each window the logits
    of all models are combined, then passed through a sigmoid.

    Args:
        audio_path: path of the audio file.
        models: list of `(model, weight)` tuples as returned by `load_models`.
        cfg: configuration providing FS, WINDOW_SIZE, device, use_tta, tta_count.
        species_ids: unused here; kept for interface compatibility.

    Returns:
        `(row_ids, predictions)`: two lists of equal length. `row_ids[i]` is
        "<file stem>_<end second>" and `predictions[i]` is a 1-D array of
        per-class probabilities. If an error occurs, it is printed and only the
        windows completed before the error are returned.
    """
    predictions, row_ids = [], []
    soundscape_id = Path(audio_path).stem
    window_samples = cfg.FS * cfg.WINDOW_SIZE
    score_temperature = 50   # sharpening factor applied to checkpoint scores

    try:
        print(f"Processing {soundscape_id}")
        audio_data, _ = librosa.load(audio_path, sr=cfg.FS)
        total_segments = len(audio_data) // window_samples

        # --------- ensemble weights, computed once per file ---------
        # Softmax over (score * temperature). Subtracting the maximum first
        # leaves the result unchanged mathematically but prevents exp() from
        # overflowing to inf (and the weights from becoming NaN).
        ensemble_weights = None
        if len(models) > 1:
            scaled = np.array([w for _, w in models], dtype=np.float64) * score_temperature
            scaled -= scaled.max()
            exp_w = np.exp(scaled)
            ensemble_weights = (exp_w / exp_w.sum()).astype(np.float32)

        def _forward(mel):
            """Return raw logits of every model for one spectrogram, shape (N_models, C)."""
            # ascontiguousarray: flipped TTA views have negative strides, which
            # torch cannot convert directly.
            batch = torch.tensor(
                np.ascontiguousarray(mel), dtype=torch.float32
            ).unsqueeze(0).unsqueeze(0).to(cfg.device)
            with torch.no_grad():
                # No sigmoid here: averaging is done on logits.
                return np.vstack(
                    [mdl(batch).cpu().numpy().reshape(1, -1) for mdl, _ in models]
                )

        for seg_idx in range(total_segments):
            start = seg_idx * window_samples
            segment_audio = audio_data[start:start + window_samples]

            # The spectrogram does not depend on the TTA variant: build it once.
            mel = process_audio_segment(segment_audio, cfg)

            # ---------- make predictions ----------
            if cfg.use_tta:
                tta_stack = [_forward(apply_tta(mel, tta_i))
                             for tta_i in range(cfg.tta_count)]
                preds_raw = np.mean(tta_stack, axis=0)                 # (N, C)
            else:
                preds_raw = _forward(mel)                              # (N, C)

            # ---------- weighted logit average ----------
            if ensemble_weights is None:
                final_logits = preds_raw[0]
            else:
                final_logits = np.average(preds_raw, axis=0, weights=ensemble_weights)

            final_preds = 1 / (1 + np.exp(-final_logits))

            # Record the row id together with its prediction so both lists stay
            # aligned even if a later window raises.
            row_ids.append(f"{soundscape_id}_{(seg_idx+1)*cfg.WINDOW_SIZE}")
            predictions.append(final_preds)

    except Exception as e:
        print(f"[ERR] processing {audio_path}: {e}")

    return row_ids, predictions

In [46]:
def apply_tta(spec, tta_idx):
    """Return the test-time-augmentation variant `tta_idx` of a 2-D spectrogram.

    Variants:
        0 - the spectrogram itself (same object, no copy)
        1 - flipped along axis 1 (left-right)
        2 - flipped along axis 0 (up-down)
        any other index - the spectrogram itself

    Flipped variants are returned as contiguous copies: `np.flip` alone yields
    a negative-stride view, which PyTorch refuses to convert to a tensor.
    """
    if tta_idx == 1:
        return np.ascontiguousarray(np.flip(spec, axis=1))
    if tta_idx == 2:
        return np.ascontiguousarray(np.flip(spec, axis=0))
    return spec

In [47]:
def run_inference(cfg, models, species_ids):
    """Run inference on every `*.ogg` file in `cfg.test_soundscapes`.

    Args:
        cfg: configuration providing test_soundscapes, debug and debug_count.
        models: list of `(model, weight)` tuples passed through to
            `predict_on_spectrogram`.
        species_ids: passed through to `predict_on_spectrogram`.

    Returns:
        `(all_row_ids, all_predictions)`: two lists of equal length collecting
        the per-window results of all files.
    """
    # Sorted: glob order is arbitrary, which would make the debug subset (and
    # the row order of the submission) vary between runs.
    test_files = sorted(Path(cfg.test_soundscapes).glob('*.ogg'))

    if cfg.debug:
        print(f"Debug mode enabled, using only {cfg.debug_count} files")
        test_files = test_files[:cfg.debug_count]

    print(f"Found {len(test_files)} test soundscapes")

    all_row_ids = []
    all_predictions = []

    for audio_path in tqdm(test_files):
        row_ids, predictions = predict_on_spectrogram(str(audio_path), models, cfg, species_ids)

        # A file that failed part-way may report more row ids than predictions;
        # keep only the matched prefix so the two result lists never drift apart.
        n_matched = min(len(row_ids), len(predictions))
        if len(row_ids) != len(predictions):
            print(f"[WARN] {audio_path.name}: {len(row_ids)} row ids vs "
                  f"{len(predictions)} predictions; keeping {n_matched}")

        all_row_ids.extend(row_ids[:n_matched])
        all_predictions.extend(predictions[:n_matched])

    return all_row_ids, all_predictions

In [48]:
def create_submission(row_ids, predictions, species_ids, cfg):
    """Build the submission dataframe in the column order of the sample submission.

    Args:
        row_ids: list of row identifiers, one per prediction.
        predictions: list of 1-D arrays; entry `i` of each array belongs to
            `species_ids[i]`.
        species_ids: class names in model output order.
        cfg: configuration providing `submission_csv` (read with `row_id` as
            index; only its column names are used).

    Returns:
        DataFrame with a `row_id` column followed by the sample submission's
        columns. Columns the model does not predict are filled with 0.0;
        predicted species absent from the sample submission are dropped.

    Raises:
        ValueError: if the number of predictions differs from the number of
            row ids, or a prediction's length differs from `len(species_ids)`.
    """
    print("Creating submission dataframe...")

    n_species = len(species_ids)
    if len(row_ids) != len(predictions):
        raise ValueError(
            f"{len(row_ids)} row ids but {len(predictions)} predictions"
        )

    # One (rows x species) matrix instead of a Python loop per species.
    if len(predictions) > 0:
        pred_matrix = np.vstack([np.asarray(p).reshape(-1) for p in predictions])
    else:
        pred_matrix = np.empty((0, n_species), dtype=np.float32)

    if pred_matrix.shape[1] != n_species:
        raise ValueError(
            f"predictions have {pred_matrix.shape[1]} values per row, "
            f"expected {n_species}"
        )

    submission_df = pd.DataFrame(
        pred_matrix,
        index=pd.Index(row_ids, name='row_id'),
        columns=list(species_ids),
    )
    sample_sub = pd.read_csv(cfg.submission_csv, index_col='row_id')

    missing_cols = set(sample_sub.columns) - set(submission_df.columns)
    if missing_cols:
        print(f"Warning: Missing {len(missing_cols)} species columns in submission")

    # reindex both orders the columns like the sample submission and creates
    # the missing ones (filled with 0.0) in a single step.
    submission_df = submission_df.reindex(columns=sample_sub.columns, fill_value=0.0)

    return submission_df.reset_index()

In [49]:
def main():
    """Load the models, predict on all test soundscapes and write `submission.csv`.

    Reads the module-level `cfg`, `num_classes` and `species_ids`. Returns
    early, without writing a file, when no model could be loaded.
    """
    t_start = time.time()
    print("Starting BirdCLEF-2025 inference...")

    tta_variations = cfg.tta_count if cfg.use_tta else 0
    print(f"TTA enabled: {cfg.use_tta} (variations: {tta_variations})")

    models = load_models(cfg, num_classes)
    if not models:
        print("No models found! Please check model paths.")
        return

    if len(models) == 1:
        usage = 'Single model'
    else:
        usage = f'Ensemble of {len(models)} models'
    print(f"Model usage: {usage}")

    # Predict, assemble the table, and persist it.
    row_ids, predictions = run_inference(cfg, models, species_ids)
    submission_path = 'submission.csv'
    create_submission(row_ids, predictions, species_ids, cfg).to_csv(
        submission_path, index=False
    )
    print(f"Submission saved to {submission_path}")

    elapsed_minutes = (time.time() - t_start) / 60
    print(f"Inference completed in {elapsed_minutes:.2f} minutes")

In [50]:
# Entry point: run the full pipeline when this code is executed as the main
# module (the captured output below shows it running in this notebook).
if __name__ == "__main__":
    main()

Starting BirdCLEF-2025 inference...
TTA enabled: False (variations: 0)
Found 2 model files.
Loading /kaggle/input/effnet_v2_focal/pytorch/default/1/effnet_focal_best.pt
net loaded
Loading /kaggle/input/effnet_v2_focal/pytorch/default/1/effnet_focal_final.pt
net loaded
Model usage: Ensemble of 2 models
Found 0 test soundscapes


0it [00:00, ?it/s]

Creating submission dataframe...
Submission saved to submission.csv
Inference completed in 0.25 minutes
