# BirdCLEF+ 2025: Audio Transformer Model

This notebook implements a transformer-based architecture for the BirdCLEF+ 2025 competition. The transformer model is designed to capture long-range context in audio data, particularly useful for detecting complex patterns like overlapping calls from multiple species.

As described in the project strategy, this transformer model provides a different inductive bias compared to CNNs (global self-attention vs. local convolution), enriching the ensemble approach.

In [None]:
# Install required packages
!pip install -q librosa torchlibrosa timm torchaudio einops

# Imports
import os, random, warnings, numpy as np, pandas as pd
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import librosa, librosa.display, soundfile as sf
import torch
import torch.nn as nn, torch.nn.functional as F, torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from einops import rearrange
from datetime import datetime

In [None]:
# Reproducibility and device setup
def set_seed(seed=42):
    """Seed every random number generator this notebook draws from.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy and PyTorch
            (CPU and every CUDA device).
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU rather than only the current
    # one, and is safe to call when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # The cuDNN autotuner may select different kernels from run to run, so it
    # is switched off alongside requesting deterministic kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
# Seed the RNGs before anything else in the notebook draws random numbers.
set_seed()

# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

print(f'Using device: {DEVICE}')
print(f'Current time: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')

## Data Loading
Load training metadata and define paths.

In [None]:
# Root folder of the competition data; adjust to match your environment.
DATA_PATH = '../birdclef-2025-data'

# Training metadata table.
train_df = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
print(f'Training samples: {len(train_df)}')

# Distinct primary labels present in the metadata.
species = train_df['primary_label'].unique()
print(f'Number of unique species: {len(species)}')

# Bar chart of the 20 most frequent primary labels.
plt.figure(figsize=(12, 5))
train_df['primary_label'].value_counts().head(20).plot.bar()
plt.title('Top 20 Species Distribution')
plt.xlabel('Species')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

# Last expression of the cell, so the notebook renders the first rows.
train_df.head()

## Feature Extraction
Define mel-spectrogram extraction functions. For transformers, we'll use a denser representation with more time frames to capture long-range dependencies.

In [None]:
def compute_melspec(audio, sr=32000, n_mels=128, fmin=20, fmax=16000, n_fft=1024, hop_length=256):
    """Return a mel-spectrogram of ``audio`` rescaled to the [0, 1] range.

    The power mel-spectrogram is converted to decibels relative to its own
    maximum and then min-max normalised. The default ``hop_length`` of 256
    gives a denser time axis than a hop of 512 would.

    Args:
        audio: Waveform passed to ``librosa.feature.melspectrogram`` as ``y``.
        sr: Sample rate of ``audio``.
        n_mels: Number of mel bands.
        fmin: Lowest mel filter frequency.
        fmax: Highest mel filter frequency.
        n_fft: FFT window length.
        hop_length: Hop between successive frames, in samples.

    Returns:
        The normalised spectrogram array.
    """
    analysis = dict(n_fft=n_fft, hop_length=hop_length, power=2.0)
    filterbank = dict(n_mels=n_mels, fmin=fmin, fmax=fmax)
    power_spec = librosa.feature.melspectrogram(y=audio, sr=sr, **filterbank, **analysis)
    log_spec = librosa.power_to_db(power_spec, ref=np.max)

    lowest = log_spec.min()
    highest = log_spec.max()
    # The small epsilon keeps the division finite for a constant spectrogram.
    return (log_spec - lowest) / (highest - lowest + 1e-6)

def audio_to_melspec(audio, sr=32000, duration=5, augment=False):
    """Convert a waveform into a fixed-length mel-spectrogram with a channel axis.

    The clip is first forced to exactly ``int(sr * duration)`` samples: longer
    input is cropped at a random offset, shorter input is zero-padded on both
    sides. Optional waveform augmentations are applied before the spectrogram
    is computed with ``compute_melspec``.

    Args:
        audio: Waveform. A 2-D array is averaged over its second axis first
            (assumes the ``(frames, channels)`` layout — confirm against the
            loader in use).
        sr: Sample rate of ``audio``.
        duration: Target clip length in seconds.
        augment: If True, apply a random circular time shift (up to 10 % of
            the clip), additive Gaussian noise and a random gain, then clip
            the samples to [-1, 1].

    Returns:
        The spectrogram from ``compute_melspec`` with a leading axis of size 1.
    """
    audio = np.asarray(audio)
    if audio.ndim > 1:
        # np.pad with a single (before, after) pair pads every axis, so
        # multi-channel input has to be reduced to one channel up front.
        audio = audio.mean(axis=1)

    target_len = int(sr * duration)

    # Handle audio length
    if len(audio) > target_len:
        # randint's upper bound is exclusive; +1 makes the last valid start
        # offset reachable.
        # NOTE(review): the crop is random even when augment=False, so
        # evaluation on clips longer than `duration` is not deterministic.
        start = np.random.randint(0, len(audio) - target_len + 1)
        audio = audio[start:start + target_len]
    else:
        # Pad if audio is shorter
        pad = target_len - len(audio)
        audio = np.pad(audio, (pad // 2, pad - pad // 2))

    # Apply augmentations if requested
    if augment:
        # Time shift (np.roll wraps the shifted-out samples around)
        shift_factor = np.random.uniform(-0.1, 0.1)
        shift = int(shift_factor * len(audio))
        audio = np.roll(audio, shift)

        # Add random noise
        noise_level = np.random.uniform(0, 0.01)
        noise = np.random.normal(0, noise_level, len(audio))
        audio = audio + noise

        # Random gain
        gain = np.random.uniform(0.8, 1.2)
        audio = audio * gain
        audio = np.clip(audio, -1, 1)

    # Convert to mel spectrogram
    melspec = compute_melspec(audio, sr=sr)
    return melspec[np.newaxis, :, :]  # add channel dimension

# Display a sample mel-spectrogram
def plot_melspec(melspec, title="Mel-Spectrogram", sr=32000, hop_length=256, fmin=20, fmax=16000):
    """Plot the first channel of a mel-spectrogram on time / mel axes.

    Args:
        melspec: Array whose first axis is the channel axis; only
            ``melspec[0]`` is drawn.
        title: Figure title.
        sr: Sample rate used to label the time axis.
        hop_length: Hop length used to label the time axis.
        fmin: Lowest frequency used to label the mel axis.
        fmax: Highest frequency used to label the mel axis.

    The defaults for ``sr``, ``hop_length``, ``fmin`` and ``fmax`` match the
    defaults of ``compute_melspec``.
    """
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(
        melspec[0],
        sr=sr,
        hop_length=hop_length,
        x_axis='time',
        y_axis='mel',
        fmin=fmin,
        fmax=fmax
    )
    # compute_melspec min-max normalises its output to [0, 1], so the colour
    # scale is a unitless level rather than decibels.
    plt.colorbar(format='%.2f', label='Normalized level')
    plt.title(title)
    plt.tight_layout()
    plt.show()

In [None]:
# Load and visualize a sample audio
# Uses the first row of the training metadata as the example recording.
sample_row = train_df.iloc[0]
sample_path = os.path.join(DATA_PATH, 'train_audio', sample_row.filename)
print(f"Loading sample: {sample_row.filename} (Species: {sample_row.primary_label})")

# `sr` is the sample rate reported by the file, not a fixed constant.
audio, sr = sf.read(sample_path)
# For clips longer than 5 s, audio_to_melspec takes a random 5 s crop, so the
# spectrogram does not necessarily line up with the waveform plotted below.
melspec = audio_to_melspec(audio, sr=sr)
plot_melspec(melspec, f"Mel-Spectrogram: {sample_row.primary_label}")

# Plot waveform
# Only the first five seconds (sr * 5 samples) are drawn.
plt.figure(figsize=(10, 2))
librosa.display.waveshow(audio[:sr*5], sr=sr)
plt.title(f"Waveform: {sample_row.primary_label}")
plt.tight_layout()
plt.show()

## Dataset and DataLoader
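Define a PyTorch dataset that loads each recording and returns its mel-spectrogram together with its label index. Splitting the spectrogram into patches happens later, inside the model's patch-embedding layer.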

In [None]:
class AudioTransformerDataset(Dataset):
    """Dataset yielding ``(mel-spectrogram, label index)`` pairs.

    Args:
        df: Metadata frame with ``filename`` and ``primary_label`` columns.
        path: Data root; audio is read from ``<path>/train_audio/<filename>``.
        sr: Target sample rate. Files stored at another rate are resampled.
        duration: Clip length in seconds, forwarded to ``audio_to_melspec``.
        augment: Whether ``audio_to_melspec`` applies waveform augmentation.
        labels: Optional explicit class list, used in the given order. Pass
            the same list to every split so that class indices agree between
            them. Defaults to the sorted unique labels found in ``df``.
    """

    def __init__(self, df, path, sr=32000, duration=5, augment=False, labels=None):
        self.df = df.reset_index(drop=True)
        self.path = path
        self.sr = sr
        self.duration = duration
        self.augment = augment

        # Create label mappings. Deriving them from `df` alone means a split
        # that lacks some class gets different indices, hence the override.
        if labels is None:
            self.labels = sorted(df['primary_label'].unique())
        else:
            self.labels = list(labels)
        self.label2idx = {label: idx for idx, label in enumerate(self.labels)}
        self.idx2label = {idx: label for idx, label in enumerate(self.labels)}

        print(f"Dataset initialized with {len(self.labels)} classes and {len(self.df)} samples")

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Load one recording and return ``(spectrogram tensor, label tensor)``."""
        row = self.df.iloc[idx]
        audio_path = os.path.join(self.path, 'train_audio', row.filename)

        # Load audio, keeping the file's own sample rate for the check below
        audio, file_sr = sf.read(audio_path)
        if audio.ndim > 1:
            # Multi-channel files come back 2-D; average down to one channel.
            audio = audio.mean(axis=1)
        if file_sr != self.sr:
            # The spectrogram settings assume `self.sr`; without resampling the
            # time and frequency axes would be silently mis-scaled.
            audio = librosa.resample(audio, orig_sr=file_sr, target_sr=self.sr)

        # Convert to mel spectrogram with augmentation
        melspec = audio_to_melspec(audio, sr=self.sr, duration=self.duration, augment=self.augment)

        # Get label
        label_idx = self.label2idx[row.primary_label]

        return torch.tensor(melspec, dtype=torch.float32), torch.tensor(label_idx)

## Audio Transformer Model Definition

Here we define the transformer-based architecture with self-attention mechanisms to capture long-range dependencies in audio data. The model follows a Vision Transformer (ViT) style but adapted for audio spectrograms.

In [None]:
class PatchEmbedding(nn.Module):
    """Cut a spectrogram into non-overlapping patches and embed each one.

    A convolution whose kernel and stride both equal the patch size performs
    the split and the linear projection in a single step.

    Args:
        patch_size: ``(height, width)`` of one patch.
        emb_dim: Size of each patch embedding.
        in_channels: Number of input channels.
        img_size: ``(height, width)`` of the expected input; used only to
            precompute ``num_patches``.
    """

    def __init__(self, patch_size=(16, 16), emb_dim=192, in_channels=1, img_size=(128, 626)):
        super().__init__()
        self.patch_size = patch_size
        # Partial patches at the border are dropped (floor division).
        rows = img_size[0] // patch_size[0]
        cols = img_size[1] // patch_size[1]
        self.num_patches = rows * cols
        self.proj = nn.Conv2d(in_channels, emb_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        """Map ``[B, C, H, W]`` input to a ``[B, num_patches, E]`` sequence."""
        grid = self.proj(x)                 # [B, E, H', W']
        tokens = grid.flatten(start_dim=2)  # [B, E, H' * W']
        return tokens.permute(0, 2, 1)      # [B, H' * W', E]

class TransformerEncoder(nn.Module):
    """One pre-norm transformer block: self-attention, then an MLP.

    Both sub-layers are wrapped in a residual connection and preceded by their
    own LayerNorm.

    Args:
        emb_dim: Token embedding size.
        num_heads: Number of attention heads.
        mlp_ratio: Hidden width of the MLP as a multiple of ``emb_dim``.
        dropout: Dropout probability used in attention and in the MLP.
    """

    def __init__(self, emb_dim=192, num_heads=8, mlp_ratio=4, dropout=0.1):
        super().__init__()
        hidden_dim = emb_dim * mlp_ratio
        self.norm1 = nn.LayerNorm(emb_dim)
        self.attn = nn.MultiheadAttention(emb_dim, num_heads, dropout=dropout, batch_first=True)
        self.norm2 = nn.LayerNorm(emb_dim)
        self.mlp = nn.Sequential(
            nn.Linear(emb_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, emb_dim),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        """Apply the block to a ``[B, seq_len, emb_dim]`` sequence."""
        normed = self.norm1(x)
        # Query, key and value are all the same normalised sequence; the
        # attention weights returned alongside the output are not needed.
        attended = self.attn(normed, normed, normed)[0]
        x = x + attended
        return x + self.mlp(self.norm2(x))

class AudioTransformer(nn.Module):
    """ViT-style classifier operating on mel-spectrogram patches.

    The input is split into 16x16 patches, a learnable class token is
    prepended, learnable position embeddings are added, and the sequence is
    passed through ``depth`` transformer blocks. The normalised class token is
    fed to a linear head that produces the class logits.

    Args:
        num_classes: Number of output logits.
        emb_dim: Token embedding size.
        depth: Number of transformer blocks.
        num_heads: Attention heads per block.
        mlp_ratio: MLP hidden width as a multiple of ``emb_dim``.
        dropout: Dropout probability (after the position embeddings and inside
            each block).
        img_size: ``(height, width)`` of the expected spectrogram; it fixes the
            length of the position-embedding table.
    """

    def __init__(self, num_classes, emb_dim=192, depth=12, num_heads=8,
                 mlp_ratio=4, dropout=0.1, img_size=(128, 626)):
        super().__init__()
        # Patch embedding
        self.patch_embed = PatchEmbedding(
            patch_size=(16, 16), emb_dim=emb_dim, in_channels=1, img_size=img_size
        )

        # Class token plus one position embedding per token (patches + class)
        seq_len = self.patch_embed.num_patches + 1
        self.cls_token = nn.Parameter(torch.zeros(1, 1, emb_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, seq_len, emb_dim))
        nn.init.trunc_normal_(self.pos_embed, std=0.02)
        self.pos_drop = nn.Dropout(dropout)

        # Stack of transformer blocks
        layers = []
        for _ in range(depth):
            layers.append(TransformerEncoder(emb_dim, num_heads, mlp_ratio, dropout))
        self.blocks = nn.ModuleList(layers)

        # Final normalisation and classification head
        self.norm = nn.LayerNorm(emb_dim)
        self.head = nn.Linear(emb_dim, num_classes)

        self._init_weights()

    def _init_weights(self):
        """Initialise the class token, then every sub-module via ``apply``."""
        nn.init.trunc_normal_(self.cls_token, std=0.02)
        self.apply(self._init_weights_)

    def _init_weights_(self, m):
        """Per-module initialiser: truncated-normal Linear, identity LayerNorm."""
        if isinstance(m, nn.LayerNorm):
            nn.init.zeros_(m.bias)
            nn.init.ones_(m.weight)
            return
        if not isinstance(m, nn.Linear):
            return
        nn.init.trunc_normal_(m.weight, std=0.02)
        if m.bias is not None:
            nn.init.zeros_(m.bias)

    def forward(self, x):
        """Return class logits ``[B, num_classes]`` for input ``[B, C, H, W]``."""
        batch = x.shape[0]
        tokens = self.patch_embed(x)  # [B, num_patches, emb_dim]

        # Prepend the class token to every sequence in the batch
        cls = self.cls_token.expand(batch, -1, -1)  # [B, 1, emb_dim]
        tokens = torch.cat((cls, tokens), dim=1)    # [B, num_patches + 1, emb_dim]

        # Position information, then dropout
        tokens = self.pos_drop(tokens + self.pos_embed)

        for block in self.blocks:
            tokens = block(tokens)

        # Classify from the normalised class token (sequence position 0)
        cls_out = self.norm(tokens)[:, 0]
        return self.head(cls_out)

In [None]:
# Initialize a small model to check the architecture
# A forward pass on random data verifies that the tensor shapes line up; with
# num_classes=10 and a batch of 2 the output should have shape [2, 10].
dummy_data = torch.randn(2, 1, 128, 626)  # Batch of 2, 1 channel, 128x626 spectrogram
small_model = AudioTransformer(num_classes=10, emb_dim=64, depth=2, num_heads=2)
output = small_model(dummy_data)
print(f"Input shape: {dummy_data.shape}")
print(f"Output shape: {output.shape}")
# Counts trainable parameters only (requires_grad=True).
print(f"Total parameters: {sum(p.numel() for p in small_model.parameters() if p.requires_grad):,}")

## Training and Validation
Define the training and validation loops. Training uses mixed precision for speed; validation runs without it and additionally reports the macro ROC-AUC.

In [None]:
def train_epoch(model, loader, criterion, optimizer, scheduler=None, scaler=None):
    """Train ``model`` for one pass over ``loader``.

    Mixed precision is used only when the global ``DEVICE`` is a CUDA device;
    on CPU the autocast context and the gradient scaler are created disabled.

    Args:
        model: Network to train; switched to training mode here.
        loader: Iterable of ``(inputs, targets)`` batches.
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Optional LR scheduler, stepped once at the end of the epoch.
        scaler: Optional ``GradScaler`` to reuse across epochs so that its loss
            scale is not reset every epoch. A fresh one is created if omitted.

    Returns:
        Tuple ``(mean loss per sample, accuracy as a fraction)``.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.train()
    use_amp = DEVICE.type == 'cuda'
    if scaler is None:
        scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    running_loss = 0.0
    correct = 0
    total = 0

    pbar = tqdm(loader)
    for x, y in pbar:
        x, y = x.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()

        # Use mixed precision for faster training (disabled off-GPU)
        with torch.cuda.amp.autocast(enabled=use_amp):
            outputs = model(x)
            loss = criterion(outputs, y)

        # Scale the loss, backpropagate and step; with a disabled scaler these
        # calls reduce to a plain backward() / optimizer.step().
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Weight the batch loss by batch size to get a per-sample mean later
        running_loss += loss.item() * x.size(0)
        _, predicted = outputs.max(1)
        total += y.size(0)
        correct += predicted.eq(y).sum().item()

        # Update progress bar
        pbar.set_description(f"Loss: {loss.item():.4f}, Acc: {100.*correct/total:.2f}%")

    if total == 0:
        # Fail with a clear message instead of a bare ZeroDivisionError below.
        raise ValueError("train_epoch received a loader that yielded no samples")

    if scheduler is not None:
        scheduler.step()

    return running_loss / total, correct / total

def validate(model, loader, criterion):
    """Evaluate ``model`` on ``loader`` without computing gradients.

    Args:
        model: Network to evaluate; switched to eval mode here.
        loader: Iterable of ``(inputs, targets)`` batches.
        criterion: Loss called as ``criterion(outputs, targets)``.

    Returns:
        Tuple ``(mean loss per sample, accuracy as a fraction, macro ROC-AUC)``.
        The AUC is computed one-vs-rest on softmax probabilities, restricted
        to classes that have at least one positive example in ``loader``.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    prob_chunks, target_chunks = [], []

    with torch.no_grad():
        for x, y in tqdm(loader):
            x = x.to(DEVICE)
            y = y.to(DEVICE)
            logits = model(x)
            batch_loss = criterion(logits, y)

            loss_sum += batch_loss.item() * x.size(0)
            n_seen += y.size(0)
            predicted = logits.max(1)[1]
            n_correct += predicted.eq(y).sum().item()

            # Keep probabilities and one-hot targets for the AUC below
            prob_chunks.append(F.softmax(logits, dim=1).cpu().numpy())
            target_chunks.append(F.one_hot(y, num_classes=logits.size(1)).cpu().numpy())

    probs = np.concatenate(prob_chunks)
    targets = np.concatenate(target_chunks)

    # Classes without any positive example are left out of the AUC
    from sklearn.metrics import roc_auc_score
    has_positive = targets.sum(0) > 0
    auc = roc_auc_score(targets[:, has_positive], probs[:, has_positive], average='macro')

    return loss_sum / n_seen, n_correct / n_seen, auc

## Main Training Routine
Prepare loaders, model, and train with learning rate scheduling and early stopping.

In [None]:
def main(train_epochs=30, batch_size=16, lr=3e-5):
    """Train the Audio Transformer on a stratified 80/20 split of ``train_df``.

    Uses AdamW with cosine LR annealing, keeps the checkpoint with the best
    validation AUC, and stops early after 5 epochs without improvement.
    Training curves are saved to ``audio_transformer_training.png``.

    Args:
        train_epochs: Maximum number of epochs.
        batch_size: Training batch size (validation uses twice this).
        lr: Initial learning rate for AdamW.

    Returns:
        Tuple ``(model, train_dataset)``: the model with the best checkpoint
        loaded (when one was written during this call), and the training
        dataset whose ``labels`` define the class order.
    """
    print("Starting training process...")

    # Prepare data with stratified split
    from sklearn.model_selection import train_test_split
    train_data, val_data = train_test_split(
        train_df,
        test_size=0.2,
        stratify=train_df.primary_label,
        random_state=42
    )

    print(f"Training samples: {len(train_data)}, Validation samples: {len(val_data)}")

    # Create datasets
    train_dataset = AudioTransformerDataset(train_data, DATA_PATH, augment=True)
    val_dataset = AudioTransformerDataset(val_data, DATA_PATH, augment=False)

    # Each dataset derives its label->index map from its own rows, so a class
    # missing from one split would shift that split's indices. Give both
    # splits one mapping built from the full metadata.
    all_labels = sorted(train_df['primary_label'].unique())
    for dataset in (train_dataset, val_dataset):
        dataset.labels = all_labels
        dataset.label2idx = {label: idx for idx, label in enumerate(all_labels)}
        dataset.idx2label = {idx: label for idx, label in enumerate(all_labels)}

    # Create dataloaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
        pin_memory=True
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size*2,
        shuffle=False,
        num_workers=4,
        pin_memory=True
    )

    # Initialize the model
    num_classes = len(train_dataset.labels)
    model = AudioTransformer(
        num_classes=num_classes,
        emb_dim=192,       # Embedding dimension
        depth=8,           # Number of transformer layers (reduced for training speed)
        num_heads=8,       # Multi-head attention heads
        mlp_ratio=4,       # MLP hidden dim ratio
        dropout=0.2        # Dropout rate for regularization
    ).to(DEVICE)

    print(f"Model initialized with {sum(p.numel() for p in model.parameters() if p.requires_grad):,} parameters")

    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.05)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=train_epochs)

    # Training loop with early stopping.
    # Starting from -inf guarantees that the first finite AUC is checkpointed.
    best_auc = float('-inf')
    patience = 5
    wait = 0
    checkpoint_path = 'models/audio_transformer_best.pt'
    checkpoint_saved = False

    # Create directory for model checkpoints if it doesn't exist
    os.makedirs('models', exist_ok=True)

    # Track metrics
    train_losses = []
    train_accs = []
    val_losses = []
    val_accs = []
    val_aucs = []

    for epoch in range(train_epochs):
        print(f"\nEpoch {epoch+1}/{train_epochs}")

        # Train
        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, scheduler)
        train_losses.append(train_loss)
        train_accs.append(train_acc)

        # Validate
        val_loss, val_acc, val_auc = validate(model, val_loader, criterion)
        val_losses.append(val_loss)
        val_accs.append(val_acc)
        val_aucs.append(val_auc)

        print(f"Epoch {epoch+1} results:")
        print(f"  Train - Loss: {train_loss:.4f}, Acc: {train_acc:.4f}")
        print(f"  Valid - Loss: {val_loss:.4f}, Acc: {val_acc:.4f}, AUC: {val_auc:.4f}")

        # Check for improvement
        if val_auc > best_auc:
            best_auc = val_auc
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
            print(f"  Improved! New best AUC: {best_auc:.4f}")
            wait = 0
        else:
            wait += 1
            print(f"  No improvement for {wait} epochs.")
            if wait >= patience:
                print(f"Early stopping triggered after {epoch+1} epochs!")
                break

    # Load the best model. Only a checkpoint written by this call is loaded,
    # so a stale file from an earlier run is never picked up by accident.
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=DEVICE))

    # Plot training curves
    plt.figure(figsize=(15, 5))

    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Val Loss')
    plt.title('Loss Curves')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(train_accs, label='Train Acc')
    plt.plot(val_accs, label='Val Acc')
    plt.plot(val_aucs, label='Val AUC')
    plt.title('Accuracy & AUC Curves')
    plt.xlabel('Epoch')
    plt.ylabel('Score')
    plt.legend()

    plt.tight_layout()
    plt.savefig('audio_transformer_training.png')
    plt.show()

    print(f"Training complete! Best validation AUC: {best_auc:.4f}")
    return model, train_dataset

In [None]:
# Train the model (comment out during development if needed)
# Both results are notebook globals that later cells rely on:
# generate_submission() reads `train_dataset`, and `model` is passed to
# save_for_ensemble().
model, train_dataset = main(train_epochs=30, batch_size=16)

## Inference and Submission
Implement sliding-window inference on test soundscapes with overlapping windows for more robust predictions.

In [None]:
def predict_on_audio(model, audio_path, sr=32000, window_size=5, step_size=2.5):
    """Predict class probabilities for one audio file with a sliding window.

    The file is reduced to one channel, resampled to ``sr`` if it is stored at
    another rate, and cut into overlapping windows. Each window is converted to
    a mel-spectrogram and scored; the softmax outputs are averaged.

    Args:
        model: Trained network. The caller is responsible for putting it in
            eval mode and on ``DEVICE``.
        audio_path: Path of the audio file to score.
        sr: Sample rate the model expects.
        window_size: Window length in seconds.
        step_size: Hop between window starts in seconds.

    Returns:
        1-D NumPy array with the mean probability per class over all windows.
    """
    # Load audio; the file's own rate is needed to decide whether to resample
    audio, file_sr = sf.read(audio_path)
    if audio.ndim > 1:
        # Multi-channel files come back 2-D; average down to one channel.
        audio = audio.mean(axis=1)
    if file_sr != sr:
        audio = librosa.resample(audio, orig_sr=file_sr, target_sr=sr)

    # Calculate window and step sizes in samples
    window_samples = int(window_size * sr)
    step_samples = int(step_size * sr)

    # Mixed precision only helps (and is only supported here) on CUDA
    use_amp = DEVICE.type == 'cuda'

    all_preds = []

    # Create sliding windows. max(1, ...) guarantees one window even when the
    # file is shorter than a window.
    # NOTE(review): audio after the last full window is not scored.
    for start in range(0, max(1, len(audio) - window_samples + 1), step_samples):
        window = audio[start:start + window_samples]

        # If window is too short, pad it at the end
        if len(window) < window_samples:
            window = np.pad(window, (0, window_samples - len(window)))

        # Forward the window length so audio_to_melspec neither crops nor pads
        # the window when window_size differs from its 5 s default.
        melspec = audio_to_melspec(window, sr=sr, duration=window_size)
        melspec_tensor = torch.tensor(melspec, dtype=torch.float32).unsqueeze(0).to(DEVICE)

        with torch.no_grad():
            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs = model(melspec_tensor)
            # Softmax in float32 regardless of the autocast dtype
            probs = F.softmax(outputs.float(), dim=1).cpu().numpy()[0]

        all_preds.append(probs)

    # Average predictions
    avg_preds = np.mean(all_preds, axis=0)

    return avg_preds

def generate_submission():
    """Score the test soundscapes and write ``audio_transformer_submission.csv``.

    Reads the global ``train_dataset`` for the class list, loads the best
    checkpoint, runs ``predict_on_audio`` once per soundscape referenced in
    ``sample_submission.csv`` and writes one row per ``row_id``.

    Prediction columns are matched to the sample-submission columns by label
    name. If no name matches but the column count equals the class count, the
    columns are assigned by position.

    Returns:
        The submission DataFrame that was written to disk.

    Raises:
        ValueError: If the prediction columns can be matched to the sample
            submission neither by name nor by count.
    """
    print("Generating submission file...")

    # Load the best model
    labels = list(train_dataset.labels)
    num_classes = len(labels)
    model = AudioTransformer(
        num_classes=num_classes,
        emb_dim=192,
        depth=8,
        num_heads=8,
        mlp_ratio=4,
        dropout=0.0  # No dropout for inference
    ).to(DEVICE)
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only host
    model.load_state_dict(torch.load('models/audio_transformer_best.pt', map_location=DEVICE))
    model.eval()

    # Load sample submission to get format
    sample_submission = pd.read_csv(os.path.join(DATA_PATH, 'sample_submission.csv'))

    def _soundscape_id(row_id):
        # The soundscape id is the second '_'-separated field of the row id
        parts = row_id.split('_')
        return parts[1] if len(parts) >= 2 else None

    row_ids = sample_submission['row_id'].tolist()
    row_sids = [_soundscape_id(row_id) for row_id in row_ids]
    soundscape_ids = {sid for sid in row_sids if sid is not None}

    # Run inference once per soundscape
    soundscape_preds = {}
    for sid in tqdm(soundscape_ids):
        audio_path = os.path.join(DATA_PATH, 'test_soundscapes', f'soundscape_{sid}.ogg')
        if os.path.exists(audio_path):
            soundscape_preds[sid] = predict_on_audio(model, audio_path)
        else:
            print(f"Warning: File not found - {audio_path}")

    # One prediction row per submission row. Rows whose soundscape is missing
    # or whose id could not be parsed get zeros, so no row is ever dropped.
    # NOTE(review): every row of a soundscape receives the same file-level
    # average; confirm whether per-segment predictions are required.
    zero_row = np.zeros(num_classes)
    matrix = np.array(
        [soundscape_preds.get(sid, zero_row) for sid in row_sids], dtype=float
    ).reshape(len(row_sids), num_classes)

    label_names = [str(label) for label in labels]
    preds_df = pd.DataFrame(matrix, columns=label_names)

    # Align the prediction columns with the submission format by label name
    id_col = sample_submission.columns[0]
    target_cols = list(sample_submission.columns[1:])
    known = set(label_names)
    if any(col in known for col in target_cols):
        missing = [col for col in target_cols if col not in known]
        if missing:
            print(f"Warning: {len(missing)} submission columns have no trained class; filled with 0")
        preds_df = preds_df.reindex(columns=target_cols, fill_value=0.0)
    elif len(target_cols) == num_classes:
        # No names in common: fall back to positional assignment
        preds_df.columns = target_cols
    else:
        raise ValueError(
            f"Cannot align {num_classes} predicted classes with "
            f"{len(target_cols)} submission columns"
        )

    # Create submission dataframe
    preds_df.insert(0, id_col, row_ids)
    submission = preds_df

    # Save submission file
    submission.to_csv('audio_transformer_submission.csv', index=False)
    print("Submission file saved to 'audio_transformer_submission.csv'")

    return submission

In [None]:
# Generate submission file
# Relies on the `train_dataset` global from the training cell and on the
# checkpoint at models/audio_transformer_best.pt.
submission = generate_submission()

## Ensemble Integration
Export the model weights (as a PyTorch state dict and as an ONNX graph) for later ensemble integration.

In [None]:
def save_for_ensemble(model, export_path='ensemble_exports', model_name='audio_transformer'):
    """Export ``model`` as a PyTorch state dict and as an ONNX graph.

    Args:
        model: Model to export.
        export_path: Output directory; created if it does not exist.
        model_name: Base file name (without extension) for both exports.
    """
    os.makedirs(export_path, exist_ok=True)

    weights_file = f'{export_path}/{model_name}.pt'
    onnx_file = f"{export_path}/{model_name}.onnx"

    # Plain PyTorch weights
    torch.save(model.state_dict(), weights_file)

    # ONNX export (for faster inference), traced with one dummy spectrogram.
    # Only the batch dimension is declared dynamic.
    dummy_input = torch.randn(1, 1, 128, 626).to(DEVICE)
    batch_axis = {0: 'batch_size'}
    export_options = dict(
        export_params=True,
        opset_version=12,
        input_names=['input'],
        output_names=['output'],
        dynamic_axes={'input': batch_axis, 'output': dict(batch_axis)},
    )
    torch.onnx.export(model, dummy_input, onnx_file, **export_options)

    print(f"Model exported to {export_path}/{model_name}.pt and {model_name}.onnx")
    print("These files can now be used in the ensemble integration notebook.")

In [None]:
# Save model for ensemble integration
# `model` is the trained network returned by main() in the training cell.
save_for_ensemble(model)

## Conclusion

This notebook implements an Audio Transformer model for the BirdCLEF+ 2025 competition. The transformer architecture leverages self-attention mechanisms to capture long-range dependencies in audio data, which is particularly useful for identifying complex patterns like overlapping calls from multiple species.

Key aspects of this implementation:

1. **Transformer-based architecture**: Adapts Vision Transformer principles to audio spectrograms
2. **Patch embedding**: Splits spectrograms into patches for transformer processing
3. **Mixed precision training**: Uses FP16 for faster training
4. **Sliding window inference**: Processes longer audio files with overlapping windows
5. **Export to ensemble**: Ready for integration with other models in the ensemble

This model provides a different inductive bias from the CNN and CRNN models, enriching the ensemble by modelling global context in the audio through self-attention. The next step is to integrate this model with the others in the ensemble.