# BirdCLEF+ 2025: CRNN with Precomputed Mel Spectrograms
This notebook implements a Convolutional Recurrent Neural Network (CRNN) for the BirdCLEF+ 2025 competition using precomputed mel spectrograms. This approach provides consistency with the CNN model and faster training by reusing the same features.

In [None]:
# Mount Google Drive; every path defined below points into the mounted drive
from google.colab import drive
drive.mount('/content/drive')
import os

# Define paths for Colab
# NOTE(review): DATA_PATH is not referenced by the code visible in this notebook export.
DATA_PATH = '/content/drive/MyDrive/birdclef-2025-data'
MODEL_SAVE_DIR = '/content/drive/MyDrive/birdclef-2025-models'
PRECOMPUTED_FEATURES_PATH = '/content/drive/MyDrive/bird_features_scratch_copy.npz'  # Path to precomputed features

# Create the checkpoint directory if it doesn't exist (no error when it already does)
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

# Install required packages (IPython shell magic: only valid inside a notebook cell)
!pip install -q librosa scikit-learn

# Imports
import random
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from sklearn.metrics import roc_auc_score

In [None]:
# Reproducibility and device setup
def set_seed(seed=42):
    """Seed every random number generator used by the notebook.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy and
            PyTorch (CPU generator and all CUDA devices).
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; this is a no-op
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning may select different kernels between runs, which
    # would undermine the deterministic flag above, so switch it off explicitly.
    torch.backends.cudnn.benchmark = False


set_seed()
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {DEVICE}')

## Data Loading
We'll load the precomputed mel spectrogram features from an `.npz` file.

In [None]:
def load_precomputed_features(local_path=PRECOMPUTED_FEATURES_PATH):
    """Load precomputed features and labels from an ``.npz`` archive.

    Args:
        local_path: Path of the archive. It must contain the arrays
            ``data`` (features) and ``labels``.

    Returns:
        A ``(X, y)`` tuple of NumPy arrays, or ``(None, None)`` when the file
        does not exist or cannot be read (a message is printed in both cases).
    """
    # Check if file exists
    if not os.path.exists(local_path):
        print(f"ERROR: Precomputed features file not found at {local_path}")
        return None, None

    print(f"Loading precomputed features from {local_path}...")
    try:
        # np.load returns a lazy NpzFile that keeps the archive open; the
        # context manager closes it once both arrays have been read into memory.
        with np.load(local_path) as data:
            X = data['data']
            y = data['labels']
    except Exception as e:
        # Deliberately broad: this loader is best-effort and reports any
        # failure (corrupt archive, missing key, ...) instead of raising.
        print(f"Error loading precomputed features: {e}")
        return None, None

    print("Successfully loaded precomputed features!")
    print(f"Features shape: {X.shape}")
    print(f"Labels shape: {y.shape}")
    print(f"Number of unique classes: {len(np.unique(y))}")
    return X, y

# Load the precomputed features
X, y = load_precomputed_features()

# Pre-declare the split so that later cells can test these names against None
# when loading failed, instead of crashing with a NameError.
X_train = X_val = y_train = y_val = None

# Handle classes with only one sample (if any)
if X is not None and y is not None:
    label_counts = np.bincount(y)
    print(f"Number of classes with only one sample: {np.sum(label_counts == 1)}")

    if np.any(label_counts == 1):
        print("Handling classes with only one sample...")
        # A stratified split needs at least two samples per class, so drop
        # the singletons. The remaining label ids are left unchanged, which
        # means they may no longer form a contiguous 0..K-1 range.
        valid_indices = np.isin(y, np.where(label_counts >= 2)[0])
        X = X[valid_indices]
        y = y[valid_indices]
        print(f"Removed {np.sum(~valid_indices)} samples with singleton classes")

    print(f"Final feature shape: {X.shape}")
    print(f"Number of unique classes after filtering: {len(np.unique(y))}")

    # Split into training and validation sets (80/20, stratified by label)
    from sklearn.model_selection import train_test_split
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

## Dataset for Precomputed Features
Define a PyTorch dataset to work with our precomputed mel spectrograms.

In [None]:
class PrecomputedFeatureDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing in-memory spectrogram features with integer labels."""

    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        spectrogram = self.features[idx]
        # A 2-D sample has no channel axis yet: prepend one so the result is
        # 3-D. Samples of any other rank are passed through unchanged.
        if spectrogram.ndim == 2:
            spectrogram = spectrogram[None, :, :]
        feature_tensor = torch.tensor(spectrogram, dtype=torch.float32)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)
        return feature_tensor, label_tensor

## CRNN Model for Precomputed Features
Define our CRNN architecture to work with precomputed mel spectrograms.

In [None]:
class PrecomputedFeatureCRNN(nn.Module):
    """CRNN classifier that works directly on precomputed mel spectrograms.

    Pipeline: three Conv-BN-ReLU-MaxPool-Dropout blocks, a 2-layer
    bidirectional GRU that runs along the (pooled) width axis, additive
    attention pooling over the GRU outputs, and an MLP classification head.

    Args:
        num_classes: Number of output logits.
        gru_hidden_size: Hidden units per GRU direction.
        n_mels: Height of the input spectrogram (number of mel bins). The
            default of 128 reproduces the original hard-coded layout; any
            value >= 8 is accepted because the three 2x2 poolings divide the
            height by 8.

    Shapes:
        Input ``[batch, 1, n_mels, width]``; output ``[batch, num_classes]``
        (raw logits, no softmax applied).

    Raises:
        ValueError: If ``n_mels`` is smaller than 8.
    """

    def __init__(self, num_classes, gru_hidden_size=256, n_mels=128):
        super().__init__()

        if n_mels < 8:
            raise ValueError(
                f"n_mels must be at least 8 to survive three 2x2 poolings, got {n_mels}"
            )

        # Spectrograms are single-channel: [channels=1, height=n_mels, width]
        input_channels = 1

        # CNN feature extractor (simplified from BirdCNN)
        self.features = nn.Sequential(
            # Block 1
            nn.Conv2d(input_channels, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(0.2),

            # Block 2
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(0.3),

            # Block 3
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(0.4),
        )

        # Channels produced by the last conv block
        self.feature_dim = 256

        # Each of the three stride-2 poolings floors the height in half, so
        # the pooled height is n_mels // 8 (16 for the default 128 mel bins).
        pooled_height = n_mels // 8

        # The GRU consumes one pooled spectrogram column per time step; each
        # column is flattened to channels * pooled_height features.
        self.gru = nn.GRU(
            input_size=pooled_height * self.feature_dim,
            hidden_size=gru_hidden_size,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
            dropout=0.3
        )

        # Attention mechanism: one scalar score per time step
        self.attention = nn.Sequential(
            nn.Linear(gru_hidden_size * 2, 64),
            nn.Tanh(),
            nn.Linear(64, 1)
        )

        # Classification layers
        self.classifier = nn.Sequential(
            nn.Linear(gru_hidden_size * 2, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        """Return class logits for a batch shaped ``[batch, 1, n_mels, width]``."""
        batch_size = x.size(0)

        # CNN features: [batch, 256, n_mels // 8, width // 8]
        features = self.features(x)

        # Treat the pooled width as the time axis and flatten channels and
        # height into one feature vector per step:
        # [batch, steps, 256, height] -> [batch, steps, 256 * height]
        features = features.permute(0, 3, 1, 2)
        features = features.reshape(batch_size, features.size(1), -1)

        # Bidirectional GRU: [batch, steps, 2 * hidden_size]
        gru_out, _ = self.gru(features)

        # Softmax-normalised attention weights over time: [batch, 1, steps]
        attn_weights = self.attention(gru_out).squeeze(-1)
        attn_weights = F.softmax(attn_weights, dim=1).unsqueeze(1)

        # Attention-weighted sum of GRU outputs: [batch, 2 * hidden_size]
        context = torch.bmm(attn_weights, gru_out).squeeze(1)

        return self.classifier(context)

## Visualize Training Results Function
Define a function to visualize training metrics.

In [None]:
def plot_history(history):
    """Show loss, accuracy and validation ROC AUC curves in one row of three panels.

    Args:
        history: Mapping with the per-epoch lists ``train_loss``, ``val_loss``,
            ``train_acc``, ``val_acc`` and ``val_auc``.
    """
    # One entry per panel, left to right:
    # (title, y-axis label, [(history key, legend label), ...])
    panels = [
        ('Loss', 'Loss', [('train_loss', 'Train'), ('val_loss', 'Validation')]),
        ('Accuracy', 'Accuracy', [('train_acc', 'Train'), ('val_acc', 'Validation')]),
        ('ROC AUC', 'AUC', [('val_auc', 'Validation')]),
    ]

    plt.figure(figsize=(18, 5))

    for position, (title, y_label, curves) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        for key, legend_label in curves:
            plt.plot(history[key], label=legend_label)
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    plt.show()

## Training Function for Precomputed Features
Define the training and validation procedure for our CRNN model with precomputed features.

In [None]:
def _train_one_epoch(model, loader, criterion, optimizer, scaler, use_amp, desc):
    """Run one optimisation pass over ``loader``; return ``(mean_loss, accuracy)``."""
    model.train()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    for inputs, targets in tqdm(loader, desc=desc):
        inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)

        optimizer.zero_grad()

        # Forward pass (mixed precision when enabled)
        with autocast(enabled=use_amp):
            logits = model(inputs)
            loss = criterion(logits, targets)

        # Backward and optimize with gradient scaling (pass-through when the
        # scaler is disabled)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        batch = targets.size(0)
        loss_sum += loss.item() * batch
        n_correct += logits.argmax(dim=1).eq(targets).sum().item()
        n_seen += batch

    return loss_sum / n_seen, n_correct / n_seen


def _evaluate_epoch(model, loader, criterion, use_amp, desc):
    """Score ``loader`` without gradients.

    Returns ``(mean_loss, accuracy, probabilities, labels)`` where the last
    two are NumPy arrays covering every sample of the loader.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    prob_chunks, label_chunks = [], []

    with torch.no_grad():
        for inputs, targets in tqdm(loader, desc=desc):
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)

            with autocast(enabled=use_amp):
                logits = model(inputs)
                loss = criterion(logits, targets)

            batch = targets.size(0)
            loss_sum += loss.item() * batch
            n_correct += logits.argmax(dim=1).eq(targets).sum().item()
            n_seen += batch

            # Under autocast the logits may be half precision; take the
            # softmax in float32 so the scores ranked by the AUC are not
            # collapsed into ties by rounding.
            prob_chunks.append(F.softmax(logits.float(), dim=1).cpu().numpy())
            label_chunks.append(targets.cpu().numpy())

    return (
        loss_sum / n_seen,
        n_correct / n_seen,
        np.concatenate(prob_chunks),
        np.concatenate(label_chunks),
    )


def _macro_auc(probs, labels):
    """Macro ROC AUC over the classes that have at least one positive in ``labels``."""
    onehot = np.zeros((labels.shape[0], probs.shape[1]), dtype=np.int64)
    onehot[np.arange(labels.shape[0]), labels] = 1

    # Classes with positive examples
    present = onehot.sum(axis=0) > 0
    # Cast to a plain float so checkpoints/history hold no NumPy scalars.
    return float(roc_auc_score(onehot[:, present], probs[:, present], average='macro'))


def train_precomputed_crnn(y):
    """Train the CRNN on the precomputed features and keep the best checkpoint.

    The train/validation arrays are read from the notebook-level names
    ``X_train``, ``y_train``, ``X_val`` and ``y_val``.

    Args:
        y: Integer label array; only used to size the output layer.

    Returns:
        ``(model, label_mapping, history)`` where ``model`` carries the
        weights of the epoch with the best validation AUC, ``label_mapping``
        maps each class index ``i`` to ``"class_{i}"`` and ``history`` holds
        the per-epoch metric lists. ``(None, None, None)`` is returned when
        the features were not loaded.
    """
    if X_train is None or y_train is None:
        print("Precomputed features not loaded properly. Please check the path.")
        return None, None, None

    # Mixed precision and pinned host memory only make sense on a GPU.
    use_cuda = DEVICE.type == 'cuda'

    # Create datasets from precomputed features
    train_dataset = PrecomputedFeatureDataset(X_train, y_train)
    val_dataset = PrecomputedFeatureDataset(X_val, y_val)

    # Create data loaders with optimized batch sizes for A100
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=512,  # Larger batch size possible with precomputed features
        shuffle=True,
        num_workers=4,
        pin_memory=use_cuda
    )

    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=512,
        shuffle=False,
        num_workers=4,
        pin_memory=use_cuda
    )

    # Size the output layer by the largest label id, not by the count of
    # distinct labels: label ids are used directly as target indices, so a gap
    # in the id range would otherwise put targets out of bounds for the loss.
    num_classes = int(np.max(y)) + 1
    print(f"Number of classes: {num_classes}")

    # Initialize the PrecomputedFeatureCRNN model
    model = PrecomputedFeatureCRNN(
        num_classes=num_classes,
        gru_hidden_size=256
    ).to(DEVICE)

    # Set up loss function, optimizer, and scheduler
    criterion = nn.CrossEntropyLoss()

    # One parameter group per sub-module so learning rates can be tuned separately
    optimizer = optim.AdamW([
        {'params': model.features.parameters(), 'lr': 3e-4},
        {'params': model.gru.parameters(), 'lr': 3e-4},
        {'params': model.attention.parameters(), 'lr': 3e-4},
        {'params': model.classifier.parameters(), 'lr': 3e-4}
    ], weight_decay=0.01)

    # Learning rate scheduler based on validation AUC. The `verbose` argument
    # is deprecated in recent PyTorch releases, so reductions are reported
    # manually after each scheduler step instead.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=3
    )

    # Set up mixed precision training (disabled, i.e. a pass-through, on CPU)
    scaler = GradScaler(enabled=use_cuda)

    # Training parameters
    num_epochs = 50
    patience = 7
    wait = 0
    # Start below any reachable score so the first epoch always writes a
    # checkpoint; the post-training reload then never depends on a missing
    # or stale file.
    best_auc = float('-inf')
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': [], 'val_auc': []}
    model_name = "crnn_precomputed"
    best_model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_best.pt")

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")

        # Training phase
        train_loss, train_acc = _train_one_epoch(
            model, train_loader, criterion, optimizer, scaler, use_cuda,
            desc=f"Epoch {epoch+1} [Train]"
        )
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)

        # Validation phase
        val_loss, val_acc, val_probs, val_labels = _evaluate_epoch(
            model, val_loader, criterion, use_cuda,
            desc=f"Epoch {epoch+1} [Val]"
        )
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        # Calculate ROC-AUC; scikit-learn raises ValueError when a column
        # contains a single class only, in which case the previous value is reused.
        try:
            val_auc = _macro_auc(val_probs, val_labels)
        except ValueError as e:
            print(f"Error calculating AUC: {e}")
            val_auc = history['val_auc'][-1] if history['val_auc'] else 0
        else:
            print(f'Epoch {epoch+1}: train_loss={train_loss:.4f}, train_acc={train_acc:.4f}, '
                  f'val_loss={val_loss:.4f}, val_acc={val_acc:.4f}, val_auc={val_auc:.4f}')
        history['val_auc'].append(val_auc)

        # Update scheduler based on validation AUC and report any reduction
        previous_lrs = [group['lr'] for group in optimizer.param_groups]
        scheduler.step(val_auc)
        current_lrs = [group['lr'] for group in optimizer.param_groups]
        if current_lrs != previous_lrs:
            print(f"Reducing learning rate to {current_lrs[0]:.2e}")

        # Save best model
        if val_auc > best_auc:
            best_auc = val_auc
            torch.save({
                'model_state_dict': model.state_dict(),
                'epoch': epoch,
                'val_auc': val_auc,
            }, best_model_path)
            wait = 0
            print(f"New best model with val_auc={val_auc:.4f}")
        else:
            wait += 1
            if wait >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

    # Load best model (map_location keeps the reload valid on the current device)
    checkpoint = torch.load(best_model_path, map_location=DEVICE)
    model.load_state_dict(checkpoint['model_state_dict'])

    print(f"Training complete. Best validation AUC: {best_auc:.4f} at epoch {checkpoint['epoch']+1}")

    # Save final model in a format suitable for ensemble
    label_mapping = {i: f"class_{i}" for i in range(num_classes)}
    ensemble_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_ensemble.pth")
    torch.save({
        'model_state': model.state_dict(),
        'model_type': 'crnn_precomputed',
        # Extra keys so the architecture can be rebuilt from the file alone
        'num_classes': num_classes,
        'label_mapping': label_mapping,
        'input_params': {
            'sr': 32000,
            'n_mels': 128,
            'fmin': 20,
            'fmax': 16000,
            'n_fft': 1024,
            'hop_length': 512,
            'duration': 5
        },
        'history': history
    }, ensemble_path)

    print(f"Model saved for ensemble use at '{ensemble_path}'")

    return model, label_mapping, history

## Run Training
Execute the training process with precomputed features.

In [None]:
# Train CRNN using precomputed features from the .npz file
# `y` is only used to size the output layer; the train/validation arrays are
# read from the notebook-level X_train / y_train / X_val / y_val variables.
precomputed_model, precomputed_label_mapping, precomputed_history = train_precomputed_crnn(y)

# Plot the training history
# (all three results are None when the precomputed features were not loaded)
if precomputed_history is not None:
    plot_history(precomputed_history)

## Model Evaluation
Evaluate the trained model on the validation set.

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

def evaluate_precomputed_model(model):
    """Report validation accuracy, a top-20 confusion matrix and per-class scores.

    The validation data is read from the notebook-level ``X_val`` / ``y_val``.

    Args:
        model: Trained classifier already placed on ``DEVICE``; it is switched
            to eval mode and called on batches of validation features.

    Returns:
        None. Results are printed and the confusion matrix is shown as a plot.
    """
    val_dataset = PrecomputedFeatureDataset(X_val, y_val)
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=256,
        num_workers=2
    )

    model.eval()
    pred_chunks = []
    label_chunks = []

    with torch.no_grad():
        for inputs, targets in tqdm(val_loader):
            logits = model(inputs.to(DEVICE))
            pred_chunks.append(logits.argmax(1).cpu().numpy())
            label_chunks.append(targets.numpy())

    all_preds = np.concatenate(pred_chunks)
    all_labels = np.concatenate(label_chunks)

    # Calculate accuracy
    accuracy = (all_preds == all_labels).mean()
    print(f"Validation accuracy: {accuracy:.4f}")

    # Top 20 most frequent validation classes (ordered by ascending count)
    class_counts = np.bincount(y_val)
    top_classes = np.argsort(class_counts)[-20:]

    # Keep only the samples whose true label is one of the top classes
    mask = np.isin(all_labels, top_classes)

    # Passing `labels=` pins both the size and the row/column order of the
    # matrix to `top_classes`, so it always lines up with the tick labels
    # below. Predictions that fall outside the top classes are left out.
    cm = confusion_matrix(all_labels[mask], all_preds[mask], labels=top_classes)
    tick_labels = [f"Class {i}" for i in top_classes]
    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_labels,
                yticklabels=tick_labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix (Top 20 Classes)')
    plt.tight_layout()
    plt.show()

    # Classification report; zero_division=0 keeps the scores of classes that
    # were never predicted at 0 without emitting a warning for each of them.
    report = classification_report(
        all_labels, all_preds,
        output_dict=True,
        zero_division=0
    )

    # The report also contains aggregate rows; drop them so the rankings
    # below list real classes only.
    df_report = pd.DataFrame(report).transpose()
    per_class = df_report.drop(
        index=['accuracy', 'micro avg', 'macro avg', 'weighted avg'],
        errors='ignore'
    )
    top_performing = per_class.sort_values(by='f1-score', ascending=False).head(10)
    bottom_performing = per_class.sort_values(by='f1-score').head(10)

    print("Top 10 best predicted classes:")
    print(top_performing[['precision', 'recall', 'f1-score', 'support']])

    print("\nBottom 10 worst predicted classes:")
    print(bottom_performing[['precision', 'recall', 'f1-score', 'support']])

In [None]:
# Evaluate the trained model
# (skipped when training returned no model, i.e. the features were not loaded)
if precomputed_model is not None:
    evaluate_precomputed_model(precomputed_model)

## Conclusion

We've successfully implemented a CRNN model for the BirdCLEF+ 2025 competition using precomputed mel spectrograms. This approach provides several advantages:

1. **Consistency with CNN model**: Using the same precomputed features ensures both models see the same input representation
2. **Faster training**: No need to compute mel spectrograms on-the-fly
3. **Efficiency**: Allows for larger batch sizes and faster iteration
4. **Ensemble compatibility**: Sharing the same input features makes it easier to combine model predictions later

The CRNN model combines convolutional layers to extract spatial features from mel spectrograms with recurrent layers that capture temporal dynamics, enhanced with an attention mechanism. This architecture is well-suited for capturing both the frequency patterns and temporal evolution of bird calls, which is essential for accurate species classification.

This model now forms a key component of our ensemble approach for the BirdCLEF+ 2025 competition.