# Deep learning algorithms to classify audio (EfficientNet B2 + Two-stage fine-tuning)

In [12]:
import gc
import glob
import logging
import os
import random
import re
import sys
import time
import warnings
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    roc_auc_score,
)
from sklearn.model_selection import (
    RandomizedSearchCV,
    StratifiedKFold,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models
from tensorflow.keras.applications import EfficientNetB2, ResNet50
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications.efficientnet import preprocess_input
from tqdm import tqdm

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.ERROR)

os.environ["CUDA_VISIBLE_DEVICES"] = ""

print(tf.__version__)
print(dir(tf.keras))

2.19.0
['DTypePolicy', 'FloatDTypePolicy', 'Function', 'Initializer', 'Input', 'InputSpec', 'KerasTensor', 'Layer', 'Loss', 'Metric', 'Model', 'Operation', 'Optimizer', 'Quantizer', 'Regularizer', 'RematScope', 'Sequential', 'StatelessScope', 'SymbolicScope', 'Variable', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', 'activations', 'applications', 'backend', 'callbacks', 'config', 'constraints', 'datasets', 'device', 'distribution', 'dtype_policies', 'export', 'initializers', 'layers', 'legacy', 'losses', 'metrics', 'mixed_precision', 'models', 'name_scope', 'ops', 'optimizers', 'preprocessing', 'quantizers', 'random', 'regularizers', 'remat', 'tree', 'utils', 'version', 'visualization', 'wrappers']


## BirdCLEF 2025: EfficientNet B2 + Two-stage fine-tuning Multi-label Classification Approach

### Overview

To improve model performance beyond the ResNet50 baseline, we introduced **EfficientNet B2** as the feature extractor. EfficientNet models are known for their compound scaling of depth, width, and resolution, offering higher performance with fewer parameters.

Instead of training the entire network from scratch, we adopt a **two-stage fine-tuning strategy** to leverage pretrained weights from ImageNet and prevent overfitting on our limited dataset.

---

### Stage 1: Freeze the base model

- Load `EfficientNetB2` with `weights='imagenet'` and `include_top=False`
- **Freeze all layers** in the base model (i.e., `base_model.trainable = False`)
- Only train the **newly added top layers**:
  - GlobalAveragePooling → Dropout → Dense(206, activation='sigmoid')
- Use `BinaryCrossentropy` with `label_smoothing` to stabilize training

### Stage 2: Unfreeze and fine-tune

- After the top layers are warm-started, **unfreeze the last N layers** of EfficientNet
- Recompile the model with a **lower learning rate** (e.g., 1e-5)
- Continue training for another 5–10 epochs

---

### Why EfficientNet B2?

| Model       | Params | Accuracy | Speed | Notes                |
|-------------|--------|----------|-------|----------------------|
| ResNet50    | 24M    | High     | OK    | Strong baseline      |
| **EffNetB2**| 8M     | High+    | Fast  | Higher AUC with less |
| EffNetB3    | 12M    | Higher   | Slower| GPU memory ↑         |

EfficientNet-B2 offers **better performance-per-parameter ratio** than ResNet50, making it a solid upgrade path.

In [2]:
# -------------------------
# 1) Load train_data.npy
# -------------------------
# Content example:
# data_dict[fid] = {
# 'data': (128,256) Mel spectrum,
# 'label': 'Name of a species'
# }
# -------------------------
data_dict = np.load('dataset/train_data.npy', allow_pickle=True).item()

X_list = []
y_list = []

all_labels_set = set()

for fid, content in data_dict.items():
    mel_2d = content['data']             # shape=(128,256)
    label_str = content['label']         # 'species_xxx'

    X_list.append(mel_2d)
    y_list.append(label_str)
    all_labels_set.add(label_str)

X_array = np.array(X_list, dtype=np.float32)    # shape=(N,128,256)
y_array = np.array(y_list)                      # shape=(N,)

all_labels = sorted(list(all_labels_set))
label_to_idx = {lb: i for i, lb in enumerate(all_labels)}
num_species = len(all_labels)

print("Number of samples:", X_array.shape[0])
print("Mel shape: (128,256)")
print("Number of unique species:", num_species)

Number of samples: 28564
Mel shape: (128,256)
Number of unique species: 206


In [30]:
# -----------------------------
# 2) Multi-label One-Hot: Only one position in each record is 1
# -----------------------------
Y_one_hot = np.zeros((len(y_array), num_species), dtype=np.float32)
for i, lb in enumerate(y_array):
    Y_one_hot[i, label_to_idx[lb]] = 1.0

# -----------------------------
# 3) Split training/validation set (80/20)
# -----------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X_array, Y_one_hot, 
    test_size=0.2, 
    random_state=42, 
    stratify=y_array # Stratify by string label
)

print("Train shape:", X_train.shape, y_train.shape)
print("Val shape:",   X_val.shape,   y_val.shape)

print("Global Mel range in X_array:")
print("  min =", X_array.min(), ", max =", X_array.max())
print("Train Mel range:")
print("  min =", X_train.min(), ", max =", X_train.max())

Train shape: (22851, 128, 256) (22851, 206)
Val shape: (5713, 128, 256) (5713, 206)
Global Mel range in X_array:
  min = 0.0 , max = 1.0
Train Mel range:
  min = 0.0 , max = 1.0


In [31]:
# -----------------------------
# 4) Dealing with data imbalance -> class_weight
# Since each record has only one label, we can count the number of times each label appears
# and assign values ​​in reverse proportion.
# -----------------------------
label_counts = Counter(y_array)
max_count = max(label_counts.values())
# Give higher weight to less common categories
class_weight = {}
for lb, freq in label_counts.items():
    idx = label_to_idx[lb]
    class_weight[idx] = max_count / freq

print("Class weight example:", list(class_weight.items())[:5])

Class weight example: [(110, 7.7952755905511815), (177, 6.470588235294118), (71, 3.1832797427652735), (30, 33.0), (47, 47.142857142857146)]


In [None]:
# -----------------------------
# 5) Build data pipeline + data augmentation
# Random flip/rotate (for images)
# -----------------------------
augment_layers = tf.keras.Sequential([
    layers.RandomRotation(0.05),
    layers.RandomZoom(height_factor=0.05)
])


def preprocess_fn(x, y, training=True):
    x = tf.expand_dims(x, -1)            # (128,256,1)
    x = tf.image.grayscale_to_rgb(x)     # (128,256,3)
    x = preprocess_input(x * 255.0)     
    if training:
        x = augment_layers(x, training=True)
    return x, y


batch_size = 32
train_ds = (tf.data.Dataset.from_tensor_slices((X_train, y_train))
            .shuffle(2048)
            .map(lambda x, y: preprocess_fn(x, y, True))
            .batch(batch_size)
            .prefetch(tf.data.AUTOTUNE))
val_ds = (tf.data.Dataset.from_tensor_slices((X_val, y_val))
            .map(lambda x, y: preprocess_fn(x, y, False))
            .batch(batch_size)
            .prefetch(tf.data.AUTOTUNE))

In [39]:
# -----------------------------
# 6) Build effnet_b2 (Keras)
# -----------------------------
def build_effnet_b2(input_shape=(128,256,3), num_classes=206):
    inp = layers.Input(shape=input_shape)
    base = EfficientNetB2(include_top=False, weights='imagenet')
    base.trainable = False                          # stage‑1 
    x = base(inp, training=False)
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Dropout(0.3)(x)
    out = layers.Dense(num_classes, activation='sigmoid')(x)
    return models.Model(inp, out, name='EffNetB2_BirdCLEF')


model = build_effnet_b2(num_classes=num_species)

loss_fn = tf.keras.losses.BinaryCrossentropy(
    from_logits=False,
    label_smoothing=0.0  # Smoothing
)

metrics = [
    tf.keras.metrics.AUC(name='auc', multi_label=True),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall')
]

model.compile(optimizer=Adam(3e-4), loss=loss_fn, metrics=metrics)

model.summary()

# Early Stop Callback
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=1e-6,
    verbose=1
)

In [40]:
# stage 1
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=20,
    callbacks=[early_stopping, lr_scheduler],
    class_weight=class_weight 
)

Epoch 1/20
[1m715/715[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m382s[0m 515ms/step - auc: 0.4591 - loss: 1.1254 - precision: 0.0054 - recall: 0.0323 - val_auc: 0.4593 - val_loss: 0.2789 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - learning_rate: 3.0000e-04
Epoch 2/20
[1m715/715[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m549s[0m 750ms/step - auc: 0.4589 - loss: 0.3202 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_auc: 0.4699 - val_loss: 0.1960 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - learning_rate: 3.0000e-04
Epoch 3/20
[1m715/715[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m735s[0m 991ms/step - auc: 0.4622 - loss: 0.2677 - precision: 0.0015 - recall: 2.1861e-06 - val_auc: 0.4822 - val_loss: 0.1542 - val_precision: 0.0417 - val_recall: 1.7504e-04 - learning_rate: 3.0000e-04
Epoch 4/20
[1m715/715[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 449ms/step - auc: 0.4777 - loss: 0.2492 - precision: 0.0636 - recall: 1.2172e-04 - val_auc:

In [41]:
# stage 2
gc.collect()
for layer in model.get_layer('efficientnetb2').layers[-100:]:
    layer.trainable = True

model.compile(optimizer=Adam(1e-5), loss=loss_fn, metrics=metrics)
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=20,
    callbacks=[early_stopping, lr_scheduler]
)

model.save("effnetb2_two_stage.h5")

Epoch 1/20
[1m715/715[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m498s[0m 665ms/step - auc: 0.5184 - loss: 0.0364 - precision: 0.0906 - recall: 7.7586e-04 - val_auc: 0.5526 - val_loss: 0.0311 - val_precision: 0.2475 - val_recall: 0.0044 - learning_rate: 1.0000e-05
Epoch 2/20
[1m715/715[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m420s[0m 588ms/step - auc: 0.5011 - loss: 0.0317 - precision: 0.1855 - recall: 9.6353e-04 - val_auc: 0.5996 - val_loss: 0.0282 - val_precision: 0.2421 - val_recall: 0.0135 - learning_rate: 1.0000e-05
Epoch 3/20
[1m715/715[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m421s[0m 588ms/step - auc: 0.5159 - loss: 0.0301 - precision: 0.1994 - recall: 0.0027 - val_auc: 0.6143 - val_loss: 0.0272 - val_precision: 0.2767 - val_recall: 0.0222 - learning_rate: 1.0000e-05
Epoch 4/20
[1m715/715[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m424s[0m 593ms/step - auc: 0.5231 - loss: 0.0295 - precision: 0.2640 - recall: 0.0038 - val_auc: 0.6282 - val_loss: 0.0268 -

In [42]:
results = model.evaluate(val_ds, verbose=1)
val_loss = results[0]
val_auc = results[1]
val_precision = results[2]
val_recall = results[3]

print(f"Validation Loss: {val_loss:.4f}")
print(f"Validation AUC: {val_auc:.4f}")
print(f"Precision: {val_precision:.4f}")
print(f"Recall: {val_recall:.4f}")

[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 352ms/step - auc: 0.5990 - loss: 0.0230 - precision: 0.5784 - recall: 0.0539
Validation Loss: 0.0230
Validation AUC: 0.6667
Precision: 0.5754
Recall: 0.0541
