In [3]:
# ============================================================
# 🐦 BirdCLEF 2023 dataset download via Kaggle API (Method B)
# Works on Windows / RunPod / Local Jupyter
# ============================================================
import os, sys, subprocess, glob, zipfile, pandas as pd, pathlib

# 1️⃣  Fill in your Kaggle credentials here
os.environ["KAGGLE_USERNAME"] = "vijaykalmani"
os.environ["KAGGLE_KEY"]      = "04ab696511da6beee0c5baf050229d57"   # 64-character token, not your password

# 2️⃣  Install Kaggle CLI if needed
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "kaggle"], check=True)

# 3️⃣  Create ~/.kaggle/kaggle.json automatically
home = pathlib.Path.home()
kaggle_dir = home / ".kaggle"
kaggle_dir.mkdir(exist_ok=True)
(kaggle_dir / "kaggle.json").write_text(
    f'{{"username":"{os.environ["KAGGLE_USERNAME"]}","key":"{os.environ["KAGGLE_KEY"]}"}}'
)
print(f"✅ Created credentials at: {kaggle_dir / 'kaggle.json'}")

# 4️⃣  Choose where to save BirdCLEF data
DEST = r"C:\birdclef_2023" if os.name == "nt" else "/workspace/birdclef_2023"
os.makedirs(DEST, exist_ok=True)
print(f"📁 Download directory: {DEST}")

# 5️⃣  Download all competition files
print("⬇️  Downloading BirdCLEF 2023 (this may take time)...")
subprocess.run(
    ["kaggle", "competitions", "download", "-c", "birdclef-2023", "-p", DEST, "--force"],
    check=True
)

# 6️⃣  Unzip and clean up
for zf in glob.glob(os.path.join(DEST, "*.zip")):
    print(f"📦 Extracting {os.path.basename(zf)} ...")
    with zipfile.ZipFile(zf, "r") as z:
        z.extractall(DEST)
    os.remove(zf)
print("✅ All ZIPs extracted and removed.")

# 7️⃣  Verify the data
meta_path = os.path.join(DEST, "train_metadata.csv")
if os.path.exists(meta_path):
    df = pd.read_csv(meta_path)
    print("✅ train_metadata.csv loaded:", df.shape)
    print(df.head(3))
else:
    print("ℹ️  train_metadata.csv not found — check that download succeeded.")


✅ Created credentials at: /root/.kaggle/kaggle.json
📁 Download directory: /workspace/birdclef_2023
⬇️  Downloading BirdCLEF 2023 (this may take time)...
Downloading birdclef-2023.zip to /workspace/birdclef_2023


100%|██████████| 4.91G/4.91G [00:21<00:00, 251MB/s]



📦 Extracting birdclef-2023.zip ...
✅ All ZIPs extracted and removed.
✅ train_metadata.csv loaded: (16941, 12)
  primary_label secondary_labels      type  latitude  longitude  \
0       abethr1               []  ['song']    4.3906    38.2788   
1       abethr1               []  ['call']   -2.9524    38.2921   
2       abethr1               []  ['song']   -2.9524    38.2921   

      scientific_name               common_name         author  \
0  Turdus tephronotus  African Bare-eyed Thrush  Rolf A. de By   
1  Turdus tephronotus  African Bare-eyed Thrush  James Bradley   
2  Turdus tephronotus  African Bare-eyed Thrush  James Bradley   

                                             license  rating  \
0  Creative Commons Attribution-NonCommercial-Sha...     4.0   
1  Creative Commons Attribution-NonCommercial-Sha...     3.5   
2  Creative Commons Attribution-NonCommercial-Sha...     3.5   

                                 url              filename  
0  https://www.xeno-canto.org/128013 

In [6]:
# ============================================================
# Cell 1 — Preprocessing: build & save log-mel segments
# Produces:
#   /workspace/birdclef_2023/out/features/X_logmel.npy  (object array of (128, SEG_FRAMES))
#   /workspace/birdclef_2023/out/features/labels.csv    (columns: species, filename)
# Run this only when you change preprocessing params or dataset scope.
# ============================================================

import os, glob, random, warnings
warnings.filterwarnings("ignore")

import numpy as np, pandas as pd, librosa
from tqdm import tqdm

# -------------------------
# PATHS (RunPod)
# -------------------------
ROOT_DATA = "/workspace/birdclef_2023"               # <— dataset root on RunPod
OUT_DIR   = "/workspace/birdclef_2023/out"           # <— save outputs under the same tree
os.makedirs(f"{OUT_DIR}/features", exist_ok=True)
os.makedirs(f"{OUT_DIR}/val_soundscapes", exist_ok=True)

meta_csv   = os.path.join(ROOT_DATA, "train_metadata.csv")
audio_base = os.path.join(ROOT_DATA, "train_audio")

assert os.path.isfile(meta_csv),  f"Missing {meta_csv}"
assert os.path.isdir(audio_base), f"Missing {audio_base}"

print(f"[ENV] ROOT_DATA = {ROOT_DATA}")
print(f"[ENV] OUT_DIR   = {OUT_DIR}")
print(f"[ENV] meta_csv  = {meta_csv}  | exists? {os.path.isfile(meta_csv)}")
print(f"[ENV] audio_dir = {audio_base} | exists? {os.path.isdir(audio_base)}")

# -------------------------
# PREPROCESSING PARAMS
# -------------------------
SR, N_MELS, HOP = 32000, 128, 512
SEG_DUR = 10.0
SEG_FRAMES = int(SR * SEG_DUR / HOP)  # = 625 for 32k/512 over 10s
N_SPECIES_QUICK = 15   # set to None to use full set

# -------------------------
# Helpers
# -------------------------
def audio_to_logmel(path, sr=SR, n_mels=N_MELS, hop=HOP):
    y, _ = librosa.load(path, sr=sr, mono=True, res_type="kaiser_fast")
    if y.size == 0: return None
    m = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, hop_length=hop, fmin=20, fmax=sr//2)
    x = librosa.power_to_db(m, ref=np.max).astype(np.float32)
    return np.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0)

def segment_logmel(logmel, seg_frames=SEG_FRAMES, overlap=0.5, pad_short=True):
    m = np.asarray(logmel, dtype=np.float32)
    if m.ndim != 2: return
    T = m.shape[1]
    if T <= 0: return
    if T < seg_frames:
        if pad_short:
            pad = seg_frames - T
            pad_right = np.flip(m[:, max(0, T-pad):T], axis=1) if T > 0 else np.zeros((m.shape[0], pad), dtype=np.float32)
            out = np.concatenate([m, pad_right], axis=1)[:, :seg_frames]
            yield out
        return
    step = max(1, int(seg_frames * (1 - overlap)))
    for s in range(0, T - seg_frames + 1, step):
        e = s + seg_frames
        yield m[:, s:e]

def _fix_to_shape(seg, target_h=N_MELS, target_w=SEG_FRAMES):
    """Ensure segment is (target_h, target_w); pad/crop safely if needed."""
    if not isinstance(seg, np.ndarray) or seg.ndim != 2:
        return None
    seg = np.asarray(seg, dtype=np.float32)
    h, w = seg.shape
    # Fix freq dimension
    if h != target_h:
        if h > target_h:
            seg = seg[:target_h, :]
        else:
            pad = np.zeros((target_h - h, w), dtype=np.float32)
            seg = np.concatenate([seg, pad], axis=0)
    # Fix time dimension
    if w != target_w:
        if w > target_w:
            seg = seg[:, :target_w]
        else:
            need = target_w - w
            ref = np.flip(seg[:, max(0, w-need):w], axis=1) if w > 0 else np.zeros((seg.shape[0], need), dtype=np.float32)
            seg = np.concatenate([seg, ref], axis=1)[:, :target_w]
    return np.nan_to_num(seg, nan=0.0, posinf=0.0, neginf=0.0)

# -------------------------
# Load metadata & (optional) species filter
# -------------------------
meta = pd.read_csv(meta_csv)

if N_SPECIES_QUICK is not None:
    keep_species = meta["primary_label"].value_counts().head(N_SPECIES_QUICK).index.tolist()
    meta = meta[meta["primary_label"].isin(keep_species)].reset_index(drop=True)

print(f"[INFO] Using {meta['primary_label'].nunique()} species and {len(meta)} files for this run.")
print(meta.head(3)[["primary_label", "filename"]])

# -------------------------
# Build features (tracks filename per segment)
# -------------------------
X_feats, Y_labels, filenames = [], [], []
bad_paths = 0

for _, row in tqdm(meta.iterrows(), total=len(meta), desc="Preprocess"):
    sp = str(row["primary_label"])
    fn = str(row["filename"])                  # e.g., "barswa/XC113914.ogg"
    fpath = os.path.join(audio_base, fn)       # filename already includes subfolder
    if not os.path.isfile(fpath):
        bad_paths += 1
        continue
    lm = audio_to_logmel(fpath)
    if lm is None:
        continue
    for seg in segment_logmel(lm, overlap=0.5, pad_short=True):
        seg_fixed = _fix_to_shape(seg)
        if seg_fixed is None:
            continue
        X_feats.append(seg_fixed)
        Y_labels.append(sp)
        filenames.append(fn)

# Filter malformed (paranoia)
ok = [i for i, s in enumerate(X_feats) if isinstance(s, np.ndarray) and s.shape == (N_MELS, SEG_FRAMES)]
if len(ok) != len(X_feats):
    dropped = len(X_feats) - len(ok)
    print(f"[WARN] Dropping {dropped} malformed segments.")
    X_feats   = [X_feats[i] for i in ok]
    Y_labels  = [Y_labels[i] for i in ok]
    filenames = [filenames[i] for i in ok]

# -------------------------
# Save features + labels (with filename)
# -------------------------
labels_df = pd.DataFrame({"species": Y_labels, "filename": filenames})

# Save robustly as object array (avoids broadcasting)
X_obj = np.empty(len(X_feats), dtype=object)
for i, seg in enumerate(X_feats):
    X_obj[i] = seg.astype(np.float32, copy=False)

np.save(f"{OUT_DIR}/features/X_logmel.npy", X_obj)    # load later with allow_pickle=True
labels_df.to_csv(f"{OUT_DIR}/features/labels.csv", index=False)

print(f"[INFO] Saved features: {len(X_feats)} | bad_paths: {bad_paths}")
print("[INFO] Top species:\n", labels_df["species"].value_counts().head(10))
print("✅ Preprocessing complete. You can skip this cell next runs.")


[ENV] ROOT_DATA = /workspace/birdclef_2023
[ENV] OUT_DIR   = /workspace/birdclef_2023/out
[ENV] meta_csv  = /workspace/birdclef_2023/train_metadata.csv  | exists? True
[ENV] audio_dir = /workspace/birdclef_2023/train_audio | exists? True
[INFO] Using 15 species and 6227 files for this run.
  primary_label             filename
0        barswa  barswa/XC113914.ogg
1        barswa  barswa/XC129647.ogg
2        barswa  barswa/XC132406.ogg


Preprocess: 100%|██████████| 6227/6227 [21:15<00:00,  4.88it/s]  


[WARN] Dropping 72 malformed segments.
[INFO] Saved features: 55062 | bad_paths: 0
[INFO] Top species:
 species
thrnig1    11614
wlwwar      6025
eubeat1     5596
hoopoe      4446
combuz1     4338
cohmar1     3574
barswa      3185
eaywag1     2723
comsan      2545
combul2     2493
Name: count, dtype: int64
✅ Preprocessing complete. You can skip this cell next runs.


In [None]:
!pip install -q numpy pandas librosa soundfile tqdm matplotlib scipy scikit-learn tensorflow


In [13]:
!pip install -q numpy pandas librosa soundfile tqdm matplotlib scipy scikit-learn "tensorflow[and-cuda]"


In [1]:
# ============================================================
# CELL 2 — Train + Evaluate (CRNN+ full stack + clip-level)
# ============================================================

import os, random, warnings, glob, json, gc, shutil, time
warnings.filterwarnings("ignore")
import numpy as np, pandas as pd
from collections import Counter

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import (
    classification_report, confusion_matrix, average_precision_score,
    accuracy_score, f1_score
)

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import mixed_precision

# -------------------------
# Paths (RunPod)
# -------------------------
# NOTE: Only paths changed to match your Cell 1 RunPod setup.
OUT_DIR = "/workspace/birdclef_2023/out"
FEAT_DIR = f"{OUT_DIR}/features"
SCAPE_DIR = f"{OUT_DIR}/val_soundscapes"
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(SCAPE_DIR, exist_ok=True)

CACHE_X = f"{FEAT_DIR}/X_logmel.npy"
CACHE_Y = f"{FEAT_DIR}/labels.csv"
META_JS = f"{FEAT_DIR}/meta.json"

assert os.path.isfile(CACHE_X), f"Missing {CACHE_X}"
assert os.path.isfile(CACHE_Y), f"Missing {CACHE_Y}"

# -------------------------
# Seed, GPU, Threads
# -------------------------
SEED = 42
np.random.seed(SEED); random.seed(SEED); tf.random.set_seed(SEED)
for g in tf.config.experimental.list_physical_devices('GPU'):
    try: tf.config.experimental.set_memory_growth(g, True)
    except: pass
try:
    tf.config.threading.set_intra_op_parallelism_threads(2)
    tf.config.threading.set_inter_op_parallelism_threads(2)
except: pass

mixed_precision.set_global_policy("float32")
print("[MP] Using float32 for stability.")

def float32_dense(units, **kw):
    kw.setdefault("dtype", "float32")
    return layers.Dense(units, **kw)

# -------------------------
# Load metadata + cache
# -------------------------
with open(META_JS) as f:
    meta = json.load(f)
SR, N_MELS, HOP = int(meta["SR"]), int(meta["N_MELS"]), int(meta["HOP"])
SEG_DUR, SEG_FRAMES = float(meta["SEG_DUR"]), int(meta["SEG_FRAMES"])
IMG_H, IMG_W = 224, 224
print(f"[META] SR={SR} N_MELS={N_MELS} HOP={HOP} SEG_DUR={SEG_DUR} -> SEG_FRAMES={SEG_FRAMES}")

labels_df = pd.read_csv(CACHE_Y)
# OLD (replace these two lines)
# X_LOGMEL = np.load(CACHE_X).astype(np.float32)
# X_LOGMEL = np.nan_to_num(X_LOGMEL, nan=0.0, posinf=0.0, neginf=0.0)

# NEW — robust loader for object .npy caches
def load_logmel(path, n_mels, seg_frames):
    try:
        arr = np.load(path, allow_pickle=False)
    except ValueError as e:
        if "Object arrays cannot be loaded" not in str(e):
            raise
        print("[LOAD] Detected object array; reloading with allow_pickle=True and stacking...")
        obj = np.load(path, allow_pickle=True)
        fixed = []
        for i, a in enumerate(obj):
            a = np.asarray(a, dtype=np.float32).squeeze()
            if a.ndim != 2:
                raise ValueError(f"Element {i} has shape {a.shape}, expected 2D.")
            # ensure mel dimension is n_mels
            if a.shape[0] == n_mels:
                pass
            elif a.shape[1] == n_mels:
                a = a.T
            else:
                raise ValueError(f"Element {i} has mel dim {a.shape}, expected N_MELS={n_mels}.")
            # pad or center-crop time axis to seg_frames
            t = a.shape[1]
            if t < seg_frames:
                pad = np.zeros((n_mels, seg_frames - t), dtype=np.float32)
                a = np.concatenate([a, pad], axis=1)
            elif t > seg_frames:
                start = (t - seg_frames) // 2
                a = a[:, start:start+seg_frames]
            fixed.append(a)
        arr = np.stack(fixed, axis=0)

    arr = np.asarray(arr, dtype=np.float32)
    return np.nan_to_num(arr, nan=0.0, posinf=0.0, neginf=0.0)

# use it here
X_LOGMEL = load_logmel(CACHE_X, N_MELS, SEG_FRAMES)


le = LabelEncoder()
y = le.fit_transform(labels_df["species"].astype(str))
num_classes = len(le.classes_)
print(f"[CACHE] Loaded {len(X_LOGMEL)} segments, {num_classes} species.")
print("[CHECK] class counts:", Counter(y))

# -------------------------
# Grouped split (leakage-free)
# -------------------------
def _pick_groups(df):
    cand = ["rec_id", "recording_id", "filename", "file", "path", "filepath"]
    for c in cand:
        if c in df.columns:
            g = df[c].astype(str)
            if c in ["path", "filepath"]:
                g = g.apply(lambda p: os.path.basename(p))
            return g.values, c
    raise KeyError("No suitable group column found (need rec_id/filename).")

groups, group_col = _pick_groups(labels_df)
print(f"[SPLIT] Grouping by '{group_col}'.")
gss = GroupShuffleSplit(n_splits=1, test_size=0.20, random_state=SEED)
tr_idx, val_idx = next(gss.split(X_LOGMEL, y, groups=groups))
assert len(set(groups[tr_idx]).intersection(set(groups[val_idx]))) == 0, "Group leakage!"
X_tr_raw, X_val_raw = X_LOGMEL[tr_idx], X_LOGMEL[val_idx]
y_tr, y_val = y[tr_idx], y[val_idx]
print(f"[SPLIT] Train={len(tr_idx)} Val={len(val_idx)}")

# -------------------------
# Run tag (avoid old shape restores)
# -------------------------
RUN_TAG = f"crnnplus_H{IMG_H}W{IMG_W}_C{num_classes}_v5"
RUN_DIR = f"{OUT_DIR}/{RUN_TAG}"
os.makedirs(RUN_DIR, exist_ok=True)

# -------------------------
# tf.data pipeline
# -------------------------
AUTOTUNE = tf.data.AUTOTUNE

def mel_to_img_tf(mel):
    mel = tf.where(tf.math.is_finite(mel), mel, tf.zeros_like(mel))
    vmin, vmax = tf.reduce_min(mel), tf.reduce_max(mel)
    mel = (mel - vmin) / tf.maximum(vmax - vmin, 1e-6)
    mel = tf.expand_dims(mel, -1)
    mel = tf.image.resize(mel, (IMG_H, IMG_W))
    mel = tf.tile(mel, [1, 1, 3])
    return tf.cast(mel, tf.float32)

def _map_fn(mel, label_oh):
    return mel_to_img_tf(mel), tf.cast(label_oh, tf.float32)

# MixUp WITHOUT tfp / tf.distributions
def _beta_from_gamma(batch, alpha=0.4):
    g1 = tf.random.gamma([batch], alpha, dtype=tf.float32)
    g2 = tf.random.gamma([batch], alpha, dtype=tf.float32)
    l = g1 / tf.maximum(g1 + g2, 1e-8)
    return tf.reshape(l, [batch, 1, 1, 1])  # for images

def mixup_batch(x, y, alpha=0.4):
    b = tf.shape(x)[0]
    l_img = _beta_from_gamma(b, alpha=alpha)
    l_lbl = tf.reshape(l_img, [b, 1])
    idx = tf.random.shuffle(tf.range(b))
    xm = x * l_img + tf.gather(x, idx) * (1.0 - l_img)
    ym = y * l_lbl + tf.gather(y, idx) * (1.0 - l_lbl)
    return xm, ym

def make_ds(X, y_oh, batch, shuffle=True, do_mixup=False):
    ds = tf.data.Dataset.from_tensor_slices((X, y_oh))
    if shuffle: ds = ds.shuffle(min(len(y_oh), 8192), seed=SEED)
    ds = ds.map(_map_fn, num_parallel_calls=AUTOTUNE)
    ds = ds.batch(batch, drop_remainder=do_mixup)
    if do_mixup: ds = ds.map(lambda a,b: mixup_batch(a,b), num_parallel_calls=AUTOTUNE)
    return ds.prefetch(AUTOTUNE)

# -------------------------
# One-hot + class weights
# -------------------------
y_tr_oh = tf.one_hot(y_tr, depth=num_classes)
y_val_oh = tf.one_hot(y_val, depth=num_classes)
cnt = Counter(y_tr.tolist())
maxc = max(cnt.values())
class_weights = {c: maxc/cnt[c] for c in cnt}

# -------------------------
# CRNN+ Model (Res + SE + MHSA)
# -------------------------
class SpecAugment(layers.Layer):
    def __init__(self, fm=16, tm=32, nf=2, nt=2, **kwargs):
        super().__init__(**kwargs)
        self.fm, self.tm, self.nf, self.nt = int(fm), int(tm), int(nf), int(nt)
        self.active = False
    def get_config(self):
        cfg = super().get_config()
        cfg.update({"fm": self.fm, "tm": self.tm, "nf": self.nf, "nt": self.nt, "active": self.active})
        return cfg
    def call(self,x,training=False):
        if not training or not self.active: return x
        B,H,W,C = tf.shape(x)[0], tf.shape(x)[1], tf.shape(x)[2], tf.shape(x)[3]
        def mask(axis_len, wmax, axis):
            w = tf.random.uniform([], 0, tf.maximum(wmax, 1), dtype=tf.int32)
            w = tf.minimum(w, axis_len-1)
            s = tf.random.uniform([], 0, tf.maximum(axis_len-w,1), dtype=tf.int32)
            if axis==1:
                return tf.concat([tf.ones([B,s,W,C]), tf.zeros([B,w,W,C]), tf.ones([B,H-s-w,W,C])], axis=1)
            else:
                return tf.concat([tf.ones([B,H,s,C]), tf.zeros([B,H,w,C]), tf.ones([B,H,W-s-w,C])], axis=2)
        y=x
        for _ in range(self.nf): y = y*mask(H, self.fm, 1)
        for _ in range(self.nt): y = y*mask(W, self.tm, 2)
        return y

def se_block(x,r=8):
    c=x.shape[-1]
    s=layers.GlobalAveragePooling2D()(x)
    s=layers.Dense(max(c//r,4),activation="relu")(s)
    s=layers.Dense(c,activation="sigmoid")(s)
    return layers.Multiply()([x, layers.Reshape((1,1,c))(s)])

def res_block(x,ch,pool=(2,2),drop=0.2):
    h=layers.Conv2D(ch,3,padding="same",use_bias=False)(x); h=layers.BatchNormalization()(h); h=layers.Activation("relu")(h)
    h=layers.Conv2D(ch,3,padding="same",use_bias=False)(h); h=layers.BatchNormalization()(h)
    if x.shape[-1]!=ch: x=layers.Conv2D(ch,1,padding="same",use_bias=False)(x); x=layers.BatchNormalization()(x)
    h=layers.Add()([x,h]); h=layers.Activation("relu")(h); h=se_block(h)
    h=layers.MaxPooling2D(pool)(h); h=layers.Dropout(drop)(h); return h

def build_crnn(n_classes):
    H3,W3,C3=IMG_H//8, IMG_W//4, 192
    inp=keras.Input(shape=(IMG_H,IMG_W,3))
    spec=SpecAugment(name="spec_augment")
    x=spec(inp)
    x=res_block(x,64,(2,2),0.15)
    x=res_block(x,128,(2,2),0.20)
    x=res_block(x,192,(2,1),0.30)
    x=layers.Permute((2,1,3))(x)      # (W3,H3,C3)
    x=layers.Reshape((W3,H3*C3))(x)   # (56, 28*192)
    x=layers.MultiHeadAttention(num_heads=4,key_dim=64,dropout=0.1)(x,x)
    x=layers.LayerNormalization()(x)
    x=layers.Bidirectional(layers.GRU(160,return_sequences=True))(x)
    x=layers.Bidirectional(layers.GRU(160,return_sequences=True))(x)
    a=layers.Dense(128,activation="tanh")(x)
    a=layers.Dense(1)(a)
    a=layers.Softmax(axis=1)(a)
    x=layers.Multiply()([x,a])
    x=layers.Lambda(lambda t: keras.ops.sum(t,1))(x)
    x=layers.Dropout(0.4)(x)
    out=float32_dense(n_classes,activation="softmax")(x)
    return keras.Model(inp,out)

model=build_crnn(num_classes)

# -------------------------
# Optimizer & LR schedule (serializable)
# -------------------------
try:
    register = keras.utils.register_keras_serializable
except AttributeError:
    from tensorflow.keras.utils import register_keras_serializable as register

@register(package="custom")
class WarmupCosine(keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, base_lr, warmup_steps, total_steps, name="WarmupCosine"):
        super().__init__()
        self.base_lr = float(base_lr)
        self.warmup_steps = int(warmup_steps)
        self.total_steps = int(total_steps)
        self.name = name
    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        warm = tf.cast(tf.maximum(1, self.warmup_steps), tf.float32)
        total = tf.cast(tf.maximum(self.warmup_steps+1, self.total_steps), tf.float32)
        warm_lr = self.base_lr * (step / warm)
        pct = (step - warm) / tf.maximum(1.0, total - warm)
        pct = tf.clip_by_value(pct, 0.0, 1.0)
        cos_lr = 0.5 * self.base_lr * (1.0 + tf.cos(np.pi * pct))
        return tf.where(step < warm, warm_lr, cos_lr)
    def get_config(self):
        return {
            "base_lr": self.base_lr,
            "warmup_steps": self.warmup_steps,
            "total_steps": self.total_steps,
            "name": self.name,
        }

batch_size = 64
steps_per_epoch = max(1, len(y_tr)//batch_size)
total_steps = steps_per_epoch * 40
warmup_steps = int(0.1 * total_steps)

sched = WarmupCosine(base_lr=2e-4, warmup_steps=warmup_steps, total_steps=total_steps)
opt = keras.optimizers.AdamW(learning_rate=sched, weight_decay=1e-5, clipnorm=1.0)
loss = keras.losses.CategoricalCrossentropy(label_smoothing=0.1)
model.compile(optimizer=opt, loss=loss, metrics=["accuracy"])

# -------------------------
# Callbacks
# -------------------------
class ToggleSpecAug(keras.callbacks.Callback):
    def on_epoch_begin(self, epoch, logs=None):
        if epoch>=2:
            try: self.model.get_layer("spec_augment").active=True
            except: pass

class LrLogger(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        lr_fn = self.model.optimizer.learning_rate
        try:
            lr_val = float(tf.keras.backend.get_value(lr_fn(self.model.optimizer.iterations)))
        except TypeError:
            lr_val = float(tf.keras.backend.get_value(lr_fn))
        print(f"[LR] epoch {epoch+1}: {lr_val:.6g}")

cb=[
    keras.callbacks.BackupAndRestore(backup_dir=f"{RUN_DIR}/backup_state"),
    keras.callbacks.EarlyStopping(monitor="val_accuracy",patience=6,restore_best_weights=True),
    keras.callbacks.ModelCheckpoint(f"{RUN_DIR}/best_crnn.keras",monitor="val_accuracy",save_best_only=True),
    keras.callbacks.CSVLogger(f"{RUN_DIR}/train_log.csv",append=True),
    ToggleSpecAug(),
    LrLogger(),
]

# -------------------------
# Training
# -------------------------
train_ds = make_ds(X_tr_raw, y_tr_oh, batch=batch_size, shuffle=True,  do_mixup=True)
val_ds   = make_ds(X_val_raw, y_val_oh, batch=128,       shuffle=False, do_mixup=False)

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=40,
    callbacks=cb,
    class_weight=class_weights,
    verbose=1
)

keras.models.save_model(model, f"{RUN_DIR}/model_crnn_final.keras")
print("[SAVED] model ->", f"{RUN_DIR}/model_crnn_final.keras")

# -------------------------
# Eval (patch-level)
# -------------------------
y_val_prob = model.predict(val_ds, verbose=0)
y_val_pred = y_val_prob.argmax(1)
print(classification_report(y_val, y_val_pred, target_names=[le.classes_[k] for k in sorted(np.unique(y_val))], digits=4))
print(f"[PATCH] Accuracy={accuracy_score(y_val,y_val_pred)*100:.2f}%  Macro-F1={f1_score(y_val,y_val_pred,average='macro'):.4f}")

# -------------------------
# Confusion matrix (top-30)
# -------------------------
top30 = [c for c,_ in Counter(y_val).most_common(30)]
cm = confusion_matrix(y_val, y_val_pred, labels=top30, normalize='true')
plt.figure(figsize=(10,10)); plt.imshow(cm, aspect='auto'); plt.title("Top-30 species — normalized confusion")
plt.xlabel("Predicted"); plt.ylabel("True"); plt.colorbar(); plt.tight_layout()
plt.savefig(f"{RUN_DIR}/val_confusion_top30.png", dpi=150); plt.close()
print("[SAVED] Confusion matrix ->", f"{RUN_DIR}/val_confusion_top30.png")

# =========================
# Clip-level evaluation (pool by filename/recording)
# =========================
val_groups = np.asarray(groups)[val_idx]
val_true   = y_val
val_prob   = y_val_prob

def group_mode(labels):
    labels = np.asarray(labels, dtype=np.int64)
    b = np.bincount(labels, minlength=num_classes)
    return int(b.argmax())

clip_rows = []
for g in np.unique(val_groups):
    sel = (val_groups == g)
    probs_g = val_prob[sel]                 # (k, C)
    labels_g = val_true[sel]                # (k,)
    true_clip = group_mode(labels_g)
    p_max  = probs_g.max(axis=0)
    p_mean = probs_g.mean(axis=0)
    pred_max  = int(p_max.argmax())
    pred_mean = int(p_mean.argmax())
    clip_rows.append({
        "group": g, "true": true_clip,
        "pred_max": pred_max, "pred_mean": pred_mean,
        "correct_max": int(pred_max == true_clip),
        "correct_mean": int(pred_mean == true_clip),
        "conf_max": float(p_max[pred_max]),
        "conf_mean": float(p_mean[pred_mean]),
    })

clip_df = pd.DataFrame(clip_rows)
y_true_clip = clip_df["true"].values
y_pred_clip_max  = clip_df["pred_max"].values
y_pred_clip_mean = clip_df["pred_mean"].values

acc_clip_max  = accuracy_score(y_true_clip, y_pred_clip_max)
acc_clip_mean = accuracy_score(y_true_clip, y_pred_clip_mean)
f1_clip_max   = f1_score(y_true_clip, y_pred_clip_max, average="macro")
f1_clip_mean  = f1_score(y_true_clip, y_pred_clip_mean, average="macro")

print("\n===== 🎧 Clip-level (grouped by", group_col, ") =====")
print(f"[CLIP][MAX ] Accuracy = {acc_clip_max*100:.2f}% | Macro-F1 = {f1_clip_max:.4f}")
print(f"[CLIP][MEAN] Accuracy = {acc_clip_mean*100:.2f}% | Macro-F1 = {f1_clip_mean:.4f}")

tnames_all = list(le.classes_)
print("\n[CLIP][MAX ] classification report:")
print(classification_report(y_true_clip, y_pred_clip_max, target_names=tnames_all, digits=4))
print("\n[CLIP][MEAN] classification report:")
print(classification_report(y_true_clip, y_pred_clip_mean, target_names=tnames_all, digits=4))

clip_out = clip_df.copy()
clip_out["true_species"] = [tnames_all[i] for i in clip_out["true"].values]
clip_out["pred_max_species"]  = [tnames_all[i] for i in clip_out["pred_max"].values]
clip_out["pred_mean_species"] = [tnames_all[i] for i in clip_out["pred_mean"].values]
clip_csv = f"{RUN_DIR}/val_clip_level_pooling.csv"
clip_out.to_csv(clip_csv, index=False)
print("[SAVED] Clip-level results ->", clip_csv)

cm_max = confusion_matrix(y_true_clip, y_pred_clip_max, normalize="true")
cm_mean = confusion_matrix(y_true_clip, y_pred_clip_mean, normalize="true")
plt.figure(figsize=(10,10)); plt.imshow(cm_max, aspect='auto'); plt.title("Clip-level (MAX) — normalized confusion")
plt.xlabel("Predicted"); plt.ylabel("True"); plt.colorbar(); plt.tight_layout()
plt.savefig(f"{RUN_DIR}/clip_confusion_max.png", dpi=150); plt.close()
print("[SAVED] Clip conf (MAX) ->", f"{RUN_DIR}/clip_confusion_max.png")
plt.figure(figsize=(10,10)); plt.imshow(cm_mean, aspect='auto'); plt.title("Clip-level (MEAN) — normalized confusion")
plt.xlabel("Predicted"); plt.ylabel("True"); plt.colorbar(); plt.tight_layout()
plt.savefig(f"{RUN_DIR}/clip_confusion_mean.png", dpi=150); plt.close()
print("[SAVED] Clip conf (MEAN) ->", f"{RUN_DIR}/clip_confusion_mean.png")


2025-10-04 08:52:36.159938: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[MP] Using float32 for stability.
[META] SR=32000 N_MELS=128 HOP=512 SEG_DUR=10.0 -> SEG_FRAMES=625
[LOAD] Detected object array; reloading with allow_pickle=True and stacking...
[CACHE] Loaded 55062 segments, 15 species.
[CHECK] class counts: Counter({np.int64(12): 11614, np.int64(13): 6025, np.int64(7): 5596, np.int64(9): 4446, np.int64(4): 4338, np.int64(2): 3574, np.int64(0): 3185, np.int64(6): 2723, np.int64(5): 2545, np.int64(3): 2493, np.int64(14): 2133, np.int64(1): 2004, np.int64(11): 1904, np.int64(10): 1553, np.int64(8): 929})
[SPLIT] Grouping by 'filename'.
[SPLIT] Train=42911 Val=12151


I0000 00:00:1759568237.460859    2080 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 79078 MB memory:  -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:41:00.0, compute capability: 8.0


Epoch 1/40


E0000 00:00:1759568288.002735    2080 meta_optimizer.cc:967] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inStatefulPartitionedCall/functional_1/dropout_1/stateless_dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer
2025-10-04 08:58:16.407892: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91002


[1m670/670[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 241ms/step - accuracy: 0.4392 - loss: 7.6187[LR] epoch 1: 5e-05
[1m670/670[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m225s[0m 278ms/step - accuracy: 0.6197 - loss: 5.7348 - val_accuracy: 0.5402 - val_loss: 1.8414
Epoch 2/40
[1m670/670[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 250ms/step - accuracy: 0.6899 - loss: 5.9260[LR] epoch 2: 0.0001
[1m670/670[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m178s[0m 265ms/step - accuracy: 0.7671 - loss: 4.7614 - val_accuracy: 0.6522 - val_loss: 1.4946
Epoch 3/40
[1m670/670[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 247ms/step - accuracy: 0.7374 - loss: 5.5047[LR] epoch 3: 0.00015
[1m670/670[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 262ms/step - accuracy: 0.7968 - loss: 4.5161 - val_accuracy: 0.6943 - val_loss: 1.3926
Epoch 4/40
[1m670/670[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 247ms/step - accuracy: 0.7732 - loss: 5.232

In [2]:
# ============================================================
# CELL — pcmAP (padded class-averaged mean average precision)
# ============================================================
# Requirements: assumes you already have these in memory from Cell 2:
#   RUN_DIR, le, num_classes, y_val (int class ids), y_val_prob (N_val x C),
#   groups (full array from labels_df), val_idx, and group_col string.
# If you only want patch-level pcmAP, you can ignore the clip section.

import numpy as np, pandas as pd
from sklearn.metrics import average_precision_score, accuracy_score, f1_score
import os

assert 'y_val' in globals() and 'y_val_prob' in globals(), "Run training/eval first to get y_val & y_val_prob."
assert 'le' in globals() and 'num_classes' in globals(), "Need label encoder and num_classes."
assert 'RUN_DIR' in globals(), "RUN_DIR not found."
os.makedirs(RUN_DIR, exist_ok=True)

# -------------------------
# Utilities
# -------------------------
def _padded_ap(y_true_bin: np.ndarray, y_score: np.ndarray) -> float:
    """
    Compute AP with conservative 'padding' to avoid undefined cases:
    - If a class has zero positives, append one positive with score 0 (and one negative with score 0).
    - If a class has all positives, append one negative with score 0.
    This keeps AP defined without inflating it.
    """
    y_true = y_true_bin.astype(np.int8).ravel()
    scores = y_score.astype(np.float32).ravel()
    pos = y_true.sum()
    if pos == 0:
        y_true = np.concatenate([y_true, [1, 0]])
        scores = np.concatenate([scores, [0.0, 0.0]])
    elif pos == len(y_true):
        y_true = np.concatenate([y_true, [0]])
        scores = np.concatenate([scores, [0.0]])
    return float(average_precision_score(y_true, scores))

def compute_pcmAP(y_true_idx: np.ndarray, y_prob: np.ndarray, class_names=None):
    """
    Patch-level pcmAP: per-class AP with padding, then macro average.
    y_true_idx : (N,) integer class ids.
    y_prob     : (N, C) probabilities (or scores) per class.
    Returns: per_class_df, macro_pcmAP
    """
    N, C = y_prob.shape
    class_names = class_names or [f"class_{i}" for i in range(C)]
    assert len(class_names) == C

    # one-vs-rest ground truth
    y_true_bin = np.eye(C, dtype=np.int8)[y_true_idx]  # (N, C)

    aps, supports = [], []
    for c in range(C):
        aps.append(_padded_ap(y_true_bin[:, c], y_prob[:, c]))
        supports.append(int(y_true_bin[:, c].sum()))
    aps = np.array(aps, dtype=np.float32)
    supports = np.array(supports, dtype=np.int32)

    df = pd.DataFrame({
        "class_idx": np.arange(C),
        "class_name": class_names,
        "support": supports,
        "AP_padded": aps
    }).sort_values("class_idx").reset_index(drop=True)

    return df, float(np.mean(aps))

def make_clip_level_probs(y_true_idx: np.ndarray,
                          y_prob: np.ndarray,
                          groups_array: np.ndarray,
                          pooling: str = "mean"):
    """
    Create clip-level (group-level) targets and probs using pooling.
    y_true_idx  : (N,) int labels per segment
    y_prob      : (N, C) probs per segment
    groups_array: (N,) group key per segment (e.g., filename)
    pooling     : 'mean' or 'max'
    Returns: y_true_clip_idx (M,), y_prob_clip (M, C), unique_groups (M,)
    """
    uniq = np.unique(groups_array)
    M, C = len(uniq), y_prob.shape[1]
    y_prob_clip = np.zeros((M, C), dtype=np.float32)
    y_true_clip_idx = np.zeros((M,), dtype=np.int64)

    # majority vote (mode) for clip's true label
    def _mode1d(v):
        v = np.asarray(v, dtype=np.int64)
        b = np.bincount(v, minlength=C)
        return int(b.argmax())

    for i, g in enumerate(uniq):
        sel = (groups_array == g)
        P = y_prob[sel]  # (k, C)
        if pooling == "mean":
            pooled = P.mean(axis=0)
        elif pooling == "max":
            pooled = P.max(axis=0)
        else:
            raise ValueError("pooling must be 'mean' or 'max'")
        y_prob_clip[i] = pooled
        y_true_clip_idx[i] = _mode1d(y_true_idx[sel])

    return y_true_clip_idx, y_prob_clip, uniq

# -------------------------
# Patch-level pcmAP
# -------------------------
tnames_all = list(le.classes_)

patch_df, patch_pcmAP = compute_pcmAP(y_val, np.asarray(y_val_prob, dtype=np.float32), class_names=tnames_all)
patch_csv = os.path.join(RUN_DIR, "pcmAP_patch_per_class.csv")
patch_df.to_csv(patch_csv, index=False)

print("\n===== 📊 pcmAP — Patch level =====")
print(f"[pcmAP][PATCH] macro (mean over classes) = {patch_pcmAP:.4f}")
print(f"[pcmAP][PATCH] per-class CSV -> {patch_csv}")

# -------------------------
# Clip-level pcmAP (MEAN & MAX pooling)
# -------------------------
assert 'groups' in globals() and 'val_idx' in globals() and 'group_col' in globals(), \
    "Need groups, val_idx, group_col from the split."

val_groups = np.asarray(groups)[val_idx]

# MEAN pooling
y_true_clip_mean, y_prob_clip_mean, uniq_groups_mean = make_clip_level_probs(
    y_true_idx=y_val,
    y_prob=np.asarray(y_val_prob, dtype=np.float32),
    groups_array=val_groups,
    pooling="mean"
)
clip_mean_df, clip_mean_pcmAP = compute_pcmAP(y_true_clip_mean, y_prob_clip_mean, class_names=tnames_all)
clip_mean_csv = os.path.join(RUN_DIR, "pcmAP_clip_mean_per_class.csv")
clip_mean_df.to_csv(clip_mean_csv, index=False)

# MAX pooling
y_true_clip_max, y_prob_clip_max, uniq_groups_max = make_clip_level_probs(
    y_true_idx=y_val,
    y_prob=np.asarray(y_val_prob, dtype=np.float32),
    groups_array=val_groups,
    pooling="max"
)
clip_max_df, clip_max_pcmAP = compute_pcmAP(y_true_clip_max, y_prob_clip_max, class_names=tnames_all)
clip_max_csv = os.path.join(RUN_DIR, "pcmAP_clip_max_per_class.csv")
clip_max_df.to_csv(clip_max_csv, index=False)

print(f"\n===== 🎧 pcmAP — Clip level (grouped by {group_col}) =====")
print(f"[pcmAP][CLIP][MEAN] macro = {clip_mean_pcmAP:.4f}  | per-class CSV -> {clip_mean_csv}")
print(f"[pcmAP][CLIP][MAX ] macro = {clip_max_pcmAP:.4f}  | per-class CSV -> {clip_max_csv}")





===== 📊 pcmAP — Patch level =====
[pcmAP][PATCH] macro (mean over classes) = 0.9237
[pcmAP][PATCH] per-class CSV -> /workspace/birdclef_2023/out/crnnplus_H224W224_C15_v5/pcmAP_patch_per_class.csv

===== 🎧 pcmAP — Clip level (grouped by filename) =====
[pcmAP][CLIP][MEAN] macro = 0.9444  | per-class CSV -> /workspace/birdclef_2023/out/crnnplus_H224W224_C15_v5/pcmAP_clip_mean_per_class.csv
[pcmAP][CLIP][MAX ] macro = 0.8983  | per-class CSV -> /workspace/birdclef_2023/out/crnnplus_H224W224_C15_v5/pcmAP_clip_max_per_class.csv

===== 📝 Reviewer notes on pcmAP =====
- We report *padded class-averaged mean average precision (pcmAP)* as the primary metric.
- For each class we compute AP in a one-vs-rest setting on probabilities.
- To avoid undefined cases for classes with no positives (or all positives) in a split, we apply conservative padding (append a zero-scored positive and/or a zero-scored negative) before AP.
- pcmAP is then the macro-average of per-class APs. We provide results at bo