In [11]:
import numpy as np 
import pandas as pd 
import os
import cv2
import math
import time
import librosa
from tqdm import tqdm

In [12]:
# Audio and spectrogram settings shared by the preprocessing cells below.
SAMPLE_RATE = 32000  # Hz; used both when loading audio and when building the mel spectrogram
CLIP_DURATION = 5  # seconds; CLIP_DURATION * SAMPLE_RATE samples are kept per recording
FFT_SIZE = 2048  # passed as n_fft to librosa.feature.melspectrogram
HOP_SIZE = 128  # passed as hop_length (in samples) to librosa.feature.melspectrogram
# NOTE(review): the recorded run output contains a warning line pointing at
# librosa's `filters.mel(...)` call. This is presumably the "empty filters"
# warning triggered by asking for this many mel bands from a 2048-point FFT --
# confirm, and consider lowering NUM_MELS if so.
NUM_MELS = 512  # passed as n_mels
FREQ_MIN = 20  # Hz; passed as fmin
FREQ_MAX = 16000  # Hz; passed as fmax
SPEC_SHAPE = (256, 256)  # target size every spectrogram is resized to before being stored

In [13]:
# Dataset root directory.
# Set the BIRDCLEF_ROOT environment variable to point the notebook at another
# copy of the data (for example '/kaggle/input/birdclef-2025/' on Kaggle).
# When the variable is unset, the original local path is used, so existing
# runs behave exactly as before.
# os.path.join(..., '') guarantees a trailing separator, because the other
# cells build paths by plain string concatenation (e.g. root + 'train.csv').
root = os.path.join(
    os.environ.get(
        'BIRDCLEF_ROOT',
        '/Users/nikhilpaleti/Documents/UCSD/Q3/MLC/birdclef-2025/',
    ),
    '',
)

In [14]:
# Load the two competition metadata tables that sit directly under `root`.
taxonomy_csv = root + 'taxonomy.csv'
train_csv = root + 'train.csv'

taxonomy = pd.read_csv(taxonomy_csv)
train_meta = pd.read_csv(train_csv)

In [15]:
# Species code -> value of the taxonomy table's `class_name` column.
species_to_class = {
    code: cls_name
    for code, cls_name in zip(taxonomy['primary_label'], taxonomy['class_name'])
}

# Integer ids are assigned to the species codes in sorted order, starting at 0,
# so the mapping is reproducible from the training metadata alone.
species_labels = sorted(train_meta['primary_label'].unique())
label_ids = list(range(len(species_labels)))
id_to_label = dict(enumerate(species_labels))
label_to_id = {name: position for position, name in id_to_label.items()}

print(f"Number of distinct species found: {len(species_labels)}")

Number of distinct species found: 206


In [16]:
def _sample_id_from_filename(filename):
    """Join the first '/'-separated component and the file stem with a dash.

    The stem is the last path component cut at its first '.'.
    """
    parts = filename.split('/')
    return parts[0] + '-' + parts[-1].split('.')[0]


# One row per recording: the metadata columns we keep plus derived columns
# (integer label, absolute audio path, unique-looking sample id, class name).
data = train_meta[['primary_label', 'rating', 'filename']].assign(
    label_id=lambda df: df['primary_label'].map(label_to_id),
    file_path=lambda df: root + 'train_audio/' + df['filename'],
    sample_id=lambda df: df['filename'].apply(_sample_id_from_filename),
    # Species missing from the taxonomy table fall back to 'Unknown'.
    class_name=lambda df: df['primary_label'].map(
        lambda k: species_to_class.get(k, 'Unknown')
    ),
)

print(f"Preparing {len(data)} samples")

Preparing 28564 samples


In [17]:
# Number of recordings per class name, most frequent first.
class_counts = data['class_name'].value_counts()
print("Class distribution:", class_counts, sep="\n")

Class distribution:
class_name
Aves        27648
Amphibia      583
Mammalia      178
Insecta       155
Name: count, dtype: int64


In [18]:
def waveform_to_melspectrogram(waveform):
    """Convert an array of audio samples into a min-max scaled log-mel spectrogram.

    Steps:
      1. If any sample is NaN, replace NaNs with the mean of the non-NaN samples.
      2. Compute a power mel spectrogram using the module-level settings
         (SAMPLE_RATE, FFT_SIZE, HOP_SIZE, NUM_MELS, FREQ_MIN, FREQ_MAX).
      3. Convert to decibels relative to the spectrogram's own maximum.
      4. Rescale so the smallest value maps to 0 and the largest to just under 1.

    Args:
        waveform: NumPy array of audio samples.

    Returns:
        float32 NumPy array as produced by librosa.feature.melspectrogram,
        with values rescaled into the [0, 1] range.

    Note:
        NOTE(review): if every sample is NaN the fill value itself is
        presumably NaN, so the input stays non-finite -- confirm how librosa
        reacts in that case.
    """
    nan_mask = np.isnan(waveform)
    if nan_mask.any():
        fill_value = np.nanmean(waveform)
        waveform = np.nan_to_num(waveform, nan=fill_value)

    mel_settings = dict(
        sr=SAMPLE_RATE,
        n_fft=FFT_SIZE,
        hop_length=HOP_SIZE,
        n_mels=NUM_MELS,
        fmin=FREQ_MIN,
        fmax=FREQ_MAX,
        power=2.0,
        norm='slaney',
        htk=True,
        center=True,
        pad_mode='reflect',
    )
    power_spec = librosa.feature.melspectrogram(y=waveform, **mel_settings)

    db_spec = librosa.power_to_db(power_spec, ref=np.max)

    # Min-max scaling; the small epsilon avoids dividing by zero when the
    # spectrogram is constant (e.g. a silent clip).
    floor = db_spec.min()
    ceiling = db_spec.max()
    scaled = (db_spec - floor) / (ceiling - floor + 1e-8)

    return scaled.astype(np.float32)

In [19]:
def _center_clip(waveform, target_len):
    """Return exactly `target_len` samples taken from the middle of `waveform`.

    Recordings shorter than `target_len` are first repeated end-to-end until
    they are long enough; the central window is then sliced out.

    Args:
        waveform: 1-D NumPy array of audio samples.
        target_len: number of samples the returned clip must contain.

    Returns:
        NumPy array with `target_len` samples.

    Raises:
        ValueError: if `waveform` is empty, since there is nothing to repeat.
    """
    if len(waveform) == 0:
        # Previously this surfaced as an opaque "division by zero" message.
        raise ValueError("audio file contains no samples")

    if len(waveform) < target_len:
        repeats = math.ceil(target_len / len(waveform))
        waveform = np.tile(waveform, repeats)

    start = max(0, len(waveform) // 2 - target_len // 2)
    clip = waveform[start:start + target_len]

    # Defensive only: once the waveform is at least `target_len` long the
    # centred slice is always full length, so this padding should not trigger.
    if len(clip) < target_len:
        clip = np.pad(clip, (0, target_len - len(clip)), mode='constant')
    return clip


print("Beginning spectrogram extraction...")
start_time = time.time()

# sample_id -> {"mel_spec", "label_id", "label", "class_name"}
processed_specs = {}
# Paths of recordings that raised during loading or processing, kept so that
# failures can be inspected after the run instead of only scrolling past.
failed_files = []
required_len = int(CLIP_DURATION * SAMPLE_RATE)

# cv2.resize takes its target size as (width, height), whereas SPEC_SHAPE is
# compared against mel.shape, i.e. (rows, cols). Swap the axes so a non-square
# SPEC_SHAPE would not come out transposed; for a square shape nothing changes.
resize_dsize = (SPEC_SHAPE[1], SPEC_SHAPE[0])

for _, row in tqdm(data.iterrows(), total=len(data)):
    try:
        waveform, _ = librosa.load(row.file_path, sr=SAMPLE_RATE)

        # Repeat if too short, then centre crop to the required length.
        clip = _center_clip(waveform, required_len)

        mel = waveform_to_melspectrogram(clip)

        # Resize to SPEC_SHAPE
        if mel.shape != SPEC_SHAPE:
            mel = cv2.resize(mel, resize_dsize, interpolation=cv2.INTER_LINEAR)

        processed_specs[row.sample_id] = {
            "mel_spec": mel,
            "label_id": row["label_id"],
            "label": row["primary_label"],
            "class_name": row["class_name"]
        }

    except Exception as err:
        # Best effort: one unreadable file must not abort a long run, but the
        # failure is recorded as well as printed.
        failed_files.append(row.file_path)
        print(f"Failed: {row.file_path} → {err}")

print(f"✅ Processed {len(processed_specs)} samples in {(time.time() - start_time)/60:.2f} min")
if failed_files:
    print(f"{len(failed_files)} file(s) could not be processed; see `failed_files`")

Beginning spectrogram extraction...


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
100%|██████████| 28564/28564 [24:36<00:00, 19.34it/s] 

✅ Processed 28564 samples in 24.62 min





In [20]:
# Persist the whole sample_id -> record dictionary as a single .npy file next
# to the dataset. The payload is a Python dict rather than a plain array, which
# is why pickling is enabled.
# NOTE(review): reading it back presumably needs
# np.load(path, allow_pickle=True).item() -- confirm against the consumer.
output_path = root + 'falcon_birdclef_cnn_preprocessed_128hop.npy'
np.save(output_path, processed_specs, allow_pickle=True)