In [1]:
"""! wget https://storage.googleapis.com/audioset/yamnet.h5 -O ../src/models/yamnet.h5"""

'! wget https://storage.googleapis.com/audioset/yamnet.h5 -O ../src/models/yamnet.h5'

# Imports

In [2]:
#!/usr/bin/env python3
"""
finetune_yamnet_tf.py
Full TensorFlow pipeline to fine-tune YAMNet for a small custom label set.
Produces a clip-level classifier on top of YAMNet embeddings, plus sliding-window
inference helper to approximate onset/offset.
"""

import os
#import random
import numpy as np
#import tensorflow as tf
#import tensorflow_hub as hub
#from tensorflow import keras
#import librosa
#import soundfile as sf
#from pathlib import Path
import sys
import pickle
%load_ext autoreload

# Put the parent of the current working directory on the import path
# (the notebook later imports from the `src` package).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parent_dir)

# When KAGGLE_KERNEL_RUN_TYPE is present in the environment, project paths are
# taken relative to the working directory; otherwise relative to its parent.
path_prefix = '' if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ else '..'

# Dataset split used throughout the notebook and the folder holding its audio.
split = 'train'
file_path = os.path.join(path_prefix, 'data/detection', split)

# --- Training / audio configuration ---
UNFREEZE_YAMNET = True   # True -> fine-tune the YAMNet weights as well (use a small LR)
TARGET_SR = 16000        # sample rate expected by YAMNet (16 kHz)
RANDOM_SEED = 0

# --- Sliding-window inference (finer temporal resolution) ---
SLIDING_WIN_SEC = 1.0    # length of each inference window, in seconds
SLIDING_HOP_SEC = 0.1    # step between windows, i.e. the effective time resolution

%autoreload 2
from src.single_stage_yamnet_finetune import *

#device = "cuda" if torch.cuda.is_available() else "cpu"

  from pkg_resources import parse_version





In [3]:
# Load the preprocessed detection data for the selected split.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
processed_path = os.path.join(path_prefix, f'data/processed/detection_{split}.p')
# Use a context manager so the file handle is closed deterministically
# (the previous bare open() left it to the garbage collector).
with open(processed_path, 'rb') as fh:
    data = pickle.load(fh)
print(data.keys())

dict_keys(['sr', 'S_db', 'files', 'onset', 'offset', 'event_label', 'background_label'])


In [4]:
# Manual stratified train/validation split: for every event label, draw
# `train_size` of its samples (without replacement) for training; everything
# that was not drawn becomes validation.
np.random.seed(0)  # legacy global seeding keeps the split reproducible
train_size = 0.8
train_idx = []
for label in np.unique(data['event_label']):
    # Indices of all samples carrying this label.
    choices = np.where(data['event_label'] == label)[0]
    n_train = int(np.round(len(choices) * train_size))
    train_idx.append(np.random.choice(choices, size=n_train, replace=False))
# A single sort after concatenation is enough; per-class sorting was redundant.
train_idx = np.sort(np.concatenate(train_idx))
# Set difference instead of `i not in train_idx`, which rescanned the whole
# array for every index (quadratic). Result is the same ascending list of ints.
val_idx = np.setdiff1d(np.arange(len(data['event_label'])), train_idx).tolist()
len(train_idx)

400

In [5]:
def _take(seq, indices):
    """Return the elements of `seq` at the given positions, as a list."""
    return [seq[pos] for pos in indices]


# Sorted unique event labels and their integer ids (position in `classes`).
classes = np.unique(data['event_label'])
NUM_CLASSES = len(classes)
class_to_id = dict(zip(classes, range(NUM_CLASSES)))
print("Classes:", classes)

# One audio path and one integer label per entry in the dataset.
filepaths = [os.path.join(file_path, name) for name in data['files']]
labels = [class_to_id[event] for event in data['event_label']]

# Training subset.
train_files = _take(filepaths, train_idx)
train_labels = _take(labels, train_idx)

# Validation subset, including the annotated onset/offset values.
val_files = _take(filepaths, val_idx)
val_labels = _take(labels, val_idx)
val_onsets = _take(data['onset'], val_idx)
val_offsets = _take(data['offset'], val_idx)

Classes: ['car_horn' 'cough' 'dog_bark' 'gun_shot' 'siren']


In [7]:
# Build the model save path from `path_prefix`, like every other project path
# in this notebook, instead of hard-coding '../'. With path_prefix == '..' this
# is the same location as before; with path_prefix == '' it stays inside the
# working directory.
model_save_path = os.path.join(path_prefix, 'src/models/yamnet_finetune_model.keras')

# `Trainer` is presumably provided by the star import from
# src.single_stage_yamnet_finetune — TODO confirm and import it explicitly.
trainer = Trainer(
    train_files,
    train_labels,
    val_files,
    val_labels,
    lr=1e-4,
    epochs=30,
    batch_size=16,
    device='cpu',
    model_save_dir=model_save_path,
)
trainer.train()

Starting training...
Epoch 1/30
     25/Unknown [1m31s[0m 505ms/step - accuracy: 0.2554 - loss: 1.6006



[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 733ms/step - accuracy: 0.2375 - loss: 1.5972 - val_accuracy: 0.4100 - val_loss: 1.5564 - learning_rate: 1.0000e-04
Epoch 2/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 691ms/step - accuracy: 0.3350 - loss: 1.5375 - val_accuracy: 0.5400 - val_loss: 1.4959 - learning_rate: 1.0000e-04
Epoch 3/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 585ms/step - accuracy: 0.4550 - loss: 1.4933 - val_accuracy: 0.5900 - val_loss: 1.4434 - learning_rate: 1.0000e-04
Epoch 4/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 614ms/step - accuracy: 0.5050 - loss: 1.4452 - val_accuracy: 0.6300 - val_loss: 1.3910 - learning_rate: 1.0000e-04
Epoch 5/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 712ms/step - accuracy: 0.5175 - loss: 1.3986 - val_accuracy: 0.6300 - val_loss: 1.3372 - learning_rate: 1.0000e-04
Epoch 6/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [8]:
# --- Pick one validation clip and print its annotations ---
i = 2
test_path = val_files[i]
print(f'Actual label: {classes[val_labels[i]]}')
print(f'Actual onset: {val_onsets[i]}')
print(f'Actual offset: {val_offsets[i]}')

# --- Prediction for the clip as a whole ---
wav = load_audio_mono(test_path, sr=TARGET_SR)
clip_idx, clip_prob, clip_vec = trainer.predict_clip(wav)
print(f"Clip-level prediction: {classes[clip_idx]} ({clip_prob:.3f})")

# --- Windowed predictions, turned into per-class time segments ---
times, probs = trainer.sliding_window_inference(wav, win_sec=SLIDING_WIN_SEC, hop_sec=SLIDING_HOP_SEC)
THRESH = 0.3
half_window = SLIDING_WIN_SEC / 2.0
clip_duration = len(wav) / TARGET_SR
print("Sliding-window results (time centers):")
for cid, cname in enumerate(classes):
    # Window indices whose probability for this class reaches the threshold.
    active = np.flatnonzero(probs[:, cid] >= THRESH)
    if active.size == 0:
        continue
    # Cut the index list wherever two neighbours are not consecutive, so each
    # piece is one uninterrupted run of windows.
    run_breaks = np.flatnonzero(np.diff(active) != 1) + 1
    for seg in np.split(active, run_breaks):
        # Extend each run by half a window on both sides (treating `times` as
        # window centres), then clamp to the clip boundaries.
        start_time = max(0.0, times[seg[0]] - half_window)
        end_time = min(clip_duration, times[seg[-1]] + half_window)
        print(f"  {cname}: {start_time:.2f}s -> {end_time:.2f}s (max prob {float(np.max(probs[seg, cid])):.3f})")


Actual label: gun_shot
Actual onset: 7.326152328956148
Actual offset: 10.0
Clip-level prediction: gun_shot (0.284)
Sliding-window results (time centers):
  car_horn: 3.60s -> 6.60s (max prob 0.944)
  dog_bark: 0.00s -> 1.10s (max prob 0.324)
  dog_bark: 0.30s -> 1.90s (max prob 0.388)
  dog_bark: 1.10s -> 3.20s (max prob 0.385)
  dog_bark: 2.40s -> 3.40s (max prob 0.326)
  dog_bark: 5.80s -> 6.90s (max prob 0.313)
  dog_bark: 6.10s -> 8.10s (max prob 0.341)
  gun_shot: 7.50s -> 10.00s (max prob 0.986)
  siren: 0.80s -> 1.80s (max prob 0.305)
  siren: 1.00s -> 2.00s (max prob 0.335)
  siren: 2.80s -> 3.80s (max prob 0.346)
  siren: 3.10s -> 4.10s (max prob 0.361)
  siren: 3.50s -> 4.50s (max prob 0.331)
  siren: 5.00s -> 6.20s (max prob 0.458)
  siren: 5.50s -> 6.50s (max prob 0.333)
