# 02: Baseline Performance Evaluation

This notebook establishes the baseline performance of Whisper on clean LibriSpeech data, before any adversarial attacks are applied.

**Goals:**
1. Load LibriSpeech `test-clean` data.
2. Initialize Whisper model.
3. Transcribe clean audio.
4. Calculate WER and CER using `jiwer`.
5. Log results for later comparison.

In [None]:
import json
import os
import re
import sys
from pathlib import Path

import librosa
import numpy as np
import soundfile as sf
import torch
import whisper
from jiwer import wer, cer
from tqdm import tqdm

sys.path.append('../src')

from data.download_data import get_dataset_path
from data.audio_loader import load_audio, normalize_audio

print("Imports successful.")

## 1. Setup & Configuration

We need to ensure reproducibility and check device availability.

In [None]:
# Reproducibility: seed every RNG this notebook touches.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Device: prefer the GPU when PyTorch can see one.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Model selection.
# `whisper.load_model` expects an openai-whisper checkpoint name ("tiny",
# "base", "small", "medium", "large-v2", "large-v3", ...) or a path to a
# checkpoint file. Hugging Face hub IDs such as "openai/whisper-small" are
# rejected with a RuntimeError, so the bare checkpoint name is used here.
# "small" keeps iteration fast for the initial baseline; switch to a "large-*"
# checkpoint for quality runs.
model_name = "small"
print(f"Loading Whisper model: {model_name}")

model = whisper.load_model(model_name, device=device)
print("Model loaded.")

## 2. Data Loading

We load the LibriSpeech `test-clean` dataset.

In [None]:
# The dataset location is resolved by the project's download helper.
dataset_root = get_dataset_path('librispeech')
print(f"Dataset root: {dataset_root}")

# Expected layout on disk: <dataset_root>/LibriSpeech/test-clean/
test_clean_path = Path(dataset_root).joinpath('LibriSpeech', 'test-clean')

# Stop right away when the split is missing instead of reporting zero files.
if not test_clean_path.exists():
    raise FileNotFoundError(f"Test-clean directory not found at {test_clean_path}.")

# Count the FLAC files anywhere below the split directory.
flac_count = sum(1 for _ in test_clean_path.rglob('*.flac'))
print(f"Found {flac_count} audio files.")

## 3. Baseline Transcription Loop

Iterate through audio files, transcribe, and compute metrics.

In [None]:
MAX_SAMPLES = 50             # Limit to 50 utterances for the initial baseline
WHISPER_SAMPLE_RATE = 16000  # Whisper treats raw waveform arrays as 16 kHz audio


def _normalize_text(text):
    """Lower-case, replace punctuation (except apostrophes) with spaces, collapse whitespace.

    The same normalisation is applied to reference and hypothesis so that
    casing and punctuation differences are not counted as errors. It does NOT
    reconcile spelled-out numbers with digits ("twenty" vs "20").
    """
    text = re.sub(r"[^a-z0-9' ]+", " ", text.lower())
    return " ".join(text.split())


def _load_chapter_transcripts(chapter_dir):
    """Parse every ``*.trans.txt`` in ``chapter_dir`` into ``{utterance_id: text}``.

    Each line of a LibriSpeech transcript file has the form
    ``<utterance-id> <TRANSCRIPT IN UPPER CASE>``.
    """
    transcripts = {}
    for trans_file in sorted(chapter_dir.glob('*.trans.txt')):
        with open(trans_file, encoding='utf-8') as f:
            for line in f:
                utterance_id, _, text = line.strip().partition(' ')
                if utterance_id:
                    transcripts[utterance_id] = text
    return transcripts


results = []
skipped = []            # utterances without a usable reference transcript
base_wer = 0
base_cer = 0
transcript_cache = {}   # chapter directory -> {utterance_id: transcript}

# Sort before slicing: rglob order is filesystem dependent, so an unsorted
# slice could pick a different subset on every machine. Note that the first
# MAX_SAMPLES files in sorted order cover only a few speakers.
audio_files = sorted(test_clean_path.rglob('*.flac'))[:MAX_SAMPLES]
if not audio_files:
    raise FileNotFoundError(f"No .flac files found under {test_clean_path}.")

print(f"Processing {len(audio_files)} samples...")

for audio_path in tqdm(audio_files):
    # 1. Ground truth: the file name is only the utterance ID; the reference
    #    text is stored in the chapter's .trans.txt next to the audio.
    chapter_dir = audio_path.parent
    if chapter_dir not in transcript_cache:
        transcript_cache[chapter_dir] = _load_chapter_transcripts(chapter_dir)
    reference = transcript_cache[chapter_dir].get(audio_path.stem)
    ground_truth = _normalize_text(reference) if reference is not None else ""
    if not ground_truth:
        # jiwer cannot score against an empty reference; record and move on.
        skipped.append(str(audio_path))
        continue

    # 2. Load audio and make sure it is at the rate Whisper assumes.
    audio_array, sample_rate = load_audio(str(audio_path))
    if sample_rate != WHISPER_SAMPLE_RATE:
        raise ValueError(
            f"{audio_path} was loaded at {sample_rate} Hz; "
            f"Whisper expects {WHISPER_SAMPLE_RATE} Hz audio."
        )

    # 3. Transcribe (half precision is only used off the CPU).
    result = model.transcribe(
        audio_array,
        language='en',
        fp16=(device != 'cpu')
    )

    # 4. Metrics on identically normalised reference and hypothesis.
    transcribed_text = _normalize_text(result['text'])

    current_wer = wer(ground_truth, transcribed_text)
    current_cer = cer(ground_truth, transcribed_text)

    results.append({
        'audio_path': str(audio_path.relative_to(test_clean_path.parent.parent.parent)),
        'ground_truth': ground_truth,
        'transcription': transcribed_text,
        'wer': current_wer,
        'cer': current_cer
    })

    base_wer += current_wer
    base_cer += current_cer

if skipped:
    print(f"Warning: skipped {len(skipped)} file(s) with no reference transcript.")
if not results:
    raise RuntimeError("No utterances were scored; cannot compute baseline metrics.")

# Reported values are the unweighted mean of per-utterance scores, not a
# corpus-level (length-weighted) error rate.
print("\n--- Baseline Results (Clean) ---")
print(f"WER: {base_wer / len(results):.4f}")
print(f"CER: {base_cer / len(results):.4f}")

## 4. Save Results

Save baseline metrics for later comparison with adversarial results.

In [None]:
# Averaging below divides by len(results); fail with a clear message instead
# of a ZeroDivisionError when nothing was scored.
if not results:
    raise ValueError("No baseline results to save; run the transcription loop first.")

# The output directory may not exist on a fresh checkout, so create it first.
results_dir = Path('results')
results_dir.mkdir(parents=True, exist_ok=True)

# Save detailed per-utterance results
with open(results_dir / 'baseline_clean_results.json', 'w') as f:
    json.dump(results, f, indent=2)

# Save the aggregate metrics used for comparison with adversarial runs
metrics = {'mean_wer': base_wer / len(results), 'mean_cer': base_cer / len(results)}
with open(results_dir / 'baseline_metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2)

print("Baseline metrics saved.")