# 05. Comprehensive Evaluation of Universal Adversarial Perturbations

This notebook performs the final evaluation:
1.  **Load Model**: Initialize Whisper model.
2.  **Load UAP**: Load the pre-trained perturbation vector.
3.  **Inference**: Run Whisper on Clean vs. Adversarial audio.
4.  **Metrics**: Calculate WER, CER, and SNR.
5.  **Visualization**: Plot WER vs. SNR for clean and adversarial audio.

In [None]:
import os
import torch
import numpy as np
import librosa
import matplotlib.pyplot as plt
import jiwer
from datasets import load_dataset
import pandas as pd

# Fix the NumPy and PyTorch RNG seeds so repeated runs give the same numbers
np.random.seed(42)
torch.manual_seed(42)

# --- Configuration ---
# Whisper checkpoint to evaluate (alternatives: base, small)
model_id = "openai/whisper-tiny"

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# UAP configuration: the perturbation is expected as a .pt file written by notebook 04
UAP_PATH = "src/uap_vector.pt"
EPSILON = 0.05

# Audio configuration
SAMPLE_RATE = 16000

print("Configuration loaded.")

## Load Model and UAP

In [None]:
import whisper

# whisper.load_model() expects an official short name ("tiny", "base", ...) or a
# path to a checkpoint file, not a Hugging Face hub id. Strip the hub prefix so
# an id such as "openai/whisper-tiny" resolves to "tiny"; other values pass through.
hf_prefix = "openai/whisper-"
whisper_name = model_id[len(hf_prefix):] if model_id.startswith(hf_prefix) else model_id

print(f"Loading Whisper model {model_id}...")
model = whisper.load_model(whisper_name, device=device)
model.eval()  # evaluation mode: this notebook only runs inference

print("Model loaded.")

In [None]:
# Load UAP vector
def load_uap(path):
    """Load the universal adversarial perturbation, or fabricate one for demos.

    Args:
        path: Location of the serialized perturbation (a ``.pt`` file).

    Returns:
        The object stored at ``path``, with its tensors mapped onto the CPU.
        When ``path`` does not exist, a random float32 vector covering 30
        seconds of audio at SAMPLE_RATE, scaled by EPSILON, is returned instead.
    """
    if os.path.exists(path):
        # map_location="cpu" lets a perturbation that was saved from a GPU
        # session load on a CPU-only machine; it is applied to numpy audio
        # downstream, so it never needs to live on the GPU.
        # NOTE(review): torch.load unpickles the file - only load files you trust.
        uap = torch.load(path, map_location="cpu")
        print(f"UAP loaded from {path}, shape: {uap.shape}")
        return uap

    print(f"Warning: {path} not found. Creating random UAP for demo.")
    # Fallback: a random vector of max duration (30 s)
    max_dur = 30.0
    uap_len = int(max_dur * SAMPLE_RATE)
    # NOTE(review): this vector is already scaled by EPSILON, and
    # apply_uap_to_audio() multiplies by eps again - confirm that is intended.
    uap = torch.randn(uap_len).float() * EPSILON
    return uap

uap_vector = load_uap(UAP_PATH)
# The UAP is kept at its stored length here; apply_uap_to_audio() is the place
# where it is aligned to each utterance's length before being added.

## Define Evaluation Functions

In [None]:
def calculate_snr(clean_audio, adv_audio):
    """Return the signal-to-noise ratio, in dB, of a perturbed waveform.

    The clean waveform is treated as the signal and the sample-wise difference
    between the two waveforms as the noise. Both inputs are cast to float32
    before any arithmetic. Identical waveforms yield ``float('inf')``.
    """
    reference = clean_audio.astype(np.float32)
    perturbed = adv_audio.astype(np.float32)

    # Mean power of the residual; zero means the two signals are identical
    residual = reference - perturbed
    p_noise = np.mean(np.square(residual))
    if p_noise == 0:
        return float('inf')

    p_signal = np.mean(np.square(reference))
    return 10 * np.log10(p_signal / p_noise)

def apply_uap_to_audio(waveform, uap, eps=0.05):
    """Align the UAP to the waveform length and add it as a perturbation.

    Args:
        waveform: 1-D audio samples (numpy array, torch tensor or sequence).
        uap: 1-D perturbation vector (numpy array, torch tensor or sequence).
        eps: Multiplicative scale applied to the UAP before it is added.

    Returns:
        A numpy array with the same length as ``waveform`` holding
        ``waveform + eps * aligned_uap`` clipped to [-1, 1].

    Raises:
        ValueError: If ``uap`` is empty while ``waveform`` is not.
    """
    # Work purely in numpy: mixing numpy audio with a torch UAP is fragile and
    # downstream helpers (e.g. calculate_snr) expect numpy arrays.
    if isinstance(waveform, torch.Tensor):
        waveform = waveform.detach().cpu().numpy()
    else:
        waveform = np.asarray(waveform)
    if isinstance(uap, torch.Tensor):
        uap = uap.detach().cpu().numpy()
    else:
        uap = np.asarray(uap)

    orig_len = len(waveform)
    uap_len = len(uap)

    if orig_len > uap_len:
        # Audio is longer than the UAP: repeat the UAP until it covers the
        # audio, then trim the overshoot.
        if uap_len == 0:
            raise ValueError("UAP vector is empty; cannot perturb audio.")
        repeat_factor = int(np.ceil(orig_len / uap_len))
        uap_aligned = np.tile(uap, repeat_factor)[:orig_len]
    else:
        # UAP is at least as long as the audio: crop it to fit.
        uap_aligned = uap[:orig_len]

    # Clip to [-1, 1] so the result stays a valid normalized waveform
    adv_audio = np.clip(waveform + uap_aligned * eps, -1.0, 1.0)
    return adv_audio

def run_evaluation(model, audio_array, language="en"):
    """Transcribe audio with the given model and return the text.

    Args:
        model: Object exposing ``transcribe(audio, language=..., fp16=...)``
            that returns a mapping with a ``"text"`` entry.
        audio_array: Audio to transcribe. numpy arrays are cast to float32
            first, because decoded dataset audio is often float64 and a dtype
            mismatch breaks the model's float32 feature extraction.
        language: Language code forwarded to ``transcribe``.

    Returns:
        The ``"text"`` entry of the transcription result.
    """
    if isinstance(audio_array, np.ndarray) and audio_array.dtype != np.float32:
        audio_array = audio_array.astype(np.float32)
    result = model.transcribe(audio_array, language=language, fp16=False)
    return result["text"]

def normalize_transcript(text):
    """Canonicalize a transcript so scoring ignores case and punctuation.

    Lower-cases the text, replaces every character that is neither
    alphanumeric nor an apostrophe with a space, and collapses whitespace.
    Without this, upper-case unpunctuated references compared against cased,
    punctuated model output count nearly every word as an error.
    """
    lowered = text.lower().replace("\u2019", "'")  # treat curly apostrophes as straight
    kept = "".join(ch if (ch.isalnum() or ch == "'") else " " for ch in lowered)
    return " ".join(kept.split())

def wer_ref(ref, hyp):
    """Word error rate of ``hyp`` against ``ref``, computed on normalized text."""
    return jiwer.wer(normalize_transcript(ref), normalize_transcript(hyp))

def cer_ref(ref, hyp):
    """Character error rate of ``hyp`` against ``ref``, computed on normalized text."""
    return jiwer.cer(normalize_transcript(ref), normalize_transcript(hyp))

## Run Evaluation on Test Set

In [None]:
# Evaluation data: the "test" split of the LibriSpeech "clean" configuration.
# Only the first 50 utterances are kept so the notebook stays fast.
print("Loading LibriSpeech test-clean...")
librispeech = load_dataset(
    path="librispeech_asr", name="clean", split="test"
).select(range(50))
print(f"Loaded {len(librispeech)} samples.")

In [None]:
# Evaluation Loop
# For every utterance: transcribe the clean and the perturbed audio, score both
# against the reference text, and record the SNR of the perturbation.
results = []
skipped = 0

print("Starting Evaluation...")
for idx, item in enumerate(librispeech):
    # Decoded dataset audio may arrive as float64, while Whisper's feature
    # extraction works in float32. Cast once so inference and SNR both see
    # exactly the same samples.
    clean_audio = np.asarray(item["audio"]["array"], dtype=np.float32)
    clean_text = item["text"]

    # Apply UAP (result forced to a float32 numpy array for the same reason)
    adv_audio = np.asarray(
        apply_uap_to_audio(clean_audio, uap_vector, eps=EPSILON), dtype=np.float32
    )

    # Inference: best effort - a failing sample is reported and skipped
    try:
        pred_clean = run_evaluation(model, clean_audio)
        pred_adv = run_evaluation(model, adv_audio)
    except Exception as e:
        print(f"Error processing {idx}: {e}")
        skipped += 1
        continue

    # Metrics: both transcriptions are scored against the same reference text
    w_clean = wer_ref(clean_text, pred_clean)
    c_clean = cer_ref(clean_text, pred_clean)

    w_adv = wer_ref(clean_text, pred_adv)
    c_adv = cer_ref(clean_text, pred_adv)

    snr = calculate_snr(clean_audio, adv_audio)

    results.append({
        "idx": idx,
        "clean_wer": w_clean,
        "clean_cer": c_clean,
        "adv_wer": w_adv,
        "adv_cer": c_adv,
        "snr": snr
    })

    if (idx + 1) % 10 == 0:
        print(f"Processed {idx + 1}/{len(librispeech)} samples.")

df_results = pd.DataFrame(results)
if skipped:
    # Make silent data loss visible: later cells only see the surviving rows
    print(f"Warning: {skipped} of {len(librispeech)} samples were skipped due to errors.")
print("Evaluation Complete.")

## Analyze Results

In [None]:
# Summary statistics for the numeric metric columns collected in df_results
print(df_results.describe())

In [None]:
# An attack counts as successful when the adversarial CER exceeds 0.5;
# the mean of that boolean mask is the fraction of successful attacks.
success_rate = df_results['adv_cer'].gt(0.5).mean()
print(f"Attack Success Rate (CER > 0.5): {success_rate:.2%}")

## Visualization: WER vs SNR

In [None]:
# Scatter of WER against the perturbation SNR, one point per sample and condition
fig, ax = plt.subplots(figsize=(10, 6))

# Plot Clean
ax.scatter(df_results['snr'], df_results['clean_wer'], c='blue', alpha=0.5, label='Clean Audio', marker='o')

# Plot Adversarial
ax.scatter(df_results['snr'], df_results['adv_wer'], c='red', alpha=0.5, label='Adversarial Audio', marker='x')

ax.set_title('WER vs SNR for Clean and Adversarial Audio')
ax.set_xlabel('SNR (dB)')
ax.set_ylabel('Word Error Rate')
ax.legend()
ax.grid(True)
# WER is not bounded above by 1 (insertions can push it higher), so only the
# lower limit is pinned; a fixed (0, 1) range would hide the strongest attacks.
ax.set_ylim(bottom=0)
plt.show()

## Defense Mechanism: Randomized Smoothing (Optional Evaluation)

To demonstrate a defense, we can test how adding random Gaussian noise affects the SNR and WER.

*Note: This is a simplified demonstration. True Randomized Smoothing involves running the model multiple times with noise and aggregating predictions.*

In [None]:
def apply_gaussian_smoothing(audio, std_dev=0.01):
    """Return ``audio`` plus zero-mean Gaussian noise, clipped to [-1, 1].

    The noise has standard deviation ``std_dev`` and the same shape as
    ``audio``; it is drawn from NumPy's global random state.
    """
    perturbation = np.random.normal(loc=0, scale=std_dev, size=audio.shape)
    return np.clip(audio + perturbation, -1.0, 1.0)

# Test on a single sample
test_idx = 0

# Look the sample up by its recorded dataset index rather than by row position:
# the evaluation loop skips failing samples, so row N of df_results is not
# guaranteed to describe dataset item N.
has_row = "idx" in df_results.columns and (df_results["idx"] == test_idx).any()

if not has_row:
    print(f"No evaluation results recorded for sample {test_idx}; skipping smoothing demo.")
else:
    sample_clean = df_results[df_results["idx"] == test_idx].iloc[0]
    sample_audio = librispeech[test_idx]["audio"]["array"]

    # SNR of the smoothing noise itself, measured against the clean audio
    smoothed_audio = apply_gaussian_smoothing(sample_audio, std_dev=0.01)
    snr_smoothed = calculate_snr(sample_audio, smoothed_audio)

    # "Original SNR" is the SNR recorded for this sample in the evaluation loop
    print(f"Original SNR: {sample_clean['snr']:.2f} dB")
    print(f"Smoothed SNR: {snr_smoothed:.2f} dB")
    print(f"Clean WER: {sample_clean['clean_wer']:.2f}")
    print(f"Adversarial WER: {sample_clean['adv_wer']:.2f}")