In [None]:
# Install required packages
!pip install faster-whisper
!pip install noisereduce
!pip install soundfile librosa
!pip install pydub
!pip install scipy

# Install ffmpeg for audio processing
!apt update && apt install -y ffmpeg

# Import required libraries
import os
import numpy as np
import soundfile as sf
import librosa
import noisereduce as nr
from faster_whisper import WhisperModel
import time
from pydub import AudioSegment
import requests
import zipfile
from tqdm import tqdm
import pandas as pd

Collecting faster-whisper
  Downloading faster_whisper-1.1.1-py3-none-any.whl.metadata (16 kB)
Collecting ctranslate2<5,>=4.0 (from faster-whisper)
  Downloading ctranslate2-4.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting onnxruntime<2,>=1.14 (from faster-whisper)
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting av>=11 (from faster-whisper)
  Downloading av-14.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Collecting coloredlogs (from onnxruntime<2,>=1.14->faster-whisper)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime<2,>=1.14->faster-whisper)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading faster_whisper-1.1.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m

In [None]:


### Use LibriMix Dataset (Mini Version for Testing)

def setup_librimix_mini():
    """
    Set up a mini version of LibriMix for testing in Google Colab.

    Full LibriMix is 430GB, so only a smaller subset is created. The steps
    are: install SoX via apt, clone the LibriMix repository into the current
    directory, create the ``librimix_data`` storage directory, then call
    ``generate_minimal_librimix()``.

    The body uses IPython ``!`` shell escapes, so it only runs inside a
    notebook / IPython session.
    """

    print("Setting up LibriMix mini dataset for testing...")

    # Install SoX (required for LibriMix)
    print("Installing SoX...")
    !apt-get update -qq
    !apt-get install -y -qq sox

    # Clone the LibriMix repository into ./LibriMix
    print("\nCloning LibriMix repository...")
    !git clone https://github.com/JorisCos/LibriMix

    # Create the storage directory the generation script is pointed at
    os.makedirs('librimix_data', exist_ok=True)

    # Generate a minimal subset
    generate_minimal_librimix()


def generate_minimal_librimix():
    """
    Generate a minimal LibriMix dataset suitable for Colab
    ~100 samples instead of full dataset
    """

    # Modify the generation script for minimal data
    script_content = """#!/bin/bash

    # Minimal LibriMix generation for testing
    # Only generates 2-speaker mixtures at 16kHz

    storage_dir=$1
    n_src=2

    # Download minimal LibriSpeech subset
    echo "Downloading LibriSpeech test-clean..."
    mkdir -p $storage_dir/LibriSpeech
    cd $storage_dir/LibriSpeech
    wget -q --show-progress http://www.openslr.org/resources/12/test-clean.tar.gz
    tar -xzf test-clean.tar.gz
    cd ../..

    # Download minimal WHAM noise
    echo "Downloading WHAM noise samples..."
    mkdir -p $storage_dir/wham_noise
    # Note: Full WHAM is large. For testing, we'll use ESC-50 as substitute
    cd $storage_dir/wham_noise
    wget -q --show-progress https://github.com/karoldvl/ESC-50/archive/master.zip
    unzip -q master.zip
    mv ESC-50-master/* .
    rm -rf ESC-50-master master.zip
    cd ../..

    # Generate metadata for mini version
    cd LibriMix
    python scripts/create_librimix_from_metadata.py \
        --librispeech_dir $storage_dir/LibriSpeech \
        --wham_dir $storage_dir/wham_noise \
        --metadata_dir metadata/Libri2Mix \
        --librimix_outdir $storage_dir/Libri2Mix_mini \
        --n_src $n_src \
        --freqs 16k \
        --modes min \
        --types mix_clean mix_both mix_single \
        --max_samples 35  # Limit samples per type
    """

    with open('generate_librimix_mini.sh', 'w') as f:
        f.write(script_content)

    !chmod +x generate_librimix_mini.sh
    !./generate_librimix_mini.sh librimix_data

    # Process the generated files for our 10-second requirement
    process_librimix_to_10s()

def process_librimix_to_10s():
    """
    Cut or zero-pad generated LibriMix mixtures to exactly 10 seconds.

    For each of ``mix_clean``, ``mix_both`` and ``mix_single`` the function
    reads up to 35 ``.wav`` files from
    ``librimix_data/Libri2Mix_mini/wav16k/min/dev/<mix_type>`` and writes the
    adjusted audio to ``sample_audios/<mix_type>/sample_NNN.wav`` at the
    file's own sample rate. Only files whose name contains ``'mix'`` are
    processed. A missing input directory is reported and skipped.
    """
    import soundfile as sf

    print("\nProcessing LibriMix samples to 10-second clips...")

    os.makedirs('sample_audios', exist_ok=True)

    clip_seconds = 10
    max_per_type = 35
    mixture_types = ['mix_clean', 'mix_both', 'mix_single']

    for mix_type in mixture_types:
        print(f"\nProcessing {mix_type}...")

        os.makedirs(f'sample_audios/{mix_type}', exist_ok=True)

        mix_dir = f'librimix_data/Libri2Mix_mini/wav16k/min/dev/{mix_type}'
        if not os.path.isdir(mix_dir):
            # Say so explicitly instead of silently producing nothing.
            print(f"⚠ Directory not found, skipping: {mix_dir}")
            continue

        # Sort so the sample numbering is stable across runs/filesystems.
        files = sorted(f for f in os.listdir(mix_dir) if f.endswith('.wav'))

        processed = 0
        for i, filename in enumerate(files[:max_per_type]):
            # NOTE(review): verify that generated mixture file names really
            # contain 'mix'; otherwise every file is skipped here.
            if 'mix' not in filename:
                continue

            audio, sr = sf.read(os.path.join(mix_dir, filename))

            # Derive the target from the file's actual sample rate so the
            # clip is 10 seconds long even if it is not 16 kHz.
            target_samples = clip_seconds * sr

            if len(audio) > target_samples:
                audio = audio[:target_samples]
            else:
                # Zero-pad at the end of the time axis only, so multi-channel
                # input keeps its channel count.
                pad_width = [(0, target_samples - len(audio))]
                pad_width += [(0, 0)] * (audio.ndim - 1)
                audio = np.pad(audio, pad_width, mode='constant')

            output_path = f'sample_audios/{mix_type}/sample_{i:03d}.wav'
            sf.write(output_path, audio, sr)
            processed += 1

        # Report what was actually written, not how many files were listed.
        print(f"✓ Processed {processed} {mix_type} samples")

def create_simple_librimix_alternative():
    """
    Alternative: Create LibriMix-style dataset without full generation
    This is faster and more suitable for Colab
    """

    print("Creating LibriMix-style dataset (simplified version)...")

    os.makedirs('sample_audios', exist_ok=True)

    # Download LibriSpeech test-clean
    print("Downloading LibriSpeech test-clean...")
    !wget -q --show-progress http://www.openslr.org/resources/12/test-clean.tar.gz
    !tar -xzf test-clean.tar.gz

    # Download noise samples (using ESC-50 as WHAM alternative)
    print("\nDownloading noise samples...")
    !wget -q --show-progress https://github.com/karoldvl/ESC-50/archive/master.zip
    !unzip -q master.zip

    # Collect audio files
    speech_files = []
    for root, dirs, files in os.walk('LibriSpeech/test-clean'):
        for file in files:
            if file.endswith('.flac'):
                speech_files.append(os.path.join(root, file))

    noise_files = []
    for root, dirs, files in os.walk('ESC-50-master/audio'):
            if file.endswith('.wav'):
        for file in files:
                noise_files.append(os.path.join(root, file))

    print(f"Found {len(speech_files)} speech files and {len(noise_files)} noise files")

    # Create LibriMix-style mixtures
    create_mixtures_librimix_style(speech_files, noise_files)

    # Clean up
    !rm -rf LibriSpeech test-clean.tar.gz ESC-50-master master.zip

def _load_fixed_length_clip(path, sample_rate=16000, seconds=10):
    """Load *path* as mono audio at *sample_rate* and force it to *seconds* long."""
    data, _ = librosa.load(path, sr=sample_rate, mono=True)
    return adjust_audio_length(data, seconds * sample_rate)


def _rms(signal):
    """Return the root-mean-square level of *signal*."""
    return np.sqrt(np.mean(signal**2))


def _scale_relative_to(signal, reference_rms, snr_db=0):
    """Scale *signal* so its RMS is *snr_db* dB below *reference_rms*.

    A silent signal (RMS of zero) is returned unchanged; dividing by its RMS
    would otherwise fill it with NaN/inf.
    """
    signal_rms = _rms(signal)
    if signal_rms == 0:
        return signal
    return signal * (reference_rms / signal_rms) * (10 ** (-snr_db / 20))


def _limit_peak(mixture):
    """Rescale *mixture* to a peak of 0.95 if it would clip (peak above 1.0)."""
    peak = np.max(np.abs(mixture))
    if peak > 1.0:
        return mixture / peak * 0.95
    return mixture


def create_mixtures_librimix_style(speech_files, noise_files):
    """
    Create mixtures following LibriMix conventions.

    Writes three sets of 10-second, 16 kHz WAV files under ``sample_audios``:

    * ``mix_clean``  - two speakers at equal RMS, no noise (up to 20)
    * ``mix_single`` - one speaker plus noise at 10 dB SNR (up to 70)
    * ``mix_both``   - two speakers plus noise at 15 dB SNR (up to 20)

    Each set stores the mixture together with its (scaled) components, and a
    description of every sample is saved to
    ``sample_audios/librimix_metadata.json``. Only the mixture is rescaled
    when it would clip; the stored components are left as mixed.

    Args:
        speech_files: Paths of speech recordings.
        noise_files: Paths of noise recordings.

    The inputs are shuffled with a fixed seed (42) on private copies, so the
    caller's lists are left untouched.
    """
    import soundfile as sf
    import json

    sample_rate = 16000
    n_clean, n_single, n_both = 20, 70, 20

    np.random.seed(42)  # For reproducibility

    # Shuffle copies rather than the caller's lists.
    speech_files = list(speech_files)
    noise_files = list(noise_files)
    np.random.shuffle(speech_files)
    np.random.shuffle(noise_files)

    metadata = {
        'mix_clean': [],
        'mix_single': [],
        'mix_both': []
    }

    # 1. Create mix_clean (2 speakers, no noise)
    print("\nCreating mix_clean samples...")
    os.makedirs('sample_audios/mix_clean', exist_ok=True)

    for i in tqdm(range(n_clean)):
        if i*2+1 >= len(speech_files):
            continue

        s1_data = _load_fixed_length_clip(speech_files[i*2], sample_rate)
        s2_data = _load_fixed_length_clip(speech_files[i*2+1], sample_rate)

        # Equal energy (LibriMix style): bring speaker 2 to speaker 1's level.
        s2_data = _scale_relative_to(s2_data, _rms(s1_data))

        mixture = _limit_peak(s1_data + s2_data)

        sf.write(f'sample_audios/mix_clean/mixture_{i:03d}.wav', mixture, sample_rate)
        sf.write(f'sample_audios/mix_clean/s1_{i:03d}.wav', s1_data, sample_rate)
        sf.write(f'sample_audios/mix_clean/s2_{i:03d}.wav', s2_data, sample_rate)

        metadata['mix_clean'].append({
            'mixture_ID': f'mixture_{i:03d}',
            'mixture_path': f'mix_clean/mixture_{i:03d}.wav',
            'source_1_path': f'mix_clean/s1_{i:03d}.wav',
            'source_2_path': f'mix_clean/s2_{i:03d}.wav',
            'noise_path': None,
            'SNR': None
        })

    # 2. Create mix_single (1 speaker + noise)
    print("\nCreating mix_single samples...")
    os.makedirs('sample_audios/mix_single', exist_ok=True)

    for i in tqdm(range(n_single)):
        if i >= len(speech_files) or i >= len(noise_files):
            continue

        s1_data = _load_fixed_length_clip(speech_files[i], sample_rate)
        noise_data = _load_fixed_length_clip(noise_files[i], sample_rate)

        # Mix at 10 dB SNR (LibriMix uses various SNRs)
        snr_db = 10
        noise_data = _scale_relative_to(noise_data, _rms(s1_data), snr_db)

        mixture = _limit_peak(s1_data + noise_data)

        sf.write(f'sample_audios/mix_single/mixture_{i:03d}.wav', mixture, sample_rate)
        sf.write(f'sample_audios/mix_single/s1_{i:03d}.wav', s1_data, sample_rate)
        sf.write(f'sample_audios/mix_single/noise_{i:03d}.wav', noise_data, sample_rate)

        metadata['mix_single'].append({
            'mixture_ID': f'mixture_{i:03d}',
            'mixture_path': f'mix_single/mixture_{i:03d}.wav',
            'source_1_path': f'mix_single/s1_{i:03d}.wav',
            'source_2_path': None,
            'noise_path': f'mix_single/noise_{i:03d}.wav',
            'SNR': snr_db
        })

    # 3. Create mix_both (2 speakers + noise)
    print("\nCreating mix_both samples...")
    os.makedirs('sample_audios/mix_both', exist_ok=True)

    for i in tqdm(range(n_both)):
        if i*2+1 >= len(speech_files) or i >= len(noise_files):
            continue

        s1_data = _load_fixed_length_clip(speech_files[i*2], sample_rate)
        s2_data = _load_fixed_length_clip(speech_files[i*2+1], sample_rate)
        noise_data = _load_fixed_length_clip(noise_files[i], sample_rate)

        # Mix speakers with equal energy
        s2_data = _scale_relative_to(s2_data, _rms(s1_data))
        speech_mix = s1_data + s2_data

        # Add noise at 15 dB SNR relative to the combined speech
        snr_db = 15
        noise_data = _scale_relative_to(noise_data, _rms(speech_mix), snr_db)

        mixture = _limit_peak(speech_mix + noise_data)

        sf.write(f'sample_audios/mix_both/mixture_{i:03d}.wav', mixture, sample_rate)
        sf.write(f'sample_audios/mix_both/s1_{i:03d}.wav', s1_data, sample_rate)
        sf.write(f'sample_audios/mix_both/s2_{i:03d}.wav', s2_data, sample_rate)
        sf.write(f'sample_audios/mix_both/noise_{i:03d}.wav', noise_data, sample_rate)

        metadata['mix_both'].append({
            'mixture_ID': f'mixture_{i:03d}',
            'mixture_path': f'mix_both/mixture_{i:03d}.wav',
            'source_1_path': f'mix_both/s1_{i:03d}.wav',
            'source_2_path': f'mix_both/s2_{i:03d}.wav',
            'noise_path': f'mix_both/noise_{i:03d}.wav',
            'SNR': snr_db
        })

    # Save metadata
    with open('sample_audios/librimix_metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2)

    total = sum(len(entries) for entries in metadata.values())
    print(f"\n✓ Created LibriMix-style dataset:")
    print(f"  - {len(metadata['mix_clean'])} mix_clean samples (2 speakers, no noise)")
    print(f"  - {len(metadata['mix_single'])} mix_single samples (1 speaker + noise)")
    print(f"  - {len(metadata['mix_both'])} mix_both samples (2 speakers + noise)")
    print(f"\nTotal: {total} samples following LibriMix conventions")

def adjust_audio_length(audio, target_length):
    """Return *audio* truncated or zero-padded at the end to *target_length* samples."""
    missing = target_length - len(audio)
    if missing < 0:
        # Too long: keep only the leading target_length samples.
        return audio[:target_length]
    # Too short (or already exact): append `missing` zeros.
    return np.pad(audio, (0, missing), mode='constant')

# Run the simplified LibriMix creation: downloads the source data and writes
# the mixtures plus librimix_metadata.json under ./sample_audios.
create_simple_librimix_alternative()

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 162)

In [None]:
import torch

# Whisper checkpoint to load (you can choose different model sizes).
# Available models: tiny, base, small, medium, large, large-v2, large-v3
model_size = "base"  # Change this to your preferred model

# Use the GPU with float16 when CUDA is available, otherwise CPU with int8.
if torch.cuda.is_available():
    device, compute_type = "cuda", "float16"
else:
    device, compute_type = "cpu", "int8"

print(f"Loading Whisper model: {model_size}")
print(f"Device: {device}, Compute type: {compute_type}")

model = WhisperModel(model_size, device=device, compute_type=compute_type)

In [None]:
!pip install -q jiwer

In [None]:
import gc
import json
from jiwer import wer, cer
def test_single_mixture_type(mix_type, model_size="base", model=None):
    """
    Evaluate Whisper on one mixture type (kept separate to limit memory use).

    The reference text for each sample is obtained by transcribing its clean
    source file(s) with the same model; the mixture transcription is then
    scored against it with WER and CER. Results are written to
    ``whisper_baseline_<model_size>_<mix_type>_results.csv``.

    Args:
        mix_type: Key into ``sample_audios/librimix_metadata.json``
            (e.g. ``'mix_clean'``, ``'mix_single'``, ``'mix_both'``).
        model_size: Whisper model size; used to load a model when *model* is
            None and as part of the output file name.
        model: Optional already-loaded ``WhisperModel`` to reuse.

    Returns:
        DataFrame with one row per sample. ``inference_time`` covers only the
        transcription of the mixture, not the reference transcriptions.
    """
    print(f"\n{'='*50}")
    print(f"Testing {mix_type} samples...")
    print(f"{'='*50}")

    # Initialize model if not provided
    if model is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        compute_type = "float16" if device == "cuda" else "int8"
        print(f"Loading model: {model_size} on {device}")
        model = WhisperModel(model_size, device=device, compute_type=compute_type)

    # Load metadata
    with open('sample_audios/librimix_metadata.json', 'r') as f:
        metadata = json.load(f)

    samples = metadata[mix_type]

    def transcribe(path):
        """Transcribe *path* and join the stripped segment texts with spaces."""
        segments, _ = model.transcribe(path, beam_size=5)
        return " ".join(segment.text.strip() for segment in segments)

    results = []

    for sample in tqdm(samples, desc=f"Processing {mix_type}"):
        mixture_path = f"sample_audios/{sample['mixture_path']}"
        s1_path = f"sample_audios/{sample['source_1_path']}"
        s2_path = f"sample_audios/{sample['source_2_path']}" if sample['source_2_path'] else None

        # Reference text: transcription of the clean source(s).
        ground_truth = transcribe(s1_path)
        if s2_path:
            ground_truth = f"{ground_truth} {transcribe(s2_path)}"

        # Time only the mixture transcription; timing the reference passes
        # as well would inflate the reported inference time 2-3x.
        start_time = time.time()
        transcription = transcribe(mixture_path)
        inference_time = time.time() - start_time

        # Calculate metrics
        has_reference = bool(ground_truth.strip())
        has_hypothesis = bool(transcription.strip())
        if has_reference and has_hypothesis:
            word_error_rate = wer(ground_truth, transcription)
            char_error_rate = cer(ground_truth, transcription)
        elif not has_reference and not has_hypothesis:
            # Nothing expected and nothing produced: a perfect match.
            word_error_rate = 0.0
            char_error_rate = 0.0
        else:
            # Exactly one side is empty; the metrics cannot be computed on an
            # empty reference, so count it as fully wrong.
            word_error_rate = 1.0
            char_error_rate = 1.0

        results.append({
            'mixture_type': mix_type,
            'sample_id': sample['mixture_ID'],
            'ground_truth': ground_truth,
            'transcription': transcription,
            'wer': word_error_rate,
            'cer': char_error_rate,
            'inference_time': inference_time,
            'snr': sample.get('SNR', 'N/A')
        })

        # Clear memory periodically
        if len(results) % 10 == 0:
            gc.collect()

    # Convert to DataFrame and save chunk
    df_chunk = pd.DataFrame(results)
    chunk_file = f'whisper_baseline_{model_size}_{mix_type}_results.csv'
    df_chunk.to_csv(chunk_file, index=False)

    if df_chunk.empty:
        # No rows means no 'wer'/'cer' columns; skip the statistics.
        print(f"\nNo {mix_type} samples found in metadata; nothing to summarize.")
        print(f"\n✓ Saved {mix_type} results to '{chunk_file}'")
        return df_chunk

    # Print statistics for this chunk
    print(f"\n{mix_type} Results:")
    print(f"  Samples: {len(df_chunk)}")
    print(f"  Average WER: {df_chunk['wer'].mean():.3f} (±{df_chunk['wer'].std():.3f})")
    print(f"  Average CER: {df_chunk['cer'].mean():.3f} (±{df_chunk['cer'].std():.3f})")
    print(f"  Average time: {df_chunk['inference_time'].mean():.2f}s")

    # Show the first sample as an example
    first = df_chunk.iloc[0]
    print(f"\n  Sample transcription:")
    print(f"  Ground truth: {first['ground_truth'][:80]}...")
    print(f"  Transcription: {first['transcription'][:80]}...")

    print(f"\n✓ Saved {mix_type} results to '{chunk_file}'")

    return df_chunk

In [None]:
# Baseline on two-speaker mixtures without noise. No `model` is passed, so
# the function loads its own Whisper instance.
df_clean = test_single_mixture_type('mix_clean', model_size='base')

In [None]:
# Baseline on single-speaker-plus-noise mixtures. No `model` is passed, so
# the function loads its own Whisper instance.
df_single = test_single_mixture_type('mix_single', model_size='base')

In [None]:
# Baseline on two-speaker-plus-noise mixtures. No `model` is passed, so the
# function loads its own Whisper instance.
df_both = test_single_mixture_type('mix_both', model_size='base')

In [None]:
!wget https://github.com/karoldvl/ESC-50/archive/master.zip -O esc50.zip
!unzip -q esc50.zip


In [None]:
import os
import json
import random
import numpy as np
import pandas as pd
import soundfile as sf
from pydub import AudioSegment

def create_noise_only_samples(num_samples=10, duration_seconds=10):
    """
    Create noise-only audio samples from ESC-50 and synthetic white noise.

    Up to three of the requested samples are synthetic white noise; the rest
    are ESC-50 clips from a fixed set of categories, looped/truncated to the
    requested duration, converted to 16 kHz mono and given a random gain.
    Files and a ``metadata.json`` describing them are written to
    ``sample_audios/noise_only``.

    Args:
        num_samples: Total number of samples to generate
        duration_seconds: Duration of each sample in seconds

    Returns:
        List of file paths created (may be shorter than *num_samples* if too
        few usable ESC-50 clips are found).

    Raises:
        FileNotFoundError: If ``ESC-50-master/meta/esc50.csv`` is missing.
    """
    print("Creating noise-only samples...")
    os.makedirs('sample_audios/noise_only', exist_ok=True)

    # Load metadata
    meta_path = 'ESC-50-master/meta/esc50.csv'
    if not os.path.exists(meta_path):
        raise FileNotFoundError("esc50.csv not found. Download and place it in ESC-50-master/meta/")

    df_meta = pd.read_csv(meta_path)

    # Broader noise groups and the ESC-50 categories that belong to them
    noise_category_groups = {
        'environmental': ['rain', 'wind', 'thunderstorm', 'water_drops'],
        'urban': ['car_horn', 'siren', 'engine', 'train'],
        'indoor': ['clock_tick', 'keyboard_typing', 'vacuum_cleaner'],
        'nature': ['insects', 'crow', 'dog', 'rooster', 'chirping_birds'],
        'human': ['breathing', 'coughing', 'snoring', 'laughing']
    }

    # Reverse lookup: ESC-50 category -> broader group
    group_of = {
        category: group
        for group, members in noise_category_groups.items()
        for category in members
    }

    # Keep only the desired categories, in a reproducible shuffled order
    df_filtered = df_meta[df_meta['category'].isin(list(group_of))]
    entries = df_filtered.sample(frac=1, random_state=42).reset_index(drop=True)

    created_files = []
    metadata = []
    sample_count = 0
    sample_rate = 16000
    target_duration_ms = duration_seconds * 1000

    # Reserve a few samples for white noise
    n_synthetic = 3
    real_needed = max(num_samples - n_synthetic, 0)

    for _, row in entries.iterrows():
        if sample_count >= real_needed:
            break

        wav_path = os.path.join("ESC-50-master/audio", row["filename"])
        if not os.path.exists(wav_path):
            continue

        category = row["category"]
        group = group_of.get(category, "unknown")

        noise_audio = AudioSegment.from_wav(wav_path).set_frame_rate(sample_rate).set_channels(1)

        if len(noise_audio) == 0:
            # An empty clip cannot be looped (division by zero below).
            continue

        # Loop clips that are shorter than the target, then cut to length
        if len(noise_audio) < target_duration_ms:
            loops = (target_duration_ms // len(noise_audio)) + 1
            noise_audio = noise_audio * loops

        noise_audio = noise_audio[:target_duration_ms]
        volume_adj = random.choice([-10, -5, 0, 5])  # gain in dB
        noise_audio += volume_adj

        out_path = f'sample_audios/noise_only/{group}_{sample_count:03d}.wav'
        noise_audio.export(out_path, format='wav')
        created_files.append(out_path)

        metadata.append({
            'filename': os.path.basename(out_path),
            'group': group,
            'esc_category': category,
            'source_file': row["filename"],
            'volume_adjustment_db': volume_adj,
            'duration_seconds': duration_seconds
        })

        sample_count += 1

    # Add synthetic white noise
    for _ in range(n_synthetic):
        if sample_count >= num_samples:
            break

        samples = int(duration_seconds * sample_rate)
        noise_level = random.choice([0.01, 0.05, 0.1])  # standard deviation
        white_noise = np.random.normal(0, noise_level, samples).astype(np.float32)
        white_noise = np.clip(white_noise, -1, 1)

        out_path = f'sample_audios/noise_only/white_noise_{sample_count:03d}.wav'
        sf.write(out_path, white_noise, sample_rate)
        created_files.append(out_path)

        metadata.append({
            'filename': os.path.basename(out_path),
            'group': 'synthetic',
            'esc_category': 'generated_white_noise',
            'source_file': 'none',
            'volume_adjustment_db': 0,
            'noise_level': noise_level,
            'duration_seconds': duration_seconds
        })

        sample_count += 1

    # Save metadata
    with open('sample_audios/noise_only/metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2)

    print(f"\n✓ Created {len(created_files)} noise-only samples")
    if metadata:
        # With no rows there is no 'group' column to summarize.
        df_summary = pd.DataFrame(metadata)
        print(df_summary['group'].value_counts())

    return created_files




In [None]:
def test_noise_only_baseline(model_size="base", model=None):
    """
    Test Whisper on noise-only samples to see what it hallucinates (baseline only)

    Every sample listed in ``sample_audios/noise_only/metadata.json`` is
    transcribed directly; since the audio contains no speech, any non-empty
    transcription is counted as a hallucination. Results are saved to
    ``whisper_<model_size>_noise_only_baseline.csv``.

    Args:
        model_size: Whisper model size; used to load a model when *model* is
            None and as part of the output file name.
        model: Optional already-loaded ``WhisperModel`` to reuse.

    Returns:
        DataFrame with results (one row per sample)
    """
    print(f"\n{'='*60}")
    print("Testing Whisper on Noise-Only Samples (Baseline)")
    print(f"{'='*60}")

    # Initialize model if not provided
    if model is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        compute_type = "float16" if device == "cuda" else "int8"
        print(f"Loading model: {model_size} on {device}")
        model = WhisperModel(model_size, device=device, compute_type=compute_type)

    # Load metadata
    with open('sample_audios/noise_only/metadata.json', 'r') as f:
        metadata = json.load(f)

    results = []

    for sample in tqdm(metadata, desc="Processing noise-only samples"):
        audio_path = f"sample_audios/noise_only/{sample['filename']}"

        start_time = time.time()

        # Direct transcription
        segments, _ = model.transcribe(audio_path, beam_size=5)
        transcription = " ".join([segment.text.strip() for segment in segments])

        inference_time = time.time() - start_time

        # For noise-only, any transcription is a hallucination
        stripped = transcription.strip()
        is_hallucination = len(stripped) > 0

        results.append({
            'filename': sample['filename'],
            'category': sample.get('group', sample.get('category', 'unknown')),
            'transcription': transcription,
            'is_hallucination': is_hallucination,
            'word_count': len(transcription.split()) if is_hallucination else 0,
            'char_count': len(stripped) if is_hallucination else 0,
            'inference_time': inference_time,
            'volume_adjustment_db': sample.get('volume_adjustment_db', 0)
        })

    # Fix the column set so an empty run still yields a well-formed frame
    columns = [
        'filename', 'category', 'transcription', 'is_hallucination',
        'word_count', 'char_count', 'inference_time', 'volume_adjustment_db',
    ]
    df_results = pd.DataFrame(results, columns=columns)
    output_file = f'whisper_{model_size}_noise_only_baseline.csv'

    if df_results.empty:
        # Nothing to analyze; still write the (header-only) CSV.
        print("\nNo noise-only samples listed in metadata; nothing to analyze.")
        df_results.to_csv(output_file, index=False)
        print(f"\n✓ Results saved to '{output_file}'")
        return df_results

    # Analysis
    print("\n" + "="*60)
    print("NOISE-ONLY BASELINE RESULTS")
    print("="*60)

    # Overall hallucination rate
    hallucinations = df_results[df_results['is_hallucination']]
    hallucination_rate = df_results['is_hallucination'].mean() * 100
    avg_words = hallucinations['word_count'].mean() if not hallucinations.empty else 0

    print(f"\nOverall Results:")
    print(f"  Hallucination rate: {hallucination_rate:.1f}%")
    print(f"  Average words when hallucinating: {avg_words:.1f}")
    print(f"  Total samples tested: {len(df_results)}")

    # Hallucination by noise category (in order of first appearance)
    print("\nHallucination Rate by Noise Category:")
    print("-" * 40)

    for category, cat_data in df_results.groupby('category', sort=False):
        cat_hall_rate = cat_data['is_hallucination'].mean() * 100
        print(f"{category}: {cat_hall_rate:.1f}% ({len(cat_data)} samples)")

    # Sample hallucinations
    print("\nSample Hallucinations:")
    print("-" * 40)

    if len(hallucinations) > 0:
        for _, row in hallucinations.head(10).iterrows():
            print(f"\n{row['category']} noise → \"{row['transcription']}\"")
    else:
        print("No hallucinations detected!")

    # Save results
    df_results.to_csv(output_file, index=False)
    print(f"\n✓ Results saved to '{output_file}'")

    return df_results

In [None]:
def test_noise_only_complete(model_size="base"):
    """
    Run the whole noise-only experiment (baseline only).

    Generates 20 ten-second noise-only samples, transcribes them with
    ``test_noise_only_baseline`` and prints the categories with the highest
    and lowest hallucination rate plus the correlation between the applied
    volume adjustment and hallucination. Returns the results DataFrame.
    """
    # Step 1: (re)create the noise-only samples. They are regenerated on
    # every call; no check for an existing metadata.json is made.
    create_noise_only_samples(num_samples=20, duration_seconds=10)

    # Step 2: transcribe them with Whisper (baseline only)
    df_results = test_noise_only_baseline(model_size=model_size)

    # Step 3: key findings
    banner = "=" * 60
    print("\n" + banner)
    print("KEY FINDINGS")
    print(banner)

    # Mean hallucination flag per category = hallucination rate per category
    rate_per_category = df_results.groupby('category')['is_hallucination'].mean()

    worst_cat = rate_per_category.idxmax()
    worst_rate = rate_per_category.max() * 100
    best_cat = rate_per_category.idxmin()
    best_rate = rate_per_category.min() * 100

    print(f"\nMost problematic noise type: {worst_cat} ({worst_rate:.1f}% hallucination)")
    print(f"Least problematic noise type: {best_cat} ({best_rate:.1f}% hallucination)")

    # Volume effect: off-diagonal entry of the 2x2 correlation matrix
    if 'volume_adjustment_db' in df_results.columns:
        corr_matrix = df_results[['volume_adjustment_db', 'is_hallucination']].corr()
        vol_corr = corr_matrix.iloc[0, 1]
        print(f"\nVolume correlation with hallucination: {vol_corr:.3f}")

    return df_results

# Usage: when this cell/script is executed as the main module, run the whole
# noise-only pipeline with the "base" model and keep the results DataFrame.
if __name__ == "__main__":
    # Run complete noise-only test
    results = test_noise_only_complete(model_size="base")

In [None]:
# Test Noise-Only Audio with Filters to Reduce Hallucinations

import os
import json
import pandas as pd
import numpy as np
import librosa
import soundfile as sf
from faster_whisper import WhisperModel
import noisereduce as nr
from scipy.signal import butter, filtfilt, wiener
from tqdm import tqdm
import time
import torch
import tempfile

# Install required packages
# !pip install -q noisereduce

class SimpleNoiseFilter:
    """
    Simple noise filtering methods for reducing hallucinations.

    Every method takes a 1-D audio array and returns the filtered audio.
    Methods that need a sample rate accept ``sr``; when it is omitted the
    instance's ``sample_rate`` is used.
    """

    def __init__(self, sample_rate=16000):
        # Default sample rate used when a method is called without `sr`.
        self.sample_rate = sample_rate

    def _resolve_sr(self, sr):
        """Return *sr*, falling back to the instance default when it is None."""
        return self.sample_rate if sr is None else sr

    def spectral_gating(self, audio_data, sr=None):
        """Apply spectral gating noise reduction (noisereduce, full strength)."""
        return nr.reduce_noise(y=audio_data, sr=self._resolve_sr(sr), prop_decrease=1.0)

    def wiener_filter(self, audio_data, noise_power=0.1):
        """Apply scipy's Wiener filter with the given noise power."""
        return wiener(audio_data, noise=noise_power)

    def spectral_subtraction(self, audio_data, sr=None, noise_factor=0.1):
        """
        Apply spectral subtraction.

        The noise spectrum is estimated as the mean magnitude of the first
        0.5 seconds and ``noise_factor`` times that estimate is subtracted
        from every frame, with a floor of 0.2% of the original magnitude.
        The result has the same number of samples as the input.
        """
        sr = self._resolve_sr(sr)
        n_fft, hop_length = 2048, 512

        # STFT
        stft = librosa.stft(audio_data, n_fft=n_fft, hop_length=hop_length)
        magnitude = np.abs(stft)
        phase = np.angle(stft)

        # Estimate noise from first 0.5 seconds (at least one frame, so the
        # mean is never taken over an empty slice)
        noise_frames = max(1, int(0.5 * sr / hop_length))
        noise_magnitude = np.mean(magnitude[:, :noise_frames], axis=1, keepdims=True)

        # Subtract noise, keeping a small spectral floor
        clean_magnitude = magnitude - noise_factor * noise_magnitude
        clean_magnitude = np.maximum(clean_magnitude, 0.002 * magnitude)

        # Reconstruct with the original phase; `length` keeps the output the
        # same size as the input instead of a whole number of hops.
        clean_stft = clean_magnitude * np.exp(1j * phase)
        return librosa.istft(clean_stft, hop_length=hop_length, length=len(audio_data))

    def aggressive_filter(self, audio_data, sr=None):
        """
        Apply aggressive filtering to minimize hallucinations.

        Three stages: stationary spectral gating, a 5th-order 300 Hz
        high-pass filter, and an amplitude gate that zeroes every sample
        below 10% of the peak.
        """
        sr = self._resolve_sr(sr)

        # Step 1: Heavy spectral gating. prop_decrease is a proportion where
        # 1.0 means 100% reduction; the previous value of 1.5 was outside
        # that range.
        filtered = nr.reduce_noise(y=audio_data, sr=sr, prop_decrease=1.0, stationary=True)

        # Step 2: Aggressive high-pass filter (remove low frequencies)
        nyquist = sr / 2
        cutoff = 300 / nyquist  # 300 Hz cutoff, normalized to Nyquist
        b, a = butter(5, cutoff, btype='high')
        filtered = filtfilt(b, a, filtered)

        # Step 3: Amplitude gating (silence very quiet parts)
        threshold = np.max(np.abs(filtered)) * 0.1
        mask = np.abs(filtered) > threshold
        return filtered * mask

def test_noise_with_filters(model_size="base", filter_methods=('none', 'spectral_gating', 'aggressive')):
    """
    Test noise-only samples with different filters to measure hallucination reduction.

    Every clip listed in ``sample_audios/noise_only/metadata.json`` is
    transcribed once per filter method. Because the clips contain no speech,
    any non-empty transcription is counted as a hallucination.

    Args:
        model_size: Whisper model size
        filter_methods: Iterable of filter method names to test. Recognised
            names are 'none', 'spectral_gating', 'wiener',
            'spectral_subtraction' and 'aggressive'; any other name is run on
            the unfiltered (but peak-normalized) audio.

    Returns:
        DataFrame with one row per (sample, filter method) pair
    """
    print(f"\n{'='*60}")
    print("Testing Noise-Only Samples with Filters")
    print(f"{'='*60}")

    # Materialize once so generators can be passed and iterated per sample.
    filter_methods = list(filter_methods)

    # Initialize model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    compute_type = "float16" if device == "cuda" else "int8"
    print(f"Loading model: {model_size} on {device}")
    model = WhisperModel(model_size, device=device, compute_type=compute_type)

    # Initialize filter and map each method name to the call that applies it
    noise_filter = SimpleNoiseFilter()
    filter_dispatch = {
        'spectral_gating': lambda audio, rate: noise_filter.spectral_gating(audio, rate),
        'wiener': lambda audio, rate: noise_filter.wiener_filter(audio),
        'spectral_subtraction': lambda audio, rate: noise_filter.spectral_subtraction(audio, rate),
        'aggressive': lambda audio, rate: noise_filter.aggressive_filter(audio, rate),
    }

    # Surface typos once instead of silently reporting unfiltered results
    # under the misspelled name for every sample.
    unknown = [m for m in filter_methods if m != 'none' and m not in filter_dispatch]
    if unknown:
        print(f"Warning: unknown filter method(s) {unknown} will be run on unfiltered audio")

    # Load metadata
    with open('sample_audios/noise_only/metadata.json', 'r') as f:
        metadata = json.load(f)

    results = []

    # Test each sample with each filter
    for sample in tqdm(metadata, desc="Processing samples"):
        audio_path = f"sample_audios/noise_only/{sample['filename']}"

        # Load audio once
        audio_data, sr = librosa.load(audio_path, sr=16000)

        for filter_method in filter_methods:
            start_time = time.time()

            if filter_method == 'none':
                # Direct transcription of the original file
                segments, _ = model.transcribe(audio_path, beam_size=5)
                transcription = " ".join(segment.text.strip() for segment in segments)
            else:
                # Apply filter (unknown names fall back to the raw audio)
                apply_filter = filter_dispatch.get(filter_method)
                filtered_audio = apply_filter(audio_data, sr) if apply_filter else audio_data

                # Normalize peak to 0.95; skipped for all-zero (or NaN) output
                max_val = np.max(np.abs(filtered_audio))
                if max_val > 0:
                    filtered_audio = filtered_audio / max_val * 0.95

                # Save to a temp file and transcribe. The handle is closed
                # first so the file can be reopened by name, and the file is
                # removed in `finally` so it does not leak when writing or
                # transcription raises.
                tmp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
                tmp.close()
                try:
                    sf.write(tmp.name, filtered_audio, sr)
                    segments, _ = model.transcribe(tmp.name, beam_size=5)
                    # `segments` is a lazy generator: consume it while the
                    # file still exists.
                    transcription = " ".join(segment.text.strip() for segment in segments)
                finally:
                    os.unlink(tmp.name)

            # Includes filtering, file I/O and decoding time
            inference_time = time.time() - start_time

            # Analyze results
            is_hallucination = len(transcription.strip()) > 0
            word_count = len(transcription.split()) if is_hallucination else 0
            char_count = len(transcription.strip()) if is_hallucination else 0

            result = {
                'filename': sample['filename'],
                'category': sample.get('group', sample.get('category', 'unknown')),
                'filter_method': filter_method,
                'transcription': transcription,
                'is_hallucination': is_hallucination,
                'word_count': word_count,
                'char_count': char_count,
                'inference_time': inference_time,
                'volume_adjustment_db': sample.get('volume_adjustment_db', 0)
            }

            results.append(result)

    return pd.DataFrame(results)

def analyze_filter_effectiveness(df_results):
    """
    Analyze how effective filters are at reducing hallucinations.

    Prints a report and returns two summary tables.

    Args:
        df_results: DataFrame with at least the columns 'filter_method',
            'category', 'is_hallucination' (boolean) and 'word_count'.

    Returns:
        Tuple ``(filter_stats, improvements)`` of DataFrames:
        - filter_stats: one row per filter with columns 'Filter',
          'Hallucination Rate (%)', 'Avg Words' (both formatted strings) and
          'Samples'.
        - improvements: one row per non-baseline filter with columns
          'Filter', 'Reduction' and 'Relative Improvement' (formatted
          strings). Empty when the results contain no 'none' baseline.
    """
    print("\n" + "="*60)
    print("FILTER EFFECTIVENESS ANALYSIS")
    print("="*60)

    stats_columns = ['Filter', 'Hallucination Rate (%)', 'Avg Words', 'Samples']
    improvement_columns = ['Filter', 'Reduction', 'Relative Improvement']

    # An empty frame (e.g. no samples processed) may not even have the
    # expected columns, so bail out before indexing into it.
    if df_results.empty:
        print("\nNo results to analyze.")
        return (pd.DataFrame(columns=stats_columns),
                pd.DataFrame(columns=improvement_columns))

    # Overall hallucination rates by filter
    print("\nHallucination Rates by Filter Method:")
    print("-" * 40)

    filter_stats = []
    rates = {}

    for filter_method in df_results['filter_method'].unique():
        filter_data = df_results[df_results['filter_method'] == filter_method]
        hallucinated = filter_data[filter_data['is_hallucination']]
        hall_rate = filter_data['is_hallucination'].mean() * 100
        # Average only over hallucinating rows; 0 when there are none
        avg_words = hallucinated['word_count'].mean() if len(hallucinated) else 0
        rates[filter_method] = hall_rate

        filter_stats.append({
            'Filter': filter_method,
            'Hallucination Rate (%)': f"{hall_rate:.1f}",
            'Avg Words': f"{avg_words:.1f}",
            'Samples': len(filter_data)
        })

        print(f"{filter_method}:")
        print(f"  Hallucination rate: {hall_rate:.1f}%")
        print(f"  Average words when hallucinating: {avg_words:.1f}")

    # Calculate improvement relative to the unfiltered baseline
    improvements = []

    if 'none' not in rates:
        # Without a baseline every difference would be NaN; say so instead.
        print("\nNo 'none' baseline in results; skipping improvement analysis.")
    else:
        baseline_rate = rates['none']

        print("\nImprovement over Baseline:")
        print("-" * 40)

        for filter_method, filter_rate in rates.items():
            if filter_method == 'none':
                continue
            improvement = baseline_rate - filter_rate
            relative_improvement = (improvement / baseline_rate * 100) if baseline_rate > 0 else 0

            improvements.append({
                'Filter': filter_method,
                'Reduction': f"{improvement:.1f}%",
                'Relative Improvement': f"{relative_improvement:.1f}%"
            })

            print(f"{filter_method}: {improvement:.1f}% absolute reduction ({relative_improvement:.1f}% relative)")

    # Best filter by category
    print("\nBest Filter by Noise Category:")
    print("-" * 40)

    for category in df_results['category'].unique():
        cat_data = df_results[df_results['category'] == category]
        if cat_data.empty:
            # e.g. a NaN category never compares equal to itself
            continue

        # Find filter with lowest hallucination rate
        hall_by_filter = cat_data.groupby('filter_method')['is_hallucination'].mean()
        best_filter = hall_by_filter.idxmin()
        best_rate = hall_by_filter.min() * 100
        # Report a missing baseline as such rather than inventing 100%
        if 'none' in hall_by_filter.index:
            baseline_text = f"{hall_by_filter['none'] * 100:.1f}%"
        else:
            baseline_text = "n/a"

        print(f"{category}: {best_filter} ({best_rate:.1f}% vs {baseline_text} baseline)")

    return (pd.DataFrame(filter_stats, columns=stats_columns),
            pd.DataFrame(improvements, columns=improvement_columns))

def test_noise_complete_with_filters(model_size="base"):
    """
    Complete test pipeline with filters.

    Runs every filter over the noise-only samples, prints the effectiveness
    analysis, writes the detailed results and the per-filter summary to CSV
    files in the working directory, and prints the key findings.

    Args:
        model_size: Whisper model size; also used in the output file names.

    Returns:
        DataFrame of detailed per-sample, per-filter results.
    """
    # Define filters to test
    filter_methods = ['none', 'spectral_gating', 'wiener', 'spectral_subtraction', 'aggressive']

    # Test with all filters
    print("Testing multiple noise reduction filters...")
    df_results = test_noise_with_filters(model_size, filter_methods)

    # With no processed samples there is nothing to rank or save, and the
    # groupby/idxmin calls below would raise.
    if df_results.empty:
        print("\nNo samples were processed; nothing to analyze.")
        return df_results

    # Analyze results (only the per-filter summary is saved below)
    filter_stats, _ = analyze_filter_effectiveness(df_results)

    # Save detailed results
    output_file = f'whisper_{model_size}_noise_filtered_comparison.csv'
    df_results.to_csv(output_file, index=False)
    print(f"\n✓ Detailed results saved to '{output_file}'")

    # Save summary
    summary_file = f'whisper_{model_size}_filter_summary.csv'
    filter_stats.to_csv(summary_file, index=False)
    print(f"✓ Summary saved to '{summary_file}'")

    # Key findings
    print("\n" + "="*60)
    print("KEY FINDINGS")
    print("="*60)

    # Best overall filter
    hall_by_filter = df_results.groupby('filter_method')['is_hallucination'].mean()
    best_filter = hall_by_filter.idxmin()
    best_rate = hall_by_filter.min() * 100

    print(f"\nBest overall filter: {best_filter}")
    print(f"Hallucination rate: {best_rate:.1f}%")

    # Sample hallucinations that were fixed: baseline rows that hallucinated
    # and whose file came out clean under the best filter. Selecting the
    # fixed rows *before* taking the first three means the examples are not
    # lost when the first few baseline hallucinations happen to persist.
    baseline_hall = df_results[(df_results['filter_method'] == 'none') & (df_results['is_hallucination'])]
    best_rows = df_results[df_results['filter_method'] == best_filter]
    clean_files = set(best_rows.loc[~best_rows['is_hallucination'].astype(bool), 'filename'])
    fixed = baseline_hall[baseline_hall['filename'].isin(clean_files)]

    if best_filter != 'none' and len(fixed) > 0:
        print("\nExamples of hallucinations prevented by filtering:")
        print("-" * 40)

        for _, baseline_row in fixed.head(3).iterrows():
            print(f"\n{baseline_row['category']} noise:")
            print(f"  Baseline: \"{baseline_row['transcription']}\"")
            print(f"  {best_filter}: (no hallucination)")
    elif len(baseline_hall) > 0:
        print("\nNo baseline hallucinations were eliminated by the best filter.")

    return df_results

# Quick comparison function
def quick_filter_comparison(audio_file, model_size="base"):
    """
    Quick test of all filters on a single audio file.

    Transcribes the file unfiltered and after each noise filter, printing
    the transcription (or "(no hallucination)" when it is empty) per filter.

    Args:
        audio_file: Path to the audio file to test.
        model_size: Whisper model size.
    """
    print(f"\nTesting filters on: {audio_file}")
    print("-" * 40)

    model = WhisperModel(model_size)
    noise_filter = SimpleNoiseFilter()

    # Load audio
    audio_data, sr = librosa.load(audio_file, sr=16000)

    # Each entry builds the filtered audio on demand; None marks the
    # unfiltered baseline, which is transcribed straight from the file.
    # 'spectral_subtraction' is included so the list matches the full
    # pipeline's filter set.
    filters = {
        'none': None,
        'spectral_gating': lambda: noise_filter.spectral_gating(audio_data, sr),
        'wiener': lambda: noise_filter.wiener_filter(audio_data),
        'spectral_subtraction': lambda: noise_filter.spectral_subtraction(audio_data, sr),
        'aggressive': lambda: noise_filter.aggressive_filter(audio_data, sr)
    }

    # Test each filter
    for name, make_filtered in filters.items():
        if make_filtered is None:
            segments, _ = model.transcribe(audio_file)
            transcription = " ".join(s.text.strip() for s in segments)
        else:
            # Close the handle before writing by name, and always remove the
            # file, even when filtering or transcription raises.
            tmp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
            tmp.close()
            try:
                sf.write(tmp.name, make_filtered(), sr)
                segments, _ = model.transcribe(tmp.name)
                # `segments` is lazy: consume it before the file is deleted.
                transcription = " ".join(s.text.strip() for s in segments)
            finally:
                os.unlink(tmp.name)

        print(f"\n{name}: \"{transcription}\"" if transcription else f"\n{name}: (no hallucination)")

# Usage
# Entry point: runs when this code is executed as the main module (not when
# imported), which includes executing it as a notebook cell.
if __name__ == "__main__":
    # Run complete test with filters; loads the "base" model and writes two
    # CSV files to the working directory. The returned DataFrame is kept in
    # `results` for further inspection.
    results = test_noise_complete_with_filters(model_size="base")

    # Optional: Quick test on single file
    # quick_filter_comparison("sample_audios/noise_only/human_001.wav")


In [None]:
# Print the version of the ffmpeg binary currently on PATH.
!ffmpeg -version

In [None]:
# Add the savoury1/ffmpeg4 PPA and (re)install ffmpeg from apt.
! add-apt-repository -y ppa:savoury1/ffmpeg4
! apt-get -qq install -y ffmpeg

In [None]:
# Re-check which ffmpeg version is active after the install above.
!ffmpeg -version

In [None]:
# Start from a clean checkout: remove any previous clone, then clone again.
!rm -rf WhisperHallu
!git clone https://github.com/EtienneAb3d/WhisperHallu.git

In [None]:
# Install or upgrade demucs (presumably a WhisperHallu dependency - TODO confirm).
!pip install -U demucs

In [None]:
import os
# NOTE(review): this changes the working directory for every cell that runs
# afterwards, so relative paths such as 'sample_audios/...' then resolve
# inside WhisperHallu/. The cell is also not re-runnable as-is: a second run
# looks for WhisperHallu/WhisperHallu.
os.chdir("WhisperHallu")
!git pull

In [None]:
# NOTE(review): installs faster-whisper from the git master branch, replacing
# the release installed by `pip install faster-whisper` at the top of the
# notebook - confirm this override is intended.
!pip install "faster-whisper @ git+https://github.com/guillaumekln/faster-whisper@master#faster-whisper[conversion]"

# Convert the openai/whisper-medium and openai/whisper-large checkpoints with
# float16 quantization; outputs go to whisper-medium-ct2/ and whisper-large-ct2/
# under the current working directory.
!ct2-transformers-converter --model openai/whisper-medium --output_dir whisper-medium-ct2 --quantization float16
!ct2-transformers-converter --model openai/whisper-large --output_dir whisper-large-ct2 --quantization float16


In [None]:
# Install torchaudio.
!pip3 install torchaudio

In [None]:
# List the working directory (all files, long format, oldest first).
!ls -lahtr

In [None]:
import os
import json

# Rebuild metadata.json from the .wav files actually present in the
# noise-only folder. The path is relative to the current working directory.
# NOTE: this replaces the existing file, so any extra per-sample fields it
# held are dropped; each entry only has 'filename' and 'category'.
folder = '../sample_audios/noise_only'
meta = []

# os.listdir returns entries in arbitrary order; sort so the generated file
# (and any "first N samples" slice taken from it later) is reproducible.
for fname in sorted(os.listdir(folder)):
    if fname.endswith('.wav'):
        # Category is the filename prefix before the first underscore,
        # e.g. "human_004.wav" -> "human".
        category = fname.split('_')[0]
        meta.append({"filename": fname, "category": category})

# Overwrite metadata.json with only existing files
with open(os.path.join(folder, 'metadata.json'), 'w') as f:
    json.dump(meta, f, indent=2)

print(f"✅ metadata.json created with {len(meta)} valid entries!")


In [None]:
# Test WhisperHallu on Noise-Only Audio

# First install WhisperHallu
# !pip install -q git+https://github.com/Mageswaran1989/WhisperHallu

import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import torch

# Import WhisperHallu
# `transcribeHallu` is imported as a top-level module, so it must be
# importable from sys.path (for example when the working directory is the
# WhisperHallu checkout).
try:
    from transcribeHallu import loadModel, transcribePrompt
except ImportError:
    # Print a hint, then re-raise so the cell still fails visibly.
    # NOTE(review): the hint points at the Mageswaran1989 repository, while
    # the setup cell clones EtienneAb3d/WhisperHallu - confirm which source
    # is intended and whether it is pip-installable.
    print("Please install WhisperHallu first:")
    print("!pip install git+https://github.com/Mageswaran1989/WhisperHallu")
    raise

def test_whisperhallu_on_noise(model_size="base", max_samples=10, data_dir=None):
    """
    Test WhisperHallu on noise-only samples to see if it reduces hallucinations.

    Each sample is transcribed once per prompt variant (no prompt and two
    anti-hallucination prompts). Any non-empty transcription counts as a
    hallucination. Runs that raise are reported and left out of the results
    so a failure is never counted as "no hallucination".

    Args:
        model_size: Model size (base, small, medium, large)
        max_samples: Number of samples to test
        data_dir: Directory containing ``metadata.json`` and the audio files.
            Defaults to ``sample_audios/noise_only``, falling back to
            ``../sample_audios/noise_only`` when the metadata file is only
            found there (the notebook changes the working directory into the
            WhisperHallu checkout).

    Returns:
        DataFrame with results
    """
    print(f"\n{'='*60}")
    print("Testing WhisperHallu on Noise-Only Audio")
    print(f"{'='*60}")

    result_columns = [
        'filename', 'category', 'prompt_type', 'transcription',
        'is_hallucination', 'word_count', 'inference_time',
    ]

    # Locate the data directory. If no candidate has a metadata file, keep
    # the original default so the open() below raises the usual error.
    if data_dir is None:
        candidates = (
            os.path.join("sample_audios", "noise_only"),
            os.path.join("..", "sample_audios", "noise_only"),
        )
        data_dir = next(
            (c for c in candidates if os.path.isfile(os.path.join(c, "metadata.json"))),
            candidates[0],
        )

    # Load WhisperHallu model
    device = "0" if torch.cuda.is_available() else "cpu"
    print(f"Loading WhisperHallu model: {model_size} on device: {device}")
    loadModel(device, modelSize=model_size)

    # Define anti-hallucination prompt for English
    # This prompt is designed to prevent common hallucinations
    anti_hallucination_prompt = (
        "Whisper, Ok. "
        "This is just noise, no speech. "
        "Ok, Whisper. Whisper, Ok. "
        "Ok, Whisper. Whisper, Ok. "
        "Please do not transcribe anything from noise. "
        "This is to avoid hallucinations. "
        "Ok, Whisper. "
    )

    # Alternative prompt that explicitly mentions silence
    silence_prompt = (
        "Whisper, Ok. "
        "Silence. No words. Just background noise. "
        "Ok, Whisper. Whisper, Ok. "
        "Empty audio. No speech detected. "
        "Ok, Whisper. Whisper, Ok. "
        "Please transcribe only actual speech. "
        "Ok, Whisper. "
    )

    # Load metadata
    with open(os.path.join(data_dir, 'metadata.json'), 'r') as f:
        metadata = json.load(f)

    # Limit samples
    metadata = metadata[:max_samples]

    results = []
    failed_runs = 0

    # Test each noise sample with different prompts
    prompts = {
        'no_prompt': "",  # Baseline - empty prompt
        'anti_hallucination': anti_hallucination_prompt,
        'silence_prompt': silence_prompt
    }

    for sample in tqdm(metadata, desc="Processing noise samples"):
        audio_path = os.path.join(data_dir, sample["filename"])

        if not os.path.exists(audio_path):
            print(f"File not found: {audio_path}")
            continue

        for prompt_name, prompt_text in prompts.items():
            start_time = time.time()

            try:
                # Transcribe with WhisperHallu
                result = transcribePrompt(
                    path=audio_path,
                    lng="en",
                    prompt=prompt_text
                )
            except Exception as e:
                # Best-effort: report and move on, but do NOT record the run.
                # Recording it with an empty transcription would count the
                # failure as a correct "no speech" result and deflate the
                # hallucination rate.
                print(f"Error with {audio_path} using {prompt_name}: {e}")
                failed_runs += 1
                continue

            # Extract transcription. The return type is handled defensively:
            # a dict with a 'text' key, None (treated as empty rather than
            # the literal string "None"), or anything else via str().
            if isinstance(result, dict) and 'text' in result:
                transcription = result['text'].strip()
            elif result is None:
                transcription = ""
            else:
                transcription = str(result).strip()

            inference_time = time.time() - start_time

            # Check for hallucination
            is_hallucination = len(transcription) > 0
            word_count = len(transcription.split()) if is_hallucination else 0

            # Store result
            results.append({
                'filename': sample['filename'],
                'category': sample.get('group', sample.get('category', 'unknown')),
                'prompt_type': prompt_name,
                'transcription': transcription[:200],  # Limit length
                'is_hallucination': is_hallucination,
                'word_count': word_count,
                'inference_time': inference_time
            })

    if failed_runs:
        print(f"{failed_runs} transcription run(s) failed and were excluded from the results")

    # Passing the columns keeps the schema stable even when nothing ran.
    return pd.DataFrame(results, columns=result_columns)

def analyze_whisperhallu_results(df_results):
    """
    Analyze WhisperHallu effectiveness.

    Prints hallucination rates per prompt type, the improvement of each
    prompt over the 'no_prompt' baseline, and up to three example baseline
    hallucinations with what each prompt produced for the same file.

    Args:
        df_results: DataFrame with at least the columns 'filename',
            'category', 'prompt_type', 'transcription', 'is_hallucination'
            (boolean) and 'word_count'.

    Returns:
        DataFrame with one row per prompt type and the columns 'Prompt Type',
        'Hallucination Rate', 'Avg Words' (formatted strings) and 'Samples'.
        Empty when there are no results.
    """
    print("\n" + "="*60)
    print("WHISPERHALLU ANALYSIS")
    print("="*60)

    summary_columns = ['Prompt Type', 'Hallucination Rate', 'Avg Words', 'Samples']

    # An empty frame (no file found, every run failed) may not have the
    # expected columns at all, so return before indexing into it.
    if df_results.empty:
        print("\nNo results to analyze.")
        return pd.DataFrame(columns=summary_columns)

    # Overall results by prompt type
    print("\nHallucination Rates by Prompt Type:")
    print("-" * 40)

    summary = []
    rates = {}  # prompt type -> hallucination rate in percent

    for prompt_type in df_results['prompt_type'].unique():
        prompt_data = df_results[df_results['prompt_type'] == prompt_type]
        hallucinated = prompt_data[prompt_data['is_hallucination']]
        hall_rate = prompt_data['is_hallucination'].mean() * 100
        # Average only over hallucinating rows; 0 when there are none
        avg_words = hallucinated['word_count'].mean() if len(hallucinated) else 0
        rates[prompt_type] = hall_rate

        summary.append({
            'Prompt Type': prompt_type,
            'Hallucination Rate': f"{hall_rate:.1f}%",
            'Avg Words': f"{avg_words:.1f}",
            'Samples': len(prompt_data)
        })

        print(f"{prompt_type}:")
        print(f"  Hallucination rate: {hall_rate:.1f}%")
        print(f"  Average words when hallucinating: {avg_words:.1f}")

    summary_df = pd.DataFrame(summary, columns=summary_columns)

    # Calculate improvement
    if 'no_prompt' in rates:
        baseline_rate = rates['no_prompt']

        print("\nImprovement over baseline (no prompt):")
        print("-" * 40)

        for prompt_type in ['anti_hallucination', 'silence_prompt']:
            if prompt_type in rates:
                reduction = baseline_rate - rates[prompt_type]
                relative = (reduction / baseline_rate * 100) if baseline_rate > 0 else 0
                print(f"{prompt_type}: {reduction:.1f}% absolute ({relative:.1f}% relative)")

    # Show examples
    print("\nExample Hallucinations:")
    print("-" * 40)

    # Show baseline hallucinations
    baseline_hall = df_results[
        (df_results['prompt_type'] == 'no_prompt') &
        (df_results['is_hallucination'])
    ]

    for _, row in baseline_hall.head(3).iterrows():
        print(f"\n{row['category']} noise:")
        print(f"  No prompt: \"{row['transcription']}\"")

        # Check if prompts prevented it
        for prompt_type in ['anti_hallucination', 'silence_prompt']:
            prompt_result = df_results[
                (df_results['filename'] == row['filename']) &
                (df_results['prompt_type'] == prompt_type)
            ]

            if len(prompt_result) > 0:
                first_match = prompt_result.iloc[0]
                if not first_match['is_hallucination']:
                    print(f"  {prompt_type}: (no hallucination) ✓")
                else:
                    print(f"  {prompt_type}: \"{first_match['transcription']}\"")

    return summary_df

def compare_with_standard_whisper(model_size="base", sample_file=None):
    """
    Run one file through standard Whisper and through WhisperHallu and compare.

    Prints both transcriptions and a one-line verdict on which of the two,
    if any, produced text for the file. When no file is given, a default
    noise-only sample is used.
    """
    sample_file = "sample_audios/noise_only/human_004.wav" if sample_file is None else sample_file

    def _result_line(text):
        # Shared formatting for both engines' output line.
        if text:
            return f"   Result: \"{text}\""
        return "   Result: (no hallucination)"

    print(f"\nDirect Comparison on: {sample_file}")
    print("-" * 60)

    # --- Engine 1: plain faster-whisper ---
    from faster_whisper import WhisperModel

    print("1. Standard Whisper:")
    whisper = WhisperModel(model_size)
    segment_iter, _info = whisper.transcribe(sample_file)
    pieces = []
    for seg in segment_iter:
        pieces.append(seg.text.strip())
    standard_text = " ".join(pieces)
    print(_result_line(standard_text))

    # --- Engine 2: WhisperHallu with a prompt discouraging transcription ---
    print("\n2. WhisperHallu with anti-hallucination prompt:")
    if torch.cuda.is_available():
        device = "0"
    else:
        device = "cpu"
    loadModel(device, modelSize=model_size)

    anti_prompt = "".join((
        "Whisper, Ok. ",
        "This is just noise, no speech. ",
        "Ok, Whisper. Whisper, Ok. ",
        "Please do not transcribe anything from noise. ",
        "Ok, Whisper. ",
    ))

    result = transcribePrompt(path=sample_file, lng="en", prompt=anti_prompt)
    if isinstance(result, dict):
        hallu_text = result.get('text', '').strip()
    else:
        hallu_text = str(result).strip()
    print(_result_line(hallu_text))

    # --- Verdict, keyed on (standard produced text, WhisperHallu produced text) ---
    verdicts = {
        (True, False): "\n✓ WhisperHallu successfully prevented hallucination!",
        (False, False): "\n✓ Both methods correctly identified no speech",
        (True, True): "\n⚠ Both methods hallucinated",
        (False, True): "\n⚠ WhisperHallu hallucinated while standard Whisper didn't",
    }
    print(verdicts[(bool(standard_text), bool(hallu_text))])

def run_whisperhallu_test(model_size="base", max_samples=10):
    """
    Complete WhisperHallu test pipeline.

    Runs the noise-only test, prints the analysis, saves the detailed
    results to a CSV file in the working directory, and finishes with a
    single-file comparison against standard Whisper.

    Args:
        model_size: Model size; also used in the output file name.
        max_samples: Number of noise samples to test (previously fixed at 10).

    Returns:
        DataFrame with the detailed per-sample, per-prompt results.
    """
    # Test on noise samples
    df_results = test_whisperhallu_on_noise(model_size=model_size, max_samples=max_samples)

    # Analyze results (called for its printed report; the summary table it
    # returns is not needed here)
    analyze_whisperhallu_results(df_results)

    # Save results
    output_file = f'whisperhallu_{model_size}_noise_test.csv'
    df_results.to_csv(output_file, index=False)
    print(f"\n✓ Results saved to '{output_file}'")

    # Direct comparison example
    print("\n" + "="*60)
    print("DIRECT COMPARISON EXAMPLE")
    print("="*60)
    compare_with_standard_whisper(model_size=model_size)

    return df_results

# Usage
# Entry point: runs when this code is executed as the main module (not when
# imported), which includes executing it as a notebook cell.
if __name__ == "__main__":
    # Run complete test with the "base" model; the returned DataFrame is
    # kept in `results` for further inspection.
    results = run_whisperhallu_test(model_size="base")

    # Or test single file comparison
    # compare_with_standard_whisper(model_size="base", sample_file="sample_audios/noise_only/urban_000.wav")