# ASR Benchmarking Notebook for Multitask-National-Speech-Corpus-v1


In [1]:
import os
import time
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset, Audio
from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq, WhisperProcessor, WhisperForConditionalGeneration
from jiwer import wer, mer, wip, wil
from tqdm import tqdm

In [2]:
# ----------------------- Configurations -----------------------
# Inference device: "cuda" when torch reports a usable GPU, otherwise "cpu".
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [4]:
# ----------------------- Load ASR Model -----------------------
def load_asr_model(model_id):
    """Build a transcription callable for ``model_id``.

    A ``transformers`` automatic-speech-recognition pipeline is tried first.
    If constructing it raises, the model is loaded through ``AutoProcessor``
    and ``AutoModelForSpeechSeq2Seq`` instead.

    Args:
        model_id: Model identifier or path handed to ``transformers``.

    Returns:
        A callable ``fn(audio_array, sampling_rate=16000)``. Given a single
        clip it returns ``{"text": ...}`` (the pipeline path returns whatever
        the pipeline produces for one input). Given a list/tuple of clips it
        returns one result per clip, in order.
    """

    def _is_batch(audio):
        # A list/tuple whose items are themselves sized (arrays, lists) is a
        # batch of clips; an empty list/tuple is treated as an empty batch.
        if not isinstance(audio, (list, tuple)):
            return False
        return len(audio) == 0 or hasattr(audio[0], "__len__")

    # Keep the try body to the one call whose failure should trigger the
    # fallback; errors raised later during transcription must not be masked.
    try:
        asr_pipeline = pipeline(
            "automatic-speech-recognition",
            model=model_id,
            device=0 if DEVICE == "cuda" else -1,
        )
    except Exception as e:
        print(f"Falling back to custom processor loading for model {model_id} due to: {e}")
        processor = AutoProcessor.from_pretrained(model_id)
        model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id).to(DEVICE)

        def _transcribe_one(clip, sampling_rate):
            input_features = processor(
                clip, sampling_rate=sampling_rate, return_tensors="pt"
            ).input_features.to(DEVICE)
            with torch.no_grad():
                generated_ids = model.generate(input_features=input_features)
            transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            return {"text": transcription}

        def custom_asr_pipeline(audio_array, sampling_rate=16000):
            # Previously only the first decoded sequence was returned, so a
            # batch of clips silently lost every transcription but one.
            # Clips are generated one at a time, exactly like the single-clip path.
            if _is_batch(audio_array):
                return [_transcribe_one(clip, sampling_rate) for clip in audio_array]
            return _transcribe_one(audio_array, sampling_rate)

        return custom_asr_pipeline

    def wrapped_pipeline(audio_array, sampling_rate=16000):
        # Hand the sampling rate to the pipeline instead of dropping it, so the
        # pipeline knows the rate of the raw samples it receives.
        if _is_batch(audio_array):
            return asr_pipeline(
                [{"raw": clip, "sampling_rate": sampling_rate} for clip in audio_array]
            )
        return asr_pipeline({"raw": audio_array, "sampling_rate": sampling_rate})

    return wrapped_pipeline


In [5]:
# ----------------------- Benchmarking Function -----------------------
def benchmark_asr(asr_pipeline, dataset):
    """Transcribe every example in ``dataset`` and time each call.

    Args:
        asr_pipeline: Callable ``fn(audio_array, sampling_rate=...)`` returning
            a mapping with a ``"text"`` entry.
        dataset: Iterable of examples exposing ``example["context"]["array"]``
            (the audio samples) and ``example["answer"]`` (the reference text).

    Returns:
        ``(result_df, total_time)``: a DataFrame with ``reference``,
        ``prediction`` and ``time`` (seconds per example) columns, and the
        elapsed seconds for the whole loop.
    """
    results = []
    # perf_counter is monotonic, so the measured intervals cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    total_start_time = time.perf_counter()

    for example in tqdm(dataset, desc="Transcribing"):
        audio_array = example["context"]["array"]
        reference = example["answer"]

        start = time.perf_counter()
        # NOTE(review): the rate is fixed at 16 kHz here; confirm the dataset
        # audio is actually 16 kHz.
        prediction = asr_pipeline(audio_array, sampling_rate=16000)["text"]
        elapsed = time.perf_counter() - start

        results.append({
            "reference": reference,
            "prediction": prediction,
            "time": elapsed,
        })

    total_time = time.perf_counter() - total_start_time
    # Explicit columns keep the frame usable even when the dataset is empty.
    result_df = pd.DataFrame(results, columns=["reference", "prediction", "time"])
    return result_df, total_time

In [6]:
# ----------------------- Evaluation Metrics -----------------------
def evaluate_metrics(result_df):
    """Add per-row jiwer scores to ``result_df`` and average them.

    Args:
        result_df: DataFrame with ``reference``, ``prediction`` and ``time``
            columns. It is modified in place: ``wer``, ``mer``, ``wil`` and
            ``wip`` columns are added.

    Returns:
        ``(result_df, average_metrics)`` where ``average_metrics`` maps a
        display label to the column mean. Means are NaN for an empty frame.
    """
    pairs = list(zip(result_df["reference"], result_df["prediction"]))

    def _as_column(values):
        # Row-wise DataFrame.apply returns a DataFrame for an empty frame, which
        # cannot be assigned to a single column; an explicit float Series works
        # for any number of rows.
        return pd.Series(values, index=result_df.index, dtype=float)

    result_df["wer"] = _as_column([wer(ref, hyp) for ref, hyp in pairs])
    result_df["mer"] = _as_column([mer(ref, hyp) for ref, hyp in pairs])
    result_df["wil"] = _as_column([wil(ref, hyp) for ref, hyp in pairs])
    result_df["wip"] = _as_column([wip(ref, hyp) for ref, hyp in pairs])

    average_metrics = {
        "Average WER": result_df["wer"].mean(),
        "Average MER": result_df["mer"].mean(),
        "Average WIL": result_df["wil"].mean(),
        "Average WIP": result_df["wip"].mean(),
        # astype(float) keeps the mean defined (NaN) when the column is empty.
        "Average Time per Sample (s)": result_df["time"].astype(float).mean(),
    }

    return result_df, average_metrics

In [7]:
# ----------------------- Identify Most/Least Frequently Missed Words -----------------------
def word_analysis(result_df):
    """Count reference words that are absent from the matching prediction.

    For each row, both texts are lower-cased and split on whitespace. Every
    distinct reference word that does not occur anywhere in the prediction
    counts as one miss for that row.

    Args:
        result_df: DataFrame with ``reference`` and ``prediction`` string
            columns.

    Returns:
        ``(most_common_errors, least_common_errors)``: two lists of
        ``(word, count)`` tuples. The first holds the 10 most frequently
        missed words, the second the last 10 entries of the same descending
        ranking. The lists overlap when fewer than 20 distinct words were
        missed.
    """
    from collections import Counter

    error_counter = Counter()
    for _, row in result_df.iterrows():
        predicted_words = set(row["prediction"].lower().split())
        # dict.fromkeys de-duplicates while keeping first-occurrence order.
        # Iterating a set difference instead made the ranking of equally
        # frequent words depend on string hash randomisation between runs.
        for word in dict.fromkeys(row["reference"].lower().split()):
            if word not in predicted_words:
                error_counter[word] += 1

    ranked = error_counter.most_common()
    most_common_errors = ranked[:10]
    least_common_errors = ranked[-10:]
    return most_common_errors, least_common_errors

In [8]:
from tqdm import tqdm
import time
from datasets import load_dataset
from jiwer import wer
import numpy as np
import re

def clean_text(text):
    """
    Normalise a transcript string:
    - Strip speaker labels such as <speaker1>: or <speaker2>: (any letter case).
    - Drop matched round or square brackets while keeping what they enclose.
    - Lowercase the result and trim surrounding whitespace.
    """
    # (pattern, replacement, flags), applied strictly in this order.
    substitutions = (
        (r"<speaker\d+>:\s*", "", re.IGNORECASE),
        (r"\[(.*?)\]", r"\1", 0),
        (r"\((.*?)\)", r"\1", 0),
    )

    cleaned = text
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    return cleaned.lower().strip()


def run_benchmark(model_id, data_dir="ASR-PART1-Train", num_samples=100, batch_size=8):
    """Transcribe a slice of the corpus with one model and report its WER.

    Args:
        model_id: Model identifier passed to ``load_asr_model``.
        data_dir: ``data_dir`` passed to ``load_dataset``.
        num_samples: Upper bound on the number of examples taken from the
            start of the ``train`` split.
        batch_size: Number of clips handed to the model per call; must be >= 1.

    Returns:
        A dict with ``model_id``, ``wer`` (computed by jiwer over all cleaned
        reference/prediction pairs, or NaN when there are no samples),
        ``runtime_sec`` (seconds spent in the inference loop) and ``results``
        (one ``{"reference", "prediction"}`` dict per sample).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    print(f"\n📊 Running benchmark for model: {model_id}")

    # Load dataset
    dataset = load_dataset("MERaLiON/Multitask-National-Speech-Corpus-v1", data_dir=data_dir)["train"]
    dataset = dataset.select(range(min(num_samples, len(dataset))))

    # Load ASR model pipeline
    model_fn = load_asr_model(model_id)

    # Walk the dataset once so each example is only materialised a single time.
    audio_arrays = []
    references = []
    for sample in dataset:
        audio_arrays.append(sample["context"]["array"])
        references.append(sample["answer"])

    total = len(audio_arrays)
    predictions = []
    start_time = time.perf_counter()

    # Run batched inference
    for i in tqdm(range(0, total, batch_size), desc=f"Evaluating {model_id}", ncols=100):
        batch_audio = audio_arrays[i:i + batch_size]
        batch_refs = references[i:i + batch_size]

        try:
            preds = model_fn(batch_audio)

            if isinstance(preds, dict):
                if len(batch_audio) == 1:
                    preds = [preds]
                else:
                    # A single dict for several clips means the callable only
                    # handles one clip per call; transcribe them individually
                    # rather than letting zip() drop the remaining samples.
                    preds = [model_fn(clip) for clip in batch_audio]

            preds = list(preds)
            if len(preds) != len(batch_refs):
                raise ValueError(
                    f"model returned {len(preds)} transcriptions for {len(batch_refs)} clips"
                )

            # Build the rows for the whole batch before committing any of them,
            # so a failure part-way through cannot leave partial rows that are
            # then duplicated by the error rows below.
            batch_rows = [
                {
                    "reference": clean_text(ref),
                    "prediction": clean_text(pred_dict.get("text", "[ERROR]").strip()),
                }
                for ref, pred_dict in zip(batch_refs, preds)
            ]

        except Exception as e:
            batch_rows = [
                {"reference": clean_text(ref), "prediction": f"[ERROR: {e}]"}
                for ref in batch_refs
            ]

        predictions.extend(batch_rows)

        # Dynamically print progress without breaking the progress bar; the
        # count is capped so the last, shorter batch does not overshoot.
        tqdm.write(f"Processed {min(i + batch_size, total)}/{total} samples.")

    total_time = time.perf_counter() - start_time

    # Compute metrics
    references_clean = [x["reference"] for x in predictions]
    predictions_clean = [x["prediction"] for x in predictions]
    if predictions:
        computed_wer = wer(references_clean, predictions_clean)
    else:
        # Nothing was transcribed, so there is no error rate to report.
        computed_wer = float("nan")

    # Print metrics
    print(f"\n✅ Benchmark complete for {model_id}")
    print(f"⏱️ Total time: {total_time:.2f} seconds")
    print(f"🧠 WER: {computed_wer:.4f}")

    # Sample output
    print("\n🔍 Sample predictions:")
    for sample in predictions[:5]:
        print(f"REF: {sample['reference']}")
        print(f"HYP: {sample['prediction']}")
        print("-----")

    return {
        "model_id": model_id,
        "wer": computed_wer,
        "runtime_sec": total_time,
        "results": predictions,
    }

In [20]:
# ----------------------- Multi-Model Benchmark Loop -----------------------
import gc

def benchmark_multiple_models(model_ids, data_dir="ASR-PART6-Test", num_samples=100, batch_size=8):
    """Run ``run_benchmark`` for each model and tabulate the results.

    Args:
        model_ids: Iterable of model identifiers to benchmark, in order.
        data_dir: Forwarded to ``run_benchmark``.
        num_samples: Forwarded to ``run_benchmark``.
        batch_size: Forwarded to ``run_benchmark``.

    Returns:
        A DataFrame indexed by ``Model`` with ``WER`` and ``Runtime (sec)``
        columns, one row per model. It is empty, with the same columns, when
        ``model_ids`` is empty.
    """
    comparison_results = []

    for model_id in model_ids:
        # Release what the previous model left behind before loading the next.
        gc.collect()
        torch.cuda.empty_cache()  # Clear the GPU cache

        print(f"\n===== Benchmarking Model: {model_id} =====")

        # Run benchmark and get results
        result = run_benchmark(
            model_id=model_id,
            data_dir=data_dir,
            num_samples=num_samples,
            batch_size=batch_size,
        )

        # Collecting metrics for comparison
        comparison_results.append({
            "WER": result["wer"],
            "Runtime (sec)": result["runtime_sec"],
            "Model": model_id,
        })

    # Naming the columns explicitly keeps "Model" present even with no rows, so
    # set_index does not raise KeyError for an empty model list.
    comparison_df = pd.DataFrame(comparison_results, columns=["WER", "Runtime (sec)", "Model"])
    return comparison_df.set_index("Model")



In [21]:
# Example usage:
# Checkpoints to compare; uncomment an entry to include it in the run.
model_list = [
    # "openai/whisper-medium.en",
    "openai/whisper-small.en",
    # "openai/whisper-tiny.en",
    # "openai/whisper-base.en",
    # "facebook/wav2vec2-base-960h",
    "jensenlwt/whisper-small-singlish-122k",
    "mjwong/whisper-large-v3-turbo-singlish",
    # "mjwong/whisper-small-singlish",
    # "path/to/your/local/model",  # Example for a local model
]

comparison_df = benchmark_multiple_models(model_ids=model_list)

# Show the comparison table ordered by ascending WER.
display(comparison_df.sort_values("WER"))



===== Benchmarking Model: openai/whisper-small.en =====

📊 Running benchmark for model: openai/whisper-small.en


Device set to use cuda:0
Evaluating openai/whisper-small.en:   8%|██▏                         | 1/13 [00:05<01:05,  5.49s/it]

Processed 8/100 samples.


Evaluating openai/whisper-small.en:  15%|████▎                       | 2/13 [00:10<00:54,  4.98s/it]

Processed 16/100 samples.


Evaluating openai/whisper-small.en:  23%|██████▍                     | 3/13 [00:14<00:45,  4.54s/it]

Processed 24/100 samples.


Evaluating openai/whisper-small.en:  31%|████████▌                   | 4/13 [00:19<00:42,  4.68s/it]

Processed 32/100 samples.


Evaluating openai/whisper-small.en:  38%|██████████▊                 | 5/13 [00:22<00:35,  4.38s/it]

Processed 40/100 samples.


Evaluating openai/whisper-small.en:  46%|████████████▉               | 6/13 [00:27<00:31,  4.47s/it]

Processed 48/100 samples.


Evaluating openai/whisper-small.en:  54%|███████████████             | 7/13 [00:31<00:26,  4.46s/it]

Processed 56/100 samples.


Evaluating openai/whisper-small.en:  62%|█████████████████▏          | 8/13 [00:36<00:21,  4.37s/it]

Processed 64/100 samples.


Evaluating openai/whisper-small.en:  69%|███████████████████▍        | 9/13 [00:40<00:17,  4.47s/it]

Processed 72/100 samples.


Evaluating openai/whisper-small.en:  77%|████████████████████▊      | 10/13 [00:44<00:13,  4.37s/it]

Processed 80/100 samples.


Evaluating openai/whisper-small.en:  85%|██████████████████████▊    | 11/13 [00:49<00:08,  4.49s/it]

Processed 88/100 samples.


Evaluating openai/whisper-small.en:  92%|████████████████████████▉  | 12/13 [00:54<00:04,  4.51s/it]

Processed 96/100 samples.


Evaluating openai/whisper-small.en: 100%|███████████████████████████| 13/13 [00:55<00:00,  4.31s/it]


Processed 104/100 samples.

✅ Benchmark complete for openai/whisper-small.en
⏱️ Total time: 56.00 seconds
🧠 WER: 0.3396

🔍 Sample predictions:
REF: so it depends on the child as compared to jc itself you are basically the child would be you know studying throughout the two whole two years to prepare for one whole major exam which which is the a levels
ah okay um so i actually like to inquire more about the cca in polytechnic and also jc um
HYP: so it depends on the child. as compared to jc itself, basically the child would be studying throughout the whole two years to prepare for one whole major exam which is the a levels. so i'd actually like to inquire more about the cca in polytania, so jcs.
-----
REF: uh currently my parents they are fifty seven both fifty seven this year so uh i think both of them are working desk bound jobs ya the normal nine to five jobs and ya i don't think there's any risk in terms of health wise ya i would say they're rather healthy
HYP: currently my parents,

Device set to use cuda:0
Evaluating jensenlwt/whisper-small-singlish-122k:   8%|█             | 1/13 [00:05<01:02,  5.21s/it]

Processed 8/100 samples.


Evaluating jensenlwt/whisper-small-singlish-122k:  15%|██▏           | 2/13 [00:13<01:15,  6.89s/it]

Processed 16/100 samples.


Evaluating jensenlwt/whisper-small-singlish-122k:  23%|███▏          | 3/13 [00:21<01:14,  7.47s/it]

Processed 24/100 samples.


Evaluating jensenlwt/whisper-small-singlish-122k:  31%|████▎         | 4/13 [00:27<01:01,  6.82s/it]

Processed 32/100 samples.


Evaluating jensenlwt/whisper-small-singlish-122k:  38%|█████▍        | 5/13 [00:38<01:06,  8.25s/it]

Processed 40/100 samples.


Evaluating jensenlwt/whisper-small-singlish-122k:  46%|██████▍       | 6/13 [00:43<00:51,  7.35s/it]

Processed 48/100 samples.


Evaluating jensenlwt/whisper-small-singlish-122k:  54%|███████▌      | 7/13 [00:51<00:45,  7.59s/it]

Processed 56/100 samples.


Evaluating jensenlwt/whisper-small-singlish-122k:  62%|████████▌     | 8/13 [00:57<00:34,  6.92s/it]

Processed 64/100 samples.


Evaluating jensenlwt/whisper-small-singlish-122k:  69%|█████████▋    | 9/13 [01:03<00:26,  6.71s/it]

Processed 72/100 samples.


Evaluating jensenlwt/whisper-small-singlish-122k:  77%|██████████   | 10/13 [01:11<00:21,  7.17s/it]

Processed 80/100 samples.


Evaluating jensenlwt/whisper-small-singlish-122k:  85%|███████████  | 11/13 [01:16<00:13,  6.61s/it]

Processed 88/100 samples.


Evaluating jensenlwt/whisper-small-singlish-122k:  92%|████████████ | 12/13 [01:22<00:06,  6.18s/it]

Processed 96/100 samples.


Evaluating jensenlwt/whisper-small-singlish-122k: 100%|█████████████| 13/13 [01:24<00:00,  6.48s/it]


Processed 104/100 samples.

✅ Benchmark complete for jensenlwt/whisper-small-singlish-122k
⏱️ Total time: 84.28 seconds
🧠 WER: 0.5954

🔍 Sample predictions:
REF: so it depends on the child as compared to jc itself you are basically the child would be you know studying throughout the two whole two years to prepare for one whole major exam which which is the a levels
ah okay um so i actually like to inquire more about the cca in polytechnic and also jc um
HYP: so it depends on the child and as come back to j c its self you are basically the child would be you know studied throughout the two whole two years to prepare for one whole major exam which is the a levels okay so i'd actually like to enquire more about the c c a's in polytechnic and also j c's
-----
REF: uh currently my parents they are fifty seven both fifty seven this year so uh i think both of them are working desk bound jobs ya the normal nine to five jobs and ya i don't think there's any risk in terms of health wise ya i wou

Device set to use cuda:0
Evaluating mjwong/whisper-large-v3-turbo-singlish:   8%|▋        | 1/13 [30:21<6:04:15, 1821.29s/it]

Processed 8/100 samples.


Evaluating mjwong/whisper-large-v3-turbo-singlish:  15%|█▌        | 2/13 [34:29<2:44:14, 895.86s/it]

Processed 16/100 samples.


Evaluating mjwong/whisper-large-v3-turbo-singlish:  23%|██▎       | 3/13 [38:34<1:39:46, 598.62s/it]

Processed 24/100 samples.


Evaluating mjwong/whisper-large-v3-turbo-singlish:  31%|███       | 4/13 [42:48<1:09:24, 462.72s/it]

Processed 32/100 samples.


Evaluating mjwong/whisper-large-v3-turbo-singlish:  38%|████▌       | 5/13 [46:51<51:07, 383.48s/it]

Processed 40/100 samples.


Evaluating mjwong/whisper-large-v3-turbo-singlish:  46%|█████▌      | 6/13 [51:02<39:29, 338.56s/it]

Processed 48/100 samples.


Evaluating mjwong/whisper-large-v3-turbo-singlish:  54%|██████▍     | 7/13 [55:10<30:52, 308.72s/it]

Processed 56/100 samples.


Evaluating mjwong/whisper-large-v3-turbo-singlish:  62%|███████▍    | 8/13 [59:14<24:01, 288.34s/it]

Processed 64/100 samples.


Evaluating mjwong/whisper-large-v3-turbo-singlish:  69%|██████▉   | 9/13 [1:03:25<18:25, 276.49s/it]

Processed 72/100 samples.


Evaluating mjwong/whisper-large-v3-turbo-singlish:  77%|██████▉  | 10/13 [1:07:29<13:19, 266.49s/it]

Processed 80/100 samples.


Evaluating mjwong/whisper-large-v3-turbo-singlish:  85%|███████▌ | 11/13 [1:11:39<08:42, 261.47s/it]

Processed 88/100 samples.


Evaluating mjwong/whisper-large-v3-turbo-singlish:  85%|███████▌ | 11/13 [1:36:05<17:28, 524.09s/it]


KeyboardInterrupt: 