In [1]:
from pathlib import Path

# Root of your CUP demo dataset. This is the only path you need to edit:
# the output files below are derived from it.
ROOT_DIR = r"C:\Users\manmo\OneDrive\Desktop\Cup demo dataset"

# Result files live directly inside ROOT_DIR. They are built from ROOT_DIR
# (instead of repeating the absolute path) so that changing the dataset
# location cannot leave the outputs pointing at the old folder.
# Kept as plain strings because later cells print them and pass them to pandas.
OUTPUT_TSV = str(Path(ROOT_DIR) / "whisper_large_v3_eval.tsv")
LANG_SUM_TSV = str(Path(ROOT_DIR) / "whisper_large_v3_language_summary.tsv")

# Model + decoding settings
MODEL_NAME = "openai/whisper-large-v3"
TARGET_SR = 16000  # sampling rate (Hz) every clip is resampled to before decoding
TASK = "transcribe"  # keep "transcribe"
MAX_NEW_TOKENS = 256  # raise if you truly have long clips

# Dataset subfolders to process (must match your folder names under ROOT_DIR)
LANG_FOLDERS = ["chinese", "pashto", "urdu"]

# Map dataset folder name -> Whisper language prompt.
# Folders missing from this map fall back to their own name as the prompt.
LANG_PROMPT = {
    "chinese": "chinese",
    "pashto": "pashto",
    "urdu": "urdu",
}

# Name of the TSV file inside each language folder; the first one that exists
# is used.
TSV_CANDIDATES = ["validated.tsv", "validation.tsv"]

# Subfolder name that contains the audio files
CLIPS_SUBDIR = "clips"


In [None]:
import os, math, warnings
from typing import Dict, Any, List
import pandas as pd
import torch, torchaudio
from tqdm import tqdm

from transformers import WhisperProcessor, WhisperForConditionalGeneration

# jiwer v4 API:
from jiwer import process_words, process_characters

# Ignore every warning of category UserWarning for the rest of the session.
# The filter is global to the interpreter, so it also hides UserWarnings
# raised by third-party libraries, not only by code in this notebook.
warnings.filterwarnings("ignore", category=UserWarning)


def load_model_and_processor(model_name: str):
    """
    Build the Whisper processor and model for `model_name`.

    The model runs in float16 on "cuda" when CUDA is available and in
    float32 on "cpu" otherwise; it is moved to that device and switched to
    eval mode before being returned.

    Returns:
        (processor, model, device) where `device` is the string "cuda" or "cpu".
    """
    if torch.cuda.is_available():
        device, dtype = "cuda", torch.float16
    else:
        device, dtype = "cpu", torch.float32

    processor = WhisperProcessor.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(
        model_name,
        torch_dtype=dtype,
    )
    model.to(device)
    model.eval()
    return processor, model, device


def _to_mono(waveform: torch.Tensor) -> torch.Tensor:
    # waveform: (channels, n) or (n,)
    if waveform.ndim == 2:
        if waveform.size(0) > 1:
            waveform = waveform.mean(dim=0)
        else:
            waveform = waveform.squeeze(0)
    return waveform


def _resample_if_needed(
    waveform: torch.Tensor, sr: int, target_sr: int
) -> torch.Tensor:
    if sr == target_sr:
        return waveform
    return torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(waveform)


def load_audio_full(path: Path, target_sr: int = 16000) -> torch.Tensor:
    """
    Read one audio file in full and prepare it for the model.

    The clip is loaded with torchaudio, reduced to mono, resampled to
    `target_sr` when its native rate differs, made contiguous, and returned
    as a float32 tensor. No chunking or trimming is done here.
    """
    audio, native_sr = torchaudio.load(str(path))
    mono = _to_mono(audio)
    prepared = _resample_if_needed(mono, native_sr, target_sr).contiguous()
    if prepared.dtype == torch.float32:
        return prepared
    return prepared.to(torch.float32)


# ---------- Metrics helpers using jiwer v4 alignments ----------


def _pr_from_counts(hits: int, subs: int, dels: int, ins: int) -> Dict[str, float]:
    """
    precision = hits / (hits + substitutions + insertions)
    recall    = hits / (hits + substitutions + deletions)
    """
    predicted = hits + subs + ins
    reference = hits + subs + dels
    precision = (hits / predicted) if predicted > 0 else 0.0
    recall = (hits / reference) if reference > 0 else 0.0
    return {"precision": precision, "recall": recall}


def word_metrics(ref: str, hyp: str) -> Dict[str, float]:
    """
    Word-level metrics for a single reference/hypothesis pair.

    Counts come from jiwer.process_words (v4):
        H = hits, S = substitutions, D = deletions, I = insertions
        WER = (S + D + I) / (H + S + D)     # denominator = reference word count

    Returns a dict with "wer", "precision", "recall" and the raw "counts"
    ({"hits", "subs", "dels", "ins"}). WER is 0.0 if the reference count is 0.

    NOTE(review): jiwer's default word tokenisation appears to split on
    whitespace, so text written without spaces (e.g. the Chinese clips) is
    scored as a single word per sentence - CER is the more meaningful number
    there. Confirm against the jiwer docs.
    """
    alignment = process_words(ref, hyp)
    counts = {
        "hits": int(alignment.hits),
        "subs": int(alignment.substitutions),
        "dels": int(alignment.deletions),
        "ins": int(alignment.insertions),
    }

    n_ref_words = counts["hits"] + counts["subs"] + counts["dels"]
    n_errors = counts["subs"] + counts["dels"] + counts["ins"]
    scores = _pr_from_counts(**counts)

    return {
        "wer": n_errors / n_ref_words if n_ref_words > 0 else 0.0,
        "precision": scores["precision"],
        "recall": scores["recall"],
        "counts": counts,
    }


def char_cer(ref: str, hyp: str) -> float:
    """
    Character error rate for a single reference/hypothesis pair.

    Counts come from jiwer.process_characters (v4):
        CER = (S + D + I) / (H + S + D)     # denominator = reference char count

    Returns 0.0 if the reference character count is 0.
    """
    alignment = process_characters(ref, hyp)
    n_hits = int(alignment.hits)
    n_subs = int(alignment.substitutions)
    n_dels = int(alignment.deletions)
    n_ins = int(alignment.insertions)

    n_ref_chars = n_hits + n_subs + n_dels
    if n_ref_chars > 0:
        return (n_subs + n_dels + n_ins) / n_ref_chars
    return 0.0


def aggregate_word_counts(pairs: List[Dict[str, int]]) -> Dict[str, int]:
    """
    Add up per-utterance count dicts key by key.

    Each element of `pairs` must carry "hits", "subs", "dels" and "ins";
    the result has the same four keys holding the totals (all 0 for an
    empty list).
    """
    totals = dict.fromkeys(("hits", "subs", "dels", "ins"), 0)
    for counts in pairs:
        for key in totals:
            totals[key] += counts[key]
    return totals


def overall_word_metrics(refs: List[str], hyps: List[str]) -> Dict[str, float]:
    """
    Pooled word-level WER / precision / recall over many utterances.

    Every (reference, hypothesis) pair is aligned with jiwer.process_words,
    the hit/substitution/deletion/insertion counts are summed across pairs,
    and the metrics are computed once from those totals (so long utterances
    weigh more than short ones). WER is 0.0 if the pooled reference count is 0.
    """
    per_pair = []
    for reference, hypothesis in zip(refs, hyps):
        aligned = process_words(reference, hypothesis)
        per_pair.append(
            dict(
                hits=int(aligned.hits),
                subs=int(aligned.substitutions),
                dels=int(aligned.deletions),
                ins=int(aligned.insertions),
            )
        )

    pooled = aggregate_word_counts(per_pair)
    n_ref_words = pooled["hits"] + pooled["subs"] + pooled["dels"]
    n_errors = pooled["subs"] + pooled["dels"] + pooled["ins"]
    scores = _pr_from_counts(
        pooled["hits"], pooled["subs"], pooled["dels"], pooled["ins"]
    )
    return {
        "wer": n_errors / n_ref_words if n_ref_words > 0 else 0.0,
        "precision": scores["precision"],
        "recall": scores["recall"],
    }


def overall_char_cer(refs: List[str], hyps: List[str]) -> float:
    """
    Pooled character error rate over many utterances.

    Character-level counts from jiwer.process_characters are summed across
    all (reference, hypothesis) pairs and CER is computed once from the
    totals. Returns 0.0 if the pooled reference character count is 0.
    """
    n_errors = 0
    n_ref_chars = 0
    for reference, hypothesis in zip(refs, hyps):
        aligned = process_characters(reference, hypothesis)
        n_hits = int(aligned.hits)
        n_subs = int(aligned.substitutions)
        n_dels = int(aligned.deletions)
        n_ins = int(aligned.insertions)
        n_errors += n_subs + n_dels + n_ins
        n_ref_chars += n_hits + n_subs + n_dels

    if n_ref_chars > 0:
        return n_errors / n_ref_chars
    return 0.0


# ---------- Decoding helpers ----------


def decode_clip(
    waveform: torch.Tensor,
    processor: WhisperProcessor,
    model: WhisperForConditionalGeneration,
    device: str,
    language: str,
) -> str:
    """
    Transcribe one clip with Whisper in a single `generate` call (no chunking).

    Args:
        waveform: audio samples at TARGET_SR. `.numpy()` is called on it, so it
            has to be a CPU tensor.
        processor: Whisper processor used for feature extraction and decoding.
        model: Whisper model; inputs are cast to `model.dtype`.
        device: device string the model lives on ("cuda" / "cpu").
        language: Whisper language name used to build the decoder prompt.

    Returns:
        The decoded text with special tokens removed and surrounding
        whitespace stripped.

    Reads the module-level TARGET_SR, TASK and MAX_NEW_TOKENS settings.

    NOTE(review): the feature extractor is called with its default truncation
    setting, which may cut audio beyond Whisper's 30 s window - confirm if
    longer clips are expected.
    """
    with torch.no_grad():
        inputs = processor(
            waveform.numpy(),  # WhisperProcessor expects numpy audio
            sampling_rate=TARGET_SR,
            return_tensors="pt",
            padding=True,
            # Without this flag the processor output carries no attention
            # mask, so the `attention_mask` forwarded to generate() below was
            # always None and generate() warned that the mask "is not set and
            # cannot be inferred".
            return_attention_mask=True,
        )
        # Cast inputs to model dtype (fp16 on GPU, fp32 on CPU)
        input_features = inputs.input_features.to(device=device, dtype=model.dtype)
        attention_mask = inputs.get("attention_mask")
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)

        forced_ids = processor.get_decoder_prompt_ids(language=language, task=TASK)
        pred_ids = model.generate(
            input_features=input_features,
            attention_mask=attention_mask,
            forced_decoder_ids=forced_ids,
            max_new_tokens=MAX_NEW_TOKENS,
        )
        text = processor.batch_decode(pred_ids, skip_special_tokens=True)[0]
        return text.strip()


def evaluate_language_folder(
    lang_folder: Path,
    language_prompt: str,
    processor: WhisperProcessor,
    model: WhisperForConditionalGeneration,
    device: str,
    clips_subdir: str = "clips",
) -> List[Dict[str, Any]]:
    """
    Transcribe and score every usable row of one language folder.

    The folder must contain one of TSV_CANDIDATES (with at least the columns
    "path" and "sentence") and a `clips_subdir` directory holding the audio.
    Rows are skipped when "path" or "sentence" is empty or when the audio
    file does not exist on disk.

    Args:
        lang_folder: directory of one language.
        language_prompt: Whisper language name passed to decode_clip.
        processor, model, device: as returned by load_model_and_processor.
        clips_subdir: name of the audio subfolder inside `lang_folder`.

    Returns:
        One dict per processed clip with the keys "language", "filename",
        "reference", "hypothesis", "wer", "cer", "precision", "recall".
        A clip that fails to load/decode/score is still reported, with an
        empty hypothesis and NaN metrics. An empty list is returned (after a
        [WARN] message) when the TSV, its required columns, the clips folder
        or all audio files are missing.
    """
    # Use the first candidate TSV that exists in this folder.
    tsv_path = next(
        (
            lang_folder / cand
            for cand in TSV_CANDIDATES
            if (lang_folder / cand).exists()
        ),
        None,
    )
    if tsv_path is None:
        print(f"[WARN] No validated.tsv/validation.tsv in {lang_folder}")
        return []

    # quoting=3 is csv.QUOTE_NONE: quote characters inside sentences are kept
    # as literal text instead of being interpreted as field delimiters.
    df = pd.read_csv(tsv_path, sep="\t", quoting=3, encoding="utf-8")
    missing = {"path", "sentence"} - set(df.columns)
    if missing:
        print(f"[WARN] {tsv_path} missing columns: {missing}")
        return []

    clips_dir = lang_folder / clips_subdir
    if not clips_dir.exists():
        print(f"[WARN] Clips folder not found: {clips_dir}")
        return []

    # Rows with an empty path or sentence cannot be scored. Previously a NaN
    # path raised a TypeError while building the file path (aborting the whole
    # language) and a NaN sentence was scored against the literal text "nan".
    complete = df[["path", "sentence"]].notna().all(axis=1)
    if not complete.all():
        print(
            f"[WARN] {tsv_path}: skipping {int((~complete).sum())} row(s) "
            "with empty path/sentence"
        )
    df = df[complete]

    # keep rows whose audio exists
    on_disk = (
        df["path"].map(lambda p: os.path.exists(str(clips_dir / p))).astype(bool)
    )
    df = df[on_disk].reset_index(drop=True)
    if len(df) == 0:
        print(f"[WARN] No audio files found for {lang_folder.name}")
        return []

    results = []
    print(
        f"\n=== Processing language: {lang_folder.name} (prompt: {language_prompt}) | {len(df)} clips ==="
    )

    for _, row in tqdm(df.iterrows(), total=len(df)):
        fname = row["path"]
        ref = str(row["sentence"])
        full_audio = clips_dir / fname

        # Start from the "failed" record; it is only filled in once loading,
        # decoding, scoring and reporting have all succeeded.
        record = {
            "language": lang_folder.name,
            "filename": fname,
            "reference": ref,
            "hypothesis": "",
            "wer": float("nan"),
            "cer": float("nan"),
            "precision": float("nan"),
            "recall": float("nan"),
        }

        try:
            waveform = load_audio_full(full_audio, TARGET_SR)
            hyp = decode_clip(waveform, processor, model, device, language_prompt)

            # metrics via jiwer v4
            wm = word_metrics(ref, hyp)
            per_wer = wm["wer"]
            per_precision = wm["precision"]
            per_recall = wm["recall"]
            per_cer = char_cer(ref, hyp)

            # per-clip print
            print(f"\nClip: {fname}")
            print(f"ASR: {hyp}")
            print(f"REF: {ref}")
            print(
                f"WER: {per_wer:.4f}  CER: {per_cer:.4f}  Precision: {per_precision:.4f}  Recall: {per_recall:.4f}"
            )
            print("-" * 70)

            record.update(
                hypothesis=hyp,
                wer=per_wer,
                cer=per_cer,
                precision=per_precision,
                recall=per_recall,
            )
        except Exception as e:
            # Best effort: one bad clip must not stop the run; it is kept in
            # the results with NaN metrics so it stays visible downstream.
            print(f"\n[ERROR] {fname}: {e}")

        results.append(record)

    return results


In [3]:
# Fail fast if the dataset root from the config cell does not exist.
root = Path(ROOT_DIR)
if not root.exists():
    raise FileNotFoundError(f"Root directory not found: {root}")

# Load the processor and model once for all languages. `device` is the string
# chosen by the loader ("cuda" when CUDA is available, otherwise "cpu").
print("Loading model…")
processor, model, device = load_model_and_processor(MODEL_NAME)
print(f"Loaded {MODEL_NAME} on device: {device}")


Loading model…
Loaded openai/whisper-large-v3 on device: cuda


In [4]:
# Per-clip result dicts for every language, in LANG_FOLDERS order.
all_results: List[Dict[str, Any]] = []

for lf in LANG_FOLDERS:
    lang_dir = root / lf
    if lang_dir.exists():
        # Folders without an explicit prompt use their own name as the prompt.
        prompt = LANG_PROMPT.get(lf, lf)
        res = evaluate_language_folder(
            lang_dir,
            prompt,
            processor,
            model,
            device,
            clips_subdir=CLIPS_SUBDIR,
        )
        all_results += res
    else:
        print(f"[WARN] Language folder missing: {lang_dir} (skipping)")

# Number of clips processed across all languages (shown as the cell output).
len(all_results)



=== Processing language: chinese (prompt: chinese) | 16 clips ===


  0%|          | 0/16 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  6%|▋         | 1/16 [00:00<00:11,  1.28it/s]


Clip: 38_5715_20170914193306.wav
ASR: 口口音樂
REF: 口口音乐
WER: 1.0000  CER: 0.2500  Precision: 0.0000  Recall: 0.0000
----------------------------------------------------------------------


 12%|█▎        | 2/16 [00:01<00:10,  1.34it/s]


Clip: 38_5716_20170914202211.wav
ASR: 嗨天气还冷记得天衣保暖哦
REF: 嗨天气寒冷记得添衣保暖哦
WER: 1.0000  CER: 0.1667  Precision: 0.0000  Recall: 0.0000
----------------------------------------------------------------------


 19%|█▉        | 3/16 [00:02<00:13,  1.06s/it]


Clip: 38_5716_20170914202228.wav
ASR: 人的能耐再大大不过天这雨转又变了天了天要下雨娘要嫁人由她去吧
REF: 人的能耐再大大不过天阵雨转阴又变了天了天要下雨娘要嫁人由她去吧
WER: 1.0000  CER: 0.0645  Precision: 0.0000  Recall: 0.0000
----------------------------------------------------------------------


 25%|██▌       | 4/16 [00:03<00:09,  1.21it/s]


Clip: 38_5716_20170914202312.wav
ASR: 播放二胡独奏
REF: 播放二胡独奏
WER: 0.0000  CER: 0.0000  Precision: 1.0000  Recall: 1.0000
----------------------------------------------------------------------


 31%|███▏      | 5/16 [00:03<00:07,  1.50it/s]


Clip: 38_5716_20170914202319.wav
ASR: 可是刮风了
REF: 可是刮风了
WER: 0.0000  CER: 0.0000  Precision: 1.0000  Recall: 1.0000
----------------------------------------------------------------------


 38%|███▊      | 6/16 [00:05<00:09,  1.08it/s]


Clip: 38_5716_20170914202333.wav
ASR: 南洋持续降雨水深路滑出行一定要注意而且刚回家的路上我差点被冲进下水道
REF: 南阳持续降雨水深路滑出行一定要注意安全刚回家的路上我差点儿被冲进下水道
WER: 1.0000  CER: 0.1143  Precision: 0.0000  Recall: 0.0000
----------------------------------------------------------------------


 44%|████▍     | 7/16 [00:05<00:07,  1.28it/s]


Clip: 38_5716_20170914202341.wav
ASR: 播放雪莉的歌曲
REF: 播放雪莉的歌曲
WER: 0.0000  CER: 0.0000  Precision: 1.0000  Recall: 1.0000
----------------------------------------------------------------------


 50%|█████     | 8/16 [00:06<00:07,  1.11it/s]


Clip: 38_5716_20170914202408.wav
ASR: 我想这出最好歌曲把歌词发到网上请别人帮我作曲集集
REF: 我想这出最好歌曲把歌词发到网上请别人帮我作曲急急
WER: 1.0000  CER: 0.0833  Precision: 0.0000  Recall: 0.0000
----------------------------------------------------------------------


 56%|█████▋    | 9/16 [00:07<00:05,  1.22it/s]


Clip: 38_5716_20170914202417.wav
ASR: 别说个大概其说具体是哪首歌
REF: 别说个大概齐 说具体是哪首歌
WER: 1.0000  CER: 0.1429  Precision: 0.0000  Recall: 0.0000
----------------------------------------------------------------------


 62%|██████▎   | 10/16 [00:08<00:04,  1.27it/s]


Clip: 38_5716_20170914202426.wav
ASR: 除了自命无凡和爱是一种幸福
REF: 除了自命不凡和爱是一种幸福
WER: 1.0000  CER: 0.0769  Precision: 0.0000  Recall: 0.0000
----------------------------------------------------------------------


 69%|██████▉   | 11/16 [00:08<00:03,  1.44it/s]


Clip: 38_5716_20170914202435.wav
ASR: 有点急 因为很喜欢他
REF: 有点儿急哎嘿是因为很喜欢他啊
WER: 2.0000  CER: 0.3571  Precision: 0.0000  Recall: 0.0000
----------------------------------------------------------------------


 75%|███████▌  | 12/16 [00:09<00:02,  1.45it/s]


Clip: 38_5716_20170914202446.wav
ASR: 听音乐不错 休闲哈稀
REF: 听音乐不错休闲哈嘻
WER: 2.0000  CER: 0.2222  Precision: 0.0000  Recall: 0.0000
----------------------------------------------------------------------


 81%|████████▏ | 13/16 [00:09<00:01,  1.57it/s]


Clip: 38_5716_20170914202454.wav
ASR: 来首南泉妈妈的歌
REF: 来一首南拳妈妈的歌
WER: 1.0000  CER: 0.2222  Precision: 0.0000  Recall: 0.0000
----------------------------------------------------------------------


 88%|████████▊ | 14/16 [00:10<00:01,  1.79it/s]


Clip: 38_5716_20170914202531.wav
ASR: 放一下好想你
REF: 放一下好想你
WER: 0.0000  CER: 0.0000  Precision: 1.0000  Recall: 1.0000
----------------------------------------------------------------------


 94%|█████████▍| 15/16 [00:10<00:00,  2.09it/s]


Clip: 38_5716_20170914202537.wav
ASR: 不热
REF: 不热
WER: 0.0000  CER: 0.0000  Precision: 1.0000  Recall: 1.0000
----------------------------------------------------------------------


100%|██████████| 16/16 [00:10<00:00,  1.46it/s]



Clip: 38_5716_20170914202545.wav
ASR: 播放自由行走的话
REF: 播放自由行走的花
WER: 1.0000  CER: 0.1250  Precision: 0.0000  Recall: 0.0000
----------------------------------------------------------------------

=== Processing language: pashto (prompt: pashto) | 16 clips ===


  6%|▋         | 1/16 [00:00<00:12,  1.23it/s]


Clip: common_voice_ps_41827239.mp3
ASR: حدیعه علاق کمی پامشو
REF: په دې حالت کې مې پام شو.
WER: 1.0000  CER: 0.5833  Precision: 0.0000  Recall: 0.0000
----------------------------------------------------------------------


 12%|█▎        | 2/16 [00:01<00:09,  1.46it/s]


Clip: common_voice_ps_40358420.mp3
ASR: اخترم و مبارکش
REF: اختر مو مبارک سه!
WER: 1.0000  CER: 0.3529  Precision: 0.0000  Recall: 0.0000
----------------------------------------------------------------------


 19%|█▉        | 3/16 [00:02<00:11,  1.16it/s]


Clip: common_voice_ps_42182197.mp3
ASR: سویدو تارپ ملونو کده نه بخبرو که
REF: د سړیتوب تعریف په عملونو کې دی، نه په خبرو کې.
WER: 0.9091  CER: 0.4348  Precision: 0.1429  Recall: 0.0909
----------------------------------------------------------------------


 25%|██▌       | 4/16 [00:04<00:14,  1.20s/it]


Clip: common_voice_ps_42443251.mp3
ASR: دی شریر و عناصر و فعالیتون دی طولنی پر امنیت منفی اغیز کوی
REF: د شریر عناصر فعالیتونه د ټولنې پر امنیت منفي اغېز کوي.
WER: 0.8182  CER: 0.2407  Precision: 0.3077  Recall: 0.3636
----------------------------------------------------------------------


 31%|███▏      | 5/16 [00:05<00:13,  1.24s/it]


Clip: common_voice_ps_40390233.mp3
ASR: یو کس یا یو شیسرہ زون عدد قول پریش دئی؟
REF: يو کس يا يو شې سره ځان عادت کول پريږدٸ.
WER: 0.9000  CER: 0.4359  Precision: 0.1000  Recall: 0.1000
----------------------------------------------------------------------


 38%|███▊      | 6/16 [00:06<00:12,  1.26s/it]


Clip: common_voice_ps_42439944.mp3
ASR: تا یا او داد دا تیری دولا پارا چینه
REF: • د یو درد د تېرېدو لپاره کښېنه.
WER: 1.1250  CER: 0.5625  Precision: 0.0000  Recall: 0.0000
----------------------------------------------------------------------


 44%|████▍     | 7/16 [00:08<00:13,  1.48s/it]


Clip: common_voice_ps_42091691.mp3
ASR: زناور دا ساروی و دزمکی دا ایکو سیستم پا برخاکی مهم رول لو باوی.
REF: ځناور د څارویو د ځمکې د اکوسیستم په برخه کې مهم رول لوبوي.
WER: 0.9231  CER: 0.2931  Precision: 0.1429  Recall: 0.1538
----------------------------------------------------------------------


 50%|█████     | 8/16 [00:09<00:09,  1.13s/it]


Clip: common_voice_ps_40296469.mp3
ASR: موسیقی
REF: که راحت غواړې زحمت درلره بويه.
WER: 1.0000  CER: 0.9667  Precision: 0.0000  Recall: 0.0000
----------------------------------------------------------------------


 56%|█████▋    | 9/16 [00:10<00:08,  1.24s/it]


Clip: common_voice_ps_41875133.mp3
ASR: ارده مرسلام شاملو روم برسل دخانی مو لوک کڑیدی
REF: د ادم علیه السلام زامنو لومړی ځل د غنمو لو کړی دی.
WER: 1.0000  CER: 0.5000  Precision: 0.0000  Recall: 0.0000
----------------------------------------------------------------------


 62%|██████▎   | 10/16 [00:11<00:07,  1.18s/it]


Clip: common_voice_ps_42436374.mp3
ASR: پتنگ دست را پچا پیره و گردید
REF: پتنګ د څراغ په چاپېره وګرځېد.
WER: 1.1667  CER: 0.4828  Precision: 0.0000  Recall: 0.0000
----------------------------------------------------------------------


 69%|██████▉   | 11/16 [00:12<00:05,  1.07s/it]


Clip: common_voice_ps_42441496.mp3
ASR: سر انگور هم دیر محبوب دی
REF: • سره انګور هم ډېر محبوب دي.
WER: 0.7143  CER: 0.2857  Precision: 0.3333  Recall: 0.2857
----------------------------------------------------------------------


 75%|███████▌  | 12/16 [00:14<00:05,  1.39s/it]


Clip: common_voice_ps_40390072.mp3
ASR: ده هنرونو و ده میرسونو و ده پا مخانا نوی پرهنگی مرکز و پرانی ستلخو
REF: د هنرونو او میراثونو د ودې په موخه نوی فرهنګي مرکز پرانیستل شو.
WER: 0.9231  CER: 0.3810  Precision: 0.2000  Recall: 0.2308
----------------------------------------------------------------------


 81%|████████▏ | 13/16 [00:15<00:03,  1.28s/it]


Clip: common_voice_ps_40481886.mp3
ASR: غسا او خبکان روخ صحت لمنزوری
REF: غوصه او خفګان روغ صحت له مینځه وړي
WER: 0.7500  CER: 0.4118  Precision: 0.3333  Recall: 0.2500
----------------------------------------------------------------------


 88%|████████▊ | 14/16 [00:16<00:02,  1.21s/it]


Clip: common_voice_ps_42461043.mp3
ASR: زین درمل دا بکتریا زد خاصیت لری
REF: ځینې درمل د بکتریا ضد خاصیت لري.
WER: 0.5714  CER: 0.1875  Precision: 0.4286  Recall: 0.4286
----------------------------------------------------------------------


 94%|█████████▍| 15/16 [00:17<00:01,  1.20s/it]


Clip: common_voice_ps_40893732.mp3
ASR: اندر ناپلو روانیم تنی منسی ای خالق
REF: د رڼا پر لور روان يم، ته مي مل سې اې خالقه!
WER: 1.0000  CER: 0.5116  Precision: 0.0000  Recall: 0.0000
----------------------------------------------------------------------


100%|██████████| 16/16 [00:18<00:00,  1.16s/it]



Clip: common_voice_ps_40298938.mp3
ASR: ده شرطخه شروع خدا
REF: د ى يې تر شير په ښور وا خوشحاله دى.
WER: 1.0000  CER: 0.6857  Precision: 0.0000  Recall: 0.0000
----------------------------------------------------------------------

=== Processing language: urdu (prompt: urdu) | 16 clips ===


  6%|▋         | 1/16 [00:01<00:28,  1.90s/it]


Clip: common_voice_ur_38165827.mp3
ASR: کہ انتخابی امیدواروں کو سیکیورٹی بریفنگز کے مطالق تسلیم کی گزشتہ
REF: کے انتخاباتی امیدواروں کو سیکیورٹی بریفنگز کے متعلق تسنیم کی گذشتہ
WER: 0.4545  CER: 0.1061  Precision: 0.5455  Recall: 0.5455
----------------------------------------------------------------------


 12%|█▎        | 2/16 [00:02<00:16,  1.16s/it]


Clip: common_voice_ur_28976627.mp3
ASR: یہی تناسب یوت کا بھی ہے
REF: یہی تناسب "یوتھ" کا بھی ہے۔
WER: 0.3333  CER: 0.1481  Precision: 0.6667  Recall: 0.6667
----------------------------------------------------------------------


 19%|█▉        | 3/16 [00:03<00:15,  1.19s/it]


Clip: common_voice_ur_38190665.mp3
ASR: عشق میں بیداد عشق کے غیر بیمارہ مچا
REF: عشق میں بیدادِ رشکِ غیر نے مارا مجھے
WER: 0.7500  CER: 0.3333  Precision: 0.3750  Recall: 0.3750
----------------------------------------------------------------------


 25%|██▌       | 4/16 [00:04<00:12,  1.03s/it]


Clip: common_voice_ur_42003248.mp3
ASR: میں اگر انخلاب کا مخالف ہوں
REF: میں اگر انقلاب کا مخالف ہوں۔
WER: 0.3333  CER: 0.0714  Precision: 0.6667  Recall: 0.6667
----------------------------------------------------------------------


 31%|███▏      | 5/16 [00:05<00:10,  1.08it/s]


Clip: common_voice_ur_40393969.mp3
ASR: کسی نے کچھ لکھا ہوگا
REF: کسی نے کچھ لکھا ہوگا کسی نے کچھ لکھا ہوگا
WER: 0.5000  CER: 0.5122  Precision: 1.0000  Recall: 0.5000
----------------------------------------------------------------------


 38%|███▊      | 6/16 [00:06<00:09,  1.05it/s]


Clip: common_voice_ur_41192401.mp3
ASR: جو دیکھتا گیا اس کا یقین ہونے لگا
REF: جو دیکھتا گیا اُسکا یقین ہونے لگا
WER: 0.2857  CER: 0.0606  Precision: 0.7500  Recall: 0.8571
----------------------------------------------------------------------


 44%|████▍     | 7/16 [00:07<00:08,  1.00it/s]


Clip: common_voice_ur_35294301.mp3
ASR: رکھیو یارب یہ در گنجینہ گوھر کھلا
REF: رکھیو یا رب یہ در گنجینۂ گوہر کھلا
WER: 0.5000  CER: 0.0882  Precision: 0.5714  Recall: 0.5000
----------------------------------------------------------------------


 50%|█████     | 8/16 [00:08<00:07,  1.13it/s]


Clip: common_voice_ur_41183648.mp3
ASR: میں پڑھا رہی ہوں
REF: میں پڑھا رہی ہوں
WER: 0.0000  CER: 0.0000  Precision: 1.0000  Recall: 1.0000
----------------------------------------------------------------------


 56%|█████▋    | 9/16 [00:09<00:07,  1.02s/it]


Clip: common_voice_ur_40903841.mp3
ASR: کوویڈ عراق ہوائی جہازوں میں کمی کا کام کر رہا ہے
REF: کوویڈ عراق ہوائی جہازوں میں کمی کا کام کر رہا ہے
WER: 0.0000  CER: 0.0000  Precision: 1.0000  Recall: 1.0000
----------------------------------------------------------------------


 62%|██████▎   | 10/16 [00:10<00:05,  1.00it/s]


Clip: common_voice_ur_38378432.mp3
ASR: صبح کرنا شام کا لانا ہے جوے شیر کا
REF: صبح کرنا شام کا، لانا ہے جوئےشِیر کا
WER: 0.3750  CER: 0.1111  Precision: 0.6667  Recall: 0.7500
----------------------------------------------------------------------


 69%|██████▉   | 11/16 [00:11<00:05,  1.03s/it]


Clip: common_voice_ur_33450791.mp3
ASR: اے عارضو اے شہید وفا خون بہان امان
REF: اے آرزو شہیدِ وفا! خوں بہا نہ مانگ
WER: 0.8750  CER: 0.3824  Precision: 0.1250  Recall: 0.1250
----------------------------------------------------------------------


 75%|███████▌  | 12/16 [00:11<00:03,  1.18it/s]


Clip: common_voice_ur_41154202.mp3
ASR: موسیقی
REF: اپنا دل پیش کروں اپنی وفا پیش کروں
WER: 1.0000  CER: 0.9118  Precision: 0.0000  Recall: 0.0000
----------------------------------------------------------------------


 81%|████████▏ | 13/16 [00:12<00:02,  1.09it/s]


Clip: common_voice_ur_38228308.mp3
ASR: دل میں ذوقِ وصل و یادِ یار تک باقی نہیں
REF: دل میں ذوقِ وصل و یادِ یار تک باقی نہیں
WER: 0.0000  CER: 0.0000  Precision: 1.0000  Recall: 1.0000
----------------------------------------------------------------------


 88%|████████▊ | 14/16 [00:14<00:02,  1.05s/it]


Clip: common_voice_ur_40988989.mp3
ASR: جرمنی میں ابھی بھی داکھ کا نظام استعمال کیا جاتا ہے
REF: جرمنی میں ابھی بھی ڈاک کا نظام استعمال کیا جاتا ہے
WER: 0.0909  CER: 0.0400  Precision: 0.9091  Recall: 0.9091
----------------------------------------------------------------------


 94%|█████████▍| 15/16 [00:15<00:01,  1.01s/it]


Clip: common_voice_ur_41175465.mp3
ASR: واسی کی کانوں کو بھلی لگتی ہو
REF: موسیقی کانوں کو بھلی لگتی ہو۔
WER: 0.5000  CER: 0.1724  Precision: 0.5714  Recall: 0.6667
----------------------------------------------------------------------


100%|██████████| 16/16 [00:16<00:00,  1.03s/it]


Clip: common_voice_ur_41103986.mp3
ASR: بولی وڈ بلن کے بوتے ہیرو بننے کے لیے تیار
REF: بولی وڈ ولن کے پوتے ہیرو بننے کے لیے تیار
WER: 0.2000  CER: 0.0488  Precision: 0.8000  Recall: 0.8000
----------------------------------------------------------------------





48

In [5]:
import pandas as pd
import math

# Column layouts are fixed up front so that a run which produced no results
# still gives DataFrames with the expected headers. Without them, an empty
# `all_results` yields a column-less frame and groupby("language"),
# dropna(subset=...) and sort_values("language") below raise KeyError before
# the "no valid results" branch can run.
RESULT_COLUMNS = [
    "language",
    "filename",
    "reference",
    "hypothesis",
    "wer",
    "cer",
    "precision",
    "recall",
]
METRIC_COLUMNS = ["wer", "cer", "precision", "recall"]
SUMMARY_COLUMNS = [
    "language",
    "clips",
    "overall_wer",
    "overall_cer",
    "overall_precision",
    "overall_recall",
]

# Per-clip TSV
results_df = pd.DataFrame(all_results, columns=RESULT_COLUMNS)
Path(OUTPUT_TSV).parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(OUTPUT_TSV, index=False, sep="\t", encoding="utf-8")
print(f"Saved per-clip results to: {OUTPUT_TSV}")

# Per-language overall summaries
lang_summaries = []
for lang, grp in results_df.groupby("language"):
    # Clips that failed during evaluation carry NaN metrics; leave them out.
    grp_valid = grp.dropna(subset=METRIC_COLUMNS)
    if len(grp_valid) == 0:
        print(f"\n[SUMMARY] {lang}: no valid rows.")
        lang_summaries.append(
            {
                "language": lang,
                "clips": 0,
                "overall_wer": float("nan"),
                "overall_cer": float("nan"),
                "overall_precision": float("nan"),
                "overall_recall": float("nan"),
            }
        )
        continue

    refs = grp_valid["reference"].tolist()
    hyps = grp_valid["hypothesis"].tolist()

    # Global metrics per language: sum alignment counts over utterances (jiwer v4)
    lang_word = overall_word_metrics(refs, hyps)
    lang_cer = overall_char_cer(refs, hyps)

    print(f"\n===== OVERALL ({lang}) =====")
    print(f"Clips: {len(grp_valid)}")
    print(f"WER: {lang_word['wer']:.4f}")
    print(f"CER: {lang_cer:.4f}")
    print(f"Precision: {lang_word['precision']:.4f}")
    print(f"Recall: {lang_word['recall']:.4f}")
    print("============================")

    lang_summaries.append(
        {
            "language": lang,
            "clips": len(grp_valid),
            "overall_wer": lang_word["wer"],
            "overall_cer": lang_cer,
            "overall_precision": lang_word["precision"],
            "overall_recall": lang_word["recall"],
        }
    )

# Global overall (all languages together)
valid_all = results_df.dropna(subset=METRIC_COLUMNS)
if len(valid_all) > 0:
    refs_all = valid_all["reference"].tolist()
    hyps_all = valid_all["hypothesis"].tolist()

    glob_word = overall_word_metrics(refs_all, hyps_all)
    glob_cer = overall_char_cer(refs_all, hyps_all)

    print("\n========== GLOBAL OVERALL (all languages) ==========")
    print(f"Clips: {len(valid_all)}")
    print(f"Overall WER: {glob_word['wer']:.4f}")
    print(f"Overall CER: {glob_cer:.4f}")
    print(f"Overall Precision: {glob_word['precision']:.4f}")
    print(f"Overall Recall: {glob_word['recall']:.4f}")
    print("====================================================\n")
else:
    glob_word = {"wer": float("nan"), "precision": float("nan"), "recall": float("nan")}
    glob_cer = float("nan")
    print("\nNo valid results to compute GLOBAL overall metrics.\n")

# Save per-language summary TSV
lang_sum_df = pd.DataFrame(lang_summaries, columns=SUMMARY_COLUMNS).sort_values(
    "language"
)
lang_sum_df.to_csv(LANG_SUM_TSV, index=False, sep="\t", encoding="utf-8")
print(f"Saved per-language summary to: {LANG_SUM_TSV}")

# Show summaries in notebook
lang_sum_df


Saved per-clip results to: C:\Users\manmo\OneDrive\Desktop\Cup demo dataset\whisper_large_v3_eval.tsv

===== OVERALL (chinese) =====
Clips: 16
WER: 0.8235
CER: 0.1206
Precision: 0.2778
Recall: 0.2941

===== OVERALL (pashto) =====
Clips: 16
WER: 0.9241
CER: 0.4414
Precision: 0.1475
Recall: 0.1241

===== OVERALL (urdu) =====
Clips: 16
WER: 0.3788
CER: 0.1824
Precision: 0.7049
Recall: 0.6515

Clips: 48
Overall WER: 0.6735
Overall CER: 0.2868
Overall Precision: 0.4160
Overall Recall: 0.3707

Saved per-language summary to: C:\Users\manmo\OneDrive\Desktop\Cup demo dataset\whisper_large_v3_language_summary.tsv


Unnamed: 0,language,clips,overall_wer,overall_cer,overall_precision,overall_recall
0,chinese,16,0.823529,0.120603,0.277778,0.294118
1,pashto,16,0.924138,0.441368,0.147541,0.124138
2,urdu,16,0.378788,0.182432,0.704918,0.651515
