Creating conda environment

In [None]:
#!pip install git+https://github.com/openai/whisper.git 
#!pip install setuptools-rust
#!CALL C:\Users\yyomn\Anaconda3\Scripts\activate.bat whisper_env

In [2]:
# scripts/evaluate_whisper.py

import csv
import os
import re
import unicodedata
from pathlib import Path
from typing import Tuple, List

import torch
import whisper


# Root folder scanned for samples, laid out as data/<lang_dir>/<video_dir>/.
DATA_ROOT = Path("data")
# Output folder for evaluation results; created at import time if it is missing.
RESULTS_DIR = Path("results")
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
# CSV file that receives one row per evaluated sample.
RESULTS_CSV = RESULTS_DIR / "whisper_eval_results.csv"


# Try to use Whisper's normalizer if available; otherwise, use a fallback
def build_normalizer():
    """Return a ``normalize(text) -> str`` callable used before scoring.

    Whisper's ``BasicTextNormalizer`` is used when it can be imported.
    Otherwise a simple fallback is returned that applies NFKC normalization,
    lowercases, replaces every character that is neither a word character
    nor whitespace with a space, and collapses runs of whitespace.

    Returns:
        A function mapping a raw string to its normalized form.
    """
    # Keep the try body to the import only, so unrelated errors are not
    # mistaken for "normalizer not available".
    try:
        from whisper.normalizers import BasicTextNormalizer
    except ImportError:
        print("Whisper normalizer not found; using simple fallback normalizer")

        # Compiled once here instead of on every normalize() call.
        non_word = re.compile(r"[^\w\s]", flags=re.UNICODE)
        whitespace_run = re.compile(r"\s+")

        def normalize(text: str) -> str:
            # NFKC must run before lowercasing: compatibility capitals such
            # as U+210B have no lowercase mapping of their own but fold to
            # an uppercase ASCII letter, which would otherwise stay uppercase.
            text = unicodedata.normalize("NFKC", text).lower()
            # Remove most punctuation but keep letters/numbers in all languages
            text = non_word.sub(" ", text)
            # Collapse whitespace
            return whitespace_run.sub(" ", text).strip()

        return normalize

    print("Using Whisper BasicTextNormalizer")
    normalizer = BasicTextNormalizer()

    def normalize(text: str) -> str:
        return normalizer(text)

    return normalize


# Module-wide text normalizer, built once; applied to both reference and hypothesis text.
NORMALIZE = build_normalizer()


def levenshtein_distance(ref: List[str], hyp: List[str]) -> int:
    """Return the Levenshtein (edit) distance between two token sequences.

    Insertions, deletions and substitutions each cost 1. Only two rows of
    the dynamic-programming table are kept, so memory is proportional to
    the shorter sequence rather than to ``len(ref) * len(hyp)``; this
    matters for character-level comparison of long transcripts.

    Args:
        ref: Reference tokens (words or characters).
        hyp: Hypothesis tokens (words or characters).

    Returns:
        The minimum number of single-token edits turning ``ref`` into ``hyp``.
    """
    # With unit costs the distance is symmetric, so put the shorter sequence
    # on the inner axis to keep the rows as small as possible.
    if len(hyp) > len(ref):
        ref, hyp = hyp, ref
    if not hyp:
        return len(ref)

    # previous[j] is the distance between the first i-1 tokens of ref and
    # the first j tokens of hyp.
    previous = list(range(len(hyp) + 1))
    for i, ref_token in enumerate(ref, start=1):
        current = [i]
        for j, hyp_token in enumerate(hyp, start=1):
            cost = 0 if ref_token == hyp_token else 1
            current.append(
                min(
                    previous[j] + 1,         # deletion
                    current[j - 1] + 1,      # insertion
                    previous[j - 1] + cost,  # substitution
                )
            )
        previous = current
    return previous[-1]


def compute_wer(ref: str, hyp: str) -> float:
    """Word error rate of ``hyp`` against ``ref``.

    Both strings are split on whitespace; the result is the token-level
    edit distance divided by the number of reference tokens. An empty
    reference scores 0.0 against an empty hypothesis and 1.0 otherwise.
    """
    reference_words = ref.split()
    hypothesis_words = hyp.split()

    if reference_words:
        edits = levenshtein_distance(reference_words, hypothesis_words)
        return edits / len(reference_words)

    # No reference tokens: avoid dividing by zero.
    return 1.0 if hypothesis_words else 0.0


def compute_cer(ref: str, hyp: str) -> float:
    """Character error rate of ``hyp`` against ``ref``.

    Every character counts as a token, spaces included; the result is the
    character-level edit distance divided by the reference length. An empty
    reference scores 0.0 against an empty hypothesis and 1.0 otherwise.
    """
    reference_chars = [*ref]
    hypothesis_chars = [*hyp]

    if reference_chars:
        edits = levenshtein_distance(reference_chars, hypothesis_chars)
        return edits / len(reference_chars)

    # No reference characters: avoid dividing by zero.
    return 1.0 if hypothesis_chars else 0.0


def iter_samples():
    """
    Walk data/<lang_dir>/<video_dir>/ in sorted order and yield one tuple
    (lang_dir_name, video_dir_name, audio_path, ref_txt_path) per usable sample.

    A sample folder is usable when it contains:
      - at least one *.wav file (the first one in sorted order is used)
      - a reference.txt next to it

    Folders missing either are reported with a warning and skipped.
    """
    language_dirs = (entry for entry in sorted(DATA_ROOT.iterdir()) if entry.is_dir())

    for language_dir in language_dirs:
        for sample_dir in sorted(language_dir.iterdir()):
            if not sample_dir.is_dir():
                continue

            # yt-dlp names the audio after the video title, so match by suffix.
            wav_files = sorted(sample_dir.glob("*.wav"))
            if len(wav_files) == 0:
                print(f"[WARN] No .wav file found in: {sample_dir}")
                continue

            reference_file = sample_dir / "reference.txt"
            if not reference_file.exists():
                print(f"[WARN] Missing reference.txt: {reference_file}")
                continue

            yield language_dir.name, sample_dir.name, wav_files[0], reference_file



def main():
    """Transcribe every sample with Whisper and write WER/CER rows to a CSV.

    Loads the "turbo" Whisper model (on CUDA when available, else CPU),
    iterates over the samples from ``iter_samples()``, normalizes both the
    reference text and the transcription with ``NORMALIZE``, computes WER
    and CER, prints them, and writes one row per sample to ``RESULTS_CSV``.
    The CSV is overwritten on every run.
    """
    # ---- Load Whisper model on GPU if available ----
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"torch.cuda.is_available() = {torch.cuda.is_available()}")
    print(f"Loading Whisper model on device: {device}")

    model = whisper.load_model("turbo", device=device)
    print(f"Model actual device: {next(model.parameters()).device}")

    # ---- CSV setup ----
    with RESULTS_CSV.open("w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([
            "lang_folder",
            "video_folder",
            "whisper_detected_lang",
            "wer",
            "cer",
            "ref_len_chars",
            "hyp_len_chars",
            "ref_norm",
            "hyp_norm",
        ])

        for lang_folder, video_folder, audio_path, ref_txt_path in iter_samples():
            print(f"\n=== Evaluating: {lang_folder}/{video_folder} ===")

            # Load reference text. "utf-8-sig" decodes plain UTF-8 unchanged
            # but drops a leading byte-order mark, which would otherwise
            # stick to the first token and count as an error.
            ref_raw = ref_txt_path.read_text(encoding="utf-8-sig").strip()
            ref_norm = NORMALIZE(ref_raw)

            # Whisper transcription with automatic language detection
            result = model.transcribe(
                str(audio_path),
                task="transcribe",
                language=None,        # let Whisper detect
                verbose=False,
            )

            hyp_raw = result.get("text", "").strip()
            hyp_norm = NORMALIZE(hyp_raw)
            detected_lang = result.get("language", "")

            wer = compute_wer(ref_norm, hyp_norm)
            cer = compute_cer(ref_norm, hyp_norm)

            print(f"  Detected language: {detected_lang}")
            print(f"  WER: {wer:.3f} | CER: {cer:.3f}")

            writer.writerow([
                lang_folder,
                video_folder,
                detected_lang,
                f"{wer:.6f}",
                f"{cer:.6f}",
                len(ref_norm),
                len(hyp_norm),
                ref_norm,
                hyp_norm,
            ])
            # Transcription is slow; flush so rows already computed survive
            # a failure on a later sample.
            f.flush()

    print(f"\nResults written to {RESULTS_CSV}")


# Run the evaluation only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


Using Whisper BasicTextNormalizer
torch.cuda.is_available() = True
Loading Whisper model on device: cuda
Model actual device: cuda:0

=== Evaluating: ar/_ ===
Detected language: Arabic


100%|██████████| 65604/65604 [00:41<00:00, 1586.99frames/s]


  Detected language: ar
  WER: 0.336 | CER: 0.096
[WARN] Missing reference.txt: data\ar\Five_Steps_to_Create_a_New_AI_Model\reference.txt

=== Evaluating: de/US-Gesandter_auf_Putins_Seite_DW_Nachrichten ===
Detected language: German


100%|██████████| 40300/40300 [00:13<00:00, 3086.83frames/s]


  Detected language: de
  WER: 0.053 | CER: 0.015

=== Evaluating: de/Was_ist_los_im_Sudan_analyse ===
Detected language: German


100%|██████████| 90653/90653 [00:36<00:00, 2517.18frames/s]


  Detected language: de
  WER: 0.050 | CER: 0.013

=== Evaluating: es/Que_significa_Quien_pudiera_y_como_se_usa_Espanol_directo_al_grano ===
Detected language: Spanish


100%|██████████| 46646/46646 [00:14<00:00, 3245.14frames/s]


  Detected language: es
  WER: 0.009 | CER: 0.004

=== Evaluating: es/Vlog_2_-_Dia_de_las_velitas_en_Colombia._Bunuelos_y_tamal# ===
Detected language: Spanish


100%|██████████| 141469/141469 [00:52<00:00, 2697.55frames/s]


  Detected language: es
  WER: 0.287 | CER: 0.243

=== Evaluating: fr/Biarritz_-_la_Californie_francaise ===
Detected language: French


100%|██████████| 81013/81013 [00:28<00:00, 2806.73frames/s]


  Detected language: fr
  WER: 0.062 | CER: 0.046

=== Evaluating: fr/Ne_dites_pas_J_ai_deux# ===
Detected language: French


100%|██████████| 63061/63061 [00:22<00:00, 2840.46frames/s]


  Detected language: fr
  WER: 0.034 | CER: 0.025

=== Evaluating: zh/How_to_Survive_in_China_as_an_Introvert_or_Extrovert_Learn_Chinese_Through_Vlogs ===
Detected language: Chinese


100%|██████████| 91422/91422 [00:33<00:00, 2735.05frames/s]


  Detected language: zh
  WER: 0.982 | CER: 0.331
[WARN] Missing reference.txt: data\zh\Slow_Chinese_Vlog_What_I_Eat_in_a_Day_comprehensible_input_HSK1-3\reference.txt

=== Evaluating: zh/Why_Chinese_Prefer_Big_Hospitals_Learn_Chinese_Through_Vlogs ===
Detected language: Chinese


100%|██████████| 70949/70949 [00:23<00:00, 3066.65frames/s]


  Detected language: zh
  WER: 1.000 | CER: 0.188

Results written to results\whisper_eval_results.csv
