In [39]:
# Install necessary libraries
!pip install --quiet datasets[audio]==3.6.0 transformers torchaudio evaluate ffmpeg-python tiktoken sentencepiece


[0m

In [25]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [26]:
# --- Parameters ---
DATASET = "dsfsi-anv/za-african-next-voices"  # Hugging Face dataset repo name
MODELS = ["facebook/mms-1b-fl102", "facebook/mms-1b-all"]                # Models to evaluate
# MODELS = ["facebook/seamless-m4t-v2-large"]
# MODELS = ["facebook/mms-1b-fl102"]
EVAL_SPLIT = "dev_test"                          # Can be "train", "dev", "dev_test"
SAMPLE_LIMIT = 0                             # Set 0 to evaluate on full split
LANG = "zul"                                      # Auto-detect config name
RESULTS_SAVE_ROOT_DIR = '/home/alp'

In [27]:
import string
def normalize_transcript(text):
    ref = text.strip().lower()
    ref = ref.translate(str.maketrans('', '', string.punctuation))
    return ref

normalize_transcript("Iqhaza le sifundazwe - kuya ngezilimo ezikhethwa abalimi ezilungele kakhulu indawo ethile.")

'iqhaza le sifundazwe  kuya ngezilimo ezikhethwa abalimi ezilungele kakhulu indawo ethile'

In [28]:
from datasets import load_dataset, Audio

dataset = load_dataset(DATASET, LANG, split=EVAL_SPLIT, streaming=True)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

# Take the first N samples from the stream
from itertools import islice

dataset_iter = islice(dataset, SAMPLE_LIMIT if SAMPLE_LIMIT != 0 else None)
dataset_list = list(dataset_iter)  # Convert to list so you can use in evaluation


Resolving data files:   0%|          | 0/101 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/101 [00:00<?, ?it/s]

# Listen to some samples

In [None]:
from IPython.display import Audio, display

NUM_SAMPLES = 5  # Number of samples to preview
START_FROM = 100

print(f"🔍 Listening to {NUM_SAMPLES} sample(s) from the dataset starting from {START_FROM}...\n")

# For streaming datasets, use iter()
sample_iter = iter(dataset)

for i in range(START_FROM):
    next(sample_iter)

for i in range(NUM_SAMPLES):
    sample = next(sample_iter)
    audio = sample["audio"]
    transcript = sample["transcript"] if "transcript" in sample else "[no transcript]"
    normalized_transcript = normalize_transcript(transcript)
    
    print(f"🎧 Sample {i+1}")
    print(f"📜 Transcript: {transcript}")
    print(f"📜 Normalized: {normalized_transcript}")
    display(Audio(audio["array"], rate=audio["sampling_rate"]))
    print("-" * 40)


# Start evaluation

In [34]:
from tqdm import tqdm
from evaluate import load
from pathlib import Path
from datetime import datetime
import torch

In [35]:
def load_model(model_id):
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if 'mms' in model_id:
        from transformers import Wav2Vec2ForCTC, AutoProcessor
        
        # Load processor and model
        processor = AutoProcessor.from_pretrained(model_id)
        model = Wav2Vec2ForCTC.from_pretrained(model_id)

        # Set target language adapter
        processor.tokenizer.set_target_lang(LANG)
        model.load_adapter(LANG)
        
    elif 'seamless' in model_id:
        #DOCS: https://huggingface.co/docs/transformers/main/en/model_doc/seamless_m4t_v2#transformers.SeamlessM4Tv2ForSpeechToText
        #NOTE: Doesn't work 
        from transformers import AutoProcessor, SeamlessM4Tv2ForSpeechToText
        
        processor = AutoProcessor.from_pretrained(model_id, use_fast=False)
        model = SeamlessM4Tv2ForSpeechToText.from_pretrained(model_id)

    model.to(device).eval()

    return processor, model, device

def transcribe_sample(model_id, sample, device):
    if 'mms' in model_id:
        audio = sample["audio"]
        waveform = audio["array"]
        sampling_rate = audio["sampling_rate"]
    
        # Prepare input
        inputs = processor(waveform, sampling_rate=16000, return_tensors="pt").to(device)
    
        with torch.no_grad():
            logits = model(**inputs).logits
    
        ids_tensor = torch.argmax(logits, dim=-1)[0]
        pred = processor.decode(ids_tensor).strip()
    elif 'seamless' in model_id:
        audio = sample["audio"]
        waveform = audio["array"]
        sampling_rate = audio["sampling_rate"]
    
        # Prepare input
        inputs = processor(waveform, sampling_rate=16000, return_tensors="pt").to(device)
    
        with torch.no_grad():
            logits = model(**inputs).logits
    
        ids_tensor = torch.argmax(logits, dim=-1)

        transcription = processor.batch_decode(ids_tensor)
        
        
    return pred

In [36]:
no_samples_info = SAMPLE_LIMIT if SAMPLE_LIMIT > 0 else "all"

for model_id in MODELS:
    # Load processor and model
    processor, model, device = load_model(model_id)

    print(f"Evaluating {no_samples_info} samples on {model_id}")

    preds, refs, ids = [], [], []
    
    for idx, sample in enumerate(tqdm(dataset_list, desc="Transcribing")):
        #Run ASR on sample
        pred = transcribe_sample(model_id, sample, device)
    
        # Normalize reference: lowercase + remove punctuation
        ref = normalize_transcript(sample["transcript"])
        
        # Use dataset filename as  ID 
        sample_id = sample.get("file_name")
    
        preds.append(pred)
        refs.append(ref)
        ids.append(sample_id)
    
        # ✅ Debug output
        if SAMPLE_LIMIT != 0:
            print(f"\n🔊 Sample {idx + 1}")
            print(f"🆔 ID        : {sample_id}")
            print(f"🎙️  Reference : {ref}")
            print(f"🤖 Prediction: {pred}")
            wer = load("wer").compute(predictions=[pred], references=[ref])
            cer = load("cer").compute(predictions=[pred], references=[ref])
            print(f"❌ WER: {wer:.2%}")
            print(f"❌ CER: {cer:.2%}")

    #Report all evaluation 

    wer = load("wer").compute(predictions=preds, references=refs)
    cer = load("cer").compute(predictions=preds, references=refs)
    
    print(f"\n✅ Model: {model_id}")
    print(f"🔢 Samples evaluated: {len(preds)}")
    print(f"❌ WER: {wer:.2%}")
    print(f"❌ CER: {cer:.2%}")
    print("-"*50)
    print("-"*50)

    # Prepare paths
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base_path = Path(f"{RESULTS_SAVE_ROOT_DIR}/asr_eval_{model_id.split('/')[-1]}_{LANG}_{timestamp}")
    base_path.mkdir(parents=True, exist_ok=True)
    
    # Save report
    report_path = base_path / "report.txt"
    with open(report_path, "w") as f:
        f.write(f"✅ Model: {model_id}\n")
        f.write(f"📦 Dataset: {DATASET}\n")
        f.write(f"📂 Split: {EVAL_SPLIT}\n")
        f.write(f"🌍 Language: {LANG}\n")
        f.write(f"🔢 Samples evaluated: {len(preds)}\n")
        f.write(f"❌ WER: {wer * 100:.2f}%\n")
        f.write(f"❌ CER: {cer * 100:.2f}%\n")
    
    # Save references
    refs_path = base_path / "references-predictions.tsv"
    with open(refs_path, "w", encoding="utf-8") as f:
        f.write("id\treference\tprediction\n")
        for idx, ref, pred in zip(ids, refs, preds):
            f.write(f"{idx}\t{ref}\t{pred}\n")
    
    print(f"📁 Results saved in: {base_path}")

Evaluating all samples on facebook/mms-1b-fl102


Transcribing: 100%|██████████| 1583/1583 [03:59<00:00,  6.60it/s]



✅ Model: facebook/mms-1b-fl102
🔢 Samples evaluated: 1583
❌ WER: 40.47%
❌ CER: 9.53%
--------------------------------------------------
--------------------------------------------------
📁 Results saved in: /home/alp/asr_eval_mms-1b-fl102_zul_20250725_123753
Evaluating all samples on facebook/mms-1b-all


Transcribing: 100%|██████████| 1583/1583 [03:56<00:00,  6.69it/s]



✅ Model: facebook/mms-1b-all
🔢 Samples evaluated: 1583
❌ WER: 39.30%
❌ CER: 8.73%
--------------------------------------------------
--------------------------------------------------
📁 Results saved in: /home/alp/asr_eval_mms-1b-all_zul_20250725_124155
