In [1]:
!pip install datasets transformers jiwer hf_xet

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting hf_xet
  Downloading hf_xet-1.1.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jiwer-3.1.0-py

In [2]:
import os
from datasets import load_dataset
from transformers.models.whisper import WhisperProcessor, WhisperForConditionalGeneration
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
from transformers.generation.configuration_utils import GenerationConfig
import torchaudio
import torch
from jiwer import wer, cer

In [3]:
MODEL_PATHS = [
    "victors3136/whisper-ro-multilingual-finetune-it00-es00",
    "victors3136/whisper-ro-multilingual-finetune-it05-es05",
    "victors3136/whisper-ro-multilingual-finetune-it15-es15",
    "victors3136/whisper-ro-multilingual-finetune-it15-es35",
    "victors3136/whisper-ro-multilingual-finetune-it25-es25",
    "victors3136/whisper-ro-multilingual-finetune-it35-es15",
]
LANGUAGE = "ro"
SPLIT = "test"
MAX_SAMPLES = 1_000
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
print("Loading Common Voice Romanian test data...")
cv = load_dataset("mozilla-foundation/common_voice_11_0", LANGUAGE, split=SPLIT)
cv = cv.filter(lambda x: x["sentence"] is not None and x["audio"] is not None)
cv = cv.select(range(min(MAX_SAMPLES, len(cv))))

Loading Common Voice Romanian test data...


README.md:   0%|          | 0.00/14.4k [00:00<?, ?B/s]

common_voice_11_0.py:   0%|          | 0.00/8.13k [00:00<?, ?B/s]

languages.py:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

release_stats.py:   0%|          | 0.00/60.9k [00:00<?, ?B/s]

The repository for mozilla-foundation/common_voice_11_0 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/mozilla-foundation/common_voice_11_0.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


n_shards.json:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

ro_train_0.tar:   0%|          | 0.00/157M [00:00<?, ?B/s]

ro_dev_0.tar:   0%|          | 0.00/98.6M [00:00<?, ?B/s]

ro_test_0.tar:   0%|          | 0.00/115M [00:00<?, ?B/s]

ro_other_0.tar:   0%|          | 0.00/480M [00:00<?, ?B/s]

ro_invalidated_0.tar:   0%|          | 0.00/21.7M [00:00<?, ?B/s]

train.tsv:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

dev.tsv:   0%|          | 0.00/845k [00:00<?, ?B/s]

test.tsv:   0%|          | 0.00/871k [00:00<?, ?B/s]

other.tsv:   0%|          | 0.00/4.40M [00:00<?, ?B/s]

invalidated.tsv:   0%|          | 0.00/201k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 5187it [00:00, 157337.59it/s]


Generating validation split: 0 examples [00:00, ? examples/s]


Reading metadata...: 3703it [00:00, 138318.50it/s]


Generating test split: 0 examples [00:00, ? examples/s]


Reading metadata...: 3859it [00:00, 146844.78it/s]


Generating other split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 19267it [00:00, 150083.68it/s]


Generating invalidated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 882it [00:00, 128210.17it/s]


Filter:   0%|          | 0/3859 [00:00<?, ? examples/s]

In [5]:
def speech_file_to_array_fn(batch):
    speech_array, _ = torchaudio.load(batch["audio"]["path"])
    batch["speech"] = speech_array[0].numpy()
    batch["target_text"] = batch["sentence"].lower()
    return batch

In [6]:
cv = cv.map(speech_file_to_array_fn)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [10]:
!pip install tqdm



In [11]:
from tqdm import tqdm

In [20]:
results = []
for model_id in MODEL_PATHS:
    fails = 0
    total = 0

    print(f"\nEvaluating {model_id}...")

    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id) \
                                     .to(DEVICE)

    predictions = []
    references = []

    for sample in tqdm(cv):
        input_features = processor(
            sample["speech"],
            sampling_rate=16000,
            return_tensors="pt"
        ).input_features.to(DEVICE)
        with torch.no_grad():
            try:
                predicted_ids = model.generate(
                    input_features,
                    language="ro",
                    task="transcribe",
                )
            except IndexError as ie:
                fails += 1
                continue
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].lower()

        predictions.append(transcription)
        references.append(sample["target_text"])

    wer_score = wer(references, predictions) * 100
    cer_score = cer(references, predictions) * 100
    print(f"Total samples: {len(cv)}, of which {fails} failed due to indexing errors")
    print(f"{model_id}: WER = {wer_score:.2f}%, CER = {cer_score:.2f}%")
    results.append((model_id, wer_score, cer_score))


results.sort(key=lambda x: (
    x[1], # sort by wer by default
    x[2]) # use cer as tiebreaker
)


Evaluating victors3136/whisper-ro-multilingual-finetune-it00-es00...


100%|██████████| 1000/1000 [05:00<00:00,  3.33it/s]


Total samples: 1000, of which 630 failed due to indexing errors
victors3136/whisper-ro-multilingual-finetune-it00-es00: WER = 350.53%, CER = 204.24%

Evaluating victors3136/whisper-ro-multilingual-finetune-it05-es05...


100%|██████████| 1000/1000 [07:47<00:00,  2.14it/s]


Total samples: 1000, of which 621 failed due to indexing errors
victors3136/whisper-ro-multilingual-finetune-it05-es05: WER = 389.59%, CER = 287.10%

Evaluating victors3136/whisper-ro-multilingual-finetune-it15-es15...


100%|██████████| 1000/1000 [03:53<00:00,  4.28it/s]


Total samples: 1000, of which 599 failed due to indexing errors
victors3136/whisper-ro-multilingual-finetune-it15-es15: WER = 104.43%, CER = 85.22%

Evaluating victors3136/whisper-ro-multilingual-finetune-it15-es35...


100%|██████████| 1000/1000 [11:11<00:00,  1.49it/s]


Total samples: 1000, of which 44 failed due to indexing errors
victors3136/whisper-ro-multilingual-finetune-it15-es35: WER = 275.69%, CER = 228.74%

Evaluating victors3136/whisper-ro-multilingual-finetune-it25-es25...


100%|██████████| 1000/1000 [04:01<00:00,  4.15it/s]


Total samples: 1000, of which 237 failed due to indexing errors
victors3136/whisper-ro-multilingual-finetune-it25-es25: WER = 113.26%, CER = 111.60%

Evaluating victors3136/whisper-ro-multilingual-finetune-it35-es15...


100%|██████████| 1000/1000 [05:40<00:00,  2.94it/s]

Total samples: 1000, of which 734 failed due to indexing errors
victors3136/whisper-ro-multilingual-finetune-it35-es15: WER = 470.98%, CER = 364.45%





In [19]:
print("\nFinal Benchmark Results:")
print(f"{'Model':<60} {'WER (%)':<10} {'CER (%)':<10}")
for model_id, wer_score, cer_score in results:
    print(f"{model_id:<60} {wer_score:<10.2f} {cer_score:<10.2f}")



Final Benchmark Results:
Model                                                        WER (%)    CER (%)   
victors3136/whisper-ro-multilingual-finetune-it15-es15       104.43     85.22     
victors3136/whisper-ro-multilingual-finetune-it25-es25       113.26     111.60    
victors3136/whisper-ro-multilingual-finetune-it15-es35       275.69     228.74    
victors3136/whisper-ro-multilingual-finetune-it00-es00       350.53     204.24    
victors3136/whisper-ro-multilingual-finetune-it05-es05       389.59     287.10    
victors3136/whisper-ro-multilingual-finetune-it35-es15       470.98     364.45    


In [16]:
len("victors3136/whisper-ro-multilingual-finetune-it15-es15")

54