* This notebook follow the [Fine-Tune Whisper For Multilingual ASR with 🤗 Transformers]*(https://huggingface.co/blog/fine-tune-whisper) tutorial.
* To use finetuned model, please see the following links: [faster-whisper model conversion](https://github.com/SYSTRAN/faster-whisper?tab=readme-ov-file#model-conversion) and the [github issue](https://github.com/SYSTRAN/faster-whisper/issues/248). 


In [1]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
from datasets import load_dataset, DatasetDict
from tqdm.notebook import tqdm

common_voice = DatasetDict()

common_voice["train"] = load_dataset("mozilla-foundation/common_voice_11_0", "cs", split="train+validation", trust_remote_code=True)
common_voice["test"] = load_dataset("mozilla-foundation/common_voice_11_0", "cs", split="test", trust_remote_code=True)

print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 22155
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 7714
    })
})


In [9]:
common_voice["train"][0]

{'client_id': '2b8bbeb31869b943b845a1c40b6c788f2ca545adae4b8452543a4925c176485927cd3caa0b13ce67be7d73f456c175972013c4071da65940e07e0db869be8d8c',
 'path': '/lnet/express/work/people/stankov/huggingface_cache/datasets/downloads/extracted/db9e3c6bebeb6c945086ecada7c5ce10d7944812dc5898a996045894fb6f7ae3/cs_train_0/common_voice_cs_25695144.mp3',
 'audio': {'path': '/lnet/express/work/people/stankov/huggingface_cache/datasets/downloads/extracted/db9e3c6bebeb6c945086ecada7c5ce10d7944812dc5898a996045894fb6f7ae3/cs_train_0/common_voice_cs_25695144.mp3',
  'array': array([ 4.26325641e-14,  1.13686838e-13,  2.62900812e-13, ...,
         -1.01048208e-04, -1.48227118e-04, -8.67909548e-05]),
  'sampling_rate': 48000},
 'sentence': 'S judem začínala v rodném Kjóto.',
 'up_votes': 2,
 'down_votes': 0,
 'age': '',
 'gender': '',
 'accent': '',
 'locale': 'cs',
 'segment': ''}

In [11]:
import IPython.display as ipd
s = "test"
for i in tqdm(range(len(common_voice[s])), total=len(common_voice["test"])):
    l =common_voice[s][i]["audio"]["array"].shape[0] / 48000
    if l < 1:
        print({"i": i, "l": l, "path": common_voice[s][i]["audio"]["array"], "sentence": common_voice[s][i]["sentence"]})
        ipd.display(ipd.Audio(common_voice[s][i]["audio"]["array"], rate=48000))
    # print(common_voice["test"][i]["audio"]["array"].shape[0] < 48000)

  0%|          | 0/7714 [00:00<?, ?it/s]

{'i': 23, 'l': 0.9, 'path': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
       -1.87428668e-06,  1.32247806e-06, -4.99654561e-07]), 'sentence': 'ne'}


{'i': 29, 'l': 0.9, 'path': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        1.41062401e-07,  4.59640432e-08, -1.83663360e-08]), 'sentence': 'jedna'}


{'i': 30, 'l': 0.972, 'path': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
       -1.14848717e-08,  1.12902399e-09,  2.91407787e-09]), 'sentence': 'tři'}


In [24]:
common_voice["test"][2286]

{'client_id': '3c87a1811035cb8bc20a1644523cae83a799a80a30dd64aba339cb07f877b6acf95de8fb1e34e99492a97f52af9c83c197c655971e3daa16b4b097f6ebf23227',
 'path': '/lnet/express/work/people/stankov/huggingface_cache/datasets/downloads/extracted/330dd4f7e23fdb0cdb9965fcc692eb3a226bffb7428f9cbe083a1387be9c0a34/cs_test_0/common_voice_cs_24687407.mp3',
 'audio': {'path': '/lnet/express/work/people/stankov/huggingface_cache/datasets/downloads/extracted/330dd4f7e23fdb0cdb9965fcc692eb3a226bffb7428f9cbe083a1387be9c0a34/cs_test_0/common_voice_cs_24687407.mp3',
  'array': array([-1.13686838e-13,  6.53699317e-13,  5.11590770e-13, ...,
          1.75866189e-05,  9.25016866e-06,  2.88985029e-06]),
  'sampling_rate': 48000},
 'sentence': 'Již od svých pěti let hrála na housle.',
 'up_votes': 2,
 'down_votes': 0,
 'age': '',
 'gender': '',
 'accent': '',
 'locale': 'cs',
 'segment': ''}

In [6]:
common_voice = common_voice.remove_columns([
    "accent", "age", "down_votes", "gender", "locale", "segment", "up_votes"
])

In [21]:
model_type = "openai/whisper-tiny"
# possible options:
# openai/whisper-tiny     39M
# openai/whisper-base     74M
# openai/whisper-small    244M
# openai/whisper-medium   769M
# openai/whisper-large    1550M
# openai/whisper-large-v2 1550M
# openai/whisper-large-v3 1550M

In [22]:

from transformers import WhisperProcessor


processor = WhisperProcessor.from_pretrained(model_type, language="Czech", task="transcribe")
# feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
# tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Czech", task="transcribe")



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [23]:
input_str = common_voice["train"][0]["sentence"]
labels = processor.tokenizer(input_str).input_ids
decoded_with_special = processor.tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = processor.tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Labels:                {labels}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")


Input:                 S judem začínala v rodném Kjóto.
Labels:                [50258, 50283, 50359, 50363, 50, 3747, 443, 7949, 10236, 870, 4660, 64, 371, 8685, 77, 4011, 591, 73, 812, 1353, 13, 50257]
Decoded w/ special:    <|startoftranscript|><|cs|><|transcribe|><|notimestamps|>S judem začínala v rodném Kjóto.<|endoftext|>
Decoded w/out special: S judem začínala v rodném Kjóto.
Are equal:             True


In [15]:
from datasets import Audio

common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))


In [None]:
class BatchPreparer:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, batch):
        # load and resample audio data from 48 to 16kHz
        audio = batch["audio"]

        # compute log-Mel input features from input audio array 
        batch["input_features"] = self.processor.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

        # encode target text to label ids 
        batch["labels"] = self.processor.tokenizer(batch["sentence"]).input_ids
        return batch

In [None]:
batch_preparer = BatchPreparer(processor)
common_voice = common_voice.map(batch_preparer, remove_columns=common_voice.column_names["train"], num_proc=4)

