# 1 prepare

## 1.1 log in to Hugging Face

In [None]:
from huggingface_hub import notebook_login

# authenticate this notebook session with the Hugging Face Hub
# (the training arguments later in this notebook set push_to_hub=True)
notebook_login()

## 1.2 prepare dataset from HuggingFace hub

### 1.2.1 load dataset

In [3]:
from datasets import load_dataset, DatasetDict

# Build the DatasetDict in one go: each key is the split name used in this
# notebook, each value is the hub split expression to load for it.
# "train" merges the hub's train and validation splits; "test" is kept apart.
common_voice = DatasetDict(
    {
        split_name: load_dataset(
            "mozilla-foundation/common_voice_17_0", "dv", split=hub_split
        )
        for split_name, hub_split in {
            "train": "train+validation",
            "test": "test",
        }.items()
    }
)

print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 4902
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 2215
    })
})


### 1.2.2 process columns

In [4]:
# keep only the columns used downstream: the audio and its transcription;
# every other metadata column is dropped from both splits
common_voice = common_voice.select_columns(["audio", "sentence"])

print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 4902
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 2215
    })
})


## 1.3 prepare processor

In ü§ó Transformers, the Whisper model has an associated feature extractor and tokenizer, called [WhisperFeatureExtractor](https://huggingface.co/docs/transformers/main/model_doc/whisper#transformers.WhisperFeatureExtractor) and [WhisperTokenizer](https://huggingface.co/docs/transformers/main/model_doc/whisper#transformers.WhisperTokenizer) respectively. To make our lives simple, these two objects are wrapped under a single class, called the [WhisperProcessor](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperProcessor).

- **FeatureExtractor**: for every supported audio model, 🤗 Transformers offers a feature extractor class that can convert raw audio data into the input features the model expects.
- **Tokenizer**: 🤗 Transformers also offers model-specific tokenizers to process the text inputs.

### 1.3.1 check if language supported

In [5]:
from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE

# dump the full mapping of language name -> language code known to Whisper
print(TO_LANGUAGE_CODE)
# language names are stored in lower case (see the printed mapping), so the
# lookup must be lower-case too; a capitalised name would report False even
# for a supported language
print("dhivehi" in TO_LANGUAGE_CODE)
# also check by language code, which is what the dataset config ("dv") uses
print("dv" in TO_LANGUAGE_CODE.values())

{'english': 'en', 'chinese': 'zh', 'german': 'de', 'spanish': 'es', 'russian': 'ru', 'korean': 'ko', 'french': 'fr', 'japanese': 'ja', 'portuguese': 'pt', 'turkish': 'tr', 'polish': 'pl', 'catalan': 'ca', 'dutch': 'nl', 'arabic': 'ar', 'swedish': 'sv', 'italian': 'it', 'indonesian': 'id', 'hindi': 'hi', 'finnish': 'fi', 'vietnamese': 'vi', 'hebrew': 'he', 'ukrainian': 'uk', 'greek': 'el', 'malay': 'ms', 'czech': 'cs', 'romanian': 'ro', 'danish': 'da', 'hungarian': 'hu', 'tamil': 'ta', 'norwegian': 'no', 'thai': 'th', 'urdu': 'ur', 'croatian': 'hr', 'bulgarian': 'bg', 'lithuanian': 'lt', 'latin': 'la', 'maori': 'mi', 'malayalam': 'ml', 'welsh': 'cy', 'slovak': 'sk', 'telugu': 'te', 'persian': 'fa', 'latvian': 'lv', 'bengali': 'bn', 'serbian': 'sr', 'azerbaijani': 'az', 'slovenian': 'sl', 'kannada': 'kn', 'estonian': 'et', 'macedonian': 'mk', 'breton': 'br', 'basque': 'eu', 'icelandic': 'is', 'armenian': 'hy', 'nepali': 'ne', 'mongolian': 'mn', 'bosnian': 'bs', 'kazakh': 'kk', 'albanian'

We can see that `Dhivehi` is not in the list of supported languages.

If you scroll through this list, you'll notice that many languages are present, but Dhivehi is one of the few that is not! This means that Whisper was not pre-trained on Dhivehi. However, this doesn't mean that we can't fine-tune Whisper on it.

In doing so, we'll be teaching Whisper a new language, one that the pre-trained checkpoint does not support. That's pretty cool, right!

### 1.3.2 find target language

To fine-tune Whisper on a new language, we need to find the most similar language that Whisper was pre-trained on. The Wikipedia article for Dhivehi states that Dhivehi is closely related to the Sinhalese language of Sri Lanka. If we check the language codes again, we can see that Sinhalese is present in the Whisper language set, so we can safely set our language argument to "sinhalese".

Right! We'll load our processor from the pre-trained checkpoint, setting the language to "sinhalese" and the task to "transcribe" as explained above.

In [6]:
from transformers import WhisperProcessor

# load the processor (feature extractor + tokenizer) of the whisper-small
# checkpoint; "sinhalese" is used as the stand-in language for Dhivehi, which
# is not in Whisper's language list
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small", language="sinhalese", task="transcribe"
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## 1.4 preprocess the dataset

### 1.4.1 resample the audio rate

In [7]:
# inspect the column schema before resampling
# (the recorded output shows the audio column at 48 kHz)
common_voice["train"].features

{'audio': Audio(sampling_rate=48000, mono=True, decode=True, id=None),
 'sentence': Value(dtype='string', id=None)}

In [11]:
from datasets import Audio

# cast the audio column to the sampling rate reported by the processor's
# feature extractor (16 kHz according to the recorded output below)
sampling_rate = processor.feature_extractor.sampling_rate
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=sampling_rate))

# re-inspect the schema to confirm the new sampling rate
common_voice["train"].features

{'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),
 'sentence': Value(dtype='string', id=None)}

### 1.4.2 filter out examples that are too long

#### 1.4.2.1 tokenization and add key: input_length

tokenization: convert the text to numbers the model can make sense of.

The processor is applied to each example and uses the following fields:
- audio
  - array
  - sampling_rate
- sentence

and output the fields:
- input_features
- labels

In [13]:
def prepare_dataset(example):
    """Turn one raw example into model inputs plus its audio duration.

    Reads ``example["audio"]`` (its ``"array"`` and ``"sampling_rate"``
    entries) and ``example["sentence"]``, passes them through the module-level
    ``processor``, and returns the processor's output with an extra
    ``"input_length"`` entry holding the clip duration in seconds.
    """
    audio = example["audio"]
    waveform, rate = audio["array"], audio["sampling_rate"]

    # the processor handles both the audio and the transcription in one call
    features = processor(audio=waveform, sampling_rate=rate, text=example["sentence"])

    # length of the input audio sample, in seconds
    features["input_length"] = len(waveform) / rate

    return features

# apply prepare_dataset to every example and drop the original columns, so
# only the entries returned by prepare_dataset remain
# NOTE(review): the recorded run of this cell with num_proc=24 ended in a
# TimeoutError raised from the worker processes; consider retrying with a
# smaller num_proc (e.g. 1) — TODO confirm the root cause
common_voice = common_voice.map(
    prepare_dataset, 
    remove_columns=common_voice.column_names["train"], 
    num_proc=24
)

Map (num_proc=24):   0%|          | 0/4902 [00:00<?, ? examples/s]

Exception ignored from cffi callback <function SoundFile._init_virtual_io.<locals>.vio_read at 0x148e5d7dcdc0>:
Exception ignored from cffi callback <function SoundFile._init_virtual_io.<locals>.vio_read at 0x148e5d808550>:
Process ForkPoolWorker-59:
Process ForkPoolWorker-54:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/yuchuan/.e/jupyter/lib/python3.9/site-packages/soundfile.py", line 1244, in vio_read
  File "/home/yuchuan/.e/jupyter/lib/python3.9/site-packages/multiprocess/process.py", line 315, in _bootstrap
    self.run()
  File "/home/yuchuan/.e/jupyter/lib/python3.9/site-packages/multiprocess/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/yuchuan/.e/jupyter/lib/python3.9/site-packages/multiprocess/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/home/yuchuan/.e/jupyter/lib/python3.9/site-packages/soundfile.py", line 1244, in vio_rea

TimeoutError: 

#### 1.4.2.2 filter audio length

In [None]:
# upper bound (exclusive) on the "input_length" value an example may have;
# prepare_dataset computes input_length in seconds
max_input_length = 30.0


def is_audio_in_length_range(length):
    """Return True when ``length`` is strictly below ``max_input_length``."""
    return max_input_length > length

# show the split sizes before filtering so the effect is visible
print('>>> before filter')
print(common_voice)

# drop over-long examples; only the "train" split is filtered here, the
# "test" split is left unchanged
common_voice["train"] = common_voice["train"].filter(
    is_audio_in_length_range,
    input_columns=["input_length"],  # the predicate receives just this column's value
)

print('>>> after filter')
print(common_voice)

## 1.5 prepare pipeline

### 1.5.1 prepare data collator

- collate function: the function responsible for putting samples together inside a batch is called a collate function.
- The default collator is a function that just converts your samples to tensors and concatenates them (recursively if your elements are lists, tuples, or dictionaries). This won't be possible in our case, since the inputs we have won't all be of the same size.

In [None]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union

import torch


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed speech examples into one padded PyTorch batch.

    The audio features and the label token ids are padded separately, because
    they have different lengths and need different padding methods. PyTorch
    tensors are produced (``return_tensors="pt"``) because the label handling
    below relies on ``torch.Tensor`` methods (``masked_fill``, ``ne``) and the
    batch is consumed by a PyTorch trainer.
    """

    # object exposing `.feature_extractor` and `.tokenizer`, each with a `.pad` method
    processor: Any

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        """Pad a list of examples and return the batch.

        Args:
            features: examples, each holding ``"input_features"`` (only its
                first element is used) and ``"labels"`` (token ids).

        Returns:
            The padded feature-extractor batch with an added ``"labels"``
            entry in which padded positions are set to -100.
        """
        # first treat the audio inputs by simply returning torch tensors
        audio_inputs = [
            {"input_features": feature["input_features"][0]} for feature in features
        ]
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

        # get the tokenized label sequences and pad them to the longest one
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 so those positions are ignored by the loss
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch["attention_mask"].ne(1), -100
        )

        # if a bos token was prepended to every sequence in the previous
        # tokenization step, cut it here since it is added again later anyway
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

# collator instance later handed to the trainer; it pads with the shared processor
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

### 1.5.2 prepare function: evaluate metrics

In [None]:
import evaluate

# word error rate (WER) metric, used by compute_metrics
metric = evaluate.load("wer")

from transformers.models.whisper.english_normalizer import BasicTextNormalizer

# text normalizer applied to predictions and references before the normalised WER
normalizer = BasicTextNormalizer()

def compute_metrics(pred):
    """Compute the orthographic and the normalised word error rate, in percent.

    Args:
        pred: object exposing ``predictions`` (predicted token ids) and
            ``label_ids`` (reference token ids, with -100 marking ignored
            positions). ``label_ids`` is modified in place: every -100 is
            replaced by the tokenizer's pad token id.

    Returns:
        ``{"wer_ortho": ..., "wer": ...}`` where ``wer_ortho`` is computed on
        the decoded text as-is and ``wer`` on the normalised text, restricted
        to samples whose normalised reference is non-empty.
    """
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id so the ids can be decoded
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    # compute the orthographic WER
    wer_ortho = 100 * metric.compute(predictions=pred_str, references=label_str)

    # compute the normalised WER
    normalized_preds = [normalizer(text) for text in pred_str]
    normalized_labels = [normalizer(text) for text in label_str]

    # filtering step: only evaluate the samples whose reference is non-empty;
    # predictions and references are filtered together so they stay aligned
    kept_pairs = [
        (hypothesis, reference)
        for hypothesis, reference in zip(normalized_preds, normalized_labels)
        if reference
    ]
    pred_str_norm = [hypothesis for hypothesis, _ in kept_pairs]
    label_str_norm = [reference for _, reference in kept_pairs]

    wer = 100 * metric.compute(predictions=pred_str_norm, references=label_str_norm)

    return {"wer_ortho": wer_ortho, "wer": wer}

### 1.5.3 build Transformer model

In [None]:
from transformers import WhisperForConditionalGeneration

# start from the pre-trained whisper-small checkpoint
# (the same checkpoint name the processor was loaded from)
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

### 1.5.4 prepare train params

In [None]:
# model-level settings applied before building the training arguments

from functools import partial

# train args: disable cache during training since it's incompatible with gradient checkpointing
model.config.use_cache = False

# generate args: set the language and task for generation, and re-enable the cache
model.generate = partial(
    model.generate, language="sinhalese", task="transcribe", use_cache=True
)

# train args
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-dv",  # name of the output directory on the HF Hub
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # double this each time the batch size is halved
    learning_rate=1e-5,
    lr_scheduler_type="constant_with_warmup",
    warmup_steps=50,
    max_steps=500,  # increase to 4000 if you have your own GPU or a Colab paid plan
    gradient_checkpointing=True,
    fp16=True,
    fp16_full_eval=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=500,
    eval_steps=500,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",  # matches the "wer" key returned by compute_metrics
    greater_is_better=False,  # a lower WER is better
    push_to_hub=True,
)

### 1.5.5 build trainer

We can forward the training arguments to the 🤗 Trainer along with our model, dataset, data collator and compute_metrics function.

In [None]:
from transformers import Seq2SeqTrainer

# wire together everything prepared above: training arguments, model,
# the train/test splits, the padding collator and the WER metric function
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor,  # the full processor is passed here, not only its tokenizer
)

# 2 train