```bash
jupyter nbconvert --to python --no-prompt --ClearOutputPreprocessor.enabled=True train.ipynb
```

In [1]:
# Start of Selection
import evaluate
import numpy as np
from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("openai/whisper-large-v3-turbo")

# Load CER metric
cer_metric = evaluate.load("cer")


def normalize_caption(string: str) -> str:
    """Remove all parentheses and their contents from a string and filter garbage characters."""
    import re

    # Remove parentheses and square brackets and their content
    string = re.sub(r"[\(\[].*?[\)\]]", "", string)

    # Patterns for valid characters
    ENGLISH_PATTERN = r"([a-zA-Z])"
    CHINESE_PATTERN = r"([\u4e00-\u9fff])"
    DIGIT_PATTERN = r"(\d)"

    def is_valid_char(c: str) -> bool:
        return bool(re.match(ENGLISH_PATTERN, c) or re.match(CHINESE_PATTERN, c) or re.match(DIGIT_PATTERN, c) or c == " ")

    # Filter out invalid characters
    string = "".join(c for c in string if is_valid_char(c))

    # Insert spaces between English and Chinese characters
    string = re.sub(f"{ENGLISH_PATTERN}{CHINESE_PATTERN}", r"\1 \2", string)
    string = re.sub(f"{CHINESE_PATTERN}{ENGLISH_PATTERN}", r"\1 \2", string)

    # Collapse multiple spaces and trim
    string = re.sub(r" {2,}", " ", string).strip()
    return string


class MockEvalPrediction:
    def __init__(self, predictions: np.ndarray, label_ids: np.ndarray):
        self.predictions = predictions
        self.label_ids = label_ids


def compute_metrics(pred: MockEvalPrediction) -> dict:
    """Compute CER for a batch of predictions vs. labels."""
    pred_ids = pred.predictions
    label_ids = pred.label_ids.copy()
    # Replace -100 with pad token
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    pred_str = [normalize_caption(p) for p in pred_str]
    label_str = [normalize_caption(l) for l in label_str]

    cer = 100 * cer_metric.compute(predictions=pred_str, references=label_str)
    return {"cer": cer}


def test_with_manual_examples() -> tuple[MockEvalPrediction, dict]:
    # Prepare text pairs
    examples = [
        ("Hello world", "hello world,"),
        ("我喜欢吃饭", ",我喜欢吃饭"),
    ]

    pred_arrays = []
    label_arrays = []
    for pred_text, label_text in examples:
        pred_ids = tokenizer(pred_text, return_tensors="pt").input_ids.numpy()[0]
        label_ids = tokenizer(label_text, return_tensors="pt").input_ids.numpy()[0]
        pred_arrays.append(pred_ids)
        label_arrays.append(label_ids)

    # Determine maximum sequence lengths
    max_pred_len = max(arr.shape[0] for arr in pred_arrays)
    max_label_len = max(arr.shape[0] for arr in label_arrays)

    # Pad predictions to uniform length
    padded_preds = np.full(
        (len(pred_arrays), max_pred_len),
        tokenizer.pad_token_id,
        dtype=pred_arrays[0].dtype,
    )
    for i, arr in enumerate(pred_arrays):
        padded_preds[i, : arr.shape[0]] = arr

    # Pad labels with -100 for padding positions
    padded_labels = np.full(
        (len(label_arrays), max_label_len),
        -100,
        dtype=label_arrays[0].dtype,
    )
    for i, arr in enumerate(label_arrays):
        padded_labels[i, : arr.shape[0]] = arr

    # Build mock EvalPrediction and display
    mock_eval_pred = MockEvalPrediction(
        predictions=padded_preds,
        label_ids=padded_labels,
    )
    print(f"Predictions shape: {mock_eval_pred.predictions.shape}")
    print(f"Labels shape:      {mock_eval_pred.label_ids.shape}")

    print("\nRaw decoded predictions:")
    for i, arr in enumerate(pred_arrays):
        print(f"  Example {i+1}: {tokenizer.decode(arr, skip_special_tokens=True)}")

    print("\nRaw decoded labels:")
    for i, arr in enumerate(label_arrays):
        print(f"  Example {i+1}: {tokenizer.decode(arr, skip_special_tokens=True)}")

    metrics = compute_metrics(mock_eval_pred)
    print(f"\nComputed metrics: {metrics}")
    return mock_eval_pred, metrics


# Run the test
mock_eval_pred, metrics = test_with_manual_examples()
# End of Selectio

2025-05-08 08:02:09.238311: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746691329.258715  799236 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746691329.264461  799236 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746691329.280435  799236 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746691329.280459  799236 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746691329.280461  799236 computation_placer.cc:177] computation placer alr

[2025-05-08 08:02:12,011] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/teron/miniconda3/compiler_compat/ld: /lib/x86_64-linux-gnu/libc.so.6: undefined reference to `_dl_audit_symbind_alt@GLIBC_PRIVATE'
/home/teron/miniconda3/compiler_compat/ld: /lib/x86_64-linux-gnu/libc.so.6: undefined reference to `__nptl_change_stack_perm@GLIBC_PRIVATE'
/home/teron/miniconda3/compiler_compat/ld: /lib/x86_64-linux-gnu/libc.so.6: undefined reference to `_dl_find_dso_for_object@GLIBC_PRIVATE'
/home/teron/miniconda3/compiler_compat/ld: /lib/x86_64-linux-gnu/libc.so.6: undefined reference to `_dl_fatal_printf@GLIBC_PRIVATE'
/home/teron/miniconda3/compiler_compat/ld: /lib/x86_64-linux-gnu/libc.so.6: undefined reference to `_dl_exception_create@GLIBC_PRIVATE'
/home/teron/miniconda3/compiler_compat/ld: /lib/x86_64-linux-gnu/libc.so.6: undefined reference to `__tunable_get_val@GLIBC_PRIVATE'
/home/teron/miniconda3/compiler_compat/ld: /lib/x86_64-linux-gnu/libc.so.6: undefined reference to `_dl_audit_preinit@GLIBC_PRIVATE'
collect2: error: ld returned 1 exit status
/home/t

Predictions shape: (2, 8)
Labels shape:      (2, 9)

Raw decoded predictions:
  Example 1: Hello world
  Example 2: 我喜欢吃饭

Raw decoded labels:
  Example 1: hello world,
  Example 2: ,我喜欢吃饭

Computed metrics: {'cer': 6.25}


In [2]:
import random

from datasets import load_from_disk

SIZE = 3000
ds = load_from_disk(f"dataset_local/ds")
ds = ds.select(random.sample(range(len(ds)), SIZE))
ds = ds.train_test_split(test_size=0.3)
ds

Loading dataset from disk:   0%|          | 0/658 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 2100
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 900
    })
})

## Load Models

### Processor (Feature Extractor & Tokenizer)

In [3]:
from transformers import WhisperFeatureExtractor, WhisperProcessor, WhisperTokenizer

model_id = "openai/whisper-large-v3-turbo"
processor: WhisperProcessor = WhisperProcessor.from_pretrained(
    model_id,
    language="yue",
    task="transcribe",
)
feature_extractor: WhisperFeatureExtractor = processor.feature_extractor
tokenizer: WhisperTokenizer = processor.tokenizer

### Model with Quantization

In [4]:
import os

from transformers import BitsAndBytesConfig, WhisperForConditionalGeneration

quantization_config: BitsAndBytesConfig = BitsAndBytesConfig(load_in_8bit=True)
local_rank = int(os.getenv("LOCAL_RANK", "0"))
base_model: WhisperForConditionalGeneration = WhisperForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map={"": local_rank},
)


# Since the Whisper model uses Convolutional layers in the Encoder, checkpointing disables grad computation.
# To avoid this we specifically need to make the inputs trainable.
def make_inputs_require_grad(module, input, output):
    output.requires_grad_(True)


base_model.model.encoder.conv1.register_forward_hook(make_inputs_require_grad)

<torch.utils.hooks.RemovableHandle at 0x7fc55c188d10>

### PEFT Model

In [6]:
from peft import LoraConfig, LoraRuntimeConfig, PeftModel, get_peft_model

peft_config = LoraConfig(
    r=32,
    # target_modules=["q_proj", "v_proj"],
    target_modules="all-linear",
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    use_rslora=True,
    use_dora=True,
    runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=True),
)
peft_model: PeftModel = get_peft_model(base_model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 28,288,000 || all params: 837,166,080 || trainable%: 3.3790


### Data Collator

In [7]:
from dataclasses import dataclass
from typing import Dict, List, Union

import torch


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: WhisperProcessor

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """
        Collate input features and labels for training or evaluation.

        Args:
            features (List[Dict[str, Union[List[int], torch.Tensor]]]): A list of dictionaries containing input features and labels.

        Returns:
            Dict[str, torch.Tensor]: A dictionary containing the collated input features and labels.
        """
        # Prepare input features for the model (audio log-Mel spectrograms)
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Cast to half if we're on GPU / using fp16
        if batch["input_features"].dtype == torch.float32 and torch.cuda.is_available():
            batch["input_features"] = batch["input_features"].half()

        # Prepare label features (tokenized text) for the model
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Set padding tokens in labels to -100 so they're ignored in loss computation
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # Remove BOS token if present at the start (it will be added during training)
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

## Metric

In [None]:
import re

import evaluate

cer_metric = evaluate.load("cer")


def normalize_caption(string: str) -> str:
    """Remove all parentheses and their contents from a string and filter garbage characters."""

    # Remove parentheses and square brackets and their content
    string = re.sub(r"[\(\[].*?[\)\]]", "", string)

    # Patterns for valid characters
    ENGLISH_PATTERN = r"([a-zA-Z])"
    CHINESE_PATTERN = r"([\u4e00-\u9fff])"
    DIGIT_PATTERN = r"(\d)"

    def is_valid_char(c: str) -> bool:
        return bool(re.match(ENGLISH_PATTERN, c) or re.match(CHINESE_PATTERN, c) or re.match(DIGIT_PATTERN, c) or c == " ")

    # Filter out invalid characters
    string = "".join(c for c in string if is_valid_char(c))

    # Insert spaces between English and Chinese characters
    string = re.sub(f"{ENGLISH_PATTERN}{CHINESE_PATTERN}", r"\1 \2", string)
    string = re.sub(f"{CHINESE_PATTERN}{ENGLISH_PATTERN}", r"\1 \2", string)

    # Collapse multiple spaces and trim
    string = re.sub(r" {2,}", " ", string).strip()
    return string


def compute_metrics(pred):
    """Compute metrics for the model.

    Args:
        pred (dict): The predictions from the model.

    Returns:
        dict: A dictionary containing the computed metrics.
    """
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    pred_str = [normalize_caption(p) for p in pred_str]
    label_str = [normalize_caption(l) for l in label_str]

    cer = 100 * cer_metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}

## Training

In [9]:
import bitsandbytes as bnb
from peft.optimizers import create_loraplus_optimizer
from transformers import (
    EarlyStoppingCallback,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
)
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

repo_id = "Whisper-Cantonese"

training_args = Seq2SeqTrainingArguments(
    # I/O
    output_dir=repo_id,
    overwrite_output_dir=True,
    report_to=["tensorboard"],  # only tensorboard
    logging_dir=f"{repo_id}/runs",  # explicit log folder
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_id,
    hub_private_repo=True,
    # What to do
    eval_strategy="steps",
    eval_steps=10,  # eval every 200 steps
    save_strategy="steps",
    save_steps=10,
    save_total_limit=3,  # keep last 3 checkpoints
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Batch & optimization
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # auto_find_batch_size=True,
    learning_rate=1e-4,
    weight_decay=0,
    warmup_ratio=0.1,  # 10% warmup
    max_grad_norm=1.0,
    num_train_epochs=20,
    # Precision & memory
    fp16=True,  # 16-bit mixed precision
    gradient_checkpointing=True,
    # Logging
    logging_strategy="steps",
    logging_steps=50,
    logging_first_step=True,
    logging_nan_inf_filter=True,
    # Data loading
    eval_accumulation_steps=2,
    dataloader_num_workers=os.cpu_count(),  # maximize throughput
    dataloader_pin_memory=True,
    group_by_length=False,  # reduce padding
    remove_unused_columns=False,  # required for PEFT wrapper
    label_names=["labels"],
    # Generation
    predict_with_generate=True,  # compute generative metrics if any
    generation_max_length=128,
    # Repro & devices
    seed=42,
)


# This callback helps to save only the adapter weights and remove the base model weights.
class SavePeftModelCallback(TrainerCallback):
    def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        checkpoint_folder = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")

        peft_model_path = os.path.join(checkpoint_folder, "adapter_model")
        kwargs["model"].save_pretrained(peft_model_path)

        pytorch_model_path = os.path.join(checkpoint_folder, "pytorch_model.bin")
        if os.path.exists(pytorch_model_path):
            os.remove(pytorch_model_path)
        return control


trainer = Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    processing_class=processor.feature_extractor,
    compute_metrics=compute_metrics,
    callbacks=[
        SavePeftModelCallback,
        EarlyStoppingCallback(early_stopping_patience=3),
    ],
    optimizers=(
        create_loraplus_optimizer(
            model=peft_model,
            optimizer_cls=bnb.optim.Adam8bit,
            lr=1e-4,
            loraplus_lr_ratio=16,
        ),
        None,
    ),
)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.66 GiB. GPU 0 has a total capacity of 44.35 GiB of which 3.45 GiB is free. Including non-PyTorch memory, this process has 40.89 GiB memory in use. Of the allocated memory 37.27 GiB is allocated by PyTorch, and 3.29 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

: 