# ENV Prep

In [1]:
!sudo apt update && sudo apt install ffmpeg
!pip install datasets>=2.6.1
!pip install transformers[torch]
!pip install librosa
!pip install jiwer
!pip install deepcut

[33m0% [Working][0m            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
[33m0% [Connecting to archive.ubuntu.com] [1 InRelease 14.2 kB/110 kB 13%] [Connect[0m                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
[33m0% [Connecting to archive.ubuntu.com (91.189.91.81)] [1 InRelease 46.0 kB/110 k[0m[33m0% [Connecting to archive.ubuntu.com (91.189.91.81)] [1 InRelease 88.0 kB/110 k[0m                                                                               Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
[33m0% [Waiting for headers] [1 InRelease 93.8 kB/110 kB 85%] [Connected to ppa.lau[0m[33m0% [Waiting for headers] [Connected to ppa.launchpadcontent.net (185.125.190.80[0m                                                                               Hit:4 http://archive.ubun

# Data prep

In [2]:
import re
from dataclasses import dataclass
from typing import Any, Dict
from pathlib import Path

import datasets
import torch
import jiwer
import numpy as np
from deepcut import tokenize
from transformers import WhisperProcessor, pipeline, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset
from datasets.features import Audio
from google.colab import drive


drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-large-v3",
    language="Thai",
    task="transcribe",
  )

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array
    batch["input_features"] = processor.feature_extractor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
    ).input_features[0]

    # encode target text to label ids
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    return batch


In [5]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: list[Dict[str, list[int] | torch.Tensor]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


In [6]:
# fp = Path("/content/drive/MyDrive/Colab Notebooks/speech-to-text test/TH only common voice 15")
fp = Path("/content/drive/MyDrive/Colab Notebooks/speech-to-text test/th-cut-sample.wav")
fp.exists()

True

In [7]:
# https://huggingface.co/docs/datasets/audio_dataset#local-files

In [8]:
dataset = (
    Dataset
    .from_dict({"audio": [str(fp)], "sentence": ["สวัสดีครับ"]})
    .cast_column("audio", Audio(sampling_rate=16000))
    .cast_column("sentence", datasets.Value("string"))
)

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

In [9]:
dataset = dataset.map(prepare_dataset, num_proc=4)

num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [10]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# Metrics

In [11]:
CLEAN_PATTERNS = "((นะ)?(คะ|ครับ)|เอ่อ|อ่า)"
REMOVE_TOKENS = {"", " "}

def hack_wer(
    hypothesis: str,
    reference: str,
    debug=False,
  ) -> float:
  """
  we will tokenize TH long txt into list of words,
  then concat it back separated by whitespace.
  Then, we will just use normal WER jiwer, to utilize
  C++ implementation.
  """
  refs = tokenize(re.sub(CLEAN_PATTERNS, "", reference))
  hyps = tokenize(re.sub(CLEAN_PATTERNS, "", hypothesis))

  refs = [r for r in refs if r not in REMOVE_TOKENS]
  hyps = [h for h in hyps if h not in REMOVE_TOKENS]

  if debug: print(refs); print(hyps)

  return jiwer.wer(" ".join(refs), " ".join(hyps))


def isd_np(preds: list[str], actuals: list[str], debug=True) -> int:
  dp = np.array([np.arange(len(preds) + 1) for _ in range(len(actuals) + 1)], dtype="int16")

  for row in range(len(dp)):
    for col in range(len(dp[0])):
      if row == 0 or col == 0:
        dp[row][col] = max(row, col)
        continue

      if preds[col - 1] != actuals[row - 1]:
        dp[row][col] = min(dp[row - 1][col], dp[row][col - 1], dp[row - 1][col - 1]) + 1
      else:
        dp[row][col] = min(dp[row - 1][col], dp[row][col - 1], dp[row - 1][col - 1])

  if debug: print(*dp, sep="\n")

  return dp[-1][-1]


def wer(pred: str, actual: str, **kwargs) -> float:
  refs = tokenize(re.sub(CLEAN_PATTERNS, "", actual))
  hyps = tokenize(re.sub(CLEAN_PATTERNS, "", pred))

  actuals = [r for r in refs if r not in REMOVE_TOKENS]
  preds = [h for h in hyps if h not in REMOVE_TOKENS]
  if kwargs["debug"]: print(f"{preds}\n{actuals}")
  err = isd_np(preds, actuals, **kwargs)
  return err / len(actuals)

In [12]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    # pred_str, and label_str is list[str]
    pred_strs = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_strs = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wers = list(map(hack_wer, pred_strs, label_strs))
    wer = sum(map(lambda w: w * 100, wers)) / len(wers)

    return {"wer": wer}


In [13]:
print(hack_wer("สวัสดีครับอิอิ ผมไม่เด็กแล้วนะครับ จริงๆนะ", "สวัสดีครับอุอุ ผมโตแล้วครับ จริงๆนะ", debug=True))
print(wer("สวัสดีครับอิอิ ผมไม่เด็กแล้วนะครับ จริงๆนะ", "สวัสดีครับอุอุ ผมโตแล้วครับ จริงๆนะ", debug=True))

['สวัสดี', 'อุอุ', 'ผม', 'โต', 'แล้ว', 'จริง', 'ๆ', 'นะ']
['สวัสดี', 'อิอิ', 'ผม', 'ไม่', 'เด็ก', 'แล้ว', 'จริง', 'ๆ', 'นะ']
0.375
['สวัสดี', 'อิอิ', 'ผม', 'ไม่', 'เด็ก', 'แล้ว', 'จริง', 'ๆ', 'นะ']
['สวัสดี', 'อุอุ', 'ผม', 'โต', 'แล้ว', 'จริง', 'ๆ', 'นะ']
[0 1 2 3 4 5 6 7 8 9]
[1 0 1 2 3 4 5 6 7 8]
[2 1 1 2 3 4 5 6 7 8]
[3 2 2 1 2 3 4 5 6 7]
[4 3 3 2 2 3 4 5 6 7]
[5 4 4 3 3 3 3 4 5 6]
[6 5 5 4 4 4 4 3 4 5]
[7 6 6 5 5 5 5 4 3 4]
[8 7 7 6 6 6 6 5 4 3]
0.375


# Model prep

In [14]:
CHUNK_LENGTH = 30
NUM_BEAMS = 2
BATCH_SIZE = 16
N = 2

In [15]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

In [16]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-large-v3",
    # torch_dtype=torch_dtype,
    num_beams=NUM_BEAMS,
)

config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

In [17]:
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

# Fine-tune the model

In [18]:
(output_dir := (Path.cwd() / "fine-tune-whisper-large-v3-test")).mkdir(exist_ok=True)

In [19]:
training_args = Seq2SeqTrainingArguments(
    # set to tmp_trainer folder in current folder
    output_dir=str(output_dir),
    per_device_train_batch_size=BATCH_SIZE // N,
    gradient_accumulation_steps=N,  # increase by 2x for every 2x decrease in batch size
    learning_rate=2e-5,
    warmup_steps=0, # 1000
    max_steps=1, # 6000
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1, # 1000
    eval_steps=1, # 1000
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
)

In [20]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    eval_dataset=dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

In [21]:
trainer.train()

`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Wer
1,No log,1.805137,3900.0




Could not locate the best model at /content/fine-tune-whisper-large-v3-test/checkpoint-1/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=1, training_loss=0.9025684595108032, metrics={'train_runtime': 46.6896, 'train_samples_per_second': 0.343, 'train_steps_per_second': 0.021, 'total_flos': 3397498306560000.0, 'train_loss': 0.9025684595108032, 'epoch': 1.0})

# Inference

# Eval