# Install necessary packages

In [None]:
# Register the jonathonf/ffmpeg-4 PPA, refresh the package index, then install ffmpeg.
!add-apt-repository -y ppa:jonathonf/ffmpeg-4
!apt update
!apt install -y ffmpeg

Repository: 'deb https://ppa.launchpadcontent.net/jonathonf/ffmpeg-4/ubuntu/ jammy main'
Description:
Backport of FFmpeg 4 and associated libraries. Now includes AOM/AV1 support!

FDK AAC is not compatible with GPL and FFmpeg can't be redistributed with it included. Please don't ask for it to be added to this public PPA.

---

PPA supporters:

BigBlueButton (https://bigbluebutton.org)

---

Donate to FFMPEG: https://ffmpeg.org/donations.html
Donate to Debian: https://www.debian.org/donations
Donate to this PPA: https://ko-fi.com/jonathonf
More info: https://launchpad.net/~jonathonf/+archive/ubuntu/ffmpeg-4
Adding repository.
Adding deb entry to /etc/apt/sources.list.d/jonathonf-ubuntu-ffmpeg-4-jammy.list
Adding disabled deb-src entry to /etc/apt/sources.list.d/jonathonf-ubuntu-ffmpeg-4-jammy.list
Adding key to /etc/apt/trusted.gpg.d/jonathonf-ubuntu-ffmpeg-4.gpg with fingerprint 4AB0F789CBA31744CC7DA76A8CF63AD3F06FC659
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ In

In [None]:
!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git@main

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-nw0j6tm1
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-nw0j6tm1
  Resolved https://github.com/huggingface/transformers to commit e9adb0c9cf9f2e4017615ab64f4d2f364339136e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tokenizers<0.15,>=0.14 (from transformers==4.36.0.dev0)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers==4.36.0.dev0)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 M

# HuggingFace authentication
We use the Hugging Face Hub API to pull the model hosted on their platform. A token with write access is required so that the best-performing model can be tracked and saved to the Hub during training.

In [None]:
from huggingface_hub import notebook_login

# Prompt for Hugging Face credentials from inside the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Select CUDA device index
import os

# Restrict this process to GPU 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Base checkpoint plus the language/task passed to the Whisper tokenizer and processor.
model_name_or_path = "openai/whisper-small"
language = "English"
task = "transcribe"

In [None]:
from datasets import load_from_disk

# Dataset previously saved to disk; it is indexed by 'train' and 'test' when the trainer is built.
# NOTE(review): the path looks like a Colab Google Drive mount - confirm Drive is mounted first.
concatenated_dataset = load_from_disk("/content/drive/My Drive/full_dataset_1")

In [None]:
import evaluate

# Load the "wer" metric; compute_metrics calls metric.compute on decoded strings.
metric = evaluate.load("wer")

# Training

## Training from scratch

In [None]:
# Use this cell when starting from the base checkpoint (no previously trained adapter).
from transformers import WhisperForConditionalGeneration, WhisperProcessor, WhisperTokenizer

# Tokenizer and processor are both built from the same checkpoint, language and task.
tokenizer = WhisperTokenizer.from_pretrained(model_name_or_path, language=language, task=task)
processor = WhisperProcessor.from_pretrained(model_name_or_path, language=language, task=task)

# Load the model weights in 8-bit with automatic device placement.
model = WhisperForConditionalGeneration.from_pretrained(
    model_name_or_path,
    load_in_8bit=True,
    device_map="auto",
)

# Optional check: model.hf_device_map - this should be {" ": 0}

## Training from a checkpoint on HF

In [None]:
# Disabled alternative to loading the base model: wraps the base checkpoint named in the
# adapter's PeftConfig with a trainable PeftModel loaded from `peft_model_id`.
# Uncomment the whole cell to use it.

# from peft import PeftModel, PeftConfig
# from transformers import WhisperForConditionalGeneration, Seq2SeqTrainer
# from transformers import WhisperFeatureExtractor
# from transformers import WhisperTokenizer
# from transformers import WhisperProcessor

# peft_model_id = "jcrj/whisper-small" # TO CHANGE
# peft_config = PeftConfig.from_pretrained(peft_model_id)
# model = WhisperForConditionalGeneration.from_pretrained(
#     peft_config.base_model_name_or_path, load_in_8bit=True, device_map="auto"
# )
# model = PeftModel.from_pretrained(model, peft_model_id, is_trainable=True)
# tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
# processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
# feature_extractor = processor.feature_extractor

In [None]:
def compute_metrics(pred):
    """Compute the word error rate, in percent, for a set of predictions.

    Relies on the notebook-level ``tokenizer`` and ``metric`` objects.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids``. Both are handed
            to ``tokenizer.batch_decode``, so they must contain token ids.
            ``label_ids`` must support ``.copy()`` and boolean-mask assignment
            (e.g. a NumPy array).

    Returns:
        dict: ``{"wer": value}`` where ``value`` is 100 times the result of
        ``metric.compute``.
    """
    pred_ids = pred.predictions
    # Work on a copy so the caller's label array keeps its -100 markers.
    label_ids = pred.label_ids.copy()

    # -100 is not a decodable token id; swap it for the pad token before decoding.
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union
from torch.cuda.amp import autocast


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech seq2seq examples into a single padded batch.

    Attributes:
        processor: Object exposing ``feature_extractor.pad``, ``tokenizer.pad``
            and ``tokenizer.bos_token_id``.
    """

    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad audio inputs and label ids separately and merge them into one batch.

        Args:
            features: Examples, each with an ``"input_features"`` entry and a
                ``"labels"`` entry.

        Returns:
            The padded audio batch with an added ``"labels"`` tensor in which
            padded positions are set to -100.
        """
        text_tokenizer = self.processor.tokenizer

        # Audio side: hand the features to the feature extractor and get torch tensors back.
        audio_examples = [{"input_features": example["input_features"]} for example in features]
        batch = self.processor.feature_extractor.pad(audio_examples, return_tensors="pt")

        # Text side: labels have their own lengths, so they are padded by the tokenizer.
        label_examples = [{"input_ids": example["labels"]} for example in features]
        padded_labels = text_tokenizer.pad(label_examples, return_tensors="pt")

        # Positions outside the attention mask become -100 so the loss ignores them.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # When every row already starts with the BOS token, drop that column,
        # since the token is added again later.
        all_rows_start_with_bos = (labels[:, 0] == text_tokenizer.bos_token_id).all().cpu().item()
        if all_rows_start_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [None]:
# Collator instance later handed to the trainer; it pads with the shared processor.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor)

In [None]:
# Clear the forced decoder ids and empty the suppressed-token list on the model config.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [None]:
from peft import prepare_model_for_kbit_training
# Rebinds `model` to the result of PEFT's k-bit training preparation.
# The cell overwrites its own input, so run it once per freshly loaded model.
model = prepare_model_for_kbit_training(model)

Only run the cell below if training from scratch. The cell is placed here because it has to be run at this specific point in the sequence.

In [None]:
from peft import LoraConfig, LoraModel, PeftModel, get_peft_model

# LoRA settings: rank 32, alpha 64, dropout 0.05, applied to the q_proj and v_proj modules.
config = LoraConfig(r=32, lora_alpha=64, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none")

# Wrap the model with the adapter and report how many parameters are trainable.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 5,898,240 || all params: 762,118,400 || trainable%: 0.7739269908717595


In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    # --- output location / Hub ---
    output_dir="jcrj/whisper-small",  # change to a repo name of your choice
    push_to_hub=True,
    # --- optimisation ---
    learning_rate=1e-3,
    warmup_steps=50,
    num_train_epochs=8,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    # --- evaluation and checkpointing, both once per epoch ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    per_device_eval_batch_size=8,
    generation_max_length=128,
    # --- logging ---
    logging_steps=25,
    # --- data handling ---
    remove_unused_columns=False,  # required as the PeftModel forward doesn't have the signature of the wrapped model's forward
    label_names=["labels"],
)

In [None]:
from transformers import Seq2SeqTrainer

# The compute_metrics hook is left disabled below, so no WER is computed during evaluation.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=concatenated_dataset["train"],
    eval_dataset=concatenated_dataset["test"],
    tokenizer=processor.feature_extractor,
    # compute_metrics=compute_metrics,
)

# silence the warnings. Please re-enable for inference!
model.config.use_cache = False

In [None]:
# Run training; the returned TrainOutput is displayed as this cell's output.
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.6299,0.666671
2,0.4983,0.590109
3,0.3267,0.569999
4,0.2389,0.553482
5,0.1995,0.54117




Epoch,Training Loss,Validation Loss
1,0.6299,0.666671
2,0.4983,0.590109
3,0.3267,0.569999
4,0.2389,0.553482
5,0.1995,0.54117
6,0.1196,0.54141
7,0.1029,0.541155
8,0.0684,0.545091




TrainOutput(global_step=2664, training_loss=0.30306807657082874, metrics={'train_runtime': 10654.1266, 'train_samples_per_second': 2.0, 'train_steps_per_second': 0.25, 'total_flos': 2.12749677232128e+19, 'train_loss': 0.30306807657082874, 'epoch': 8.0})