In [None]:
!pip install "/kaggle/input/efficient-llm-inference-wheels/bitsandbytes-0.40.0.post4-py3-none-any.whl"
!pip install "/kaggle/input/efficient-llm-inference-wheels/accelerate-0.20.3-py3-none-any.whl"
!pip install "/kaggle/input/efficient-llm-inference-wheels/peft-0.3.0-py3-none-any.whl"

!pip install evaluate>=0.30
!pip install jiwer
!pip install datasets==2.13.1

In [None]:
import pandas as pd
from datasets import Dataset

# Competition metadata table; later cells read its `id`, `sentence` and `split` columns.
train_df = pd.read_csv(
    "/kaggle/input/bengaliai-speech/train.csv",
)

In [None]:
from transformers import WhisperFeatureExtractor

# Feature extractor shipped with the pretrained checkpoint; the dataset generator
# uses it to turn raw audio arrays into `input_features`.
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "openai/whisper-large-v2",
)

In [None]:
from transformers import WhisperTokenizer

# Tokenizer from the same checkpoint, configured for Bengali transcription.
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-large-v2",
    task="transcribe",
    language="bengali",
)

In [None]:
# Sanity check: encode the first transcript and decode it again, with and
# without special tokens, to confirm the text survives the round trip.
input_str = train_df["sentence"][0]
labels = tokenizer(input_str).input_ids

decoded_str = tokenizer.decode(labels, skip_special_tokens=True)
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)

print(
    f"Input:                 {input_str}",
    f"Decoded w/ special:    {decoded_with_special}",
    f"Decoded w/out special: {decoded_str}",
    f"Are equal:             {input_str == decoded_str}",
    sep="\n",
)


In [None]:
from transformers import WhisperProcessor

# Processor object exposing `.feature_extractor` and `.tokenizer`; the data
# collator and the trainer are built from it.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-large-v2",
    task="transcribe",
    language="bengali",
)



In [None]:
import torch
import librosa
from tqdm import tqdm

# Train on a 10% sample of the "train" split and validate on 128 examples
# drawn from the "valid" split.
# A fixed seed makes both draws identical after Restart & Run All; without it
# every kernel restart trains and evaluates on different rows.
seed = 42
fraction = 0.1
train_sample = train_df[train_df["split"] == "train"].sample(frac=fraction, random_state=seed)
eval_sample = train_df[train_df["split"] == "valid"].sample(128, random_state=seed)

# Sample rate (Hz) declared to the feature extractor, and where each clip lives on disk.
sampling_rate = 16000
path_template = "/kaggle/input/bengaliai-speech/train_mp3s/{}.mp3"

In [None]:
from datasets import IterableDataset

def dataset_generator(df):
    """Yield one model-ready example per row of ``df``.

    Args:
        df: DataFrame with an ``id`` column (used to build the mp3 path from
            ``path_template``) and a ``sentence`` column (the transcript).

    Yields:
        dict with ``input_features`` (first item of the feature extractor's
        output for the clip) and ``labels`` (token ids of the transcript).
    """
    for _, row in df.iterrows():
        # Resample while decoding: librosa.load defaults to 22050 Hz, but the
        # feature extractor below is told the audio is at `sampling_rate`.
        # Without `sr=` the features were computed from mis-rated audio.
        audio_array = librosa.load(path_template.format(row["id"]), sr=sampling_rate)[0]
        yield {
            "input_features": feature_extractor(audio_array, sampling_rate=sampling_rate).input_features[0],
            "labels": tokenizer(row["sentence"]).input_ids
        }

# Wrap the generator once per split; each dataset receives its own sampled frame.
train_ds = IterableDataset.from_generator(
    dataset_generator,
    gen_kwargs={"df": train_sample},
)
eval_ds = IterableDataset.from_generator(
    dataset_generator,
    gen_kwargs={"df": eval_sample},
)

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate examples into a batch, padding audio features and labels separately.

    Attributes are documented inline; calling the instance on a list of
    examples (each with ``input_features`` and ``labels``) returns the padded
    feature batch with a ``labels`` entry added.
    """

    # Object exposing `.feature_extractor.pad`, `.tokenizer.pad` and `.tokenizer.bos_token_id`.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Audio and text are padded by different components, so handle them apart.
        audio_inputs = [{"input_features": example["input_features"]} for example in features]
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

        # Pad the tokenized transcripts to a common length.
        token_inputs = [{"input_ids": example["labels"]} for example in features]
        padded = self.processor.tokenizer.pad(token_inputs, return_tensors="pt")

        # Positions outside the attention mask become -100 so the loss ignores them.
        is_padding = padded.attention_mask.ne(1)
        labels = padded["input_ids"].masked_fill(is_padding, -100)

        # If every sequence already starts with the BOS token, drop that column
        # here because it is appended again later anyway.
        starts_with_bos = labels[:, 0] == self.processor.tokenizer.bos_token_id
        if starts_with_bos.all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [None]:
# Collator instance handed to the trainer; it pads with the processor's
# feature extractor and tokenizer.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [None]:
from transformers import WhisperForConditionalGeneration

# Load the pretrained checkpoint with `load_in_8bit=True`.
model_id = "openai/whisper-large-v2"
model = WhisperForConditionalGeneration.from_pretrained(
    model_id,
    load_in_8bit=True,
)

In [None]:
# Clear the checkpoint's generation constraints: no suppressed tokens and no
# forced decoder prefix.
model.config.suppress_tokens = []
model.config.forced_decoder_ids = None

In [None]:
from peft import prepare_model_for_int8_training

# Run PEFT's int8-training preparation on the 8-bit model before adding adapters.
model = prepare_model_for_int8_training(model)



In [None]:
# Display the model's module tree.
model

In [None]:
import bitsandbytes as bnb

def find_all_linear_names(model, bits=8):
    """Collect the leaf names of every linear layer in ``model``.

    Args:
        model: object providing ``named_modules()``.
        bits: 4 selects ``bnb.nn.Linear4bit``, 8 selects ``bnb.nn.Linear8bitLt``,
            any other value selects ``torch.nn.Linear``.

    Returns:
        A list (arbitrary order, no duplicates) of the last dotted component of
        each matching module's name, with ``'lm_head'`` excluded.
    """
    # Resolve the class lazily so only the requested backend is touched.
    if bits == 4:
        linear_cls = bnb.nn.Linear4bit
    elif bits == 8:
        linear_cls = bnb.nn.Linear8bitLt
    else:
        linear_cls = torch.nn.Linear

    # "encoder.layers.0.fc1" -> "fc1"; a name without dots is kept as is.
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, linear_cls)
    }

    leaf_names.discard('lm_head')  # needed for 16-bit
    return list(leaf_names)

In [None]:
# Names of the model's 8-bit linear layers (default bits=8); used as LoRA targets below.
target_modules = find_all_linear_names(model)
target_modules

In [None]:
from peft import LoraConfig, PeftModel, LoraModel, LoraConfig, get_peft_model

# LoRA hyper-parameters: adapters go on every module name collected above.
config = LoraConfig(
    target_modules=target_modules,
    r=32,
    lora_alpha=64,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the model with the adapters and report how many parameters will train.
model = get_peft_model(model, config)
model.print_trainable_parameters()

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    # --- output / bookkeeping ---
    output_dir="./whisper-largev2-bn",  # change to a repo name of your choice
    report_to="wandb",
    logging_steps=5,
    save_steps=30,
    save_total_limit=5,
    # --- optimisation ---
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-4,
    warmup_steps=30,
    max_steps=1000,
    fp16=True,
    # gradient_checkpointing=True,  (left disabled)
    # --- evaluation ---
    evaluation_strategy="steps",
    eval_steps=30,
    per_device_eval_batch_size=8,
    generation_max_length=225,
    load_best_model_at_end=True,
    # metric_for_best_model="wer",  (left disabled)
    # greater_is_better=False,      (left disabled)
    # --- PEFT compatibility ---
    remove_unused_columns=False,  # required as the PeftModel forward doesn't have the signature of the wrapped model's forward
    label_names=["labels"],  # same reason as above
)

In [None]:
import os
from transformers import Seq2SeqTrainer, TrainerCallback, TrainingArguments, TrainerState, TrainerControl
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR


class SavePeftModelCallback(TrainerCallback):
    """On each checkpoint save, store the model via ``save_pretrained`` in an
    ``adapter_model`` sub-folder and delete ``pytorch_model.bin`` if present."""

    def on_save(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        # Folder name follows "<output_dir>/<PREFIX_CHECKPOINT_DIR>-<global_step>".
        step_dir = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")

        kwargs["model"].save_pretrained(os.path.join(step_dir, "adapter_model"))

        # Remove the full weights file from the checkpoint folder when it exists.
        full_weights_file = os.path.join(step_dir, "pytorch_model.bin")
        if os.path.exists(full_weights_file):
            os.remove(full_weights_file)

        return control


# Assemble the trainer from the pieces built above.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=processor.feature_extractor,
    callbacks=[SavePeftModelCallback],
    # compute_metrics=compute_metrics,  (left disabled)
)

# Turn the decoder cache off for training.
model.config.use_cache = False

In [None]:
# Run training inside a CUDA autocast context.
with torch.autocast(device_type="cuda"):
    trainer.train()

In [None]:
# `Trainer` has no `save` method -- the original call raised AttributeError
# after training had finished. `save_model` writes the model to the given directory.
trainer.save_model("best_model_bath")