# Complete Whisper Fine-tuning Guide

End-to-end guide for fine-tuning Whisper on custom datasets with profiling and optimization.

In [None]:
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset, Audio
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# Report the installed PyTorch version and whether a CUDA device is usable,
# one item per line.
print(
    f"PyTorch: {torch.__version__}",
    f"CUDA: {torch.cuda.is_available()}",
    sep="\n",
)

## 1. Dataset Preparation

In [None]:
# `prepare_dataset` is executed by `dataset.map` at the end of this cell, so the
# processor has to exist here rather than only in the later training cell.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train[:100]", trust_remote_code=True)

# Whisper's feature extractor only accepts audio at its own sampling rate
# (16 kHz for the released checkpoints). Casting the column makes `datasets`
# resample each clip on access, so `audio["sampling_rate"]` below matches.
dataset = dataset.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))


def prepare_dataset(batch):
    """Convert one Common Voice example into Whisper model inputs.

    Args:
        batch: A single dataset row with an ``"audio"`` dict (``"array"`` and
            ``"sampling_rate"``) and a ``"sentence"`` transcript string.

    Returns:
        The same row with two added fields: ``"input_features"`` (the
        log-Mel features computed by the processor) and ``"labels"`` (the
        token ids of the transcript).
    """
    audio = batch["audio"]
    batch["input_features"] = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features[0]
    # Common Voice stores the transcript under "sentence"; there is no "text" column.
    batch["labels"] = processor.tokenizer(batch["sentence"], return_tensors="pt").input_ids[0]
    return batch


# Drop every original column so only `input_features` and `labels` remain.
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names, num_proc=4)

## 2. Training

In [None]:
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate prepared examples into a padded batch for Whisper training.

    Audio features and label token ids need different padding, so they are
    padded separately by the feature extractor and the tokenizer.

    Attributes are documented inline below.
    """

    # Processor providing `feature_extractor.pad` and `tokenizer.pad`.
    processor: Any
    # Token id the model prepends to the decoder input when it shifts labels.
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples and return a batch of PyTorch tensors.

        Args:
            features: Examples with ``"input_features"`` and ``"labels"`` keys.

        Returns:
            A batch containing ``"input_features"`` and ``"labels"``; padded
            label positions are set to -100 so the loss ignores them.
        """
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding token ids with -100 so they do not contribute to the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # The tokenizer already put the start token at position 0; the model adds
        # it again when building decoder inputs, so strip it here to avoid a duplicate.
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch


data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetuned",
    per_device_train_batch_size=8,
    learning_rate=1e-5,
    num_train_epochs=3,
    # Mixed precision is only enabled when a CUDA device is present, so the
    # guide also runs (slowly) on CPU-only machines.
    fp16=torch.cuda.is_available(),
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    # Without this, the default collator cannot batch variable-length labels.
    data_collator=data_collator,
)
trainer.train()