In [None]:
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install -q bitsandbytes datasets accelerate
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git@main
! pip install -U openai-whisper
! pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
! pip install optuna

In [1]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget for this notebook session.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
# Select CUDA device index
import os

# Restrict this process to CUDA device index 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Notebook-wide settings: `task` and `language` are passed to the tokenizer/processor
# and to the decoder prompt; `model_name_or_path` is the checkpoint every loader uses.
task = "transcribe"
language = "English"
model_name_or_path = "openai/whisper-base"

In [None]:
import torch

# Report which device this session will run on.
if not torch.cuda.is_available():
    print("CUDA is not available. Falling back to CPU.")
else:
    print("CUDA is enabled and available!")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

## Prepare Feature Extractor, Tokenizer and Data

In [5]:
from transformers import WhisperProcessor
from transformers import WhisperTokenizer
from transformers import WhisperFeatureExtractor

# Load the three preprocessing components from the same checkpoint. The tokenizer and
# the processor are additionally given the configured language and task.
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name_or_path)
tokenizer = WhisperTokenizer.from_pretrained(model_name_or_path, language=language, task=task)
processor = WhisperProcessor.from_pretrained(model_name_or_path, language=language, task=task)

### Prepare Data

In [None]:
from datasets import load_dataset, DatasetDict
import os

data_dir = os.path.abspath(os.path.join("..", "..", "data", "processed"))

# One CSV per split, all located directly under data_dir.
csv_by_split = {
    "train": "train_data.csv",
    "validation": "validation_data.csv",
    "test": "test_data.csv",
}

# Load the dataset: each CSV is loaded on its own and stored under its split name.
dataset = DatasetDict({
    split_name: load_dataset("csv", data_files=os.path.join(data_dir, csv_name))
    for split_name, csv_name in csv_by_split.items()
})

print(dataset)

### Flatten splits and prepare audio features

In [None]:
# Flatten the nested DatasetDict.
# Each split was loaded from a single CSV and therefore arrives wrapped in its own
# DatasetDict whose only key is "train"; pull that inner dataset out.
# Splits that are already flat are passed through unchanged, so re-running this cell
# no longer fails by indexing "train" on an already-unwrapped dataset.
dataset = DatasetDict({
    split_name: (
        dataset[split_name]["train"]
        if isinstance(dataset[split_name], DatasetDict)
        else dataset[split_name]
    )
    for split_name in ("train", "validation", "test")
})

print(dataset)

In [None]:
import os

# Same expression as in the data-loading cell; nothing else in this cell reads it.
data_dir = os.path.abspath(os.path.join("..", "..", "data", "processed"))

# Function to correct the file paths
def correct_path(example):
    """Rewrite one row's "path" value in place and return the same row.

    Backslashes are converted to forward slashes, then every "../processed"
    fragment is replaced with "../../data/processed". Rows that do not contain
    the fragment only receive the slash conversion.
    """
    with_forward_slashes = example["path"].replace("\\", "/")
    # str.replace leaves the string unchanged when the fragment is absent,
    # so no separate membership check is required.
    example["path"] = with_forward_slashes.replace("../processed", "../../data/processed")
    return example

# Apply the path correction to each split (DatasetDict.map runs it on every split)
dataset = dataset.map(correct_path)

print(dataset)

# Verify the correction worked
print("Original path:", "../processed\\test\\SP0873-CH00-SE01-RC755.flac")
for split_name in ("test", "train"):
    print("New path:", dataset[split_name][0]["path"])

In [9]:
from datasets import Audio

# Decode the audio column. After this cast, reading "path" from a row yields the decoded
# audio (later cells read its "array" and "sampling_rate" entries) instead of a file path.
dataset = dataset.cast_column("path", Audio(sampling_rate=16000))  # Set the desired sampling rate

In [None]:
# Spot-check the "path" value of the first training row after the Audio cast.
print(dataset["train"][0]["path"])

In [None]:
def prepare_dataset(batch):
    """Turn one dataset row into model inputs.

    Reads the decoded audio stored under "path" and the transcript stored under
    "text", then adds two fields to the row:
      * "input_features": first entry of the feature extractor's output for the audio
      * "labels": token ids of the transcript
    The same row object is returned.
    """
    decoded_audio = batch["path"]

    # Feature extraction is run on the raw samples at the rate recorded with them.
    extracted = feature_extractor(
        decoded_audio["array"],
        sampling_rate=decoded_audio["sampling_rate"],
    )
    batch["input_features"] = extracted.input_features[0]

    # Target text -> label ids
    batch["labels"] = tokenizer(batch["text"]).input_ids
    return batch

print("Preparing dataset...")
# Run prepare_dataset over every split. All original columns (names taken from the
# train split) are removed, leaving only the fields prepare_dataset adds.
dataset = dataset.map(
    prepare_dataset,
    remove_columns=dataset["train"].column_names
)
print("Dataset preparation complete!")

## Training & Evaluation

### Define a data collator

In [12]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate prepared rows into one padded batch.

    Audio features and label ids are padded separately because they differ in
    length and use different padding routines: the processor's feature extractor
    pads "input_features", its tokenizer pads "labels".
    """

    # Object exposing `.feature_extractor.pad`, `.tokenizer.pad` and `.tokenizer.bos_token_id`.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Return the padded audio batch with a "labels" tensor added."""
        # Audio side: wrap each row's features and let the feature extractor build tensors.
        audio_items = [{"input_features": row["input_features"]} for row in features]
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # Label side: the tokenizer pads the id sequences to a common length.
        token_items = [{"input_ids": row["labels"]} for row in features]
        padded_labels = self.processor.tokenizer.pad(token_items, return_tensors="pt")

        # Positions that are padding (attention mask != 1) become -100 so the loss ignores them.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # When every sequence already begins with the BOS token, strip that first
        # column because the token is appended again later anyway.
        every_row_starts_with_bos = labels[:, 0] == self.processor.tokenizer.bos_token_id
        if every_row_starts_with_bos.all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [13]:
# Collator instance passed to the Seq2SeqTrainer cells below through `data_collator=`.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

### Evaluation Metric

In [14]:
import evaluate
import numpy as np

# Evaluation metric
# Loaded under the name "wer"; compute_metrics multiplies its result by 100.
metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute the word error rate, as a percentage, for one evaluation pass.

    Args:
        pred: object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, where -100 marks ignored positions).

    Returns:
        dict: ``{"wer": value}`` with ``value = 100 * metric.compute(...)`` over
        the decoded predictions and references.
    """
    pad_id = tokenizer.pad_token_id

    # -100 is a sentinel, not a token id: replace it with the pad id so decoding with
    # skip_special_tokens can drop it. np.where builds a new array, so the caller's
    # pred.label_ids is no longer modified in place.
    label_ids = np.where(np.asarray(pred.label_ids) == -100, pad_id, pred.label_ids)

    # The same sentinel can appear in the predictions, so clean them the same way
    # before they reach the decoder.
    pred_ids = pred.predictions
    if hasattr(pred_ids, "shape"):
        pred_array = np.asarray(pred_ids)
        pred_ids = np.where(pred_array == -100, pad_id, pred_array).tolist()
    else:
        # Plain Python sequences (possibly ragged) are cleaned one entry at a time.
        pred_ids = [
            np.where(np.asarray(seq) == -100, pad_id, seq).tolist()
            for seq in pred_ids
        ]

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

### Load Pretrained model and apply post-processing

In [15]:
# Load model with quantization
from transformers import BitsAndBytesConfig
from transformers import WhisperForConditionalGeneration
from peft import prepare_model_for_kbit_training

# 8-bit quantisation settings; `bnb_config` is also read by `model_init` in the
# hyperparameter-search cell.
# Only `load_in_8bit` is set. The `bnb_4bit_*` options that used to sit next to it
# configure 4-bit loading and have no effect on an 8-bit load, so they only made the
# configuration look like NF4/bf16 quantisation when it is not.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

model = WhisperForConditionalGeneration.from_pretrained(
    model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto"
)

# Set decoding parameters (forced_decoder_ids is assigned again in the training-params cell)
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

### LoRA

In [18]:
from peft import LoraConfig, get_peft_model

# Define LoRA configuration
# NOTE: the same values are repeated inside `model_init` in the hyperparameter-search
# cell; change both places together.
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],  # For Whisper architecture
    lora_dropout=0.05,
    bias="none"
)

# Apply LoRA: `model` is rebound to the adapter-wrapped model from here on.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,179,648 || all params: 73,773,568 || trainable%: 1.5990


### Training Params

In [19]:
from transformers import Seq2SeqTrainingArguments

# Update model configuration before training
model.config.use_cache = False
# Overrides the `None` assigned when the model was loaded: the decoder prompt is
# derived from the configured language and task.
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)
model.config.suppress_tokens = []

# Define a more robust training arguments setup
# NOTE: the hyperparameter-search cell later rebinds `training_args`; this instance is
# only used by the trainer created in the next cell.
training_args = Seq2SeqTrainingArguments(
    output_dir="../../output/models/whisper/",
    per_device_train_batch_size=8, 
    gradient_accumulation_steps=2,  # 8 x 2 = 16 samples per optimizer step per device
    learning_rate=1e-3,
    warmup_steps=50,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    fp16=True,
    per_device_eval_batch_size=4,  
    predict_with_generate=True,  
    generation_max_length=128,
    generation_num_beams=1,  
    logging_steps=25,
    remove_unused_columns=False,  
    label_names=["labels"],
    push_to_hub=False,
)


In [20]:
from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

# Callback to save PEFT adapter weights
class SavePeftModelCallback(TrainerCallback):
    """On every save event, store the adapter inside the checkpoint folder.

    The adapter is written to "<checkpoint>/adapter_model"; a "pytorch_model.bin"
    file in the same checkpoint folder is deleted when present.
    """

    def on_save(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        # Checkpoint folder for the current global step, under the run's output directory.
        step_dir = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")

        # Save the adapter model
        kwargs["model"].save_pretrained(os.path.join(step_dir, "adapter_model"))

        # Drop the full weight file if one was written next to the adapter.
        full_weights_file = os.path.join(step_dir, "pytorch_model.bin")
        if os.path.exists(full_weights_file):
            os.remove(full_weights_file)

        return control

In [21]:
from transformers import Seq2SeqTrainer

# Initialize trainer
# This trainer wraps the module-level `model` directly. The hyperparameter-search cell
# later rebinds the name `trainer` to a different instance built with `model_init`.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor, 
    # The callback class itself is passed here, not an instance.
    callbacks=[SavePeftModelCallback],
)

  trainer = Seq2SeqTrainer(


### Hyperparameter tuning
Find the best hyperparameters, retrain the model with them, and save the result to `best_model`.

In [22]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers.trainer_utils import HPSearchBackend
import optuna
import os
import shutil

def model_init():
    """Build a fresh quantised, LoRA-wrapped Whisper model for one trial.

    Loads the checkpoint with `bnb_config`, sets the decoding-related config
    fields, prepares the model for k-bit training and returns it wrapped with a
    LoRA adapter.
    """
    base_model = WhisperForConditionalGeneration.from_pretrained(
        model_name_or_path,
        quantization_config=bnb_config,
        device_map="auto",
    )

    # Decoding parameters
    config = base_model.config
    config.use_cache = False
    config.forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)
    config.suppress_tokens = []

    # Adapter settings (same values as the standalone LoRA cell)
    adapter_config = LoraConfig(
        r=32,
        lora_alpha=64,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
    )

    # k-bit preparation first, then the LoRA wrapper on top.
    return get_peft_model(prepare_model_for_kbit_training(base_model), adapter_config)

# Define the hyperparameter search space
def hp_space(trial):
    """Sample one set of training hyperparameters from `trial`.

    Returns a dict keyed by training-argument name. Learning rate is drawn on a
    log scale from [1e-5, 1e-3]; batch size and gradient accumulation are picked
    from fixed choices; epochs are an integer in [2, 5]; warmup ratio and weight
    decay are drawn uniformly from [0.0, 0.3] and [0.0, 0.1].
    """
    sampled = {}
    sampled["learning_rate"] = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    sampled["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [4, 8, 16]
    )
    sampled["gradient_accumulation_steps"] = trial.suggest_categorical(
        "gradient_accumulation_steps", [1, 2, 4]
    )
    sampled["num_train_epochs"] = trial.suggest_int("num_train_epochs", 2, 5)
    sampled["warmup_ratio"] = trial.suggest_float("warmup_ratio", 0.0, 0.3)
    sampled["weight_decay"] = trial.suggest_float("weight_decay", 0.0, 0.1)
    return sampled

# Base output directory
# Search runs write under "hp_search"; the final retrained model goes to "best_model".
base_output_dir = "../../output/models/whisper"
hp_search_dir = os.path.join(base_output_dir, "hp_search")
best_model_dir = os.path.join(base_output_dir, "best_model")

# Create directories if they don't exist
os.makedirs(hp_search_dir, exist_ok=True)
os.makedirs(best_model_dir, exist_ok=True)

# Define the baseline training arguments 
# Values that hp_space samples (learning rate, batch size, accumulation steps, epochs,
# warmup ratio, weight decay) are deliberately not set here.
# NOTE: this rebinds `training_args` from the earlier training-params cell.
training_args = Seq2SeqTrainingArguments(
    output_dir=hp_search_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    fp16=True,
    predict_with_generate=True,
    generation_max_length=128,
    generation_num_beams=1,
    logging_steps=25,
    remove_unused_columns=False,
    label_names=["labels"],
    load_best_model_at_end=True,
    metric_for_best_model="eval_wer",  
    greater_is_better=False,  # lower WER is better
    push_to_hub=False,
)

# Initialize trainer with model_init function
# NOTE: this rebinds `trainer`; unlike the earlier trainer it is given `model_init`
# rather than the module-level `model`.
trainer = Seq2SeqTrainer(
    args=training_args,
    model_init=model_init,  
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor,
    callbacks=[SavePeftModelCallback],
)

# Run hyperparameter search (n_trials specifies how many combinations to try)
# The objective for each trial is the "eval_wer" value from its evaluation metrics.
best_trial = trainer.hyperparameter_search(
    direction="minimize",  # Minimize WER
    backend="optuna",
    hp_space=hp_space,
    n_trials=10,  
    compute_objective=lambda metrics: metrics["eval_wer"],
)

print(f"Best hyperparameters: {best_trial.hyperparameters}")

# Train with the best hyperparameters
# Each winning value is copied onto the trainer's existing arguments object by name.
for param, value in best_trial.hyperparameters.items():
    setattr(trainer.args, param, value)

# Set a new output directory for the final training with best hyperparameters
trainer.args.output_dir = best_model_dir

# Train the model with best hyperparameters
# This is a full additional training run on top of the search trials.
trainer.train()

# Save the final model, tokenizer and config
print(f"Saving best model to {best_model_dir}")
trainer.save_model(best_model_dir)
processor.save_pretrained(best_model_dir)

# Save hyperparameter information for reference
import json
with open(os.path.join(best_model_dir, "hyperparameters.json"), "w") as f:
    json.dump(best_trial.hyperparameters, f, indent=2)

print(f"Successfully trained and saved the best model to {best_model_dir}")

  trainer = Seq2SeqTrainer(
[I 2025-03-02 15:04:53,033] A new study created in memory with name: no-name-3f76accf-7d2c-4fff-8dcb-5a766771d8c7
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Wer
1,No log,1.650314,154.304636
2,No log,1.621067,80.794702
3,No log,1.61683,74.834437


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
[I 2025-03-02 15:06:18,831] Trial 0 finished with value: 74.83443708609272 and parameters: {'learning_rate': 1.367956006791367e-05, 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4, 'num_train_epochs': 3, 'warmup_ratio': 0.2843658686208624, 'weight_decay': 0.026060929032745175}. Best is trial 0 with value: 74.83443708609272.
  retu

Epoch,Training Loss,Validation Loss,Wer
1,No log,0.427997,17.218543
2,No log,0.282944,15.89404


  return fn(*args, **kwargs)
[I 2025-03-02 15:07:05,382] Trial 1 finished with value: 15.894039735099339 and parameters: {'learning_rate': 0.0009164287533577747, 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 2, 'num_train_epochs': 2, 'warmup_ratio': 0.060267985815418035, 'weight_decay': 0.08276306288577605}. Best is trial 1 with value: 15.894039735099339.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Wer
1,No log,0.481115,17.880795
2,No log,0.41745,17.218543


  return fn(*args, **kwargs)
[I 2025-03-02 15:07:51,206] Trial 2 finished with value: 17.218543046357617 and parameters: {'learning_rate': 0.00043638035396118284, 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 2, 'num_train_epochs': 2, 'warmup_ratio': 0.18202069088345593, 'weight_decay': 0.03829477442020021}. Best is trial 1 with value: 15.894039735099339.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Wer
1,No log,1.482602,74.172185
2,No log,1.326367,42.384106


  return fn(*args, **kwargs)
[I 2025-03-02 15:08:40,995] Trial 3 finished with value: 42.384105960264904 and parameters: {'learning_rate': 0.0002969562685005127, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 4, 'num_train_epochs': 3, 'warmup_ratio': 0.204653736383967, 'weight_decay': 0.0014074731839436305}. Best is trial 1 with value: 15.894039735099339.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Wer
1,No log,1.457224,66.887417
2,No log,1.258133,86.754967
3,1.763800,1.151694,31.125828


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
[I 2025-03-02 15:09:53,786] Trial 4 finished with value: 31.125827814569533 and parameters: {'learning_rate': 8.391495902077265e-05, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 1, 'num_train_epochs': 3, 'warmup_ratio': 0.08442620628104903, 'weight_decay': 0.09777367838794399}. Best is trial 1 with value: 15.894039735099339.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Wer
1,No log,1.572339,70.198675


[I 2025-03-02 15:10:19,828] Trial 5 pruned. 


Epoch,Training Loss,Validation Loss,Wer
1,No log,1.526599,73.509934
2,No log,1.287046,86.754967


  return fn(*args, **kwargs)
[I 2025-03-02 15:11:13,848] Trial 6 pruned. 


Epoch,Training Loss,Validation Loss,Wer
1,No log,1.366478,64.900662


[I 2025-03-02 15:11:39,070] Trial 7 pruned. 


Epoch,Training Loss,Validation Loss,Wer
1,No log,1.656137,92.715232


[I 2025-03-02 15:12:04,837] Trial 8 pruned. 


Epoch,Training Loss,Validation Loss,Wer
1,No log,1.383157,88.741722
2,No log,0.806012,18.543046
3,No log,0.489605,18.543046


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
[I 2025-03-02 15:13:14,027] Trial 9 finished with value: 18.543046357615893 and parameters: {'learning_rate': 0.00029642199745677413, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 2, 'num_train_epochs': 3, 'warmup_ratio': 0.23650483956494822, 'weight_decay': 0.01612901423299774}. Best is trial 1 with value: 15.894039735099339.


Best hyperparameters: {'learning_rate': 0.0009164287533577747, 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 2, 'num_train_epochs': 2, 'warmup_ratio': 0.060267985815418035, 'weight_decay': 0.08276306288577605}


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Wer
1,No log,0.473121,17.880795
2,No log,0.306596,15.89404


  return fn(*args, **kwargs)


Saving best model to ../../output/models/whisper\best_model
Successfully trained and saved the best model to ../../output/models/whisper\best_model


### Training (without Hyperparameter tuning)

In [21]:
# Disable caching during training to avoid memory issues
# NOTE(review): this line touches the module-level `model`, while `trainer` is whichever
# Seq2SeqTrainer was assigned most recently. If the hyperparameter-search cell ran last,
# that trainer was built with `model_init` and not with `model` — confirm the intended
# cell order before relying on this cell.
model.config.use_cache = False

# Start training
print("Starting training...")
trainer.train()
print("Training complete!")

Starting training...


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Wer
1,No log,1.658005,70.860927
2,No log,1.364233,70.198675
3,No log,0.555613,18.543046


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Training complete!


In [25]:
# Save the final model.
# Save the model held by the active trainer instead of the module-level `model`.
# When `trainer` was created with `model_init`, `model` is a separate object that this
# trainer never trained, and saving it would overwrite the adapter already stored in
# best_model_dir with untrained weights.
trainer.model.save_pretrained(best_model_dir)

### Evaluation on Test

In [23]:
# Evaluate on test set
# The test split is passed explicitly in place of the trainer's validation split; the
# result is read from the "eval_wer" key, which compute_metrics reports as a percentage.
print("Evaluating on test set...")
eval_results = trainer.evaluate(dataset["test"])
print(f"Test WER: {eval_results['eval_wer']:.2f}%")

Evaluating on test set...




Test WER: 22.81%
