In [1]:
!pip install torch librosa scikit-learn 'transformers[torch]' datasets

Collecting torch
  Downloading torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl (766.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m766.7/766.7 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
Collecting librosa
  Downloading librosa-0.10.2.post1-py3-none-any.whl (260 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m260.1/260.1 KB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting transformers[torch]
  Downloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[

In [1]:
import os
import torch
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from datasets import load_dataset, Audio

In [2]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

In [6]:
train_path = Path('../Data/dataset/train')
files = os.listdir(train_path)
train_csv = [file for file in files if file.endswith('csv')]

In [7]:
trains, valids = [], []
for file in train_csv:
    df = pd.read_csv(train_path /file)[['audio_filepath', 'transcript']]
    df = df.rename(columns = {'audio_filepath': 'audio_path'})

    train_df, valid_df = train_test_split(df, test_size = 0.1, random_state = 42)
    
    trains.append(train_df)
    valids.append(valid_df)

In [8]:
valid = pd.concat(valids, axis = 0)
train = pd.concat(trains, axis = 0)

In [9]:
train.to_csv('train.csv', index = False)
valid.to_csv('val.csv', index = False)

In [3]:
# Define a function to process each example:
# It extracts the audio features and tokenizes the transcript.




In [9]:
def main():
    # Specify the model name (using the large version)
    model_name = "openai/whisper-large"
    
    # Load the processor and model from Hugging Face
    processor = WhisperProcessor.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(model_name)
    # Create a data collator to dynamically pad the input features and labels in each batch.
    def data_collator(features):
        # Extract input features and labels lists from the batch
        print(features)
        input_features = [f["input_features"] for f in features]
        input_features = {"input_features": input_features}
        labels = [f["labels"] for f in features]
        # Pad input features (using the feature extractor's padding method)`
        batch_inputs = processor.feature_extractor.pad(input_features, return_tensors="pt")
        
        # Pad labels using the tokenizer's pad method.
        batch_labels = processor.tokenizer.pad({"input_ids": labels}, return_tensors="pt")["input_ids"]
        
        return {"input_features": batch_inputs.input_features, "labels": batch_labels}
    def prepare_example(batch):
        # "audio_path" is automatically loaded as a dictionary with an "array" key.
        audio = batch["audio_path"]["array"]
        transcript = batch["transcript"]
    
        # Extract features from the audio using the processor's feature extractor.
        # The result is a list with one element per audio sample.
        input_features = processor.feature_extractor(audio, sampling_rate=16000).input_features[0]
        
        # Tokenize the transcript (the tokenizer will handle any necessary preprocessing).
        labels = processor.tokenizer(transcript).input_ids
    
        # Store the processed features and labels in the batch.
        batch["input_features"] = input_features
        batch["labels"] = labels
        return batch
    # Load your custom dataset from CSV files.
    # Ensure your CSV files have at least two columns: "audio_path" and "transcript".
    data_files = {"train": "train.csv", "validation": "val.csv"}
    dataset = load_dataset("csv", data_files=data_files)
    
    # Cast the "audio_path" column to an Audio column with the desired sampling rate.
    dataset = dataset.cast_column("audio_path", Audio(sampling_rate=16000))
    
    # Apply the preprocessing function to all examples.
    # remove_columns will drop the original columns so that the model receives only what it needs.
    dataset = dataset.map(prepare_example, remove_columns=dataset["train"].column_names)
    # Define the training arguments.
    # Adjust the batch sizes, learning rate, number of epochs, etc. according to your needs.
    training_args = Seq2SeqTrainingArguments(
        output_dir="./whisper-finetuned",
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        evaluation_strategy="steps",
        num_train_epochs=3,
        fp16=True,  # enable this if you have a GPU that supports half precision
        save_steps=500,
        eval_steps=500,
        logging_steps=100,
        learning_rate=1e-5,
        predict_with_generate=True,
        logging_dir="./logs",
    )
    
    # Create a Trainer for sequence-to-sequence training.
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        data_collator=data_collator,
        tokenizer=processor.tokenizer,
    )
    trainer.train()

In [10]:
from accelerate import notebook_launcher
notebook_launcher(finetuning_loop, num_processes=2)

Launching training on 2 GPUs.


  trainer = Seq2SeqTrainer(
  trainer = Seq2SeqTrainer(
IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)







IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)






IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)






Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
W0228 09:27:43.320000 1494 torch/multiprocessing/spawn.py:169] Terminating process 1719 via signal SIGTERM
E0228 09:27:43.511000 1494 torch/distributed/elastic/multiprocessing/api.py:732] failed (exitcode: 1) local_rank: 0 (pid: 1718) of fn: finetuning_loop (start_method: fork)
E0228 09:27:43.511000 1494 torch/distributed/elastic/multiprocessing/api.py:732] Traceback (most recent call last):
E0228 09:27:43.511000 1494 torch/distributed/elastic/multiprocessing/api.py:732]   File "/venv/main/lib/python3.10/site-packages/to

ChildFailedError: 
============================================================
finetuning_loop FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2025-02-28_09:27:42
  host      : fada9b19e18c
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 1718)
  error_file: /tmp/torchelastic_ccu0ujml/none_79lkkrrl/attempt_0/0/error.json
  traceback : Traceback (most recent call last):
    File "/venv/main/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
      return f(*args, **kwargs)
    File "/tmp/ipykernel_1494/2479828396.py", line 75, in finetuning_loop
      trainer.train()
    File "/venv/main/lib/python3.10/site-packages/transformers/trainer.py", line 2241, in train
      return inner_training_loop(
    File "/venv/main/lib/python3.10/site-packages/transformers/trainer.py", line 2548, in _inner_training_loop
      tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
    File "/venv/main/lib/python3.10/site-packages/transformers/trainer.py", line 3698, in training_step
      loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
    File "/venv/main/lib/python3.10/site-packages/transformers/trainer.py", line 3759, in compute_loss
      outputs = model(**inputs)
    File "/venv/main/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
      return self._call_impl(*args, **kwargs)
    File "/venv/main/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
      return forward_call(*args, **kwargs)
    File "/venv/main/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1645, in forward
      return self._post_forward(output)
    File "/venv/main/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1620, in _post_forward
      passthrough_tensor_list = _DDPSink.apply(
    File "/venv/main/lib/python3.10/site-packages/torch/autograd/function.py", line 575, in apply
      return super().apply(*args, **kwargs)  # type: ignore[misc]
    File "/venv/main/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 250, in forward
      ret = tuple(
    File "/venv/main/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 251, in <genexpr>
      inp.clone() if isinstance(inp, torch.Tensor) else inp for inp in inputs
  torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacity of 23.64 GiB of which 832.00 KiB is free. Process 549389 has 23.63 GiB memory in use. Of the allocated memory 22.29 GiB is allocated by PyTorch, and 774.42 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
  
============================================================

In [23]:
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [11]:
os.environ["CUDA_VISIBLE_DEVICES"]

KeyError: 'CUDA_VISIBLE_DEVICES'

In [4]:
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Total GPUs: {torch.cuda.device_count()}")

for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)} - {torch.cuda.memory_allocated(i)/1024**3:.2f} GB allocated, {torch.cuda.memory_reserved(i)/1024**3:.2f} GB reserved")


PyTorch version: 2.6.0+cu124
CUDA available: True
Total GPUs: 2
GPU 0: NVIDIA GeForce RTX 4090 - 0.00 GB allocated, 0.00 GB reserved
GPU 1: NVIDIA GeForce RTX 4090 - 0.00 GB allocated, 0.00 GB reserved


In [5]:
torch.cuda.empty_cache()
import gc
gc.collect()


288