In [5]:
import os
import torch
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from datasets import load_dataset, Audio

# Checkpoint to fine-tune: the "large" Whisper release on the Hugging Face Hub.
model_name = "openai/whisper-large"

# Download the processor (its feature extractor and tokenizer are used further
# down) and the seq2seq model weights for that checkpoint.
processor = WhisperProcessor.from_pretrained(
    pretrained_model_name_or_path=model_name,
)
model = WhisperForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

model.safetensors:  35%|###4      | 2.14G/6.17G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.85k [00:00<?, ?B/s]

In [23]:
import pandas as pd
from pathlib import Path

In [25]:
# Directory that is scanned for the training CSV files.
train_path = Path('../Data/dataset/train')
files = os.listdir(train_path)

# Keep only names with a ".csv" extension (a bare 'csv' suffix would also match
# names such as "notescsv"), and sort them: os.listdir returns entries in
# arbitrary order, so sorting keeps the processing order stable between runs.
train_csv = sorted(file for file in files if file.endswith('.csv'))

In [26]:
# Read every training CSV (the loop variable, not always the first file),
# keeping only the two columns needed downstream.
# Transcripts are read as text: the values shown for this data are all-digit
# strings, which pandas would otherwise parse as integers (dropping any leading
# zeros).
frames = [
    pd.read_csv(train_path / file, dtype={'transcript': str})[['audio_filepath', 'transcript']]
    for file in train_csv
]

# Combine all files into one frame instead of overwriting `df` on each pass.
# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when no CSV was found.
if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=['audio_filepath', 'transcript'])

# rename() needs a mapping {old: new}; a set literal here raises TypeError.
df = df.rename(columns={'audio_filepath': 'audio_path'})

Unnamed: 0,audio,audio_filepath,speaker,transcript
0,1150481435_413579_413580_25314,../Data/audio/bcd/voices/1150481435_413579_413...,1150481435,1671726558297919
1,1150481435_413589_413590_25232,../Data/audio/bcd/voices/1150481435_413589_413...,1150481435,1233482219165815
2,1150481435_413655_413656_25384,../Data/audio/bcd/voices/1150481435_413655_413...,1150481435,1459998773472568
3,1150481435_413659_413660_25161,../Data/audio/bcd/voices/1150481435_413659_413...,1150481435,8634735591936473
4,1150481435_413768_413769_25493,../Data/audio/bcd/voices/1150481435_413768_413...,1150481435,1435789229821271
...,...,...,...,...
639,1347919237_485335_485342_26032,../Data/audio/bcd/voices/1347919237_485335_485...,1347919237,9726787326957793
640,1347919237_485394_485399_25496,../Data/audio/bcd/voices/1347919237_485394_485...,1347919237,8644274957911231
641,1009209372_485440_485443_25229,../Data/audio/bcd/voices/1009209372_485440_485...,1009209372,5178644254884757
642,1009209372_485613_485614_25909,../Data/audio/bcd/voices/1009209372_485613_485...,1009209372,7315883563164141


In [None]:
# Split name -> CSV file. Each CSV needs at least an "audio_path" column and a
# "transcript" column.
data_files = {"train": "train.csv", "validation": "val.csv"}

# Load the CSV splits, then re-type "audio_path" as an Audio column at the
# desired sampling rate.
dataset = load_dataset("csv", data_files=data_files).cast_column(
    "audio_path",
    Audio(sampling_rate=16000),
)

In [None]:
def prepare_example(batch):
    """Turn one dataset example into model inputs and labels.

    Args:
        batch: A single example. ``batch["audio_path"]["array"]`` holds the
            decoded audio samples and ``batch["transcript"]`` the target text.

    Returns:
        The same ``batch`` object with two entries added: ``"input_features"``
        (first element of the feature extractor's ``input_features``) and
        ``"labels"`` (the tokenizer's ``input_ids`` for the transcript).

    Raises:
        ValueError: If the example has no transcript.
    """
    # "audio_path" was cast to an Audio column, so it is a mapping with an
    # "array" key rather than a plain path string.
    audio = batch["audio_path"]["array"]

    transcript = batch["transcript"]
    if transcript is None:
        raise ValueError("example has no transcript")
    # All-digit transcripts are inferred as integers when loaded from CSV, but
    # the tokenizer is meant to receive text, so coerce to str (no-op for str).
    transcript = str(transcript)

    # The extractor returns one entry per audio sample; a single clip is passed,
    # so take the first entry.
    input_features = processor.feature_extractor(audio, sampling_rate=16000).input_features[0]

    # Token ids for the target text.
    labels = processor.tokenizer(transcript).input_ids

    batch["input_features"] = input_features
    batch["labels"] = labels
    return batch

# Run prepare_example over every example. The columns present before mapping
# are listed in remove_columns so they are dropped from the result and the
# model receives only what it needs.
source_columns = dataset["train"].column_names
dataset = dataset.map(prepare_example, remove_columns=source_columns)

def data_collator(features):
    """Collate preprocessed examples into one padded training batch.

    Args:
        features: List of examples, each with an ``"input_features"`` entry and
            a ``"labels"`` entry (token ids), as produced by ``prepare_example``.

    Returns:
        dict with ``"input_features"`` (padded by the feature extractor) and
        ``"labels"`` (padded token ids, with padded positions set to -100).
    """
    # pad() expects entries keyed by the model input name, not bare arrays;
    # passing the raw list makes the feature extractor reject the input.
    audio_inputs = [{"input_features": f["input_features"]} for f in features]
    batch_inputs = processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

    label_inputs = [{"input_ids": f["labels"]} for f in features]
    labels_batch = processor.tokenizer.pad(label_inputs, return_tensors="pt")

    # Set padded positions to -100 so the loss ignores them. The attention mask
    # is used instead of comparing against the pad id, because the pad id may
    # coincide with a real token (e.g. end-of-text) that must stay in the labels.
    batch_labels = labels_batch["input_ids"].masked_fill(
        labels_batch["attention_mask"].ne(1), -100
    )

    # The model prepends the decoder start token itself when it derives decoder
    # inputs from the labels, so drop it here if the tokenizer already added it
    # to every sequence.
    start_id = model.config.decoder_start_token_id
    if (
        start_id is not None
        and batch_labels.size(1) > 0
        and bool((batch_labels[:, 0] == start_id).all())
    ):
        batch_labels = batch_labels[:, 1:]

    return {"input_features": batch_inputs["input_features"], "labels": batch_labels}

# Training configuration.
# Adjust the batch sizes, learning rate, number of epochs, etc. to your needs.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetuned",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # NOTE(review): newer transformers releases spell this `eval_strategy` --
    # confirm against the installed version.
    evaluation_strategy="steps",
    num_train_epochs=3,
    # Half precision needs a CUDA device; enabling it unconditionally makes the
    # arguments fail on a CPU-only machine, so turn it on only when CUDA exists.
    fp16=torch.cuda.is_available(),
    # Checkpoint and evaluate on the same step schedule.
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    learning_rate=1e-5,
    # Evaluate with generate() rather than teacher-forced logits.
    predict_with_generate=True,
    logging_dir="./logs",
)

# Wire the model, training arguments, both dataset splits and the collator
# into a sequence-to-sequence trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    tokenizer=processor.tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
)

# Start fine-tuning only when this code runs as the main module.
if __name__ == "__main__":
    trainer.train()

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

KeyboardInterrupt: 