In [27]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [28]:
!pip install jiwer



In [23]:
import pandas as pd
from datasets import Dataset, Audio, Value

# Paths
# CSV manifests; each is read for a 'filepath' and a 'transcript' column.
train_csv = "train.csv"
test_csv = "test.csv"
# Destination folders for the datasets written with `save_to_disk`.
output_train_dir = "/content/out_dir/train"
output_test_dir = "/content/out_dir/test"
# Prefix joined (with "/") in front of every 'filepath' entry of the manifests.
data_dir = "/content/drive/MyDrive"

# Function to prepare dataset
def prepare_dataset(csv_file, data_dir, output_dir):
    """Turn a CSV manifest into an audio dataset and save it to disk.

    Args:
        csv_file: CSV with a 'filepath' column (audio file, relative to
            ``data_dir``) and a 'transcript' column (reference text).
        data_dir: Prefix joined with "/" in front of every 'filepath' entry.
        output_dir: Folder the dataset is written to via ``save_to_disk``.
    """
    manifest = pd.read_csv(csv_file)

    columns = {
        "audio": [f"{data_dir}/{name}" for name in manifest['filepath']],
        "sentence": manifest['transcript'].tolist(),
    }

    # `Audio()` is given no sampling rate, so no target rate is requested at
    # this stage; the transcript column is stored as plain strings.
    audio_dataset = (
        Dataset.from_dict(columns)
        .cast_column("audio", Audio())
        .cast_column("sentence", Value("string"))
    )

    # The whole dataset is written in a single save call.
    audio_dataset.save_to_disk(output_dir)
    print(f'Data preparation done. Saved to {output_dir}')

# Build and save the training split
prepare_dataset(csv_file=train_csv, data_dir=data_dir, output_dir=output_train_dir)

# Build and save the test split
prepare_dataset(csv_file=test_csv, data_dir=data_dir, output_dir=output_test_dir)


Casting the dataset:   0%|          | 0/2162 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2162 [00:00<?, ? examples/s]

Data preparation done. Saved to /content/out_dir/train


Casting the dataset:   0%|          | 0/541 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/541 [00:00<?, ? examples/s]

Data preparation done. Saved to /content/out_dir/test


In [29]:
import torch
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from datasets import DatasetDict, Audio, load_from_disk, concatenate_datasets
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset

# Clear the GPU cache and limit torch to one CPU thread before anything is loaded.
torch.cuda.empty_cache()
torch.set_num_threads(1)

# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

# Configuration dictionary: every tunable of this training run in one place.
config = dict(
    # Checkpoint to fine-tune and the language handed to tokenizer/processor.
    model_name='openai/whisper-small.en',
    language='English',
    # Audio is cast to this sampling rate; preprocessing uses `num_proc` workers.
    sampling_rate=16000,
    num_proc=16,
    # 'steps' or 'epoch': drives both the evaluation and the save strategy.
    train_strategy='steps',
    learning_rate=1.75e-5,
    warmup=500,
    train_batchsize=4,
    eval_batchsize=4,
    # `num_epochs` is only read for the 'epoch' strategy, `num_steps` only for 'steps'.
    num_epochs=10,
    num_steps=500,
    resume_from_ckpt=None,
    output_dir='/content/out_dir',
    # Arrow files that are loaded and concatenated per split.
    train_datasets=['/content/out_dir/train/data-00000-of-00001.arrow'],
    eval_datasets=['/content/out_dir/test/data-00000-of-00001.arrow'],
)

# Fail fast on a strategy the training arguments further down cannot interpret.
if config['train_strategy'] != 'steps' and config['train_strategy'] != 'epoch':
    raise ValueError('The train strategy should be either "steps" or "epoch".')

# Model setup
# All four objects are loaded from the same checkpoint name. The standalone
# feature extractor and tokenizer are kept in addition to the processor; the
# preprocessing function and the collator reach them through
# `processor.feature_extractor` / `processor.tokenizer` or the globals below.
# NOTE(review): the checkpoint name ends in ".en", which presumably denotes an
# English-only model -- confirm that passing `language`/`task` to the tokenizer
# and processor is intended for such a checkpoint.
feature_extractor = WhisperFeatureExtractor.from_pretrained(config['model_name'])
tokenizer = WhisperTokenizer.from_pretrained(config['model_name'], language=config['language'], task="transcribe")
processor = WhisperProcessor.from_pretrained(config['model_name'], language=config['language'], task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(config['model_name'])

# Abort early if the loaded model config carries no decoder start token id.
if model.config.decoder_start_token_id is None:
    raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined.")

# Dataset preparation
def load_custom_dataset(split):
    """Load, merge and shuffle the Arrow files configured for one split.

    Args:
        split: 'train' selects ``config['train_datasets']``; any other value
            selects ``config['eval_datasets']``.

    Returns:
        The concatenation of all listed files, shuffled with seed 22.
    """
    if split == 'train':
        arrow_files = config['train_datasets']
    else:
        arrow_files = config['eval_datasets']

    merged = concatenate_datasets([Dataset.from_file(path) for path in arrow_files])
    return merged.shuffle(seed=22)

def prepare_dataset(batch):
    """Add model inputs and labels to a single example.

    Args:
        batch: One example with an "audio" entry (providing "array" and
            "sampling_rate") and a "sentence" entry holding the transcript.

    Returns:
        The same example with three added keys: "input_features" (first item
        of the feature extractor output), "input_length" (number of samples
        divided by the sampling rate) and "labels" (token ids of the transcript).
    """
    clip = batch["audio"]
    samples, rate = clip["array"], clip["sampling_rate"]

    extracted = feature_extractor(samples, sampling_rate=rate)
    batch["input_features"] = extracted.input_features[0]
    batch["input_length"] = len(samples) / rate
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    return batch

# Load both splits in a fixed order: train first, then eval.
raw_dataset = DatasetDict(
    {split: load_custom_dataset(split) for split in ('train', 'eval')}
)

print("Dataset size before filtering:")
print("Train set:", raw_dataset["train"].num_rows)
print("Eval set:", raw_dataset["eval"].num_rows)

# Cast the audio column to the configured sampling rate, then run the
# per-example preprocessing across `num_proc` worker processes.
raw_dataset = raw_dataset.cast_column(
    "audio", Audio(sampling_rate=config['sampling_rate'])
).map(prepare_dataset, num_proc=config['num_proc'])

# Data collator and metrics
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate pre-processed speech examples into one padded seq2seq batch.

    Attributes:
        processor: Object exposing ``feature_extractor.pad`` and
            ``tokenizer.pad`` (a ``WhisperProcessor`` in this notebook).
    """

    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build a batch from a list of examples.

        Args:
            features: Examples carrying "input_features" and "labels"; any
                other keys are ignored.

        Returns:
            Whatever ``feature_extractor.pad`` returns for the audio features,
            with a "labels" tensor added in which padded positions are -100.
        """
        # Pad input features (audio features)
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Pad the already-tokenized labels to a common length
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding with -100 to ignore these during loss computation
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels

        # No attention mask is synthesised here. Comparing the float audio
        # features with the tokenizer's pad token id does not identify padded
        # frames; it only yields a tensor shaped like `input_features`. If the
        # feature extractor itself returns an attention mask, it is already
        # part of `batch` and is passed through unchanged.
        return batch


# Collator instance handed to the trainer; it pads audio features and labels
# using the processor loaded above.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# Word-error-rate metric, used by `compute_metrics` below.
metric = evaluate.load("wer")
def compute_metrics(pred):
    """Compute the word error rate, in percent, for one evaluation pass.

    Args:
        pred: Object with ``predictions`` (generated token ids, or a tuple
            whose first element holds them) and ``label_ids`` (reference token
            ids in which ignored positions are marked with -100).

    Returns:
        ``{"wer": value}`` where ``value`` is 100 times the metric result.
    """
    import numpy as np

    pad_id = processor.tokenizer.pad_token_id

    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    # -100 cannot be decoded. Replace it with the pad id on fresh arrays so the
    # caller's `label_ids` is not modified in place; predictions get the same
    # treatment in case they were padded with -100 as well.
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    label_ids = np.where(pred.label_ids != -100, pred.label_ids, pad_id)

    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

# Training setup
training_args = Seq2SeqTrainingArguments(
    # Output and logging
    output_dir=config['output_dir'],
    report_to=["tensorboard"],
    logging_steps=500,
    # Schedule: one strategy drives both evaluation and checkpoint saving.
    # The budget that does not belong to the chosen strategy is set to -1
    # instead of None.
    evaluation_strategy=config['train_strategy'],
    save_strategy=config['train_strategy'],
    num_train_epochs=-1 if config['train_strategy'] != 'epoch' else config['num_epochs'],
    max_steps=-1 if config['train_strategy'] != 'steps' else config['num_steps'],
    resume_from_checkpoint=config['resume_from_ckpt'],
    # Optimisation
    learning_rate=config['learning_rate'],
    warmup_steps=config['warmup'],
    optim="adamw_bnb_8bit",
    fp16=True,
    per_device_train_batch_size=config['train_batchsize'],
    # Evaluation: generate transcriptions and keep the checkpoint with the
    # lowest "wer" as the best model.
    per_device_eval_batch_size=config['eval_batchsize'],
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    save_total_limit=10,
    # Dataset columns are left untouched; the collator picks the keys it needs.
    remove_unused_columns=False,
)


# Wire model, data, collator and metric into the trainer. The processor's
# feature extractor is what gets passed in the `tokenizer` slot.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=raw_dataset["train"],
    eval_dataset=raw_dataset["eval"],
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

# Store the processor in the training output directory before training starts.
processor.save_pretrained(training_args.output_dir)
print('TRAINING IN PROGRESS...')
# The checkpoint to resume from must be handed to `train()` itself; setting it
# only on the training arguments does not make the trainer resume. With the
# default `None` this is a normal fresh run.
trainer.train(resume_from_checkpoint=config['resume_from_ckpt'])
print('DONE TRAINING')


Using device: cuda
Dataset size before filtering:
Train set: 2162
Eval set: 541


  trainer = Seq2SeqTrainer(


TRAINING IN PROGRESS...


Step,Training Loss,Validation Loss,Wer
500,0.5877,0.104173,4.549075


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


DONE TRAINING


In [32]:
# Write the model and every preprocessing component into one folder, in the
# order: model, processor, tokenizer, feature extractor.
save_path = f"{config['output_dir']}/final_model"
for _component in (model, processor, tokenizer, feature_extractor):
    _component.save_pretrained(save_path)
print(f"Model saved successfully at {save_path}")

# Inference: reload the saved model and transcribe an audio file
from transformers import pipeline, WhisperFeatureExtractor

# Load saved model components from the folder written above
saved_model_path = save_path
processor = WhisperProcessor.from_pretrained(saved_model_path)
feature_extractor = WhisperFeatureExtractor.from_pretrained(saved_model_path)
model = WhisperForConditionalGeneration.from_pretrained(saved_model_path).to(device)

# Create ASR pipeline with the explicitly loaded feature extractor.
# The pipeline runs on the same `device` the model was moved to, so the cell
# also works when no GPU is available instead of always requesting GPU 0.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=feature_extractor,
    device=device,
)

# Transcribe audio function
def transcribe_audio(file_path):
    """Run the ASR pipeline on one audio file and print its transcription.

    Args:
        file_path: Path of the audio file handed to ``asr_pipeline``.
    """
    transcription = asr_pipeline(file_path)
    print(f"Transcription for {file_path}:", transcription["text"], sep="\n")

# Example: transcribe a single file (point this at a real audio file)
audio_file = "/harvard.wav"  # Replace with actual audio file path
transcribe_audio(file_path=audio_file)


Model saved successfully at /content/out_dir/final_model


Device set to use cuda:0


Transcription for /harvard.wav:
 The stale smell of old beer lingers it takes heat to bring out the odor a cold dip restores health and zest a salt pickle tastes fine with ham tacos al pastor are my favorite a zestful food is the hot cross bun


In [35]:
# Mount Google Drive (optional, if your folder is in Google Drive)
from google.colab import drive
drive.mount('/content/drive')

# Set your folder path
folder_path = '/content/out_dir/final_model'  # Replace with your actual folder path
zip_file = '/content/stt_whisper.zip'  # The zip file will be saved here

# Zip the folder
!zip -r {zip_file} {folder_path}

# Download the zipped folder
from google.colab import files
files.download(zip_file)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
  adding: content/out_dir/final_model/ (stored 0%)
  adding: content/out_dir/final_model/added_tokens.json (deflated 80%)
  adding: content/out_dir/final_model/model.safetensors (deflated 8%)
  adding: content/out_dir/final_model/tokenizer_config.json (deflated 96%)
  adding: content/out_dir/final_model/special_tokens_map.json (deflated 80%)
  adding: content/out_dir/final_model/generation_config.json (deflated 71%)
  adding: content/out_dir/final_model/preprocessor_config.json (deflated 42%)
  adding: content/out_dir/final_model/normalizer.json (deflated 81%)
  adding: content/out_dir/final_model/config.json (deflated 59%)
  adding: content/out_dir/final_model/vocab.json (deflated 68%)
  adding: content/out_dir/final_model/merges.txt (deflated 53%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>