In [1]:
import numpy as np
import torch
import torchaudio
from datasets import load_dataset, Dataset, concatenate_datasets
import whisper
from whisper.audio import log_mel_spectrogram
from whisper.tokenizer import get_tokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_size = "tiny"
model = whisper.load_model(model_size) 
tokenizer = get_tokenizer(model_size, language="en")

dataset = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    "en",
    split="train",
    trust_remote_code=True
)
dataset = dataset.select(range(100)) 

In [3]:
from datasets import load_dataset, Dataset, concatenate_datasets, Audio, Features, Value, Sequence

dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

dataset = dataset.select(range(100))

In [4]:
def generate_silence_example():

    return np.zeros(16000, dtype=np.float32)

In [11]:
from transformers import TrainingArguments, Trainer

def preprocess_function(batch):

    if "input_features" in batch and "labels" in batch and "is_silent" in batch and "audio" not in batch:

        return {
            "input_features": batch["input_features"],
            "labels": batch["labels"],
            "is_silent": batch["is_silent"]
        }

    processed_examples = {
        "input_features": [],
        "labels": [],
        "is_silent": []
    }


    transcript_key = None
    for key in ["segment", "sentence", "text"]:
        if key in batch:
            transcript_key = key
            break

    if transcript_key is None:
        raise KeyError(
            "No transcript key found in batch. Available keys: " + ", ".join(batch.keys()))

    for audio_item, transcript in zip(batch["audio"], batch[transcript_key]):

        if isinstance(audio_item, dict):
            audio = audio_item.get("array")
            sampling_rate = audio_item.get("sampling_rate")
        elif isinstance(audio_item, str):
            waveform, sampling_rate = torchaudio.load(audio_item)
            audio = waveform.squeeze(0).numpy()
        elif isinstance(audio_item, np.ndarray):
            audio = audio_item
            sampling_rate = 16000 
        elif audio_item is None:
          
            audio = np.zeros(16000, dtype=np.float32)
            sampling_rate = 16000
        else:
            raise ValueError(
                f"Unexpected audio format: type {type(audio_item)}")

      
        audio = np.array(audio, dtype=np.float32)

       
        if sampling_rate != 16000:
            waveform = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)
            resampler = torchaudio.transforms.Resample(
                orig_freq=sampling_rate, new_freq=16000)
            waveform = resampler(waveform)
            audio = waveform.squeeze(0).numpy()
            sampling_rate = 16000

       
        input_features = log_mel_spectrogram(audio)
        processed_examples["input_features"].append(input_features)


        tokenized = tokenizer.encode(transcript)
        processed_examples["labels"].append(tokenized)

        processed_examples["is_silent"].append(np.abs(audio).mean() < 1e-4)

    return processed_examples



if "input_features" in dataset.column_names and "audio" not in dataset.column_names:
    print("Dataset already processed, skipping preprocessing step")

    columns_to_keep = ["input_features", "labels", "is_silent"]
    dataset = dataset.select_columns(columns_to_keep)
else:

    dataset = dataset.map(preprocess_function, batched=True)

    columns_to_keep = ["input_features", "labels", "is_silent"]
    dataset = dataset.select_columns(columns_to_keep)

silence_examples = []
for _ in range(10):  
    silence_audio = {"array": generate_silence_example(),
                     "sampling_rate": 16000}
    silence_examples.append({"audio": silence_audio, "segment": ""})

silence_dataset = Dataset.from_dict({
    "audio": [ex["audio"] for ex in silence_examples],
    "segment": [ex["segment"] for ex in silence_examples],
})
silence_dataset = silence_dataset.map(preprocess_function, batched=True)

silence_dataset = silence_dataset.select_columns(
    ["input_features", "labels", "is_silent"])


processed_features = Features({
    "input_features": Sequence(feature=Sequence(feature=Value("float32"))),
    "labels": Sequence(Value("int64")),
    "is_silent": Value("bool")
})

dataset = dataset.cast(processed_features)
silence_dataset = silence_dataset.cast(processed_features)
combined_dataset = concatenate_datasets([dataset, silence_dataset])


def data_collator(batch):
    input_features = [item["input_features"] for item in batch]
    labels = [torch.tensor(item["labels"]) for item in batch]

    max_time = max(feat.shape[1] for feat in input_features)
    padded_features = []
    for feat in input_features:
        pad_len = max_time - feat.shape[1]
        if pad_len > 0:
            pad = torch.zeros((feat.shape[0], pad_len))
            padded_feat = torch.cat([feat, pad], dim=1)
        else:
            padded_feat = feat
        padded_features.append(padded_feat)
    padded_features = torch.stack(padded_features)

    max_label_len = max(label.size(0) for label in labels)
    padded_labels = []
    for label in labels:
        pad_len = max_label_len - label.size(0)
        if pad_len > 0:
            padded_label = torch.cat(
                [label, torch.full((pad_len,), -100, dtype=label.dtype)])
        else:
            padded_label = label
        padded_labels.append(padded_label)
    padded_labels = torch.stack(padded_labels)

    is_silent = [item["is_silent"] for item in batch]

    return {
        "input_features": padded_features,
        "labels": padded_labels,
        "is_silent": is_silent,
    }

# -----------------------------
# 7. Define a custom Trainer for Whisper
# -----------------------------


   

Dataset already processed, skipping preprocessing step


Map: 100%|██████████| 10/10 [00:00<00:00, 213.46 examples/s]
Casting the dataset: 100%|██████████| 100/100 [00:00<00:00, 2297.85 examples/s]
Casting the dataset: 100%|██████████| 10/10 [00:00<00:00, 4916.54 examples/s]


In [22]:

from transformers import TrainingArguments


class CustomWhisperTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # Forward pass: note that your inputs should be formatted correctly for the Whisper model.
        outputs = model(**inputs)
        # Assuming the model returns a loss attribute in its outputs
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss


training_args = TrainingArguments(
    output_dir="./whisper-tiny-finetuned",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    num_train_epochs=3,
    save_strategy="steps",
    evaluation_strategy="no",
    logging_steps=10,
    save_steps=50,
)

# Initialize the Trainer
trainer = CustomWhisperTrainer(
    model=model,
    args=training_args,
    train_dataset=combined_dataset,
    data_collator=data_collator,  
)

# Start fine-tuning
trainer.train()

NotImplementedError: Could not run 'aten::_sparse_coo_tensor_with_dims_and_tensors' with arguments from the 'SparseMPS' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'aten::_sparse_coo_tensor_with_dims_and_tensors' is only available for these backends: [MPS, Meta, SparseCPU, SparseMeta, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradHIP, AutogradXLA, AutogradMPS, AutogradIPU, AutogradXPU, AutogradHPU, AutogradVE, AutogradLazy, AutogradMTIA, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, AutogradMeta, AutogradNestedTensor, Tracer, AutocastCPU, AutocastXPU, AutocastMPS, AutocastCUDA, FuncTorchBatched, BatchedNestedTensor, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PreDispatch, PythonDispatcher].

MPS: registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/mps/MPSFallback.mm:78 [backend fallback]
Meta: registered at /Users/runner/work/pytorch/pytorch/pytorch/build/aten/src/ATen/RegisterMeta.cpp:27006 [kernel]
SparseCPU: registered at /Users/runner/work/pytorch/pytorch/pytorch/build/aten/src/ATen/RegisterSparseCPU.cpp:1407 [kernel]
SparseMeta: registered at /Users/runner/work/pytorch/pytorch/pytorch/build/aten/src/ATen/RegisterSparseMeta.cpp:291 [kernel]
BackendSelect: registered at /Users/runner/work/pytorch/pytorch/pytorch/build/aten/src/ATen/RegisterBackendSelect.cpp:792 [kernel]
Python: registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:194 [backend fallback]
FuncTorchDynamicLayerBackMode: registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/functorch/DynamicLayer.cpp:503 [backend fallback]
Functionalize: registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/FunctionalizeFallbackKernel.cpp:349 [backend fallback]
Named: registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/core/NamedRegistrations.cpp:7 [backend fallback]
Conjugate: registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/ConjugateFallback.cpp:17 [backend fallback]
Negative: registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/native/NegateFallback.cpp:18 [backend fallback]
ZeroTensor: registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/ZeroTensorFallback.cpp:86 [backend fallback]
ADInplaceOrView: fallthrough registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:100 [backend fallback]
AutogradOther: registered at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/generated/VariableType_2.cpp:20142 [autograd kernel]
AutogradCPU: registered at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/generated/VariableType_2.cpp:20142 [autograd kernel]
AutogradCUDA: registered at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/generated/VariableType_2.cpp:20142 [autograd kernel]
AutogradHIP: registered at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/generated/VariableType_2.cpp:20142 [autograd kernel]
AutogradXLA: registered at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/generated/VariableType_2.cpp:20142 [autograd kernel]
AutogradMPS: registered at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/generated/VariableType_2.cpp:20142 [autograd kernel]
AutogradIPU: registered at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/generated/VariableType_2.cpp:20142 [autograd kernel]
AutogradXPU: registered at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/generated/VariableType_2.cpp:20142 [autograd kernel]
AutogradHPU: registered at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/generated/VariableType_2.cpp:20142 [autograd kernel]
AutogradVE: registered at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/generated/VariableType_2.cpp:20142 [autograd kernel]
AutogradLazy: registered at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/generated/VariableType_2.cpp:20142 [autograd kernel]
AutogradMTIA: registered at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/generated/VariableType_2.cpp:20142 [autograd kernel]
AutogradPrivateUse1: registered at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/generated/VariableType_2.cpp:20142 [autograd kernel]
AutogradPrivateUse2: registered at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/generated/VariableType_2.cpp:20142 [autograd kernel]
AutogradPrivateUse3: registered at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/generated/VariableType_2.cpp:20142 [autograd kernel]
AutogradMeta: registered at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/generated/VariableType_2.cpp:20142 [autograd kernel]
AutogradNestedTensor: registered at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/generated/VariableType_2.cpp:20142 [autograd kernel]
Tracer: registered at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/generated/TraceType_2.cpp:17801 [kernel]
AutocastCPU: fallthrough registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/autocast_mode.cpp:322 [backend fallback]
AutocastXPU: fallthrough registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/autocast_mode.cpp:465 [backend fallback]
AutocastMPS: fallthrough registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/autocast_mode.cpp:209 [backend fallback]
AutocastCUDA: fallthrough registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/autocast_mode.cpp:165 [backend fallback]
FuncTorchBatched: registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:731 [backend fallback]
BatchedNestedTensor: registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:758 [backend fallback]
FuncTorchVmapMode: fallthrough registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/functorch/VmapModeRegistrations.cpp:27 [backend fallback]
Batched: registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/LegacyBatchingRegistrations.cpp:1075 [backend fallback]
VmapMode: fallthrough registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]
FuncTorchGradWrapper: registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/functorch/TensorWrapper.cpp:207 [backend fallback]
PythonTLSSnapshot: registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:202 [backend fallback]
FuncTorchDynamicLayerFrontMode: registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/functorch/DynamicLayer.cpp:499 [backend fallback]
PreDispatch: registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:206 [backend fallback]
PythonDispatcher: registered at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:198 [backend fallback]


In [None]:
import json
export_dir = "./whisper-tiny-finetuned"
torch.save(model.state_dict(), f"{export_dir}/pytorch_model.bin")

# Save tokenizer information manually.
tokenizer_config = {
    "vocab_size": tokenizer.vocab_size,
    "language": "en",
    # You can add more tokenizer settings if needed.
}
with open(f"{export_dir}/tokenizer_config.json", "w") as f:
    json.dump(tokenizer_config, f)

print(f"Model and tokenizer configuration exported to {export_dir}")