### Setup

In [1]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
print(torch_dtype)

cuda:0
torch.float16


### Attempt at Optimizing the code

In [53]:
model_id = "distil-whisper/distil-large-v2"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device).eval()

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0): WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
     

In [73]:
from datasets import load_dataset
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

# Load the dataset
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

# Get a sample from the dataset
sample = dataset[0]["audio"]
audio_array = sample["array"]
sampling_rate = sample["sampling_rate"]

# Load the model and processor
# model_id = "distil-whisper/distil-large-v2"
# model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# Process the audio
inputs = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt", torch_dtype=torch_dtype)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [74]:
print(inputs)

{'input_features': tensor([[[ 1.1933e-01, -9.4576e-02, -1.0978e-01,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01],
         [ 4.9347e-04, -8.9271e-02, -6.7290e-02,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01],
         [-1.5326e-01, -2.0804e-01, -2.2227e-01,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01],
         ...,
         [-8.0603e-01, -8.0603e-01, -7.9997e-01,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01],
         [-8.0603e-01, -7.7211e-01, -8.0603e-01,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01],
         [-8.0603e-01, -8.0603e-01, -8.0603e-01,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01]]])}


In [75]:
inputs['input_features'].shape, inputs['input_features'].type

(torch.Size([1, 80, 3000]), <function Tensor.type>)

In [76]:
inputs['input_features'] = inputs['input_features'].type(torch_dtype)

In [32]:
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
sample = dataset[0]["audio"]
print(sample)

{'path': '/home/sergiu/.cache/huggingface/datasets/downloads/extracted/1a60ccb5a1766c398d19189d3cde6df8fd01ce2e8102d80a12f5a9e732f89e55/dev_clean/1272/128104/1272-128104-0000.flac', 'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
       0.0010376 ]), 'sampling_rate': 16000}


In [78]:
%%timeit
# Inference
model.eval()
with torch.no_grad():
    outputs = model.generate(**inputs.to(device))

101 ms ± 1.35 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Working Code

In [51]:
from datasets import load_dataset
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
import torch

# Load the dataset
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

# Get a sample from the dataset
sample = dataset[0]["audio"]
audio_array = sample["array"]
sampling_rate = sample["sampling_rate"]

# Load the model and processor
model_id = "distil-whisper/distil-large-v2"  # Replace with your desired Whisper model ID
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Process the audio
inputs = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt")

# Move tensors to the right device
input_values = inputs.to(device)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


 Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.


In [59]:
inputs['input_features'].shape

torch.Size([1, 80, 585])

In [52]:
%%timeit
# Inference
model.eval()
with torch.no_grad():
    outputs = model.generate(**input_values.to(device))

206 ms ± 1.22 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Decode the model output
transcription = processor.batch_decode(outputs, skip_special_tokens=True)

# Print the transcription
print(transcription[0])