# Transcribe audio files in batch mode as fast as possible

In [3]:
from importlib.metadata import version

In [2]:
# Report the installed PyTorch distribution version.
version("torch")

'2.4.0'

In [None]:
pip install --upgrade pip

In [None]:
pip install --upgrade transformers accelerate

In [4]:
# Report the installed transformers distribution version.
version("transformers")

'4.45.1'

In [5]:
# Report the installed accelerate distribution version.
version("accelerate")

'0.34.2'

In [None]:
pip install --upgrade flash-attn --no-build-isolation

In [8]:
# Report the installed flash-attn distribution version.
version("flash_attn")

'2.6.3'

## Hugging Face Whisper

https://github.com/huggingface/speech-to-speech/blob/main/STT/whisper_stt_handler.py

https://huggingface.co/eustlb/distil-large-v3-fr

In [2]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Pick the execution target once: the first CUDA GPU in half precision when
# one is available, otherwise the CPU in full precision.
cuda_available = torch.cuda.is_available()
device = "cuda:0" if cuda_available else "cpu"
torch_dtype = torch.float16 if cuda_available else torch.float32
# FlashAttention-2 needs a CUDA device and fp16/bf16 weights, so requesting it
# unconditionally makes the CPU fallback above fail when the model is loaded.
# Fall back to PyTorch's scaled-dot-product attention ("sdpa") without a GPU.
attn_implementation = "flash_attention_2" if cuda_available else "sdpa"

# Load the speech-to-text checkpoint directly onto the selected device.
model_id = "eustlb/distil-large-v3-fr"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    use_safetensors=True,
    low_cpu_mem_usage=True,
    device_map=device,
    attn_implementation=attn_implementation,
)

# The processor bundles the tokenizer and feature extractor the pipeline needs.
processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline built around the already-loaded model;
# return_timestamps=True asks the pipeline to return timestamps with the text.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    return_timestamps=True,
    torch_dtype=torch_dtype,
)

# Warm-up: run one generation on a random tensor of shape
# (1, num_mel_bins, 3000) so one-time initialization cost is paid before the
# real transcriptions. The output is discarded.
dummy_input = torch.randn(
    (1, model.config.num_mel_bins, 3000), dtype=torch_dtype, device=device
)
_ = model.generate(dummy_input)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [4]:
# ./audio/2024-09-19 16-32-50.mp3
# - batch size 64, flash attention 2, no compile: 6.14 sec
# - batch size 64, sdpa attention, compile default with fullgraph: 18 sec (first test) / 34 sec (second test)
# => follow line by line the example of https://huggingface.co/eustlb/distil-large-v3-fr, it is the fastest combination

# Test with 2 mp3 files
# - small: 24 min 47 sec (1487 sec), 22.7 MB file
# - big : 1h 24 min 19 sec (5059 sec), 77.2 MB file
# Timings below are given as a pair: small file / big file

# Sequential long-form: pipe(mp3file)
# - gpu memory = 3.1 GB -> 27 sec / 94 sec

# Chunked long-form: pipe(mp3file, chunk_length_s=25, batch_size=xxx)
# batch size 1 : gpu memory = 3.1 GB -> 33 sec / 111 sec
# batch size 8 : gpu memory = 3.6 GB -> 13 sec / 46 sec
# batch size 16 : gpu memory = 4.4 GB -> 11 sec / 37 sec
# batch size 32 : gpu memory = 5.1 GB -> 9.5 sec / 34 sec
# batch size 64 : gpu memory = 7.2 GB -> 10.3 sec / 31 sec
# batch size 128 : gpu memory = 11 GB -> 9.3 sec / 33 sec
# Chunked attention: pipe(mp3file, chunk_length_s=25, batch_size=xxx)

# Sequential long-form algorithm + batch_size 2: pipe([mp3file1, mp3file2], batch_size=2)
# => RuntimeError: The expanded size of the tensor (505922) must match the existing size (148730) at non-singleton dimension 1.  Target sizes: [128, 505922].  Tensor sizes: [128, 148730]

# Transcribe one recording and show only the first 200 characters of the text.
result = pipe("./audio/2024-09-19 16-32-50.mp3")
text_preview = result["text"][:200]
print(text_preview)



 Ok, donc dans la première partie, on a fait beaucoup de choses, on s'est posé beaucoup de questions pour pouvoir cadrer, sélectionner, identifier des projets à Basse-Dia qui soient pertinents. La pha


In [6]:
# Transcribe one recording and show only the first 200 characters of the text.
result = pipe("./audio/2024-09-19 15-03-35.mp3")
text_preview = result["text"][:200]
print(text_preview)



 Ok. Donc, on poursuit notre énumération de tous les aspects à prendre en compte pour voir si un projet doit être fait, faisable et rentable, et raisonnable en matière d'environnement, etc. Donc, poin


In [None]:
# Transcribe both recordings in a single call.
# Without chunking, batching these two files raised
# "RuntimeError: The expanded size of the tensor ... must match the existing
# size ..." because the recordings have different lengths. Chunked long-form
# mode (chunk_length_s=25) is the mode this notebook benchmarked with batching;
# batch_size=64 was among the fastest settings measured.
results = pipe(
    ["./audio/2024-09-19 15-03-35.mp3", "./audio/2024-09-19 16-32-50.mp3"],
    chunk_length_s=25,
    batch_size=64,
)
for result in results:
    print(result["text"])

## NVIDIA NeMo ASR

https://developer.nvidia.com/blog/accelerating-leaderboard-topping-asr-models-10x-with-nvidia-nemo/

https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/asr/intro.html