We will use a rolling audio buffer with a limited duration:

In [1]:
import numpy as np
import time

class RollingAudioBuffer:
    def __init__(self, max_length_sec, sampling_rate):
        self.buffer = np.empty((0,), dtype=np.float32)
        self.sampling_rate = sampling_rate
        self.max_length = max_length_sec * sampling_rate

    def add_samples(self, np_samples):
        self.buffer = np.concatenate((self.buffer, np_samples))
        if len(self.buffer) > self.max_length:
            self.buffer = self.buffer[len(self.buffer)-self.max_length:]

Load test data:

In [2]:
audiobuffer = RollingAudioBuffer(max_length_sec=25, sampling_rate=48000)
audiobuffer.buffer = np.load("test_en.npy")

# 1. Huggingface automatic-speech-recognition pipeline

## Install prerequisites

This dependency is necessary for whisper with Huggingface, but is not necessary for whisper with faster-whisper

> apt install ffmpeg

In [12]:
!ffmpeg -version

ffmpeg version 4.3 Copyright (c) 2000-2020 the FFmpeg developers
built with gcc 7.3.0 (crosstool-NG 1.23.0.449-a04d0)
configuration: --prefix=/opt/conda/conda-bld/ffmpeg_1597178665428/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placeh --cc=/opt/conda/conda-bld/ffmpeg_1597178665428/_build_env/bin/x86_64-conda_cos6-linux-gnu-cc --disable-doc --disable-openssl --enable-avresample --enable-gnutls --enable-hardcoded-tables --enable-libfreetype --enable-libopenh264 --enable-pic --enable-pthreads --enable-shared --disable-static --enable-version3 --enable-zlib --enable-libmp3lame
libavutil      56. 51.100 / 56. 51.100
libavcodec     58. 91.100 / 58. 91.100
libavformat    58. 45.100 / 58. 45.100
libavdevice    58. 10.100 / 58. 10.100
libavfilter     7. 85.100 /  7. 85.100
libavresample   4.  0.  0 /  4.  0.  0
libswscale      5.  7.1

In [6]:
import importlib

In [None]:
pip install transformers

In [5]:
importlib.metadata.version('transformers')

'4.42.3'

In [None]:
pip install flash_attn

In [8]:
importlib.metadata.version('flash_attn')

'2.5.9.post1'

In [None]:
pip install accelerate

In [7]:
importlib.metadata.version('accelerate')

'0.32.1'

## Load model and create pipeline

In [None]:
from transformers import pipeline
import torch

transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small", 
                       model_kwargs={"torch_dtype":torch.float16, "attn_implementation":"flash_attention_2", "device_map":0}, 
                       generate_kwargs = {"task":"transcribe", "language":"english"})

transcriber.model.model = torch.compile(transcriber.model.model)

In [6]:
transcriber({"sampling_rate": audiobuffer.sampling_rate, "raw": audiobuffer.buffer})["text"]

" So I am on a website to try and find a job and I had to answer to some questions and I had to say what I was studying and I had some difficulties to find the good topic and the... voilà. Y'a trouvé pas? Ouais, j'ai"

In [8]:
transcriber({"sampling_rate": audiobuffer.sampling_rate/4, "raw": audiobuffer.buffer[::4]})["text"]

' So I am on a website to try and find a job and I had to answer to some questions and I had to say what I was studying and I had some difficulties to find the good topic and the... voilà!'

## Performance

whisper-small
- basic huggingface pipeline: 5.92 sec
- with 16 bits & flash attention: 2.42 sec
- and with torch.compile: 2.25 sec
- then plugged: 1.07 sec
- then divide sampling rate by 4: 800-900 ms

# 2. Systran faster-whisper with ctranslate2

## Install prerequisites

In [25]:
import importlib

In [None]:
pip install faster-whisper

In [26]:
importlib.metadata.version('faster-whisper')

'1.0.3'

## Load model and create pipeline

In [9]:
from faster_whisper import WhisperModel

# distill-large-v3 : english only, 1.5GB, 750 ms
# model_size = "tiny" # 20 sec -> 1.24 sec - too many errors
# model_size = "base" # 20 sec -> 1.72 sec vs 3 sec with huggingface
# model_size = "small" # 20 sec -> 4.18 sec
# model_size = "large-v3" # 20 sec -> 16.33 sec

model = WhisperModel("small", device="cuda", compute_type="float16")

In [12]:
segments, info = model.transcribe(audiobuffer.buffer[::4], beam_size=5, language="en", condition_on_previous_text=False, vad_filter=True, vad_parameters=dict(min_silence_duration_ms=500))

for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

[0.08s -> 7.44s]  So I am on a website to try and find a job and I had to answer to some questions
[7.44s -> 14.24s]  And I had to say what I was studying and I had some difficulties to find the good the topic and
[15.92s -> 18.46s]  Yeah


## Performance

whisper-small
- sampling rate divided by 4: 618 ms

# 3. Translation with Helsinki-NLP/opus-mt

## Install prerequisites

In [17]:
import importlib

In [None]:
pip install sentencepiece

In [3]:
importlib.metadata.version('sentencepiece')

'0.2.0'

In [None]:
pip install sacremoses

In [18]:
importlib.metadata.version('sacremoses')

'0.1.1'

## Load model and create pipeline

In [19]:
from transformers import MarianMTModel, MarianTokenizer

# Load model and tokenizer
model_name = "Helsinki-NLP/opus-mt-en-fr"
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name, )

# Prepare the input text
def translate_texts(src_texts):
    encoded = tokenizer(src_texts, return_tensors="pt", padding=True, truncation=True)
    
    # Perform the translation
    translated = model.generate(**encoded)
    tgt_texts = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

    return tgt_texts

In [23]:
# Test the translations
src_texts = ["So I am on a website to try and find a job and I had to answer to some questions and I had to say what I was studying and I had some difficulties to find the good topic and the... voilà!"]
tgt_texts = translate_texts(src_texts)

for src, tgt in zip(src_texts, tgt_texts):
    print(f"Source: {src}")
    print(f"Translated: {tgt}")

Source: So I am on a website to try and find a job and I had to answer to some questions and I had to say what I was studying and I had some difficulties to find the good topic and the... voilà!
Translated: Donc je suis sur un site web pour essayer de trouver un emploi et j'ai dû répondre à certaines questions et j'ai dû dire ce que j'étudiais et j'ai eu quelques difficultés à trouver le bon sujet et le... voilà!


## Performance

opus-mt-en-fr
- 592 ms

Note: MarianMTModel doesn't support flash attention

# 4. Real-time speech transcription UI with gradio

## Install prerequisites

In [None]:
pip install gradio

In [24]:
importlib.metadata.version('gradio')

'4.37.2'

## Display UI

In [26]:
import gradio as gr

audiobuffer = RollingAudioBuffer(max_length_sec=25, sampling_rate=48000)

def transcribe(audio):
    # Get audio stream from gr.Audio component: int16
    sampling_rate, np_samples = audio
    print(sampling_rate, np_samples.shape)

    # Convert audio stream to whisper format: fp32 between -1 and 1
    np_samples = np_samples.astype(np.float32)
    np_samples /= np.max(np.abs(np_samples))

    # Accumulate audio samples in buffer
    audiobuffer.add_samples(np_samples)
    
    # Convert audio stream to text
    start_time = time.time()
    text = transcriber({"sampling_rate": sampling_rate, "raw": audiobuffer.buffer})["text"]
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Transcription executed in {elapsed_time*1000:.2f} ms")
    
    return text


demo = gr.Interface(
    transcribe,
    gr.Audio(sources=["microphone"], streaming=True),
    "text",
    live=True,
)

demo.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




48000 (24000,)




Transcription executed in 266.19 ms
48000 (24000,)
Transcription executed in 140.58 ms
48000 (24000,)
Transcription executed in 120.30 ms
48000 (24000,)
Transcription executed in 182.59 ms
48000 (24000,)
Transcription executed in 184.64 ms


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


48000 (24000,)
Transcription executed in 237.20 ms
48000 (48000,)
Transcription executed in 268.52 ms
48000 (24000,)
Transcription executed in 296.22 ms
48000 (48000,)
Transcription executed in 297.93 ms
48000 (24000,)
Transcription executed in 336.71 ms
48000 (48000,)
Transcription executed in 416.45 ms
48000 (24000,)
Transcription executed in 380.45 ms
48000 (48000,)
Transcription executed in 413.32 ms
48000 (48000,)
Transcription executed in 573.85 ms
48000 (48000,)
Transcription executed in 528.43 ms
48000 (48000,)
Transcription executed in 562.93 ms
48000 (48000,)
Transcription executed in 664.57 ms
48000 (48000,)
Transcription executed in 691.68 ms
48000 (48000,)
Transcription executed in 687.75 ms
48000 (48960,)
Transcription executed in 495.76 ms


In [13]:
np.save("test_en.npy", audiobuffer.buffer)