# Real-time speech transcription

## Audio buffer management

The audio samples for real-time speech transcription are accumulated in a rolling audio buffer with a limited duration (at most 30 seconds for Whisper).

You don't need to understand the code below; just execute the cell so that the class is available in the next steps.

In [1]:
import numpy as np
import time

# 1. Aggregates audio samples in a buffer and keeps only the last max_length_sec seconds.
# 2. Converts the audio format from input to output sampling rate and dtype.
class RollingAudioBuffer:
    """Rolling buffer of audio samples converted to the output audio format.

    Incoming chunks are converted from the input sampling rate / dtype to the
    output sampling rate / dtype, appended to ``output_buffer``, and the buffer
    is then trimmed so that it never holds more than ``max_length_sec`` seconds
    of audio (measured at the output sampling rate).

    Supported conversions:
      * sampling rate: 48000 -> 16000, or identical input and output rates;
      * dtype: np.int16 -> np.float32, or identical input and output dtypes.
    """

    # Default max length (whisper): 30 sec
    # Default input audio format (gradio): 48 kHz 16 bits int
    # Default output audio format (whisper): 16 kHz 32 bits float
    def __init__(self, max_length_sec=30, input_sampling_rate=48000, input_dtype=np.int16, output_sampling_rate=16000, output_dtype=np.float32):
        self.input_sampling_rate = input_sampling_rate
        self.input_dtype = input_dtype
        self.output_sampling_rate = output_sampling_rate
        self.output_dtype = output_dtype

        self.output_buffer = np.empty((0,), dtype=output_dtype)
        # int(): a fractional max_length_sec would otherwise give a float,
        # which cannot be used as a slice index when trimming the buffer.
        self.max_buffer_length = int(max_length_sec * output_sampling_rate)

    def append_input_samples(self, input_samples):
        """Convert a chunk of input samples and append it to the rolling buffer.

        Args:
            input_samples: numpy array recorded with ``input_sampling_rate``
                and ``input_dtype``.

        Raises:
            TypeError: if the configured sampling rate or dtype conversion is
                not one of the supported combinations.

        Note:
            In the int16 -> float32 path each chunk is scaled by its own peak
            value, and a chunk that is empty or entirely zero is dropped
            without being appended.
        """
        # Convert input sampling rate to output sampling rate
        if self.input_sampling_rate == 48000 and self.output_sampling_rate == 16000:
            # Plain decimation: keep one sample out of three (no filtering is applied)
            input_samples = input_samples[::3]
        elif self.input_sampling_rate != self.output_sampling_rate:
            raise TypeError(f"Conversion of input sampling rate {self.input_sampling_rate} to output sampling rate {self.output_sampling_rate} is not supported")

        # Convert input dtype to output dtype
        if self.input_dtype == np.int16 and self.output_dtype == np.float32:
            # np.max() raises on an empty array: nothing to append in that case
            if input_samples.size == 0:
                return
            input_samples = input_samples.astype(np.float32)
            max_value = np.max(np.abs(input_samples))
            # A silent chunk cannot be normalized (division by zero): skip it
            if max_value == 0:
                return
            input_samples /= max_value
        elif self.input_dtype != self.output_dtype:
            raise TypeError(f"Conversion of input type {self.input_dtype} to output type {self.output_dtype} is not supported")

        # Accumulate samples in the output buffer with a rolling window
        self.output_buffer = np.concatenate((self.output_buffer, input_samples))
        overflow = len(self.output_buffer) - self.max_buffer_length
        if overflow > 0:
            # Drop the oldest samples, keep the most recent max_buffer_length ones
            self.output_buffer = self.output_buffer[overflow:]

    def get_output_samples(self):
        """Return the output buffer, a numpy array ready to be used by the transcription model."""
        return self.output_buffer

    def clear(self):
        """Reset the output buffer to an empty array of ``output_dtype``."""
        self.output_buffer = np.empty((0,), dtype=self.output_dtype)

    def load(self, filename):
        """Replace the output buffer with the array stored in ``filename`` (``np.load``).

        The loaded array is used as is: it is neither converted nor trimmed.
        """
        self.output_buffer = np.load(filename)

    def save(self, filename):
        """Save the output buffer to ``filename`` (``np.save``)."""
        np.save(filename, self.output_buffer)

Load test data:

In [2]:
# Create a buffer with the default settings, then replace its content
# with the pre-recorded samples stored in the .npy test file.
audiobuffer = RollingAudioBuffer()
audiobuffer.load("data/test_audio_en.npy")

## Huggingface automatic-speech-recognition pipeline

### Install prerequisites

In [3]:
!ffmpeg -version

ffmpeg version 6.1.1-3ubuntu5 Copyright (c) 2000-2023 the FFmpeg developers
built with gcc 13 (Ubuntu 13.2.0-23ubuntu3)
configuration: --prefix=/usr --extra-version=3ubuntu5 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --disable-omx --enable-gnutls --enable-libaom --enable-libass --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libharfbuzz --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-

In [4]:
!uv add transformers accelerate

[2mResolved [1m263 packages[0m [2min 0.56ms[0m[0m
[2mAudited [1m159 packages[0m [2min 1ms[0m[0m


In [5]:
# Import the submodule explicitly: "import importlib" alone does not load
# importlib.metadata, which the following cells use to read package versions.
import importlib.metadata

In [6]:
importlib.metadata.version('transformers')

'4.57.1'

In [7]:
importlib.metadata.version('accelerate')

'1.11.0'

### Load model and create pipeline

In [8]:
from transformers import pipeline
import torch

# Load whisper-small in float16 on GPU 0, using the "sdpa" attention implementation.
# The generation settings force the transcription task with English as the language.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small", 
                       model_kwargs={"dtype":torch.float16, "attn_implementation":"sdpa", "device_map":0}, 
                       generate_kwargs = {"task":"transcribe", "language":"english"})

# torch compile the inner model (transcriber.model.model) to speed up inference
transcriber.model.model = torch.compile(transcriber.model.model)

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cuda:0


In [9]:
def transcribe_speech_to_text_hf(audiobuffer):
    """Transcribe the content of an audio buffer with the global `transcriber` pipeline.

    Args:
        audiobuffer: object exposing `output_sampling_rate` and `output_buffer`
            (for example a RollingAudioBuffer).

    Returns:
        The "text" entry of the pipeline result.
    """
    pipeline_input = {
        "sampling_rate": audiobuffer.output_sampling_rate,
        "raw": audiobuffer.output_buffer,
    }
    result = transcriber(pipeline_input)
    return result["text"]

In [10]:
transcribe_speech_to_text_hf(audiobuffer)

`return_token_timestamps` is deprecated for WhisperFeatureExtractor and will be removed in Transformers v5. Use `return_attention_mask` instead, as the number of frames can be inferred from it.


" So I am on a website to try and find a job and I had to answer to some questions and I had to say what I was studying and I had some difficulties to find the good topic and the... voilà! You didn't find it? Yeah!"

In [11]:
%timeit transcribe_speech_to_text_hf(audiobuffer)

427 ms ± 34.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Performance results

whisper-small on RTX 4090 -> 427 ms

## Translation with Helsinki-NLP/opus-mt

### Install prerequisites

In [15]:
!uv add sentencepiece sacremoses

[2K[2mResolved [1m264 packages[0m [2min 1.04s[0m[0m                                       [0m
[2K[2mPrepared [1m1 package[0m [2min 155ms[0m[0m                                              
[2K[2mInstalled [1m1 package[0m [2min 9ms[0m[0m                                  [0m
 [32m+[39m [1msacremoses[0m[2m==0.1.1[0m


In [17]:
importlib.metadata.version('sentencepiece')

'0.2.1'

In [18]:
importlib.metadata.version('sacremoses')

'0.1.1'

### Load model and create pipeline

In [35]:
from transformers import MarianMTModel, MarianTokenizer

# Load the English -> French opus-mt model and its tokenizer.
# The model is placed on device 0 and uses the "sdpa" attention implementation.
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
opusmtmodel = MarianMTModel.from_pretrained(model_name, attn_implementation="sdpa", device_map=0)

In [36]:
def translate_text(text):
    """Translate a text with the global `opusmtmodel` / `tokenizer` pair.

    Args:
        text: the text to translate (a single string).

    Returns:
        The decoded translation of the first generated sequence, without
        special tokens.

    Note:
        The tokenizer is called with truncation=True, so an input longer than
        the tokenizer's maximum length is presumably cut before translation
        (TODO confirm the limit for this model).
    """
    # Move the inputs to the device the model actually lives on, instead of
    # assuming a hard-coded "cuda" device.
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(opusmtmodel.device)
    translated = opusmtmodel.generate(**encoded)
    return tokenizer.decode(translated[0], skip_special_tokens=True)

In [37]:
text = " So, I am on a website to try and find a job and I had to answer to some questions and I had to say what I was studying and I had some difficulties to find the good topic. and did you find it? yeah"
translate_text(text)

"Donc, je suis sur un site Web pour essayer de trouver un emploi et j'ai dû répondre à certaines questions et j'ai dû dire ce que j'étudiais et j'ai eu quelques difficultés à trouver le bon sujet. et avez-vous trouvé?"

In [38]:
%timeit translate_text(text)

249 ms ± 12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Performance results

opus-mt-en-fr on RTX 4090 -> 249 ms

## Gradio speech transcription and translation UI

### Install prerequisites

In [40]:
!uv add gradio

[2mResolved [1m275 packages[0m [2min 0.71ms[0m[0m
[2mAudited [1m178 packages[0m [2min 1ms[0m[0m


In [41]:
importlib.metadata.version('gradio')

'5.49.1'

### Define audio processing function and build the associated UI

In [42]:
import time

# Single user application => simplify the implementation with a global audio buffer
audiobuffer = RollingAudioBuffer()

# Input: tuple (sampling_rate, input_samples) received from the gr.Audio component;
#        the samples are expected to be 48 kHz mono 16 bits integers (the audiobuffer defaults)
# The input_samples are converted and added to the global audio buffer, which is then transcribed to text and translated
# Output: tuple (english_text, french_text, refresh_rate) where the texts are transcribed and translated
#         from the global audio buffer, and refresh_rate is the processing time formatted as "<seconds> sec"
def process_audio(gradio_audio):
    start_time = time.time()
    # The sampling rate sent by gradio is not used: the global audiobuffer
    # is configured with its own input sampling rate.
    _sampling_rate, input_samples = gradio_audio
    audiobuffer.append_input_samples(input_samples)
    # Use the transcription function defined in this notebook
    english_text = transcribe_speech_to_text_hf(audiobuffer)
    # Skip the translation when the transcription is too short (3 characters or fewer)
    if len(english_text) > 3:
        french_text = translate_text(english_text)
    else:
        french_text = ""
    end_time = time.time()
    refresh_rate = f"{end_time-start_time:.2f} sec"
    return english_text, french_text, refresh_rate

In [43]:
import gradio as gr

def on_clear():
    """Empty the global audio buffer and return three empty strings, one per output text box."""
    audiobuffer.clear()
    blank_outputs = ("",) * 3
    return blank_outputs

# Define custom CSS: larger font in the two text areas identified by their elem_id
custom_css = """
    #english_transcription textarea, #french_translation textarea {
        font-size: 20px !important;
    }
"""

# Create the Gradio Blocks interface
with gr.Blocks(css=custom_css) as interface:
    gr.Markdown("# Real time speech translation")
    gr.Markdown("This application transcribes your english speech in real-time and translates it to French.")
    
    # First row: streaming microphone input and the Clear button
    with gr.Row():
        audio_input = gr.Audio(sources=["microphone"], streaming=True, scale=5)
        clear_button = gr.Button("Clear")
        
    # Second row: transcription and translation, side by side
    with gr.Row():
        english_output = gr.Textbox(label="Transcription (English)", lines=10, elem_id="english_transcription")
        french_output = gr.Textbox(label="Translation (French)", lines=10, elem_id="french_translation")
    
    # Third row: processing time reported by process_audio
    with gr.Row():
        refresh_rate = gr.Textbox(label="Refresh rate")
    
    # Clear: on_clear empties the audio buffer and its three return values blank the three text boxes
    clear_button.click(fn=on_clear, inputs=None, outputs=[english_output, french_output, refresh_rate])
    
    # Stream event: process_audio receives the audio_input value and its three return values fill the three text boxes
    audio_input.stream(process_audio, inputs=audio_input, outputs=[english_output, french_output, refresh_rate]) 

### Display and stop the UI

In [46]:
import os

In [54]:
# Port exposed by wordslab notebooks
# NOTE(review): int() fails with a TypeError if USER_APP1_PORT is not set in the environment
port = int(os.getenv("USER_APP1_PORT"))
# URL read from the USER_APP1_URL environment variable (None if it is not set)
url = os.getenv("USER_APP1_URL")

In [60]:
# Optional https certificate
# The certificate files are looked up in the ".secrets" directory of the
# workspace given by the WORDSLAB_WORKSPACE environment variable.
workspace = os.getenv("WORDSLAB_WORKSPACE")
if workspace is not None:
    keyfile = os.path.join(workspace, ".secrets", "certificate-key.pem")
    certfile = os.path.join(workspace, ".secrets", "certificate.pem")
else:
    # No workspace configured: there is no certificate to look for
    keyfile = None
    certfile = None

# Only add SSL keys if they exist
launch_kwargs = {}
if keyfile is not None and certfile is not None and os.path.isfile(keyfile) and os.path.isfile(certfile):
    launch_kwargs["ssl_keyfile"] = keyfile
    launch_kwargs["ssl_certfile"] = certfile

In [63]:
interface.launch(server_name="0.0.0.0", server_port=port) #, **launch_kwargs)

* Running on local URL:  http://0.0.0.0:8883
* To create a public link, set `share=True` in `launch()`.




Navigate to this URL

In [51]:
url

'https://192.168.1.197:8883'

In [62]:
interface.close()

Closing server running on port: 8883
