# Real-time speech transcription

## Audio buffer management

The audio samples for real-time speech transcription are accumulated in a rolling audio buffer of limited duration (at most 30 sec for Whisper).

In [1]:
import numpy as np
import time

# 1. Aggregates audio samples in a buffer and keeps only the last max_length_sec seconds.
# 2. Converts the audio format from input to output sampling rate and dtype.
class RollingAudioBuffer:
    """Rolling buffer of audio samples converted to the format expected by the transcription model.

    Incoming chunks are converted from the input sampling rate / dtype to the output
    sampling rate / dtype, appended to `output_buffer`, and the buffer is then trimmed
    so that it only keeps the most recent `max_length_sec` seconds.

    Supported conversions:
    - sampling rate: 48000 -> 16000 (keeps every third sample, no low-pass filtering),
      or identical input and output rates (samples are passed through)
    - dtype: np.int16 -> np.float32 (each chunk is divided by its own peak absolute value),
      or identical input and output dtypes (samples are passed through)
    """

    # Default max length (whisper): 30 sec
    # Default input audio format (gradio): 48 kHz 16 bits int
    # Default output audio format (whisper): 16 kHz 32 bits float
    def __init__(self, max_length_sec=30, input_sampling_rate=48000, input_dtype=np.int16, output_sampling_rate=16000, output_dtype=np.float32):
        """Create an empty buffer.

        Args:
            max_length_sec: maximum duration kept in the buffer, in seconds (may be fractional).
            input_sampling_rate: sampling rate of the appended chunks, in Hz.
            input_dtype: numpy dtype of the appended chunks.
            output_sampling_rate: sampling rate of the samples stored in the buffer, in Hz.
            output_dtype: numpy dtype of the samples stored in the buffer.
        """
        self.input_sampling_rate = input_sampling_rate
        self.input_dtype = input_dtype
        self.output_sampling_rate = output_sampling_rate
        self.output_dtype = output_dtype

        self.output_buffer = np.empty((0,), dtype=output_dtype)
        # The length is used as a slice bound, so it must be an integer
        # even when max_length_sec is fractional (e.g. 0.5 sec)
        self.max_buffer_length = int(round(max_length_sec * output_sampling_rate))

    # input_samples should be a numpy array recorded with input_sampling_rate and input_dtype
    def append_input_samples(self, input_samples):
        """Convert a chunk of input samples and append it to the rolling buffer.

        Args:
            input_samples: array of samples recorded with input_sampling_rate and input_dtype.

        Raises:
            TypeError: if the configured sampling rate or dtype conversion is not supported.

        Note:
            With the int16 -> float32 conversion, a chunk that is empty or contains
            only zeros is dropped: nothing is appended to the buffer.
        """
        input_samples = np.asarray(input_samples)

        # Convert input sampling rate to output sampling rate
        if self.input_sampling_rate == 48000 and self.output_sampling_rate == 16000:
            input_samples = input_samples[::3]
        elif self.input_sampling_rate != self.output_sampling_rate:
            raise TypeError(f"Conversion of input sampling rate {self.input_sampling_rate} to output sampling rate {self.output_sampling_rate} is not supported")

        # Convert input dtype to output dtype
        if self.input_dtype == np.int16 and self.output_dtype == np.float32:
            input_samples = input_samples.astype(np.float32)
            # np.max raises on an empty array, so an empty chunk is treated like a silent one
            peak = np.max(np.abs(input_samples)) if input_samples.size > 0 else 0
            if peak == 0:
                return
            # Each chunk is scaled by its own peak, so its values end up in [-1, 1]
            input_samples /= peak
        elif self.input_dtype != self.output_dtype:
            raise TypeError(f"Conversion of input type {self.input_dtype} to output type {self.output_dtype} is not supported")

        # Accumulate samples in the output buffer with a rolling window
        self.output_buffer = np.concatenate((self.output_buffer, input_samples))
        if len(self.output_buffer) > self.max_buffer_length:
            self.output_buffer = self.output_buffer[len(self.output_buffer) - self.max_buffer_length:]

    # output buffer is a numpy array ready to be used by the transcription model
    def get_output_samples(self):
        """Return the buffered samples (output sampling rate and dtype) as a numpy array."""
        return self.output_buffer

    def clear(self):
        """Replace the buffer content with an empty array of the output dtype."""
        self.output_buffer = np.empty((0,), dtype=self.output_dtype)

    # Loads the output buffer from a file
    def load(self, filename):
        """Replace the buffer content with the array stored in `filename` (read with np.load).

        The loaded array is used as is: it is neither converted nor trimmed.
        """
        self.output_buffer = np.load(filename)

    # Saves the output buffer to a file
    def save(self, filename):
        """Write the buffer content to `filename` with np.save."""
        np.save(filename, self.output_buffer)

Load test data:

In [2]:
# Create a buffer with the default audio formats and fill it with a previously saved recording
# (the file path is relative, so it is resolved against the current working directory)
audiobuffer = RollingAudioBuffer()
audiobuffer.load("test_en.npy")

## Option 1: Huggingface automatic-speech-recognition pipeline

### Install prerequisites

ffmpeg is required to run whisper with the Huggingface pipeline, but it is not required with faster-whisper.

> apt install ffmpeg

In [3]:
!ffmpeg -version

ffmpeg version 4.3 Copyright (c) 2000-2020 the FFmpeg developers
built with gcc 7.3.0 (crosstool-NG 1.23.0.449-a04d0)
configuration: --prefix=/opt/conda/conda-bld/ffmpeg_1597178665428/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placeh --cc=/opt/conda/conda-bld/ffmpeg_1597178665428/_build_env/bin/x86_64-conda_cos6-linux-gnu-cc --disable-doc --disable-openssl --enable-avresample --enable-gnutls --enable-hardcoded-tables --enable-libfreetype --enable-libopenh264 --enable-pic --enable-pthreads --enable-shared --disable-static --enable-version3 --enable-zlib --enable-libmp3lame
libavutil      56. 51.100 / 56. 51.100
libavcodec     58. 91.100 / 58. 91.100
libavformat    58. 45.100 / 58. 45.100
libavdevice    58. 10.100 / 58. 10.100
libavfilter     7. 85.100 /  7. 85.100
libavresample   4.  0.  0 /  4.  0.  0
libswscale      5.  7.1

In [4]:
import importlib
import importlib.metadata

In [None]:
pip install transformers

In [6]:
importlib.metadata.version('transformers')

'4.42.3'

In [None]:
pip install flash_attn

In [7]:
importlib.metadata.version('flash_attn')

'2.5.9.post1'

In [None]:
pip install accelerate

In [8]:
importlib.metadata.version('accelerate')

'0.32.1'

### Load model and create pipeline

In [9]:
from transformers import pipeline
import torch

# Load whisper-small in 16 bits with flash attention 2 on the GPU
# - model_kwargs: options used to load the model (float16 weights, flash attention 2, device_map 0)
# - generate_kwargs: generation options (transcription task, English language)
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small", 
                       model_kwargs={"torch_dtype":torch.float16, "attn_implementation":"flash_attention_2", "device_map":0}, 
                       generate_kwargs = {"task":"transcribe", "language":"english"})

# torch compile the model to speed up inference
# (only the inner `model.model` module of the pipeline model is replaced by its compiled version)
transcriber.model.model = torch.compile(transcriber.model.model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
def transcribe_speech_to_text_hf(audiobuffer):
    """Transcribe the content of the audio buffer with the Huggingface pipeline.

    Args:
        audiobuffer: buffer exposing `output_sampling_rate` and `output_buffer`.

    Returns:
        The value stored under the "text" key of the pipeline result.
    """
    pipeline_input = {
        "sampling_rate": audiobuffer.output_sampling_rate,
        "raw": audiobuffer.output_buffer,
    }
    result = transcriber(pipeline_input)
    return result["text"]

In [12]:
transcribe_speech_to_text_hf(audiobuffer)



" So I am on a website to try and find a job and I had to answer to some questions and I had to say what I was studying and I had some difficulties to find the good topic and the... voilà! You didn't find it? Yeah!"

### Performance results

whisper-small
- basic huggingface pipeline: 5.92 sec
- with 16 bits & flash attention: 2.42 sec
- and with torch.compile: 2.25 sec
- laptop plugged: 1.07 sec
- divide sampling rate by 3: 800-900 ms

## Option 2: Systran faster-whisper with ctranslate2

### Install prerequisites

In [25]:
import importlib

In [None]:
pip install faster-whisper

In [13]:
importlib.metadata.version('faster-whisper')

'1.0.3'

### Load model and create pipeline

In [14]:
from faster_whisper import WhisperModel

# Directly load an optimized model in 16 bits on the GPU
# ("small" is the model size, "cuda" the target device, "float16" the compute type)
whispermodel = WhisperModel("small", device="cuda", compute_type="float16")

In [15]:
def transcribe_speech_to_text_fw(audiobuffer):
    """Transcribe the content of the audio buffer with faster-whisper.

    Args:
        audiobuffer: buffer exposing the samples to transcribe as `output_buffer`.

    Returns:
        The text of all returned segments, concatenated without separator.
    """
    vad_options = dict(min_silence_duration_ms=500)
    # The second returned value (transcription info) is not used
    segments, _ = whispermodel.transcribe(
        audiobuffer.output_buffer,
        beam_size=5,
        language="en",
        condition_on_previous_text=False,
        vad_filter=True,
        vad_parameters=vad_options,
    )
    pieces = []
    for segment in segments:
        pieces.append(segment.text)
    return "".join(pieces)

In [17]:
transcribe_speech_to_text_fw(audiobuffer)

' So, I am on a website to try and find a job and I had to answer to some questions and I had to say what I was studying and I had some difficulties to find the good topic. and did you find it? yeah'

### Performance results

whisper-small
- 618 ms

Model sizes: Huggingface default perf vs faster-whisper optimized perf (unplugged) 
- distill-large-v3 : english only, 1.5GB, 750 ms
- "tiny": 20 sec -> 1.24 sec - too many errors
- "base": 20 sec -> 1.72 sec vs 3 sec with huggingface
- "small": 20 sec -> 4.18 sec
- "large-v3": 20 sec -> 16.33 sec

## Translation with Helsinki-NLP/opus-mt

### Install prerequisites

In [17]:
import importlib

In [None]:
pip install sentencepiece

In [18]:
importlib.metadata.version('sentencepiece')

'0.2.0'

In [None]:
pip install sacremoses

In [19]:
importlib.metadata.version('sacremoses')

'0.1.1'

### Load model and create pipeline

In [20]:
from transformers import MarianMTModel, MarianTokenizer

# Load model and tokenizer
# (the same model name is used for both, so that the tokenizer matches the model)
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
opusmtmodel = MarianMTModel.from_pretrained(model_name)
# NOTE(review): the model is not explicitly moved to a GPU here, so it presumably runs on the CPU - confirm

# Note: unfortunately, MarianMTModel doesn't support flash attention yet

In [21]:
def translate_text(text):
    """Translate `text` with the opus-mt model.

    Args:
        text: the text to translate (tokenized with truncation enabled).

    Returns:
        The first generated sequence, decoded without special tokens.
    """
    model_inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    generated_ids = opusmtmodel.generate(**model_inputs)
    first_sequence = generated_ids[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [22]:
text = " So, I am on a website to try and find a job and I had to answer to some questions and I had to say what I was studying and I had some difficulties to find the good topic. and did you find it? yeah"
translate_text(text)

"Donc, je suis sur un site Web pour essayer de trouver un emploi et j'ai dû répondre à certaines questions et j'ai dû dire ce que j'étudiais et j'ai eu quelques difficultés à trouver le bon sujet. et avez-vous trouvé?"

### Performance results

opus-mt-en-fr
- 592 ms

## Gradio speech transcription UI

### Install prerequisites

In [None]:
pip install gradio

In [23]:
importlib.metadata.version('gradio')

'4.37.2'

### Define audio processing function and build the associated UI

In [66]:
import time

# Single user application => simplify the implementation with a global audio buffer
# Note: this rebinds the `audiobuffer` name that held the test recording earlier in the notebook,
# so the buffer used by the UI starts empty
audiobuffer = RollingAudioBuffer()

def process_audio(gradio_audio):
    """Append a streamed audio chunk to the global audio buffer, then transcribe and translate the buffer.

    Args:
        gradio_audio: tuple (sampling_rate, input_samples) received from the gr.Audio component;
            the expected audio format is 48 kHz mono 16 bits integers.

    Returns:
        Tuple (english_text, french_text, refresh_rate):
        - english_text: transcription of the whole global audio buffer
        - french_text: translation of english_text, or "" when english_text has 3 characters or fewer
        - refresh_rate: time spent in this call, formatted as "<seconds> sec"
    """
    import warnings

    # perf_counter is monotonic, which makes it suitable for measuring elapsed time
    start_time = time.perf_counter()
    sampling_rate, input_samples = gradio_audio

    # The buffer converts samples assuming its own input sampling rate:
    # report a mismatch instead of silently transcribing audio played at the wrong speed
    if sampling_rate != audiobuffer.input_sampling_rate:
        warnings.warn(f"Received audio at {sampling_rate} Hz but the audio buffer expects {audiobuffer.input_sampling_rate} Hz", RuntimeWarning)

    audiobuffer.append_input_samples(input_samples)
    english_text = transcribe_speech_to_text_fw(audiobuffer)

    # Do not call the translation model for empty or near-empty transcriptions
    if len(english_text)>3:
        french_text = translate_text(english_text)
    else:
        french_text = ""

    end_time = time.perf_counter()
    refresh_rate = f"{end_time-start_time:.2f} sec"
    return english_text, french_text, refresh_rate

In [69]:
import gradio as gr

def on_clear():
    """Empty the global audio buffer and return three empty strings (one per output component)."""
    audiobuffer.clear()
    blank = ""
    return (blank, blank, blank)

# Define custom CSS
# (the two selectors match the elem_id values given to the English and French text boxes below)
custom_css = """
    #english_transcription textarea, #french_translation textarea {
        font-size: 20px !important;
    }
"""

# Create the Gradio Blocks interface
with gr.Blocks(css=custom_css) as interface:
    gr.Markdown("# Crédit Mutuel - IBM supervision committee")
    gr.Markdown("This application transcribes your english speech in real-time and translates it to French.")
    
    # First row: streaming microphone input and the Clear button
    with gr.Row():
        audio_input = gr.Audio(sources=["microphone"], streaming=True, scale=5)
        clear_button = gr.Button("Clear")
        
    # Second row: transcription and translation, side by side
    with gr.Row():
        english_output = gr.Textbox(label="Transcription (English)", lines=10, elem_id="english_transcription")
        french_output = gr.Textbox(label="Translation (French)", lines=10, elem_id="french_translation")
    
    # Third row: duration of the last process_audio call
    with gr.Row():
        refresh_rate = gr.Textbox(label="Refresh rate")
    
    # on_clear returns three values, one for each of the three text boxes
    clear_button.click(fn=on_clear, inputs=None, outputs=[english_output, french_output, refresh_rate])
    
    # process_audio receives the streamed audio and returns one value for each of the three text boxes
    audio_input.stream(process_audio, inputs=audio_input, outputs=[english_output, french_output, refresh_rate]) 

### Display and stop the UI

In [70]:
interface.launch()

Running on local URL:  http://127.0.0.1:7864

To create a public link, set `share=True` in `launch()`.




In [67]:
interface.close()

Closing server running on port: 7863


In [None]:
# Save the content of the audio buffer so that it can be reloaded later with audiobuffer.load()
# (np.save appends ".npy" to any file name that does not already end with it,
#  so a ".py" name would have produced "last_speech_en.py.npy")
audiobuffer.save("last_speech_en.npy")