This dependency is necessary for whisper with Huggingface, but is not necessary for whisper with faster-whisper

> apt install ffmpeg

In [4]:
import importlib

In [None]:
pip install gradio

In [3]:
importlib.metadata.version('gradio')

'4.37.2'

In [None]:
pip install transformers

In [5]:
importlib.metadata.version('transformers')

'4.42.3'

In [3]:
import gradio as gr
from transformers import pipeline
import numpy as np
import time

transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base", model_kwargs={}, generate_kwargs = {"task":"transcribe", "language":"french"}, device=0)

class RollingAudioBuffer:
    def __init__(self, max_length_sec, sampling_rate):
        self.buffer = np.empty((0,), dtype=np.float32)
        self.sampling_rate = sampling_rate
        self.max_length = max_length_sec * sampling_rate

    def add_samples(self, np_samples):
        self.buffer = np.concatenate((self.buffer, np_samples))
        if len(self.buffer) > self.max_length:
            self.buffer = self.buffer[len(self.buffer)-self.max_length:]

audiobuffer = RollingAudioBuffer(max_length_sec=25, sampling_rate=48000)

def transcribe(audio):
    # Get audio stream from gr.Audio component: int16
    sampling_rate, np_samples = audio
    print(sampling_rate, np_samples.shape)

    # Convert audio stream to whisper format: fp32 between -1 and 1
    np_samples = np_samples.astype(np.float32)
    np_samples /= np.max(np.abs(np_samples))

    # Accumulate audio samples in buffer
    audiobuffer.add_samples(np_samples)
    
    # Convert audio stream to text
    start_time = time.time()
    text = transcriber({"sampling_rate": sampling_rate, "raw": audiobuffer.buffer})["text"]
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Transcription executed in {elapsed_time*1000:.2f} ms")
    
    return text


demo = gr.Interface(
    transcribe,
    gr.Audio(sources=["microphone"], streaming=True),
    "text",
    live=True,
)

demo.launch()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




48000 (24000,)


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Transcription executed in 1495.54 ms
48000 (96000,)
Transcription executed in 726.83 ms
48000 (48000,)
Transcription executed in 630.95 ms
48000 (48000,)
Transcription executed in 594.23 ms
48000 (48000,)
Transcription executed in 797.05 ms
48000 (72000,)
Transcription executed in 1081.01 ms
48000 (72000,)
Transcription executed in 1216.34 ms
48000 (96000,)
Transcription executed in 1487.10 ms
48000 (96000,)
Transcription executed in 1671.09 ms
48000 (96000,)
Transcription executed in 2235.63 ms


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


48000 (120000,)
Transcription executed in 2695.22 ms
48000 (144000,)
Transcription executed in 3254.47 ms
48000 (168000,)
Transcription executed in 3241.85 ms
48000 (140640,)
Transcription executed in 3160.29 ms


In [None]:
pip install faster-whisper

In [5]:
importlib.metadata.version('faster-whisper')

'1.0.2'

In [12]:
audiobuffer.buffer[::4].shape

(220800,)

In [5]:
transcriber({"sampling_rate": audiobuffer.sampling_rate/4, "raw": audiobuffer.buffer[::4]})["text"]

" Le 30 juin et le 7 juillet prochain se tiendra dans notre circonscription et partout en France une nouvelle élection législative. J'y suis candidat. Depuis 7 ans, j'ai défendu à l'assandé nationale des positions forgées par mes convictions et par nos échanges que j'ai toujours voulu constant et nombreux."

In [14]:
from faster_whisper import WhisperModel

# distill-large-v3 : english only, 1.5GB, 750 ms

# model_size = "tiny" # 20 sec -> 1.24 sec - too many errors
# model_size = "base" # 20 sec -> 1.72 sec vs 3 sec with huggingface
# model_size = "small" # 20 sec -> 4.18 sec
# model_size = "large-v3" # 20 sec -> 16.33 sec

model = WhisperModel(model_size, device="cuda", compute_type="float16")

In [15]:
segments, info = model.transcribe(audiobuffer.buffer[::4], beam_size=5, language="fr", condition_on_previous_text=False, vad_filter=True, vad_parameters=dict(min_silence_duration_ms=500))

for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

[0.00s -> 6.64s]  Cher monsieur, le 30 juin et le 7 juillet prochains se tiendra dans notre circonscription et partout en France une nouvelle élection législative.
[7.22s -> 7.98s]  J'y suis candidat.
[8.50s -> 14.26s]  Depuis 7 ans, j'ai défendu à l'Assemblée nationale des positions forgées par mes convictions et par nos échanges que j'ai toujours voulu constants et nombreux.
