This dependency is necessary for whisper with Huggingface, but is not necessary for whisper with faster-whisper

> apt install ffmpeg

In [4]:
import importlib

In [None]:
pip install gradio

In [3]:
importlib.metadata.version('gradio')

'4.37.2'

In [None]:
pip install transformers

In [5]:
importlib.metadata.version('transformers')

'4.42.3'

In [1]:
from transformers import pipeline

transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small", model_kwargs={}, generate_kwargs = {"task":"transcribe", "language":"english"}, device=0)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
import torch

transcriber.model.model = torch.compile(transcriber.model.model)

In [4]:
import numpy as np
import time

class RollingAudioBuffer:
    def __init__(self, max_length_sec, sampling_rate):
        self.buffer = np.empty((0,), dtype=np.float32)
        self.sampling_rate = sampling_rate
        self.max_length = max_length_sec * sampling_rate

    def add_samples(self, np_samples):
        self.buffer = np.concatenate((self.buffer, np_samples))
        if len(self.buffer) > self.max_length:
            self.buffer = self.buffer[len(self.buffer)-self.max_length:]

In [None]:
import gradio as gr

audiobuffer = RollingAudioBuffer(max_length_sec=25, sampling_rate=48000)

def transcribe(audio):
    # Get audio stream from gr.Audio component: int16
    sampling_rate, np_samples = audio
    print(sampling_rate, np_samples.shape)

    # Convert audio stream to whisper format: fp32 between -1 and 1
    np_samples = np_samples.astype(np.float32)
    np_samples /= np.max(np.abs(np_samples))

    # Accumulate audio samples in buffer
    audiobuffer.add_samples(np_samples)
    
    # Convert audio stream to text
    start_time = time.time()
    text = transcriber({"sampling_rate": sampling_rate, "raw": audiobuffer.buffer})["text"]
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Transcription executed in {elapsed_time*1000:.2f} ms")
    
    return text


demo = gr.Interface(
    transcribe,
    gr.Audio(sources=["microphone"], streaming=True),
    "text",
    live=True,
)

demo.launch()

In [None]:
np.save("julie_en.npy", audiobuffer.buffer)

In [6]:
audiobuffer = RollingAudioBuffer(max_length_sec=25, sampling_rate=48000)
audiobuffer.buffer = np.load("julie_en.npy")

In [8]:
transcriber({"sampling_rate": audiobuffer.sampling_rate, "raw": audiobuffer.buffer})["text"]

" So I am on a website to try and find a job and I had to answer to some questions and I had to say what I was studying and I had some difficulties to find the good topic and the... voilà. Y'a trouvé pas? Ouais, j'ai"

In [None]:
pip install faster-whisper

In [5]:
importlib.metadata.version('faster-whisper')

'1.0.2'

In [12]:
audiobuffer.buffer[::4].shape

(220800,)

In [5]:
transcriber({"sampling_rate": audiobuffer.sampling_rate/4, "raw": audiobuffer.buffer[::4]})["text"]

" Le 30 juin et le 7 juillet prochain se tiendra dans notre circonscription et partout en France une nouvelle élection législative. J'y suis candidat. Depuis 7 ans, j'ai défendu à l'assandé nationale des positions forgées par mes convictions et par nos échanges que j'ai toujours voulu constant et nombreux."

In [14]:
from faster_whisper import WhisperModel

# distill-large-v3 : english only, 1.5GB, 750 ms

# model_size = "tiny" # 20 sec -> 1.24 sec - too many errors
# model_size = "base" # 20 sec -> 1.72 sec vs 3 sec with huggingface
# model_size = "small" # 20 sec -> 4.18 sec
# model_size = "large-v3" # 20 sec -> 16.33 sec

model = WhisperModel(model_size, device="cuda", compute_type="float16")

In [15]:
segments, info = model.transcribe(audiobuffer.buffer[::4], beam_size=5, language="fr", condition_on_previous_text=False, vad_filter=True, vad_parameters=dict(min_silence_duration_ms=500))

for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

[0.00s -> 6.64s]  Cher monsieur, le 30 juin et le 7 juillet prochains se tiendra dans notre circonscription et partout en France une nouvelle élection législative.
[7.22s -> 7.98s]  J'y suis candidat.
[8.50s -> 14.26s]  Depuis 7 ans, j'ai défendu à l'Assemblée nationale des positions forgées par mes convictions et par nos échanges que j'ai toujours voulu constants et nombreux.


In [17]:
pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Downloading sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import MarianMTModel, MarianTokenizer

# Load model and tokenizer
model_name = "Helsinki-NLP/opus-mt-en-fr"
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Prepare the input text
def translate_texts(src_texts):
    encoded = tokenizer(src_texts, return_tensors="pt", padding=True, truncation=True)
    
    # Perform the translation
    translated = model.generate(**encoded)
    tgt_texts = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

    return tgt_texts

In [6]:
# Test the translations
src_texts = ["The result is a distilled model that performs to within 1% WER of large-v3 on long-form audio using both the sequential and chunked algorithms, and outperforms distil-large-v2 by 4.8% using the sequential algorithm. The model is also faster than previous Distil-Whisper models: 6.3x faster than large-v3, and 1.1x faster than distil-large-v2."]
tgt_texts = translate_texts(src_texts)

for src, tgt in zip(src_texts, tgt_texts):
    print(f"Source: {src}")
    print(f"Translated: {tgt}")

Source: The result is a distilled model that performs to within 1% WER of large-v3 on long-form audio using both the sequential and chunked algorithms, and outperforms distil-large-v2 by 4.8% using the sequential algorithm. The model is also faster than previous Distil-Whisper models: 6.3x faster than large-v3, and 1.1x faster than distil-large-v2.
Translated: Le résultat est un modèle distillé qui effectue à moins de 1% WER de grand-v3 sur l'audio de forme longue en utilisant à la fois les algorithmes séquentielle et coupé, et surperforms distil-large-v2 de 4,8 % en utilisant l'algorithme séquentielle. Le modèle est également plus rapide que les modèles Distil-Whisper précédents: 6,3x plus rapide que grand-v3 et 1,1x plus rapide que distil-large-v2.
