In [1]:
import asyncio

import numpy as np
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from faster_whisper import WhisperModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# FastAPI application instance for this notebook.
# NOTE(review): no route is registered on it in the visible cells — confirm
# where the WebSocket handler gets attached.
app = FastAPI()

In [3]:
# Audio format and buffering configuration.
SAMPLE_RATE = 16_000  # samples per second
BUFFER_SECONDS = 2.0  # seconds of audio to accumulate between whisper runs
MIN_SAMPLES = int(BUFFER_SECONDS * SAMPLE_RATE)  # the same window, in samples

In [4]:
# Whisper "base" model, instantiated once here and reused by later cells;
# configured to run on CPU with int8 compute.
model = WhisperModel(
    "base",
    device="cpu",
    compute_type="int8",
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [5]:
def transcribe_float32(audio: np.ndarray) -> str:
    """
    Transcribe an audio buffer with the module-level whisper ``model``.

    audio: float32 mono samples in [-1, 1] at 16 kHz
    returns: the text of every segment joined together, with leading and
    trailing whitespace stripped
    """
    # transcribe() hands back (segments, info); only the segments are used.
    segments, _info = model.transcribe(audio, language="en")

    pieces = []
    for segment in segments:
        pieces.append(segment.text)

    transcript = "".join(pieces)
    return transcript.strip()

In [None]:
async def stt_stream(ws: WebSocket):
    """
    Stream speech-to-text over a WebSocket connection.

    Accepts the connection, then repeatedly receives binary frames of raw
    PCM16 audio, scales them to float32 in [-1, 1) and appends them to a
    buffer. Each time the buffer holds at least MIN_SAMPLES samples it is
    transcribed in a worker thread; a non-empty transcript that differs from
    the previously sent one is sent as ``{"text": ..., "is_final": False}``.
    The buffer is then cut down to its last 0.5 s so that a word spanning two
    windows is not chopped.

    If the client disconnects, the handler returns without any further I/O.
    On any other error the remaining buffer is transcribed, sent with
    ``"is_final": True``, and the socket is closed.

    ws: the WebSocket to serve. Audio is assumed to be mono at SAMPLE_RATE;
        nothing here resamples or validates it.

    NOTE(review): no route registration is visible next to this function —
    confirm it is attached to the application somewhere.
    """
    await ws.accept()

    # Buffered audio, kept as a single float32 array rather than a Python
    # list of floats, so samples are never boxed one by one.
    audio_samples = np.zeros(0, dtype=np.float32)
    last_text = ""
    # Number of trailing samples carried into the next window (0.5 s overlap).
    keep = int(SAMPLE_RATE * 0.5)

    try:
        while True:
            # frontend sends raw PCM16
            pcm_bytes = await ws.receive_bytes()

            # bytes -> int16 -> float32
            pcm = np.frombuffer(pcm_bytes, dtype=np.int16).astype(np.float32) / 32768.0
            audio_samples = np.concatenate((audio_samples, pcm))

            # Not enough audio buffered yet: keep receiving.
            if audio_samples.size < MIN_SAMPLES:
                continue

            # Run whisper in a worker thread so the event loop stays responsive.
            text = await asyncio.to_thread(transcribe_float32, audio_samples)

            if text and text != last_text:
                last_text = text
                await ws.send_json({"text": text, "is_final": False})

            # keep last 0.5 sec overlap (prevents cutting words)
            audio_samples = audio_samples[-keep:]

    except WebSocketDisconnect:
        # The peer is gone, so a final transcript could not be delivered and
        # another send/close on this socket would itself raise.
        return
    except Exception:
        # Any other failure leaves the socket open: flush what is buffered as
        # the final transcript, then close.
        # NOTE(review): the underlying error is still swallowed here, as in
        # the original — consider logging it.
        if audio_samples.size:
            final_text = await asyncio.to_thread(transcribe_float32, audio_samples)
            await ws.send_json({"text": final_text, "is_final": True})
        await ws.close()