In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


# Run on the first CUDA GPU when PyTorch reports one; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(f"Using device: {device}")


In [None]:
# Use float16 when CUDA is available and float32 otherwise.
if torch.cuda.is_available():
    torch_dtype = torch.float16
else:
    torch_dtype = torch.float32

model_id = "openai/whisper-large-v3"

# Load the Whisper checkpoint in the chosen dtype and move it to `device`.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor supplies the tokenizer and the feature extractor used by the pipeline.
processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline configured with 30-second chunks, a batch size
# of 16 and timestamps enabled.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    chunk_length_s=30,
    batch_size=16,
    max_new_tokens=128,
    return_timestamps=True,
)

def transcribe_audio(audio_file):
    """Run the speech-recognition pipeline on ``audio_file`` and return its text.

    Args:
        audio_file: Input passed unchanged to the module-level ``pipe``
            (callers in this notebook pass a path to an audio file).

    Returns:
        The value stored under the ``"text"`` key of the pipeline result.
    """
    return pipe(audio_file)["text"]

# Transcribe the recording at voice/squiz_complete.mp3; as the last expression
# in the cell, the returned text becomes the cell's output.
transcribe_audio("voice/squiz_complete.mp3")

In [None]:
import sounddevice as sd
import wavio as wv
import threading
from datetime import datetime

def write_audio_file(filename, clip, freq):
    """Save recorded samples to ``filename`` using ``wavio``.

    Args:
        filename: Destination path of the audio file.
        clip: Sample data to write (in this notebook, the array returned
            by ``sd.rec``).
        freq: Sampling rate passed to ``wavio``.
    """
    sample_width = 2  # value handed to wavio's ``sampwidth`` argument
    wv.write(filename, clip, freq, sampwidth=sample_width)

def record_audio(filename, duration=15, freq=44100):
    """Record a two-channel clip and save it to ``filename`` on a background thread.

    Args:
        filename: Path the recording is written to.
        duration: Length of the recording in seconds.
        freq: Sampling rate used for both recording and writing.

    Returns:
        The already-started ``threading.Thread`` that writes the file; join
        it before reading ``filename``.
    """
    frame_count = int(duration * freq)
    recording = sd.rec(frame_count, samplerate=freq, channels=2)
    sd.wait()  # block until the recording has finished

    # Write the file on a separate thread so the caller regains control
    # without waiting for the disk write.
    writer = threading.Thread(
        target=write_audio_file,
        args=(filename, recording, freq),
    )
    writer.start()
    return writer

def transcribe_in_thread(filename, write_thread):
    """Wait for a recording to be saved, then transcribe it and print the text.

    Args:
        filename: Path of the audio file to transcribe.
        write_thread: Thread that is writing ``filename``; it is joined first
            so the file is complete before it is read.
    """
    write_thread.join()
    print(transcribe_audio(filename))

# Record five-second clips back to back until the user interrupts; each clip is
# transcribed on its own thread so the next recording starts without waiting.
try:
    while True:
        # Name each clip after the moment its recording starts.
        filename = f"voice/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.wav"
        write_thread = record_audio(filename, duration=5)
        transcription_thread = threading.Thread(
            target=transcribe_in_thread,
            args=(filename, write_thread),
        )
        transcription_thread.start()
except KeyboardInterrupt:
    print("Program interrupted by user.")