In [None]:
# code reference - https://www.youtube.com/watch?v=2kSPbH4jWME

In [39]:
# !pip3 install ipywidgets
# !brew install portaudio
# !pip install pyaudio
# !pip install vosk # optional if not using OpenAI speech-to-text 

In [49]:
import ipywidgets as widgets
from IPython.display import display
from queue import Queue
from threading import Thread
import pyaudio
import json
from vosk import Model, KaldiRecognizer
import time
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv
import os 
import wave

In [50]:
# Running transcript of the recorded audio: speech_recognition appends each
# transcribed segment to it and stop_recording displays it.
context_transcript = "" # context used to answer the question

In [42]:
# # Find audio device index using this code
# p = pyaudio.PyAudio()
# for i in range(p.get_device_count()):
#     print(p.get_device_info_by_index(i))

# p.terminate()

# Result of running the code above on my Mac: 'index': 0, 'structVersion': 2, 'name': 'MacBook Pro Microphone'

In [51]:
# Audio capture and file settings shared by the recording and transcription cells.
AUDIO_FORMAT = pyaudio.paInt16  # sample format handed to PyAudio when opening the stream
CHANNELS = 1                    # number of input channels
FRAME_RATE = 16000              # sampling rate in Hz
RECORD_SECONDS = 0.5            # approximate length of each audio segment queued for transcription
device_index = 0                # PyAudio input device index (see the device-listing snippet)
# Derive the sample width (in bytes) from AUDIO_FORMAT instead of hard-coding it, so the
# WAV header written later always matches the format the audio was captured in.
SAMPLE_SIZE = pyaudio.get_sample_size(AUDIO_FORMAT)
wav_filename = "recording.wav"  # scratch file each segment is written to before transcription

In [55]:
# Transcription of the spoken context (optionally reused for the question further below).
# `messages` doubles as the "keep going" flag: the worker loops run while it is non-empty.
# `recordings` hands captured audio segments from the recorder to the transcriber.
messages = Queue()
recordings = Queue()

# Controls shown under the cell: a Record button, a Stop button and an output area
# that the button callbacks write their status text into.
record_button = widgets.Button(
    description='Record', tooltip='Record', icon='microphone',
    button_style='success', disabled=False,
)
stop_button = widgets.Button(
    description='Stop', tooltip='Stop', icon='stop',
    button_style='warning', disabled=False,
)
output = widgets.Output()

def start_recording(data):
    """Record-button callback: start capturing and transcribing in background threads.

    Puts a flag on ``messages`` (the worker loops run while that queue is
    non-empty), then launches one thread running ``record_microphone`` and one
    running ``speech_recognition``.

    Args:
        data: Argument supplied by the button's ``on_click`` machinery; unused.
    """
    with output:
        # A non-empty `messages` queue means a recording is already in progress.
        # Starting again would launch a second recorder/transcriber pair and queue a
        # second flag, so a single Stop click would no longer stop the recording.
        if not messages.empty():
            display("Already recording.")
            return

        messages.put(True)
        display("Starting...")
        record = Thread(target=record_microphone)
        record.start()
        transcribe = Thread(target=speech_recognition, args=(output,))
        transcribe.start()

def stop_recording(data):
    """Stop-button callback: signal the worker threads to finish and show the transcript.

    Removes the flag from ``messages`` so the loops in ``record_microphone`` and
    ``speech_recognition`` wind down, then displays ``context_transcript``.

    Args:
        data: Argument supplied by the button's ``on_click`` machinery; unused.
    """
    from queue import Empty  # local import keeps this cell independent of the import cell

    with output:
        try:
            # Non-blocking: a plain get() would hang this callback forever if Stop
            # is clicked while nothing is being recorded (queue already empty).
            messages.get_nowait()
        except Empty:
            display("Not recording.")
            return

        display("Stopped.")
        display("Context transcript:")
        # NOTE: the transcriber thread may still be processing the last queued
        # segments, so the text shown here can lag behind the final transcript.
        display(context_transcript)

# Register the click handlers; each handler is called with a single argument.
record_button.on_click(start_recording)
stop_button.on_click(stop_recording)

# Render both buttons followed by the output area the handlers write into.
display(record_button, stop_button, output)

Button(button_style='success', description='Record', icon='microphone', style=ButtonStyle(), tooltip='Record')



Output()

In [52]:
def record_microphone(chunk=1024):
    """Capture microphone audio and queue it in segments until recording is stopped.

    Reads ``chunk`` frames at a time from the input device while ``messages`` is
    non-empty. Once roughly ``RECORD_SECONDS`` of audio has accumulated, the list of
    raw chunks is put on ``recordings``. Any partial segment left when the loop
    ends is queued as well.

    Args:
        chunk: Number of frames requested per read (also the stream buffer size).
    """
    # Loop-invariant: how many chunks make up one segment of about RECORD_SECONDS.
    chunks_per_segment = (FRAME_RATE * RECORD_SECONDS) / chunk

    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=AUDIO_FORMAT,
                        channels=CHANNELS,
                        rate=FRAME_RATE,
                        input=True,
                        input_device_index=device_index,
                        frames_per_buffer=chunk)
        try:
            frames = []
            while not messages.empty():
                frames.append(stream.read(chunk))
                if len(frames) >= chunks_per_segment:
                    # Hand the list over and start a fresh one; no copy is needed
                    # because this function never touches the old list again.
                    recordings.put(frames)
                    frames = []
            if frames:
                # Flush whatever was captured after the last full segment.
                recordings.put(frames)
        finally:
            # Release the device even if reading failed part-way through.
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

In [56]:
# OpenAI speech-to-text transcription.
# Locate the .env file and add the key-value pairs it specifies to the environment
# variables, then build the API client from the OPENAI_API_KEY entry.
_ = load_dotenv(find_dotenv())
openai_api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)

def get_transcription():
    """Send the audio stored in ``wav_filename`` to OpenAI and return the transcribed text.

    Returns:
        The ``text`` attribute of the transcription response from the ``whisper-1`` model.
    """
    # Context manager closes the file handle after the request (or if it raises);
    # the original left one handle open per transcribed segment.
    with open(wav_filename, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(model="whisper-1", file=audio_file)
    return transcription.text

def write_frames_to_file(frames):
    """Write a list of raw audio chunks to ``wav_filename`` as a WAV file.

    The header uses the module-level ``CHANNELS``, ``SAMPLE_SIZE`` and
    ``FRAME_RATE`` settings. An existing file at that path is overwritten.

    Args:
        frames: Iterable of ``bytes`` objects holding the raw audio data, in order.
    """
    # The context manager closes the writer even if a setter or the write raises.
    with wave.open(wav_filename, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(SAMPLE_SIZE)
        wf.setframerate(FRAME_RATE)
        wf.writeframes(b''.join(frames))

# transcription using openAI speech-to-text
def speech_recognition(output):
    """Transcribe queued audio segments until recording has stopped and the queue is drained.

    Each segment taken from ``recordings`` is written to the scratch WAV file,
    transcribed via ``get_transcription``, appended to the global
    ``context_transcript`` and echoed to ``output``.

    Args:
        output: Output widget whose ``append_stdout`` receives each transcribed piece.
    """
    from queue import Empty  # local import keeps this cell independent of the import cell

    global context_transcript
    poll_seconds = 1.0  # how long to wait for a segment before re-checking the stop flag

    while True:
        # Sample the flag BEFORE waiting: if recording had already stopped and a
        # full wait still yields nothing, the recorder has had time to flush its
        # final partial segment and there is nothing left to transcribe.
        was_recording = not messages.empty()
        try:
            # Timed wait instead of a bare get(): the original blocked forever
            # when recording stopped without a segment pending in the queue.
            frames = recordings.get(timeout=poll_seconds)
        except Empty:
            if not was_recording:
                break
            continue

        write_frames_to_file(frames)
        transcription = get_transcription()
        context_transcript += transcription
        output.append_stdout(transcription)

In [17]:
# using vosk 
# model = Model(model_name="vosk-model-small-en-us-0.15") # "vosk-model-en-us-0.22"
# rec = KaldiRecognizer(model, FRAME_RATE)
# def speech_recognition(output):
#     while not messages.empty():
#         frames = recordings.get()
#         rec.AcceptWaveform(b''.join(frames))
#         result = rec.Result()
#         text = json.loads(result)["text"]
#         output.append_stdout(text)
#         #time.sleep(1)