In [1]:
import sounddevice as sd
import numpy as np
import scipy.io.wavfile as wav
import whisper
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from dotenv import load_dotenv, find_dotenv
import os
from openai import OpenAI
import pyaudio
import threading
import time
import keyboard

load_dotenv(find_dotenv())

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# --- Settings ---
# Tunable parameters for microphone capture and end-of-speech detection.
FS = 44_100  # Capture sample rate in Hz
THRESHOLD = 50  # RMS level below which a chunk counts as silence (tune per microphone)
SILENCE_DURATION = 1.5  # Seconds of continuous silence that end a recording (tune to taste)
CHUNK_SIZE = 1024  # Frames read from the input stream per iteration
MIN_SPEECH_DURATION = 1  # Minimum duration of speech in seconds

In [3]:
# --- Globals ---
# Pause flag for the main loop: keyboard_listener() sets it to True on "-" and
# back to False on "+"; the main loop checks it once per iteration.
pause_loop = False

In [4]:
# Initialising Gemini and ConversationChain
chat_model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", max_output_tokens=4096)

# Hand the *named* buffer to the chain so that `memory` actually holds the
# conversation history and can be inspected or cleared from other cells.
memory = ConversationBufferMemory()
conversation = ConversationChain(
    llm=chat_model,
    memory=memory,
)

# Initialising whisper
# Loads the "large" checkpoint onto the CUDA device; this requires a GPU.
model = whisper.load_model("large", device="cuda")

In [5]:
# Text-to-speech client (OpenAI SDK). The API key is read from the
# OPENAI_API_KEY environment variable; it is None when the variable is unset.
tts_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def stream_tts(input_string):
    """Speak ``input_string`` aloud and block until playback has finished.

    The text is sent to the OpenAI speech endpoint (model ``tts-1``, voice
    ``nova``, raw ``pcm`` output) and the streamed bytes are written to a
    PyAudio output stream from a worker thread. The caller waits for that
    thread to end before returning.

    Args:
        input_string: Text to synthesise.

    Note:
        An exception raised inside the worker thread is reported by the
        ``threading`` exception hook and is not re-raised here; the function
        still returns instead of blocking forever.
    """
    def _stream_tts():
        audio = pyaudio.PyAudio()
        try:
            # 16-bit mono at 24 kHz, matching the PCM bytes requested below.
            output = audio.open(format=pyaudio.paInt16,
                                channels=1,
                                rate=24_000,
                                output=True)
            try:
                with tts_client.audio.speech.with_streaming_response.create(
                    model="tts-1",
                    voice="nova",
                    input=input_string,
                    response_format="pcm"
                ) as response:
                    for chunk in response.iter_bytes(1024):
                        output.write(chunk)

                print("FINISHED!!!!!!!!!!!!!!!!!!!!")
            finally:
                # Stop before closing so already-buffered audio is played out,
                # then release the device even if streaming failed.
                output.stop_stream()
                output.close()
        finally:
            audio.terminate()

    thread = threading.Thread(target=_stream_tts)
    thread.start()
    # join() returns whether the worker finished normally or raised, so a
    # failed request can no longer leave the caller waiting indefinitely.
    thread.join()



In [6]:
# --- Functions ---
def is_silent(data, threshold=None):
    """Return whether an audio chunk's RMS level is below the silence threshold.

    Args:
        data: Array-like of audio samples (the recorder passes int16 chunks).
        threshold: RMS level below which the chunk counts as silent.
            Defaults to the module-level ``THRESHOLD``.

    Returns:
        A truthy value when the chunk's RMS is below ``threshold``. An empty
        chunk is treated as silent.
    """
    if threshold is None:
        threshold = THRESHOLD

    # Square in float64: squaring int16 samples in place wraps around for any
    # amplitude above 181 and yields a wrong (even zero or NaN) RMS.
    samples = np.asarray(data, dtype=np.float64)
    if samples.size == 0:
        rms = np.float64(0.0)  # avoid the NaN that np.mean gives for no samples
    else:
        rms = np.sqrt(np.mean(np.square(samples)))
    print("RMS: ", rms)
    return rms < threshold

In [7]:
def record_speech(output_path="g97613g9f0g8.wav"):
    """Record one utterance from the default microphone and save it as a WAV file.

    Audio is read in blocks of ``CHUNK_SIZE`` frames (mono, int16, ``FS`` Hz).
    Nothing is kept until a block is judged non-silent by ``is_silent``; from
    that block onwards every block is stored. Recording stops once more than
    ``SILENCE_DURATION`` seconds' worth of consecutive silent blocks has been
    seen.

    Args:
        output_path: Destination of the WAV file. Defaults to the fixed
            scratch file name used by the rest of the notebook.

    Returns:
        The path the recording was written to.
    """
    print("Recording... Speak now!")

    # Number of consecutive silent blocks equal to SILENCE_DURATION seconds.
    max_silent_chunks = int(SILENCE_DURATION * FS / CHUNK_SIZE)

    # Collect blocks in a list and join once at the end; growing an array with
    # np.append recopies the whole recording on every block.
    recorded_chunks = []
    silent_chunks = 0
    speech_started = False

    with sd.InputStream(samplerate=FS, channels=1, dtype='int16') as stream:
        while True:
            chunk, overflowed = stream.read(CHUNK_SIZE)
            if overflowed:
                print("Warning: Input overflowed!")

            silent = is_silent(chunk)

            if not speech_started:
                if silent:
                    continue  # still waiting for the speaker to begin
                speech_started = True
                print("Speech detected, starting silence detection.")

            # Also keeps the block that triggered detection, so the onset of
            # the utterance is not clipped. flatten() copies to a 1-D array.
            recorded_chunks.append(chunk.flatten())

            if silent:
                silent_chunks += 1
                if silent_chunks > max_silent_chunks:
                    print("Silence detected, stopping recording.")
                    break
            else:
                silent_chunks = 0  # speech resumed, restart the silence count

    audio_data = np.concatenate(recorded_chunks)
    wav.write(output_path, FS, audio_data)

    return output_path

In [8]:
def keyboard_listener():
    """Register the pause/resume hotkeys and block until Esc is pressed.

    Pressing "+" clears the global ``pause_loop`` flag and pressing "-" sets
    it; each prints a confirmation. Every other key is ignored.
    """
    # Key name -> (new value for pause_loop, confirmation message).
    hotkeys = {
        "+": (False, "Set to continue on next loop"),
        "-": (True, "Set to pause on next loop"),
    }

    def on_key_press(event):
        global pause_loop
        binding = hotkeys.get(event.name)
        if binding is None:
            return
        pause_loop, message = binding
        print(message)

    keyboard.on_press(on_key_press)
    keyboard.wait('esc')

# Run the hotkey listener in the background. It is a daemon thread so that a
# listener still blocked waiting for Esc cannot keep the interpreter alive
# when the kernel or process shuts down.
listener_thread = threading.Thread(target=keyboard_listener, daemon=True)
listener_thread.start()

In [9]:
# Main assistant loop: record -> transcribe -> chat -> speak, repeated forever.
# The loop has no exit condition; interrupt the kernel to stop it.
while True:
    if pause_loop:
        # Paused from the keyboard listener ("-"); poll until resumed ("+").
        time.sleep(1)
        continue

    # --- Record Speech ---
    # NOTE(review): presumably a short settle time before re-opening the
    # microphone after playback - confirm.
    time.sleep(0.5)
    print("Recording speech...")
    wav_file = record_speech()

    # --- Speech to Text ---
    print("Converting speech to text...")
    text = model.transcribe(wav_file, language="en")
    user_text = (text.get("text") or "").strip()
    print("You said: ", user_text)

    if not user_text:
        # Nothing intelligible was captured; do not send an empty prompt to
        # the chat model or an empty string to the speech endpoint.
        print("Nothing was transcribed, listening again.")
        continue

    # --- Chatbot ---
    print("Chatting...")
    response = conversation.invoke(user_text)

    response2 = response.get("response")
    print("Chatbot: ", response2)

    # --- Text to Speech ---
    print("Converting text to speech...")
    stream_tts(response2)

Recording speech...
Recording... Speak now!
RMS:  12.649844057833283
RMS:  13.29682909804063
RMS:  9.173937693951274
RMS:  14.60425210983089
RMS:  16.149357814244503
RMS:  13.57002239543104
RMS:  9.605957217399004
RMS:  15.295813612309088
RMS:  18.471737279557654
RMS:  9.111880551647943
RMS:  6.504880979887334
RMS:  13.057639646008003
RMS:  14.05808958615999
RMS:  11.535610625363532
RMS:  12.985418505000137
RMS:  10.331658340387568
RMS:  11.383581927495404
RMS:  12.68022420690975
RMS:  8.142229328476324
RMS:  10.3785291437419
RMS:  8.363623336359666
RMS:  12.087732676147334
RMS:  10.13227940482792
RMS:  8.67760227539843
RMS:  8.234360175508478
RMS:  10.285903202320155
RMS:  6.956547612681164
RMS:  14.267327732445203
RMS:  12.669553193581848
RMS:  13.836023521048235
RMS:  14.12506913699894
RMS:  10.79803946325443
RMS:  12.821262247727406
RMS:  17.16785809477991
RMS:  14.301442234963577
RMS:  11.06695708234653
RMS:  7.014145863004276
RMS:  8.302802143704257
RMS:  7.195783270429981
RMS:  