In [1]:
import sounddevice as sd
import numpy as np
import scipy.io.wavfile as wav
import whisper
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from dotenv import load_dotenv, find_dotenv
import os
from openai import OpenAI
import pyaudio
import threading
import time
import keyboard

load_dotenv(find_dotenv())

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
initial_prompt = ""

In [3]:
# --- Settings ---
FS = 44100               # Sampling frequency
THRESHOLD = 50          # Volume threshold for silence (adjust this)
SILENCE_DURATION = 1.5   # Seconds of silence before stopping (adjust this)
CHUNK_SIZE = 1024        # Process audio in chunks for efficiency

In [4]:
# --- Globals ---
pause_loop = False

In [5]:
# Initialising Gemini and ConversationChain
chat_model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", max_output_tokens=4096)
memory = ConversationBufferMemory()
conversation = ConversationChain(
    llm=chat_model,
    memory=ConversationBufferMemory(),
)

# Initialising whisper
model = whisper.load_model("large", device="cuda")

In [5]:
# Initialising text2speech
tts_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def stream_tts(input_string):
    def _stream_tts():
        p = pyaudio.PyAudio()
        stream = p.open(format=8,
                        channels=1,
                        rate=24_000,
                        output=True)
        with tts_client.audio.speech.with_streaming_response.create(
            model="tts-1",
            voice="nova",
            input=input_string,
            response_format="pcm"
        ) as response:
            for chunk in response.iter_bytes(1024):
                stream.write(chunk)
                
        print("FINISHED!!!!!!!!!!!!!!!!!!!!")
        thread_done.set()

    thread_done = threading.Event()

    thread = threading.Thread(target=_stream_tts)
    thread.start()
    thread_done.wait()



In [6]:
# --- Functions ---
def is_silent(data):
    rms = np.sqrt(np.mean(data**2))
    print("RMS: ", rms)
    return rms < THRESHOLD

In [7]:
# Initialising speech2text
def record_speech():
    print("Recording... Speak now!")
    audio_data = np.array([], dtype=np.int16)  # Initialize empty array
    silent_chunks = 0

    with sd.InputStream(samplerate=FS, channels=1, dtype='int16') as stream:
        while True:
            chunk, overflowed = stream.read(CHUNK_SIZE)
            if overflowed:
                print("Warning: Input overflowed!")
            audio_data = np.append(audio_data, chunk)

            

            if is_silent(chunk):
                #print("Silent Chunk Detected!!")
                silent_chunks += 1
            else:
                silent_chunks = 0

            if silent_chunks > int(SILENCE_DURATION * FS / CHUNK_SIZE):
                #print("Silence detected, stopping recording.")
                break
    
    wav.write("g97613g9f0g8.wav", FS, audio_data)

    return "g97613g9f0g8.wav"


In [8]:
def keyboard_listener():
    global pause_loop
    
    def on_key_press(event):
        global pause_loop
        #print("Key pressed: {}" .format(event.name))
        if event.name == "+":
            pause_loop = False
            print("Set to continue on next loop")
        elif event.name == "-":
            pause_loop = True
            print("Set to pause on next loop")
    
    keyboard.on_press(on_key_press)
    keyboard.wait('esc')

listener_thread = threading.Thread(target=keyboard_listener)
listener_thread.start()

In [9]:
while True:
    if not pause_loop:
        # --- Record Speech ---
        time.sleep(0.5)
        print("Recording speech...")
        wav_file = record_speech()

        # --- Speech to Text ---
        print("Converting speech to text...")
        text = model.transcribe(wav_file, language="en")
        print("You said: ", text.get("text"))

        # --- Chatbot ---
        print("Chatting...")
        response = conversation.invoke(text.get("text"))

        response2 = response.get("response")
        print("Chatbot: ", response2)

        # --- Text to Speech ---
        print("Converting text to speech...")
        stream_tts(response2)

    else:
        time.sleep(1)

Recording speech...
Recording... Speak now!
RMS:  8.96057990241145
RMS:  7.8271737196449145
RMS:  9.234256027152377
RMS:  9.906989325723531
RMS:  11.375515098227421
RMS:  11.818905340068513
RMS:  9.782947137110575
RMS:  13.864156799098891
RMS:  9.807124323674092
RMS:  12.355374273469016
RMS:  11.177719188188616
RMS:  9.527612010624697
RMS:  7.2029724766932155
RMS:  6.956266846340787
RMS:  7.819933963116313
RMS:  7.745147170164038
RMS:  8.409676346328673
RMS:  8.26507854998971
RMS:  6.468070616497627
RMS:  7.715776249023814
RMS:  7.595614483042172
RMS:  8.676983292740628
RMS:  8.869606635866102
RMS:  8.10399930204217
RMS:  8.096283387301854
RMS:  6.827391980104848
RMS:  8.895003864529796
RMS:  10.339311736643788
RMS:  8.702549913674726
RMS:  10.859363758986987
RMS:  7.499674472102106
RMS:  7.867245686229711
RMS:  11.130221738020317
RMS:  7.630889119231127
RMS:  9.897719118564641
RMS:  36.18285807879195
RMS:  72.8190447702385
RMS:  45.347701843643634
RMS:  25.733269747060906
RMS:  nan
RM

  rms = np.sqrt(np.mean(data**2))


RMS:  38.13420739228101
RMS:  41.66199322749093
RMS:  74.90154344829952
RMS:  50.78855716657542
RMS:  27.931591936819498
RMS:  18.164316194464906
RMS:  14.712910200313873
RMS:  15.241128310840375
RMS:  29.080605919151683
RMS:  37.3306200767212
RMS:  34.85528116369168
RMS:  8.868230252564487
RMS:  10.245235172996274
RMS:  16.764743977317398
RMS:  48.84116075990619
RMS:  82.15444915295215
RMS:  56.17062976714343
RMS:  50.02754124292438
RMS:  12.362090804653556
RMS:  31.501379558084437
RMS:  35.23258379593384
RMS:  13.01298930396087
RMS:  13.177721493111015
RMS:  32.363354565773925
RMS:  26.115167108885213
RMS:  33.24651943719071
RMS:  69.07828580901888
RMS:  38.51867698354267
RMS:  24.792754265359466
RMS:  15.964957964633042
RMS:  10.015710315549267
RMS:  14.656183368718475
RMS:  17.429881848423413
RMS:  9.55274216193968
RMS:  9.211172733425425
RMS:  7.829169715716986
RMS:  6.8459614007091805
RMS:  8.462001462420105
RMS:  8.126922849393859
RMS:  16.92688781938369
RMS:  39.430442118051374