In [1]:
import sounddevice as sd
import numpy as np
import scipy.io.wavfile as wav
import whisper
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from dotenv import load_dotenv, find_dotenv
import os
from openai import OpenAI
import pyaudio
import threading
import time
import keyboard

load_dotenv(find_dotenv())

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# --- Settings ---
FS = 44100               # Sampling frequency
THRESHOLD = 50          # Volume threshold for silence (adjust this)
SILENCE_DURATION = 1.5   # Seconds of silence before stopping (adjust this)
CHUNK_SIZE = 1024        # Process audio in chunks for efficiency

In [3]:
# --- Globals ---
pause_loop = False

In [4]:
# Initialising Gemini and ConversationChain
chat_model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", max_output_tokens=4096)
memory = ConversationBufferMemory()
conversation = ConversationChain(
    llm=chat_model,
    memory=ConversationBufferMemory(),
)

# Initialising whisper
model = whisper.load_model("large", device="cuda")

In [5]:
# Initialising text2speech
tts_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def stream_tts(input_string):
    def _stream_tts():
        p = pyaudio.PyAudio()
        stream = p.open(format=8,
                        channels=1,
                        rate=24_000,
                        output=True)
        with tts_client.audio.speech.with_streaming_response.create(
            model="tts-1",
            voice="nova",
            input=input_string,
            response_format="pcm"
        ) as response:
            for chunk in response.iter_bytes(1024):
                stream.write(chunk)
                
        print("FINISHED!!!!!!!!!!!!!!!!!!!!")
        thread_done.set()

    thread_done = threading.Event()

    thread = threading.Thread(target=_stream_tts)
    thread.start()
    thread_done.wait()



In [6]:
# --- Functions ---
def is_silent(data):
    rms = np.sqrt(np.mean(data**2))
    print("RMS: ", rms)
    return rms < THRESHOLD

In [7]:
# Initialising speech2text
def record_speech():
    print("Recording... Speak now!")
    audio_data = np.array([], dtype=np.int16)  # Initialize empty array
    silent_chunks = 0

    with sd.InputStream(samplerate=FS, channels=1, dtype='int16') as stream:
        while True:
            chunk, overflowed = stream.read(CHUNK_SIZE)
            if overflowed:
                print("Warning: Input overflowed!")
            audio_data = np.append(audio_data, chunk)

            

            if is_silent(chunk):
                #print("Silent Chunk Detected!!")
                silent_chunks += 1
            else:
                silent_chunks = 0

            if silent_chunks > int(SILENCE_DURATION * FS / CHUNK_SIZE):
                #print("Silence detected, stopping recording.")
                break
    
    wav.write("g97613g9f0g8.wav", FS, audio_data)

    return "g97613g9f0g8.wav"


In [8]:
def keyboard_listener():
    global pause_loop
    
    def on_key_press(event):
        global pause_loop
        #print("Key pressed: {}" .format(event.name))
        if event.name == "+":
            pause_loop = False
            print("Set to continue on next loop")
        elif event.name == "-":
            pause_loop = True
            print("Set to pause on next loop")
    
    keyboard.on_press(on_key_press)
    keyboard.wait('esc')

listener_thread = threading.Thread(target=keyboard_listener)
listener_thread.start()

In [9]:
while True:
    if not pause_loop:
        # --- Record Speech ---
        time.sleep(0.5)
        print("Recording speech...")
        wav_file = record_speech()

        # --- Speech to Text ---
        print("Converting speech to text...")
        text = model.transcribe(wav_file, language="en")
        print("You said: ", text.get("text"))

        # --- Chatbot ---
        print("Chatting...")
        response = conversation.invoke(text.get("text"))

        response2 = response.get("response")
        print("Chatbot: ", response2)

        # --- Text to Speech ---
        print("Converting text to speech...")
        stream_tts(response2)

    else:
        time.sleep(1)

Recording speech...
Recording... Speak now!
RMS:  8.179862850317235
RMS:  13.169233498480464
RMS:  9.589524721017199
RMS:  11.525320291102542
RMS:  15.03888189826624
RMS:  8.966789855488976
RMS:  15.885607922345937
RMS:  11.136756428601641
RMS:  12.688154231900715
RMS:  12.548149452708156
RMS:  11.340564290964537
RMS:  12.460915458444456
RMS:  8.010795938138731
RMS:  10.33775317102319
RMS:  9.863323793225081
RMS:  7.364332857598168
RMS:  16.264356638828357
RMS:  10.83537140284079
RMS:  8.447447839436476
RMS:  11.864016301931652
RMS:  12.656134258730033
RMS:  9.847816620322497
RMS:  15.178141406723684
RMS:  13.777139409634353
RMS:  12.245056400441772
RMS:  10.040251024003334
RMS:  14.165456443404851
RMS:  14.602479990570094
RMS:  15.461393452726051
RMS:  10.40944020168712
RMS:  8.579362957119836
RMS:  13.231930485571635
RMS:  11.213830746002902
RMS:  17.97763367354002
RMS:  13.923654838529286
RMS:  11.397213886516301
RMS:  16.948883397955747
RMS:  14.084392680818722
RMS:  16.16770024616

  rms = np.sqrt(np.mean(data**2))


RMS:  30.39617960163744
RMS:  48.06692924974509
RMS:  73.6598893352074
RMS:  66.8296502193263
RMS:  80.43176941513845
RMS:  69.28343478738262
RMS:  49.64696656455961
RMS:  21.492458706485863
RMS:  32.092639343002006
RMS:  24.782038133747193
RMS:  49.05762316526352
RMS:  68.71205629063579
RMS:  42.44592147662718
RMS:  63.171602537750296
RMS:  62.65387308459071
RMS:  26.477769272070486
RMS:  15.927417153685026
RMS:  32.56567882756323
RMS:  nan
RMS:  29.392066186090084
RMS:  26.434526280746926
RMS:  43.60278804159202
RMS:  36.80532347280078
RMS:  33.14708874377205
RMS:  26.331588236612316
RMS:  38.616465986836495
RMS:  37.36399817170266
RMS:  22.578638618725442
RMS:  29.592479513383125
RMS:  nan
RMS:  43.5591484542857
RMS:  50.80444662502486
RMS:  47.575525893704004
RMS:  44.282352667992924
RMS:  17.95311820060794
RMS:  30.65535804003111
RMS:  32.730776204323355
RMS:  33.717605981519505
RMS:  39.41378298339935
RMS:  51.37050428930497
RMS:  31.62290012791363
RMS:  30.844747324301427
RMS:  