In [None]:
import sounddevice as sd
import numpy as np
import scipy.io.wavfile as wav
import whisper
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from dotenv import load_dotenv, find_dotenv
import os
from openai import OpenAI
import pyaudio
import threading
import time

# Load environment variables from the .env file that find_dotenv() locates,
# so later lookups such as os.getenv("OPENAI_API_KEY") can resolve.
load_dotenv(find_dotenv())

In [None]:
# --- Settings ---
# Capture rate handed to the input stream and the WAV writer, in samples/second.
FS = 44_100
# RMS level below which a chunk counts as silence (tune for your microphone).
THRESHOLD = 30
# Seconds of uninterrupted silence that end a recording (tune to taste).
SILENCE_DURATION = 2
# Number of frames requested from the input stream per read.
CHUNK_SIZE = 1024

In [None]:
# Initialising Gemini and ConversationChain
chat_model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", max_output_tokens=4096)
# Keep one memory object and hand that same object to the chain, so `memory`
# reflects the running conversation (previously the chain received a second,
# separate ConversationBufferMemory and `memory` stayed empty and unused).
memory = ConversationBufferMemory()
conversation = ConversationChain(
    llm=chat_model,
    memory=memory,
)

# Initialising whisper
# No explicit device: whisper.load_model then picks CUDA when it is available
# and falls back to CPU otherwise, instead of failing on machines without a GPU.
model = whisper.load_model("base")

In [None]:
# Initialising text2speech
tts_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def stream_tts(input_string):
    """Speak `input_string` through the default audio output and block until done.

    The text is sent to the OpenAI "tts-1" model (voice "nova") with the raw
    "pcm" response format, and the returned bytes are written to a PyAudio
    output stream (16-bit, mono, 24 kHz) as they arrive. Playback runs in a
    worker thread; this function returns only after that thread has finished.

    Args:
        input_string: The text to synthesise and play.

    Raises:
        Exception: Any error raised while opening the audio device or while
            streaming the speech is re-raised in the calling thread.
            (Previously such an error left the caller waiting forever.)
    """
    worker_errors = []

    def _stream_tts():
        try:
            p = pyaudio.PyAudio()
            try:
                # paInt16 is the named constant for the literal 8 used before.
                stream = p.open(format=pyaudio.paInt16,
                                channels=1,
                                rate=24_000,
                                output=True)
                try:
                    with tts_client.audio.speech.with_streaming_response.create(
                        model="tts-1",
                        voice="nova",
                        input=input_string,
                        response_format="pcm"
                    ) as response:
                        for chunk in response.iter_bytes(1024):
                            stream.write(chunk)
                finally:
                    # Always release the output stream, even on failure.
                    stream.stop_stream()
                    stream.close()
            finally:
                # Always release the PortAudio resources held by this instance.
                p.terminate()
            print("FINISHED!!!!!!!!!!!!!!!!!!!!")
        except Exception as exc:
            # Hand the failure to the caller instead of losing it in the thread.
            worker_errors.append(exc)

    thread = threading.Thread(target=_stream_tts)
    thread.start()
    # join() returns whether the worker succeeded or failed, so the caller
    # can never be left blocked on an event that is never set.
    thread.join()

    if worker_errors:
        raise worker_errors[0]



In [None]:
# --- Functions ---
def is_silent(data, threshold=None):
    """Return True when the RMS level of `data` is below the silence threshold.

    The samples are converted to float64 before squaring. Squaring int16
    samples directly wraps around for any amplitude above 181, which made
    loud chunks produce garbage (even 0 or NaN) RMS values.

    Args:
        data: Array-like of audio samples (any shape; all values are used).
        threshold: RMS level below which the data counts as silent.
            Defaults to the module-level THRESHOLD when omitted.

    Returns:
        bool: True if the RMS is below the threshold. Empty input is
        treated as silent.
    """
    if threshold is None:
        threshold = THRESHOLD

    samples = np.asarray(data, dtype=np.float64)
    if samples.size == 0:
        # Mean of an empty array is NaN; no samples means nothing was heard.
        return True

    rms = np.sqrt(np.mean(np.square(samples)))
    #print("RMS: ", rms)
    return bool(rms < threshold)

In [None]:
# Recording microphone input for speech2text
def record_speech(output_path="g97613g9f0g8.wav"):
    """Record from the default microphone until a stretch of silence, then save a WAV.

    Audio is read in CHUNK_SIZE-frame chunks at FS samples/second (mono,
    int16). Recording stops once more than SILENCE_DURATION seconds' worth of
    consecutive chunks are judged silent by is_silent(). The recording is
    written to `output_path`.

    Args:
        output_path: Where to write the WAV file. Defaults to the file name
            this function has always used.

    Returns:
        str: The path of the written WAV file.
    """
    print("Recording... Speak now!")
    recorded_chunks = []  # collected and joined once; avoids re-copying per chunk
    silent_chunks = 0
    # Number of consecutive silent chunks that must be exceeded to stop.
    max_silent_chunks = int(SILENCE_DURATION * FS / CHUNK_SIZE)

    with sd.InputStream(samplerate=FS, channels=1, dtype='int16') as stream:
        while True:
            chunk, overflowed = stream.read(CHUNK_SIZE)
            if overflowed:
                print("Warning: Input overflowed!")
            recorded_chunks.append(chunk)

            if is_silent(chunk):
                #print("Silent Chunk Detected!!")
                silent_chunks += 1
            else:
                silent_chunks = 0

            if silent_chunks > max_silent_chunks:
                #print("Silence detected, stopping recording.")
                break

    # Flatten to a 1-D int16 array, matching what np.append used to produce.
    audio_data = np.concatenate(recorded_chunks).reshape(-1)
    wav.write(output_path, FS, audio_data)

    return output_path


In [None]:
# Conversation loop: record -> transcribe -> chat -> speak, repeated until the
# cell is interrupted (Kernel > Interrupt / Ctrl+C).
try:
    while True:
        # --- Record Speech ---
        print("Recording in 2...")
        time.sleep(1)
        print("Recording in 1...")
        time.sleep(1)
        wav_file = record_speech()

        # --- Speech to Text ---
        print("Converting speech to text...")
        transcription = model.transcribe(wav_file, language="en")
        user_text = transcription.get("text") or ""
        print("You said: ", user_text)

        # Nothing intelligible was captured: don't send an empty prompt to the
        # chatbot, just listen again.
        if not user_text.strip():
            print("Recording again!")
            continue

        # --- Chatbot ---
        print("Chatting...")
        response = conversation.invoke(user_text)

        reply = response.get("response")
        print("Chatbot: ", reply)

        # --- Text to Speech ---
        # Skip synthesis when the model returned no text to speak.
        if reply and reply.strip():
            print("Converting text to speech...")
            stream_tts(reply)

        print("Recording again!")
except KeyboardInterrupt:
    # Interrupting the cell ends the conversation without a traceback.
    print("Conversation stopped.")