In [1]:
import sounddevice as sd
import numpy as np
import scipy.io.wavfile as wav
import whisper
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from dotenv import load_dotenv, find_dotenv
import os
from openai import OpenAI
import pyaudio
import threading
import time
import keyboard

load_dotenv(find_dotenv())

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# --- Settings ---
FS = 44100               # Sampling frequency
THRESHOLD = 30          # Volume threshold for silence (adjust this)
SILENCE_DURATION = 2   # Seconds of silence before stopping (adjust this)
CHUNK_SIZE = 1024        # Process audio in chunks for efficiency

In [3]:
# --- Globals ---
pause_loop = False

In [4]:
# Initialising Gemini and ConversationChain
chat_model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", max_output_tokens=4096)
memory = ConversationBufferMemory()
conversation = ConversationChain(
    llm=chat_model,
    memory=ConversationBufferMemory(),
)

# Initialising whisper
model = whisper.load_model("base", device="cuda")

In [5]:
# Initialising text2speech
tts_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def stream_tts(input_string):
    def _stream_tts():
        p = pyaudio.PyAudio()
        stream = p.open(format=8,
                        channels=1,
                        rate=24_000,
                        output=True)
        with tts_client.audio.speech.with_streaming_response.create(
            model="tts-1",
            voice="nova",
            input=input_string,
            response_format="pcm"
        ) as response:
            for chunk in response.iter_bytes(1024):
                stream.write(chunk)
                
        print("FINISHED!!!!!!!!!!!!!!!!!!!!")
        thread_done.set()

    thread_done = threading.Event()

    thread = threading.Thread(target=_stream_tts)
    thread.start()
    thread_done.wait()



In [6]:
# --- Functions ---
def is_silent(data):
    rms = np.sqrt(np.mean(data**2))
    #print("RMS: ", rms)
    return rms < THRESHOLD

In [7]:
# Initialising speech2text
def record_speech():
    print("Recording... Speak now!")
    audio_data = np.array([], dtype=np.int16)  # Initialize empty array
    silent_chunks = 0

    with sd.InputStream(samplerate=FS, channels=1, dtype='int16') as stream:
        while True:
            chunk, overflowed = stream.read(CHUNK_SIZE)
            if overflowed:
                print("Warning: Input overflowed!")
            audio_data = np.append(audio_data, chunk)

            if is_silent(chunk):
                #print("Silent Chunk Detected!!")
                silent_chunks += 1
            else:
                silent_chunks = 0

            if silent_chunks > int(SILENCE_DURATION * FS / CHUNK_SIZE):
                #print("Silence detected, stopping recording.")
                break
    
    wav.write("g97613g9f0g8.wav", FS, audio_data)

    return "g97613g9f0g8.wav"


In [8]:
def keyboard_listener():
    global pause_loop
    
    def on_key_press(event):
        global pause_loop
        #print("Key pressed: {}" .format(event.name))
        if event.name == "+":
            pause_loop = False
            print("Set to continue on next loop")
        elif event.name == "-":
            pause_loop = True
            print("Set to pause on next loop")
    
    keyboard.on_press(on_key_press)
    keyboard.wait('esc')

listener_thread = threading.Thread(target=keyboard_listener)
listener_thread.start()

In [9]:
while True:
    if not pause_loop:
        # --- Record Speech ---
        time.sleep(0.5)
        print("Recording speech...")
        wav_file = record_speech()

        # --- Speech to Text ---
        print("Converting speech to text...")
        text = model.transcribe(wav_file, language="en")
        print("You said: ", text.get("text"))

        # --- Chatbot ---
        print("Chatting...")
        response = conversation.invoke(text.get("text"))

        response2 = response.get("response")
        print("Chatbot: ", response2)

        # --- Text to Speech ---
        print("Converting text to speech...")
        stream_tts(response2)

    else:
        time.sleep(1)

Recording speech...
Recording... Speak now!


  rms = np.sqrt(np.mean(data**2))


Set to pause on next loop
Converting speech to text...
You said:   Hey, when did the iPhone 14 launch?
Chatting...
Chatbot:  The iPhone 14 was announced on September 7, 2022, and was released on September 16, 2022.  It was the successor to the iPhone 13, and came in four models: iPhone 14, iPhone 14 Plus, iPhone 14 Pro, and iPhone 14 Pro Max.  It featured a new A16 Bionic chip, a new camera system, and a new safety feature called Crash Detection. 

Converting text to speech...
FINISHED!!!!!!!!!!!!!!!!!!!!
Set to continue on next loop
Recording speech...
Recording... Speak now!


  rms = np.sqrt(np.mean(data**2))


Converting speech to text...
You said:   Yes, pretty cool. So what are the main differences between that and the iPhone 13, for example?
Chatting...
Chatbot:  The iPhone 14 has a few key differences from the iPhone 13. Here's a breakdown:

**Processor:** The iPhone 14 features the new A16 Bionic chip, which is faster and more efficient than the A15 Bionic chip in the iPhone 13.

**Camera:** The iPhone 14 has a slightly improved camera system compared to the iPhone 13. It features a new main camera sensor with a larger aperture, which allows for better low-light performance. The iPhone 14 also has a new Photonic Engine that helps improve image quality across all lighting conditions.

**Safety Features:** The iPhone 14 introduces a new safety feature called Crash Detection. This feature uses sensors to detect a car crash and automatically contacts emergency services. 

**Other Differences:**

* **iPhone 14 Plus:** This is a new model that was not available in the iPhone 13 lineup. It fea

Exception in thread Thread-17 (_stream_tts):
Traceback (most recent call last):
  File "c:\Users\niran\AppData\Local\Programs\Python\Python311\Lib\site-packages\httpx\_transports\default.py", line 69, in map_httpcore_exceptions
    yield
  File "c:\Users\niran\AppData\Local\Programs\Python\Python311\Lib\site-packages\httpx\_transports\default.py", line 113, in __iter__
    for part in self._httpcore_stream:
  File "c:\Users\niran\AppData\Local\Programs\Python\Python311\Lib\site-packages\httpcore\_sync\connection_pool.py", line 367, in __iter__
    raise exc from None
  File "c:\Users\niran\AppData\Local\Programs\Python\Python311\Lib\site-packages\httpcore\_sync\connection_pool.py", line 363, in __iter__
    for part in self._stream:
  File "c:\Users\niran\AppData\Local\Programs\Python\Python311\Lib\site-packages\httpcore\_sync\http11.py", line 349, in __iter__
    raise exc
  File "c:\Users\niran\AppData\Local\Programs\Python\Python311\Lib\site-packages\httpcore\_sync\http11.py", line 