## Loading the OpenAi-whisper model for STT

In [28]:
import whisper
model = whisper.load_model("base")
model.to("cuda") # using GPU

Whisper(
  (encoder): AudioEncoder(
    (conv1): Conv1d(80, 512, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(512, 512, kernel_size=(3,), stride=(2,), padding=(1,))
    (blocks): ModuleList(
      (0-5): 6 x ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=512, out_features=512, bias=True)
          (key): Linear(in_features=512, out_features=512, bias=False)
          (value): Linear(in_features=512, out_features=512, bias=True)
          (out): Linear(in_features=512, out_features=512, bias=True)
        )
        (attn_ln): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (mlp_ln): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNorm((512,), eps=1e-05,

In [19]:
# Checking if the model is loaded
def stt(output_file):
    result = model.transcribe(output_file)
    return result["text"]

## Loading the Kokoro model for TTS

In [31]:
from kokoro import KPipeline
from IPython.display import display, Audio
import soundfile as sfvoice
import soundfile as sf

def load_tts_model():
    # 3️⃣ Initalize a pipeline
    
    # 🇺🇸 'a' => American English, 🇬🇧 'b' => British English
    # 🇯🇵 'j' => Japanese: pip install misaki[ja]
    # 🇨🇳 'z' => Mandarin Chinese: pip install misaki[zh]
    pipeline = KPipeline(lang_code='b') # <= make sure lang_code matches voice
    return pipeline

pipeline = load_tts_model()



  WeightNorm.apply(module, name, dim)


In [None]:
# This text is for demonstration purposes only, unseen during training

def tts(text, pipeline, output_file="output.wav"):
    
    if text is "": 
        text = '''
    The sky above the port was the color of television, tuned to a dead channel. It's not like I'm using," Case heard someone say, as he shouldered his way through
    the crowd around the door of the Chat. "It's like my body's developed this massive drug deficiency."
    '''
    
    print("text: ", text)

    # 4️⃣ Generate, display, and save audio files in a loop.
    generator = pipeline(
        text, voice='af_heart', # <= change voice here
        speed=1.2,
          split_pattern=None
    )

    print("generator", generator)

    for i, (gs, ps, audio) in enumerate(generator):
        print(i)  # i => index
        print(gs) # gs => graphemes/text
        print(ps) # ps => phonemes
        display(Audio(data=audio, rate=24000, autoplay=i==0))
        sf.write(output_file, audio, 24000) # save each audio file
    
    return 

tts("THE NAME OF THE ", pipeline) # <= change voice here

  if text is "":


text THE NAME OF THE 
generator <generator object KPipeline.__call__ at 0x000001C93229AD50>
0
THE NAME OF THE
ðə nˈAm ˈɒv ðə


## Recording the voice in a loop

In [36]:
import pyaudio
import audioop
import wave
import time
import os
# Replace these imports with your actual STT/LLM/TTS libraries
import speech_recognition as sr  # Example STT (replace if using another)
# from grok_api import Grok  # Placeholder for LLM (e.g., xAI Grok API)

# Audio recording settings
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 1024
SILENCE_THRESHOLD = 500
SILENCE_DURATION = 2

# File storage settings
OUTPUT_DIR = "interactions"
os.makedirs(OUTPUT_DIR, exist_ok=True)
LOG_FILE = os.path.join(OUTPUT_DIR, "interaction_log.txt")

# Initialize PyAudio once (reused across loops)
audio = pyaudio.PyAudio()

def record_audio(filename):
    stream = audio.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
    
    print("Recording started... Speak now!")
    frames = []
    silent_chunks = 0
    chunks_per_second = RATE / CHUNK
    silence_chunks_needed = int(SILENCE_DURATION * chunks_per_second)

    while True:
        data = stream.read(CHUNK, exception_on_overflow=False)
        frames.append(data)
        rms = audioop.rms(data, 2)
        if rms < SILENCE_THRESHOLD:
            silent_chunks += 1
        else:
            silent_chunks = 0
        if silent_chunks >= silence_chunks_needed:
            print("Silence detected. Stopping recording.")
            break
    
    print("Recording finished.")
    stream.stop_stream()
    stream.close()

    wf = wave.open(filename, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(audio.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))
    wf.close()
    print(f"Audio saved as {filename}")
    return filename

def llm_process(text):
    # Placeholder LLM (replace with your actual LLM)
    # Example: Simple echo or use Grok API
    # grok = Grok(api_key="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")  # Hypothetical Grok API
    # response = grok.generate(text)  # Adjust based on actual API
    return text

def log_interaction(user_text, bot_text, audio_file):
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    with open(LOG_FILE, 'a') as f:
        f.write(f"[{timestamp}]\nUser: {user_text}\nBot: {bot_text}\nAudio: {audio_file}\n\n")


Starting interaction loop. Speak to begin, silence to end each turn.
Recording started... Speak now!
Silence detected. Stopping recording.
Recording finished.
Audio saved as interactions\user_20250320_210112.wav
User said:  Hey, how are you?
Bot says:  Hey, how are you?
 Hey, how are you? bot_text
text  Hey, how are you?


TypeError: 'str' object is not callable

In [42]:

# Main interaction loop
print("Starting interaction loop. Speak to begin, silence to end each turn.")
while True:
    # Generate unique filenames for this interaction
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    user_audio_file = os.path.join(OUTPUT_DIR, f"user_{timestamp}.wav")
    bot_audio_file = os.path.join(OUTPUT_DIR, f"bot_{timestamp}.mp3")  # MP3 for gTTS

    # Step 1: Record user input
    recorded_file = record_audio(user_audio_file)

    # Step 2: Convert to text via STT
    user_text = stt(recorded_file)
    print(f"User said: {user_text}")

    # Step 3: Get LLM response
    bot_text = llm_process(user_text)
    print(f"Bot says: {bot_text}")
    print(bot_text, "bot_text", bot_audio_file, "bot_audio_file")
    # Step 4: Convert bot response to speech via TTS
    tts(text= bot_text, pipeline=pipeline, output_file=bot_audio_file)

    # Step 5: Log the interaction
    log_interaction(user_text, bot_text, recorded_file)

# Cleanup (won’t reach here unless you break the loop)
audio.terminate()

Starting interaction loop. Speak to begin, silence to end each turn.
Recording started... Speak now!
Silence detected. Stopping recording.
Recording finished.
Audio saved as interactions\user_20250320_210643.wav
User said:  Hey, how are you?
Bot says:  Hey, how are you?
 Hey, how are you? bot_text interactions\bot_20250320_210643.mp3 bot_audio_file
text  Hey, how are you?
generator <generator object KPipeline.__call__ at 0x000001C90E8E6B20>
0
Hey, how are you?
hˈA, hˌW ɑː juː?


Recording started... Speak now!
Silence detected. Stopping recording.
Recording finished.
Audio saved as interactions\user_20250320_210648.wav
User said:  Thank you. Peace, how are you? Thank you.
Bot says:  Thank you. Peace, how are you? Thank you.
 Thank you. Peace, how are you? Thank you. bot_text interactions\bot_20250320_210648.mp3 bot_audio_file
text  Thank you. Peace, how are you? Thank you.
generator <generator object KPipeline.__call__ at 0x000001C887C45D20>
0
Thank you. Peace, how are you? Thank you.
θˈaŋk juː. pˈiːs, hˌW ɑː juː? θˈaŋk juː.


Recording started... Speak now!
Silence detected. Stopping recording.
Recording finished.
Audio saved as interactions\user_20250320_210656.wav
User said: 
Bot says: 
 bot_text interactions\bot_20250320_210656.mp3 bot_audio_file
text 
    The sky above the port was the color of television, tuned to a dead channel. It's not like I'm using," Case heard someone say, as he shouldered his way through
    the crowd around the door of the Chat. "It's like my body's developed this massive drug deficiency."
    
generator <generator object KPipeline.__call__ at 0x000001C90E8E6B20>
0
The sky above the port was the color of television, tuned to a dead channel. It's not like I'm using," Case heard someone say, as he shouldered his way through
    the crowd around the door of the Chat. "It's like my body's developed this massive drug deficiency."
ðə skˈI əbˈʌv ðə pˈɔːt wɒz ðə kˈʌlə ɒv tˈɛlɪvˌɪʒᵊn, tjˈuːnd tə ɐ dˈɛd ʧˈanᵊl. ˌɪts nˌɒt lˈIk ˌIm jˈuːzɪŋ,” kˈAs hˈɜːd sˈʌmwʌn sˈA, az hiː ʃˈQldəd hɪz wˈA θ

Recording started... Speak now!
Silence detected. Stopping recording.
Recording finished.
Audio saved as interactions\user_20250320_210700.wav
User said: 
Bot says: 
 bot_text interactions\bot_20250320_210700.mp3 bot_audio_file
text 
    The sky above the port was the color of television, tuned to a dead channel. It's not like I'm using," Case heard someone say, as he shouldered his way through
    the crowd around the door of the Chat. "It's like my body's developed this massive drug deficiency."
    
generator <generator object KPipeline.__call__ at 0x000001C90E8E6B20>
0
The sky above the port was the color of television, tuned to a dead channel. It's not like I'm using," Case heard someone say, as he shouldered his way through
    the crowd around the door of the Chat. "It's like my body's developed this massive drug deficiency."
ðə skˈI əbˈʌv ðə pˈɔːt wɒz ðə kˈʌlə ɒv tˈɛlɪvˌɪʒᵊn, tjˈuːnd tə ɐ dˈɛd ʧˈanᵊl. ˌɪts nˌɒt lˈIk ˌIm jˈuːzɪŋ,” kˈAs hˈɜːd sˈʌmwʌn sˈA, az hiː ʃˈQldəd hɪz wˈA θ

Recording started... Speak now!


KeyboardInterrupt: 

In [40]:
tts("THE NAME OF THE ", "output.wav") # <= change voice here

text THE NAME OF THE 


TypeError: 'str' object is not callable

In [21]:
!pip install grok_api

Collecting grok_api
  Downloading grok_api-0.1.1-py3-none-any.whl.metadata (3.4 kB)
Collecting curl_cffi (from grok_api)
  Downloading curl_cffi-0.10.0-cp39-abi3-win_amd64.whl.metadata (12 kB)
Downloading grok_api-0.1.1-py3-none-any.whl (5.0 kB)
Downloading curl_cffi-0.10.0-cp39-abi3-win_amd64.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ---------------------------------------- 1.4/1.4 MB 12.0 MB/s eta 0:00:00
Installing collected packages: curl_cffi, grok_api
Successfully installed curl_cffi-0.10.0 grok_api-0.1.1


In [25]:
from grok_api.client import Grok  # Hypothetical example

ModuleNotFoundError: No module named 'grok_api.client'; 'grok_api' is not a package