In [None]:
# Gemini conversation: https://gemini.google.com/app/83774c879215fb34?utm_source=app_launcher&utm_medium=owned&utm_campaign=base_all

In [None]:
#!pip install python-dotenv
#!pip -q install google-genai jinja2
#!pip -q install sounddevice
#!pip -q install numpy

In [1]:
from google import genai
from google.genai import types



In [3]:
from dotenv import load_dotenv
import os

# Pull the key/value pairs from the local env file into the process environment.
load_dotenv('my_keys.env')

# Fetch the key from the environment. Only its presence is reported below;
# the secret itself is never printed.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
api_key = GOOGLE_API_KEY

# A missing or empty value is treated as "not found".
print(
    "API key loaded successfully!"
    if api_key
    else "API key not found. Check your .env file and path."
)

API key loaded successfully!


In [5]:
import asyncio
import sounddevice as sd
import numpy as np


# Gemini API client, authenticated with the GOOGLE_API_KEY value read from the
# environment earlier in the notebook.
client = genai.Client(api_key=GOOGLE_API_KEY)

# Identifier of the live model that main() connects to.
MODEL = "gemini-2.0-flash-live-001"

# Options for the live connection: the only response modality requested is audio.
config = dict(response_modalities=["AUDIO"])



async def async_enumerate(aiterable, start=0):
    """Asynchronous counterpart of the built-in ``enumerate``.

    Args:
        aiterable: Any async iterable (an object usable in ``async for``).
        start: Index paired with the first item. Defaults to 0, which keeps
            the zero-based numbering existing callers rely on.

    Yields:
        ``(index, item)`` tuples, where ``index`` grows by one per item.
    """
    index = start
    async for item in aiterable:
        yield index, item
        index += 1



# --- Playback format handed to sounddevice ----------------------------------------------
# These values must describe the raw audio the model streams back. main() prints the
# MIME type of each turn's audio (e.g. "audio/pcm;rate=24000"); use it to confirm them.
SAMPLE_RATE = 24_000  # samples per second (other common rates: 8000, 16000)
CHANNELS = 1          # 1 = mono, 2 = stereo
DTYPE = "int16"       # 'int16' = 16-bit PCM; 'float32' = floating point samples
# ---------------------------------------------------------------------------------------


def _audio_mime_type(response):
    """Return the MIME type of the first inline-data part in ``response``, or None."""
    server_content = getattr(response, "server_content", None)
    model_turn = getattr(server_content, "model_turn", None)
    for part in getattr(model_turn, "parts", None) or ():
        inline_data = getattr(part, "inline_data", None)
        if inline_data is not None:
            return inline_data.mime_type
    return None


async def main():
    """Run an interactive text-in / audio-out chat loop against the live model.

    Opens one live session, then repeatedly:
      1. reads a line from the user ('q' quits, blank lines are ignored),
      2. sends it as a completed user turn,
      3. plays every audio chunk of the model's reply through a
         ``sounddevice`` output stream opened for that turn.

    Errors raised while receiving or playing a reply are reported and the
    loop continues with the next prompt (best-effort behavior).

    NOTE(review): ``input()`` and ``stream.write()`` are synchronous calls made
    from this coroutine, so the event loop does not run while they wait.
    """
    async with client.aio.live.connect(model=MODEL, config=config) as session:
        print(f"Connected to model: {MODEL}")
        print("Type 'q' to quit.")

        while True:
            message = input("User> ")
            command = message.strip()
            if command.lower() == "q":
                break
            if not command:
                # Nothing to say: prompt again rather than sending an empty turn.
                continue

            await session.send_client_content(
                turns={"role": "user", "parts": [{"text": message}]}, turn_complete=True
            )

            # Flush so the status shows up before we start waiting on the model.
            print("Model thinking...", end='', flush=True)

            try:
                # One output stream per turn; the context manager stops and
                # closes it when the turn's messages have all been handled.
                with sd.OutputStream(samplerate=SAMPLE_RATE, channels=CHANNELS, dtype=DTYPE) as stream:
                    mime_reported = False
                    async for response in session.receive():
                        audio_bytes = response.data  # read once; None when no audio
                        if audio_bytes is None:
                            continue

                        # Reinterpret the raw bytes as samples sounddevice can play.
                        audio_chunk = np.frombuffer(audio_bytes, dtype=DTYPE)
                        stream.write(audio_chunk)  # Play this audio chunk immediately!

                        if not mime_reported:
                            # Report the format once per turn, on the first message
                            # that actually carries audio, so it can be compared
                            # with SAMPLE_RATE / DTYPE (e.g. "audio/pcm;rate=24000").
                            print(f"\nModel response audio type: {_audio_mime_type(response)}")
                            mime_reported = True
                        print('.', end='', flush=True)
                # Printed after the stream context has exited, i.e. after the
                # stream was stopped and closed.
                print("\nAudio playback finished for this turn.")
            except Exception as e:
                # Deliberately broad: keep the chat loop alive after a failed turn.
                print(f"\nError during audio playback: {e}")
                print("Ensure SAMPLE_RATE, CHANNELS, and DTYPE match your model's audio output.")
                print("Also, check that PortAudio is installed correctly (e.g., 'brew install portaudio' on Mac).")

# Start the chat loop defined in main(). Top-level `await` is accepted in a
# Jupyter/IPython cell; in a plain Python script use asyncio.run(main()) instead.
await main()  # returns once the user enters 'q' at the prompt

Connected to model: gemini-2.0-flash-live-001
Type 'q' to quit.


User>  Hi


Model thinking...
Model response audio type: audio/pcm;rate=24000
........
Audio playback finished for this turn.


User>  How are you?


Model thinking...
Model response audio type: audio/pcm;rate=24000
...............
Audio playback finished for this turn.


User>  Im good thanks


Model thinking...
Model response audio type: audio/pcm;rate=24000
.........
Audio playback finished for this turn.


User>  q
