## Install Dependencies

In [None]:
%pip install azure-ai-voicelive pyaudio python-dotenv azure-identity --quiet

## Import Libraries

In [None]:
from __future__ import annotations
import os
import sys
import asyncio
import json
import base64
from datetime import datetime, timezone
import logging
import queue
from typing import Union, Optional, Dict, Any, Mapping, Callable, TYPE_CHECKING, cast

from azure.core.credentials import AzureKeyCredential
from azure.core.credentials_async import AsyncTokenCredential
from azure.identity.aio import AzureCliCredential, DefaultAzureCredential

from azure.ai.voicelive.aio import connect
from azure.ai.voicelive.models import (
    AudioEchoCancellation,
    AudioNoiseReduction,
    AzureStandardVoice,
    InputAudioFormat,
    ItemType,
    Modality,
    OutputAudioFormat,
    RequestSession,
    ServerEventType,
    ServerVad,
    FunctionTool,
    FunctionCallOutputItem,
    ToolChoiceLiteral,
    AudioInputTranscriptionOptions,
    Tool,
)
from dotenv import load_dotenv
import pyaudio

if TYPE_CHECKING:
    from azure.ai.voicelive.aio import VoiceLiveConnection

print("Libraries imported successfully!")

## Load Environment Variables

Create a `.env` file in the same directory with the following variables:
```
AZURE_VOICELIVE_ENDPOINT=https://your-resource-name.services.ai.azure.com/
AZURE_VOICELIVE_API_KEY=your-api-key
AZURE_VOICELIVE_MODEL=gpt-realtime
AZURE_VOICELIVE_VOICE=en-US-Ava:DragonHDLatestNeural
```

In [None]:
# Read settings from the local .env file; override=True lets values in the
# file replace variables that are already set in the process environment
load_dotenv('./.env', override=True)

# Connection settings, each falling back to a default when the variable is unset
ENDPOINT = os.getenv("AZURE_VOICELIVE_ENDPOINT", "https://your-resource-name.services.ai.azure.com/")
API_KEY = os.getenv("AZURE_VOICELIVE_API_KEY", "")
MODEL = os.getenv("AZURE_VOICELIVE_MODEL", "gpt-realtime")
VOICE = os.getenv("AZURE_VOICELIVE_VOICE", "en-US-Ava:DragonHDLatestNeural")

# Flip to True to authenticate with the Azure CLI credential instead of the API key
USE_TOKEN_CREDENTIAL = False

# System prompt handed to the session; it names both callable functions
INSTRUCTIONS = """
You are a helpful AI assistant with access to functions.
Use the functions when appropriate to provide accurate, real-time information.
If you are asked about the weather, please call the get_current_weather function.
If you are asked about the time, please call the get_current_time function.
Explain when you're using a function and include the results in your response naturally.
Always start the conversation in English.
"""

# Echo the effective configuration (the API key is deliberately not printed)
print(
    f"Endpoint: {ENDPOINT}",
    f"Model: {MODEL}",
    f"Voice: {VOICE}",
    f"Use Token Credential: {USE_TOKEN_CREDENTIAL}",
    sep="\n",
)

## Set Up Logging

In [None]:
# Set up logging
# exist_ok=True removes the check-then-create race and is a no-op when the
# folder is already there
os.makedirs('logs', exist_ok=True)

# One log file per run of this cell, named after the local start time
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
logfilename = f'logs/{timestamp}_voicelive_function_calling.log'

# force=True replaces any handlers already attached to the root logger (for
# example from an earlier run of this cell). Without it basicConfig silently
# does nothing in that situation and the file named below never receives records.
logging.basicConfig(
    filename=logfilename,
    filemode="w",
    format='%(asctime)s:%(name)s:%(levelname)s:%(message)s',
    level=logging.INFO,
    force=True,
)
logger = logging.getLogger(__name__)

print(f"Logging to: {logfilename}")

## Import AudioProcessor

Import the reusable AudioProcessor class from `audio_processor.py` which handles real-time audio capture and playback.

In [None]:
from audio_processor import AudioProcessor

print("AudioProcessor imported from audio_processor.py!")

## Define Function Tools

These are the backend functions that can be called by the Voice Live assistant:
- `get_current_time`: Returns the current time and date, in local time or UTC
- `get_current_weather`: Returns simulated weather data for a location

In [None]:
def get_current_time(arguments: Optional[Union[str, Mapping[str, Any]]] = None) -> Dict[str, Any]:
    """Return the current time and date, in UTC or the machine's local time.

    Args:
        arguments: Function-call arguments, either a JSON string or an
            already-decoded mapping. Only the ``"timezone"`` key is read:
            the string ``"utc"`` (any letter case) selects UTC. Anything
            else, including missing, malformed or non-object arguments,
            falls back to local time.

    Returns:
        A dict with ``"time"`` (``%I:%M:%S %p``), ``"date"``
        (``%A, %B %d, %Y``) and ``"timezone"`` (``"UTC"`` or ``"local"``).
    """
    args: Any = arguments
    if isinstance(args, str):
        try:
            args = json.loads(args)
        except json.JSONDecodeError:
            args = None

    # Valid JSON is not necessarily an object ("null", "[]", "3"), and callers
    # may pass any Mapping, so check the shape instead of the exact dict type.
    if not isinstance(args, Mapping):
        args = {}

    requested = args.get("timezone", "local")
    # A non-string value (e.g. JSON null) cannot name a timezone; treat as local
    use_utc = isinstance(requested, str) and requested.strip().lower() == "utc"

    if use_utc:
        now = datetime.now(timezone.utc)
        timezone_name = "UTC"
    else:
        now = datetime.now()
        timezone_name = "local"

    return {
        "time": now.strftime("%I:%M:%S %p"),
        "date": now.strftime("%A, %B %d, %Y"),
        "timezone": timezone_name,
    }


def get_current_weather(arguments: Union[str, Mapping[str, Any]]) -> Dict[str, Any]:
    """Return simulated current weather for a location.

    The values are fixed placeholders; no weather service is contacted.

    Args:
        arguments: Function-call arguments, either a JSON string or an
            already-decoded mapping. Keys read: ``"location"`` (defaults to
            ``"Unknown"``) and ``"unit"`` (``"celsius"`` or ``"fahrenheit"``,
            case-insensitive; anything else is treated as ``"celsius"``).

    Returns:
        A dict with ``location``, ``temperature``, ``unit``, ``condition``,
        ``humidity`` and ``wind_speed``; or ``{"error": "Invalid arguments"}``
        when a string argument is not valid JSON.
    """
    args: Any = arguments
    if isinstance(args, str):
        try:
            args = json.loads(args)
        except json.JSONDecodeError:
            logger.error("Failed to parse weather arguments: %s", arguments)
            return {"error": "Invalid arguments"}

    # Valid JSON is not necessarily an object, and callers may pass any Mapping,
    # so check the shape instead of the exact dict type.
    if not isinstance(args, Mapping):
        args = {}

    simulated_temperature = {"celsius": 22, "fahrenheit": 72}

    location = args.get("location", "Unknown")
    unit = args.get("unit", "celsius")
    # Normalise the unit so the reported temperature always matches its label;
    # previously "Celsius" or an unknown unit was reported as 72.
    unit = unit.strip().lower() if isinstance(unit, str) else "celsius"
    if unit not in simulated_temperature:
        unit = "celsius"

    # Simulated weather response
    return {
        "location": location,
        "temperature": simulated_temperature[unit],
        "unit": unit,
        "condition": "Partly Cloudy",
        "humidity": 65,
        "wind_speed": 10,
    }


# Registry keyed by function name, used to look up the Python callable for a
# function call requested by name
AVAILABLE_FUNCTIONS: Dict[str, Callable] = {
    handler.__name__: handler
    for handler in (get_current_time, get_current_weather)
}

# Smoke-test both functions with direct calls
print("Testing get_current_time:", get_current_time(), sep="\n")
print(
    "\nTesting get_current_weather:",
    get_current_weather(dict(location="Seattle, WA", unit="fahrenheit")),
    sep="\n",
)

## Define FunctionTool Schemas

Create FunctionTool definitions with names, parameter schemas, and text descriptions for the Voice Live session.

In [None]:
# JSON-schema parameter definitions, kept apart so each tool declaration below
# stays a short, readable call
_TIME_TOOL_PARAMETERS = {
    "type": "object",
    "properties": {
        "timezone": {
            "type": "string",
            "description": "The timezone to get the current time for, e.g., 'UTC', 'local'",
        }
    },
    "required": [],
}

_WEATHER_TOOL_PARAMETERS = {
    "type": "object",
    "properties": {
        "location": {
            "type": "string",
            "description": "The city and state, e.g., 'San Francisco, CA'",
        },
        "unit": {
            "type": "string",
            "enum": ["celsius", "fahrenheit"],
            "description": "The unit of temperature to use (celsius or fahrenheit)",
        },
    },
    "required": ["location"],
}

# Tool declarations passed to the Voice Live session; the names match the keys
# of the function registry defined earlier in the notebook
FUNCTION_TOOLS: list[Tool] = [
    FunctionTool(
        name="get_current_time",
        description="Get the current time",
        parameters=_TIME_TOOL_PARAMETERS,
    ),
    FunctionTool(
        name="get_current_weather",
        description="Get the current weather in a given location",
        parameters=_WEATHER_TOOL_PARAMETERS,
    ),
]

# List what was registered
print(f"Defined {len(FUNCTION_TOOLS)} function tools:")
for tool in FUNCTION_TOOLS:
    print(f"  - {tool.name}: {tool.description}")

## Define AsyncFunctionCallingClient Class

This class handles:
- Session setup with function tools
- Event processing from Voice Live
- Function call execution and response handling

In [None]:
class AsyncFunctionCallingClient:
    """Voice assistant with function calling capabilities using VoiceLive SDK patterns.

    The client opens a VoiceLive connection, configures the session with the
    module-level ``FUNCTION_TOOLS``, forwards audio through an
    ``AudioProcessor`` and, when the service emits a function call, runs the
    matching entry of ``AVAILABLE_FUNCTIONS`` and sends its output back.
    """

    def __init__(
        self,
        endpoint: str,
        credential: Union[AzureKeyCredential, AsyncTokenCredential],
        model: str,
        voice: str,
        instructions: str,
    ):
        """Store connection settings; no network or audio work happens here.

        Args:
            endpoint: VoiceLive endpoint passed to ``connect``.
            credential: API-key or async token credential passed to ``connect``.
            model: Model name passed to ``connect``.
            voice: Voice name; see ``_setup_session`` for how it is interpreted.
            instructions: System instructions sent with the session configuration.
        """
        self.endpoint = endpoint
        self.credential = credential
        self.model = model
        self.voice = voice
        self.instructions = instructions
        self.connection: Optional["VoiceLiveConnection"] = None
        self.audio_processor: Optional[AudioProcessor] = None
        self.session_ready = False
        self.conversation_started = False
        # True between RESPONSE_CREATED and RESPONSE_DONE
        self._active_response = False
        # True once RESPONSE_DONE has been seen for the latest response
        self._response_api_done = False
        # Function call announced by the service and not yet executed.
        # NOTE(review): only one call is tracked; a second function-call item
        # arriving before RESPONSE_DONE replaces the first - confirm whether the
        # service can emit several calls in a single response.
        self._pending_function_call: Optional[Dict[str, Any]] = None

    async def start(self):
        """Start the voice assistant session.

        Connects, configures the session, starts playback and then processes
        events until the connection's event stream ends or the task is
        cancelled. The audio processor is always shut down on exit.

        Raises:
            asyncio.CancelledError: Re-raised after logging when cancelled.
        """
        try:
            logger.info("Connecting to VoiceLive API with model %s", self.model)

            # Connect to VoiceLive WebSocket API
            async with connect(
                endpoint=self.endpoint,
                credential=self.credential,
                model=self.model,
            ) as connection:
                conn = connection
                self.connection = conn

                # Initialize audio processor
                ap = AudioProcessor(conn)
                self.audio_processor = ap

                # Configure session for voice conversation
                await self._setup_session()

                # Start audio systems
                ap.start_playback()

                logger.info("Voice assistant with function calling ready!")
                print("\n" + "=" * 60)
                print("üé§ VOICE ASSISTANT WITH FUNCTION CALLING READY")
                print("Try saying:")
                print("  ‚Ä¢ 'What's the current time?'")
                print("  ‚Ä¢ 'What's the weather in Seattle?'")
                print("Press Ctrl+C to exit")
                print("=" * 60 + "\n")

                # Process events
                await self._process_events()
        except asyncio.CancelledError:
            logger.info("Session cancelled by user")
            raise
        finally:
            if self.audio_processor:
                self.audio_processor.shutdown()

    async def _setup_session(self):
        """Configure the VoiceLive session for audio conversation with function tools."""
        logger.info("Setting up voice conversation session with function tools...")

        # Create voice configuration. A name containing "-" is treated as an
        # Azure voice; this single test also covers the "en-US-" / "en-CA-"
        # prefixes, which always contain a hyphen.
        voice_config: Union[AzureStandardVoice, str]
        if "-" in self.voice:
            # Azure voice
            voice_config = AzureStandardVoice(name=self.voice)
        else:
            # OpenAI voice (alloy, echo, fable, onyx, nova, shimmer)
            voice_config = self.voice

        # Create turn detection configuration
        turn_detection_config = ServerVad(
            threshold=0.5,
            prefix_padding_ms=300,
            silence_duration_ms=500,
        )

        # Create session configuration with function tools
        session_config = RequestSession(
            modalities=[Modality.TEXT, Modality.AUDIO],
            instructions=self.instructions,
            voice=voice_config,
            input_audio_format=InputAudioFormat.PCM16,
            output_audio_format=OutputAudioFormat.PCM16,
            turn_detection=turn_detection_config,
            input_audio_echo_cancellation=AudioEchoCancellation(),
            input_audio_noise_reduction=AudioNoiseReduction(type="azure_deep_noise_suppression"),
            tools=FUNCTION_TOOLS,
            tool_choice=ToolChoiceLiteral.AUTO,
            input_audio_transcription=AudioInputTranscriptionOptions(model="whisper-1"),
        )

        conn = self.connection
        assert conn is not None, "Connection must be established before setting up session"
        await conn.session.update(session=session_config)

        logger.info("Session configuration with function tools sent")

    async def _process_events(self):
        """Process events from the VoiceLive connection.

        Iterates the connection and hands every event to ``_handle_event``.
        Cancellation and other exceptions are logged and re-raised.
        """
        try:
            conn = self.connection
            assert conn is not None, "Connection must be established before processing events"
            async for event in conn:
                await self._handle_event(event)
        except asyncio.CancelledError:
            logger.info("Event processing cancelled")
            raise
        except Exception:
            logger.exception("Error processing events")
            raise

    async def _handle_event(self, event):
        """Handle different types of events from VoiceLive.

        Event types not listed in the branches below are ignored.
        """
        logger.debug("Received event: %s", event.type)
        ap = self.audio_processor
        conn = self.connection
        assert ap is not None, "AudioProcessor must be initialized"
        assert conn is not None, "Connection must be established"

        if event.type == ServerEventType.SESSION_UPDATED:
            logger.info("Session ready: %s", event.session.id)
            self.session_ready = True

            # Proactive greeting, requested only on the first session update
            if not self.conversation_started:
                self.conversation_started = True
                logger.info("Sending proactive greeting request")
                try:
                    await conn.response.create()
                except Exception:
                    logger.exception("Failed to send proactive greeting request")

            # Start audio capture once session is ready
            ap.start_capture()

        elif event.type == ServerEventType.INPUT_AUDIO_BUFFER_SPEECH_STARTED:
            logger.info("User started speaking - stopping playback")
            print("üé§ Listening...")

            ap.skip_pending_audio()

            # Only cancel if response is active and not already done
            if self._active_response and not self._response_api_done:
                try:
                    await conn.response.cancel()
                    logger.debug("Cancelled in-progress response due to barge-in")
                except Exception as e:
                    if "no active response" in str(e).lower():
                        logger.debug("Cancel ignored - response already completed")
                    else:
                        logger.warning("Cancel failed: %s", e)

        elif event.type == ServerEventType.INPUT_AUDIO_BUFFER_SPEECH_STOPPED:
            logger.info("üé§ User stopped speaking")
            print("ü§î Processing...")

        elif event.type == ServerEventType.RESPONSE_CREATED:
            logger.info("ü§ñ Assistant response created")
            self._active_response = True
            self._response_api_done = False

        elif event.type == ServerEventType.RESPONSE_AUDIO_DELTA:
            logger.debug("Received audio delta")
            ap.queue_audio(event.delta)

        elif event.type == ServerEventType.RESPONSE_AUDIO_DONE:
            logger.info("ü§ñ Assistant finished speaking")
            print("üé§ Ready for next input...")

        elif event.type == ServerEventType.RESPONSE_DONE:
            logger.info("‚úÖ Response complete")
            self._active_response = False
            self._response_api_done = True

            # Execute pending function call if arguments are ready
            if self._pending_function_call and "arguments" in self._pending_function_call:
                await self._execute_function_call(self._pending_function_call)
                self._pending_function_call = None

        elif event.type == ServerEventType.ERROR:
            msg = event.error.message
            if "Cancellation failed: no active response" in msg:
                logger.debug("Benign cancellation error: %s", msg)
            else:
                logger.error("‚ùå VoiceLive error: %s", msg)
                print(f"Error: {msg}")

        elif event.type == ServerEventType.CONVERSATION_ITEM_CREATED:
            logger.debug("Conversation item created: %s", event.item.id)

            if event.item.type == ItemType.FUNCTION_CALL:
                # Remember the call; its arguments arrive in a later event and
                # it is executed when the response completes.
                function_call_item = event.item
                self._pending_function_call = {
                    "name": function_call_item.name,
                    "call_id": function_call_item.call_id,
                    "previous_item_id": function_call_item.id,
                }
                print(f"üîß Calling function: {function_call_item.name}")
                logger.info(
                    "Function call detected: %s with call_id: %s",
                    function_call_item.name,
                    function_call_item.call_id,
                )

        elif event.type == ServerEventType.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE:
            if self._pending_function_call and event.call_id == self._pending_function_call["call_id"]:
                logger.info("Function arguments received: %s", event.arguments)
                self._pending_function_call["arguments"] = event.arguments

    async def _execute_function_call(self, function_call_info):
        """Execute a function call and send the result back to the conversation.

        A function output item is always sent for the call: the function's
        return value on success, or an ``{"error": ...}`` payload when the
        function is unknown, raises, or returns something that cannot be
        serialised to JSON. A new response is then requested. Failures while
        sending are logged and not raised.

        Args:
            function_call_info: Dict with ``"name"``, ``"call_id"``,
                ``"previous_item_id"`` and ``"arguments"``.
        """
        conn = self.connection
        assert conn is not None, "Connection must be established"

        function_name = function_call_info["name"]
        call_id = function_call_info["call_id"]
        previous_item_id = function_call_info["previous_item_id"]
        arguments = function_call_info["arguments"]

        succeeded = False
        handler = AVAILABLE_FUNCTIONS.get(function_name)
        if handler is None:
            logger.error("Unknown function: %s", function_name)
            result = {"error": f"Unknown function: {function_name}"}
            output_text = json.dumps(result)
        else:
            try:
                logger.info("Executing function: %s", function_name)
                result = handler(arguments)
                output_text = json.dumps(result)
                succeeded = True
            except Exception as e:
                logger.exception("Error executing function %s: %s", function_name, e)
                result = {"error": str(e)}
                output_text = json.dumps(result)

        # Report back even on failure; otherwise the call is left without any
        # output and no follow-up response is ever requested.
        try:
            function_output = FunctionCallOutputItem(call_id=call_id, output=output_text)

            # Send result back to conversation
            await conn.conversation.item.create(previous_item_id=previous_item_id, item=function_output)
            logger.info("Function result sent: %s", result)
            if succeeded:
                print(f"‚úÖ Function {function_name} completed")
            else:
                print(f"Error: function {function_name} failed")

            # Request new response to process the function result
            await conn.response.create()
            logger.info("Requested new response with function result")
        except Exception as e:
            logger.exception("Error sending result of function %s: %s", function_name, e)

print("AsyncFunctionCallingClient class defined!")

## Check Audio System

Verify that audio input and output devices are available.

In [None]:
# Check audio system
# Sets audio_ok to True only when at least one input and one output device exist.
audio_ok = False
try:
    p = pyaudio.PyAudio()
    try:
        # Query every device once and reuse the results for both scans
        device_infos = [p.get_device_info_by_index(i) for i in range(p.get_device_count())]
    finally:
        # Always terminate the PyAudio instance, even if enumeration fails
        p.terminate()

    # Check for input devices
    input_devices = [
        i
        for i, info in enumerate(device_infos)
        if cast(Union[int, float], info.get("maxInputChannels", 0) or 0) > 0
    ]

    # Check for output devices
    output_devices = [
        i
        for i, info in enumerate(device_infos)
        if cast(Union[int, float], info.get("maxOutputChannels", 0) or 0) > 0
    ]

    if not input_devices:
        print("‚ùå No audio input devices found. Please check your microphone.")
        audio_ok = False
    elif not output_devices:
        print("‚ùå No audio output devices found. Please check your speakers.")
        audio_ok = False
    else:
        print(f"‚úÖ Found {len(input_devices)} input device(s) and {len(output_devices)} output device(s)")
        audio_ok = True

except Exception as e:
    # Any failure to initialise or query the audio backend counts as "not ok"
    print(f"‚ùå Audio system check failed: {e}")
    audio_ok = False

## Run the Voice Assistant

Start the voice assistant with function calling capabilities.

**Available voice commands:**
- "What's the current time?"
- "What's the weather in Seattle?"

Press Ctrl+C or interrupt the kernel to stop.

In [None]:
# Validate credentials
if not API_KEY and not USE_TOKEN_CREDENTIAL:
    print("‚ùå Error: No authentication provided")
    print("Please provide an API key in .env file or set USE_TOKEN_CREDENTIAL = True")
else:
    # Create credential
    if USE_TOKEN_CREDENTIAL:
        credential = AzureCliCredential()
        print("Using Azure token credential")
    else:
        credential = AzureKeyCredential(API_KEY)
        print("Using API key credential")

    # Create and start voice assistant with function calling
    client = AsyncFunctionCallingClient(
        endpoint=ENDPOINT,
        credential=credential,
        model=MODEL,
        voice=VOICE,
        instructions=INSTRUCTIONS,
    )

    print("\nüéôÔ∏è  Voice Assistant with Function Calling - Azure VoiceLive SDK")
    print("=" * 65)

    try:
        await client.start()
    except KeyboardInterrupt:
        print("\nüëã Voice assistant shut down. Goodbye!")
    except asyncio.CancelledError:
        print("\nüëã Session cancelled. Voice assistant shut down.")
    except Exception as e:
        logger.exception("Fatal error")
        print(f"Fatal Error: {e}")