# Azure VoiceLive SDK - Basic Voice Assistant

This notebook demonstrates how to build a real-time voice assistant using the **Azure AI VoiceLive SDK**. The assistant captures audio from your microphone, sends it to the Azure VoiceLive API, and plays back the AI-generated audio response.

## Features
- Real-time bidirectional audio streaming
- Voice Activity Detection (VAD) for natural conversation
- Echo cancellation and noise reduction
- Support for Azure and OpenAI voices

## Prerequisites
- Azure AI Foundry resource with VoiceLive enabled
- Python 3.9+
- PyAudio for audio capture/playback
- Microphone and speakers

## 1. Install Required Packages

In [None]:
# PyAudio requires the PortAudio library. On macOS, install it first with Homebrew: brew install portaudio
# Then install the Python packages
%pip install azure-ai-voicelive azure-identity python-dotenv pyaudio

## 2. Import Libraries and Setup Logging

In [None]:
from __future__ import annotations
import os
import asyncio
from datetime import datetime
import logging
from typing import Union, Optional, TYPE_CHECKING, cast, List, Dict

from azure.core.credentials import AzureKeyCredential
from azure.core.credentials_async import AsyncTokenCredential
from azure.identity.aio import AzureCliCredential

from azure.ai.voicelive.aio import connect
from azure.ai.voicelive.models import (
    AudioEchoCancellation,
    AudioInputTranscriptionOptions,
    AudioNoiseReduction,
    AzureSemanticVad,
    AzureStandardVoice,
    EouDetection,
    InputAudioFormat,
    Modality,
    OutputAudioFormat,
    RequestSession,
    ServerEventType,
    ServerVad
)
from dotenv import load_dotenv
import pyaudio

# Import reusable AudioProcessor from local module
from audio_processor import AudioProcessor

if TYPE_CHECKING:
    from azure.ai.voicelive.aio import VoiceLiveConnection

# Load environment variables from the local .env file. override=True lets
# values in .env replace variables already set in the process environment.
load_dotenv('./.env', override=True)

# Setup logging
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# which avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs('logs', exist_ok=True)

# One log file per execution of this cell, named after the start time.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
logging.basicConfig(
    filename=f'logs/{timestamp}_voicelive.log',
    filemode="w",
    # Log messages in this notebook contain non-ASCII characters; pin the file
    # encoding so they are written the same way on every platform.
    encoding="utf-8",
    format='%(asctime)s:%(name)s:%(levelname)s:%(message)s',
    level=logging.INFO,
    # basicConfig() does nothing when the root logger already has handlers
    # (for example when this cell is re-run). force=True replaces the existing
    # handlers so each run really logs to its own timestamped file.
    force=True,
)
logger = logging.getLogger(__name__)
print("‚úÖ Libraries imported and logging configured")

## 3. AudioProcessor

The `AudioProcessor` class is imported from `audio_processor.py`. It handles real-time audio capture and playback using PyAudio.

**Features:**
- PCM16, 24kHz, mono audio format
- Callback-based capture and playback threads
- Support for barge-in (skip pending audio)
- Proper resource cleanup

See `audio_processor.py` for the full implementation.

## 4. BasicVoiceAssistant Class

The main voice assistant class that manages the VoiceLive connection and handles events.

In [None]:
class BasicVoiceAssistant:
    """Basic voice assistant implementing the VoiceLive SDK patterns.

    The assistant opens a VoiceLive connection, sends a session configuration
    (voice, turn detection, optional input transcription) and then reacts to
    server events: response audio is queued on the ``AudioProcessor``,
    transcripts are printed, and an in-progress response is cancelled when the
    user starts speaking (barge-in).
    """

    # Transcription models treated as Azure models. Only these receive the
    # language, phrase_list and custom_speech options.
    _AZURE_TRANSCRIPTION_MODELS = ("azure-speech", "azure-fast-transcription")

    def __init__(
        self,
        endpoint: str,
        credential: Union[AzureKeyCredential, AsyncTokenCredential],
        model: str,
        voice: str,
        instructions: str,
        voice_temperature: Optional[float] = None,
        input_audio_transcription_enabled: bool = False,
        input_audio_transcription_model: Optional[str] = None,
        phrase_list: Optional[List[str]] = None,
        transcription_language: Optional[str] = None,
        custom_speech_models: Optional[Dict[str, str]] = None,
        # VAD configuration
        vad_config: Optional[Dict] = None,
    ):
        """Store the configuration; nothing is connected until ``start()``.

        Args:
            endpoint: VoiceLive endpoint passed to ``connect``.
            credential: API-key or async token credential passed to ``connect``.
            model: Model name passed to ``connect``.
            voice: Voice name. Names containing ``-`` are sent as an
                ``AzureStandardVoice``; any other name is sent as a plain string.
            instructions: System instructions for the session.
            voice_temperature: Optional temperature for ``AzureStandardVoice``.
            input_audio_transcription_enabled: Whether to request transcription
                of the user's audio.
            input_audio_transcription_model: Transcription model name.
            phrase_list: Optional phrases sent with Azure transcription models.
            transcription_language: Transcription language. When empty it is
                derived from ``voice`` (falling back to ``"en-US"``).
            custom_speech_models: Optional mapping sent as ``custom_speech``
                with Azure transcription models.
            vad_config: Optional turn-detection settings; see
                ``_create_vad_config`` for the recognised keys.
        """
        self.endpoint = endpoint
        self.credential = credential
        self.model = model
        self.voice = voice
        self.instructions = instructions
        self.voice_temperature = voice_temperature
        self.input_audio_transcription_enabled = input_audio_transcription_enabled
        self.input_audio_transcription_model = input_audio_transcription_model
        self.phrase_list = phrase_list or []
        self.custom_speech_models = custom_speech_models or {}
        self.vad_config = vad_config or {}
        # Auto-detect language from voice if not specified
        self.transcription_language = (
            transcription_language or self._language_from_voice(voice)
        )
        self.connection: Optional["VoiceLiveConnection"] = None
        self.audio_processor: Optional[AudioProcessor] = None
        self.session_ready = False
        # True between RESPONSE_CREATED and RESPONSE_DONE.
        self._active_response = False
        self._response_api_done = False
        self._stop_requested = False
        # Buffer for streaming assistant transcript
        self._assistant_transcript_buffer = ""

    @staticmethod
    def _language_from_voice(voice: Optional[str]) -> str:
        """Derive a locale from a voice name, defaulting to ``"en-US"``.

        The first two dash-separated parts are joined, with anything after a
        ``:`` in the second part dropped
        (e.g. ``"zh-CN-Xiaochen:..."`` -> ``"zh-CN"``).
        """
        if not voice or "-" not in voice:
            return "en-US"
        language, _, remainder = voice.partition("-")
        region = remainder.split("-", 1)[0].split(":", 1)[0]
        return f"{language}-{region}"

    def _is_azure_transcription_model(self) -> bool:
        """Check if the transcription model is an Azure model that supports phrase_list."""
        return self.input_audio_transcription_model in self._AZURE_TRANSCRIPTION_MODELS

    def stop(self):
        """Request graceful stop of the voice assistant.

        Only sets a flag; the event loop checks it when the next server event
        arrives.
        """
        self._stop_requested = True
        print("\nüõë Stop requested, shutting down...")

    async def start(self):
        """Start the voice assistant session.

        Opens the connection, configures the session, starts playback and then
        processes events until the event stream ends, a stop is requested, or
        an error is raised. The audio processor is always shut down on exit.
        """
        try:
            logger.info(
                "Connecting to VoiceLive API with model %s", self.model)

            async with connect(
                endpoint=self.endpoint,
                credential=self.credential,
                model=self.model,
            ) as connection:
                self.connection = connection

                processor = AudioProcessor(connection)
                self.audio_processor = processor

                await self._setup_session()
                processor.start_playback()

                logger.info("Voice assistant ready! Start speaking...")
                print("\n" + "=" * 60)
                print("üé§ VOICE ASSISTANT READY")
                print("Start speaking to begin conversation")
                if self.input_audio_transcription_enabled:
                    print(
                        f"üìù Input transcription: {self.input_audio_transcription_model}")
                    print(
                        f"üåê Transcription language: {self.transcription_language}")
                    if self.custom_speech_models and self.transcription_language in self.custom_speech_models:
                        print("üéØ Custom Speech: enabled")
                print("Or use Cmd+Shift+P ‚Üí 'Notebook: Restart Kernel' to exit")
                print("=" * 60 + "\n")

                await self._process_events()
        finally:
            try:
                if self.audio_processor:
                    self.audio_processor.shutdown()
            finally:
                # The `async with` block has exited, so the connection is no
                # longer usable; drop the stale reference and readiness flag.
                self.connection = None
                self.session_ready = False

    def _resolve_vad_type(self) -> str:
        """Return the VAD type to use for the current ``vad_config``.

        An explicit ``"type"`` is returned unchanged. When the type is missing
        (or ``"auto"``), the same rule as the no-config path in
        ``_setup_session`` applies: ``"azure_semantic_vad"`` when an Azure
        transcription model is enabled, otherwise ``"server_vad"``.
        """
        explicit = self.vad_config.get("type")
        if explicit and explicit != "auto":
            return explicit
        if self.input_audio_transcription_enabled and self._is_azure_transcription_model():
            return "azure_semantic_vad"
        return "server_vad"

    def _create_vad_config(self) -> Union[AzureSemanticVad, ServerVad]:
        """Create VAD configuration based on settings.

        Recognised ``vad_config`` keys: ``type``, ``threshold``,
        ``prefix_padding_ms``, ``silence_duration_ms`` and, for
        ``azure_semantic_vad`` only, ``speech_duration_ms``,
        ``remove_filler_words`` and ``end_of_utterance_detection`` (a dict with
        an optional ``model``). Any type other than ``azure_semantic_vad``
        yields a ``ServerVad``.
        """
        settings = self.vad_config

        if self._resolve_vad_type() == "azure_semantic_vad":
            # Build end_of_utterance_detection if configured
            eou_config = settings.get("end_of_utterance_detection")
            eou_detection = None
            if eou_config:
                eou_detection = EouDetection(
                    model=eou_config.get("model", "semantic_detection_v1"),
                )

            vad = AzureSemanticVad(
                threshold=settings.get("threshold", 0.5),
                prefix_padding_ms=settings.get("prefix_padding_ms", 300),
                speech_duration_ms=settings.get("speech_duration_ms", 80),
                silence_duration_ms=settings.get("silence_duration_ms", 500),
                remove_filler_words=settings.get("remove_filler_words", False),
                end_of_utterance_detection=eou_detection,
            )
            logger.info("Using AzureSemanticVad with config: %s", settings)
            return vad

        # ServerVad configuration
        vad = ServerVad(
            threshold=settings.get("threshold", 0.8),
            prefix_padding_ms=settings.get("prefix_padding_ms", 200),
            silence_duration_ms=settings.get("silence_duration_ms", 1000),
        )
        logger.info("Using ServerVad with config: %s", settings)
        return vad

    async def _setup_session(self):
        """Configure the VoiceLive session for audio conversation.

        Builds the voice, turn-detection and input-transcription settings and
        sends them with ``session.update``. Requires ``self.connection``.
        """
        logger.info("Setting up voice conversation session...")

        # Configure Azure HD voice with optional temperature.
        # Any voice name containing "-" is treated as an Azure voice; names
        # without a dash are passed through as plain strings.
        voice_config: Union[AzureStandardVoice, str]
        if "-" in self.voice:
            voice_config = AzureStandardVoice(
                name=self.voice,
                temperature=self.voice_temperature
            )
            logger.info(
                "Using Azure HD voice: %s, temperature: %s",
                self.voice, self.voice_temperature)
        else:
            voice_config = self.voice

        # VAD configuration - use custom config or auto-select based on transcription model
        if self.vad_config:
            turn_detection_config = self._create_vad_config()
        elif self.input_audio_transcription_enabled and self._is_azure_transcription_model():
            # AzureSemanticVad is REQUIRED when using azure-speech transcription
            turn_detection_config = AzureSemanticVad()
            logger.info("Using default AzureSemanticVad for Azure transcription")
        else:
            # ServerVad for other cases (OpenAI whisper, etc.)
            turn_detection_config = ServerVad(
                threshold=0.8,
                prefix_padding_ms=200,
                silence_duration_ms=1000,
            )
            logger.info("Using default ServerVad for turn detection")

        # Configure input audio transcription if enabled
        input_transcription_config: Optional[AudioInputTranscriptionOptions] = None
        if self.input_audio_transcription_enabled:
            # phrase_list and custom_speech are only supported for Azure models
            if self._is_azure_transcription_model():
                input_transcription_config = AudioInputTranscriptionOptions(
                    model=self.input_audio_transcription_model,
                    language=self.transcription_language,
                    # Empty collections are sent as None rather than as empty values.
                    phrase_list=self.phrase_list or None,
                    # Custom Speech model mapping (language -> endpoint ID)
                    custom_speech=self.custom_speech_models or None,
                )
                logger.info(
                    "Input audio transcription enabled: model=%s, "
                    "language=%s, phrase_list=%s, custom_speech=%s",
                    self.input_audio_transcription_model,
                    self.transcription_language,
                    self.phrase_list,
                    list(self.custom_speech_models.keys()) if self.custom_speech_models else None,
                )
            else:
                # For non-Azure models (whisper-1), don't pass phrase_list, language, or custom_speech
                input_transcription_config = AudioInputTranscriptionOptions(
                    model=self.input_audio_transcription_model,
                )
                logger.info(
                    "Input audio transcription enabled: model=%s "
                    "(phrase_list/custom_speech not supported)",
                    self.input_audio_transcription_model,
                )

        session_config = RequestSession(
            modalities=[Modality.TEXT, Modality.AUDIO],
            instructions=self.instructions,
            voice=voice_config,
            input_audio_format=InputAudioFormat.PCM16,
            output_audio_format=OutputAudioFormat.PCM16,
            turn_detection=turn_detection_config,
            input_audio_echo_cancellation=AudioEchoCancellation(),
            input_audio_noise_reduction=AudioNoiseReduction(
                type="azure_deep_noise_suppression"),
            input_audio_transcription=input_transcription_config,
        )

        conn = self.connection
        assert conn is not None, "Connection must be established before setting up session"
        await conn.session.update(session=session_config)
        logger.info("Session configuration sent")

    async def _process_events(self):
        """Process events from the VoiceLive connection.

        Iterates the connection and dispatches each event to
        ``_handle_event``. The stop flag is checked once per received event, so
        a stop request takes effect when the next event arrives. Cancellation
        and other exceptions are logged and re-raised.
        """
        try:
            conn = self.connection
            assert conn is not None, "Connection must be established before processing events"
            async for event in conn:
                if self._stop_requested:
                    logger.info("Stop requested, exiting event loop")
                    break
                await self._handle_event(event)
        except asyncio.CancelledError:
            logger.info("Event processing cancelled")
            raise
        except Exception:
            logger.exception("Error processing events")
            raise

    async def _handle_event(self, event):
        """Handle different types of events from VoiceLive.

        Args:
            event: A server event; dispatch is on ``event.type``. Unknown
                event types are logged at debug level and otherwise ignored.
        """
        logger.debug("Received event: %s", event.type)
        ap = self.audio_processor
        conn = self.connection
        assert ap is not None, "AudioProcessor must be initialized"
        assert conn is not None, "Connection must be established"

        if event.type == ServerEventType.SESSION_UPDATED:
            # The session configuration was accepted: begin capturing audio.
            logger.info("Session ready: %s", event.session.id)
            self.session_ready = True
            ap.start_capture()

        elif event.type == ServerEventType.INPUT_AUDIO_BUFFER_SPEECH_STARTED:
            # Barge-in: drop queued playback and cancel the running response.
            logger.info("User started speaking - stopping playback")
            print("üé§ Listening...")
            ap.skip_pending_audio()

            if self._active_response and not self._response_api_done:
                try:
                    await conn.response.cancel()
                    logger.debug(
                        "Cancelled in-progress response due to barge-in")
                except Exception as e:
                    # Best effort: the response may have finished in the meantime.
                    if "no active response" in str(e).lower():
                        logger.debug(
                            "Cancel ignored - response already completed")
                    else:
                        logger.warning("Cancel failed: %s", e)

        elif event.type == ServerEventType.INPUT_AUDIO_BUFFER_SPEECH_STOPPED:
            logger.info("üé§ User stopped speaking")
            print("ü§î Processing...")

        elif event.type == ServerEventType.RESPONSE_CREATED:
            logger.info("ü§ñ Assistant response created")
            self._active_response = True
            self._response_api_done = False
            self._assistant_transcript_buffer = ""  # Reset buffer for new response

        elif event.type == ServerEventType.RESPONSE_AUDIO_DELTA:
            logger.debug("Received audio delta")
            ap.queue_audio(event.delta)

        elif event.type == ServerEventType.RESPONSE_AUDIO_DONE:
            logger.info("ü§ñ Assistant finished speaking")
            print("üé§ Ready for next input...")

        elif event.type == ServerEventType.RESPONSE_DONE:
            logger.info("‚úÖ Response complete")
            self._active_response = False
            self._response_api_done = True

        # Handle assistant audio transcript (output transcription)
        elif event.type == ServerEventType.RESPONSE_AUDIO_TRANSCRIPT_DELTA:
            delta = getattr(event, 'delta', None)
            if delta:
                self._assistant_transcript_buffer += delta
                logger.debug("Assistant transcript delta: %s", delta)

        elif event.type == ServerEventType.RESPONSE_AUDIO_TRANSCRIPT_DONE:
            # Use buffered transcript if final event doesn't include it
            transcript = getattr(event, 'transcript', None) or self._assistant_transcript_buffer
            if transcript:
                logger.info("ü§ñ Assistant said: %s", transcript)
                print(f"ü§ñ Assistant: {transcript}")
            self._assistant_transcript_buffer = ""

        # Handle input audio transcription events (user speech)
        elif event.type == ServerEventType.CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_COMPLETED:
            transcript = getattr(event, 'transcript', None)
            if transcript:
                logger.info("üìù User said: %s", transcript)
                print(f"üìù You said: {transcript}")

        elif event.type == ServerEventType.CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_DELTA:
            delta = getattr(event, 'delta', None)
            if delta:
                logger.debug("Transcription delta: %s", delta)

        elif event.type == ServerEventType.CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_FAILED:
            error = getattr(event, 'error', None)
            logger.warning("Input transcription failed: %s", error)

        elif event.type == ServerEventType.ERROR:
            # Guard against a missing message so the substring test cannot raise.
            msg = event.error.message or ""
            if "Cancellation failed: no active response" in msg:
                logger.debug("Benign cancellation error: %s", msg)
            else:
                logger.error("‚ùå VoiceLive error: %s", msg)
                print(f"Error: {msg}")

        elif event.type == ServerEventType.CONVERSATION_ITEM_CREATED:
            logger.debug("Conversation item created: %s", event.item.id)

        else:
            logger.debug("Unhandled event type: %s", event.type)

# Cell-completion marker: printed once the class definition above has executed.
print("‚úÖ BasicVoiceAssistant class defined")

## 5. Check Audio System

Verify that audio input/output devices are available before starting the assistant.

In [None]:
def check_audio_system() -> bool:
    """Check if audio input/output devices are available.

    Counts the devices PyAudio reports with at least one input channel and
    those with at least one output channel, and prints the outcome.

    Returns:
        True when at least one input and one output device were found; False
        when either kind is missing or when the check itself raised.
    """
    try:
        p = pyaudio.PyAudio()
        try:
            input_devices = []
            output_devices = []
            # Single pass over the device list; a device may count as both.
            for index in range(p.get_device_count()):
                info = p.get_device_info_by_index(index)
                if cast(Union[int, float], info.get("maxInputChannels", 0) or 0) > 0:
                    input_devices.append(index)
                if cast(Union[int, float], info.get("maxOutputChannels", 0) or 0) > 0:
                    output_devices.append(index)
        finally:
            # Release PyAudio even when enumerating devices fails.
            p.terminate()

        if not input_devices:
            print("‚ùå No audio input devices found. Please check your microphone.")
            return False
        if not output_devices:
            print("‚ùå No audio output devices found. Please check your speakers.")
            return False

        print(f"‚úÖ Found {len(input_devices)} input device(s) and {len(output_devices)} output device(s)")
        return True

    except Exception as e:
        # Deliberately broad: any failure is reported as "audio unavailable".
        print(f"‚ùå Audio system check failed: {e}")
        return False

# Run the audio check once when this cell executes. The result is kept in
# `audio_ok`, which the final cell tests before starting the assistant.
audio_ok = check_audio_system()

## 6. Run the Voice Assistant

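The `run_voice_assistant()` function creates and starts the voice assistant with the current configuration. Running this cell only defines the function; the settings it reads (`API_KEY`, `VOICE`, `VAD_CONFIG`, and so on) are defined in the configuration cell in section 7, which also calls it.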

In [None]:
async def run_voice_assistant():
    """Run the voice assistant.

    Builds a credential from the module-level settings, creates a
    ``BasicVoiceAssistant`` and awaits ``start()``. Returns early with a
    message when neither an API key nor token authentication is configured.
    ``KeyboardInterrupt`` and ``asyncio.CancelledError`` end the run with a
    goodbye message; any other exception is printed and re-raised.
    """
    # Validate credentials
    if not API_KEY and not USE_TOKEN_CREDENTIAL:
        print("‚ùå Error: No authentication provided")
        print("Please set AZURE_VOICELIVE_API_KEY in the .env file,")
        print("or set USE_TOKEN_CREDENTIAL = True for Azure authentication.")
        return

    # Create client with appropriate credential
    credential: Union[AzureKeyCredential, AsyncTokenCredential]
    if USE_TOKEN_CREDENTIAL:
        credential = AzureCliCredential()
        logger.info("Using Azure token credential")
        print("üîê Using Azure CLI credential")
    else:
        credential = AzureKeyCredential(API_KEY)
        logger.info("Using API key credential")
        print("üîë Using API key credential")

    try:
        # Create and start voice assistant
        assistant = BasicVoiceAssistant(
            endpoint=ENDPOINT,
            credential=credential,
            model=MODEL,
            voice=VOICE,
            instructions=INSTRUCTIONS,
            voice_temperature=VOICE_TEMPERATURE,
            # Input audio transcription settings
            input_audio_transcription_enabled=INPUT_AUDIO_TRANSCRIPTION_ENABLED,
            input_audio_transcription_model=INPUT_AUDIO_TRANSCRIPTION_MODEL,
            phrase_list=PHRASE_LIST,
            # Custom Speech model mapping
            custom_speech_models=CUSTOM_SPEECH_MODELS if CUSTOM_SPEECH_ENABLED else None,
            # VAD configuration
            vad_config=VAD_CONFIG,
        )

        print("\nüéôÔ∏è  Basic Voice Assistant with Azure VoiceLive SDK")
        print("=" * 50)
        if INPUT_AUDIO_TRANSCRIPTION_ENABLED:
            print(f"üìù Input transcription: {INPUT_AUDIO_TRANSCRIPTION_MODEL}")
            if CUSTOM_SPEECH_ENABLED and CUSTOM_SPEECH_MODELS:
                print(f"üéØ Custom Speech: enabled for {list(CUSTOM_SPEECH_MODELS.keys())}")
            elif PHRASE_LIST:
                # Show at most the first three phrases to keep the banner short.
                print(f"   Custom phrases: {', '.join(PHRASE_LIST[:3])}{'...' if len(PHRASE_LIST) > 3 else ''}")
        print(f"üéõÔ∏è VAD: {VAD_CONFIG.get('type', 'auto')}")

        try:
            await assistant.start()
        except KeyboardInterrupt:
            print("\nüëã Voice assistant shut down. Goodbye!")
        except asyncio.CancelledError:
            print("\nüëã Voice assistant interrupted. Goodbye!")
        except Exception as e:
            print(f"‚ùå Fatal Error: {e}")
            raise
    finally:
        # Async token credentials own network resources and must be closed
        # explicitly; AzureKeyCredential has nothing to close.
        if not isinstance(credential, AzureKeyCredential):
            await credential.close()

# Cell-completion marker: the function above is only defined here, not called.
print("‚úÖ run_voice_assistant() defined")

## 7. Configuration & Run

Update the settings below and run this cell to start the assistant:

In [None]:
# ============================================================
# CONFIGURATION - Edit these settings and run this cell
# ============================================================

# Azure Credentials (loaded from .env file)
# API_KEY is None when AZURE_VOICELIVE_API_KEY is not set; run_voice_assistant()
# then requires USE_TOKEN_CREDENTIAL = True.
API_KEY = os.environ.get("AZURE_VOICELIVE_API_KEY")
# The fallback URL is a placeholder; set AZURE_VOICELIVE_ENDPOINT to your own resource.
ENDPOINT = os.environ.get("AZURE_VOICELIVE_ENDPOINT",
                          "https://your-resource.services.ai.azure.com/")
MODEL = "gpt-4o"

# Azure HD Voice Configuration
# Format: "{locale}-{VoiceName}:DragonHDLatestNeural"
# The locale prefix is also used to auto-detect the transcription language.
# Examples:
#   - "en-US-Ava:DragonHDLatestNeural" (English US)
#   - "zh-CN-Xiaochen:DragonHDLatestNeural" (Chinese)
#   - "ja-JP-Nanami:DragonHDLatestNeural" (Japanese)
VOICE = "zh-CN-Xiaochen:DragonHDLatestNeural"
VOICE_TEMPERATURE = 0.8  # Controls voice expressiveness (0.0-1.0)

# Input Audio Transcription Configuration
INPUT_AUDIO_TRANSCRIPTION_ENABLED = True

# Transcription model options:
#   - "whisper-1": OpenAI Whisper (auto-detect language, no custom speech)
#   - "azure-speech": Azure Speech (supports phrase_list & custom_speech)
#   - "azure-fast-transcription": Azure Fast Transcription (supports phrase_list & custom_speech)
INPUT_AUDIO_TRANSCRIPTION_MODEL = "azure-speech"

# Custom phrase list for better recognition of specific terms
# Only works with azure-speech and azure-fast-transcription
PHRASE_LIST: List[str] = [
    "Neo QLED TV",
    "TUF Gaming",
    "TUF",
    "ASUS TUF",
    "ASUS TUF Gaming",
    "AutoQuote Explorer",
    "asus tuf dash"
]

# ============================================================
# CUSTOM SPEECH CONFIGURATION
# ============================================================
# Enable Custom Speech for improved domain-specific recognition
# Requires azure-speech or azure-fast-transcription model
# When False, run_voice_assistant() passes no Custom Speech models at all.
CUSTOM_SPEECH_ENABLED = True

# Custom Speech Model Endpoint IDs
# Map language codes to your deployed Custom Speech model IDs
# Models must be deployed in the SAME region as your Foundry resource
# Get the Endpoint ID from your Custom Speech deployment in Azure Speech Studio
# NOTE(review): the ID below looks specific to one deployment - replace it with
# the Endpoint ID of your own model before running.
CUSTOM_SPEECH_MODELS: Dict[str, str] = {
    # Chinese Custom Speech model
    "zh-CN": "dbdc0514-efcf-49c9-b022-040e463c4725",

    # Add more language-specific models as needed:
    # "en-US": "your-english-model-endpoint-id",
    # "ja-JP": "your-japanese-model-endpoint-id",
}

# ============================================================
# VAD (Voice Activity Detection) CONFIGURATION
# ============================================================
# Two VAD types available:
#   - "azure_semantic_vad": Advanced semantic-based detection (required for azure-speech)
#   - "server_vad": Basic threshold-based detection (for whisper-1 or when no transcription)
# An empty dict ({}) lets BasicVoiceAssistant pick its built-in defaults.

VAD_CONFIG: Dict = {
    "type": "azure_semantic_vad",  # or "server_vad"

    # Common settings for both VAD types:
    "threshold": 0.3,              # Speech detection sensitivity (0.0-1.0, lower = more sensitive)
    "prefix_padding_ms": 300,      # Audio to include before speech starts (ms)
    "silence_duration_ms": 500,    # How long to wait after silence to end turn (ms)

    # AzureSemanticVad specific settings:
    "speech_duration_ms": 80,      # Minimum speech duration to trigger detection (ms)
    "remove_filler_words": False,  # Remove "um", "uh", etc. from transcription

    # End-of-utterance detection (AzureSemanticVad only)
    # NOTE: EOU detection is only supported for cascaded pipelines (phi4-mm-realtime, etc.)
    # Set to None to disable, or provide config for supported models
    "end_of_utterance_detection": None,  # Disabled - see the NOTE above; enable only for a supported MODEL
    # Example for supported models:
    # "end_of_utterance_detection": {
    #     "model": "semantic_detection_v1",
    # },
}

# Alternative: ServerVad config (simpler, for non-Azure transcription)
# VAD_CONFIG: Dict = {
#     "type": "server_vad",
#     "threshold": 0.8,              # Higher threshold to reduce false positives
#     "prefix_padding_ms": 200,
#     "silence_duration_ms": 1000,   # Wait 1 second of silence
# }

# System instructions for the AI assistant
# AZURE_VOICELIVE_INSTRUCTIONS overrides the default text below when set.
INSTRUCTIONS = os.environ.get(
    "AZURE_VOICELIVE_INSTRUCTIONS",
    "You are a helpful AI assistant. Respond naturally and conversationally. Keep your responses concise but engaging."
)

# Authentication method
USE_TOKEN_CREDENTIAL = False  # Set True to use Azure CLI credential

# ============================================================
# AUTO-DETECTION & STATUS
# ============================================================

def _get_transcription_language(voice: str) -> str:
    """Return the locale prefix of a voice name, or "en-US" when there is none.

    The locale is the first two dash-separated parts of the name, with
    anything after a ":" in the second part dropped, for example
    "zh-CN-Xiaochen:DragonHDLatestNeural" -> "zh-CN".
    """
    # Guard clause: empty names and names without a dash carry no locale.
    if not voice or "-" not in voice:
        return "en-US"
    language, _, remainder = voice.partition("-")
    region = remainder.split("-", 1)[0].split(":", 1)[0]
    return language + "-" + region

# Language shown in the status report, derived from the locale prefix of VOICE.
TRANSCRIPTION_LANGUAGE = _get_transcription_language(VOICE)

# --- Connection and voice summary ---
print(f"üìç Endpoint: {ENDPOINT}")
print(f"ü§ñ Model: {MODEL}")
print(f"üéôÔ∏è Voice: {VOICE}")
print(f"üå°Ô∏è Voice Temperature: {VOICE_TEMPERATURE}")
print(f"üåê Transcription Language: {TRANSCRIPTION_LANGUAGE} (auto-detected)")
# Only report whether a key is present; never echo the key itself.
if API_KEY:
    print("üîë API Key: Set")
else:
    print("üîë API Key: Not set")

# --- Input transcription summary ---
if not INPUT_AUDIO_TRANSCRIPTION_ENABLED:
    print("üìù Input Transcription: Disabled")
else:
    print("üìù Input Transcription: Enabled")
    print(f"   Model: {INPUT_AUDIO_TRANSCRIPTION_MODEL}")
    # Phrase lists and Custom Speech are reported for the Azure models only.
    if INPUT_AUDIO_TRANSCRIPTION_MODEL in ("azure-speech", "azure-fast-transcription"):
        print(f"   Phrase List: {len(PHRASE_LIST)} terms")
        if CUSTOM_SPEECH_ENABLED:
            print("üéØ Custom Speech: Enabled")
        else:
            print("üéØ Custom Speech: Disabled")
        if CUSTOM_SPEECH_ENABLED and CUSTOM_SPEECH_MODELS:
            for lang, model_id in CUSTOM_SPEECH_MODELS.items():
                # Show at most the first 30 characters of each model ID.
                print("      " + f"{lang}: {model_id[:30]}" + "...")

# --- Turn detection summary ---
print(f"üéõÔ∏è VAD Type: {VAD_CONFIG.get('type', 'auto')}")
if VAD_CONFIG:
    print(f"   Threshold: {VAD_CONFIG.get('threshold', 'default')}")
    print(f"   Silence Duration: {VAD_CONFIG.get('silence_duration_ms', 'default')}ms")
    if VAD_CONFIG.get('type') == 'azure_semantic_vad':
        print(f"   Remove Filler Words: {VAD_CONFIG.get('remove_filler_words', False)}")
        eou = VAD_CONFIG.get('end_of_utterance_detection')
        if eou:
            print(f"   EOU Detection: {eou.get('model')}")
        else:
            print("   EOU Detection: Disabled")

# ============================================================
# RUN THE VOICE ASSISTANT
# ============================================================

if audio_ok:
    try:
        await run_voice_assistant()
    except asyncio.CancelledError:
        print("\nüëã Session ended. Goodbye!")
else:
    print("‚ö†Ô∏è Please fix audio issues before running the voice assistant.")