## 1. Install Required Packages

In [None]:
# First install portaudio via homebrew: brew install portaudio
# Then install Python packages
%pip install azure-ai-voicelive azure-identity python-dotenv pyaudio

## 2. Import Libraries and Setup Logging

In [None]:
from __future__ import annotations
import os
import asyncio
from datetime import datetime
import logging
from typing import Union, Optional, TYPE_CHECKING, cast, List, Dict

from azure.core.credentials import AzureKeyCredential
from azure.core.credentials_async import AsyncTokenCredential
from azure.identity.aio import AzureCliCredential

from azure.ai.voicelive.aio import connect
from azure.ai.voicelive.models import (
    AudioEchoCancellation,
    AudioInputTranscriptionOptions,
    AudioNoiseReduction,
    AzureSemanticVad,
    AzureStandardVoice,
    InputAudioFormat,
    Modality,
    OutputAudioFormat,
    RequestSession,
    ServerEventType,
    ServerVad
)
from dotenv import load_dotenv
import pyaudio

# Import reusable AudioProcessor from local module
from audio_processor import AudioProcessor

if TYPE_CHECKING:
    from azure.ai.voicelive.aio import VoiceLiveConnection

# Load environment variables from the local .env file. override=True lets
# values from the file replace variables already set in the process.
load_dotenv('./.env', override=True)

# Setup logging: one timestamped log file per run under ./logs.
# exist_ok=True replaces the check-then-create pair, so an already existing
# directory is never an error.
os.makedirs('logs', exist_ok=True)

timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
logging.basicConfig(
    # Explicit UTF-8 file handler (mode "w" truncates, as before): log lines
    # in this notebook carry emoji and user transcripts, so the file must not
    # depend on the platform's default text encoding.
    handlers=[
        logging.FileHandler(
            f'logs/{timestamp}_custom_speech.log',
            mode="w",
            encoding="utf-8",
        )
    ],
    format='%(asctime)s:%(name)s:%(levelname)s:%(message)s',
    level=logging.INFO
)
logger = logging.getLogger(__name__)
print("✅ Libraries imported and logging configured")

## 3. Configuration

### Custom Speech Model Configuration

The `custom_speech` field maps language codes to Custom Speech model IDs:

```json
{
  "input_audio_transcription": {
    "model": "azure-speech",
    "language": "zh-CN",
    "custom_speech": {
      "zh-CN": "your-custom-model-endpoint-id"
    }
  }
}
```

**Important Notes:**
- Custom Speech model must be deployed in the **same region** as your Foundry resource
- The model ID is the Endpoint ID from your Custom Speech deployment
- You can specify different models for different languages

In [None]:
# ============================================================
# Azure VoiceLive Configuration
# ============================================================
API_KEY = os.environ.get("AZURE_VOICELIVE_API_KEY")
ENDPOINT = os.environ.get("AZURE_VOICELIVE_ENDPOINT",
                          "https://your-resource.services.ai.azure.com/")
MODEL = "gpt-4o"

# Azure HD Voice Configuration
VOICE = "zh-CN-Xiaochen:DragonHDLatestNeural"  # Chinese voice for this demo
VOICE_TEMPERATURE = 0.8

# ============================================================
# Custom Speech Model Configuration
# ============================================================
# Enable custom speech for improved domain-specific recognition
CUSTOM_SPEECH_ENABLED = True

# Azure Speech transcription model (required for custom speech)
INPUT_AUDIO_TRANSCRIPTION_MODEL = "azure-speech"

# Language for transcription (must match your custom speech model)
# Options: "zh-CN", "en-US", "ja-JP", etc.
TRANSCRIPTION_LANGUAGE = "zh-CN"

# Custom Speech Model Endpoint IDs
# Map language codes to your deployed Custom Speech model IDs
# These models must be deployed in the same region as your Foundry resource
CUSTOM_SPEECH_MODELS: Dict[str, str] = {
    # Your Custom Speech model for Chinese
    "zh-CN": "dbdc0514-efcf-49c9-b022-040e463c4725",
    # Add more language-specific models as needed:
    # "en-US": "your-english-model-endpoint-id",
    # "ja-JP": "your-japanese-model-endpoint-id",
}

# Optional: Phrase list for additional recognition hints
# Works alongside custom speech for extra boost
PHRASE_LIST: List[str] = [
    # Add domain-specific terms here
]

# ============================================================
# System Instructions
# ============================================================
INSTRUCTIONS = os.environ.get(
    "AZURE_VOICELIVE_INSTRUCTIONS",
    """‰Ω†ÊòØ‰∏Ä‰∏™ÊúâÂ∏ÆÂä©ÁöÑAIÂä©Êâã„ÄÇËØ∑Áî®‰∏≠ÊñáËá™ÁÑ∂Âú∞ÂõûÂ∫îÁî®Êà∑ÁöÑÈóÆÈ¢ò„ÄÇ
‰øùÊåÅÂõûÁ≠îÁÆÄÊ¥Å‰ΩÜÊúâÂê∏ÂºïÂäõ„ÄÇ
You are a helpful AI assistant. Respond naturally in Chinese.
Keep your responses concise but engaging."""
)

# Authentication mode
USE_TOKEN_CREDENTIAL = False

# ============================================================
# Print Configuration Summary
# ============================================================
print("üìã Configuration Summary")
print("=" * 50)
print(f"üìç Endpoint: {ENDPOINT}")
print(f"ü§ñ Model: {MODEL}")
print(f"üéôÔ∏è Voice: {VOICE}")
print(f"üå°Ô∏è Voice Temperature: {VOICE_TEMPERATURE}")
print(f"üîë API Key: {'Set' if API_KEY else 'Not set'}")
print()
print("üìù Custom Speech Configuration:")
print(f"   Enabled: {CUSTOM_SPEECH_ENABLED}")
print(f"   Transcription Model: {INPUT_AUDIO_TRANSCRIPTION_MODEL}")
print(f"   Language: {TRANSCRIPTION_LANGUAGE}")
if CUSTOM_SPEECH_ENABLED:
    print("   Custom Models:")
    for lang, model_id in CUSTOM_SPEECH_MODELS.items():
        print(f"      {lang}: {model_id[:20]}...")

## 4. CustomSpeechVoiceAssistant Class

Extended voice assistant class with Custom Speech model support.

In [None]:
class CustomSpeechVoiceAssistant:
    """Voice assistant with Custom Speech model support for improved transcription.

    Opens a VoiceLive connection, sends a session configuration (voice, turn
    detection, echo cancellation, noise reduction and, when enabled, input
    audio transcription with a Custom Speech model mapping), then dispatches
    server events until the event stream ends or a stop has been requested.

    Args:
        endpoint: Endpoint passed to ``connect``.
        credential: API-key or async token credential passed to ``connect``.
        model: Model name passed to ``connect``.
        voice: Voice name. Names containing "-" are wrapped in
            ``AzureStandardVoice``; any other value is sent unchanged.
        instructions: Instructions sent in the session configuration.
        voice_temperature: Optional temperature forwarded to
            ``AzureStandardVoice``.
        custom_speech_enabled: When True, input audio transcription options
            are included in the session configuration.
        transcription_model: Transcription model name.
        transcription_language: Language code used for transcription.
        custom_speech_models: Mapping of language code to Custom Speech
            model ID; ``None`` is treated as an empty mapping.
        phrase_list: Optional recognition hints; ``None`` is treated as an
            empty list.
    """

    def __init__(
        self,
        endpoint: str,
        credential: Union[AzureKeyCredential, AsyncTokenCredential],
        model: str,
        voice: str,
        instructions: str,
        voice_temperature: Optional[float] = None,
        # Custom Speech configuration
        custom_speech_enabled: bool = False,
        transcription_model: str = "azure-speech",
        transcription_language: str = "zh-CN",
        custom_speech_models: Optional[Dict[str, str]] = None,
        phrase_list: Optional[List[str]] = None,
    ):
        self.endpoint = endpoint
        self.credential = credential
        self.model = model
        self.voice = voice
        self.instructions = instructions
        self.voice_temperature = voice_temperature

        # Custom Speech settings
        self.custom_speech_enabled = custom_speech_enabled
        self.transcription_model = transcription_model
        self.transcription_language = transcription_language
        # Fresh containers per instance when the caller passes nothing
        self.custom_speech_models = custom_speech_models or {}
        self.phrase_list = phrase_list or []

        # Connection state
        self.connection: Optional["VoiceLiveConnection"] = None
        self.audio_processor: Optional[AudioProcessor] = None
        self.session_ready = False
        # Response tracking used to decide whether a barge-in must cancel
        self._active_response = False
        self._response_api_done = False
        self._stop_requested = False

    def stop(self):
        """Request graceful stop of the voice assistant.

        Only sets a flag; the event loop checks it when the next server
        event arrives.
        """
        self._stop_requested = True
        print("\n🛑 Stop requested, shutting down...")

    async def start(self):
        """Start the voice assistant session.

        Connects, creates the ``AudioProcessor``, configures the session,
        starts playback and processes events. The audio processor is shut
        down on exit, whether the session ended normally or with an error.
        """
        try:
            logger.info(
                "Connecting to VoiceLive API with model %s", self.model)

            async with connect(
                endpoint=self.endpoint,
                credential=self.credential,
                model=self.model,
            ) as connection:
                conn = connection
                self.connection = conn

                ap = AudioProcessor(conn)
                self.audio_processor = ap

                await self._setup_session()
                ap.start_playback()

                logger.info("Voice assistant ready! Start speaking...")
                print("\n" + "=" * 60)
                print("🎤 CUSTOM SPEECH VOICE ASSISTANT READY")
                print(f"Language: {self.transcription_language}")
                if self.custom_speech_enabled:
                    model_id = self.custom_speech_models.get(
                        self.transcription_language, "N/A")
                    # Show only the first 30 characters of the model ID
                    print(f"Custom Model: {model_id[:30]}...")
                print("Start speaking to begin conversation")
                print("Or use Cmd+Shift+P → 'Notebook: Restart Kernel' to exit")
                print("=" * 60 + "\n")

                await self._process_events()
        finally:
            if self.audio_processor:
                self.audio_processor.shutdown()

    async def _setup_session(self):
        """Configure the VoiceLive session with Custom Speech model.

        Builds the voice, turn-detection and (optionally) input
        transcription settings and sends them with ``session.update``.
        Requires ``self.connection`` to be set.
        """
        logger.info(
            "Setting up voice conversation session with Custom Speech...")

        # Configure Azure HD voice with optional temperature
        voice_config: Union[AzureStandardVoice, str]
        if "-" in self.voice:
            voice_config = AzureStandardVoice(
                name=self.voice,
                temperature=self.voice_temperature
            )
            logger.info(
                "Using Azure HD voice: %s, temperature: %s",
                self.voice, self.voice_temperature)
        else:
            voice_config = self.voice

        # VAD configuration
        # Alternative kept for reference (plain server VAD):
        # turn_detection_config = ServerVad(
        #     threshold=0.8,
        #     prefix_padding_ms=200,
        #     silence_duration_ms=1000
        # )

        turn_detection_config = AzureSemanticVad(
            threshold=0.3,
            prefix_padding_ms=300,
            speech_duration_ms=80,
            silence_duration_ms=500,
            remove_filler_words=False,
        )

        # Configure input audio transcription with Custom Speech.
        # Stays None (no transcription options sent) when disabled.
        input_transcription_config: Optional[AudioInputTranscriptionOptions] = None

        if self.custom_speech_enabled:
            # Build the transcription options with custom speech model
            input_transcription_config = AudioInputTranscriptionOptions(
                model=self.transcription_model,
                language=self.transcription_language,
                # Custom Speech model mapping (None when the mapping is empty)
                custom_speech=self.custom_speech_models if self.custom_speech_models else None,
                # Optional: Add phrase list for additional hints
                phrase_list=self.phrase_list if self.phrase_list else None,
            )

            logger.info(
                "Custom Speech enabled: model=%s, language=%s, "
                "custom_speech_models=%s",
                self.transcription_model,
                self.transcription_language,
                list(self.custom_speech_models.keys()),
            )
            print(
                f"🎯 Custom Speech configured for: {self.transcription_language}")

        session_config = RequestSession(
            modalities=[Modality.TEXT, Modality.AUDIO],
            instructions=self.instructions,
            voice=voice_config,
            input_audio_format=InputAudioFormat.PCM16,
            output_audio_format=OutputAudioFormat.PCM16,
            turn_detection=turn_detection_config,
            input_audio_echo_cancellation=AudioEchoCancellation(),
            input_audio_noise_reduction=AudioNoiseReduction(
                type="azure_deep_noise_suppression"),
            input_audio_transcription=input_transcription_config,
        )

        conn = self.connection
        assert conn is not None, "Connection must be established before setting up session"
        await conn.session.update(session=session_config)
        logger.info("Session configuration sent with Custom Speech")

    async def _process_events(self):
        """Process events from the VoiceLive connection.

        Iterates the connection and hands each event to ``_handle_event``.
        The stop flag is checked once per received event. Cancellation and
        other exceptions are logged and re-raised.
        """
        try:
            conn = self.connection
            assert conn is not None, "Connection must be established before processing events"
            async for event in conn:
                if self._stop_requested:
                    logger.info("Stop requested, exiting event loop")
                    break
                await self._handle_event(event)
        except asyncio.CancelledError:
            logger.info("Event processing cancelled")
            raise
        except Exception:
            logger.exception("Error processing events")
            raise

    async def _handle_event(self, event):
        """Handle different types of events from VoiceLive.

        Dispatches on ``event.type``: session readiness, speech start/stop
        (including barge-in cancellation), response lifecycle, audio deltas,
        input transcription results and errors. Unknown types are logged at
        debug level.
        """
        logger.debug("Received event: %s", event.type)
        ap = self.audio_processor
        conn = self.connection
        assert ap is not None, "AudioProcessor must be initialized"
        assert conn is not None, "Connection must be established"

        if event.type == ServerEventType.SESSION_UPDATED:
            logger.info("Session ready: %s", event.session.id)
            self.session_ready = True
            # Capture begins only after the session update is acknowledged
            ap.start_capture()

        elif event.type == ServerEventType.INPUT_AUDIO_BUFFER_SPEECH_STARTED:
            logger.info("User started speaking - stopping playback")
            print("🎤 Listening...")
            ap.skip_pending_audio()

            # Barge-in: cancel only while a response is still in progress
            if self._active_response and not self._response_api_done:
                try:
                    await conn.response.cancel()
                    logger.debug(
                        "Cancelled in-progress response due to barge-in")
                except Exception as e:
                    # A response that finished in the meantime is not an error
                    if "no active response" in str(e).lower():
                        logger.debug(
                            "Cancel ignored - response already completed")
                    else:
                        logger.warning("Cancel failed: %s", e)

        elif event.type == ServerEventType.INPUT_AUDIO_BUFFER_SPEECH_STOPPED:
            logger.info("🎤 User stopped speaking")
            print("🤔 Processing...")

        elif event.type == ServerEventType.RESPONSE_CREATED:
            logger.info("🤖 Assistant response created")
            self._active_response = True
            self._response_api_done = False

        elif event.type == ServerEventType.RESPONSE_AUDIO_DELTA:
            logger.debug("Received audio delta")
            ap.queue_audio(event.delta)

        elif event.type == ServerEventType.RESPONSE_AUDIO_DONE:
            logger.info("🤖 Assistant finished speaking")
            print("🎤 Ready for next input...")

        elif event.type == ServerEventType.RESPONSE_DONE:
            logger.info("✅ Response complete")
            self._active_response = False
            self._response_api_done = True

        # Handle input audio transcription events
        elif event.type == ServerEventType.CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_COMPLETED:
            transcript = getattr(event, 'transcript', None)
            if transcript:
                logger.info("📝 User said (Custom Speech): %s", transcript)
                print(f"📝 You said: {transcript}")

        elif event.type == ServerEventType.CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_DELTA:
            delta = getattr(event, 'delta', None)
            if delta:
                logger.debug("Transcription delta: %s", delta)

        elif event.type == ServerEventType.CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_FAILED:
            error = getattr(event, 'error', None)
            logger.warning("Input transcription failed: %s", error)
            print(f"⚠️ Transcription error: {error}")

        elif event.type == ServerEventType.ERROR:
            msg = event.error.message
            # The cancel race above can surface as a server error; keep it quiet
            if "Cancellation failed: no active response" in msg:
                logger.debug("Benign cancellation error: %s", msg)
            else:
                logger.error("❌ VoiceLive error: %s", msg)
                print(f"Error: {msg}")

        elif event.type == ServerEventType.CONVERSATION_ITEM_CREATED:
            logger.debug("Conversation item created: %s", event.item.id)

        else:
            logger.debug("Unhandled event type: %s", event.type)


print("✅ CustomSpeechVoiceAssistant class defined")

## 5. Check Audio System

Verify that audio input/output devices are available before starting the assistant.

In [None]:
def check_audio_system():
    """Check if audio input/output devices are available.

    Returns:
        True when PyAudio reports at least one device with input channels
        and at least one with output channels; False otherwise, including
        when the probe itself raises.
    """
    try:
        p = pyaudio.PyAudio()
        try:
            input_devices = []
            output_devices = []
            # Single pass: query each device once and classify it by its
            # reported channel counts (a missing/None count is treated as 0).
            for i in range(p.get_device_count()):
                info = p.get_device_info_by_index(i)
                if cast(Union[int, float], info.get("maxInputChannels", 0) or 0) > 0:
                    input_devices.append(i)
                if cast(Union[int, float], info.get("maxOutputChannels", 0) or 0) > 0:
                    output_devices.append(i)
        finally:
            # Always terminate the PyAudio instance, even if enumeration raised
            p.terminate()

        if not input_devices:
            print("❌ No audio input devices found. Please check your microphone.")
            return False
        if not output_devices:
            print("❌ No audio output devices found. Please check your speakers.")
            return False

        print(f"✅ Found {len(input_devices)} input device(s) and {len(output_devices)} output device(s)")
        return True

    except Exception as e:
        # Notebook-level boundary: report the failure instead of raising
        print(f"❌ Audio system check failed: {e}")
        return False

# Run the audio check
audio_ok = check_audio_system()

## 6. Run the Custom Speech Voice Assistant

Start the voice assistant with Custom Speech model for improved transcription accuracy.

**Note:** This cell will run continuously until you interrupt it. Use the stop button (⏹) in the notebook toolbar to stop.

In [None]:
async def run_custom_speech_assistant():
    """Run the voice assistant with Custom Speech model.

    Validates that some form of authentication is configured, warns about
    an incomplete Custom Speech configuration, builds the credential and
    the assistant from the module-level settings, and awaits
    ``assistant.start()``.

    Returns early (without raising) when neither an API key nor token
    authentication is configured. Keyboard interrupts and task cancellation
    are reported and swallowed; any other exception is reported and
    re-raised.
    """
    # Validate credentials
    if not API_KEY and not USE_TOKEN_CREDENTIAL:
        print("❌ Error: No authentication provided")
        print("Please set AZURE_VOICELIVE_API_KEY in the .env file,")
        print("or set USE_TOKEN_CREDENTIAL = True for Azure authentication.")
        return

    # Validate Custom Speech configuration (warnings only; the run continues)
    if CUSTOM_SPEECH_ENABLED:
        if not CUSTOM_SPEECH_MODELS:
            print("⚠️ Warning: Custom Speech enabled but no models configured")
        elif TRANSCRIPTION_LANGUAGE not in CUSTOM_SPEECH_MODELS:
            print(f"⚠️ Warning: No custom model for language '{TRANSCRIPTION_LANGUAGE}'")
            print(f"   Available: {list(CUSTOM_SPEECH_MODELS.keys())}")

    # Create client with appropriate credential
    credential: Union[AzureKeyCredential, AsyncTokenCredential]
    if USE_TOKEN_CREDENTIAL:
        credential = AzureCliCredential()
        logger.info("Using Azure token credential")
        print("🔐 Using Azure CLI credential")
    else:
        credential = AzureKeyCredential(API_KEY)
        logger.info("Using API key credential")
        print("🔑 Using API key credential")

    # Create and start voice assistant
    assistant = CustomSpeechVoiceAssistant(
        endpoint=ENDPOINT,
        credential=credential,
        model=MODEL,
        voice=VOICE,
        instructions=INSTRUCTIONS,
        voice_temperature=VOICE_TEMPERATURE,
        # Custom Speech settings
        custom_speech_enabled=CUSTOM_SPEECH_ENABLED,
        transcription_model=INPUT_AUDIO_TRANSCRIPTION_MODEL,
        transcription_language=TRANSCRIPTION_LANGUAGE,
        custom_speech_models=CUSTOM_SPEECH_MODELS,
        phrase_list=PHRASE_LIST,
    )

    print("\n🎙️  Custom Speech Voice Assistant")
    print("=" * 50)
    print(f"📝 Transcription Model: {INPUT_AUDIO_TRANSCRIPTION_MODEL}")
    print(f"🌐 Language: {TRANSCRIPTION_LANGUAGE}")
    if CUSTOM_SPEECH_ENABLED and TRANSCRIPTION_LANGUAGE in CUSTOM_SPEECH_MODELS:
        # Show only the first 30 characters of the model ID
        print(f"🎯 Custom Model ID: {CUSTOM_SPEECH_MODELS[TRANSCRIPTION_LANGUAGE][:30]}...")
    print("=" * 50)

    try:
        await assistant.start()
    except KeyboardInterrupt:
        print("\n👋 Voice assistant shut down. Goodbye!")
    except asyncio.CancelledError:
        print("\n👋 Voice assistant interrupted. Goodbye!")
    except Exception as e:
        print(f"❌ Fatal Error: {e}")
        raise

# Run the assistant
if audio_ok:
    try:
        await run_custom_speech_assistant()
    except asyncio.CancelledError:
        print("\nüëã Session ended. Goodbye!")
else:
    print("‚ö†Ô∏è Please fix audio issues before running the voice assistant.")

## 7. Multi-Language Custom Speech Example

For scenarios requiring multiple languages with custom models, you can configure like this:

In [None]:
# Example: Multi-language Custom Speech configuration
# Each language can have its own custom-trained model.
# Keys are language codes, values are Custom Speech model (endpoint) IDs.

MULTI_LANGUAGE_CUSTOM_SPEECH = {
    # Chinese - your deployed custom model
    "zh-CN": "dbdc0514-efcf-49c9-b022-040e463c4725",

    # English - add your English custom model when available
    # "en-US": "your-english-custom-model-id",

    # Japanese - add your Japanese custom model when available
    # "ja-JP": "your-japanese-custom-model-id",
}

# Print the mapping followed by a short list of deployment tips
print("Multi-language Custom Speech Configuration:")
print("=" * 50)
for lang, model_id in MULTI_LANGUAGE_CUSTOM_SPEECH.items():
    print(f"  {lang}: {model_id}")
print()
print("💡 Tips:")
print("  - Deploy custom models in the SAME region as your Foundry resource")
print("  - Custom Speech training and hosting incur additional costs")
print("  - For language codes, see Azure Speech Service documentation")

## 8. Transcription Model Comparison

Different models support different transcription options:

| Model Type | Supported Transcription Models | Custom Speech Support |
|------------|-------------------------------|----------------------|
| gpt-4o | `whisper-1`, `gpt-4o-transcribe`, `gpt-4o-mini-transcribe` | ❌ No |
| gpt-realtime | `whisper-1`, `gpt-4o-transcribe`, `gpt-4o-mini-transcribe` | ❌ No |
| gpt-realtime-mini | `whisper-1`, `gpt-4o-transcribe`, `gpt-4o-mini-transcribe` | ❌ No |
| phi4-mm-realtime | `azure-speech` | ✅ Yes |
| Non-multimodal models | `azure-speech` | ✅ Yes |

**Note:** Custom Speech and Phrase List are only supported with `azure-speech` transcription model.

In [None]:
# Helper function to print supported configurations
def print_transcription_support():
    """Print transcription model support matrix.

    Writes a fixed three-column table (model, transcription models, Custom
    Speech support) to stdout, followed by the Custom Speech requirements.
    Returns None.
    """
    print("\n📊 Transcription Model Support Matrix")
    print("=" * 70)
    print(f"{'Model':<25} {'Transcription':<30} {'Custom Speech'}")
    print("-" * 70)

    # (model, transcription models, custom speech support marker)
    configs = [
        ("gpt-4o", "whisper-1, gpt-4o-transcribe", "❌"),
        ("gpt-realtime", "whisper-1, gpt-4o-transcribe", "❌"),
        ("gpt-realtime-mini", "whisper-1, gpt-4o-transcribe", "❌"),
        ("phi4-mm-realtime", "azure-speech", "✅"),
        ("Non-multimodal", "azure-speech", "✅"),
    ]

    for model, transcription, custom in configs:
        # Left-align to the same widths as the header row
        print(f"{model:<25} {transcription:<30} {custom}")

    print("\n💡 Custom Speech requires:")
    print("   1. azure-speech as transcription model")
    print("   2. Custom model deployed in same region as Foundry resource")
    print("   3. Valid endpoint ID from Custom Speech deployment")

print_transcription_support()