## 1. Install Required Packages

In [None]:
# Prerequisite: PyAudio needs the PortAudio library. With Homebrew: brew install portaudio
# Then install the Python packages used by this notebook
%pip install azure-ai-voicelive azure-identity python-dotenv pyaudio

## 2. Import Libraries and Setup

In [None]:
from __future__ import annotations
import os
import asyncio
from datetime import datetime
import logging
from typing import Union, Optional, TYPE_CHECKING, cast, List

from azure.core.credentials import AzureKeyCredential
from azure.core.credentials_async import AsyncTokenCredential
from azure.identity.aio import AzureCliCredential

from azure.ai.voicelive.aio import connect
from azure.ai.voicelive.models import (
    AudioEchoCancellation,
    AudioInputTranscriptionOptions,
    AudioNoiseReduction,
    AzureSemanticVad,
    AzureStandardVoice,
    InputAudioFormat,
    Modality,
    OutputAudioFormat,
    RequestSession,
    ServerEventType,
)
from dotenv import load_dotenv
import pyaudio

# Import reusable AudioProcessor from local module
from audio_processor import AudioProcessor

if TYPE_CHECKING:
    from azure.ai.voicelive.aio import VoiceLiveConnection

# Load environment variables from the local .env file; values in the file
# override anything already present in the process environment.
load_dotenv('./.env', override=True)

# Setup logging
# exist_ok=True avoids the check-then-create race of testing os.path.exists first.
os.makedirs('logs', exist_ok=True)

# One log file per run, named after the time this cell was executed.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
logging.basicConfig(
    filename=f'logs/{timestamp}_phrase_list.log',
    filemode="w",
    format='%(asctime)s:%(name)s:%(levelname)s:%(message)s',
    level=logging.INFO
)
logger = logging.getLogger(__name__)
print("✅ Libraries imported and logging configured")

## 3. Understanding Phrase Lists

### Key Concepts

| Feature | Description |
|---------|-------------|
| **Purpose** | Boost recognition of specific words/phrases |
| **Supported Models** | `azure-speech`, `azure-fast-transcription` |
| **NOT Supported** | `whisper-1` (OpenAI Whisper) |
| **Max Phrases** | Up to ~500; 50-200 recommended for best performance |
| **Best Practices** | Include variations, common misspellings |

### When to Use Phrase Lists

✅ **Good Use Cases:**
- Product catalogs with unique names
- Company-specific terminology
- Technical documentation terms
- Customer service applications with specific terms

❌ **Not Recommended:**
- Common everyday words
- Extremely long phrases (keep to 3-4 words max)
- Thousands of phrases (performance impact)

## 4. Define Phrase List Configuration

Here we define different phrase list examples for various domains.

In [None]:
# ============================================================
# PHRASE LIST EXAMPLES FOR DIFFERENT DOMAINS
# ============================================================
# Each list is a plain list of strings; they can be concatenated with `+`
# to build the list that is sent to the transcription service.

# Example 1: Consumer Electronics / Retail
ELECTRONICS_PHRASES: List[str] = [
    # TV Products
    "Neo QLED TV",
    "OLED TV",
    "Mini LED",
    "Samsung Frame TV",

    # Gaming Products
    "TUF Gaming",
    "ASUS TUF Gaming",
    "TUF Dash",
    "ROG Strix",
    "ROG Zephyrus",
    "GeForce RTX",
    "Radeon RX",

    # Audio Products
    "AirPods Pro",
    "Galaxy Buds",
    "WH-1000XM5",
    "QuietComfort",
]

# Example 2: Software / Technology
TECH_PHRASES: List[str] = [
    # Azure Services
    "Azure OpenAI",
    "Azure Cosmos DB",
    "Azure Functions",
    "Azure AI Foundry",
    "VoiceLive SDK",

    # Development Tools
    "VS Code",
    "GitHub Copilot",
    "AutoQuote Explorer",
    "Jupyter Notebook",

    # Frameworks & Libraries
    "FastAPI",
    "LangChain",
    "Semantic Kernel",
    "PyTorch",
    "TensorFlow",
]

# Example 3: Company/Brand Names
BRAND_PHRASES: List[str] = [
    "Microsoft",
    "OpenAI",
    "NVIDIA",
    "AMD",
    "ASUS",
    "Advantech",
    "Contoso",
    "Fabrikam",
]

# Example 4: Chinese Terms (中文术语)
# NOTE: these must be real UTF-8 Chinese text; mis-encoded strings can never
# match what the recognizer produces.
CHINESE_PHRASES: List[str] = [
    "人工智能",      # artificial intelligence
    "机器学习",      # machine learning
    "深度学习",      # deep learning
    "自然语言处理",  # natural language processing
    "语音识别",      # speech recognition
    "语音合成",      # speech synthesis
    "大语言模型",    # large language model
    "GPT-4o",
]

print("📝 Phrase list examples defined:")
print(f"   Electronics: {len(ELECTRONICS_PHRASES)} phrases")
print(f"   Technology: {len(TECH_PHRASES)} phrases")
print(f"   Brands: {len(BRAND_PHRASES)} phrases")
print(f"   Chinese: {len(CHINESE_PHRASES)} phrases")

## 5. Voice Assistant with Phrase List Support

In [None]:
class PhraseListVoiceAssistant:
    """Voice assistant that passes a phrase list to input transcription.

    The assistant opens a VoiceLive connection, configures the session so
    that user-speech transcription receives ``phrase_list``, and then prints
    user and assistant transcripts as events arrive.

    Args:
        endpoint: VoiceLive endpoint passed to ``connect``.
        credential: API-key or async token credential passed to ``connect``.
        model: Model name passed to ``connect``.
        voice: Voice name used for ``AzureStandardVoice``; also used to derive
            the transcription language when none is given.
        instructions: System instructions for the session.
        phrase_list: Terms forwarded to ``AudioInputTranscriptionOptions``.
        transcription_model: Input transcription model name.
        transcription_language: Explicit transcription language. When falsy,
            the language is derived from ``voice`` (for example
            ``"zh-CN-Xiaochen:..."`` gives ``"zh-CN"``), falling back to
            ``"en-US"``.
    """

    # Language used when none is given and none can be derived from the voice.
    _DEFAULT_LANGUAGE = "en-US"

    def __init__(
        self,
        endpoint: str,
        credential: Union[AzureKeyCredential, AsyncTokenCredential],
        model: str,
        voice: str,
        instructions: str,
        # Phrase list configuration
        phrase_list: List[str],
        transcription_model: str = "azure-speech",
        transcription_language: Optional[str] = None,
    ):
        self.endpoint = endpoint
        self.credential = credential
        self.model = model
        self.voice = voice
        self.instructions = instructions
        self.phrase_list = phrase_list
        self.transcription_model = transcription_model

        # An explicit language wins; otherwise derive it from the voice name.
        self.transcription_language = (
            transcription_language or self._language_from_voice(voice)
        )

        self.connection: Optional["VoiceLiveConnection"] = None
        self.audio_processor: Optional[AudioProcessor] = None
        # True between RESPONSE_CREATED and RESPONSE_DONE.
        self._active_response = False
        # Accumulates assistant transcript deltas for the current response.
        self._assistant_transcript_buffer = ""

    @classmethod
    def _language_from_voice(cls, voice: str) -> str:
        """Return the ``xx-YY`` prefix of a voice name, or the default language."""
        if not voice or "-" not in voice:
            return cls._DEFAULT_LANGUAGE
        language, region = voice.split("-")[:2]
        # Drop anything after ':' in the second segment (e.g. "US:Variant").
        return f"{language}-{region.split(':')[0]}"

    async def start(self):
        """Connect, configure the session, and process events until the stream ends.

        The audio processor is always shut down on exit, including on errors
        and cancellation.
        """
        try:
            logger.info("Connecting to VoiceLive API with phrase list support")

            async with connect(
                endpoint=self.endpoint,
                credential=self.credential,
                model=self.model,
            ) as connection:
                self.connection = connection
                self.audio_processor = AudioProcessor(connection)

                await self._setup_session()
                self.audio_processor.start_playback()

                print("\n" + "=" * 60)
                print("🎤 PHRASE LIST VOICE ASSISTANT READY")
                print(f"📝 Phrase list: {len(self.phrase_list)} terms configured")
                print(f"🌐 Language: {self.transcription_language}")
                print(f"🔊 Transcription: {self.transcription_model}")
                print("\nTry saying some of your custom phrases!")
                print("Use Cmd+Shift+P → 'Notebook: Restart Kernel' to exit")
                print("=" * 60 + "\n")

                await self._process_events()
        finally:
            if self.audio_processor:
                self.audio_processor.shutdown()

    async def _setup_session(self):
        """Build the session configuration (including the phrase list) and send it."""
        logger.info("Setting up session with %d phrases", len(self.phrase_list))

        # Configure voice
        voice_config = AzureStandardVoice(name=self.voice)

        # Configure VAD - AzureSemanticVad is required for azure-speech
        turn_detection_config = AzureSemanticVad(
            threshold=0.3,
            prefix_padding_ms=300,
            silence_duration_ms=500,
        )

        # Configure input transcription WITH phrase list.
        # This is the setting the whole notebook is about.
        input_transcription_config = AudioInputTranscriptionOptions(
            model=self.transcription_model,
            language=self.transcription_language,
            phrase_list=self.phrase_list,  # <-- Key configuration!
        )

        # Log only the first few phrases to keep the log readable.
        logger.info(
            "Phrase list configured: %s%s",
            self.phrase_list[:5],
            "..." if len(self.phrase_list) > 5 else "",
        )

        session_config = RequestSession(
            modalities=[Modality.TEXT, Modality.AUDIO],
            instructions=self.instructions,
            voice=voice_config,
            input_audio_format=InputAudioFormat.PCM16,
            output_audio_format=OutputAudioFormat.PCM16,
            turn_detection=turn_detection_config,
            input_audio_echo_cancellation=AudioEchoCancellation(),
            input_audio_noise_reduction=AudioNoiseReduction(
                type="azure_deep_noise_suppression"),
            input_audio_transcription=input_transcription_config,
        )

        # Internal invariant: start() sets the connection before calling this.
        assert self.connection is not None
        await self.connection.session.update(session=session_config)
        logger.info("Session with phrase list configured")

    async def _process_events(self):
        """Dispatch every event yielded by the connection to ``_handle_event``."""
        assert self.connection is not None
        async for event in self.connection:
            await self._handle_event(event)

    async def _handle_event(self, event):
        """React to a single server event based on ``event.type``."""
        ap = self.audio_processor
        conn = self.connection
        assert ap is not None and conn is not None

        if event.type == ServerEventType.SESSION_UPDATED:
            # Microphone capture starts only once the session update is acknowledged.
            logger.info("Session ready")
            ap.start_capture()

        elif event.type == ServerEventType.INPUT_AUDIO_BUFFER_SPEECH_STARTED:
            print("🎤 Listening...")
            # NOTE(review): presumably drops queued assistant audio so the user
            # can interrupt - confirm against AudioProcessor.
            ap.skip_pending_audio()
            if self._active_response:
                try:
                    await conn.response.cancel()
                except Exception as exc:
                    # Best effort: a failed cancel must not stop event
                    # processing, but it should still be visible in the log.
                    logger.debug("Could not cancel active response: %s", exc)

        elif event.type == ServerEventType.INPUT_AUDIO_BUFFER_SPEECH_STOPPED:
            print("🤔 Processing...")

        elif event.type == ServerEventType.RESPONSE_CREATED:
            self._active_response = True
            self._assistant_transcript_buffer = ""

        elif event.type == ServerEventType.RESPONSE_AUDIO_DELTA:
            ap.queue_audio(event.delta)

        elif event.type == ServerEventType.RESPONSE_AUDIO_DONE:
            print("🎤 Ready for next input...")

        elif event.type == ServerEventType.RESPONSE_DONE:
            self._active_response = False

        elif event.type == ServerEventType.RESPONSE_AUDIO_TRANSCRIPT_DELTA:
            delta = getattr(event, 'delta', None)
            if delta:
                self._assistant_transcript_buffer += delta

        elif event.type == ServerEventType.RESPONSE_AUDIO_TRANSCRIPT_DONE:
            # Prefer the final transcript; fall back to the accumulated deltas.
            transcript = getattr(event, 'transcript', None) or self._assistant_transcript_buffer
            if transcript:
                print(f"🤖 Assistant: {transcript}")
            self._assistant_transcript_buffer = ""

        # Key event: User speech transcription with phrase list applied
        elif event.type == ServerEventType.CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_COMPLETED:
            transcript = getattr(event, 'transcript', None)
            if transcript:
                print(f"📝 You said: {transcript}")
                # Case-insensitive substring check for configured phrases;
                # the transcript is lower-cased once, not once per phrase.
                lowered = transcript.lower()
                matched_phrases = [p for p in self.phrase_list if p.lower() in lowered]
                if matched_phrases:
                    print(f"   ✅ Recognized phrases: {matched_phrases}")

        elif event.type == ServerEventType.CONVERSATION_ITEM_INPUT_AUDIO_TRANSCRIPTION_FAILED:
            error = getattr(event, 'error', None)
            logger.warning("Transcription failed: %s", error)

        elif event.type == ServerEventType.ERROR:
            # Tolerate a missing error object or message instead of crashing on .lower().
            error = getattr(event, 'error', None)
            msg = str(getattr(error, 'message', None) or "")
            # "no active response" errors are deliberately not shown to the user.
            if "no active response" not in msg.lower():
                print(f"❌ Error: {msg or 'unknown error'}")


print("✅ PhraseListVoiceAssistant class defined")

## 6. Audio System Check

In [None]:
def check_audio_system():
    """Report whether at least one audio input and one output device exist.

    Returns:
        True when PyAudio lists at least one device with input channels and
        at least one with output channels; False otherwise, or when PyAudio
        raises while being queried.
    """
    try:
        p = pyaudio.PyAudio()
        try:
            input_devices = []
            output_devices = []
            # Single pass: query each device once and classify it.
            for i in range(p.get_device_count()):
                info = p.get_device_info_by_index(i)
                if cast(Union[int, float], info.get("maxInputChannels", 0) or 0) > 0:
                    input_devices.append(i)
                if cast(Union[int, float], info.get("maxOutputChannels", 0) or 0) > 0:
                    output_devices.append(i)
        finally:
            # Always release the PyAudio instance, even if enumeration fails.
            p.terminate()
    except Exception as e:
        print(f"❌ Audio check failed: {e}")
        return False

    if not input_devices:
        print("❌ No audio input devices found.")
        return False
    if not output_devices:
        print("❌ No audio output devices found.")
        return False

    print(f"✅ Audio: {len(input_devices)} input, {len(output_devices)} output device(s)")
    return True

audio_ok = check_audio_system()

## 7. Configuration & Run

Configure your phrase list and run the assistant. The key settings are:

1. **ACTIVE_PHRASE_LIST**: Choose which phrase list to use
2. **TRANSCRIPTION_MODEL**: Must be `azure-speech` or `azure-fast-transcription`
3. **VOICE**: Sets the language for transcription automatically

In [None]:
# ============================================================
# CONFIGURATION
# ============================================================

# Azure Credentials (read from the environment / .env file)
API_KEY = os.environ.get("AZURE_VOICELIVE_API_KEY")
# The default below is only a placeholder; set AZURE_VOICELIVE_ENDPOINT to a real endpoint.
ENDPOINT = os.environ.get("AZURE_VOICELIVE_ENDPOINT", "https://your-resource.services.ai.azure.com/")
MODEL = "gpt-realtime"

# Voice Configuration
# Change to "en-US-Ava:DragonHDLatestNeural" for English
VOICE = "zh-CN-Xiaochen:DragonHDLatestNeural"

# ============================================================
# PHRASE LIST CONFIGURATION - The focus of this notebook!
# ============================================================

# Transcription model (phrase list requires azure-speech or azure-fast-transcription)
TRANSCRIPTION_MODEL = "azure-speech"  # Options: "azure-speech", "azure-fast-transcription"

# Choose your phrase list or create a custom one
# Options: ELECTRONICS_PHRASES, TECH_PHRASES, BRAND_PHRASES, CHINESE_PHRASES
# Or combine them:
ACTIVE_PHRASE_LIST: List[str] = (
    ELECTRONICS_PHRASES +
    TECH_PHRASES +
    BRAND_PHRASES
)

# Or define your own custom phrases:
# ACTIVE_PHRASE_LIST: List[str] = [
#     "Your Custom Term",
#     "Another Product Name",
#     "Technical Jargon",
# ]

# System instructions
INSTRUCTIONS = """
You are a helpful voice assistant with expertise in technology products.
When users mention specific product names or technical terms, acknowledge them correctly.
Keep responses concise and conversational.
"""

# Authentication: False = API key from AZURE_VOICELIVE_API_KEY, True = Azure CLI credential
USE_TOKEN_CREDENTIAL = False

# ============================================================
# DISPLAY CONFIGURATION
# ============================================================

print("📋 PHRASE LIST CONFIGURATION")
print("=" * 50)
print(f"📍 Endpoint: {ENDPOINT}")
print(f"🤖 Model: {MODEL}")
print(f"🎙️ Voice: {VOICE}")
print(f"🔊 Transcription Model: {TRANSCRIPTION_MODEL}")
# Only report whether a key is present; never print the key itself.
print(f"🔑 API Key: {'Set' if API_KEY else 'Not set'}")
print()
print(f"📝 PHRASE LIST ({len(ACTIVE_PHRASE_LIST)} terms):")
print("-" * 50)
# Show the first 20 phrases, one per line, then summarize the remainder.
for phrase in ACTIVE_PHRASE_LIST[:20]:
    print(f"   • {phrase}")
if len(ACTIVE_PHRASE_LIST) > 20:
    print(f"   ... and {len(ACTIVE_PHRASE_LIST) - 20} more")
print()

# Validate configuration: warn (do not fail) on models without phrase-list support.
if TRANSCRIPTION_MODEL not in ("azure-speech", "azure-fast-transcription"):
    print("⚠️  WARNING: Phrase lists only work with azure-speech or azure-fast-transcription!")
    print(f"   Current model '{TRANSCRIPTION_MODEL}' does not support phrase lists.")

## 8. Run the Voice Assistant

In [None]:
async def run_phrase_list_assistant():
    """Build a credential and run a PhraseListVoiceAssistant until it ends.

    Uses an Azure CLI credential when ``USE_TOKEN_CREDENTIAL`` is true and an
    API-key credential otherwise. Returns early (after printing a hint) when
    neither authentication option is available.

    Raises:
        Exception: any error from ``assistant.start()`` other than
            ``asyncio.CancelledError`` is printed and re-raised.
    """
    if not API_KEY and not USE_TOKEN_CREDENTIAL:
        print("❌ Error: No authentication provided")
        print("Set AZURE_VOICELIVE_API_KEY in .env file")
        return

    credential: Union[AzureKeyCredential, AsyncTokenCredential]
    if USE_TOKEN_CREDENTIAL:
        credential = AzureCliCredential()
        print("🔐 Using Azure CLI credential")
    else:
        credential = AzureKeyCredential(API_KEY)
        print("🔑 Using API key credential")

    assistant = PhraseListVoiceAssistant(
        endpoint=ENDPOINT,
        credential=credential,
        model=MODEL,
        voice=VOICE,
        instructions=INSTRUCTIONS,
        phrase_list=ACTIVE_PHRASE_LIST,
        transcription_model=TRANSCRIPTION_MODEL,
    )

    print("\n🎙️  Starting Phrase List Voice Assistant...")
    print(f"📝 {len(ACTIVE_PHRASE_LIST)} phrases configured for enhanced recognition")

    try:
        await assistant.start()
    except asyncio.CancelledError:
        # Cancellation is the normal way to stop the session from a notebook.
        print("\n👋 Session ended.")
    except Exception as e:
        print(f"❌ Error: {e}")
        raise
    finally:
        # Async token credentials expose an awaitable close(); the key
        # credential has none, so look the method up defensively.
        close = getattr(credential, "close", None)
        if close is not None:
            await close()


# Run the assistant
if audio_ok:
    try:
        await run_phrase_list_assistant()
    except asyncio.CancelledError:
        print("\nüëã Goodbye!")
else:
    print("‚ö†Ô∏è Please fix audio issues before running.")

## 9. Tips for Effective Phrase Lists

### Best Practices

1. **Include Variations**
   ```python
   phrase_list = [
       "TUF Gaming",
       "TUF",
       "ASUS TUF",
       "ASUS TUF Gaming",
   ]
   ```

2. **Include Common Pronunciations**
   ```python
   phrase_list = [
       "ASUS",      # Correct
       "A-sus",     # Common pronunciation
   ]
   ```

3. **Group Related Terms**
   ```python
   # Product line
   ROG_PHRASES = ["ROG", "ROG Strix", "ROG Zephyrus", "ROG Flow"]
   
   # Service names
   AZURE_PHRASES = ["Azure", "Azure OpenAI", "Azure AI Foundry"]
   ```

4. **Keep Phrases Short**
   - ✅ Good: "Neo QLED" (2 words)
   - ⚠️ Okay: "Samsung Neo QLED TV" (4 words)
   - ❌ Bad: "Samsung 65 inch Neo QLED 8K Smart TV" (too long)

### Performance Considerations

- **Recommended**: 50-200 phrases for optimal performance
- **Maximum**: ~500 phrases (may impact latency)
- **Update Strategy**: Rotate phrases based on context if needed

## 10. Comparing Recognition: With vs Without Phrase List

| Spoken Phrase | Without Phrase List | With Phrase List |
|--------------|---------------------|------------------|
| "TUF Gaming laptop" | "tough gaming laptop" | "TUF Gaming laptop" ✅ |
| "Neo QLED TV" | "neo Q L E D TV" | "Neo QLED TV" ✅ |
| "Azure Cosmos DB" | "azure cosmos D B" | "Azure Cosmos DB" ✅ |
| "ROG Strix" | "rog stricks" | "ROG Strix" ✅ |

The phrase list significantly improves recognition of specialized vocabulary!