# Pronunciation Assessment with Timestamped Feedback

This notebook demonstrates the complete workflow for pronunciation assessment:

1. **POWSM Phone Recognition (PR)** - Extract actual IPA phones from audio
2. **POWSM G2P** - Generate target IPA from English text
3. **MFA Alignment** - Align phones to timestamps in audio
4. **Error Detection** - Compare actual vs target IPA
5. **Timestamp Mapping** - Map errors to audio timestamps

## Setup

In [5]:
import sys
import os
from pathlib import Path
import tempfile
import json
import shutil
from typing import Dict, List

# Add mod directory to path for imports
mod_path = Path("../../mod")
sys.path.insert(0, str(mod_path.absolute()))

# POWSM imports
from espnet2.bin.s2t_inference import Speech2Text
from espnet2.bin.s2t_inference_language import Speech2Language
import soundfile as sf
import librosa
import numpy as np

# MFA availability check
# MFA requires Kaldi/kalpy which are conda packages
# We need to find the conda MFA, not the venv one (which lacks kalpy)
import subprocess
import shutil
import os

MFA_AVAILABLE = False
MFA_CMD = None

def find_conda_mfa():
    """Try to find MFA in conda environments."""
    # Common conda installation paths
    conda_bases = [
        os.path.expanduser("~/miniconda3"),
        os.path.expanduser("~/anaconda3"),
        os.path.expanduser("~/opt/anaconda3"),
        os.path.expanduser("~/conda"),
        "/opt/anaconda3",
        "/opt/miniconda3",
        "/usr/local/anaconda3",
        "/usr/local/miniconda3",
    ]
    
    # Check CONDA_PREFIX environment variable (current conda env)
    if "CONDA_PREFIX" in os.environ:
        conda_bases.insert(0, os.environ["CONDA_PREFIX"])
    
    # Check CONDA_DEFAULT_ENV to see if we're in a conda env
    if "CONDA_DEFAULT_ENV" in os.environ:
        env_name = os.environ["CONDA_DEFAULT_ENV"]
        if "CONDA_PREFIX" in os.environ:
            mfa_path = os.path.join(os.environ["CONDA_PREFIX"], "bin", "mfa")
            if os.path.exists(mfa_path):
                return mfa_path, os.environ["CONDA_PREFIX"]
    
    # Check common environment names
    env_names = ["aligner", "mfa", "pronunciation", "base"]
    
    for conda_base in conda_bases:
        if os.path.exists(conda_base):
            # Check if it's a conda base or envs directory
            envs_dir = os.path.join(conda_base, "envs")
            if os.path.exists(envs_dir):
                # It's a conda base, check envs
                for env_name in env_names:
                    env_path = os.path.join(envs_dir, env_name)
                    mfa_path = os.path.join(env_path, "bin", "mfa")
                    if os.path.exists(mfa_path):
                        return mfa_path, env_path
            else:
                # Might be a direct environment
                mfa_path = os.path.join(conda_base, "bin", "mfa")
                if os.path.exists(mfa_path):
                    return mfa_path, conda_base
    
    return None, None

def check_mfa_dependencies(conda_env_path):
    """Check if MFA dependencies (openfst, kaldi, kalpy) are available."""
    if not conda_env_path:
        return False, []
    
    bin_dir = os.path.join(conda_env_path, "bin")
    missing = []
    
    # Check for fstcompile (OpenFST)
    fstcompile = os.path.join(bin_dir, "fstcompile")
    if not os.path.exists(fstcompile):
        missing.append("openfst (fstcompile)")
    
    # Check for kaldi binaries (at least one)
    kaldi_bins = ["gmm-align-compiled", "gmm-latgen-faster", "fstcompile"]
    has_kaldi = any(os.path.exists(os.path.join(bin_dir, bin_name)) for bin_name in kaldi_bins)
    if not has_kaldi:
        missing.append("kaldi")
    
    # Check for kalpy (Python module - harder to check, but we'll try)
    # We can't easily check Python modules from here, but we'll catch it at runtime
    
    return len(missing) == 0, missing

# Strategy: Try conda MFA first, then check PATH
print("Checking for MFA availability...")
print("="*70)
print("MFA Setup: Hybrid Conda/Venv Configuration")
print("="*70)
print("This notebook runs in a Python venv but uses MFA from a conda environment.")
print("MFA requires Kaldi, kalpy, and OpenFST which are conda-only packages.")
print("="*70)

# First, try to find conda MFA
conda_mfa, conda_env_path = find_conda_mfa()
if conda_mfa:
    print(f"\n‚úì Found conda MFA at: {conda_mfa}")
    if conda_env_path:
        print(f"  Conda environment: {conda_env_path}")
    
    # Check for required dependencies
    deps_ok, missing_deps = check_mfa_dependencies(conda_env_path)
    if not deps_ok:
        print(f"\n‚ö†Ô∏è  Missing MFA dependencies: {', '.join(missing_deps)}")
        print("\nTo fix, run in your terminal:")
        print(f"  conda activate {os.path.basename(conda_env_path) if conda_env_path else 'aligner'}")
        print("  conda install -c conda-forge openfst kaldi kalpy")
        print("\nOr reinstall MFA with all dependencies:")
        print("  conda install -c conda-forge montreal-forced-aligner --force-reinstall")
    else:
        print("  ‚úì Dependencies check passed (OpenFST, Kaldi found)")
    
    MFA_CMD = conda_mfa
    
    # Test if MFA works - check if it's executable and doesn't have kalpy errors
    mfa_works = False
    
    # Simple test: try any MFA command and check for errors
    test_commands = [
        ["--help"],
        ["version"],
    ]
    
    for test_cmd in test_commands:
        try:
            # Use conda run to ensure proper environment
            if conda_env_path:
                # Try using conda run to execute in the right environment
                env_name = os.path.basename(conda_env_path)
                result = subprocess.run(
                    ["conda", "run", "-n", env_name, "mfa"] + test_cmd,
                    capture_output=True,
                    text=True,
                    timeout=5,
                    env=os.environ.copy()
                )
            else:
                result = subprocess.run(
                    [conda_mfa] + test_cmd,
                    capture_output=True,
                    text=True,
                    timeout=5,
                    env=os.environ.copy()
                )
            
            # Check for common errors
            error_output = (result.stderr + result.stdout).lower()
            
            # Check for kalpy error
            if "kalpy" in error_output or "_kalpy" in error_output or "no module named '_kalpy'" in error_output:
                print(f"\n‚ö†Ô∏è  MFA found but missing kalpy (Kaldi Python bindings)")
                print("   Run: conda install -c conda-forge kalpy")
                break
            
            # Check for openfst/fstcompile error
            if "fstcompile" in error_output or "openfst" in error_output or "thirdparty" in error_output:
                print(f"\n‚ö†Ô∏è  MFA found but missing OpenFST (fstcompile)")
                print("   Run: conda install -c conda-forge openfst")
                break
            
            # If we get here and no errors, MFA should work
            if "usage" in error_output or "command" in error_output or result.returncode == 0:
                mfa_works = True
                if "version" in test_cmd and result.returncode == 0:
                    version_info = (result.stdout + result.stderr).strip()
                    if version_info:
                        print(f"\n‚úì MFA version: {version_info}")
                break
        except FileNotFoundError:
            # conda command not found - try direct execution
            try:
                result = subprocess.run(
                    [conda_mfa] + test_cmd,
                    capture_output=True,
                    text=True,
                    timeout=5,
                    env=os.environ.copy()
                )
                error_output = (result.stderr + result.stdout).lower()
                if "kalpy" not in error_output and "fstcompile" not in error_output:
                    if "usage" in error_output or "command" in error_output or result.returncode == 0:
                        mfa_works = True
                        break
            except:
                pass
        except Exception as e:
            continue
    
    if mfa_works:
        MFA_AVAILABLE = True
        print("\n‚úì MFA is available and ready to use!")
        # Store conda env info for later use
        if conda_env_path:
            MFA_CONDA_ENV = os.path.basename(conda_env_path)
            MFA_CONDA_ENV_PATH = conda_env_path  # Store full path for PATH manipulation
        else:
            MFA_CONDA_ENV = None
            MFA_CONDA_ENV_PATH = None
    else:
        print("\n‚ö†Ô∏è  MFA found but may not work properly. Check dependencies above.")
        # Still store the path even if test failed, in case dependencies are installed later
        if conda_env_path:
            MFA_CONDA_ENV = os.path.basename(conda_env_path)
            MFA_CONDA_ENV_PATH = conda_env_path
        else:
            MFA_CONDA_ENV = None
            MFA_CONDA_ENV_PATH = None

# If conda MFA not found or not working, check PATH
if not MFA_AVAILABLE:
    mfa_path = shutil.which("mfa")
    if mfa_path:
        # Check if it's the venv one (which won't work)
        if ".venv" in mfa_path or "venv" in mfa_path:
            print(f"‚ö†Ô∏è  Found MFA in venv ({mfa_path}), but it requires kalpy (conda-only)")
            print("   The venv MFA package is installed but cannot run without Kaldi/kalpy.")
            print("   ")
            print("   To use MFA, you need to:")
            print("   1. Activate your conda environment: conda activate aligner")
            print("   2. Make sure conda's bin directory is in PATH")
            print("   3. Restart this notebook")
        else:
            print(f"‚úì Found MFA command at: {mfa_path}")
            MFA_CMD = mfa_path
            try:
                result = subprocess.run(
                    ["mfa", "--version"],
                    capture_output=True,
                    text=True,
                    timeout=5
                )
                if result.returncode == 0:
                    MFA_AVAILABLE = True
                    print(f"‚úì MFA available: {result.stdout.strip()}")
                else:
                    print(f"‚ö†Ô∏è  MFA command found but returned error")
                    print(f"   Error: {result.stderr if result.stderr else 'Unknown error'}")
            except Exception as e:
                print(f"‚ö†Ô∏è  Error running MFA: {e}")

if not MFA_AVAILABLE:
    print("\n" + "="*70)
    print("‚ö†Ô∏è  MFA not available or not working")
    print("="*70)
    print("\nMFA requires conda installation (Kaldi, kalpy, OpenFST are conda-only packages)")
    print("\nüìã Installation Instructions:")
    print("\n1. Create conda environment with MFA:")
    print("   conda create -n aligner -c conda-forge montreal-forced-aligner")
    print("\n2. Activate the environment:")
    print("   conda activate aligner")
    print("\n3. Verify installation:")
    print("   mfa --version")
    print("   # Should show MFA version without errors")
    print("\n4. Download required models:")
    print("   mfa model download dictionary english_us_mfa")
    print("   mfa model download acoustic english_mfa")
    print("\n5. For Jupyter notebook:")
    print("   Option A: Install ipykernel in conda env and use it as kernel:")
    print("     conda activate aligner")
    print("     conda install ipykernel")
    print("     python -m ipykernel install --user --name aligner --display-name 'Python (aligner)'")
    print("     # Then select 'Python (aligner)' kernel in Jupyter")
    print("\n   Option B: Keep using venv kernel, MFA will be called via subprocess")
    print("     (Current setup - should work if conda MFA is found)")
    print("\n" + "="*70)
    print("üìù Note: The notebook will still work without MFA, but timestamp alignment")
    print("   will use dummy/estimated timestamps instead of precise MFA alignments.")
    print("="*70)


Checking for MFA availability...
MFA Setup: Hybrid Conda/Venv Configuration
This notebook runs in a Python venv but uses MFA from a conda environment.
MFA requires Kaldi, kalpy, and OpenFST which are conda-only packages.

‚úì Found conda MFA at: /Users/umitcanevleksiz/miniconda3/envs/aligner/bin/mfa
  Conda environment: /Users/umitcanevleksiz/miniconda3/envs/aligner
  ‚úì Dependencies check passed (OpenFST, Kaldi found)

‚úì MFA is available and ready to use!


## IPA Format Conversion Utilities


## Configuration: Input Audio and Target Text

**Configure your assessment here:**

Set the audio file path and the target text (what the speaker should say).
The notebook will:
- Extract actual pronunciation from the audio
- Generate target pronunciation from the text
- Compare and identify errors with timestamps
- Save results to `results/` directory


In [6]:
def parse_powsm_phones(powsm_ipa: str) -> List[str]:
    """Parse IPA phonemes from POWSM format."""
    cleaned = powsm_ipa.strip().strip('/')
    if not cleaned:
        return []
    phonemes = [p.strip('/') for p in cleaned.split('//') if p.strip('/')]
    return phonemes


def powsm_to_mfa_format(powsm_ipa: str) -> str:
    """Convert POWSM format to MFA space-separated format."""
    phones = parse_powsm_phones(powsm_ipa)
    return ' '.join(phones)


def mfa_to_powsm_format(mfa_ipa: str) -> str:
    """Convert MFA format to POWSM format."""
    if not mfa_ipa:
        return ""
    phones = mfa_ipa.strip().split()
    return '//'.join(['/' + p + '/' for p in phones])


## Initialize POWSM Models


In [7]:
device = "cpu"  # Change to "cuda" if GPU available

print("Loading POWSM models...")

# Language detection model
print("  - Loading language detection model...")
s2lang = Speech2Language.from_pretrained(
    "espnet/powsm",
    device=device,
    nbest=1,
    first_lang_sym="<afr>",
    last_lang_sym="<zul>"
)

# Phone Recognition model
print("  - Loading Phone Recognition (PR) model...")
s2t_pr = Speech2Text.from_pretrained(
    "espnet/powsm",
    device=device,
    lang_sym="<eng>",
    task_sym="<pr>",
)

# ASR model (for G2P)
print("  - Loading ASR model...")
s2t_asr = Speech2Text.from_pretrained(
    "espnet/powsm",
    device=device,
    lang_sym="<eng>",
    task_sym="<asr>",
)

# G2P model
print("  - Loading G2P model...")
s2t_g2p = Speech2Text.from_pretrained(
    "espnet/powsm",
    device=device,
    lang_sym="<eng>",
    task_sym="<g2p>",
)

print("‚úì All POWSM models loaded!")


Loading POWSM models...
  - Loading language detection model...


Fetching 7 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 77878.32it/s]


  - Loading Phone Recognition (PR) model...


Fetching 7 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 64669.89it/s]


  - Loading ASR model...


Fetching 7 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 68598.43it/s]


  - Loading G2P model...


Fetching 7 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 73769.17it/s]


‚úì All POWSM models loaded!


## Demo: Pronunciation Assessment Workflow


In [8]:
# ============================================================================
# CONFIGURATION: Set your audio file and target text here
# ============================================================================

CONFIG_ROOT = Path("./audio/powsm")

CONFIGS = [
    {
        "label": "Sentence 12 (√úmit)",
        "sentence_id": "12",
        "audio_filename": "umit12-r.wav",
        "target_text": "The weather is rather warm this Thursday. I think we should go to the theater together. Thank you for thinking about this thoroughly."
    },
    {
        "label": "Sentence 14 (√úmit)",
        "sentence_id": "14",
        "audio_filename": "umit14-r.wav",
        "target_text": "The red car arrived early in the morning. The driver parked near the restaurant and ordered breakfast. The fresh bread was really delicious."
    },
    {
        "label": "Sentence 12 (Yusuf)",
        "sentence_id": "12",
        "audio_filename": "yusuf12-r.wav",
        "target_text": "The weather is rather warm this Thursday. I think we should go to the theater together. Thank you for thinking about this thoroughly."
    }
]

# Pick which configuration to run (set index to another entry to process a different file)
selected_config_index = 2
selected_config = CONFIGS[selected_config_index]

# Derive runtime values from the selected configuration
audio_file_path = CONFIG_ROOT / selected_config["sentence_id"] / selected_config["audio_filename"]
audio_file = str(audio_file_path)
target_text = selected_config["target_text"]

# ============================================================================
# Validate configuration
# ============================================================================

print("="*70)
print("CONFIGURATION")
print("="*70)
print(f"Selected config: {selected_config.get('label', selected_config['audio_filename'])} (index {selected_config_index})")
print(f"Sentence ID: {selected_config['sentence_id']}")
print(f"Audio file: {audio_file}")

# Check if audio file exists
if not os.path.exists(audio_file):
    print(f"‚ö†Ô∏è  ERROR: Audio file not found: {audio_file}")
    print("   Please update the selected configuration above.")
    raise FileNotFoundError(f"Audio file not found: {audio_file}")

# Get audio basename for output filename
audio_basename = os.path.splitext(os.path.basename(audio_file))[0]

# Load audio to get duration
speech, rate = sf.read(audio_file)
audio_duration = len(speech) / rate

print(f"\n‚úì Audio File: {audio_file}")
print(f"  Duration: {audio_duration:.2f}s")
print(f"  Sample Rate: {rate}Hz")
print(f"\n‚úì Target Text:")
print(f"  {target_text}")
print(f"\n‚úì Output will be saved to: results/pronunciation_assessment_{audio_basename}_<timestamp>.json")
print("="*70)


CONFIGURATION
Selected config: Sentence 12 (Yusuf) (index 2)
Sentence ID: 12
Audio file: audio/powsm/12/yusuf12-r.wav

‚úì Audio File: audio/powsm/12/yusuf12-r.wav
  Duration: 20.00s
  Sample Rate: 16000Hz

‚úì Target Text:
  The weather is rather warm this Thursday. I think we should go to the theater together. Thank you for thinking about this thoroughly.

‚úì Output will be saved to: results/pronunciation_assessment_yusuf12-r_<timestamp>.json


### Step 1: Extract Actual IPA from Audio (POWSM Phone Recognition)


In [9]:
# ============================================================================
# Step 1: Extract Actual IPA from Audio (POWSM Phone Recognition)
# ============================================================================

print("\n" + "="*70)
print("STEP 1: Extract Actual Pronunciation from Audio")
print("="*70)

# Detect language (optional)
print("\nDetecting language...")
try:
    lang_pred = s2lang(speech)[0]
    detected_lang = lang_pred[0] if lang_pred else "<eng>"
    print(f"‚úì Detected language: {detected_lang}")
except:
    detected_lang = "<eng>"
    print(f"‚úì Using default language: {detected_lang}")

# Phone Recognition
print("\nRunning POWSM Phone Recognition...")
result_pr = s2t_pr(speech, text_prev="<na>")
pred_pr = result_pr[0][0]

# Post-processing
if "<notimestamps>" in pred_pr:
    pred_pr = pred_pr.split("<notimestamps>")[1].strip()
else:
    pred_pr = pred_pr.strip()

# Clean version (without slashes)
pred_pr_clean = pred_pr.replace("/", "")

# Parse into list
actual_phones = parse_powsm_phones(pred_pr)

print(f"\n‚úì Phone Recognition Complete")
print(f"  Total phones detected: {len(actual_phones)}")
print(f"\nüìû Actual IPA (POWSM format):")
print(f"   {pred_pr}")
print(f"\nüìû Actual IPA (clean, no separators):")
print(f"   {pred_pr_clean}")
print(f"\nüìû Actual phones (full list, {len(actual_phones)} phones):")
for i, phone in enumerate(actual_phones, 1):
    print(f"   {i:3d}. {phone}")



STEP 1: Extract Actual Pronunciation from Audio

Detecting language...


  with autocast(False):


‚úì Detected language: <eng>

Running POWSM Phone Recognition...

‚úì Phone Recognition Complete
  Total phones detected: 82

üìû Actual IPA (POWSM format):
   /√∞//…ô//w//…õ//√∞//…úÀû//…™//z//…π//…ô//√∞//…úÀû//w//…î//…π//m//√∞//…™//s//Œ∏//…úÀû//d//z//d//e//…™//a//…™//Œ∏//…™ÃÉ//≈ã//k//w//i// É//i//k ∞//…î//z//…™//t//Œ∏//i//…ô//t//…úÀû//t ∞//…ô//…°//…õ//√∞//…úÀû//Œ∏//√¶ÃÉ//≈ã//k//j//u//f//…π//…úÀû//Œ∏//…™ÃÉ//≈ã//k//…™ÃÉ//≈ã//…ô//b//a// ä//t//…™//z//…ô//Œ∏//…î//…π//o// ä//lÃ¥//i/

üìû Actual IPA (clean, no separators):
   √∞…ôw…õ√∞…úÀû…™z…π…ô√∞…úÀûw…î…πm√∞…™sŒ∏…úÀûdzde…™a…™Œ∏…™ÃÉ≈ãkwi Éik ∞…îz…™tŒ∏i…ôt…úÀût ∞…ô…°…õ√∞…úÀûŒ∏√¶ÃÉ≈ãkjuf…π…úÀûŒ∏…™ÃÉ≈ãk…™ÃÉ≈ã…ôba ät…™z…ôŒ∏…î…πo älÃ¥i

üìû Actual phones (full list, 82 phones):
     1. √∞
     2. …ô
     3. w
     4. …õ
     5. √∞
     6. …úÀû
     7. …™
     8. z
     9. …π
    10. …ô
    11. √∞
    12. …úÀû
    13. w
    14. …î
    15. …π
    16. m
    17. √∞
    18. …™
    19. s
    20. Œ∏
    21. …úÀû
    22. d
    23. z
    24. d
    25.

### Step 2: Generate Target IPA from Text (POWSM G2P)


In [10]:
# ============================================================================
# Step 2: Generate Target IPA from Text (POWSM G2P)
# ============================================================================

print("\n" + "="*70)
print("STEP 2: Generate Target Pronunciation from Text")
print("="*70)

# Step 2a: Get ASR transcript (for comparison)
print("\n2a. Getting ASR transcript (for comparison)...")
result_asr = s2t_asr(speech, text_prev="<na>")
pred_asr = result_asr[0][0]

if "<notimestamps>" in pred_asr:
    pred_asr = pred_asr.split("<notimestamps>")[1].strip()
else:
    pred_asr = pred_asr.strip()

print(f"   ‚úì ASR Result: {pred_asr}")

# Step 2b: Audio-guided G2P using ORIGINAL TRANSCRIPT (ground truth)
print("\n2b. Audio-guided G2P using ORIGINAL TRANSCRIPT (ground truth)...")
print(f"   Using transcript: {target_text}")
result_g2p_original = s2t_g2p(speech, text_prev=target_text)
pred_g2p_original = result_g2p_original[0][0]

if "<notimestamps>" in pred_g2p_original:
    pred_g2p_original = pred_g2p_original.split("<notimestamps>")[1].strip()
else:
    pred_g2p_original = pred_g2p_original.strip()

pred_g2p_original_clean = pred_g2p_original.replace("/", "")

# Parse into list (use original transcript version as primary)
target_phones = parse_powsm_phones(pred_g2p_original)

print(f"\n‚úì Target IPA Generated (from original transcript)")
print(f"  Total phones: {len(target_phones)}")
print(f"\nüî§ Target IPA (POWSM format):")
print(f"   {pred_g2p_original}")
print(f"\nüî§ Target IPA (clean, no separators):")
print(f"   {pred_g2p_original_clean}")
print(f"\nüî§ Target phones (full list, {len(target_phones)} phones):")
for i, phone in enumerate(target_phones, 1):
    print(f"   {i:3d}. {phone}")

# Step 2c: Audio-guided G2P using ASR transcript (for comparison)
print("\n" + "-"*70)
print("2c. Audio-guided G2P using ASR TRANSCRIPT (for comparison)...")
print(f"   Using ASR transcript: {pred_asr}")
result_g2p_asr = s2t_g2p(speech, text_prev=pred_asr)
pred_g2p_asr = result_g2p_asr[0][0]

if "<notimestamps>" in pred_g2p_asr:
    pred_g2p_asr = pred_g2p_asr.split("<notimestamps>")[1].strip()
else:
    pred_g2p_asr = pred_g2p_asr.strip()

pred_g2p_asr_clean = pred_g2p_asr.replace("/", "")

# Parse into list for comparison
target_phones_asr = parse_powsm_phones(pred_g2p_asr)

print(f"\n‚úì Target IPA Generated (from ASR transcript)")
print(f"  Total phones: {len(target_phones_asr)}")
print(f"\nüî§ Target IPA from ASR (POWSM format):")
print(f"   {pred_g2p_asr}")
print(f"\nüî§ Target IPA from ASR (clean, no separators):")
print(f"   {pred_g2p_asr_clean}")
print(f"\nüî§ Target phones from ASR (full list, {len(target_phones_asr)} phones):")
for i, phone in enumerate(target_phones_asr, 1):
    print(f"   {i:3d}. {phone}")

print("\n" + "="*70)
print("üìä Summary:")
print("="*70)
print(f"  Primary (Original transcript): {len(target_phones)} phones")
print(f"  Comparison (ASR transcript):  {len(target_phones_asr)} phones")
print(f"  Difference: {abs(len(target_phones) - len(target_phones_asr))} phones")
print("="*70)



STEP 2: Generate Target Pronunciation from Text

2a. Getting ASR transcript (for comparison)...
   ‚úì ASR Result: the weather is rather warm this Thursday  ‚Åá  think we she cause it the theater together think you for thinking about is authority

2b. Audio-guided G2P using ORIGINAL TRANSCRIPT (ground truth)...
   Using transcript: The weather is rather warm this Thursday. I think we should go to the theater together. Thank you for thinking about this thoroughly.

‚úì Target IPA Generated (from original transcript)
  Total phones: 86

üî§ Target IPA (POWSM format):
   /√∞//…ô//w//…õ//√∞//…úÀû//…™//z//…π//…ô//√∞//…úÀû//w//…î//…π//m//√∞//…™//s//Œ∏//…úÀû//z//d//e//…™//a//…™//Œ∏//…™ÃÉ//≈ã//k//w//i// É// ä//d//k//o// ä//t ∞//u//√∞//…ô//Œ∏//i//…ô//t//…úÀû//t ∞//…ô//…°//…õ//√∞//…úÀû//a//…™//Œ∏//√¶ÃÉ//≈ã//k//j//u//f//…π//…úÀû//Œ∏//…™ÃÉ//≈ã//k//…™ÃÉ//≈ã//…ô//b//a// ä//t//√∞//…™//s//Œ∏//…î//…π//o// ä//lÃ¥//i/

üî§ Target IPA (clean, no separators):
   √∞…ôw…õ√∞…úÀû…™z…π…ô√∞…úÀûw…î…πm√∞…™sŒ∏…ú

### Step 3: MFA Alignment (Align Phones to Timestamps)

Aligns phones from the target text to timestamps in the audio using Montreal Forced Aligner (MFA).

**Note:** MFA requires installation and model downloads. See instructions in the setup cell if not available.


In [11]:
# ============================================================================
# Step 3: MFA Alignment (Align Phones to Timestamps)
# ============================================================================

print("\n" + "="*70)
print("STEP 3: Align Phones to Audio Timestamps (MFA)")
print("="*70)

if not MFA_AVAILABLE:
    print("\n‚ö†Ô∏è  MFA not available. Skipping alignment.")
    print("\nTo use MFA alignment:")
    print("  1. Install MFA via conda: conda create -n aligner -c conda-forge montreal-forced-aligner")
    print("  2. Activate conda environment: conda activate aligner")
    print("  3. Download models:")
    print("     mfa model download dictionary english_us_mfa")
    print("     mfa model download acoustic english_mfa")
    print("  4. Restart this notebook with conda environment activated")
    print("\nNote: The notebook will continue with estimated timestamps.")
    mfa_alignments = []
else:
    print("\nRunning MFA alignment...")
    
    # Use corpus-based alignment (more reliable than align_one)
    # Create temporary corpus directory structure
    with tempfile.TemporaryDirectory() as temp_base:
        corpus_dir = os.path.join(temp_base, "corpus")
        output_dir = os.path.join(temp_base, "output")
        os.makedirs(corpus_dir, exist_ok=True)
        os.makedirs(output_dir, exist_ok=True)
        
        # Get audio basename
        audio_basename = os.path.splitext(os.path.basename(audio_file))[0]
        
        # Copy audio file to corpus directory
        audio_in_corpus = os.path.join(corpus_dir, f"{audio_basename}.wav")
        shutil.copy2(audio_file, audio_in_corpus)
        
        # Create .lab file (MFA expects .lab files for text)
        lab_file = os.path.join(corpus_dir, f"{audio_basename}.lab")
        with open(lab_file, 'w', encoding='utf-8') as f:
            f.write(target_text)
        
        try:
            # Use MFA_CMD if available, otherwise use "mfa"
            mfa_command = MFA_CMD if MFA_CMD else "mfa"
            
            print(f"  Aligning: {audio_file} with text: {target_text[:50]}...")
            print(f"  Corpus dir: {corpus_dir}")
            print(f"  Output dir: {output_dir}")
            
            # Prepare environment with conda bin directory in PATH
            env = os.environ.copy()
            if 'MFA_CONDA_ENV_PATH' in globals() and MFA_CONDA_ENV_PATH:
                conda_bin = os.path.join(MFA_CONDA_ENV_PATH, "bin")
                # Prepend conda bin to PATH so fstcompile and other tools are found
                env["PATH"] = conda_bin + os.pathsep + env.get("PATH", "")
                print(f"  Using conda environment PATH: {conda_bin}")
            
            # Try alignment with increasing beam sizes if needed
            beam_sizes = [
                (10, 40),   # Default
                (100, 400), # Large (as suggested by error)
                (200, 800), # Larger
                (400, 1600) # Huge (for longer sequences)
            ]
            
            result = None
            alignment_successful = False
            
            for beam, retry_beam in beam_sizes:
                if alignment_successful:
                    break
                    
                print(f"  Trying alignment with beam={beam}, retry_beam={retry_beam}...")
                
                # Try to use conda run if we have the env name, otherwise use direct command
                if 'MFA_CONDA_ENV' in globals() and MFA_CONDA_ENV:
                    try:
                        # Use conda run to ensure proper environment
                        cmd = ["conda", "run", "-n", MFA_CONDA_ENV, "mfa", "align", 
                               corpus_dir, "english_us_mfa", "english_mfa", output_dir, 
                               "--clean", "--beam", str(beam), "--retry_beam", str(retry_beam)]
                        result = subprocess.run(
                            cmd,
                            capture_output=True,
                            text=True,
                            timeout=300,
                            env=env  # Use modified environment with conda bin in PATH
                        )
                    except FileNotFoundError:
                        # conda command not in PATH, fall back to direct MFA command
                        cmd = [mfa_command, "align", corpus_dir, "english_us_mfa", "english_mfa", output_dir, 
                               "--clean", "--beam", str(beam), "--retry_beam", str(retry_beam)]
                        result = subprocess.run(
                            cmd,
                            capture_output=True,
                            text=True,
                            timeout=300,
                            env=env  # Use modified environment with conda bin in PATH
                        )
                else:
                    # Direct MFA command with modified PATH
                    cmd = [mfa_command, "align", corpus_dir, "english_us_mfa", "english_mfa", output_dir, 
                           "--clean", "--beam", str(beam), "--retry_beam", str(retry_beam)]
                    result = subprocess.run(
                        cmd,
                        capture_output=True,
                        text=True,
                        timeout=300,
                        env=env  # Use modified environment with conda bin in PATH
                    )
                
                # Check if alignment was successful
                if result.returncode == 0:
                    alignment_successful = True
                    print(f"  ‚úì Alignment successful with beam={beam}, retry_beam={retry_beam}")
                    break
                elif "noalignmentserror" in result.stderr.lower() or "no successful alignments" in result.stderr.lower():
                    # Try next beam size
                    print(f"  ‚ö†Ô∏è  No alignments with beam={beam}, retry_beam={retry_beam}, trying larger beam...")
                    continue
                else:
                    # Different error, break and show it
                    break
            
            if result.returncode == 0:
                # Parse TextGrid output
                textgrid_path = os.path.join(output_dir, f"{audio_basename}.TextGrid")
                
                if os.path.exists(textgrid_path):
                    # Parse TextGrid
                    try:
                        from textgrid import TextGrid
                        tg = TextGrid.fromFile(textgrid_path)
                        
                        mfa_alignments = []
                        # Find phone tier
                        for tier in tg.tiers:
                            if tier.name.lower() in ['phones', 'phone']:
                                for interval in tier:
                                    if interval.mark.strip():
                                        mfa_alignments.append({
                                            "phoneme": interval.mark.strip(),
                                            "start": interval.minTime,
                                            "end": interval.maxTime
                                        })
                                break
                        
                        print(f"\n‚úì MFA alignment complete: {len(mfa_alignments)} phones aligned")
                        print(f"\nüìä All alignments ({len(mfa_alignments)} phones):")
                        for i, align in enumerate(mfa_alignments, 1):
                            print(f"   {i:3d}. {align['phoneme']:5s} [{align['start']:6.3f}s - {align['end']:6.3f}s]")
                    except ImportError:
                        print("‚ö†Ô∏è  textgrid library not available. Install with: pip install textgrid")
                        print("   Trying manual TextGrid parsing...")
                        # Try manual parsing as fallback
                        mfa_alignments = parse_textgrid_manual(textgrid_path)
                        if mfa_alignments:
                            print(f"‚úì Parsed {len(mfa_alignments)} alignments manually")
                        else:
                            mfa_alignments = []
                else:
                    # Check for any TextGrid files
                    textgrid_files = [f for f in os.listdir(output_dir) if f.endswith('.TextGrid')]
                    if textgrid_files:
                        textgrid_path = os.path.join(output_dir, textgrid_files[0])
                        try:
                            from textgrid import TextGrid
                            tg = TextGrid.fromFile(textgrid_path)
                            mfa_alignments = []
                            for tier in tg.tiers:
                                if tier.name.lower() in ['phones', 'phone']:
                                    for interval in tier:
                                        if interval.mark.strip():
                                            mfa_alignments.append({
                                                "phoneme": interval.mark.strip(),
                                                "start": interval.minTime,
                                                "end": interval.maxTime
                                            })
                                    break
                            print(f"\n‚úì MFA alignment complete: {len(mfa_alignments)} phones aligned")
                        except:
                            mfa_alignments = []
                    else:
                        print(f"‚ö†Ô∏è  TextGrid file not found in {output_dir}")
                        print(f"   MFA output: {result.stdout[:200] if result.stdout else 'No output'}")
                        mfa_alignments = []
            else:
                print(f"‚ö†Ô∏è  MFA alignment failed (return code: {result.returncode})")
                print("\n" + "="*70)
                print("FULL ERROR OUTPUT:")
                print("="*70)
                if result.stderr:
                    error_msg = result.stderr.strip()
                    print("STDERR:")
                    print(error_msg)  # Show full error, not truncated
                    print("\n" + "="*70)
                    # Check for common issues
                    if "fstcompile" in error_msg.lower() or "openfst" in error_msg.lower() or "thirdparty" in error_msg.lower():
                        print("\n" + "="*70)
                        print("‚ùå MISSING DEPENDENCY: OpenFST (fstcompile)")
                        print("="*70)
                        print("\nMFA requires OpenFST to be installed in the conda environment.")
                        print("\nTo fix this, run the following commands in your terminal:")
                        print("\n  1. Activate your conda environment:")
                        print("     conda activate aligner")
                        print("\n  2. Install OpenFST:")
                        print("     conda install -c conda-forge openfst")
                        print("\n  3. Verify installation:")
                        print("     which fstcompile")
                        print("     # Should show: <conda_env_path>/bin/fstcompile")
                        print("\n  4. Restart this notebook after installation.")
                        print("\nAlternative: Reinstall MFA with all dependencies:")
                        print("  conda install -c conda-forge montreal-forced-aligner --force-reinstall")
                        print("="*70)
                    elif "not found" in error_msg.lower() or "does not exist" in error_msg.lower():
                        print("\n   üí° Tip: Make sure the audio file exists and is accessible")
                    elif "noalignmentserror" in error_msg.lower() or "no successful alignments" in error_msg.lower():
                        print("\n" + "="*70)
                        print("‚ùå ALIGNMENT FAILED: No successful alignments found")
                        print("="*70)
                        print("\nThis usually means:")
                        print("  1. The text doesn't match what's in the audio")
                        print("  2. The audio quality is too poor")
                        print("  3. The text contains words not in the dictionary")
                        print("  4. The beam size was too small (already tried larger beams)")
                        print("\nüí° Suggestions:")
                        print("  1. Check if the transcript matches the audio content")
                        print("  2. Verify audio quality (clear speech, not too noisy)")
                        print("  3. Try validating the corpus:")
                        print(f"     mfa validate {corpus_dir} english_us_mfa")
                        print("  4. Check if all words in the text are in the dictionary")
                        print("  5. Try with a shorter or simpler text")
                        print("="*70)
                    elif "dictionary" in error_msg.lower() or "model" in error_msg.lower():
                        print("\n   üí° Tip: Make sure MFA models are downloaded:")
                        print("      mfa model download dictionary english_us_mfa")
                        print("      mfa model download acoustic english_mfa")
                    elif "usage" in error_msg.lower() or "error" in error_msg.lower():
                        print("\n   üí° Tip: The MFA command syntax might be incorrect.")
                        print("      Try running manually to see the correct syntax:")
                        print(f"      {mfa_command} align --help")
                if result.stdout:
                    stdout_msg = result.stdout.strip()
                    print("STDOUT:")
                    print(stdout_msg)  # Show full output
                    print("="*70)
                mfa_alignments = []
        except Exception as e:
            print(f"‚ö†Ô∏è  MFA alignment error: {e}")
            import traceback
            traceback.print_exc()
            mfa_alignments = []


# Helper function for manual TextGrid parsing (fallback)
def parse_textgrid_manual(textgrid_path: str) -> List[Dict]:
    """Manually parse TextGrid file if textgrid library not available."""
    alignments = []
    try:
        with open(textgrid_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        
        # Simple TextGrid parser - look for intervals with xmin, xmax, and text
        i = 0
        in_phone_tier = False
        while i < len(lines):
            line = lines[i].strip()
            if 'item [2]' in line or 'name = "phones"' in line.lower() or 'name = "phone"' in line.lower():
                in_phone_tier = True
                i += 1
                continue
            
            if in_phone_tier and 'intervals [' in line:
                # Found an interval
                i += 1
                xmin = None
                xmax = None
                text = ""
                
                while i < len(lines) and ('intervals [' not in lines[i] or i == len(lines) - 1):
                    if 'xmin =' in lines[i]:
                        xmin = float(lines[i].split('=')[1].strip())
                    elif 'xmax =' in lines[i]:
                        xmax = float(lines[i].split('=')[1].strip())
                    elif 'text =' in lines[i]:
                        text = lines[i].split('=')[1].strip().strip('"')
                    i += 1
                    if i >= len(lines):
                        break
                
                if xmin is not None and xmax is not None and text.strip():
                    alignments.append({
                        "phoneme": text.strip(),
                        "start": xmin,
                        "end": xmax
                    })
                continue
            i += 1
        
        return alignments
    except Exception as e:
        print(f"   Manual parsing failed: {e}")
        return []



STEP 3: Align Phones to Audio Timestamps (MFA)

Running MFA alignment...
  Aligning: audio/powsm/12/yusuf12-r.wav with text: The weather is rather warm this Thursday. I think ...
  Corpus dir: /var/folders/qk/0bk8sm9136q_16hvc9hdhr6m0000gn/T/tmpdqzkghrd/corpus
  Output dir: /var/folders/qk/0bk8sm9136q_16hvc9hdhr6m0000gn/T/tmpdqzkghrd/output
  Using conda environment PATH: /Users/umitcanevleksiz/miniconda3/envs/aligner/bin
  Trying alignment with beam=10, retry_beam=40...
  ‚ö†Ô∏è  No alignments with beam=10, retry_beam=40, trying larger beam...
  Trying alignment with beam=100, retry_beam=400...
  ‚úì Alignment successful with beam=100, retry_beam=400

‚úì MFA alignment complete: 68 phones aligned

üìä All alignments (68 phones):
     1. …ô     [ 0.000s -  0.010s]
     2. w     [ 0.010s -  0.040s]
     3. …õ     [ 0.040s -  0.050s]
     4. √∞     [ 0.050s -  0.080s]
     5. …ö     [ 0.080s -  0.110s]
     6. z     [ 0.110s -  0.140s]
     7. …π     [ 0.140s -  0.170s]
     8. √¶     

### Step 4: Compare Actual vs Target IPA (Error Detection)


In [12]:
# ============================================================================
# Step 4: Compare Actual vs Target IPA (Error Detection)
# ============================================================================

print("\n" + "="*70)
print("STEP 4: Compare Actual vs Target Pronunciation")
print("="*70)

# Import edit distance function
sys.path.insert(0, str(mod_path / "assessment"))
from edit_distance import edit_operations

print(f"\nComparing pronunciation...")
print(f"  Actual phones:  {len(actual_phones)} phones")
print(f"  Target phones:  {len(target_phones)} phones")

# Run edit distance
operations = edit_operations(actual_phones, target_phones)

# Count operation types
substitutes = [op for op in operations if op[0] == "substitute"]
inserts = [op for op in operations if op[0] == "insert"]
deletes = [op for op in operations if op[0] == "delete"]

print(f"\n‚úì Error Detection Complete")
print(f"\nüìä Edit Operations Summary:")
print(f"   Total errors:     {len(operations)}")
print(f"   Substitutions:    {len(substitutes)}")
print(f"   Insertions:       {len(inserts)}")
print(f"   Deletions:        {len(deletes)}")

# Show ALL errors (not truncated)
if len(operations) > 0:
    print(f"\nüìã All Errors Detected ({len(operations)} total):")
    for i, op in enumerate(operations, 1):
        op_type = op[0]
        pos = op[1]
        
        if op_type == "substitute":
            expected = op[2] if len(op) > 2 else "?"
            actual = actual_phones[pos] if pos < len(actual_phones) else "?"
            print(f"   {i:3d}. {op_type:12s} @ pos {pos:3d}: expected '{expected}' but got '{actual}'")
        elif op_type == "insert":
            expected = op[2] if len(op) > 2 else "?"
            print(f"   {i:3d}. {op_type:12s} @ pos {pos:3d}: missing '{expected}'")
        elif op_type == "delete":
            actual = actual_phones[pos] if pos < len(actual_phones) else "?"
            print(f"   {i:3d}. {op_type:12s} @ pos {pos:3d}: extra '{actual}'")
else:
    print(f"\n‚úì No errors detected! Perfect pronunciation!")

# Calculate score
total_phonemes = len(target_phones)
if total_phonemes == 0:
    score = 1.0 if len(actual_phones) == 0 else 0.0
else:
    error_cost = sum(
        1 if op[0] == "delete" else
        1 if op[0] == "insert" else
        2 if op[0] == "substitute" else 0
        for op in operations
    )
    max_cost = total_phonemes * 2
    score = max(0.0, 1.0 - (error_cost / max_cost))

print(f"\nüìà Pronunciation Score: {score:.2%}")
print("="*70)



STEP 4: Compare Actual vs Target Pronunciation

Comparing pronunciation...
  Actual phones:  82 phones
  Target phones:  86 phones

‚úì Error Detection Complete

üìä Edit Operations Summary:
   Total errors:     15
   Substitutions:    7
   Insertions:       6
   Deletions:        2

üìã All Errors Detected (15 total):
     1. delete       @ pos  21: extra 'd'
     2. insert       @ pos  35: missing ' ä'
     3. insert       @ pos  35: missing 'd'
     4. insert       @ pos  35: missing 'k'
     5. substitute   @ pos  35: expected 'o' but got 'i'
     6. substitute   @ pos  36: expected ' ä' but got 'k ∞'
     7. substitute   @ pos  37: expected 't ∞' but got '…î'
     8. substitute   @ pos  38: expected 'u' but got 'z'
     9. substitute   @ pos  39: expected '√∞' but got '…™'
    10. substitute   @ pos  40: expected '…ô' but got 't'
    11. insert       @ pos  52: missing 'a'
    12. insert       @ pos  52: missing '…™'
    13. insert       @ pos  72: missing '√∞'
    14. delete  

### Step 5: Map Errors to Timestamps


In [13]:
def map_errors_to_timestamps(
    operations: List,
    actual_phones: List[str],
    target_phones: List[str],
    mfa_alignments: List[Dict],
    sample_rate: int
) -> List[Dict]:
    """
    Map edit operations to timestamps from MFA alignment.
    
    MFA alignments provide timestamps for phones aligned from target text.
    We map errors based on their position in the target sequence.
    """
    errors_with_timestamps = []
    
    if not mfa_alignments:
        # Fallback: evenly distribute timestamps
        total_duration = len(actual_phones) * 0.1  # Assume ~100ms per phone
        for op in operations:
            op_type = op[0]
            pos = op[1]
            
            error_dict = {"type": op_type, "position": pos}
            
            if op_type == "substitute":
                error_dict["expected"] = op[2] if len(op) > 2 else None
                error_dict["actual"] = actual_phones[pos] if pos < len(actual_phones) else None
            elif op_type == "insert":
                error_dict["expected"] = op[2] if len(op) > 2 else None
            elif op_type == "delete":
                error_dict["actual"] = actual_phones[pos] if pos < len(actual_phones) else None
            
            # Dummy timestamps
            start_time = pos * 0.1
            end_time = (pos + 1) * 0.1
            error_dict["timestamp_seconds"] = {"start": start_time, "end": end_time}
            error_dict["timestamp_samples"] = {
                "start": int(start_time * sample_rate),
                "end": int(end_time * sample_rate)
            }
            
            errors_with_timestamps.append(error_dict)
        return errors_with_timestamps
    
    # Map operations to MFA timestamps
    for op in operations:
        op_type = op[0]
        pos = op[1]
        
        error_dict = {"type": op_type, "position": pos}
        
        if op_type == "substitute":
            error_dict["expected"] = op[2] if len(op) > 2 else None
            error_dict["actual"] = actual_phones[pos] if pos < len(actual_phones) else None
            
            # Use timestamp from target position
            if pos < len(mfa_alignments):
                align = mfa_alignments[pos]
                error_dict["timestamp_seconds"] = {"start": align["start"], "end": align["end"]}
                error_dict["timestamp_samples"] = {
                    "start": int(align["start"] * sample_rate),
                    "end": int(align["end"] * sample_rate)
                }
            else:
                # Fallback for out-of-bounds
                error_dict["timestamp_seconds"] = {"start": 0.0, "end": 0.1}
                error_dict["timestamp_samples"] = {"start": 0, "end": int(0.1 * sample_rate)}
                
        elif op_type == "insert":
            error_dict["expected"] = op[2] if len(op) > 2 else None
            
            # Insertion: use timestamp between adjacent phones
            if pos > 0 and pos <= len(mfa_alignments):
                if pos < len(mfa_alignments):
                    prev_end = mfa_alignments[pos-1]["end"]
                    next_start = mfa_alignments[pos]["start"]
                    mid_time = (prev_end + next_start) / 2
                    error_dict["timestamp_seconds"] = {"start": mid_time - 0.05, "end": mid_time + 0.05}
                else:
                    last_end = mfa_alignments[-1]["end"]
                    error_dict["timestamp_seconds"] = {"start": last_end, "end": last_end + 0.1}
            else:
                error_dict["timestamp_seconds"] = {"start": 0.0, "end": 0.1}
            
            error_dict["timestamp_samples"] = {
                "start": int(error_dict["timestamp_seconds"]["start"] * sample_rate),
                "end": int(error_dict["timestamp_seconds"]["end"] * sample_rate)
            }
            
        elif op_type == "delete":
            error_dict["actual"] = actual_phones[pos] if pos < len(actual_phones) else None
            
            # Deletion: use timestamp from actual phone position
            if pos < len(mfa_alignments):
                align = mfa_alignments[pos]
                error_dict["timestamp_seconds"] = {"start": align["start"], "end": align["end"]}
                error_dict["timestamp_samples"] = {
                    "start": int(align["start"] * sample_rate),
                    "end": int(align["end"] * sample_rate)
                }
            else:
                error_dict["timestamp_seconds"] = {"start": 0.0, "end": 0.1}
                error_dict["timestamp_samples"] = {"start": 0, "end": int(0.1 * sample_rate)}
        
        errors_with_timestamps.append(error_dict)
    
    return errors_with_timestamps


# ============================================================================
# Step 5: Map Errors to Timestamps
# ============================================================================

print("\n" + "="*70)
print("STEP 5: Map Errors to Audio Timestamps")
print("="*70)

# Map errors to timestamps
print("\nMapping errors to timestamps...")
errors_with_timestamps = map_errors_to_timestamps(
    operations,
    actual_phones,
    target_phones,
    mfa_alignments if 'mfa_alignments' in locals() else [],
    rate
)

print(f"\n‚úì Mapped {len(errors_with_timestamps)} errors to timestamps")

# Show ALL errors with timestamps (not truncated)
if len(errors_with_timestamps) > 0:
    print(f"\nüìã All Errors with Timestamps ({len(errors_with_timestamps)} total):")
    for i, error in enumerate(errors_with_timestamps, 1):
        op_type = error["type"]
        ts = error["timestamp_seconds"]
        
        if op_type == "substitute":
            print(f"   {i:3d}. {op_type:12s} @ [{ts['start']:6.3f}s - {ts['end']:6.3f}s]: "
                  f"expected '{error.get('expected', '?')}' but got '{error.get('actual', '?')}'")
        elif op_type == "insert":
            print(f"   {i:3d}. {op_type:12s} @ [{ts['start']:6.3f}s - {ts['end']:6.3f}s]: "
                  f"missing '{error.get('expected', '?')}'")
        elif op_type == "delete":
            print(f"   {i:3d}. {op_type:12s} @ [{ts['start']:6.3f}s - {ts['end']:6.3f}s]: "
                  f"extra '{error.get('actual', '?')}'")
else:
    print("\n‚úì No errors to map (perfect pronunciation!)")

print("="*70)



STEP 5: Map Errors to Audio Timestamps

Mapping errors to timestamps...

‚úì Mapped 15 errors to timestamps

üìã All Errors with Timestamps (15 total):
     1. delete       @ [ 0.610s -  0.640s]: extra 'd'
     2. insert       @ [ 7.980s -  8.080s]: missing ' ä'
     3. insert       @ [ 7.980s -  8.080s]: missing 'd'
     4. insert       @ [ 7.980s -  8.080s]: missing 'k'
     5. substitute   @ [ 8.030s -  8.060s]: expected 'o' but got 'i'
     6. substitute   @ [ 8.060s -  8.090s]: expected ' ä' but got 'k ∞'
     7. substitute   @ [ 8.090s -  8.100s]: expected 't ∞' but got '…î'
     8. substitute   @ [ 8.100s -  8.130s]: expected 'u' but got 'z'
     9. substitute   @ [ 8.130s -  8.160s]: expected '√∞' but got '…™'
    10. substitute   @ [ 8.160s -  8.180s]: expected '…ô' but got 't'
    11. insert       @ [ 8.440s -  8.540s]: missing 'a'
    12. insert       @ [ 8.440s -  8.540s]: missing '…™'
    13. insert       @ [ 0.000s -  0.100s]: missing '√∞'
    14. delete       @ [ 0.000

In [14]:
# ============================================================================
# Final Results: Compile and Save Assessment
# ============================================================================

print("\n" + "="*70)
print("FINAL RESULTS: Compiling Assessment")
print("="*70)

# Generate timestamp for output filename
from datetime import datetime
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Compile final result (using original transcript G2P as primary)
assessment_result = {
    "metadata": {
        "config_label": selected_config.get("label"),
        "selected_config_index": selected_config_index,
        "sentence_id": selected_config["sentence_id"],
        "audio_file": os.path.basename(audio_file),
        "audio_path": audio_file,
        "audio_basename": audio_basename,
        "audio_duration_seconds": audio_duration,
        "sample_rate": rate,
        "timestamp": timestamp,
    },
    "target_text": target_text,
    "actual_text": pred_asr if 'pred_asr' in locals() else None,
    "actual_ipa_powsm": pred_pr,
    "target_ipa_powsm": {
        "from_original_transcript": {
            "powsm_format": pred_g2p_original if 'pred_g2p_original' in locals() else None,
            "clean": pred_g2p_original_clean if 'pred_g2p_original_clean' in locals() else None,
            "phones": target_phones
        },
        "from_asr_transcript": {
            "powsm_format": pred_g2p_asr if 'pred_g2p_asr' in locals() else None,
            "clean": pred_g2p_asr_clean if 'pred_g2p_asr_clean' in locals() else None,
            "phones": target_phones_asr if 'target_phones_asr' in locals() else None
        }
    },
    "actual_phones": actual_phones,
    "target_phones": target_phones,  # Primary: from original transcript
    "target_phones_asr": target_phones_asr if 'target_phones_asr' in locals() else None,  # Comparison: from ASR
    "score": score,
    "errors": errors_with_timestamps,
    "mfa_alignments": mfa_alignments if 'mfa_alignments' in locals() else [],
    "statistics": {
        "total_errors": len(operations),
        "substitutions": len(substitutes),
        "insertions": len(inserts),
        "deletions": len(deletes),
        "actual_phone_count": len(actual_phones),
        "target_phone_count": len(target_phones),
        "target_phone_count_asr": len(target_phones_asr) if 'target_phones_asr' in locals() else None
    }
}

# Create output directory if it doesn't exist
output_dir = Path("results")
output_dir.mkdir(exist_ok=True)

# Generate output filename with audio basename and timestamp
output_filename = f"pronunciation_assessment_{audio_basename}_{timestamp}.json"
output_file = output_dir / output_filename

# Add output file path to metadata
assessment_result["metadata"]["output_file"] = str(output_file)

# Save to JSON
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(assessment_result, f, indent=2, ensure_ascii=False)

# Print summary
print("\n" + "="*70)
print("PRONUNCIATION ASSESSMENT SUMMARY")
print("="*70)
print(f"\nSelected configuration: {selected_config.get('label', selected_config['audio_filename'])} (index {selected_config_index})")
print(f"Sentence ID: {selected_config['sentence_id']}")
print(f"\nüìù Input:")
print(f"   Target Text: {target_text}")
print(f"   Actual Text (ASR): {pred_asr if 'pred_asr' in locals() else 'N/A'}")
print(f"\nüìä Results:")
print(f"   Pronunciation Score: {score:.2%}")
print(f"\nüìà Statistics:")
print(f"   Total Errors: {len(operations)}")
print(f"     - Substitutions: {len(substitutes)}")
print(f"     - Insertions:    {len(inserts)}")
print(f"     - Deletions:     {len(deletes)}")
print(f"   Actual Phones: {len(actual_phones)}")
print(f"   Target Phones (Original): {len(target_phones)}")
if 'target_phones_asr' in locals():
    print(f"   Target Phones (ASR):    {len(target_phones_asr)}")
    print(f"   Difference:            {abs(len(target_phones) - len(target_phones_asr))}")
print(f"   MFA Alignments: {len(mfa_alignments) if 'mfa_alignments' in locals() else 0}")
print(f"\nüíæ Output:")
print(f"   ‚úì Results saved to: {output_file}")
print(f"   ‚úì Audio file: {os.path.basename(audio_file)}")
print(f"   ‚úì Timestamp: {timestamp}")
print(f"\nüìù Note: Assessment uses target phones from ORIGINAL TRANSCRIPT (ground truth)")
if 'target_phones_asr' in locals():
    print(f"   Both versions (original and ASR-based) are saved in the JSON output.")
print(f"\nConfig index: {selected_config_index}")
print("="*70)



FINAL RESULTS: Compiling Assessment

PRONUNCIATION ASSESSMENT SUMMARY

Selected configuration: Sentence 12 (Yusuf) (index 2)
Sentence ID: 12

üìù Input:
   Target Text: The weather is rather warm this Thursday. I think we should go to the theater together. Thank you for thinking about this thoroughly.
   Actual Text (ASR): the weather is rather warm this Thursday  ‚Åá  think we she cause it the theater together think you for thinking about is authority

üìä Results:
   Pronunciation Score: 87.21%

üìà Statistics:
   Total Errors: 15
     - Substitutions: 7
     - Insertions:    6
     - Deletions:     2
   Actual Phones: 82
   Target Phones (Original): 86
   Target Phones (ASR):    80
   Difference:            6
   MFA Alignments: 68

üíæ Output:
   ‚úì Results saved to: results/pronunciation_assessment_yusuf12-r_20251209_224011.json
   ‚úì Audio file: yusuf12-r.wav
   ‚úì Timestamp: 20251209_224011

üìù Note: Assessment uses target phones from ORIGINAL TRANSCRIPT (ground truth)
