In [26]:
import os
import datetime
import numpy as np
import pandas as pd
import wave
import sounddevice as sd
from typing import Optional, Dict, List, Tuple
from datetime import datetime
from allosaurus.app import read_recognizer
import pronouncing

In [27]:
# IPA to CMU mapping table
IPA_TO_CMU = {
    "h": "HH",
    "ɛ": "EH",
    "l": "L",
    "o": "OW",
    "w": "W",
    "ɹ̩": "ER",
    "t": "T",
    "ɪ": "IH",
    "k": "K",
    "æ": "AE",
    "ɑ": "AA",
    "u": "UW",
    "n": "N",
    "d": "D",
    "s": "S",
    "z": "Z",
    "v": "V",
    "f": "F",
    "ʃ": "SH",
    "ʒ": "ZH",
    "ʌ": "AH",
    "i": "IY",
    "ɡ": "G",
    "b": "B",
    "p": "P",
    "m": "M",
    "ŋ": "NG",
    "eɪ": "EY",
    "aɪ": "AY",
    "aʊ": "AW",
    "ɔɪ": "OY",
    # Add additional phonemes as needed
}

# Similar phonemes for tolerance
SIMILAR_PHONEMES = {
    "ɛ": ["EH", "AE"],  # IPA -> Similar ARPAbet
    "ɪ": ["IH", "IY"],
    "ɹ̩": ["ER"],
    "ʌ": ["AH", "UH"],
    "o": ["OW", "AO"],
    "æ": ["AE", "EH"],
    "ɑ": ["AA", "AH"],
    "i": ["IY", "IH"],
    "u": ["UW", "UH"],
    "ŋ": ["NG", "N"],
    "ʃ": ["SH", "S"],
    "ʒ": ["ZH", "Z"],
    "ð": ["DH"],  # Add IPA to CMU equivalences
    "ɹ": ["R", "ER"],
    "æ": ["AE"],
    "ə": ["AH", "UH"],
    "d͡ʒ": ["JH"],
    "ɑ": ["AA", "AH"],
    "a": ["AW", "AA"],  # Broad categories
    "ʌ": ["AH", "UH"],
    "z": ["Z", "ZH"],
    "e": ["EY"],
    "j": ["Y"],
    # Add additional similar pairs
}

In [28]:
def normalize_ipa(phoneme):
    replacements = {
        "ð": "DH",
        "ɹ": "R",
        "ə": "AH",
        "d͡ʒ": "JH",
        "a": "AW",
        # Add other replacements
    }
    return replacements.get(phoneme, phoneme)  # Default to the same phoneme

In [29]:

# Audio recording function
def record_audio(duration: int = 5, filename: Optional[str] = None, samplerate: int = 16000) -> str:
    """Record audio with error handling and save as a WAV file."""
    if not filename:
        filename = f"recording_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav"

    filepath = os.path.join("recordings/", filename)
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    try:
        print(f"🎙️ Recording for {duration} seconds...")
        audio = sd.rec(int(samplerate * duration), samplerate=samplerate, channels=1, dtype=np.int16, blocking=True)
        sd.wait()

        # Volume checks
        max_amplitude = np.max(np.abs(audio))
        if max_amplitude > 32000:
            print("⚠️ Warning: Audio may be clipping. Speak more quietly.")
        elif max_amplitude < 1000:
            print("⚠️ Warning: Audio volume is very low. Speak louder.")

        # Save as WAV
        with wave.open(filepath, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(samplerate)
            wf.writeframes(audio.tobytes())
        print(f"Audio saved to {filepath}")
        return filepath

    except Exception as e:
        print(f"❌ Error recording audio: {e}")
        raise

In [30]:
# Process audio and extract phonemes
def process_audio(filepath: str) -> List[Tuple[float, float, str]]:
    """Process the recorded audio file and return phoneme data."""
    try:
        model = read_recognizer("eng2102")
        result = model.recognize(filepath, timestamp=True)
        
        # Debugging: Print the raw result
        print("Raw result:", result)

        # Parse the string into structured data
        phoneme_data = []
        for line in result.strip().split("\n"):
            parts = line.split()
            if len(parts) == 3:
                start_time = float(parts[0])
                duration = float(parts[1])
                phoneme = parts[2]
                phoneme_data.append((start_time, duration, phoneme))
        
        return phoneme_data

    except Exception as e:
        print(f"❌ Error processing audio: {e}")
        raise

In [31]:

# Text-to-phoneme conversion function
def text_to_phonemes(sentence: str) -> Dict[str, str]:
    """Convert a sentence into phonemes word by word."""
    words = sentence.split()
    phonemes_dict = {}

    for word in words:
        phonemes = pronouncing.phones_for_word(word.lower())
        if phonemes:
            phonemes_dict[word] = phonemes[0]
        else:
            phonemes_dict[word] = "N/A"  # If no phonemes found, mark as N/A

    return phonemes_dict

In [32]:
# Remove stress markers
def remove_stress_markers(cmu_phoneme: str) -> str:
    """Remove stress markers from CMU phonemes."""
    return ''.join([char for char in cmu_phoneme if not char.isdigit()])

In [33]:
# Map phonemes
def dynamic_map_phonemes(phoneme: str) -> str:
    """Map Allosaurus IPA phonemes to CMU phonemes."""
    return IPA_TO_CMU.get(phoneme, phoneme)  # Default to itself if not found

In [34]:
# Check for phoneme match
def is_phoneme_match(ipa_phoneme: str, cmu_phoneme: str) -> bool:
    """Check if two phonemes are a match or similar."""
    cmu_phoneme = remove_stress_markers(cmu_phoneme)
    mapped_phoneme = dynamic_map_phonemes(ipa_phoneme)
    return cmu_phoneme == mapped_phoneme or cmu_phoneme in SIMILAR_PHONEMES.get(ipa_phoneme, [])

In [35]:
# Compare phonemes
def compare_phonemes(
    text_phonemes: Dict[str, str],
    audio_phonemes: List[Tuple[float, float, str]]
) -> List[Dict[str, str]]:
    """Compare text phonemes with audio phonemes."""
    comparison_results = []
    audio_index = 0

    for word, text_phoneme in text_phonemes.items():
        text_phoneme_list = [remove_stress_markers(p) for p in text_phoneme.split()]
        mapped_audio_phonemes = [
            dynamic_map_phonemes(p[2]) for p in audio_phonemes[audio_index:audio_index + len(text_phoneme_list)]
        ]

        word_result = {
            "word": word,
            "text_phonemes": " ".join(text_phoneme_list),
            "audio_phonemes": " ".join(mapped_audio_phonemes),
            "accuracy": 0.0,
            "status": "",
            "phoneme_comparison": [],
        }

        correct_phonemes = 0
        for i, text_ph in enumerate(text_phoneme_list):
            if audio_index + i < len(audio_phonemes):
                audio_ph = mapped_audio_phonemes[i]
                is_match = is_phoneme_match(audio_phonemes[audio_index + i][2], text_ph)
                word_result["phoneme_comparison"].append({
                    "text_phoneme": text_ph,
                    "audio_phoneme": audio_ph,
                    "match": is_match,
                    "timestamp": audio_phonemes[audio_index + i][0],
                })
                if is_match:
                    correct_phonemes += 1

        word_result["accuracy"] = round((correct_phonemes / len(text_phoneme_list)) * 100, 2) if text_phoneme_list else 0.0
        word_result["status"] = "✅ Excellent" if word_result["accuracy"] == 100 else "⚠️ Needs Improvement" if word_result["accuracy"] >= 50 else "❌ Poor"
        comparison_results.append(word_result)
        audio_index += len(text_phoneme_list)

    return comparison_results

In [36]:
# Main workflow
if __name__ == "__main__":
    # Step 1: Record audio
    audio_file = record_audio(duration=5, filename="sample.wav")

    # Step 2: Process audio to extract phonemes
    audio_phonemes = process_audio(audio_file)

    # Step 3: Convert text to phonemes
    sentence = "The quick brown fox jumps over the lazy dog"
    text_phonemes = text_to_phonemes(sentence)

    # Step 4: Compare phonemes
    comparison_results = compare_phonemes(text_phonemes, audio_phonemes)

    # Step 5: Display results
    for result in comparison_results:
        print(f"Word: {result['word']}")
        print(f"  Text Phonemes: {result['text_phonemes']}")
        print(f"  Audio Phonemes: {result['audio_phonemes']}")
        print(f"  Accuracy: {result['accuracy']}%")
        print(f"  Status: {result['status']}")
        print("  Phoneme Comparison:")
        for pc in result["phoneme_comparison"]:
            match_status = "✔️" if pc["match"] else "❌"
            print(f"    {pc['text_phoneme']} -> {pc['audio_phoneme']} {match_status} (Timestamp: {pc['timestamp']})")
        print()

🎙️ Recording for 5 seconds...
Audio saved to recordings/sample.wav
Raw result: 0.990 0.025 æ
1.040 0.025 n
1.090 0.025 ð
1.130 0.025 ə
1.870 0.025 k
1.950 0.025 l
1.990 0.025 ɪ
2.110 0.025 k
2.290 0.025 d
2.340 0.025 a
2.350 0.025 w
2.520 0.025 n
2.620 0.025 f
2.680 0.025 ɑ
2.890 0.025 k
2.980 0.025 s
3.250 0.025 d͡ʒ
3.310 0.025 ʌ
3.440 0.025 m
3.640 0.025 s
3.800 0.025 o
3.810 0.025 w
3.960 0.025 v
4.000 0.025 ɹ̩
4.160 0.025 ð
4.190 0.025 ə
4.270 0.025 l
4.310 0.025 i
4.450 0.025 z
4.510 0.025 i
4.590 0.025 d
4.620 0.025 w
4.810 0.025 ɡ
Word: The
  Text Phonemes: DH AH
  Audio Phonemes: AE N
  Accuracy: 0.0%
  Status: ❌ Poor
  Phoneme Comparison:
    DH -> AE ❌ (Timestamp: 0.99)
    AH -> N ❌ (Timestamp: 1.04)

Word: quick
  Text Phonemes: K W IH K
  Audio Phonemes: ð ə K L
  Accuracy: 0.0%
  Status: ❌ Poor
  Phoneme Comparison:
    K -> ð ❌ (Timestamp: 1.09)
    W -> ə ❌ (Timestamp: 1.13)
    IH -> K ❌ (Timestamp: 1.87)
    K -> L ❌ (Timestamp: 1.95)

Word: brown
  Text Phonemes: B R