In [1]:
import os
import datetime
import numpy as np
import pandas as pd
import wave
import sounddevice as sd
from typing import Optional, Dict, List, Tuple
from datetime import datetime
from allosaurus.app import read_recognizer
from phonemizer import phonemize

In [2]:

# Audio recording function
def record_audio(duration: int = 5, filename: Optional[str] = None, samplerate: int = 16000) -> str:
    """Record audio with error handling and save as a WAV file."""
    if not filename:
        filename = f"recording_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav"

    filepath = os.path.join("recordings/", filename)
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    try:
        print(f"🎙️ Recording for {duration} seconds...")
        audio = sd.rec(int(samplerate * duration), samplerate=samplerate, channels=1, dtype=np.int16, blocking=True)
        sd.wait()

        # Volume checks
        max_amplitude = np.max(np.abs(audio))
        if max_amplitude > 32000:
            print("⚠️ Warning: Audio may be clipping. Speak more quietly.")
        elif max_amplitude < 1000:
            print("⚠️ Warning: Audio volume is very low. Speak louder.")

        # Save as WAV
        with wave.open(filepath, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(samplerate)
            wf.writeframes(audio.tobytes())
        print(f"Audio saved to {filepath}")
        return filepath

    except Exception as e:
        print(f"❌ Error recording audio: {e}")
        raise

In [3]:
# Audio processing function
def process_audio(filepath: str) -> List[Tuple[float, float, str]]:
    """Process the recorded audio file and return phoneme data."""
    try:
        model = read_recognizer("eng2102")
        result = model.recognize(filepath, timestamp=True)
        
        # Debugging: Print the raw result
        print("Raw result:", result)

        # Parse the string into structured data
        phoneme_data = []
        for line in result.strip().split("\n"):
            parts = line.split()
            if len(parts) == 3:
                start_time = float(parts[0])
                duration = float(parts[1])
                phoneme = parts[2]
                phoneme_data.append((start_time, duration, phoneme))
        
        return phoneme_data

    except Exception as e:
        print(f"❌ Error processing audio: {e}")
        raise

In [4]:

# Text-to-phoneme conversion using phonemizer
def text_to_phonemes(sentence: str) -> Dict[str, str]:
    """Convert a sentence into phonemes word by word using phonemizer."""
    words = sentence.split()
    phonemes_dict = {}

    try:
        for word in words:
            # Convert each word to phonemes in IPA format
            phonemes = phonemize(word, language="en-us", backend="espeak", strip=True)
            phonemes_dict[word] = phonemes
    except Exception as e:
        print(f"❌ Error converting text to phonemes: {e}")
        raise

    return phonemes_dict

In [5]:
# Dynamic phoneme mapping function
def dynamic_map_phonemes(phoneme: str, mapping: Dict[str, str]) -> str:
    """Dynamically map Allosaurus phonemes to CMU phonemes or similar sounds."""
    return mapping.get(phoneme, phoneme)

In [6]:
# Compare phonemes function
def compare_phonemes(
    text_phonemes: Dict[str, str],
    audio_phonemes: List[Tuple[float, float, str]],
    phoneme_mapping: Dict[str, str]
) -> List[Dict[str, str]]:
    """Compare text phonemes with audio phonemes and provide detailed reporting."""
    comparison_results = []
    audio_index = 0

    for word, text_phoneme in text_phonemes.items():
        text_phoneme_list = text_phoneme.split()
        mapped_audio_phonemes = [
            dynamic_map_phonemes(p[2], phoneme_mapping)
            for p in audio_phonemes[audio_index:audio_index + len(text_phoneme_list)]
        ]

        word_result = {
            "word": word,
            "text_phonemes": " ".join(text_phoneme_list),
            "audio_phonemes": " ".join(mapped_audio_phonemes),
            "phoneme_comparison": [],
            "start_time": None,
            "end_time": None,
            "duration": 0.0,
            "accuracy": 0.0,
            "status": ""
        }

        matches = []
        start_time = None
        end_time = None
        total_duration = 0.0
        correct_phonemes = 0
        total_phonemes = len(text_phoneme_list)

        for i, text_ph in enumerate(text_phoneme_list):
            if audio_index + i < len(audio_phonemes):
                audio_ph = mapped_audio_phonemes[i]
                phoneme_result = {
                    "text_phoneme": text_ph,
                    "audio_phoneme": audio_ph,
                    "match": text_ph == audio_ph,
                    "timestamp": round(audio_phonemes[audio_index + i][0], 2),
                }
                matches.append(phoneme_result)

                if text_ph == audio_ph:
                    correct_phonemes += 1

                if start_time is None:
                    start_time = audio_phonemes[audio_index + i][0]
                end_time = audio_phonemes[audio_index + i][0] + audio_phonemes[audio_index + i][1]
                total_duration += audio_phonemes[audio_index + i][1]

        word_result["start_time"] = round(start_time, 2) if start_time else None
        word_result["end_time"] = round(end_time, 2) if end_time else None
        word_result["duration"] = round(total_duration, 2)
        word_result["accuracy"] = round((correct_phonemes / total_phonemes) * 100, 2) if total_phonemes > 0 else 0.0
        word_result["phoneme_comparison"] = matches

        if word_result["accuracy"] == 100:
            word_result["status"] = "✅ Excellent"
        elif word_result["accuracy"] >= 50:
            word_result["status"] = "⚠️ Needs Improvement"
        else:
            word_result["status"] = "❌ Poor"

        comparison_results.append(word_result)
        audio_index += total_phonemes  # Move index forward

    return comparison_results

In [7]:
# Example phoneme mapping
PHONEME_MAPPING = {
    "ah": "a",  # Example mapping
    "eh": "ɛ",
    "ih": "ɪ",
    "ow": "oʊ",
    # Extend with additional mappings as needed
}

In [8]:
# Main workflow
if __name__ == "__main__":
    # Step 1: Record audio
    audio_file = record_audio(duration=3, filename="sample.wav")

    # Step 2: Process audio to extract phonemes
    audio_phonemes = process_audio(audio_file)

    # Step 3: Convert text to phonemes using phonemizer
    sentence = "Hello world"
    text_phonemes = text_to_phonemes(sentence)

    # Step 4: Compare phonemes
    comparison_results = compare_phonemes(text_phonemes, audio_phonemes, PHONEME_MAPPING)

    # Step 5: Display results
    for result in comparison_results:
        print(f"Word: {result['word']}")
        print(f"  Text Phonemes: {result['text_phonemes']}")
        print(f"  Audio Phonemes: {result['audio_phonemes']}")
        print(f"  Accuracy: {result['accuracy']}%")
        print(f"  Status: {result['status']}")
        print("  Phoneme Comparison:")
        for pc in result["phoneme_comparison"]:
            match_status = "✔️" if pc["match"] else "❌"
            print(f"    {pc['text_phoneme']} -> {pc['audio_phoneme']} {match_status} (Timestamp: {pc['timestamp']})")
        print()

🎙️ Recording for 3 seconds...
Audio saved to recordings/sample.wav
Raw result: 0.390 0.025 t
0.470 0.025 ɛ
0.680 0.025 l
0.740 0.025 ə
0.900 0.025 ɡ
0.950 0.025 ʊ
1.040 0.025 d
1.140 0.025 m
1.180 0.025 ɑ
1.610 0.025 m
1.650 0.025 i
1.900 0.025 n
Word: Hello
  Text Phonemes: həloʊ
  Audio Phonemes: t
  Accuracy: 0.0%
  Status: ❌ Poor
  Phoneme Comparison:
    həloʊ -> t ❌ (Timestamp: 0.39)

Word: world
  Text Phonemes: wɜːld
  Audio Phonemes: ɛ
  Accuracy: 0.0%
  Status: ❌ Poor
  Phoneme Comparison:
    wɜːld -> ɛ ❌ (Timestamp: 0.47)

