In [22]:
import os
import datetime
import numpy as np
import pandas as pd
import wave
import sounddevice as sd
from typing import Optional, Dict, List, Tuple
from datetime import datetime
from allosaurus.app import read_recognizer
import pronouncing

In [23]:

# Audio recording function
def record_audio(duration: int = 5, filename: Optional[str] = None, samplerate: int = 16000) -> str:
    """Record audio with error handling and save as a WAV file."""
    if not filename:
        filename = f"recording_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav"

    filepath = os.path.join("recordings/", filename)
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    try:
        print(f"🎙️ Recording for {duration} seconds...")
        audio = sd.rec(int(samplerate * duration), samplerate=samplerate, channels=1, dtype=np.int16, blocking=True)
        sd.wait()

        # Volume checks
        max_amplitude = np.max(np.abs(audio))
        if max_amplitude > 32000:
            print("⚠️ Warning: Audio may be clipping. Speak more quietly.")
        elif max_amplitude < 1000:
            print("⚠️ Warning: Audio volume is very low. Speak louder.")

        # Save as WAV
        with wave.open(filepath, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(samplerate)
            wf.writeframes(audio.tobytes())
        print(f"Audio saved to {filepath}")
        return filepath

    except Exception as e:
        print(f"❌ Error recording audio: {e}")
        raise

In [24]:
def process_audio(filepath: str) -> List[Tuple[float, float, str]]:
    """Process the recorded audio file and return phoneme data."""
    try:
        model = read_recognizer("eng2102")
        result = model.recognize(filepath, timestamp=True)
        
        # Debugging: Print the raw result
        print("Raw result:", result)

        # Parse the string into structured data
        phoneme_data = []
        for line in result.strip().split("\n"):
            parts = line.split()
            if len(parts) == 3:
                start_time = float(parts[0])
                duration = float(parts[1])
                phoneme = parts[2]
                phoneme_data.append((start_time, duration, phoneme))
        
        return phoneme_data

    except Exception as e:
        print(f"❌ Error processing audio: {e}")
        raise

In [25]:
# Text-to-phoneme conversion function
def text_to_phonemes(sentence: str) -> Dict[str, str]:
    """Convert a sentence into phonemes word by word."""
    words = sentence.split()
    phonemes_dict = {}

    for word in words:
        phonemes = pronouncing.phones_for_word(word.lower())
        if phonemes:
            phonemes_dict[word] = phonemes[0]
        else:
            phonemes_dict[word] = "N/A"  # If no phonemes found, mark as N/A

    return phonemes_dict

In [26]:
# Improved dynamic phoneme mapping function
def dynamic_map_phonemes(phoneme: str, mapping: Dict[str, str]) -> str:
    """Dynamically map Allosaurus phonemes to CMU phonemes or similar sounds."""
    # Default to original if not found in mapping
    return mapping.get(phoneme, phoneme)

# Modified compare_phonemes function with enhanced reporting
def compare_phonemes(
    text_phonemes: Dict[str, str],
    audio_phonemes: List[Tuple[float, float, str]],
    phoneme_mapping: Dict[str, str]
) -> List[Dict[str, str]]:
    """Compare text phonemes with audio phonemes and provide detailed reporting."""
    comparison_results = []
    audio_index = 0

    for word, text_phoneme in text_phonemes.items():
        text_phoneme_list = text_phoneme.split()
        mapped_audio_phonemes = [
            dynamic_map_phonemes(p[2], phoneme_mapping)
            for p in audio_phonemes[audio_index:audio_index + len(text_phoneme_list)]
        ]

        word_result = {
            "word": word,
            "text_phonemes": " ".join(text_phoneme_list),
            "audio_phonemes": " ".join(mapped_audio_phonemes),
            "phoneme_comparison": [],
            "start_time": None,
            "end_time": None,
            "duration": 0.0,
            "accuracy": 0.0,
            "status": ""
        }

        matches = []
        start_time = None
        end_time = None
        total_duration = 0.0
        correct_phonemes = 0
        total_phonemes = len(text_phoneme_list)

        for i, text_ph in enumerate(text_phoneme_list):
            if audio_index + i < len(audio_phonemes):
                audio_ph = mapped_audio_phonemes[i]
                phoneme_result = {
                    "text_phoneme": text_ph,
                    "audio_phoneme": audio_ph,
                    "match": text_ph == audio_ph,
                    "timestamp": round(audio_phonemes[audio_index + i][0], 2),
                }
                matches.append(phoneme_result)

                if text_ph == audio_ph:
                    correct_phonemes += 1

                if start_time is None:
                    start_time = audio_phonemes[audio_index + i][0]
                end_time = audio_phonemes[audio_index + i][0] + audio_phonemes[audio_index + i][1]
                total_duration += audio_phonemes[audio_index + i][1]

        word_result["start_time"] = round(start_time, 2) if start_time else None
        word_result["end_time"] = round(end_time, 2) if end_time else None
        word_result["duration"] = round(total_duration, 2)
        word_result["accuracy"] = round((correct_phonemes / total_phonemes) * 100, 2) if total_phonemes > 0 else 0.0
        word_result["phoneme_comparison"] = matches

        if word_result["accuracy"] == 100:
            word_result["status"] = "✅ Excellent"
        elif word_result["accuracy"] >= 50:
            word_result["status"] = "⚠️ Needs Improvement"
        else:
            word_result["status"] = "❌ Poor"

        comparison_results.append(word_result)
        audio_index += total_phonemes  # Move index forward

    return comparison_results

# Example phoneme mapping
PHONEME_MAPPING = {
    # Add specific Allosaurus-to-CMU mappings
    "ah": "AA",  # Example
    "eh": "EH",
    "ih": "IH",
    "ow": "OW",
    # Add as many mappings as needed
}

In [28]:
# Main workflow with improvements
if __name__ == "__main__":
    # Step 1: Record audio
    audio_file = record_audio(duration=3, filename="sample.wav")

    # Step 2: Process audio to extract phonemes
    audio_phonemes = process_audio(audio_file)

    # Step 3: Convert text to phonemes
    sentence = "Hello world"
    text_phonemes = text_to_phonemes(sentence)

    # Step 4: Compare phonemes
    comparison_results = compare_phonemes(text_phonemes, audio_phonemes, PHONEME_MAPPING)

    # Step 5: Display results
    for result in comparison_results:
        print(f"Word: {result['word']}")
        print(f"  Text Phonemes: {result['text_phonemes']}")
        print(f"  Audio Phonemes: {result['audio_phonemes']}")
        print(f"  Accuracy: {result['accuracy']}%")
        print(f"  Status: {result['status']}")
        print("  Phoneme Comparison:")
        for pc in result["phoneme_comparison"]:
            match_status = "✔️" if pc["match"] else "❌"
            print(f"    {pc['text_phoneme']} -> {pc['audio_phoneme']} {match_status} (Timestamp: {pc['timestamp']})")
        print()


🎙️ Recording for 3 seconds...
Audio saved to recordings/sample.wav
Raw result: 0.580 0.025 h
0.620 0.025 ɛ
0.810 0.025 l
0.840 0.025 o
0.850 0.025 w
1.380 0.025 ɹ̩
1.530 0.025 t
1.570 0.025 ɪ
1.690 0.025 t
Word: Hello
  Text Phonemes: HH AH0 L OW1
  Audio Phonemes: h ɛ l o
  Accuracy: 0.0%
  Status: ❌ Poor
  Phoneme Comparison:
    HH -> h ❌ (Timestamp: 0.58)
    AH0 -> ɛ ❌ (Timestamp: 0.62)
    L -> l ❌ (Timestamp: 0.81)
    OW1 -> o ❌ (Timestamp: 0.84)

Word: world
  Text Phonemes: W ER1 L D
  Audio Phonemes: w ɹ̩ t ɪ
  Accuracy: 0.0%
  Status: ❌ Poor
  Phoneme Comparison:
    W -> w ❌ (Timestamp: 0.85)
    ER1 -> ɹ̩ ❌ (Timestamp: 1.38)
    L -> t ❌ (Timestamp: 1.53)
    D -> ɪ ❌ (Timestamp: 1.57)

