In [37]:
import os
import shutil
import subprocess
from datetime import datetime
from typing import List, Tuple, Dict, Optional
import numpy as np
import sounddevice as sd
import wave
from praatio import textgrid
import speech_recognition as sr
from phonemizer import phonemize

In [38]:
# def process_audio(filepath: str) -> List[Tuple[float, float, str]]:
#     """Process the recorded audio file and return phoneme data."""
#     try:
#         model = read_recognizer("eng2102")  # Replace with your phoneme recognition model
#         result = model.recognize(filepath, timestamp=True)

#         # Debugging: Print the raw result
#         print("Raw result:", result)

#         # Parse the string into structured data
#         phoneme_data = []
#         for line in result.strip().split("\n"):
#             parts = line.split()
#             if len(parts) == 3:
#                 start_time = float(parts[0])
#                 duration = float(parts[1])
#                 phoneme = parts[2]
#                 phoneme_data.append((start_time, duration, phoneme))

#         return phoneme_data

#     except Exception as e:
#         print(f"\u274c Error processing audio: {e}")
#         raise


In [45]:
class PhonemeAnalyzer:
    """Record speech, transcribe it, force-align it with MFA and group phones by word.

    The pipeline implemented by ``analyze_pronunciation`` is:
    record -> transcribe (Google Speech Recognition) -> ``mfa align`` ->
    read the word and phone tiers of the resulting TextGrid -> group phones per word.
    """

    def __init__(self, mfa_model: str = "english_mfa", dict_path: str = "path/to/english_us_mfa.dict"):
        """Initialize the analyzer.

        Args:
            mfa_model: Acoustic model name/path passed to ``mfa align``.
            dict_path: Pronunciation dictionary name/path passed to ``mfa align``.

        Side effects:
            Creates ``recordings/`` and ``recordings/textgrids/`` relative to the
            current working directory if they do not exist.
        """
        self.results = []
        self.mfa_model = mfa_model
        self.recordings_dir = "recordings"
        self.textgrid_dir = os.path.join(self.recordings_dir, "textgrids")
        self.dict_path = dict_path

        os.makedirs(self.recordings_dir, exist_ok=True)
        os.makedirs(self.textgrid_dir, exist_ok=True)
        self.recognizer = sr.Recognizer()

    def record_audio(self, duration: int = 5, filename: Optional[str] = None, samplerate: int = 16000) -> str:
        """Record mono 16-bit audio from the default input device and save it as WAV.

        Args:
            duration: Recording length in seconds.
            filename: File name inside ``self.recordings_dir``; a timestamped name
                is generated when omitted or empty.
            samplerate: Sampling rate in Hz.

        Returns:
            Path of the written WAV file.

        Raises:
            Exception: Any recording or file error is printed and re-raised.
        """
        if not filename:
            filename = f"recording_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav"

        filepath = os.path.join(self.recordings_dir, filename)

        try:
            print(f"🎙️ Recording for {duration} seconds...")
            # blocking=True already waits for the recording to finish, so no
            # separate sd.wait() call is needed afterwards.
            audio = sd.rec(int(samplerate * duration), samplerate=samplerate, channels=1, dtype=np.int16, blocking=True)

            with wave.open(filepath, "wb") as wf:
                wf.setnchannels(1)
                wf.setsampwidth(2)  # 2 bytes per sample == int16
                wf.setframerate(samplerate)
                wf.writeframes(audio.tobytes())

            return filepath
        except Exception as e:
            print(f"❌ Error recording audio: {e}")
            raise

    def transcribe_audio(self, audio_path: str) -> str:
        """Transcribe audio using Google Speech Recognition.

        Returns:
            The recognized text, or ``""`` when the audio could not be understood
            or the recognition service could not be reached.
        """
        try:
            with sr.AudioFile(audio_path) as source:
                audio = self.recognizer.record(source)
                transcript = self.recognizer.recognize_google(audio)
                print(f"Transcribed Text: {transcript}")
                return transcript
        except sr.UnknownValueError:
            print("❌ Could not understand the audio.")
            return ""
        except sr.RequestError as e:
            print(f"❌ Could not request results from Google Speech Recognition service; {e}")
            return ""

    def force_align_audio(self, audio_path: str, transcript: str) -> List[Dict]:
        """Perform forced alignment using MFA with debug logs.

        The audio and its transcript are staged in a corpus directory dedicated
        to this recording, so earlier recordings are not re-aligned on every call.

        Args:
            audio_path: Path of the WAV file to align.
            transcript: Text spoken in the recording.

        Returns:
            Word dicts as produced by ``_parse_textgrid``; ``[]`` when the
            transcript is empty, MFA fails, or no TextGrid is produced.
        """
        transcript = transcript.strip()
        if not transcript:
            # Nothing to align against; skip the (slow) MFA run entirely.
            print("❌ MFA alignment error: transcript is empty")
            return []

        base_name = os.path.splitext(os.path.basename(audio_path))[0]
        corpus_dir = os.path.join(self.recordings_dir, "corpus", base_name)
        os.makedirs(corpus_dir, exist_ok=True)

        corpus_audio = os.path.join(corpus_dir, f"{base_name}.wav")
        # copy2 raises SameFileError when source and destination coincide.
        if os.path.abspath(audio_path) != os.path.abspath(corpus_audio):
            shutil.copy2(audio_path, corpus_audio)

        lab_path = os.path.join(corpus_dir, f"{base_name}.lab")
        with open(lab_path, "w", encoding="utf-8") as f:
            f.write(transcript)

        try:
            mfa_command = [
                "mfa", "align",
                corpus_dir,
                self.dict_path,
                self.mfa_model,
                self.textgrid_dir,
                "--clean"
            ]

            result = subprocess.run(mfa_command, capture_output=True, text=True)
            print(f"Command Output: {result.stdout}")
            print(f"Command Error: {result.stderr}")

            if result.returncode != 0:
                print(f"❌ MFA Error: {result.stderr}")
                return []

            textgrid_path = os.path.join(self.textgrid_dir, f"{base_name}.TextGrid")
            if os.path.exists(textgrid_path):
                return self._parse_textgrid(textgrid_path)
            else:
                raise FileNotFoundError(f"TextGrid file not found: {textgrid_path}")
        except Exception as e:
            print(f"❌ MFA alignment error: {e}")
            return []

    def _read_tier(self, textgrid_path: str, tier_name: str) -> List[Tuple[float, float, str]]:
        """Return the non-empty ``(start, end, label)`` intervals of one TextGrid tier.

        Uses the praatio API (``openTextgrid``), which is what this file imports
        as ``textgrid``. A missing tier raises (``KeyError`` in praatio).
        """
        tg = textgrid.openTextgrid(textgrid_path, includeEmptyIntervals=False)
        # NOTE(review): praatio 6 exposes getTier()/entries while praatio 5 used
        # tierDict/entryList; both spellings are supported here - confirm against
        # the installed version.
        tier = tg.getTier(tier_name) if hasattr(tg, "getTier") else tg.tierDict[tier_name]
        entries = tier.entries if hasattr(tier, "entries") else tier.entryList
        return [
            (float(entry.start), float(entry.end), entry.label)
            for entry in entries
            if entry.label.strip()
        ]

    def _parse_textgrid(self, textgrid_path: str) -> List[Dict]:
        """Parse TextGrid and extract word timestamps.

        Returns:
            One dict per non-empty interval of the ``"words"`` tier with keys
            ``"word"``, ``"start_time"`` and ``"end_time"`` (seconds).
        """
        return [
            {"word": label, "start_time": start, "end_time": end}
            for start, end, label in self._read_tier(textgrid_path, "words")
        ]

    def extract_phonemes(self, audio_path: str, language: str = "en-us") -> List[Dict]:
        """Return the time-aligned phones for a recording.

        ``phonemizer.phonemize`` converts *text* to phonemes; it cannot read an
        audio file or produce timestamps, so the phones are instead read from the
        ``"phones"`` tier of the TextGrid written by ``force_align_audio`` for the
        same recording (MFA is expected to write that tier next to ``"words"``).

        Args:
            audio_path: Path of the recording; only its base name is used to
                locate ``<textgrid_dir>/<base_name>.TextGrid``.
            language: Unused; kept so existing callers keep working.

        Returns:
            Dicts with keys ``"phoneme"``, ``"start_time"`` and ``"end_time"``;
            ``[]`` (after printing the error) when the TextGrid or tier is missing.
        """
        base_name = os.path.splitext(os.path.basename(audio_path))[0]
        textgrid_path = os.path.join(self.textgrid_dir, f"{base_name}.TextGrid")
        try:
            if not os.path.exists(textgrid_path):
                raise FileNotFoundError(f"TextGrid file not found: {textgrid_path}")
            return [
                {"phoneme": label, "start_time": start, "end_time": end}
                for start, end, label in self._read_tier(textgrid_path, "phones")
            ]
        except Exception as e:
            print(f"❌ Error extracting phonemes: {e}")
            return []

    @staticmethod
    def _phoneme_start(phoneme) -> float:
        """Return a phoneme's start time from a dict (``"start_time"``) or a sequence (index 0)."""
        if isinstance(phoneme, dict):
            return phoneme["start_time"]
        return phoneme[0]

    def group_phonemes_by_word(self, words: List[Dict], phonemes: List[Dict]) -> List[List[Dict]]:
        """Group phonemes based on word timestamps.

        A phoneme belongs to a word when its start time lies in the half-open
        interval ``[start_time, end_time)``. The end is exclusive so that a
        phoneme starting exactly on a word boundary is assigned only to the
        following word instead of to both neighbours.

        Args:
            words: Dicts with ``"start_time"`` and ``"end_time"`` keys.
            phonemes: Dicts with a ``"start_time"`` key, or sequences whose first
                element is the start time.

        Returns:
            One list of phonemes per word, in the order of ``words``.
        """
        grouped_phonemes = []

        for word in words:
            start_time = word["start_time"]
            end_time = word["end_time"]
            grouped_phonemes.append([
                phoneme for phoneme in phonemes
                if start_time <= self._phoneme_start(phoneme) < end_time
            ])

        return grouped_phonemes

    def compare_phonemes(self, correct_text: str, grouped_phonemes: List[List[Dict]]):
        """Print each word of ``correct_text`` next to its phoneme group.

        Words and groups are paired positionally; when their counts differ the
        extra items are not printed and a warning is shown.
        """
        correct_words = correct_text.lower().split()
        if len(correct_words) != len(grouped_phonemes):
            # zip() below stops at the shorter sequence; make that visible.
            print(
                f"⚠️ Word count mismatch: {len(correct_words)} expected word(s), "
                f"{len(grouped_phonemes)} aligned word(s)."
            )
        for i, (word, phoneme_group) in enumerate(zip(correct_words, grouped_phonemes)):
            print(f"Word {i+1}: {word}")
            print(f"  Expected Phonemes: {phoneme_group}")

    def analyze_pronunciation(self, correct_text: str, duration: int = 5):
        """Run the full record -> transcribe -> align -> group -> print pipeline.

        Each stage prints an error and aborts the analysis when it yields nothing.
        Any exception is caught and printed rather than propagated.
        """
        try:
            audio_path = self.record_audio(duration)
            transcript = self.transcribe_audio(audio_path)
            if not transcript:
                print("❌ Transcription failed. Analysis aborted.")
                return

            words = self.force_align_audio(audio_path, transcript)
            if not words:
                print("❌ Forced alignment failed.")
                return

            phonemes = self.extract_phonemes(audio_path)
            if not phonemes:
                print("❌ Phoneme extraction failed.")
                return

            grouped_phonemes = self.group_phonemes_by_word(words, phonemes)
            self.compare_phonemes(correct_text, grouped_phonemes)
        except Exception as e:
            print(f"❌ Analysis error: {e}")


In [46]:
if __name__ == "__main__":
    # The pronunciation dictionary lives at a machine-specific location, so it can
    # be overridden with the MFA_DICT_PATH environment variable. The path below is
    # only the fallback used when that variable is not set.
    default_dict_path = r"C:\Users\suraj\.conda\envs\mfa\Lib\site-packages\montreal_forced_aligner\tests\data\dictionaries\english_us_mfa_reduced.dict"
    analyzer = PhonemeAnalyzer(
        mfa_model="english_mfa",
        dict_path=os.environ.get("MFA_DICT_PATH", default_dict_path),
    )
    # Text the speaker is expected to say during the 3-second recording.
    correct_transcription = "hello world"
    analyzer.analyze_pronunciation(correct_transcription, duration=3)

🎙️ Recording for 3 seconds...
Transcribed Text: hello world
Command Output: 
   0% ----------------------------------- 0/100  [ 0:00:00 < -:--:-- , ? it/s ]
   0% ----------------------------------- 0/100  [ 0:00:00 < -:--:-- , ? it/s ]
   0% ----------------------------------- 0/100  [ 0:00:00 < -:--:-- , ? it/s ]
   0% ----------------------------------- 0/100  [ 0:00:00 < -:--:-- , ? it/s ]
   0% ----------------------------------- 0/100  [ 0:00:00 < -:--:-- , ? it/s ]
   0% ----------------------------------- 0/100  [ 0:00:00 < -:--:-- , ? it/s ]
   0% ----------------------------------- 0/100  [ 0:00:00 < -:--:-- , ? it/s ]
   0% ----------------------------------- 0/100  [ 0:00:00 < -:--:-- , ? it/s ]
   0% ----------------------------------- 0/100  [ 0:00:00 < -:--:-- , ? it/s ]
   0% ----------------------------------- 0/100  [ 0:00:01 < -:--:-- , ? it/s ]
   0% ----------------------------------- 0/100  [ 0:00:01 < -:--:-- , ? it/s ]
   0% ----------------------------------- 0