In [2]:
!pip list

Package            Version
------------------ -----------
asttokens          3.0.0
audioread          3.0.1
certifi            2025.10.5
cffi               2.0.0
charset-normalizer 3.4.4
colorama           0.4.6
comm               0.2.3
contourpy          1.3.2
cycler             0.12.1
debugpy            1.8.17
decorator          5.2.1
exceptiongroup     1.3.0
executing          2.2.1
fastdtw            0.3.4
filelock           3.20.0
fonttools          4.60.1
fsspec             2025.9.0
huggingface-hub    0.35.3
idna               3.11
ipykernel          7.0.0
ipython            8.37.0
jedi               0.19.2
Jinja2             3.1.6
joblib             1.5.2
jupyter_client     8.6.3
jupyter_core       5.8.1
kiwisolver         1.4.9
lazy_loader        0.4
librosa            0.11.0
llvmlite           0.45.1
MarkupSafe         3.0.3
matplotlib         3.10.7
matplotlib-inline  0.1.7
mpmath             1.3.0
msgpack            1.1.2
nest-asyncio       1.6.0
networkx           3.4.2
num


[notice] A new release of pip available: 22.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import torch
import torchaudio
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from scipy.spatial.distance import cosine
import librosa
from dataclasses import dataclass
from typing import List, Dict, Tuple
import warnings
warnings.filterwarnings('ignore')
from typing import Optional


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# try:
#     import whisperx
#     WHISPERX_AVAILABLE = True
# except ImportError:
#     WHISPERX_AVAILABLE = False
#     print("WhisperX not available. Install with: pip install whisperx")

In [5]:
@dataclass
class WordAssessment:
    word: str
    start_time: float
    end_time: float
    similarity_score: float
    is_correct: bool
    phonemes_reference: str
    phonemes_learner: str
    
@dataclass
class WordAlignment:
    word: str
    start: float
    end: float
    score: Optional[float] = None

In [6]:
class ForcedAligner:
    """Handles forced alignment to get exact word timings."""
    
    def __init__(self, method="whisperx", device="cuda" if torch.cuda.is_available() else "cpu"):
        """
        Initialize forced aligner.
        
        Args:
            method: "whisperx" 
            device: "cuda" or "cpu"
        """
        self.method = method
        self.device = device
        
        if method == "whisperx":
            # if not WHISPERX_AVAILABLE:
            #     raise ImportError("WhisperX not installed. Run: pip install whisperx")
            # print(f"Loading WhisperX model on {device}...")
            # self.asr_model = whisperx.load_model("base", device, compute_type="float32")
            self.alignment_model = None
            self.metadata = None 
            
    def align_with_whisperx(self, audio_path: str, text: str) -> List[WordAlignment]:
        """Use WhisperX for precise word-level alignment."""
        # Load audio
        audio = whisperx.load_audio(audio_path)
        
        # Transcribe
        result = self.asr_model.transcribe(audio, batch_size=16)
        
        # Load alignment model if not loaded
        if self.alignment_model is None:
            self.alignment_model, self.metadata = whisperx.load_align_model(
                language_code="en", device=self.device
            )
        
        # Align
        result = whisperx.align(
            result["segments"], 
            self.alignment_model, 
            self.metadata, 
            audio, 
            self.device,
            return_char_alignments=False
        )
        
        # Extract word alignments
        alignments = []
        for segment in result["segments"]:
            if "words" in segment:
                for word_info in segment["words"]:
                    alignments.append(WordAlignment(
                        word=word_info["word"].strip(),
                        start=word_info["start"],
                        end=word_info["end"],
                        score=word_info.get("score", 1.0)
                    ))
        
        return alignments

In [7]:
class PronunciationAssessor:
    def __init__(self, model_name="bookbot/wav2vec2-ljspeech-gruut"):
        """Initialize the pronunciation assessor with wav2vec2 phoneme model."""
        print(f"Loading model: {model_name}")
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name)
        self.model.eval()
        self.aligner = ForcedAligner(method="whisperx")
        
    def load_audio(self, audio_path: str, sr: int = 16000) -> np.ndarray:
        """Load audio file and resample to 16kHz."""
        audio, _ = librosa.load(audio_path, sr=sr)
        return audio
    
    def get_phoneme_embeddings(self, audio: np.ndarray) -> Tuple[torch.Tensor, str]:
        """Extract phoneme-level embeddings and predicted phonemes from audio."""
        # Prepare input
        inputs = self.processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
        
        with torch.no_grad():
            # Get hidden states (embeddings)
            outputs = self.model(**inputs, output_hidden_states=True)
            hidden_states = outputs.hidden_states[-1]  # Last layer
            
            # Get phoneme predictions
            logits = outputs.logits
            predicted_ids = torch.argmax(logits, dim=-1)
            phonemes = self.processor.batch_decode(predicted_ids)[0]
            
        return hidden_states.squeeze(0), phonemes
    
    # def align_words_to_audio(self, text: str, audio_duration: float) -> List[Tuple[str, float, float]]:
    #     """
    #     Simple word alignment based on uniform distribution.
    #     For production, use a forced alignment tool like Montreal Forced Aligner.
    #     """
    #     words = text.split()
    #     print(f"List words: {words}")
    #     word_duration = audio_duration / len(words)
    #     print(f"Word duration: {word_duration}")
        
    #     alignments = []
    #     for i, word in enumerate(words):
    #         start = i * word_duration
    #         end = (i + 1) * word_duration
    #         alignments.append((word, start, end))
        
    #     return alignments
    
    def extract_segment_embeddings(self, embeddings: torch.Tensor, 
                                   start_time: float, end_time: float, 
                                   audio_duration: float) -> torch.Tensor:
        """Extract embeddings for a specific time segment."""
        total_frames = embeddings.shape[0]
        
        # Convert time to frame indices
        start_frame = int((start_time / audio_duration) * total_frames)
        end_frame = int((end_time / audio_duration) * total_frames)
        
        # Ensure valid range
        start_frame = max(0, start_frame)
        end_frame = min(total_frames, end_frame)
        
        # Extract segment and average pool
        segment_embeddings = embeddings[start_frame:end_frame]
        
        if segment_embeddings.shape[0] == 0:
            return embeddings.mean(dim=0)
        
        return segment_embeddings.mean(dim=0)
    
    def compute_similarity(self, emb1: torch.Tensor, emb2: torch.Tensor) -> float:
        """Compute cosine similarity between two embeddings."""
        emb1_np = emb1.cpu().numpy()
        emb2_np = emb2.cpu().numpy()
        
        # Cosine similarity (1 - cosine distance)
        similarity = 1 - cosine(emb1_np, emb2_np)
        return float(similarity)
    
    def assess_pronunciation(self, 
                           reference_audio_path: str,
                           learner_audio_path: str,
                           reference_text: str,
                           similarity_threshold: float = 0.85) -> List[WordAssessment]:
        """
        Compare reference and learner audio to identify pronunciation mistakes.
        
        Args:
            reference_audio_path: Path to correct pronunciation audio
            learner_audio_path: Path to learner's audio
            reference_text: Transcript of the reference audio
            similarity_threshold: Threshold for determining if pronunciation is correct
            
        Returns:
            List of WordAssessment objects with detailed results
        """
        print("\n" + "="*60)
        print("PRONUNCIATION ASSESSMENT")
        print("="*60)
        
        # Load audio files
        print("\n1. Loading audio files...")
        ref_audio = self.load_audio(reference_audio_path)
        learner_audio = self.load_audio(learner_audio_path)
        
        ref_duration = len(ref_audio) / 16000
        learner_duration = len(learner_audio) / 16000
        
        print(f"   Reference duration: {ref_duration:.2f}s")
        print(f"   Learner duration: {learner_duration:.2f}s")
        
        # # Get exact word alignments using forced alignment
        # print("\n2. Performing forced alignment...")
        # ref_alignments = self.aligner.align_with_whisperx(reference_audio_path, reference_text)
        # learner_alignments = self.aligner.align_with_whisperx(learner_audio_path, reference_text)
        
        # print(f"   Found {len(ref_alignments)} words in reference")
        # print(f"   Found {len(learner_alignments)} words in learner audio")
        # print(f"Learner alignments: {learner_alignments}")
        
        # # Display word timings
        # print("\n   Reference word timings:")
        # for align in ref_alignments:  # Show first 5
        #     print(f"     '{align.word}': {align.start:.2f}s - {align.end:.2f}s")

        # Get embeddings and phonemes
        print("\n3. Extracting phoneme embeddings...")
        ref_embeddings, ref_phonemes = self.get_phoneme_embeddings(ref_audio)
        learner_embeddings, learner_phonemes = self.get_phoneme_embeddings(learner_audio)
        
        print(f"   Reference phonemes: {ref_phonemes}")
        print(f"   Learner phonemes: {learner_phonemes}")
        
        # # Align words to audio
        # print("\n3. Aligning words to audio segments...")
        # word_alignments = self.align_words_to_audio(reference_text, ref_duration)
        # learner_word_alignments = self.align_words_to_audio(reference_text, learner_duration)
        
        # Assess each word
        print("\n4. Assessing word-level pronunciation...\n")
        assessments = []
        
        # # Match words from both alignments
        # min_length = min(len(ref_alignments), len(learner_alignments))
        
        # # for (word, ref_start, ref_end), (_, learn_start, learn_end) in zip(
        # #     word_alignments, learner_word_alignments
        # # ):
        # for i in range(min_length):
        #     ref_align = ref_alignments[i]
        #     learner_align = learner_alignments[i]
            
        #     # Extract word-level embeddings using exact timings
        #     ref_word_emb = self.extract_segment_embeddings(
        #         ref_embeddings, ref_align.start, ref_align.end, ref_duration
        #     )
        #     learner_word_emb = self.extract_segment_embeddings(
        #         learner_embeddings, learner_align.start, learner_align.end, learner_duration
        #     )
            
        #     # Compute similarity
        #     similarity = self.compute_similarity(ref_word_emb, learner_word_emb)
        #     is_correct = similarity >= similarity_threshold
            
        #     assessment = WordAssessment(
        #         word=ref_align.word,
        #         start_time=learner_align.start,
        #         end_time=learner_align.end,
        #         similarity_score=similarity,
        #         is_correct=is_correct,
        #         phonemes_reference=ref_phonemes,  # In production, extract per-word phonemes
        #         phonemes_learner=learner_phonemes
        #     )
            
        #     assessments.append(assessment)
            
        #    # Print result with exact timings
        #     status = "✓ CORRECT" if is_correct else "✗ INCORRECT"
        #     print(f"   {ref_align.word:15s} | {learner_align.start:.2f}s-{learner_align.end:.2f}s | Score: {similarity:.3f} | {status}")
        
        return assessments
    
    def print_summary(self, assessments: List[WordAssessment]):
        """Print a summary of the assessment results."""
        total_words = len(assessments)
        correct_words = sum(1 for a in assessments if a.is_correct)
        accuracy = (correct_words / total_words * 100) if total_words > 0 else 0
        
        print("\n" + "="*60)
        print("SUMMARY")
        print("="*60)
        print(f"Total words: {total_words}")
        print(f"Correct pronunciations: {correct_words}")
        print(f"Incorrect pronunciations: {total_words - correct_words}")
        print(f"Accuracy: {accuracy:.1f}%")
        
        if total_words - correct_words > 0:
            print("\nWords that need practice:")
            for assessment in assessments:
                if not assessment.is_correct:
                    print(f"  - {assessment.word} (score: {assessment.similarity_score:.3f})")


In [8]:
!ffmpeg -version

ffmpeg version 8.0-full_build-www.gyan.dev Copyright (c) 2000-2025 the FFmpeg developers
built with gcc 15.2.0 (Rev8, Built by MSYS2 project)
configuration: --enable-gpl --enable-version3 --enable-static --disable-w32threads --disable-autodetect --enable-fontconfig --enable-iconv --enable-gnutls --enable-lcms2 --enable-libxml2 --enable-gmp --enable-bzlib --enable-lzma --enable-libsnappy --enable-zlib --enable-librist --enable-libsrt --enable-libssh --enable-libzmq --enable-avisynth --enable-libbluray --enable-libcaca --enable-libdvdnav --enable-libdvdread --enable-sdl2 --enable-libaribb24 --enable-libaribcaption --enable-libdav1d --enable-libdavs2 --enable-libopenjpeg --enable-libquirc --enable-libuavs3d --enable-libxevd --enable-libzvbi --enable-liboapv --enable-libqrencode --enable-librav1e --enable-libsvtav1 --enable-libvvenc --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxavs2 --enable-libxeve --enable-libxvid --enable-libaom --enable-libjxl --enable-libvpx --enable

In [9]:
import os
ffmpeg_path = r'C:\Users\tyan\AppData\Local\Microsoft\WinGet\Packages\Gyan.FFmpeg_Microsoft.Winget.Source_8wekyb3d8bbwe\ffmpeg-8.0-full_build\bin'
os.environ['PATH'] = ffmpeg_path + os.pathsep + os.environ.get('PATH', '')
# Now run your test
import subprocess
result = subprocess.run(['ffmpeg', '-version'], capture_output=True, text=True)
print(result.stdout[:200])

ffmpeg version 8.0-full_build-www.gyan.dev Copyright (c) 2000-2025 the FFmpeg developers
built with gcc 15.2.0 (Rev8, Built by MSYS2 project)
configuration: --enable-gpl --enable-version3 --enable-sta


In [11]:
# Initialize assessor
assessor = PronunciationAssessor()

# Define your audio files and reference text
reference_audio = "./audio_files/word_february.mp3"  # Correct pronunciation
learner_audio = "./audio_files/word_january.mp3"      # Audio to be assessed
reference_text = "february"  # Transcript

# Perform assessment
assessments = assessor.assess_pronunciation(
    reference_audio_path=reference_audio,
    learner_audio_path=learner_audio,
    reference_text=reference_text,
    similarity_threshold=0.90  # Adjust based on your needs (0.0-1.0)
)

# Print summary
assessor.print_summary(assessments)

# # Access individual word results
# print("\n" + "="*60)
# print("DETAILED RESULTS")
# print("="*60)
# for assessment in assessments:
#     print(f"\nWord: {assessment.word}")
#     print(f"  Time: {assessment.start_time:.2f}s - {assessment.end_time:.2f}s")
#     print(f"  Similarity: {assessment.similarity_score:.3f}")
#     print(f"  Status: {'Correct' if assessment.is_correct else 'Needs improvement'}")

Loading model: bookbot/wav2vec2-ljspeech-gruut

PRONUNCIATION ASSESSMENT

1. Loading audio files...
   Reference duration: 1.05s
   Learner duration: 1.00s

3. Extracting phoneme embeddings...
   Reference phonemes: f ɛ b j u ɛ ɹ i
   Learner phonemes: d͡ʒæ n j u ɛ ɹ i

4. Assessing word-level pronunciation...


SUMMARY
Total words: 0
Correct pronunciations: 0
Incorrect pronunciations: 0
Accuracy: 0.0%
