# Pronunciation Analysis and Improvements

In this notebook, we analyze the goodness of pronunciation using advanced algorithms and tools. Building on the existing notebook, we introduce new phoneme-level evaluation techniques that provide detailed insights into pronunciation. The key improvements include:
- Phoneme-level goodness measurement using advanced scoring methods.
- Visualization tools for feedback, such as heatmaps and color-coded results.
- Enhanced metrics for assessing pronunciation accuracy interactively.

The aim is to help users better understand their pronunciation strengths and weaknesses, providing targeted feedback for improvement.

In [11]:
# !pip install montreal-forced-aligner sounddevice numpy  # `wave` ships with Python's standard library; no install needed
# !mfa model download acoustic english_mfa
# !mfa model download dictionary english_mfa
# !mfa model download g2p english_us_mfa

In [1]:
import sounddevice as sd
import numpy as np
import wave
import os

# Directory where recordings are saved. It is created up front so later
# writes do not fail; exist_ok=True makes re-running this cell harmless.
recording_dir = "recordings"
os.makedirs(recording_dir, exist_ok=True)

def record_audio(filename="test_recording.wav", duration=5, samplerate=16000):
    """Record audio with ``sounddevice`` and save it as a WAV file.

    One channel of 16-bit integer samples is requested from ``sd.rec``; the
    function then calls ``sd.wait()`` before writing the samples to
    ``filename`` inside the module-level ``recording_dir``.

    Args:
        filename: Name of the WAV file to create inside ``recording_dir``.
        duration: Recording length in seconds.
        samplerate: Sampling rate passed to the recorder and written to the
            WAV header.

    Returns:
        The path of the WAV file that was written.
    """
    print(f"Recording for {duration} seconds...")

    frame_count = int(samplerate * duration)
    samples = sd.rec(frame_count, samplerate=samplerate, channels=1, dtype=np.int16)
    sd.wait()

    filepath = os.path.join(recording_dir, filename)
    with wave.open(filepath, "wb") as wav_out:
        wav_out.setnchannels(1)
        wav_out.setsampwidth(2)  # 2 bytes per sample, matching the int16 capture
        wav_out.setframerate(samplerate)
        wav_out.writeframes(samples.tobytes())

    print(f"Saved recording to {filepath}")
    return filepath

# Capture a 5-second clip and keep the path of the saved WAV file
audio_file = record_audio(filename="test_recording.wav", duration=5)
# The transcript is typed in by hand and should match what was spoken
transcript = "Hey Good Morning"

Recording for 5 seconds...
Saved recording to recordings\test_recording.wav


In [2]:
import subprocess
# Save the transcript next to the recording, using the same base name as the
# WAV file. Written as UTF-8 explicitly so the result does not depend on the
# platform's default encoding (the alignment output is read back as UTF-8 too).
transcript_path = os.path.join(recording_dir, "test_recording.txt")
with open(transcript_path, "w", encoding="utf-8") as f:
    f.write(transcript)

print(f"Transcript saved: {transcript_path}")

# Directory to store alignment results
aligned_output_dir = os.path.join(recording_dir, "aligned_output")
os.makedirs(aligned_output_dir, exist_ok=True)

print("Running Montreal Forced Aligner...")
# Pass the command as an argument list instead of a shell string: paths that
# contain spaces or shell metacharacters are then handed to MFA verbatim and
# nothing is interpreted by a shell.
mfa_command = [
    "mfa", "align",
    recording_dir,       # corpus directory (audio + transcript)
    "english_mfa",       # pronunciation dictionary
    "english_mfa",       # acoustic model
    aligned_output_dir,  # where the alignment files are written
    "--output_format", "json",
]
# check=True raises CalledProcessError if MFA exits with a non-zero status
subprocess.run(mfa_command, check=True)

print("Alignment complete! Results saved in:", aligned_output_dir)


Transcript saved: recordings\test_recording.txt
Running Montreal Forced Aligner...
Alignment complete! Results saved in: recordings\aligned_output


In [3]:
import json
# Read the alignment JSON for the test recording
alignment_file = os.path.join(aligned_output_dir, "test_recording.json")
with open(alignment_file, encoding="utf-8") as f:
    alignment_data = json.loads(f.read())

# Show one phone entry to get a feel for the data layout
first_phone_entry = alignment_data["tiers"]["phones"]["entries"][0]
print("First entry in 'phones':", first_phone_entry)


First entry in 'phones': [0.48, 3.77, 'spn']


In [4]:
# List which tiers the alignment file contains
tier_names = alignment_data["tiers"].keys()
print("Keys in 'tiers':", tier_names)

Keys in 'tiers': dict_keys(['words', 'phones'])


In [5]:
# Read the alignment JSON for the test recording
alignment_file = os.path.join(aligned_output_dir, "test_recording.json")
with open(alignment_file, encoding="utf-8") as f:
    alignment_data = json.loads(f.read())

# Walk the phone tier and print each label with its start and end time
print("\nPhoneme Alignment Results:")
phone_entries = alignment_data["tiers"]["phones"]["entries"]
for segment in phone_entries:
    start, end, phoneme = segment[0], segment[1], segment[2]
    print(f"Phoneme: {phoneme} | Start: {start}s | End: {end}s")


Phoneme Alignment Results:
Phoneme: spn | Start: 0.48s | End: 3.77s
Phoneme: h | Start: 4.1s | End: 4.13s
Phoneme: ə | Start: 4.13s | End: 4.16s
Phoneme: l | Start: 4.16s | End: 4.19s
Phoneme: əw | Start: 4.19s | End: 4.26s
Phoneme: ʋ | Start: 4.3s | End: 4.35s
Phoneme: ɜː | Start: 4.35s | End: 4.38s
Phoneme: l | Start: 4.38s | End: 4.46s
Phoneme: ɖ | Start: 4.46s | End: 4.82s


In [6]:
# Display the raw phone entries; each one is a [start, end, label] list
alignment_data["tiers"]["phones"]["entries"]

[[0.48, 3.77, 'spn'],
 [4.1, 4.13, 'h'],
 [4.13, 4.16, 'ə'],
 [4.16, 4.19, 'l'],
 [4.19, 4.26, 'əw'],
 [4.3, 4.35, 'ʋ'],
 [4.35, 4.38, 'ɜː'],
 [4.38, 4.46, 'l'],
 [4.46, 4.82, 'ɖ']]

In [7]:
def compare_phonemes(alignment_data, expected_phonemes):
    """Compare aligned phoneme labels with an expected sequence, position by position.

    The label (third element) of every entry in
    ``alignment_data["tiers"]["phones"]["entries"]`` is compared with the
    expected phoneme at the same index. The comparison is purely positional,
    so any extra leading entry (for example the ``'spn'`` label seen in this
    notebook's output) shifts every later comparison.

    Args:
        alignment_data: Parsed alignment JSON containing
            ``["tiers"]["phones"]["entries"]``, a list of
            ``[start, end, label]`` entries.
        expected_phonemes: Sequence of expected phoneme labels.

    Returns:
        A dict with the keys ``total_phonemes`` (number of expected phonemes),
        ``correct_phonemes`` (positions where both labels match),
        ``accuracy`` (percentage of expected phonemes matched; ``0.0`` when
        ``expected_phonemes`` is empty), ``user_phonemes`` and
        ``expected_phonemes``.
    """
    user_phonemes = [segment[2] for segment in alignment_data["tiers"]["phones"]["entries"]]
    total_phonemes = len(expected_phonemes)

    # zip stops at the shorter sequence, so unmatched trailing positions on
    # either side are never counted as correct.
    correct_phonemes = sum(
        1
        for spoken, expected in zip(user_phonemes, expected_phonemes)
        if spoken == expected
    )

    # Guard against an empty expectation list, which would otherwise raise
    # ZeroDivisionError.
    accuracy = correct_phonemes / total_phonemes * 100 if total_phonemes else 0.0

    report = {
        "total_phonemes": total_phonemes,
        "correct_phonemes": correct_phonemes,
        "accuracy": accuracy,
        "user_phonemes": user_phonemes,
        "expected_phonemes": expected_phonemes
    }
    return report

# Example usage: expected phonemes for the transcript "Hey Good Morning",
# grouped by word for readability.
expected_phonemes = [
    'h', 'eɪ',                  # hey
    'ɡ', 'ʊ', 'd',              # good
    'm', 'ɔr', 'n', 'ɪ', 'ŋ',   # morning
]
report = compare_phonemes(alignment_data, expected_phonemes)

# Summary counts first, then the two sequences for side-by-side inspection
print("Pronunciation Report:")
print(f"Total Phonemes: {report['total_phonemes']}")
print(f"Correct Phonemes: {report['correct_phonemes']}")
print(f"Accuracy: {report['accuracy']:.2f}%")
print("User Phonemes:", report['user_phonemes'])
print("Expected Phonemes:", report['expected_phonemes'])

Pronunciation Report:
Total Phonemes: 10
Correct Phonemes: 0
Accuracy: 0.00%
User Phonemes: ['spn', 'h', 'ə', 'l', 'əw', 'ʋ', 'ɜː', 'l', 'ɖ']
Expected Phonemes: ['h', 'eɪ', 'ɡ', 'ʊ', 'd', 'm', 'ɔr', 'n', 'ɪ', 'ŋ']
