Install libraries

In [13]:
!pip install openai-whisper pronouncing g2p_en
!apt-get install -y ffmpeg


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.


In [None]:
Import and download nltk-averaged_perceptron_tagger_eng

In [14]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

Import necessary libraries and load model

In [15]:
import whisper
import pronouncing
from g2p_en import G2p
from google.colab import files


print("Loading models (takes ~1 min)...")
model = whisper.load_model("tiny")
g2p = G2p()
print(" Models loaded successfully")


Loading models (takes ~1 min)...
 Models loaded successfully


Upload audio file

In [16]:
print(" Please upload an audio file (e.g. WAV, MP3, M4A)")
uploaded = files.upload()

filename = list(uploaded.keys())[0]
print(f" Uploaded: {filename}")


 Please upload an audio file (e.g. WAV, MP3, M4A)


Saving bait.ogg to bait (2).ogg
 Uploaded: bait (2).ogg


Get Phonemes

In [17]:
import re
from difflib import SequenceMatcher

def get_phonemes(text):
    """Convert text to phonemes while ignoring punctuation."""
    text = re.sub(r"[^\w\s]", "", text)
    return [p for p in g2p(text) if p != " "]



Calculating Pronounciation

In [None]:
def pronunciation_score(expected_phones, spoken_phones):
    """Compute similarity between two phoneme sequences."""
    matcher = SequenceMatcher(None, expected_phones, spoken_phones)
    return round(matcher.ratio() * 100, 1)

Checking Pronounciation

In [24]:
def check_pronunciation_from_file(expected_word, audio_file):
    print(f" Expected word: {expected_word}")
    result = model.transcribe(audio_file, prompt=f"The word is '{expected_word}'")
    spoken_text = result["text"].strip().lower()

    if not spoken_text:
        print("No clear speech detected. Try again.")
        return

    expected_phones = get_phonemes(expected_word)
    spoken_phones   = get_phonemes(spoken_text)

    score = pronunciation_score(expected_phones, spoken_phones)


    vowel_groups = [
        ('EY', 'IY'), ('IY', 'EY'),
        ('AE', 'AH'), ('AH', 'AE'),
        ('EH', 'IH'), ('IH', 'EH'),
        ('OW', 'UH'), ('UH', 'OW'),
        ('AA', 'AH'), ('AH', 'AA'),
        ('AO', 'AA'), ('AA', 'AO'),
    ]
    for v1, v2 in vowel_groups:
        if v1 in expected_phones and v2 in spoken_phones:
            print("Sound is phonetically close â€” acceptable pronunciation! ")
            score = max(score, 90)


    print(f" Correct phonemes: {expected_phones}")
    print(f"ðŸŽ§ Your phonemes: {spoken_phones}")
    print(f" Pronunciation Score: {score}%")

    if score > 85:
        print(" Excellent pronunciation! ðŸŽ‰")
    elif score > 60:
        print("Fair â€” close enough, just adjust slightly.")
    else:
        print(" Needs improvement.")
        print(f"Correct way to say '{expected_word}' â†’ /{' '.join(expected_phones)}/")
check_pronunciation_from_file("bait", filename)

 Expected word: bait




 Correct phonemes: ['B', 'EY1', 'T']
ðŸŽ§ Your phonemes: ['B', 'IY1', 'T']
 Pronunciation Score: 66.7%
Fair â€” close enough, just adjust slightly.
