In [11]:
pip install --user pronouncing SpeechRecognition

Collecting SpeechRecognition
  Using cached SpeechRecognition-3.14.0-py3-none-any.whl.metadata (31 kB)
Collecting standard-aifc (from SpeechRecognition)
  Using cached standard_aifc-3.13.0-py3-none-any.whl.metadata (969 bytes)
Collecting audioop-lts (from SpeechRecognition)
  Using cached audioop_lts-0.2.1-cp313-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting standard-chunk (from standard-aifc->SpeechRecognition)
  Using cached standard_chunk-3.13.0-py3-none-any.whl.metadata (860 bytes)
Using cached SpeechRecognition-3.14.0-py3-none-any.whl (32.9 MB)
Using cached audioop_lts-0.2.1-cp313-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (82 kB)
Using cached standard_aifc-3.13.0-py3-none-any.whl (10 kB)
Using cached standard_chunk-3.13.0-py3-none-any.whl (4.9 kB)
Installing collected packages: standard-chunk, audioop-lts, standard-aifc, SpeechRecognition
Successfully installed

In [6]:
import speech_recognition as sr

def wav_to_text(file_path):
    """Transcribe an audio file to text using `recognize_google`.

    Args:
        file_path: Path of the audio file handed to `sr.AudioFile`.

    Returns:
        The recognized text on success. On failure no exception is raised for
        the two handled cases; instead a human-readable message is returned:
        one when the audio could not be understood (`sr.UnknownValueError`)
        and one when the recognition request failed (`sr.RequestError`).
    """
    engine = sr.Recognizer()

    # Read the whole file into memory; the file is closed when the block exits.
    with sr.AudioFile(file_path) as wav:
        print("Processing audio file...")
        captured = engine.record(wav)

    try:
        outcome = engine.recognize_google(captured)
    except sr.UnknownValueError:
        outcome = "Speech recognition could not understand the audio."
    except sr.RequestError as err:
        outcome = f"Could not request results from Google Speech Recognition: {err}"
    return outcome

# Transcribe the sample recording and show the result.
# NOTE: wav_to_text returns an error-message string (instead of raising) when
# recognition fails, so `transcription` may hold that message rather than speech.
file_path = "./audio/harvard.wav"
transcription = wav_to_text(file_path)
print("Transcription:", transcription)


Processing audio file...
Transcription: the stale smell of old beer lingers it takes heat to bring out the odor a cold dip restores health and zest a salt pickle taste fine with ham tacos al pastor are my favorite a zestful food is the hot cross bun


In [9]:
import pronouncing

# Lookup table from a stress-free phoneme symbol to the name of the mouth
# shape (viseme) used for it. Several phonemes share one viseme. Entries are
# kept in their original order, one per line.
phoneme_to_viseme = {
    # open vowels and diphthongs
    "AA": "Open Back",
    "AE": "Open Wide",
    "AH": "Open Back",
    "AO": "Open Back",
    "AW": "Open to Round",
    "AY": "Transition Smile",
    # lips pressed together
    "B": "Closed Lips",
    "P": "Closed Lips",
    "M": "Closed Lips",
    "CH": "Alveopalatal Stop",
    "JH": "Alveopalatal Stop",
    "D": "Alveolar",
    "T": "Alveolar",
    "DH": "Tongue Between",
    "TH": "Tongue Between",
    # front and mid vowels
    "EH": "Relaxed Front",
    "IH": "Relaxed Front",
    "IY": "Relaxed Front",
    "ER": "Neutral Mid",
    "UH": "Neutral Mid",
    "EY": "Slight Smile",
    "F": "Inner Lip Bite",
    "V": "Inner Lip Bite",
    "G": "Open Back",
    "K": "Open Back",
    "NG": "Open Back",
    "HH": "Open Mid",
    "L": "Tongue Front",
    "N": "Alveolar",
    # rounded shapes
    "OW": "Round to Closed",
    "OY": "Round to Smile",
    "R": "Rounded Mid",
    "S": "Slight Smile",
    "Z": "Slight Smile",
    "SH": "Rounded Front",
    "ZH": "Rounded Front",
    "UW": "Rounded Small",
    "W": "Rounded Lips",
    "Y": "Rounded Smile",
    # silence marker
    "SIL": "Silent",
}


def strip_stress(phoneme):
    """Return `phoneme` with every digit character removed, e.g. "AH0" -> "AH".

    Non-digit characters are kept in their original order.
    """
    kept = (symbol for symbol in phoneme if not symbol.isdigit())
    return ''.join(kept)

# Function to extract phonemes and map them to visemes
def text_to_visemes(sentence):
    """Map each word of `sentence` to its phonemes and the matching visemes.

    The sentence is lower-cased and split on whitespace. For every word the
    first pronunciation returned by `pronouncing.phones_for_word` is used; its
    phonemes have digits removed via `strip_stress` and are then looked up in
    `phoneme_to_viseme` (phonemes missing from the table map to 'Unknown').

    Args:
        sentence: Text to convert.

    Returns:
        A list with one `(word, phonemes, visemes)` tuple per word, where
        `phonemes` and `visemes` are lists of strings. Words without any
        known pronunciation yield `(word, ['Unknown'], ['Unknown'])`, so every
        entry has the same three-field shape and can be unpacked uniformly.
    """
    viseme_list = []

    for word in sentence.lower().split():
        pronunciations = pronouncing.phones_for_word(word)
        if pronunciations:
            phonemes = [strip_stress(symbol) for symbol in pronunciations[0].split()]
            word_visemes = [phoneme_to_viseme.get(symbol, 'Unknown') for symbol in phonemes]
        else:
            # Keep the 3-tuple shape for unknown words; a 2-tuple here would
            # break callers that unpack (word, phonemes, visemes).
            phonemes = ['Unknown']
            word_visemes = ['Unknown']
        viseme_list.append((word, phonemes, word_visemes))

    return viseme_list

# Convert the transcription into per-word phoneme and viseme lists.
visemes = text_to_visemes(transcription)

# Print one report per word: the word, its phonemes, and the mapped visemes.
for word, phoneme, viseme_group in visemes:
    report = f"Word: {word}\nPhoneme: {phoneme}, Visemes: {viseme_group}\n"
    print(report)


Word: the
Phoneme: ['DH', 'AH'], Visemes: ['Tongue Between', 'Open Back']

Word: stale
Phoneme: ['S', 'T', 'EY', 'L'], Visemes: ['Slight Smile', 'Alveolar', 'Slight Smile', 'Tongue Front']

Word: smell
Phoneme: ['S', 'M', 'EH', 'L'], Visemes: ['Slight Smile', 'Closed Lips', 'Relaxed Front', 'Tongue Front']

Word: of
Phoneme: ['AH', 'V'], Visemes: ['Open Back', 'Inner Lip Bite']

Word: old
Phoneme: ['OW', 'L', 'D'], Visemes: ['Round to Closed', 'Tongue Front', 'Alveolar']

Word: beer
Phoneme: ['B', 'IH', 'R'], Visemes: ['Closed Lips', 'Relaxed Front', 'Rounded Mid']

Word: lingers
Phoneme: ['L', 'IH', 'NG', 'G', 'ER', 'Z'], Visemes: ['Tongue Front', 'Relaxed Front', 'Open Back', 'Open Back', 'Neutral Mid', 'Slight Smile']

Word: it
Phoneme: ['IH', 'T'], Visemes: ['Relaxed Front', 'Alveolar']

Word: takes
Phoneme: ['T', 'EY', 'K', 'S'], Visemes: ['Alveolar', 'Slight Smile', 'Open Back', 'Slight Smile']

Word: heat
Phoneme: ['HH', 'IY', 'T'], Visemes: ['Open Mid', 'Relaxed Front', 'Alveol