In [None]:
!pip install allosaurus

# Input Variables
change these according to what you need.
- Max Duration: the amount of time a phoneme can go before defaulting to the "basis" mouth shape
- Filepath: the filepath of the .wav file

In [38]:
MAX_DURATION = 0.5
FILEPATH = "fortnite.wav"

# Initialization
just some functions and variables we need to declare before running the script. i put it in another cell just so you don't have to rerun import code if you want to process another file.
not all of this is essential for blender usage

In [39]:
import pandas as pd
import numpy as np
import math
from allosaurus.app import read_recognizer
model = read_recognizer("eng2102")

def replace(s, r, leading=""):
    return [r.get(x, x) for x in s]

def findPhonemeGroup(phoneme: str):
    phonemeList =   [["e", "æ", "eː", "ɻ", "ʁ", "ɹ̩", "ɻ̩", "a", "ɒ", "ɑ", "ʌ", "x", "ɾ", "ɾʲ", "ɪ", "ɛ", "v̞ʲ", "ɕ", "ɛ̃", "h"], # aei 
                    ["k", "d", "d͡ʒ", "kʰ", "dʒ", "k̟ʲ", "dʲ", "t", "tʰ", "tʂ", "tʲ", "t͡ʃʲ", "t̪", "t͡ʃ", "ts", "tɕ", "tɕʰ", "tʂʰ"], # cdkg 
                    ["b", "m", "p", "mʲ", "b̞", "b̤", "pʲ"], # m 
                    ["n", "ŋ", "ɡ", "ɴ", "ɳ", "ɲ", "ŋ̟"], # n 
                    ["i", "j", "s", "ʃ", "z", "iː", "c", "zʲ", "s̪", "ʂ", "ʒ", "ɨ", "ʐ"], # szeay  
                    ["l", "ð", "ɔ", "lʲ", "l̪", "θ"], # l/th 
                    ["u", "o", "w", "uː", "uə", "œ"], # oo 
                    ["f", "v", "ɯ", "y", "ʏ"], # fv 
                    ["r", "ɹ", "ɹ̩"],# r 
                    ["ə", "ʊ"]] # o      
    for n, i in enumerate(phonemeList):
        if phoneme in i:
            return n
    print(phoneme + " not found. Please place this phoneme in the appropriate mouth shape group.")

phoneme_groups = ["aei", "cdkg", "m", "n", "szeay", "l/th", "oo", "fv", "r", "o"] # should match the order in which the phonemeList is ordered

phoneme_dict = {
    "a": "uh",
    "aː": "uh",
    "b": "b",
    "d": "d",
    "d̠": "d",
    "e": "e",
    "eː": "er",
    "e̞": "e",
    "f": "f",
    "h": "h",
    "ɪ": "i",
    "iː": "ee",
    "j": "y",
    "k": "k",
    "ɡ": "g",
    "kʰ": "kh",
    "l": "l",
    "m": "m",
    "n": "n",
    "o": "o",
    "oː": "o",
    "p": "p",
    "pʰ": "p",
    "r": "r",
    "ɹ̩": "r",
    "s": "s",
    "t": "t",
    "tʰ": "t",
    "t̠": "t",
    "u": "oo",
    "uː": "oo",
    "v": "v",
    "w": "w",
    "x": "x",
    "z": "z",
    "æ": "ah",
    "ð": "th",
    "øː": "uu",
    "ŋ": "ng",
    "ɐ": "uh",
    "ɐː": "uh",
    "ɑ": "ah",
    "ɑː": "ah",
    "ɔ": "aw",
    "ɔː": "aw",
    "ɘ": "uh",
    "ə": "eh",
    "əː": "eh",
    "ɛ": "eh",
    "ɛː": "eh",
    "ɜː": "er",
    "ɹ": "r",
    "ɻ": "r",
    "ʃ": "sh",
    "ʉ": "oo",
    "ʉː": "oo",
    "ʊ": "oo",
    "ʌ": "oo",
    "ʍ": "",
    "ʒ": "j",
    "ʔ": "tt",
    "θ": "th",
    "t͡ʃ": "tsh", #blended cases
    "d͡ʒ": "j",
}

# Process

In [41]:
time = model.recognize(FILEPATH, "eng", timestamp=True, emit=1)

dtypes = {"StartTime": "float64", "Duration": "float64", "Phoneme": "object"}
col_names = ["StartTime", "Duration", "Phoneme"] 

time_df = pd.DataFrame([i.split() for i in time.split("\n")], columns=col_names)


# processing data
time_df["StartTime"] = time_df["StartTime"].astype("float")
time_df["Duration"] = time_df["StartTime"].diff().shift(-1)
time_df["Grapheme"] = replace(time_df["Phoneme"], phoneme_dict, "-")
time_df["GroupNo"] = time_df["Phoneme"].apply(findPhonemeGroup)
time_df["Group"] = time_df["GroupNo"].apply(lambda x: phoneme_groups[int(x)])

# creating closed mouth keyframes for phonemes that have a duration longer than the given max duration
mask = np.where(time_df["Duration"] > MAX_DURATION)
long = time_df.iloc[mask].copy()
long["StartTime"] = long["StartTime"] + MAX_DURATION - 0.1
long["Duration"] = long["Duration"] - MAX_DURATION
long["Phoneme"] = '. '
long["Grapheme"] = '. '
long["GroupNo"] = -1
long["Group"] = "close"

time_df.loc[time_df["Duration"] > MAX_DURATION, "Duration"] = MAX_DURATION

time_df = pd.concat([time_df, long]).sort_values(["StartTime"]).reset_index(drop=True)

# initializing the first keyframe as a closed mouth keyframe
time_df.loc[-1] = [0, time_df.loc[0]["StartTime"], '.', '.', -1, 'close']
time_df.index += 1
time_df = time_df.sort_index()

# adding last keyframe as a closed mouth keyframe
time_df["Duration"].loc[time_df.shape[0]-1] = MAX_DURATION # setting last phoneme's duration
time_df.loc[time_df.shape[0]] = [time_df.loc[time_df.shape[0]-1]["StartTime"] + MAX_DURATION, 1.0, '.', '.', -1, 'close']

time_df.to_csv("test.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  time_df["Duration"].loc[time_df.shape[0]-1] = MAX_DURATION # setting last phoneme's duration


follow similar procedure to [this video](https://www.youtube.com/watch?v=Kuw9xoS5wxw&list=PLg4tPAeoYVcUnzH8xHBfFhrtiy2rvjxaH&index=1)