In [None]:
import misaki
import re
from langdetect import detect
from misaki import vi, zh, espeak
# Tell misaki's espeak wrapper where the eSpeak NG shared library lives.
# Raw string: the backslashes in the Windows path must be taken literally
# (`\P`, `\e` and `\l` are not valid escape sequences in a normal literal).
espeak.EspeakWrapper.set_library(r'C:\Program Files\eSpeak NG\libespeak-ng.dll')

def g2p(text, g2p_dict, detect_language=None):
    """Convert ``text`` to phonemes, re-phonemizing bracketed segments per language.

    The "main" engine is run on the whole text first. Every ``[...]`` span in
    its phoneme output is then handed to a language detector and replaced by
    the output of the matching engine.

    Args:
        text: Input text to phonemize.
        g2p_dict: Mapping with the keys "main", "zh", "ja" and "fallback".
            Each value is a callable taking a string; "main" must return a
            ``(phonemes, tokens)`` pair, the others a sequence whose first
            item is the phoneme string.
        detect_language: Optional callable ``segment -> language code``.
            Defaults to the module-level ``detect`` (langdetect).

    Returns:
        The phoneme string with every bracketed segment replaced.
    """
    if detect_language is None:
        detect_language = detect

    phonemes, _tokens = g2p_dict["main"](text)

    def _convert(match):
        """Phonemize one bracketed segment with the engine for its language."""
        segment = match.group(1)
        try:
            # Detect the language of the segment. This is done manually because
            # espeak's own language detection is not reliable.
            lang = detect_language(segment)
        except Exception:
            # Detection can fail (e.g. on segments it cannot classify); such
            # segments are routed to the fallback engine instead of aborting.
            lang = ""

        if lang.startswith("zh"):
            engine = g2p_dict["zh"]
        elif lang.startswith("ja"):
            # Previously this branch tested "zh" a second time and was unreachable.
            engine = g2p_dict["ja"]
        else:
            engine = g2p_dict["fallback"]
        return engine(segment)[0]

    # A callable replacement is inserted literally (no backslash processing),
    # and each bracketed span is visited exactly once.
    return re.sub(r'\[(.*?)\]', _convert, phonemes)

# One G2P engine per role that g2p() looks up in g2p_dict.
main_g2p = vi.VIG2P()
zh_g2p = zh.ZHG2P()
ja_g2p = espeak.EspeakG2P(language='ja')
fallback_vi = espeak.EspeakG2P(language='vi')

# Keys are the names g2p() indexes with: "main", "zh", "ja", "fallback".
g2p_dict = dict(
    main=main_g2p,
    zh=zh_g2p,
    ja=ja_g2p,
    fallback=fallback_vi,
)

In [16]:
# Alternative mixed-language sample input:
#text = 'Hello 你好 xin chào にっぽん'
text = 'hao hảo háo hào hạo a á â o ơ u ư e ê'
phonemes = g2p(text, g2p_dict)

# Plain espeak Vietnamese engine, used here only for side-by-side comparison.
espeak_vn = espeak.EspeakG2P(language='vi')

# One line each: the input, the g2p() result, the raw espeak result.
print(text, phonemes, espeak_vn(text)[0], sep="\n")

hao hảo háo hào hạo a á â o ơ u ư e ê
haw1 haw4 haw5 haw2 haw6 a1 a5 ɤ̆1 ɔ1 ɤ1 u1 ɯ1 ɛ1 e1
hˈaːw hˈaː4w hˈaːɜw hˈaː2w hˈaː6w ˈaː ˈaːɜ ˈə ˈɔ ˈəː ˈu ˈy ˈɛ ˈe


In [8]:
# Symbol inventory, grouped by category. The position of a character in the
# concatenated `symbols` list below is used as its integer id.
_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
_extend = "∫̆ăη͡123456"

# Export all symbols: the pad symbol first, then every group split into
# individual characters.
symbols = [_pad, *_punctuation, *_letters, *_letters_ipa, *_extend]

# Map symbol -> index. A character that occurs more than once (the apostrophe
# in _letters_ipa) keeps the index of its last occurrence, so len(dicts) is
# smaller than len(symbols).
dicts = {symbol: index for index, symbol in enumerate(symbols)}

len(dicts)


188

In [51]:
# Phoneme symbols to check against `dicts` in the following cell. Entries may
# span several characters; the check below works per individual character.
vi_syms = ['ɯəj', 'ɤ̆j', 'ʷiə', 'ɤ̆w', 'ɯəw', 'ʷet', 'iəw', 'uəj', 'ʷen', 'tʰw', 'ʷɤ̆', 'ʷiu', 'kwi', 'ŋ͡m', 'k͡p', 'cw', 'jw', 'uə', 'eə', 'bw', 'oj', 'ʷi', 'vw', 'ăw', 'ʈw', 'ʂw', 'aʊ', 'fw', 'ɛu', 'tʰ', 'tʃ', 'ɔɪ', 'xw', 'ʷɤ', 'ɤ̆', 'ŋw', 'ʊə', 'zi', 'ʷă', 'dw', 'eɪ', 'aɪ', 'ew', 'iə', 'ɣw', 'zw', 'ɯj', 'ʷɛ', 'ɯw', 'ɤj', 'ɔ:', 'əʊ', 'ʷa', 'mw', 'ɑ:', 'hw', 'ɔj', 'uj', 'lw', 'ɪə', 'ăj', 'u:', 'aw', 'ɛj', 'iw', 'aj', 'ɜ:', 'kw', 'nw', 't∫', 'ɲw', 'eo', 'sw', 'tw', 'ʐw', 'iɛ', 'ʷe', 'i:', 'ɯə', 'dʒ', 'ɲ', 'θ', 'ʌ', 'l', 'w', '1', 'ɪ', 'ɯ', 'd', '∫', 'p', 'ə', 'u', 'o', '3', 'ɣ', '!', 'ð', 'ʧ', '6', 'ʒ', 'ʐ', 'z', 'v', 'g', 'ă', 'æ', 'ɤ', '2', 'ʤ', 'i', '.', 'ɒ', 'b', 'h', 'n', 'ʂ', 'ɔ', 'ɛ', 'k', 'm', '5', ' ', 'c', 'j', 'x', 'ʈ', ',', '4', 'ʊ', 's', 'ŋ', 'a', 'ʃ', '?', 'r', ':', 'η', 'f', ';', 'e', 't', "'"]

In [53]:
# Print and collect every character used in vi_syms that has no id in dicts.
# Characters are reported once per occurrence, not de-duplicated.
unk_char = []
for char in "".join(vi_syms):
    if char not in dicts:
        print(char)
        unk_char.append(char)

In [55]:
test = "ʷɤ̆ ʷă k͡p"

# Look up the id of every individual character; combining marks are separate
# characters and therefore get their own id.
list(map(dicts.__getitem__, test))

[165, 140, 179, 16, 165, 180, 16, 53, 182, 58]