In [None]:
import pandas as pd
import numpy as np
import json
import re

In [None]:
# ARPAbet -> IPA tables, grouped by phoneme class. Keys are ARPAbet symbols
# without stress digits; values are the IPA strings used throughout this
# notebook. The tables are merged into a single lookup further down.
# NOTE: `monopthongs` / `dipthongs` are misspellings of "monophthongs" /
# "diphthongs"; the names are left as-is because later cells reference them.

# Simple (single-quality) vowels.
# NOTE(review): EH is often transcribed as 'ɛ' in IPA; 'e' here looks like a
# deliberate broad transcription (the confusable-pair list also uses /e/) — confirm.
monopthongs = {
    'AO': 'ɔ',
    'AA': 'ɑ',
    'IY': 'i',
    'UW': 'u',
    'EH': 'e',
    'IH': 'ɪ',
    'UH': 'ʊ',
    'AH': 'ʌ',
    'AX': 'ə',
    'AE': 'æ',
}

# Gliding vowels; each maps to a two-character IPA string.
dipthongs = {
    'EY': 'eɪ',
    'AY': 'aɪ',
    'OW': 'oʊ',
    'AW': 'aʊ',
    'OY': 'ɔɪ'
}

# R-colored vowel, written as a vowel followed by 'r'.
r_colored_vowels = {
    'ER': 'ɜr'
}

# Consonant classes follow.
stops = {
    'P': 'p',
    'B': 'b',
    'T': 't',
    'D': 'd',
    'K': 'k',
    'G': 'g',
}

affricates = {
    'CH': 'tʃ',
    'JH': 'dʒ',
}

fricatives = {
    'F': 'f',
    'V': 'v',
    'TH': 'θ',
    'DH': 'ð',
    'S': 's',
    'Z': 'z',
    'SH': 'ʃ',
    'ZH': 'ʒ',
    'HH': 'h',
}

nasals = {
    'M': 'm',
    'N': 'n',
    'NG': 'ŋ'
}

liquids = {
    'L': 'l',
    'R': 'r'
}

semivowels = {
    'W': 'w',
    'Y': 'j'
}

# Flatten every phoneme-class table into one ARPAbet -> IPA lookup.
# Tables are unpacked in the same order they were previously merged, so a
# later table would still win if a key were ever defined twice.
arpa_to_ipa_lookup = {
    **monopthongs,
    **dipthongs,
    **r_colored_vowels,
    **stops,
    **affricates,
    **fricatives,
    **nasals,
    **liquids,
    **semivowels,
}

def arpa_to_ipa(arpa):
    """Convert a whitespace-separated ARPAbet transcription to IPA.

    Args:
        arpa: String of ARPAbet symbols (e.g. ``"HH AH L OW"``). Symbols must
            be keys of ``arpa_to_ipa_lookup``.

    Returns:
        The corresponding IPA symbols joined by single spaces. An empty or
        whitespace-only input yields an empty string.

    Raises:
        KeyError: If a symbol is not present in ``arpa_to_ipa_lookup``.
    """
    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing blanks, so stray spaces no longer produce an empty
    # token (which used to raise KeyError('')).
    return ' '.join(arpa_to_ipa_lookup[phoneme] for phoneme in arpa.split())


In [None]:
# Load the project's phone dictionary from JSON. It is used below to check
# which ARPAbet symbols it contains.
path = "/data/codes/apa/train/resources/phone_dict.json"
# Use a context manager so the file handle is closed deterministically
# instead of being left open until garbage collection.
with open(path, "r", encoding="utf-8") as phone_dict_file:
    phone_dict = json.load(phone_dict_file)
print(phone_dict)

In [None]:
# Report every ARPAbet symbol from the lookup that phone_dict does not contain.
for arpa_symbol in arpa_to_ipa_lookup:
    # Replace any digit with a space and trim before the membership test.
    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    arpa_symbol = re.sub(r"\d", " ", arpa_symbol).strip()
    if arpa_symbol not in phone_dict:
        print(arpa_symbol)

In [None]:
# Invert the ARPAbet -> IPA lookup into an IPA -> ARPAbet lookup.
ipa_to_arpa = {}
for arpa_symbol, ipa_symbol in arpa_to_ipa_lookup.items():
    # The inverse is keyed by IPA, so the uniqueness check must be on the IPA
    # value. Checking the ARPAbet key (as before) could never fire and would
    # let two ARPAbet symbols sharing an IPA string silently overwrite each other.
    assert ipa_symbol not in ipa_to_arpa, (
        f"IPA symbol {ipa_symbol!r} maps to both "
        f"{ipa_to_arpa.get(ipa_symbol)!r} and {arpa_symbol!r}"
    )

    ipa_to_arpa[ipa_symbol] = arpa_symbol

print(ipa_to_arpa)

In [None]:
# Pairs of IPA symbols, each written between slashes. The code that consumes
# this list strips the slashes and records every pair in both directions, so
# the order inside a tuple does not matter.
# NOTE(review): judging by the name, each pair is treated as "same
# pronunciation" (i.e. confusable) — confirm the intended meaning with the
# consumer of the exported JSON.
# Some pairs appear more than once (('/f/', '/p/'), ('/k/', '/g/'),
# ('/t/', '/d/'), ('/n/', '/ŋ/')); this is harmless because the per-symbol
# lists are converted to sets afterwards.
same_pron_ipa_list = [
    ('/ɪ/', '/i/'),
    ('/e/', '/ɪ/'),
    ('/e/', '/eɪ/'),
    ('/æ/', '/ʌ/'),
    ('/oʊ/', '/ɔ/'),
    ('/ɑ/', '/oʊ/'),
    ('/æ/', '/e/'),
    ('/ɑ/', '/ɜr/'),
    ('/æ/', '/ɑ/'),
    ('/ɑ/', '/ɔ/'),
    ('/oʊ/', '/aʊ/'),
    ('/b/', '/v/'),
    ('/b/', '/p/'),
    ('/n/', '/ŋ/'),
    ('/l/', '/r/'),
    ('/tʃ/', '/t/'),
    ('/s/', '/ʃ/'),
    ('/f/', '/v/'),
    ('/f/', '/h/'),
    ('/f/', '/θ/'),
    ('/s/', '/θ/'),
    ('/ð/', '/z/'),
    ('/dʒ/', '/z/'),
    ('/d/', '/dʒ/'),
    ('/f/', '/p/'),
    ('/tʃ/', '/dʒ/'),
    ('/tʃ/', '/ʃ/'),
    ('/d/', '/ð/'),
    ('/t/', '/θ/'),
    ('/f/', '/p/'),
    ('/k/', '/g/'),
    ('/t/', '/d/'),
    ('/v/', '/w/'),
    ('/g/', '/w/'),
    ('/h/', '/r/'),
    ('/r/', '/w/'),
    ('/dʒ/', '/j/'),
    ('/k/', '/g/'),
    ('/m/', '/n/'),
    ('/t/', '/d/'),
    ('/s/', '/z/'),
    ('/n/', '/ŋ/'),
]
# Build a symmetric lookup: IPA symbol -> set of IPA symbols it is paired with.
same_pron_ipa_dict = {}
for raw_left, raw_right in same_pron_ipa_list:
    # Drop the surrounding slashes from each symbol.
    left = re.sub(r"\/", "", raw_left).strip()
    right = re.sub(r"\/", "", raw_right).strip()

    # Every symbol must be convertible back to ARPAbet later on.
    assert right in ipa_to_arpa
    assert left in ipa_to_arpa

    # Record the pair in both directions.
    same_pron_ipa_dict.setdefault(left, []).append(right)
    same_pron_ipa_dict.setdefault(right, []).append(left)


# Collapse repeated partners (the pair list contains duplicates).
same_pron_ipa_dict = {
    symbol: set(partners) for symbol, partners in same_pron_ipa_dict.items()
}

print(same_pron_ipa_dict)

In [None]:
# Translate the IPA confusion table into ARPAbet symbols on both sides.
same_pron_arpa_dict = {}
for ipa_symbol, ipa_partners in same_pron_ipa_dict.items():
    arpa_symbol = ipa_to_arpa[ipa_symbol]
    # The partners come from a set, whose iteration order for strings changes
    # between interpreter runs (hash randomization). Sorting makes the lists,
    # and therefore the exported JSON, reproducible.
    arpa_partners = sorted(ipa_to_arpa[partner] for partner in ipa_partners)
    same_pron_arpa_dict.setdefault(arpa_symbol, []).extend(arpa_partners)

print(same_pron_arpa_dict)


In [None]:
# Persist the ARPAbet confusion table as pretty-printed UTF-8 JSON.
path = "/data/codes/apa/train/resources/same_pron_arpa_dict.json"
with open(path, "w", encoding="utf-8") as out_file:
    # ensure_ascii=False keeps non-ASCII characters unescaped in the file.
    json.dump(same_pron_arpa_dict, out_file, indent=4, ensure_ascii=False)