In [17]:
from glob import glob
from tqdm import tqdm
import pandas as pd
import json
import re
import os

from arpa_to_ipa import arpa_to_ipa

In [18]:
def load_cmu_lexicon(path):
    """Load a CMU-style pronunciation lexicon into a DataFrame.

    Every line is split on whitespace: the first token is the word and the
    remaining tokens are its ARPAbet phonemes. Blank lines and lines that
    contain a word but no phonemes are skipped.

    Args:
        path: Path to the UTF-8 encoded lexicon text file.

    Returns:
        pandas.DataFrame with the columns ``word`` (first token, upper-cased),
        ``arpa`` (phonemes joined by single spaces) and ``ipa`` (the output of
        ``arpa_to_ipa(arpa)`` with all spaces removed).
    """
    entries = []
    with open(path, "r", encoding="utf-8") as f:
        # Stream the file instead of materialising every line up front.
        for line in f:
            tokens = line.split()
            # A usable entry needs a word plus at least one phoneme. This also
            # covers blank lines, where indexing tokens[0] would raise IndexError.
            if len(tokens) < 2:
                continue
            entries.append([tokens[0].upper(), " ".join(tokens[1:])])

    lexicon = pd.DataFrame(entries, columns=["word", "arpa"])
    lexicon["ipa"] = lexicon["arpa"].apply(lambda arpa: arpa_to_ipa(arpa).replace(" ", ""))

    return lexicon

def load_cam_lexicon(path):
    """Load the Cambridge lexicon JSON as a mapping of word -> set of transcriptions.

    The JSON file is expected to hold an object whose values each carry a
    ``"us"`` and a ``"uk"`` entry; the two are concatenated and de-duplicated.
    NOTE(review): assumes both entries are lists of strings and are always
    present (a missing key raises KeyError) - confirm against the data file.

    Args:
        path: Path to the UTF-8 encoded JSON file.

    Returns:
        dict mapping each key of the JSON object to the set built from
        ``value["us"] + value["uk"]``.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(path, "r", encoding="utf-8") as f:
        raw_entries = json.load(f)

    # Build a fresh mapping rather than rewriting the dict while iterating it.
    return {
        word: set(entry["us"] + entry["uk"])
        for word, entry in raw_entries.items()
    }

In [19]:
# ELSA lexicon files to merge; each one is parsed by load_cmu_lexicon.
elsa_paths = [
    "/data/codes/apa/kaldi/g2p/lexicon/processed/elsa-lexicon-9.txt",
    "/data/codes/apa/kaldi/g2p/lexicon/processed/elsa-lexicon-10.txt",
    "/data/codes/apa/kaldi/g2p/lexicon/processed/elsa-lexicon-12.txt",
]
elsa_lexicon_9, elsa_lexicon_10, elsa_lexicon_12 = [
    load_cmu_lexicon(elsa_path) for elsa_path in elsa_paths
]

# Stack the three frames row-wise into one combined lexicon.
elsa_lexicon = pd.concat([elsa_lexicon_9, elsa_lexicon_10, elsa_lexicon_12])

# Cambridge lexicon: word -> set of transcriptions, used below for validation.
path = "/data/codes/apa/kaldi/g2p/lexicon/processed/cambridge-lexicon.json"
cam_lexicon = load_cam_lexicon(path)

In [20]:
# Renumber the rows of the concatenated frame, keep only the three lexicon
# columns, and drop rows that are exact duplicates.
elsa_lexicon = (
    elsa_lexicon
    .reset_index(drop=True)
    .loc[:, ["word", "arpa", "ipa"]]
    .drop_duplicates()
)

In [21]:
# Keep only the ELSA entries whose IPA form is listed for the same word in the
# Cambridge lexicon.
words = []
valid_lexicon = []

# oov:   rows whose word is absent from the Cambridge lexicon
# total: rows whose word is present in the Cambridge lexicon
# count: rows whose word is present but whose IPA is not among its entries
oov = count = total = 0

rows = zip(elsa_lexicon["word"], elsa_lexicon["arpa"], elsa_lexicon["ipa"])
for word, arpa, ipa in tqdm(rows, total=len(elsa_lexicon)):
    key = word.lower()  # Cambridge keys are looked up in lower case

    if key not in cam_lexicon:
        oov += 1
        continue

    total += 1
    if ipa in cam_lexicon[key]:
        valid_lexicon.append([word, arpa])
        words.append(word)
    else:
        count += 1

print(len(set(words)))
print(count, total, oov)

 39%|███▉      | 16572/42092 [00:00<00:00, 165713.44it/s]

100%|██████████| 42092/42092 [00:00<00:00, 172898.49it/s]

12934
5700 19588 22504





In [38]:
# Turn the accepted [word, arpa] pairs into a two-column frame.
lexicon_columns = ["word", "arpa"]
lexicon = pd.DataFrame(data=valid_lexicon, columns=lexicon_columns)

In [39]:
# Order the entries by word and renumber the rows, keeping only the two
# columns that are written to the output file.
lexicon = (
    lexicon
    .sort_values(by="word")
    .reset_index(drop=True)
    .loc[:, ["word", "arpa"]]
)

In [40]:
# Write the validated lexicon as one "<word>TAB<arpa>" line per entry.
path = "/data/codes/apa/kaldi/g2p/lexicon/elsa-lexicon.txt"
with open(path, "w", encoding="utf-8") as f:
    for word, arpa in zip(lexicon["word"], lexicon["arpa"]):
        f.write(f'{word}\t{arpa}\n')