In [1]:
from glob import glob
from tqdm import tqdm
import pandas as pd
import json
import re
import os

from arpa_to_ipa import arpa_to_ipa

In [2]:
def load_cmu_lexicon(path):
    """Load a whitespace-delimited pronunciation lexicon into a DataFrame.

    Each useful line has the form ``WORD TOKEN1 TOKEN2 ...``: the first
    whitespace-separated token is the word and the remaining tokens are its
    pronunciation (the string later handed to ``arpa_to_ipa``). Blank lines
    and lines that carry a word but no pronunciation are skipped.

    Args:
        path: Path to the UTF-8 encoded lexicon text file.

    Returns:
        pandas.DataFrame with three columns:
            word: the first token, upper-cased.
            arpa: the remaining tokens joined by single spaces.
            ipa:  ``arpa_to_ipa(arpa)`` with every space removed.
    """
    entries = []
    with open(path, "r", encoding="utf-8") as f:
        # Iterate the handle directly so the whole file is never held as a
        # list of lines.
        for line in f:
            tokens = line.split()
            # Fewer than two tokens means either a blank/whitespace-only
            # line (indexing tokens[0] would raise IndexError) or a word
            # with no pronunciation; neither yields a usable entry.
            if len(tokens) < 2:
                continue
            entries.append([tokens[0].upper(), " ".join(tokens[1:])])

    lexicon = pd.DataFrame(entries, columns=["word", "arpa"])
    lexicon["ipa"] = lexicon["arpa"].apply(
        lambda arpa: arpa_to_ipa(arpa).replace(" ", "")
    )

    return lexicon

def load_cam_lexicon(path):
    """Load the Cambridge lexicon JSON and collapse each entry to a set.

    The JSON document must be an object whose values each provide a ``"us"``
    and a ``"uk"`` sequence (presumably lists of IPA strings -- confirm
    against the file that produces this JSON).

    Args:
        path: Path to the UTF-8 encoded JSON file.

    Returns:
        dict mapping every top-level key of the JSON object to the set of
        items found in its ``"us"`` and ``"uk"`` sequences combined.

    Raises:
        KeyError: If an entry has no ``"us"`` or no ``"uk"`` field.
    """
    # Use a context manager so the file handle is closed deterministically
    # rather than whenever the garbage collector gets to it.
    with open(path, "r", encoding="utf-8") as f:
        raw = json.load(f)

    # Build a fresh mapping instead of overwriting values of the dict that is
    # being iterated.
    return {key: set(value["us"] + value["uk"]) for key, value in raw.items()}

In [3]:
# CMU-format lexicon: whitespace-separated text parsed by load_cmu_lexicon.
path = "/data/codes/apa/kaldi/g2p/data/output.txt"
cmu_lexicon = load_cmu_lexicon(path=path)

# Cambridge lexicon: JSON parsed by load_cam_lexicon into word -> set.
path = "/data/codes/apa/kaldi/g2p/lexicon/processed/cambridge-lexicon.json"
cam_lexicon = load_cam_lexicon(path=path)

In [4]:
words = []
valid_lexicon = []

# oov:   CMU entries whose (lower-cased) word has no Cambridge entry
# total: CMU entries whose word does have a Cambridge entry
# count: of those, entries whose IPA is not in the Cambridge set
oov, count, total = 0, 0, 0
for index in tqdm(cmu_lexicon.index):
    word = cmu_lexicon.at[index, "word"]
    ipa = cmu_lexicon.at[index, "ipa"]
    arpa = cmu_lexicon.at[index, "arpa"]

    # CMU words are upper-cased on load; the Cambridge lookup uses lower case.
    key = word.lower()
    if key in cam_lexicon:
        total += 1
        if ipa in cam_lexicon[key]:
            # Pronunciation confirmed by the Cambridge lexicon: keep it.
            valid_lexicon.append([word, arpa])
            words.append(word)
        else:
            count += 1
    else:
        oov += 1

# Number of distinct words kept, then the mismatch / in-vocabulary / OOV tallies.
print(len(set(words)))
print(count, total, oov)

 16%|█▌        | 15134/96197 [00:00<00:00, 151331.85it/s]

100%|██████████| 96197/96197 [00:00<00:00, 173289.94it/s]

35945
52507 96117 80





In [38]:
# Turn the accepted [word, arpa] pairs into a two-column frame.
lexicon = pd.DataFrame(data=valid_lexicon, columns=["word", "arpa"])

In [39]:
# Order rows by word and renumber them from 0; drop=True discards the old
# index instead of materialising it as an extra column.
lexicon = (
    lexicon
    .sort_values(by="word")
    .reset_index(drop=True)
    .loc[:, ["word", "arpa"]]
)

In [40]:
path = "/data/codes/apa/kaldi/g2p/lexicon/cambridge-lexicon.txt"
with open(path, "w", encoding="utf-8") as fout:
    # One entry per line: word, a tab, then the pronunciation string.
    for word, arpa in zip(lexicon["word"], lexicon["arpa"]):
        fout.write(f'{word}\t{arpa}\n')