In [None]:
import json
import os
import pandas as pd
from glob import glob
from tqdm import tqdm
import re

In [None]:
# Directory holding one scraped Cambridge-dictionary JSON file per entry.
root_path = "/data/codes/apa/kaldi/g2p/lexicon/raw/cambridge-dictionary"

# glob() yields paths in arbitrary, filesystem-dependent order. Sorting makes
# the processing order (and therefore the key order of the generated lexicon
# files, and which file wins when two share the same "word") reproducible.
files = sorted(glob(f'{root_path}/*.json'))


In [None]:
# Symbol table applied by convert2prep(), in insertion order.
# Almost every entry maps a symbol to itself; the only real rewrites are
# "ɝ" -> "ɜr", "ɚ" -> "ər" and "t̬" -> "ɾ".
cam2prep = {
    # vowels (unchanged)
    "ɔ": "ɔ", "ɑ": "ɑ", "i": "i", "u": "u", "e": "e",
    "ɪ": "ɪ", "ʊ": "ʊ", "ʌ": "ʌ", "ə": "ə", "æ": "æ",
    "eɪ": "eɪ", "aɪ": "aɪ", "oʊ": "oʊ", "aʊ": "aʊ", "ɔɪ": "ɔɪ",
    # r-coloured vowels are expanded to vowel + "r"
    "ɝ": "ɜr",
    "ɚ": "ər",
    # consonants (unchanged)
    "p": "p", "b": "b", "t": "t", "d": "d", "k": "k", "g": "g",
    "tʃ": "tʃ", "dʒ": "dʒ",
    "f": "f", "v": "v", "θ": "θ", "ð": "ð",
    "s": "s", "z": "z", "ʃ": "ʃ", "ʒ": "ʒ", "h": "h",
    "m": "m", "n": "n", "ŋ": "ŋ", "l": "l",
    # "t" carrying the combining mark becomes the flap symbol
    "t̬": "ɾ",
    "j": "j", "w": "w", "ʔ": "ʔ",
}


def convert2prep(ipa):
    """Rewrite ``ipa`` using the ``cam2prep`` table.

    Each table entry is applied as a plain ``str.replace`` over the whole
    string, one entry after another in the table's insertion order.

    Args:
        ipa: Transcription string to convert.

    Returns:
        The string after all replacements have been applied.
    """
    for source in cam2prep:
        ipa = ipa.replace(source, cam2prep[source])
    return ipa

In [None]:
# Replacements applied by norm_us_uk(), in insertion order.
# Order matters because each rule is a sequential str.replace: "ɹ" has to be
# rewritten to "r" BEFORE the "ər" rule runs, otherwise "əɹ" would end up as
# "ər" while the equivalent "ər" collapses to "ɝ".
camdict = {
    "ɒ": "ɑ",
    "ɚ": "ɝ",
    "ɹ": "r",
    "ər": "ɝ",
    "t̬": "ɾ",
}


def norm_us_uk(ipa):
    """Fold variant symbols in ``ipa`` onto a single convention.

    Applies every ``camdict`` entry as a plain ``str.replace`` over the whole
    string, one after another in the dict's insertion order.

    Args:
        ipa: Transcription string to normalise.

    Returns:
        The string after all replacements have been applied.
    """
    for key, value in camdict.items():
        ipa = ipa.replace(key, value)
    return ipa

In [None]:
# Allow-list of characters kept by norm(), written as a space-delimited string.
# NOTE(review): membership is tested with `in` on this string, so the space
# character itself also passes the filter — confirm that this is intended.
cambridge_characters = "a b d e f h i j k l m n o p r ɾ s t u v w x z æ ð ŋ ɑ ɒ ɔ ə ɚ ɜ ɡ ɪ ʃ ʊ ʌ ʒ ʔ ʤ θ ɝ"


def norm(ipa):
    """Normalise one transcription string.

    Steps, in order:
      1. ``norm_us_uk`` folds variant symbols together.
      2. Every character that does not occur in ``cambridge_characters`` is
         dropped (checked one character at a time).
      3. ``convert2prep`` rewrites the remaining symbols.

    Args:
        ipa: Raw transcription string.

    Returns:
        The normalised transcription string.
    """
    folded = norm_us_uk(ipa)
    kept = "".join(symbol for symbol in folded if symbol in cambridge_characters)
    return convert2prep(kept)

In [None]:
# Maps each single-word headword to its normalised pronunciations:
#   {"us": [...], "uk": [...]}
lexicon = {}

for file in tqdm(files):
    # Use a context manager so each handle is closed right away instead of
    # leaking one open file per dictionary entry.
    with open(file, "r", encoding="utf-8") as handle:
        word = json.load(handle)

    text = word["word"]
    # Entries whose headword contains a space (multi-word) are skipped.
    if len(text.split(" ")) > 1:
        continue

    uk_ipas, us_ipas = [], []
    is_valid = True
    for meaning_case in word["meaning_cases"]:
        for word_type_case in meaning_case["word_type_cases"]:
            head_content = word_type_case["head_content"]

            # A "_" word type rejects the whole entry.
            # NOTE(review): presumably a placeholder for a missing part of
            # speech — confirm against the scraper.
            if head_content["word_type"] == "_":
                is_valid = False
                break

            uk_ipas += [norm(ipa) for ipa in head_content["ipa"]["uk"]]
            us_ipas += [norm(ipa) for ipa in head_content["ipa"]["us"]]

        # The entry is already rejected; don't keep normalising the remaining
        # meaning cases only to throw the result away.
        if not is_valid:
            break

    if is_valid:
        lexicon[text] = {
            "us": us_ipas,
            "uk": uk_ipas
        }

len(lexicon)

In [None]:
# Write the full lexicon (US and UK lists per word) as indented JSON.
path = "/data/codes/apa/kaldi/g2p/lexicon/processed/cambridge-lexicon.json"
with open(path, mode="w", encoding="utf-8") as f:
    # ensure_ascii=False writes the IPA symbols as-is instead of \u escapes.
    json_obj = json.dumps(lexicon, ensure_ascii=False, indent=4)
    f.write(json_obj)

In [None]:
# Flatten each entry to a single list: US pronunciations followed by UK ones.
new_lexicon = {
    headword: variants["us"] + variants["uk"]
    for headword, variants in lexicon.items()
}

In [None]:
# Write the headwords, one per line (no trailing newline after the last one).
path = "/data/codes/apa/kaldi/g2p/lexicon/processed/cambridge-words.txt"
with open(path, mode="w", encoding="utf-8") as f:
    # Iterating a dict yields its keys, in insertion order.
    f.write("\n".join(new_lexicon))