In [1]:
import pandas as pd
import os
from glob import glob
import json
from pandarallel import pandarallel
import random
import re

# Start pandarallel with 10 workers and a progress bar; this is what makes
# `DataFrame.parallel_apply` (used further down) available.
pandarallel.initialize(nb_workers=10, progress_bar=True)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Directory of per-recording JSON files; a later cell looks up
# `<json_dir>/<id>.json` for every metadata row.
json_dir = "/data/audio_data/pronunciation_scoring_result/marking_data/9"
# CSV export of the recording metadata (presumably question type 9 only,
# judging by the file name -- TODO confirm).
metadata_path="/data/audio_data/pronunciation_scoring_result/merged_info/info_question_type-9_01082022_18092023.csv"
metadata = pd.read_csv(metadata_path)
# Preview the first two rows as a sanity check on the loaded columns.
metadata.head(2)

Unnamed: 0,id,is_deleted,user_id,question_id,question_type,question_content,url,score,fidelity_class,created_at,total_time,word_count
0,5580125,0,105954.0,224272,9,"To be honest with you, I have never watched a ...",https://storage.googleapis.com/materials-eleme...,,,2023-09-18 21:18:30,23.66,62.0
1,5580126,0,105954.0,224271,9,Definitely yes. When you have been robot aroun...,https://storage.googleapis.com/materials-eleme...,,,2023-09-18 21:18:30,23.37,66.0


In [3]:
def parse_metadata_data(json_path, user_id):
    """Extract ``[text, trans_arpabet]`` pairs for every word in a scoring JSON file.

    Parameters
    ----------
    json_path : str or os.PathLike
        Path to a JSON file laid out as
        ``{"utterances": [{"result": {"words": [{"text": ..., "trans_arpabet": ...}]}}]}``.
    user_id : Any
        Not used by the parser; accepted so existing callers keep working.

    Returns
    -------
    list[list]
        One ``[text, trans_arpabet]`` pair per word, in file order, across all
        utterances. An empty list is returned when the file is missing or
        unreadable, is not valid JSON, or does not have the expected layout
        (best effort: partial results from a malformed file are discarded).
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(json_path, "r", encoding="utf-8") as f:
            content = json.load(f)

        return [
            [word["text"], word["trans_arpabet"]]
            for raw_utterance in content["utterances"]
            for word in raw_utterance["result"]["words"]
        ]
    except (OSError, ValueError, KeyError, TypeError):
        # Only data problems are swallowed: OSError (missing/unreadable file),
        # ValueError (json.JSONDecodeError / UnicodeDecodeError), and
        # KeyError/TypeError (unexpected structure). A bare `except:` would
        # also hide KeyboardInterrupt and genuine programming errors.
        return []

def _parse_row(row):
    """Parse the JSON file belonging to one metadata row (`<json_dir>/<id>.json`)."""
    row_json_path = os.path.join(json_dir, f'{row["id"]}.json')
    return parse_metadata_data(json_path=row_json_path, user_id=row["user_id"])


# One list of [text, trans_arpabet] pairs per metadata row, computed in parallel.
extracted_data = metadata.parallel_apply(_parse_row, axis=1)
extracted_data.head()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=23806), Label(value='0 / 23806')))…

0    [[To, T AH0], [be, B IY0], [honest, AA1 N AH0 ...
1    [[Definitely, D EH1 F AH0 N AH0 T L IY0], [yes...
2    [[In, AH0 N], [my, M AY1], [opinion, AH0 P IH1...
3    [[Apart, AH0 P AA1 R T], [from, F R AH1 M], [t...
4    [[Considering, K AH0 N S IH1 DX ER0 IH0 NG], [...
dtype: object

In [4]:
# Flatten to one [text, trans_arpabet] pair per row. Rows whose list was empty
# explode to NaN and are dropped. Column 0 holds the pair itself.
word_pairs = extracted_data.explode().reset_index().dropna()

# Split each pair into its two components as separate columns.
lexicon = word_pairs.assign(
    word=word_pairs[0].map(lambda pair: pair[0]),
    arpa=word_pairs[0].map(lambda pair: pair[1]),
)

In [5]:
from tqdm import tqdm

# count: "<WORD>-<arpa>" -> number of times that word/transcription pair occurs.
# vocab: upper-cased word -> its distinct transcriptions, in first-seen order.
count = {}
vocab = {}
word_arpa_rows = zip(lexicon["word"], lexicon["arpa"])
for raw_word, arpa in tqdm(word_arpa_rows, total=len(lexicon)):
    word = raw_word.upper()
    key = f'{word}-{arpa}'
    count[key] = count.get(key, 0) + 1

    variants = vocab.setdefault(word, [])
    if arpa not in variants:
        variants.append(arpa)
    

  0%|          | 0/12547683 [00:00<?, ?it/s]

100%|██████████| 12547683/12547683 [01:06<00:00, 188550.62it/s]


In [6]:
# Flatten `vocab` into [word, arpa] entries, skipping any key that contains
# whitespace (i.e. splits into more than one token). No frequency threshold is
# applied: every observed transcription of a kept word is emitted.
lexicon = [
    [word, arpa]
    for word, variants in vocab.items()
    if len(word.split()) <= 1
    for arpa in variants
]

In [7]:
# Write the lexicon as tab-separated text, one "<WORD>\t<arpa>" entry per line.
path = "/data/codes/apa/kaldi/g2p/lexicon/processed/elsa-lexicon.txt"
with open(path, "w", encoding="utf-8") as f:
    f.writelines("\t".join(entry) + "\n" for entry in lexicon)