In [None]:
import json
import pandas as pd
from tqdm import tqdm

In [None]:
def load_jsonl_data(path):
    """Read a JSON Lines file into a pandas DataFrame.

    Every non-blank line of the file is parsed as one JSON value and becomes
    one row of the result. Blank or whitespace-only lines (for example a
    trailing empty line at the end of the file) are skipped instead of
    raising a ``json.JSONDecodeError``.

    Args:
        path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A ``pandas.DataFrame`` built from the parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        # Iterate the file object directly so the raw lines are never all
        # held in memory at once next to the parsed records.
        for line in f:
            line = line.strip()
            if not line:
                continue
            records.append(json.loads(line))

    return pd.DataFrame(records)

In [None]:
# Absolute location of the raw metadata dump that is parsed line by line below.
# NOTE(review): this is a machine-specific absolute path -- consider deriving it
# from a configurable data directory.
path = "/home/tuyendv/E2E-R/data/raw/info_question_type-12_01082022_18092023/metadata.jsonl"

# Parse the file into a DataFrame and show the first record to check the schema.
metadata = load_jsonl_data(path)
metadata.head(1)

In [None]:
import re

def convert_to_phone_pure(arpas):
    """Normalise a sequence of ARPAbet symbols into a lower-case phone string.

    Each symbol is handled as follows:
      * ``"AX"`` is replaced by ``"AH"``;
      * ``"DX"`` rejects the whole sequence (``None`` is returned);
      * any other symbol has its digits removed (e.g. ``"OW1"`` -> ``"OW"``).

    The exact-match checks run on the symbol as given, so a symbol such as
    ``"AX0"`` is only digit-stripped, not remapped.
    NOTE(review): confirm that ``AX``/``DX`` never carry a digit in the data.

    Args:
        arpas: Iterable of ARPAbet symbol strings.

    Returns:
        The converted symbols joined by single spaces and lower-cased, or
        ``None`` if the sequence contains ``"DX"``. An empty input yields
        ``""``.
    """
    phone_pures = []
    for phone in arpas:
        if phone == "DX":
            return None
        if phone == "AX":
            phone_pures.append("AH")
        else:
            # Raw string: "\d" in a normal string literal is an invalid
            # escape sequence and triggers a warning on modern Python.
            phone_pures.append(re.sub(r"\d", "", phone))

    return " ".join(phone_pures).lower()

In [None]:
# Shuffle all rows so the count-based train/test/val assignment further down
# draws from a random order. The fixed seed makes the shuffle -- and therefore
# the resulting splits -- reproducible after a kernel restart.
metadata = metadata.sample(frac=1, random_state=42)

In [None]:
def _expand_word_scores(row):
    """Return one word score per entry of the row's ``word_ids``.

    Every value in ``row["word_ids"]`` is used as a position into
    ``row["word_scores"]``, so the result has the same length as ``word_ids``.
    """
    return [row["word_scores"][word_idx] for word_idx in row["word_ids"]]


# The column is overwritten in place, so this cell is not idempotent: running
# it a second time would index into the already expanded list.
metadata["word_scores"] = metadata.apply(_expand_word_scores, axis=1)

In [None]:
# Preview the first rows to inspect the frame after the shuffle and the
# rewrite of the "word_scores" column.
metadata.head()

In [None]:
# Build the train / test / val sample dictionaries from `metadata`.
#
# Each kept row becomes one flat record keyed by its utterance id: list fields
# are joined into space-separated strings and every score is divided by
# SCORE_SCALE. A row is skipped when it has more than MAX_PHONES phone scores
# or when convert_to_phone_pure() returns None for its "arpas" sequence.
SCORE_SCALE = 50                # divisor applied to utterance/word/phone scores
MAX_PHONES = 32                 # rows with more phone scores than this are dropped
TRAIN_END = 50000               # kept samples with count < TRAIN_END -> train
TEST_RANGE = (115000, 120000)   # lower < count <= upper -> test
VAL_RANGE = (125000, 130000)    # lower < count <= upper -> val
# NOTE(review): kept samples whose count falls outside the ranges above
# (50000..115000, 120001..125000 and everything past 130000) are not stored in
# any split -- confirm this is intentional.

# Fields that are part of every record but always written as empty
# placeholders here; they never change, so they are set once.
spk_id = ""
duration = 0.0
phn_ali = ""
phn_ali_start = ""
phn_ali_duration = ""

train, test, val = {}, {}, {}

count = 0  # number of rows that passed both filters so far
for index in tqdm(metadata.index):
    # Apply the filters first so rejected rows cost no string building.
    phone_scores = metadata["phone_scores"][index]
    if len(phone_scores) > MAX_PHONES:
        continue

    # phn and phn_canonical are both derived from the same "arpas" column,
    # so convert once and reuse the result.
    # NOTE(review): the two fields are therefore always identical -- confirm
    # whether phn_canonical should come from a different column.
    phn = convert_to_phone_pure(metadata["arpas"][index])
    if phn is None:
        continue
    phn_canonical = phn

    utterance_id = metadata["id"][index]
    wav = metadata["audio_path"][index]
    text = metadata["text"][index].lower()
    utt_score = str(metadata["utterance_score"][index] / SCORE_SCALE)

    scores = " ".join(str(score / SCORE_SCALE) for score in phone_scores)
    wrd_score = " ".join(
        str(score / SCORE_SCALE) for score in metadata["word_scores"][index]
    )
    # Stored word ids are shifted by one before being serialised.
    wrd_ids = " ".join(str(word_id + 1) for word_id in metadata["word_ids"][index])

    sample = {
        "utterance_id": utterance_id,
        "wav": wav,
        "text": text,
        "spk_id": spk_id,
        "phn": phn,
        "phn_canonical": phn_canonical,
        "phn_score": scores,
        "wrd_score": wrd_score,
        "utt_score": utt_score,
        "wrd_id": wrd_ids,
        "duration": duration,
        "phn_ali": phn_ali,
        "phn_ali_start": phn_ali_start,
        "phn_ali_duration": phn_ali_duration
    }

    # Assign by the running count of kept samples (not by row position).
    if count < TRAIN_END:
        train[utterance_id] = sample
    elif TEST_RANGE[0] < count <= TEST_RANGE[1]:
        test[utterance_id] = sample
    elif VAL_RANGE[0] < count <= VAL_RANGE[1]:
        val[utterance_id] = sample

    count += 1

In [None]:
def save_jsonl_data_row_level(data, path):
    """Serialise ``data`` to ``path`` as one pretty-printed JSON document.

    Despite the name, the output is not JSON Lines: the whole object is dumped
    once with a 4-space indent, non-ASCII characters are written as-is, and a
    single trailing newline is appended.

    The object is serialised before the file is opened, so a serialisation
    error leaves an existing file at ``path`` untouched. Missing parent
    directories of ``path`` are created.

    Args:
        data: Any JSON-serialisable object.
        path: Destination file path; an existing file is overwritten.

    Raises:
        TypeError: If ``data`` contains a value ``json`` cannot serialise.
    """
    import os  # local import so the function does not rely on another cell

    # Serialise first: opening with "w" truncates the target immediately.
    json_obj = json.dumps(data, indent=4, ensure_ascii=False)

    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, "w", encoding="utf-8") as f:
        f.write(f'{json_obj}\n')

    print(f'###saved jsonl data to: {path}')

# Write each split to its own file, in the order train, test, val.
for split_samples, out_path in (
    (train, "/home/tuyendv/E2E-R/data/scoring/train.json"),
    (test, "/home/tuyendv/E2E-R/data/scoring/test.json"),
    (val, "/home/tuyendv/E2E-R/data/scoring/val.json"),
):
    save_jsonl_data_row_level(data=split_samples, path=out_path)