In [18]:
import json
import pandas as pd
from tqdm import tqdm

In [19]:
def load_jsonl_data(path):
    """Load a JSON Lines file into a pandas DataFrame.

    Every non-blank line of the file is parsed with ``json.loads`` and
    becomes one row of the returned frame, in file order.

    Args:
        path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        pandas.DataFrame built from the list of parsed records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        # Iterate the handle directly rather than calling readlines(), so the
        # raw text of the whole file is never held in memory at once.
        for line in f:
            line = line.strip()
            # Blank lines are not records; json.loads("") would raise on them.
            if not line:
                continue
            records.append(json.loads(line))

    return pd.DataFrame(records)

In [20]:
# Location of the raw metadata JSONL file to load.
path = "/home/tuyendv/E2E-R/data/apr/train-type-12/metadata-raw.jsonl"
metadata = load_jsonl_data(path=path)

# Display only the first row as a quick look at the available columns.
metadata.head(n=1)

Unnamed: 0,id,raw_text,text,utt_id,start_time,end_time,arpas,trans,phone_scores,word_scores,decisions,word_ids,utterance_score,intonation_score,fluency_score,audio_path
0,2653235,sweaty,SWEATY,,,,"[S, W, EH1, T, IY0]","[S, W, EH, T, IY]","[98, 98, 0, 99, 96]",[69],"[2, 2, 0, 2, 2]","[0, 0, 0, 0, 0]",69.0,0,0,/data/audio/prep-submission-audio/apa-type-10/...


In [21]:
import re

def convert_to_phone_pure(arpas):
    """Normalise a phone sequence into a lowercase, space-separated string.

    For each phone: "AX" is replaced by "AH"; every other phone has all of
    its digit characters removed (e.g. "EH1" -> "EH"). If the sequence
    contains "DX" the whole sequence is rejected.

    Args:
        arpas: Iterable of phone label strings.

    Returns:
        The normalised phones joined by single spaces and lower-cased, or
        ``None`` if any phone equals "DX".
    """
    phone_pures = []
    for phone in arpas:
        # A single "DX" invalidates the whole sequence.
        if phone == "DX":
            return None

        if phone == "AX":
            phone_pures.append("AH")
        else:
            # Raw string: a plain "\d" is an invalid escape sequence that
            # newer Python versions warn about.
            phone_pures.append(re.sub(r"\d", "", phone))

    return " ".join(phone_pures).lower()

In [22]:
# Shuffle all rows before the positional train/test/val split below.
# A fixed random_state makes the shuffle (and therefore the split) identical
# on every Restart & Run All. Note that re-running only this cell reshuffles
# the already-shuffled frame.
metadata = metadata.sample(frac=1, random_state=42)

In [23]:
# Exclusive upper bounds, counted in *accepted* samples, for each split:
# train gets counts [0, TRAIN_END), test [TRAIN_END, TEST_END),
# val [TEST_END, VAL_END). Accepted samples beyond VAL_END are not stored.
TRAIN_END = 60000
TEST_END = 70000
VAL_END = 75000
# Utterances with more phone scores than this are skipped.
MAX_PHONES = 32
# Each raw phone score is divided by this value before being stored.
SCORE_DIVISOR = 50

train, test, val = {}, {}, {}

# Number of samples accepted so far; skipped rows do not advance it.
count = 0
for index in tqdm(metadata.index):
    utterance_id = metadata["id"][index]
    wav = metadata["audio_path"][index]
    text = metadata["text"][index].lower()
    spk_id = ""

    phone_scores = metadata["phone_scores"][index]
    if len(phone_scores) > MAX_PHONES:
        continue

    # convert_to_phone_pure returns None for sequences it rejects.
    phn = convert_to_phone_pure(metadata["arpas"][index])
    if phn is None:
        continue
    # The canonical sequence is derived from the same "arpas" column, so it
    # is identical to phn; reuse it instead of converting twice.
    phn_canonical = phn

    scores = " ".join(str(score / SCORE_DIVISOR) for score in phone_scores)

    # Duration and alignment fields are written as empty placeholders.
    duration = 0.0
    phn_ali = ""
    phn_ali_start = ""
    phn_ali_duration = ""

    sample = {
        "utterance_id": utterance_id,
        "wav": wav,
        "text": text,
        "spk_id": spk_id,
        "phn": phn,
        "phn_canonical": phn_canonical,
        "scores": scores,
        "duration": duration,
        "phn_ali": phn_ali,
        "phn_ali_start": phn_ali_start,
        "phn_ali_duration": phn_ali_duration
    }

    # Contiguous half-open ranges: the previous `< 60000` / `> 60000` pair
    # silently dropped the sample at count == 60000.
    if count < TRAIN_END:
        train[utterance_id] = sample
    elif count < TEST_END:
        test[utterance_id] = sample
    elif count < VAL_END:
        val[utterance_id] = sample

    count += 1

100%|██████████| 76312/76312 [00:02<00:00, 29807.23it/s]


In [24]:
def save_jsonl_data_row_level(data, path):
    """Write ``data`` to ``path`` as a single pretty-printed JSON document.

    Despite "jsonl" in the name and in the log message, the whole object is
    serialised as one JSON value (4-space indent, non-ASCII characters kept
    as-is) followed by a newline; it is not one record per line.

    Args:
        data: JSON-serialisable object to write.
        path: Destination file path; an existing file is overwritten.

    Raises:
        TypeError: If ``data`` contains values ``json.dumps`` cannot encode.
    """
    # Serialise before opening the file: open(..., "w") truncates at once,
    # so a failure in json.dumps afterwards would wipe an existing output.
    json_obj = json.dumps(data, indent=4, ensure_ascii=False)

    with open(path, "w", encoding="utf-8") as f:
        f.write(f'{json_obj}\n')

    print(f'###saved jsonl data to: {path}')

# Destination file for each split, written in train -> test -> val order.
split_outputs = [
    ("/home/tuyendv/E2E-R/data/prep/train.json", train),
    ("/home/tuyendv/E2E-R/data/prep/test.json", test),
    ("/home/tuyendv/E2E-R/data/prep/val.json", val),
]

for out_path, split in split_outputs:
    save_jsonl_data_row_level(data=split, path=out_path)

###saved jsonl data to: /home/tuyendv/E2E-R/data/prep/train.json
###saved jsonl data to: /home/tuyendv/E2E-R/data/prep/test.json
###saved jsonl data to: /home/tuyendv/E2E-R/data/prep/val.json
