In [1]:
from pathlib import Path
import json

# Location of the JSON file this notebook reads its records from.
input_path = Path("./output/merged_koksl2022.json")

# Parse the whole file into memory; the later cells index and iterate over
# `input_records`.
with input_path.open(mode="r", encoding="utf-8") as file:
    input_records = json.load(file)


In [2]:
from itertools import chain

def normalize_nms(nms_dict: dict):
    """Flatten a per-type mapping of tags into a single list of tags.

    Args:
        nms_dict: Mapping from a type name to a list of tag dicts. May be
            ``None`` or empty. Entries whose value is falsy (``None``, empty
            list, ...) are skipped.

    Returns:
        A new list with one dict per tag. Each dict is a shallow copy of the
        original tag with the owning type name stored under ``"gloss_id"``
        (replacing any ``"gloss_id"`` the tag already carried). Tags keep the
        mapping's iteration order, then their order within each list. The
        input tag dicts are not modified.
    """
    # Guard clause: nothing to flatten for None / empty mappings.
    if not nms_dict:
        return []

    flattened = []
    for nms_type, tag_dicts in nms_dict.items():
        # Types that carry no tags contribute nothing to the result.
        if not tag_dicts:
            continue
        for tag_dict in tag_dicts:
            flattened.append({**tag_dict, "gloss_id": nms_type})
    return flattened

# Preview: flatten the "nms_script" entry of the first input record; as the
# last expression in the cell, the resulting list is displayed as the output.
normalize_nms(input_records[0]["nms_script"])


[{'descriptor': '', 'start': 2.931, 'end': 3.413, 'gloss_id': 'Hno'},
 {'descriptor': '', 'start': 3.687, 'end': 4.068, 'gloss_id': 'Hno'},
 {'descriptor': '', 'start': 1.826, 'end': 4.131, 'gloss_id': 'EBf'}]

In [None]:
def flatten_ann(record):
    """Build the flat annotation view of one input record.

    Args:
        record: A dict providing ``"id"``, ``"krlgg_sntenc"`` (with
            ``"koreanText"``), ``"sign_lang_sntenc"``, ``"sign_script"``
            (with ``"sign_gestures_strong"`` / ``"sign_gestures_weak"``, each
            a list of tags or a falsy value) and ``"nms_script"``.

    Returns:
        A dict with the keys ``"id"``, ``"ko"``, ``"ksl_simple"``,
        ``"ms_strong"``, ``"ms_weak"`` and ``"nms"``. The three tag lists are
        new lists sorted in ascending order of each tag's ``"start"`` value;
        missing gesture lists become empty lists, and ``"nms"`` holds the
        output of ``normalize_nms``.

    Raises:
        KeyError: If any of the keys listed above is absent.
    """

    def by_start(tag):
        # Shared sort key: order tags by their "start" value.
        return tag["start"]

    flat = {
        "id": record["id"],
        "ko": record["krlgg_sntenc"]["koreanText"],
        "ksl_simple": record["sign_lang_sntenc"],
    }

    gestures = record["sign_script"]
    # A falsy gesture list (e.g. None) is treated as "no tags".
    strong_tags = gestures["sign_gestures_strong"] or []
    weak_tags = gestures["sign_gestures_weak"] or []
    raw_nms = record["nms_script"]

    flat["ms_strong"] = sorted(strong_tags, key=by_start)
    flat["ms_weak"] = sorted(weak_tags, key=by_start)
    flat["nms"] = sorted(normalize_nms(raw_nms), key=by_start)
    return flat

# Preview: show the flattened annotation dict for the first input record.
flatten_ann(input_records[0])


{'id': 'SCUSHPAKOKSL2200000001',
 'ko': '네 많이 걱정하셨을 텐데요',
 'ksl_simple': '걱정/많다/같다[맞다]',
 'ms_strong': [{'start': 1.826,
   'end': 2.381,
   'gloss_id': '걱정',
   'express': 's',
   'direction': {'source': '', 'target': ''},
   'sentence_loc': {'start': '', 'end': ''}},
  {'start': 2.734,
   'end': 3.45,
   'gloss_id': '많다',
   'express': 's',
   'direction': {'source': '', 'target': ''},
   'sentence_loc': {'start': '', 'end': ''}},
  {'start': 3.568,
   'end': 4.132,
   'gloss_id': '같다[맞다]',
   'express': 's',
   'direction': {'source': '', 'target': ''},
   'sentence_loc': {'start': '', 'end': ''}}],
 'ms_weak': [{'start': 2.734,
   'end': 3.45,
   'gloss_id': '많다',
   'express': 's',
   'direction': None,
   'sentence_loc': None},
  {'start': 3.568,
   'end': 4.132,
   'gloss_id': '같다[맞다]',
   'express': 's',
   'direction': None,
   'sentence_loc': None}],
 'nms': [{'descriptor': '', 'start': 1.826, 'end': 4.131, 'gloss_id': 'EBf'},
  {'descriptor': '', 'start': 2.931, 'end': 3.413

In [None]:
def flatten_meta(record):
    """Build the flat metadata view of one input record.

    Args:
        record: A dict providing ``"id"``, ``"krlgg_sntenc"`` (with
            ``"realm"`` and ``"thema"``), ``"sign_lang_trnslator"``,
            ``"vido_file_nm"`` and ``"potogrf"`` (with the five
            ``sl_speaker_*`` entries copied below).

    Returns:
        A single-level dict with the keys ``"id"``, ``"realm"``, ``"thema"``,
        ``"trnslator"``, ``"vido_file_nm"`` followed by the five
        ``sl_speaker_*`` keys, in that order. Values are copied as-is.

    Raises:
        KeyError: If any of the keys listed above is absent.
    """
    # Keys copied verbatim (same name, same order) from record["potogrf"].
    # The spelling mirrors the source data and must not be "corrected".
    speaker_fields = (
        "sl_speaker_id",
        "sl_speaker_age",
        "sl_speaker_sex",
        "sl_speaker_legion",
        "sl_speaker_hand",
    )

    flat = {
        "id": record["id"],
        "realm": record["krlgg_sntenc"]["realm"],
        "thema": record["krlgg_sntenc"]["thema"],
        "trnslator": record["sign_lang_trnslator"],
        "vido_file_nm": record["vido_file_nm"],
    }

    speaker = record["potogrf"]
    for field in speaker_fields:
        flat[field] = speaker[field]
    return flat

# Preview: show the flattened metadata dict for the first input record.
flatten_meta(input_records[0])


{'id': 'SCUSHPAKOKSL2200000001',
 'realm': '문화',
 'thema': '쇼핑',
 'trnslator': 'LYJ',
 'vido_file_nm': 'SCUSHPAKOKSL2200000001',
 'sl_speaker_id': 'YJ',
 'sl_speaker_age': 50,
 'sl_speaker_sex': 'Male',
 'sl_speaker_legion': 'Seoul',
 'sl_speaker_hand': 'right_handed'}

In [None]:
def _write_json(records, path):
    """Write ``records`` to ``path`` as UTF-8 JSON with 4-space indentation.

    Both output files go through this helper so they are always serialized
    with the same settings. ``ensure_ascii=False`` writes non-ASCII characters
    as-is instead of escaping them.

    Args:
        records: Any JSON-serializable object (here: a list of flat dicts).
        path: Destination file path; an existing file is overwritten.
    """
    with open(path, "w", encoding="utf-8") as file:
        file.write(json.dumps(records, ensure_ascii=False, indent=4))


# Annotation view: one flattened annotation dict per input record.
ann_records = [flatten_ann(record) for record in input_records]
ann_path = Path("./output/flattened_koksl2022_ann.json")
_write_json(ann_records, ann_path)

# Metadata view: one flattened metadata dict per input record.
meta_records = [flatten_meta(record) for record in input_records]
meta_path = Path("./output/flattened_koksl2022_meta.json")
_write_json(meta_records, meta_path)
    