# Get Harmonization Training Data

## Setup

We will use the same data source as for v3 (the latest version) of the synthetic data from objective 1, using only the mutated versions found here: https://uchicago.app.box.com/folder/318234520321?s=j0oq2pfhe59p3caqq3ss32kxqwciwbsd

You can limit training data generation to just one of the sub-folders above. For example, we extract and use only `mutated_sdc_v3_nmax4_nmin2_pmax75_pmin25_limit20_dmax1000_20250423.zip` and ignore the others. It's unclear how much data will be needed for training, so we could add more in the future.

Ensure the above is extracted into a `../data/Mutated_SDCs_v3_20250423/mutated_sdc_v3_nmax4_nmin2_pmax75_pmin25_limit20_dmax1000_20250423` folder.

We also need the required synthetic data dictionary information: https://uchicago.app.box.com/folder/318236025225

Extract the above in a `../data/SDMs_nodes6-15_props50-100_20250423` folder.


In [None]:
import os
import copy
import json
import shutil
import logging

from harmonization.utils import (
    TEMP_DIR,
    get_data_model_as_node_prop_descriptions,
    get_gen3_json_schemas_and_templates,
)

# Folder that holds the dictionary-keyed ("_modified") Gen3 data dictionary schemas.
_GEN3_INPUT_SCHEMAS_DIR = "../data/SDMs_nodes6-15_props50-100_20250423/input_schemas"

# Names of the real Gen3 data models; each one has a
# "<name>__jsonschema_dd_modified.json" file in the folder above.
_GEN3_DOMAINS = (
    "aids.diseasedatahub.org",
    "bihstaging.data-commons.org",
    "caninedc.org",
    "chicagoland.pandemicresponsecommons.org",
    "chordshealth.org",
    "data.bloodpac.org",
    "data.midrc.org",
    "diseasedatahub.org",
    "flu.diseasedatahub.org",
    "gen3.biodatacatalyst.nhlbi.nih.gov",
    "gen3.datacommons.io",
    "genomel",
    "healdata.org",
    "hnc",
    "ibdgc",
    "icgc.bionimbus.org",
    "jcoin.datacommons.io",
    "kidsfirst",
    "microbiome",
    "mmrf",
    "nci-crdc.datacommons.io",
    "pdp",
    "portal.occ-data.org",
    "rerf",
    "tb.diseasedatahub.org",
    "toxdatacommons",
    "vpodc.data-commons.org",
)

# Maps each Gen3 data model name to the path of its dictionary-keyed schema file.
GEN3_MANUAL_DD_PATH = {
    domain: f"{_GEN3_INPUT_SCHEMAS_DIR}/{domain}__jsonschema_dd_modified.json"
    for domain in _GEN3_DOMAINS
}

# Groups of node names treated as interchangeable when searching the real data
# models for the origin of a synthetic node.
_SYNONYM_GROUPS = (
    ("study", "dataset", "clinical_trial", "collection", "research"),
    ("subject", "patient", "case", "participant"),
    ("biospecimen", "specimen"),
    ("project",),  # this forces a search over other DDs later
    ("program",),  # this forces a search over other DDs later
)

# Every member of a group maps to its own copy of the full group (itself included).
SYNONYMOUS_NODES = {
    node_name: list(group) for group in _SYNONYM_GROUPS for node_name in group
}

# node name -> set of Gen3 data model names defining a node with that name
# (populated programmatically)
SYNONYMOUS_NODES_TO_GEN3_MANUAL_DD = {}

# Gen3 data model name -> parsed schema (populated programmatically)
gen3_dd_schemas = {}

In [None]:
# Load every Gen3 data dictionary exactly once (previously each file was
# re-opened and re-parsed for every synonymous node name).
for schema_name, schema_path in GEN3_MANUAL_DD_PATH.items():
    with open(schema_path, "r") as file:
        gen3_dd_schemas[schema_name] = json.load(file)

# Create a mapping from each node name that has synonyms to the set of data
# dictionaries that define a node with that particular name.
for node in SYNONYMOUS_NODES:
    for schema_name in GEN3_MANUAL_DD_PATH:
        if node in gen3_dd_schemas[schema_name]:
            SYNONYMOUS_NODES_TO_GEN3_MANUAL_DD.setdefault(node, set()).add(
                schema_name
            )

### Convert original data model schema format to more efficient dictionary-based format

In [None]:
# Rewrite every original schema file (nodes and properties stored as lists) as a
# sibling "<name>_modified.json" file keyed by node name and property name.
sdm_input_schemas_dir = "../data/SDMs_nodes6-15_props50-100_20250423/input_schemas/"
for root, dirs, files in os.walk(sdm_input_schemas_dir):
    for file in files:
        output_filename = os.path.join(root, file.replace(".json", "_modified.json"))

        # Skip files that are themselves converted outputs, and inputs whose
        # converted output is already on disk.
        if "_modified.json" in file or os.path.exists(output_filename):
            continue

        with open(os.path.join(root, file)) as input_file:
            original_data = json.load(input_file)

        # Target shape:
        # {"{{node_name}}": {..., "properties": {"{{property_name}}": {...}}}}
        final_output_structure = {}
        for node in original_data["nodes"]:
            new_node = copy.deepcopy(node)

            # "properties" arrives as a list; replace it with a dict keyed by
            # property name (re-added after removal, so it becomes the last key).
            new_node.pop("properties")
            new_node["properties"] = {}
            for node_property in node["properties"]:
                new_node["properties"][node_property["name"]] = node_property

            final_output_structure[node["name"]] = new_node

        with open(output_filename, "w") as output_file:
            output_file.write(json.dumps(final_output_structure))

In [None]:
# Root folder holding the extracted mutated SDC archives, obtained from:
# https://uchicago.app.box.com/folder/318234520321?s=j0oq2pfhe59p3caqq3ss32kxqwciwbsd
# Only sub-folders whose name starts with "mutated_" are processed later on.
input_dir = "../data/Mutated_SDCs_v3_20250423"

# Stats file describing every synthetic data model; the matching SDM_<n>.json
# files are read from the same folder.
# (An earlier revision used "_modified_SDM_stats.json" under the name
# synth_dd_to_real_info_file.)
sdm_stats_filepath = "../data/SDMs_nodes6-15_props50-100_20250423/_SDM_stats.json"
# Destination root for the generated harmonization training data.
output_dir = "../datasets/harmonization_training_Mutated_SDCs_v3_20250423_v0.0.2"

In [None]:
def get_node_prop_desc_from_real_data_model(
    gen3_dd_schemas, gen3_domain, node_name, property_name
):
    """
    Build the "<node>.<property>: <description>" label for a property of a
    real data model.

    The description is looked up with get_desc_from_real_data_model. When that
    lookup yields None, a warning is logged and the label still gets built, so
    it then ends with the text "None".
    """
    prop_description = get_desc_from_real_data_model(
        gen3_dd_schemas=gen3_dd_schemas,
        gen3_domain=gen3_domain,
        node_name=node_name,
        property_name=property_name,
    )

    found = prop_description is not None
    if not found:
        logging.warning(
            f"could not get desc for property {property_name} in node {node_name} from {gen3_domain}"
        )

    label = f"{node_name}.{property_name}: {prop_description}"
    return label


def get_desc_from_real_data_model(
    gen3_dd_schemas, gen3_domain, node_name, property_name
):
    """
    Return the single-line description of a property in a real data model.

    gen3_dd_schemas maps a data model name to its schema, itself a mapping of
    node name -> {"properties": {property name -> property definition}}.

    Returns None when the node or the property is absent (or the property
    definition is empty). Otherwise returns the description with tabs expanded
    to four spaces and newlines turned into spaces; a property without a
    "description" key yields "". Raises KeyError for an unknown gen3_domain.
    """
    node_schema = gen3_dd_schemas[gen3_domain].get(node_name, {})
    property_schema = node_schema.get("properties", {}).get(property_name, {})

    if not property_schema:
        return None

    raw_description = property_schema.get("description", "")
    without_tabs = raw_description.replace("\t", "    ")
    return without_tabs.replace("\n", " ")


def _iter_candidate_sources(node_name, property_name, model_nodes):
    """Yield (gen3_domain, node_name, property_name) lookups worth trying for a synthetic property."""
    if node_name in SYNONYMOUS_NODES:
        # Synonymous node: try every alternate node name in every data
        # dictionary that defines a node with that alternate name.
        for alternate_node_name in SYNONYMOUS_NODES[node_name]:
            alternate_property_name = property_name.replace(
                node_name, alternate_node_name
            )
            for gen3_domain in SYNONYMOUS_NODES_TO_GEN3_MANUAL_DD.get(
                alternate_node_name, []
            ):
                yield gen3_domain, alternate_node_name, alternate_property_name
    elif node_name in model_nodes:
        # Non-synonymous node listed in the stats: it has one known origin.
        yield model_nodes[node_name]["dm_name"], node_name, property_name
    else:
        # Node missing from the stats (information not recorded during SDM
        # stats creation): fall back to the data models of the nodes that
        # name it as a parent.
        for other_node_info in model_nodes.values():
            if node_name in other_node_info.get("parent_nodes", {}):
                yield other_node_info["dm_name"], node_name, property_name


def _find_original_sources(node_name, property_name, model_nodes):
    """Return one original_info dict per distinct real-model property matching a synthetic property."""
    original_sources = []
    sources_already_seen = set()

    for candidate in _iter_candidate_sources(node_name, property_name, model_nodes):
        if candidate in sources_already_seen:
            continue
        sources_already_seen.add(candidate)

        gen3_domain, original_node_name, original_property_name = candidate
        gen3_node = gen3_dd_schemas[gen3_domain].get(original_node_name, {})
        gen3_node_prop = gen3_node.get("properties", {}).get(
            original_property_name, {}
        )

        # Only record sources that really define the property; an empty
        # entry carries no name/description for later steps to use.
        if gen3_node_prop:
            original_sources.append(
                {
                    "gen3_domain": gen3_domain,
                    "original_node": gen3_node,
                    "original_property": gen3_node_prop,
                }
            )

    return original_sources


def get_synth_model_to_original(sdm_stats_filepath):
    """
    Map every synthetic data model to the real data models it was built from.

    Reads the stats JSON at sdm_stats_filepath and, for each model number in
    its "models" mapping, the "SDM_<model_num>.json" file in the same folder.
    Also reads the module-level gen3_dd_schemas, SYNONYMOUS_NODES and
    SYNONYMOUS_NODES_TO_GEN3_MANUAL_DD, which must be populated beforehand.

    Returns a dict keyed by model number whose values contain:
      - "node_to_original_info": node name -> node info from the stats file
      - "synth_model_node_to_properties": node name -> list of property dicts
        from the synthetic model
      - "synth_model_prop_to_original_models": "node.property" -> list of
        {"gen3_domain", "original_node", "original_property"} dicts, one per
        real-model property the synthetic property could have come from.
        Properties with no located origin get no entry (a warning is logged).

    Raises KeyError when a data model named in the stats file is not present
    in gen3_dd_schemas.
    """
    synth_model_to_original = {}

    with open(sdm_stats_filepath) as file:
        sdm_stats = json.load(file)

    models_dir = os.path.dirname(sdm_stats_filepath)

    for model_num, model_info in sdm_stats["models"].items():
        model_nodes = model_info["nodes"]

        # Actual nodes and properties used by this synthetic model.
        synth_model_path = os.path.join(models_dir, f"SDM_{model_num}.json")
        with open(synth_model_path) as synth_model_file:
            synth_model = json.load(synth_model_file)

        synth_model_node_to_properties = {}
        for synth_node in synth_model["nodes"]:
            for node_property in synth_node["properties"]:
                synth_model_node_to_properties.setdefault(
                    synth_node["name"], []
                ).append(node_property)

        # Trace each synthetic property back to its original model(s).
        synth_model_prop_to_original_models = {}
        for node_name, node_properties in synth_model_node_to_properties.items():
            for node_property_info in node_properties:
                property_name = node_property_info["name"]
                original_sources = _find_original_sources(
                    node_name, property_name, model_nodes
                )

                if original_sources:
                    synth_model_prop_to_original_models.setdefault(
                        f"{node_name}.{property_name}", []
                    ).extend(original_sources)
                else:
                    logging.warning(
                        f"could not find original source for {node_name}.{property_name} from SDM_{model_num}.json"
                    )

        synth_model_to_original[model_num] = {
            "node_to_original_info": dict(model_nodes),
            "synth_model_node_to_properties": synth_model_node_to_properties,
            "synth_model_prop_to_original_models": synth_model_prop_to_original_models,
        }

    return synth_model_to_original

In [None]:
# Placeholder so later cells can reference the name even when the (slow) cell
# that computes it is skipped; persist_synth_data_info_to_contribution_directory
# logs a warning and returns early while this is still empty.
synth_model_to_original = None

In [None]:
# Slow step: run it once, then rely on the persisted results
# (i.e. comment this out after the first run).
synth_model_to_original = get_synth_model_to_original(sdm_stats_filepath)
# Number of synthetic data models that were mapped.
print(len(synth_model_to_original))

In [None]:
def output_training_data_for_directory(
    input_dir,
    output_dir,
    synth_model_to_original,
    force_recreation_of_synth_data_info=False,
    force_recreation_of_synth_data_contributions=False,
):
    """
    Sequentially generate training data for every data contribution folder.

    Walks input_dir/mutated_*/<*_tsvs>/<contribution>/ and, for each
    contribution folder, first persists the synthetic-model info and then
    processes the contribution into output_dir. The two force_* flags are
    forwarded as the force_recreation argument of those two steps.
    """
    for mutation_entry in os.scandir(input_dir):
        is_mutation_dir = mutation_entry.is_dir() and os.path.basename(
            mutation_entry
        ).startswith("mutated_")
        if not is_mutation_dir:
            continue

        logging.warning(f"processing: {mutation_entry.path}")
        for data_model_dir in os.scandir(mutation_entry):
            is_model_dir = data_model_dir.is_dir() and os.path.basename(
                data_model_dir
            ).endswith("_tsvs")
            if not is_model_dir:
                continue

            logging.debug(f"processing data model directory: {data_model_dir.path}")
            for data_contribution_dir in os.scandir(data_model_dir):
                if not data_contribution_dir.is_dir():
                    continue

                contribution_path = data_contribution_dir.path
                logging.debug(
                    f"processing data contribution directory: {os.path.basename(contribution_path)}"
                )
                persist_synth_data_info_to_contribution_directory(
                    contribution_path,
                    synth_model_to_original,
                    force_recreation=force_recreation_of_synth_data_info,
                )
                process_synth_data_contribution_directory(
                    contribution_path,
                    output_dir,
                    gen3_dd_schemas,
                    force_recreation=force_recreation_of_synth_data_contributions,
                )


def persist_synth_data_info_to_contribution_directory(
    directory_path, synth_model_to_original, force_recreation=False
):
    """
    Persist the original-model info of one synthetic data model to disk.

    The info is written to "synth_model_to_original.json" in the parent
    folder of directory_path (the data model folder), so all contribution
    folders of the same model share one file.

    The model number is taken from the first "*__var_map.json" file found in
    directory_path: the file-name part before "__", then the text after its
    last "_" (e.g. "SDM_12__var_map.json" -> "12").

    Args:
        directory_path: path of a data contribution folder.
        synth_model_to_original: mapping of model number -> info dict. When
            falsy, a warning is logged and nothing is written.
        force_recreation: delete an existing file first so it gets rewritten.

    Raises:
        KeyError: the model number is not a key of synth_model_to_original.
    """
    import threading

    if not synth_model_to_original:
        logging.warning(
            "Trying to persist synth_model_to_original but nothing was provided"
        )
        return

    model_path = os.path.dirname(directory_path)
    synth_model_to_original_filepath = os.path.join(
        model_path, "synth_model_to_original.json"
    )

    if force_recreation and os.path.exists(synth_model_to_original_filepath):
        try:
            os.remove(synth_model_to_original_filepath)
        except FileNotFoundError:
            # another worker handling a sibling contribution removed it first
            pass

    if os.path.exists(synth_model_to_original_filepath):
        return

    # Resolve the data BEFORE touching the output file, so an empty or
    # half-written file is never left behind for other readers to find.
    specific_synth_model_to_original = None
    for entry in os.scandir(directory_path):
        if entry.name.endswith("__var_map.json"):
            # Parse the file name, not the full path: parent folders may
            # themselves contain "__".
            synth_model_name = entry.name.split("__")[0]
            synth_model_num = synth_model_name.split("_")[-1]
            specific_synth_model_to_original = synth_model_to_original[
                synth_model_num
            ]
            break

    if specific_synth_model_to_original is None:
        logging.warning(
            f"No __var_map.json file found in {directory_path}; not writing {synth_model_to_original_filepath}"
        )
        return

    # Write to a private temp file and rename it into place so concurrent
    # workers only ever observe a complete JSON document.
    temp_filepath = f"{synth_model_to_original_filepath}.{os.getpid()}.{threading.get_ident()}.tmp"
    try:
        with open(temp_filepath, "w") as output:
            output.write(json.dumps(specific_synth_model_to_original))
        os.replace(temp_filepath, synth_model_to_original_filepath)
    except Exception:
        if os.path.exists(temp_filepath):
            os.remove(temp_filepath)
        raise


def process_synth_data_contribution_directory(
    directory_path, output_dir, gen3_dd_schemas, force_recreation=False
):
    """
    Generate the training-data output folder for one data contribution folder.

    directory_path is expected to look like
    <mutation folder>/<data model folder>/<contribution folder>. The info
    file "synth_model_to_original.json" is read from the data model folder
    and the result is written by output_mappings_for_directory to
    output_dir/<mutation folder>/<data model folder>/<contribution folder>.

    Args:
        directory_path: path of the data contribution folder to process.
        output_dir: root folder of the generated training data (created if
            missing).
        gen3_dd_schemas: mapping of real data model name -> parsed schema.
        force_recreation: delete an existing output folder first so it is
            regenerated; otherwise an existing output folder is left alone.

    When the info file is missing or cannot be parsed, an error is logged and
    the contribution is skipped.
    """
    model_path = os.path.dirname(directory_path)
    mutation_path = os.path.dirname(model_path)

    os.makedirs(output_dir, exist_ok=True)

    synth_model_to_original_filepath = os.path.join(
        model_path, "synth_model_to_original.json"
    )
    try:
        with open(synth_model_to_original_filepath) as input_file:
            specific_synth_model_to_original = json.load(input_file)
    except (FileNotFoundError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError (e.g. an empty file)
        logging.error(
            f"Could not get specific_synth_model_to_original from: {synth_model_to_original_filepath} ({exc}). Skipping..."
        )
        return

    output_dir_updated = os.path.join(
        output_dir,
        os.path.basename(mutation_path),
        os.path.basename(model_path),
        os.path.basename(directory_path),
    )

    if force_recreation and os.path.exists(output_dir_updated):
        # The output is a folder tree, which os.remove cannot delete.
        if os.path.isdir(output_dir_updated):
            shutil.rmtree(output_dir_updated)
        else:
            os.remove(output_dir_updated)

    # Existing output is kept as-is, which lets an interrupted run resume.
    if not os.path.exists(output_dir_updated):
        output_mappings_for_directory(
            gen3_dd_schemas=gen3_dd_schemas,
            input_dir=directory_path,
            output_dir=output_dir_updated,
            specific_synth_model_to_original=specific_synth_model_to_original,
        )


def output_mappings_for_directory(
    gen3_dd_schemas, input_dir, output_dir, specific_synth_model_to_original
):
    """
    Mirror input_dir into output_dir and write the training files for it.

    For every folder under input_dir (mirrored under output_dir):
      - each "*__var_map.json" file produces one
        "<gen3_domain>/expected_mappings.tsv" per real data model that any of
        its "node.property" keys maps to. Rows pair
        "<new node>.<new property>: <new description>" with
        "<original node>.<original property>: <original description>".
      - each file whose name contains "__jsonschema_dd__" and ends with
        "__jsonschema_dd.json" is copied as "*__ai_model_output.json", and the
        schema of the real data model named by the part of the file name
        before the first "__" is written to "harmonized_data_model.json".

    Args:
        gen3_dd_schemas: mapping of real data model name -> parsed schema.
        input_dir: data contribution folder to read.
        output_dir: folder to create and populate.
        specific_synth_model_to_original: info dict of the synthetic model;
            its "synth_model_prop_to_original_models" entry is used.

    Raises:
        KeyError: a var_map "node.property" key refers to a node that has no
            node-level entry in the same var_map, or a schema file names a
            data model that is missing from gen3_dd_schemas.
    """
    tsv_lines_header = "ai_model_node_prop_desc\tharmonized_model_node_prop_desc\n"

    # Mirror directory structure and process files
    for root, _dirs, files in os.walk(input_dir):
        relative_path = os.path.relpath(root, input_dir)
        mirrored_path = os.path.join(output_dir, relative_path)
        os.makedirs(mirrored_path, exist_ok=True)

        # NOTE(review): filled in below but not read anywhere in this
        # function; kept so the helper call and JSON parsing still happen.
        ai_model_node_props = {}
        for file_name in files:
            if file_name.endswith("__var_map.json"):
                synth_model_prop_to_original_models = specific_synth_model_to_original[
                    "synth_model_prop_to_original_models"
                ]

                # Transform var_map.json to expected_mappings.tsv
                with open(os.path.join(root, file_name), "r") as var_map_file:
                    var_map_data = json.load(var_map_file)

                # The var_map data comes in 3 forms: node, node.property, and
                # node."foreign_key_node.foreign_key_property". Node-level
                # entries (no ".") give the renamed node.
                original_node_to_harmonized_node = {
                    original_key: value.get("new_name", "")
                    for original_key, value in var_map_data.items()
                    if "." not in original_key
                }

                # since we can have nodes from multiple different original models
                tsv_lines_per_original_model = {}
                for original_key, value in var_map_data.items():
                    if "." not in original_key:
                        continue

                    synth_node = original_key.split(".")[0]
                    harmonized_node = original_node_to_harmonized_node[synth_node]

                    new_name = value.get("new_name", "")
                    new_description = value.get("new_description", "").strip()

                    # NOTE(review): new_description is written as-is; a tab or
                    # newline inside it would break the TSV row - confirm the
                    # generator never emits those.
                    ai_model_node_prop_desc = (
                        f"{harmonized_node}.{new_name}: {new_description}"
                    )

                    # Real-model sources of this property, from the
                    # previously constructed mapping information. Keys that
                    # are not present there simply produce no rows.
                    original_node_props = synth_model_prop_to_original_models.get(
                        original_key, []
                    )

                    for original_node_prop in original_node_props:
                        gen3_domain = original_node_prop["gen3_domain"]
                        original_node = original_node_prop["original_node"]["name"]
                        original_property = original_node_prop["original_property"].get(
                            "name", ""
                        )

                        # Extract descriptions from Gen3 dictionary domain
                        gen3_description = (
                            get_desc_from_real_data_model(
                                gen3_dd_schemas,
                                gen3_domain,
                                original_node,
                                original_property,
                            )
                            or ""
                        )
                        harmonized_model_node_prop_desc = (
                            f"{original_node}.{original_property}: {gen3_description}"
                        )

                        ai_model_line = f"{ai_model_node_prop_desc}\t{harmonized_model_node_prop_desc}"
                        tsv_lines_per_original_model.setdefault(gen3_domain, []).append(
                            ai_model_line
                        )

                # Save as expected_mappings.tsv
                if tsv_lines_per_original_model:
                    for gen3_domain, tsv_lines in tsv_lines_per_original_model.items():
                        domain_dir = os.path.join(mirrored_path, gen3_domain)
                        os.makedirs(domain_dir, exist_ok=True)
                        with open(
                            os.path.join(domain_dir, "expected_mappings.tsv"), "w"
                        ) as tsv_file:
                            tsv_file.write(tsv_lines_header)
                            # dict.fromkeys drops duplicate rows while keeping
                            # first-seen order, so repeated runs produce the
                            # same file (set() order changes between runs).
                            tsv_file.write("\n".join(dict.fromkeys(tsv_lines)))
                else:
                    logging.warning(f"No mappings for {mirrored_path}")

            # Process files, rename jsonschema to ai_model_output
            elif "__jsonschema_dd__" in file_name:
                if file_name.endswith("__jsonschema_dd.json"):
                    gen3_domain = file_name.split("__")[0]

                    output_harmonized_model_path = os.path.join(
                        mirrored_path, "harmonized_data_model.json"
                    )
                    with open(output_harmonized_model_path, "w") as dd_file:
                        dd_file.write(json.dumps(gen3_dd_schemas[gen3_domain]))

                    # Rename JSON schema files
                    new_name = file_name.replace("__jsonschema_dd__", "__").replace(
                        "__jsonschema_dd.json", "__ai_model_output.json"
                    )
                    shutil.copy(
                        os.path.join(root, file_name),
                        os.path.join(mirrored_path, new_name),
                    )
                    with open(os.path.join(root, file_name)) as ai_model_ouput_file:
                        ai_model_ouput = json.load(ai_model_ouput_file)
                        ai_model_node_props_raw = (
                            get_data_model_as_node_prop_descriptions(ai_model_ouput)
                        )
                        for node_prop in ai_model_node_props_raw:
                            # everything after the first ":" is the
                            # description ("" when there is no ":")
                            _, _, desc = node_prop.partition(":")

                            # key on description b/c that's what's in the mapping we can rely on
                            ai_model_node_props[desc.strip()] = node_prop

    # print(f"Done. Output: {os.path.abspath(output_dir)}")


# sequential approach, ignore for now. async approach is below
# output_training_data_for_directory(input_dir, output_dir, synth_model_to_original)

## Process all SDCs into Training Data using the Pre-Computed Info Above

This takes a _long_ time. If you leave the default settings, it'll skip recomputing folders that already exist in the output (i.e. you can stop and restart it many times and it'll basically continue where it left off).

You can adjust the number of workers `num_workers` below and you should set that as high as possible.

We're processing 7 folders, each with 10k synthetic data models, and for each synthetic data model we can have upwards of a dozen SDCs. And each SDC can create mappings for up to 27 of the original data models (e.g. if project contains properties that would map to any model, we generate expected mappings to that model).

We don't have the full count yet, but this could mean a maximum of ~20 million expected mappings.

In [None]:
import os
import logging
import concurrent.futures


def output_training_data_for_directory(
    input_dir,
    output_dir,
    synth_model_to_original,
    force_recreation_of_synth_data_info=False,
    force_recreation_of_synth_data_contributions=False,
    num_workers=8,
):
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = []
        for entry in os.scandir(input_dir):
            if entry.is_dir() and os.path.basename(entry).startswith("mutated_"):
                logging.warning(f"processing: {entry.path}")
                for data_model_dir in os.scandir(entry):
                    if data_model_dir.is_dir() and os.path.basename(
                        data_model_dir
                    ).endswith("_tsvs"):
                        logging.debug(
                            f"processing data model directory: {data_model_dir.path}"
                        )
                        for data_contribution_dir in os.scandir(data_model_dir):
                            if data_contribution_dir.is_dir():
                                logging.debug(
                                    f"queueing data contribution directory: {os.path.basename(data_contribution_dir.path)}"
                                )
                                # submit each contribution dir to the pool
                                futures.append(
                                    executor.submit(
                                        _process_single_data_contribution_dir,
                                        data_contribution_dir.path,
                                        output_dir,
                                        synth_model_to_original,
                                        force_recreation_of_synth_data_info,
                                        force_recreation_of_synth_data_contributions,
                                    )
                                )
        # wait for all to finish
        for f in concurrent.futures.as_completed(futures):
            try:
                f.result()
            except Exception as exc:
                logging.error(
                    f"[THREAD ERROR] Data contribution dir task failed: {exc}"
                )


def _process_single_data_contribution_dir(
    data_contribution_path,
    output_dir,
    synth_model_to_original,
    force_recreation_of_synth_data_info,
    force_recreation_of_synth_data_contributions,
):
    """
    Thread-pool task: run both processing steps for one data contribution directory.

    Args:
        data_contribution_path: path of the data contribution directory to handle.
        output_dir: forwarded to process_synth_data_contribution_directory.
        synth_model_to_original: forwarded to
            persist_synth_data_info_to_contribution_directory.
        force_recreation_of_synth_data_info: passed as `force_recreation` to the
            persist step.
        force_recreation_of_synth_data_contributions: passed as `force_recreation`
            to the processing step.

    Note: `gen3_dd_schemas` is not a parameter; it is looked up in the enclosing
    notebook/module scope, so it must be defined before any task runs.
    """
    contribution_dir = data_contribution_path

    # Step 1: persist_synth_data_info_to_contribution_directory (defined elsewhere
    # in this notebook) runs first, before the directory is processed.
    persist_synth_data_info_to_contribution_directory(
        contribution_dir,
        synth_model_to_original,
        force_recreation=force_recreation_of_synth_data_info,
    )

    # Step 2: process the same directory, writing into output_dir.
    process_synth_data_contribution_directory(
        contribution_dir,
        output_dir,
        gen3_dd_schemas,
        force_recreation=force_recreation_of_synth_data_contributions,
    )


# Run the generation with the function's default keyword options. Failures of
# individual data contribution tasks are logged (not raised) inside
# output_training_data_for_directory, so check the log output for "[THREAD ERROR]".
output_training_data_for_directory(input_dir, output_dir, synth_model_to_original)

## Use Data Generated from Above to Construct AI-ready Training Data in JSONL Files

In [None]:
# Root folder of the training-data tree; it is the argument passed to
# remove_property_folders_without_mapping in the cleanup cell below.
training_files_dir = (
    "../datasets/harmonization_training_Mutated_SDCs_v3_20250423_v0.0.2"
)

In [None]:
import os
import shutil


def _iter_subdirectories(parent):
    """Yield the full path of every immediate sub-directory of `parent`."""
    for name in os.listdir(parent):
        path = os.path.join(parent, name)
        if os.path.isdir(path):
            yield path


def remove_property_folders_without_mapping(root_folder):
    """
    Delete every fourth-level folder that is missing an expected_mappings.tsv file.

    Walks exactly four directory levels below `root_folder`:

        root_folder/<mutated_dd>/<original_dd>/<prop_folder>/<gen3_dd>/

    and removes each <gen3_dd> folder that does not contain an
    `expected_mappings.tsv`. Only that folder is removed (the same path that is
    printed), so sibling <gen3_dd> folders that do have a mapping file are kept.
    Non-directory entries at any level are ignored.

    Args:
        root_folder: path of the root directory to clean up.

    Warning:
        Deletion uses shutil.rmtree and is permanent.
    """
    for mutated_dd_path in _iter_subdirectories(root_folder):
        for original_dd_path in _iter_subdirectories(mutated_dd_path):
            for prop_folder_path in _iter_subdirectories(original_dd_path):
                for gen3_dd_path in _iter_subdirectories(prop_folder_path):
                    mapping_path = os.path.join(gen3_dd_path, "expected_mappings.tsv")
                    if os.path.exists(mapping_path):
                        continue
                    print(
                        f"Removing folder (no expected_mappings.tsv): {gen3_dd_path} "
                    )
                    # Remove the folder that was checked and logged. Removing
                    # prop_folder_path here would also discard sibling folders
                    # that still hold a valid expected_mappings.tsv.
                    shutil.rmtree(gen3_dd_path)

In [None]:
# Destructive cleanup: uses shutil.rmtree to permanently delete folders under
# training_files_dir when an expected_mappings.tsv is missing. Back up first if needed.
remove_property_folders_without_mapping(training_files_dir)

In [None]:
# import concurrent.futures
# import os
# import logging


# def _process_single_mutated_dir(
#     entry,
#     synth_model_to_original,
#     output_dir,
#     force_recreation_of_synth_data_info,
#     force_recreation_of_synth_data_contributions,
# ):
#     for data_model_dir in os.scandir(entry.path):
#         if data_model_dir.is_dir() and os.path.basename(data_model_dir).endswith(
#             "_tsvs"
#         ):
#             logging.debug(f"processing data model directory: {data_model_dir.path}")
#             for data_contribution_dir in os.scandir(data_model_dir):
#                 if data_contribution_dir.is_dir():
#                     logging.debug(
#                         f"processing data contribution directory: {os.path.basename(data_contribution_dir.path)}"
#                     )
#                     persist_synth_data_info_to_contribution_directory(
#                         data_contribution_dir.path,
#                         synth_model_to_original,
#                         force_recreation=force_recreation_of_synth_data_info,
#                     )
#                     process_synth_data_contribution_directory(
#                         data_contribution_dir.path,
#                         output_dir,
#                         gen3_dd_schemas,
#                         force_recreation=force_recreation_of_synth_data_contributions,
#                     )


# def output_training_data_for_directory(
#     input_dir,
#     output_dir,
#     synth_model_to_original,
#     force_recreation_of_synth_data_info=False,
#     force_recreation_of_synth_data_contributions=False,
#     num_workers=14,
# ):
#     with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
#         futures = []
#         for entry in os.scandir(input_dir):
#             if entry.is_dir() and os.path.basename(entry).startswith("mutated_"):
#                 logging.warning(f"processing: {entry.path}")
#                 futures.append(
#                     executor.submit(
#                         _process_single_mutated_dir,
#                         entry,
#                         synth_model_to_original,
#                         output_dir,
#                         force_recreation_of_synth_data_info,
#                         force_recreation_of_synth_data_contributions,
#                     )
#                 )
#         # Optionally, wait for all to complete:
#         for future in concurrent.futures.as_completed(futures):
#             try:
#                 future.result()
#             except Exception as exc:
#                 logging.error(f"Error in parallel worker: {exc}")

# output_training_data_for_directory(input_dir, output_dir, synth_model_to_original)

In [None]:
# def output_mappings_from_root_directory(gen3_dd_schemas, input_dir, output_dir):
#     # Mirror directory structure and process files
#     for root, dirs, files in os.walk(input_dir):
#         for dir in dirs:
#             if "mutated_" not in dir:
#                 continue

#             mirrored_path = os.path.join(output_dir, dir)
#             os.makedirs(mirrored_path, exist_ok=True)

#             full_path = os.path.join(root, dir)
#             print(f"Handling dir: {full_path}")
#             output_mappings_for_directory(
#                 gen3_dd_schemas, full_path,
#                 mirrored_path,
#             )

In [None]:
# output_mappings_from_root_directory(gen3_dd_schemas, input_dir, output_dir)

Format of output:

- harmonization_training_Mutated_SDCs_v3_20250423_v0.0.2
  - mutated_synthetic_data_with_specific_parameters
    - SDM_synthetic_data_model_0
      - synthetic_data_contribution_0
        - original_real_source_data_model_0
          - expected_mappings.tsv
        - original_real_source_data_model_1
          - expected_mappings.tsv
        - ...
      - ...
    - ...

`expected_mappings.tsv` has the following columns: 

- `ai_model_node_prop_desc`
    - The AI-model-generated node property (i.e. the property we want to map/harmonize to a target model)
- `harmonized_model_node_prop_desc`
    - The source-of-truth target model node property (i.e. the property from the target model that we know the above should map to)

The target model itself is identified by the `original_real_source_data_model_*` folder that contains the expected mappings.

#### How do we know the proper mapping?

We took mutated synthetic data from our first algorithm and traced backwards.

This is how we got the mutated data in the first place:

27 Real Gen3 Data Models -> 10k Synthetic Data Models -> Synthetic Data Contributions -> Mutated Synthetic Data Contributions

We were able to trace from the mutations back to the synthetic model, and from the synthetic model we were able to determine
the set of possible mappings for that particular property back to the real data models.

> Key point: A single mutated property could potentially map back to `n` original real data models, so we collect every option.

## Construct Single Benchmark Test File

TBD
<!-- 
Now let's create a JSONL file with a test per row.

The JSONL file should have 3 columns: `input_source_model`, `input_target_model`, `harmonized_mapping`

Those 3 columns should be populated by content of the files:

- `*__ai_model_output.json` == `input_source_model`
- `expected_mappings.tsv` == `harmonized_mapping`
- `harmonized_data_model.json` == `input_target_model` -->

In [None]:
# def create_jsonl_from_structure(root_dir, output_jsonl_path):
#     """
#     Iterates through subfolders under root_dir and writes a single JSONL file
#     with input_source_model, input_target_model, harmonized_mapping fields.
#     """
#     records = []
#     for dirpath, dirnames, filenames in os.walk(root_dir):
#         # Find the first *__ai_model_output.json file in this directory
#         print(f"Current dir: {dirpath}")
#         print(f"Files in dir: {filenames}")

#         ai_model_files = glob.glob(os.path.join(dirpath, "*ai_model_output*"))
#         expected_mappings_path = os.path.join(dirpath, "expected_mappings.tsv")
#         target_harmoized_model_path = os.path.join(dirpath, "harmonized_data_model.json")

#         if (
#             ai_model_files
#             and os.path.isfile(expected_mappings_path)
#             and os.path.isfile(target_harmoized_model_path)
#         ):
#             ai_model_path = ai_model_files[0]
#             # Read files
#             with open(ai_model_path, "r", encoding="utf-8") as f:
#                 input_source_model = f.read()
#             with open(expected_mappings_path, "r", encoding="utf-8") as f:
#                 harmonized_mapping = f.read()
#             with open(target_harmoized_model_path, "r", encoding="utf-8") as f:
#                 input_target_model = f.read()
#             # Append JSON record
#             record = {
#                 "input_source_model": input_source_model,
#                 "input_target_model": input_target_model,
#                 "harmonized_mapping": harmonized_mapping,
#             }
#             records.append(record)

#     # Write to JSONL file
#     print(f"Test count: {len(records)}")
#     with open(output_jsonl_path, "w", encoding="utf-8") as fout:
#         for record in records:
#             fout.write(json.dumps(record) + "\n")

In [None]:
# output_json_filepath = os.path.join(output_dir, "output.jsonl")
# create_jsonl_from_structure(output_dir, output_json_filepath)

### Test ability to get original files from JSONL

In [None]:
# def get_files_from_harmonization_benchmark_jsonl_row(row_dict, output_dir, row_index):
#     """
#     Extracts the desired content from the row,
#     creates a per-row output subdirectory,
#     and writes each file into that subdirectory.
#     Returns a dict of {filename: content} for that row.
#     """
#     # Make a subdirectory for this row
#     row_folder = os.path.join(output_dir, f"row_{row_index}")
#     os.makedirs(row_folder, exist_ok=True)

#     files = {
#         "restored__ai_model_output.json": row_dict["input_source_model"],
#         "expected_mappings.tsv": row_dict["harmonized_mapping"],
#         "harmonized_data_model.json": row_dict["input_target_model"],
#     }
#     for filename, contents in files.items():
#         with open(os.path.join(row_folder, filename), "w", encoding="utf-8") as f:
#             f.write(contents)
#     return files


# def process_harmonization_benchmark_jsonl(jsonl_path, output_dir):
#     """
#     Reads the JSONL file and calls the row handler for each row.
#     """
#     with open(jsonl_path, "r", encoding="utf-8") as f:
#         for idx, line in enumerate(f):
#             row = json.loads(line)
#             _ = get_files_from_harmonization_benchmark_jsonl_row(row, output_dir, idx)

In [None]:
# process_harmonization_benchmark_jsonl(
#     output_json_filepath, "../output/temp/harmonization_training"
# )