# Create Harmonization Benchmark

We will use the same source of data as we did for the 1st objective benchmark (27 Real Gen3 Data Dictionaries with mutations).

In [None]:
# Optional developer convenience: if you are actively editing related *.py files (for example the
# imported `harmonization` package), autoreload mode 2 re-imports changed modules before each cell
# executes, so edits are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import glob
import json
import shutil
import logging

from harmonization.utils import TEMP_DIR, get_data_model_as_node_prop_descriptions, get_gen3_json_schemas_and_templates

# Maps a Gen3 domain key (the part of a benchmark file name before the first "__") to the Gen3
# instance that load_gen3_dd_schemas() contacts when no schema file is found on disk.
# NOTE(review): the value here includes the URL scheme; keep that in sync with how
# load_gen3_dd_schemas() builds the request URL.
GEN3_DOMAIN_MAPPING = {
    "kidsfirst": "https://data.kidsfirstdrc.org",
}
# Domain keys whose data dictionary is read from a local JSON file (path relative to the working
# directory) instead of being fetched from a deployed Gen3 instance.
# NOTE(review): the wget cell also downloads kidsfirst_schema.json, account_schema.json and
# canine_schema.json, which have no entry here — confirm whether that is intentional.
GEN3_MANUAL_DD_PATH = {
    "microbiome": "microbiome_schema.json",
    "hnc": "hnc_schema.json",
    "genomel": "genomel_schema.json",
    "rerf": "rerf_schema.json",
    "pdp": "pdp_schema.json",
    "ibdgc": "ibdgc_schema.json",
    "mmrf": "mmrf_schema.json",
    "toxdatacommons": "toxdatacommons_schema.json",
}

Manually retrieve some data dictionaries instead of relying on a deployed Gen3 instance with API.

In [None]:
# Download data-dictionary schemas into the working directory (versioned S3 artifacts plus the
# toxdatacommons dictionary endpoint). The output file names are the relative paths that
# GEN3_MANUAL_DD_PATH points at. Requires network access and `wget` on PATH.
# NOTE(review): kidsfirst_schema.json, account_schema.json and canine_schema.json are downloaded
# here but are not referenced in GEN3_MANUAL_DD_PATH.
!wget "https://s3.amazonaws.com/dictionary-artifacts/mmrf_dictionary/0.0.5/schema.json" -O mmrf_schema.json
!wget "https://s3.amazonaws.com/dictionary-artifacts/rerf_dictionary/0.4.1/schema.json" -O rerf_schema.json
!wget "https://s3.amazonaws.com/dictionary-artifacts/pdp_dictionary/1.0.0/schema.json" -O pdp_schema.json
!wget "https://s3.amazonaws.com/dictionary-artifacts/kf-dictionary/1.2.3/schema.json" -O kidsfirst_schema.json
!wget "https://s3.amazonaws.com/dictionary-artifacts/genomel-dictionary/0.3.2/schema.json" -O genomel_schema.json
!wget "https://s3.amazonaws.com/dictionary-artifacts/ibdgc-dictionary/1.6.10/schema.json" -O ibdgc_schema.json
!wget "https://s3.amazonaws.com/dictionary-artifacts/microbiome_datadictionary/1.1.1/schema.json" -O microbiome_schema.json
!wget "https://s3.amazonaws.com/dictionary-artifacts/acctdictionary/0.7.1/schema.json" -O account_schema.json
!wget "https://s3.amazonaws.com/dictionary-artifacts/canine_dictionary/1.1.0/schema.json" -O canine_schema.json
!wget "https://s3.amazonaws.com/dictionary-artifacts/hnc_dictionary/0.1.0/schema.json" -O hnc_schema.json
!wget "https://toxdatacommons.com/api/v0/submission/_dictionary/_all" -O toxdatacommons_schema.json

In [None]:
# Obtain mutated real data dicts: https://uchicago.app.box.com/folder/320103463138?s=a58qiodbv0grnxq0h94ronz5qa5xlmpc
# and place them at input_dir before running the cells below.
# input_dir:  root folder that is searched for "mutated_*" sub-directories.
# output_dir: root folder the benchmark dataset (and, later, output.jsonl) is written to.
input_dir = "../data/benchmark_SDCs_27_Gen3_DMs_plus_mutated"
output_dir = "../datasets/harmonization_benchmark_SDCs_27_Gen3_DMs_mutated_v0.0.2"

In [None]:
def load_gen3_dd_schemas(domain):
    """
    Load the Gen3 data dictionary schema for `domain` and return it as parsed JSON.

    The schema is read from the manually downloaded file listed in
    GEN3_MANUAL_DD_PATH when the domain has an entry there, otherwise from
    `{TEMP_DIR}/{domain}/Unmodified/schema.json`. If that file does not exist yet,
    get_gen3_json_schemas_and_templates() is asked to fetch it from the Gen3
    instance first.

    Args:
        domain: Gen3 domain key; either a key of GEN3_MANUAL_DD_PATH /
            GEN3_DOMAIN_MAPPING or the host name of a Gen3 instance.

    Returns:
        The parsed content of the schema file.

    Raises:
        OSError: if the schema file is still missing after the fetch attempt.
        json.JSONDecodeError: if the schema file is not valid JSON.
        Anything raised by get_gen3_json_schemas_and_templates().
    """
    schema_path = (
        GEN3_MANUAL_DD_PATH.get(domain)
        or f"{TEMP_DIR}/{domain}/Unmodified/schema.json"
    )

    if not os.path.exists(schema_path):
        real_domain = GEN3_DOMAIN_MAPPING.get(domain, domain)
        # GEN3_DOMAIN_MAPPING values may already be complete URLs; only prepend the
        # scheme for bare host names so we never request "https://https://...".
        base_url = real_domain if "://" in real_domain else f"https://{real_domain}"
        get_gen3_json_schemas_and_templates(base_url, TEMP_DIR)

    with open(schema_path, "r", encoding="utf-8") as file:
        return json.load(file)

In [None]:
def _build_expected_mapping_lines(
    var_map_data, ai_model_node_props, gen3_dd_schema, gen3_domain
):
    """Return the expected-mapping TSV lines (header first) for one var_map file."""
    tsv_lines = ["ai_model_node_prop_desc\tharmonized_model_node_prop_desc"]

    for harmonized_key, value in var_map_data.items():
        # the varmap data comes in 3 forms: node, node.property, and
        # node.foreign_key_node.foreign_key_property. Only node.property entries are
        # mapped at this point, so ignore the other two.
        if harmonized_key.count(".") != 1:
            continue

        new_name = value.get("new_name", "")
        new_description = value.get("new_description", "").strip()

        # The index is keyed on description; the name lookup is kept as a fallback.
        full_node_prop = ai_model_node_props.get(
            new_description, ""
        ) or ai_model_node_props.get(new_name, "")
        if not full_node_prop:
            logging.debug(
                f"can't find: {new_name} by description: {new_description} in output. skipping..."
            )
            continue

        # Look up the original description in the Gen3 dictionary
        domain_element, domain_property = harmonized_key.split(".")
        gen3_node = gen3_dd_schema.get(domain_element, {})
        if not gen3_node:
            logging.debug(f"{domain_element} not in {gen3_domain}")
            continue

        # .get() so dictionary entries without a "properties" section are skipped
        # instead of aborting the whole run with a KeyError
        gen3_node_prop = gen3_node.get("properties", {}).get(domain_property, {})
        if not gen3_node_prop:
            # log the node *name*, not the whole node definition
            logging.warning(
                f"{domain_property} not in {domain_element} in {gen3_domain}"
            )
            continue

        # Tabs and newlines would break the one-row-per-line TSV layout
        gen3_description = (
            gen3_node_prop.get("description", "")
            .replace("\t", "    ")
            .replace("\n", " ")
        )
        tsv_lines.append(f"{full_node_prop}\t{harmonized_key}: {gen3_description}")

    return tsv_lines


def output_mappings_for_directory(input_dir, output_dir):
    """
    Mirror `input_dir` into `output_dir` and write the benchmark files per directory.

    For every directory below `input_dir` (including `input_dir` itself):

    * each `<domain>__jsonschema_dd__...__jsonschema_dd.json` file is copied to the
      mirrored directory as `...__ai_model_output.json`, and the Gen3 dictionary for
      `<domain>` (via load_gen3_dd_schemas) is written next to it as
      `harmonized_data_model.json`;
    * each `<domain>__...__var_map.json` file is turned into `expected_mappings.tsv`,
      pairing the AI-model node property (matched by its description) with the
      original `node.property: description` from the Gen3 dictionary.

    Schema files are processed before var_map files, so the result does not depend
    on the order in which os.walk lists the files. `expected_mappings.tsv` is only
    written when at least one mapping row was found; directories without one are
    left for remove_property_folders_without_mapping() to clean up.

    Args:
        input_dir: directory containing the mutated data dictionaries.
        output_dir: directory the mirrored structure is written to (created if needed).
    """
    for root, _dirs, files in os.walk(input_dir):
        relative_path = os.path.relpath(root, input_dir)
        mirrored_path = os.path.join(output_dir, relative_path)
        os.makedirs(mirrored_path, exist_ok=True)

        gen3_dd_schemas = {}
        # description -> full "node.property: description" string of the AI model
        ai_model_node_props = {}

        schema_files = [
            name
            for name in files
            if "__jsonschema_dd__" in name and name.endswith("__jsonschema_dd.json")
        ]
        var_map_files = [name for name in files if name.endswith("__var_map.json")]

        # Pass 1: copy the AI model outputs, dump the harmonized model, build the index
        for file_name in schema_files:
            gen3_domain = file_name.split("__")[0]
            if gen3_domain not in gen3_dd_schemas:
                try:
                    gen3_dd_schemas[gen3_domain] = load_gen3_dd_schemas(gen3_domain)
                except Exception:
                    # Best effort: domains without a retrievable dictionary are skipped
                    logging.debug(
                        f"Could not get Gen3 DD for: {gen3_domain}. Skipping...",
                        exc_info=True,
                    )
                    continue

            output_harmonized_model_path = os.path.join(
                mirrored_path, "harmonized_data_model.json"
            )
            with open(output_harmonized_model_path, "w", encoding="utf-8") as dd_file:
                json.dump(gen3_dd_schemas[gen3_domain], dd_file)

            # Rename JSON schema files
            new_name = file_name.replace("__jsonschema_dd__", "__").replace(
                "__jsonschema_dd.json", "__ai_model_output.json"
            )
            source_path = os.path.join(root, file_name)
            shutil.copy(source_path, os.path.join(mirrored_path, new_name))

            with open(source_path, "r", encoding="utf-8") as ai_model_output_file:
                ai_model_output = json.load(ai_model_output_file)

            for node_prop in get_data_model_as_node_prop_descriptions(ai_model_output):
                # everything after the first ":" is the description
                _prop, _sep, description = node_prop.partition(":")
                description = description.strip()
                # key on description b/c that's what's in the mapping we can rely on;
                # an empty description identifies nothing and would cause false matches
                if description:
                    ai_model_node_props[description] = node_prop

        # Pass 2: transform var_map.json to expected_mappings.tsv
        for file_name in var_map_files:
            gen3_domain = file_name.split("__")[0]
            if gen3_domain not in gen3_dd_schemas:
                continue

            with open(
                os.path.join(root, file_name), "r", encoding="utf-8"
            ) as var_map_file:
                var_map_data = json.load(var_map_file)

            tsv_lines = _build_expected_mapping_lines(
                var_map_data,
                ai_model_node_props,
                gen3_dd_schemas[gen3_domain],
                gen3_domain,
            )

            expected_mappings_path = os.path.join(mirrored_path, "expected_mappings.tsv")
            # The first line is always the header, so "has mappings" means > 1 line
            if len(tsv_lines) > 1:
                with open(expected_mappings_path, "w", encoding="utf-8") as tsv_file:
                    tsv_file.write("\n".join(tsv_lines))
            else:
                logging.warning(f"No mappings for {mirrored_path}")
                # Do not leave a file from an earlier run behind
                if os.path.exists(expected_mappings_path):
                    os.remove(expected_mappings_path)

    print(f"Done. Output: {os.path.abspath(output_dir)}")

In [None]:
def output_mappings_from_root_directory(input_dir, output_dir):
    """
    Run output_mappings_for_directory() for every "mutated_" directory under `input_dir`.

    Each directory whose name contains "mutated_" is mirrored to
    `output_dir/<directory name>`. Matching directories are not searched again for
    nested matches: their whole subtree is already handled by
    output_mappings_for_directory().

    Args:
        input_dir: root folder to search.
        output_dir: root folder of the generated benchmark dataset (created if needed).
    """
    for root, dirs, _files in os.walk(input_dir):
        mutated_dirs = [name for name in dirs if "mutated_" in name]
        # Editing `dirs` in place tells os.walk not to descend into the directories
        # that are handed off below, so nothing is processed twice.
        dirs[:] = [name for name in dirs if "mutated_" not in name]

        for dir_name in mutated_dirs:
            mirrored_path = os.path.join(output_dir, dir_name)
            os.makedirs(mirrored_path, exist_ok=True)

            full_path = os.path.join(root, dir_name)
            print(f"Handling dir: {full_path}")
            output_mappings_for_directory(full_path, mirrored_path)

In [None]:
# Build the benchmark dataset under output_dir from every "mutated_" directory found under input_dir.
output_mappings_from_root_directory(input_dir, output_dir)

In [None]:
import os
import shutil


def remove_property_folders_without_mapping(root_folder):
    """
    Prune property folders that did not get an expected_mappings.tsv.

    Expected layout: root_folder/mutated_dd_*/original_gen3_dd_*/<property folder>/.
    Every third-level directory lacking expected_mappings.tsv is deleted with all of
    its content; plain files at any of the three levels are left untouched.
    """

    def subdirectories(parent):
        # os.listdir() snapshots the entries once, so removing folders while the
        # caller iterates is safe.
        for entry in os.listdir(parent):
            candidate = os.path.join(parent, entry)
            if os.path.isdir(candidate):
                yield candidate

    for mutated_dd_path in subdirectories(root_folder):
        for original_dd_path in subdirectories(mutated_dd_path):
            for prop_folder_path in subdirectories(original_dd_path):
                has_mapping = os.path.exists(
                    os.path.join(prop_folder_path, "expected_mappings.tsv")
                )
                if has_mapping:
                    continue
                print(
                    f"Removing folder (no expected_mappings.tsv): {prop_folder_path} "
                )
                shutil.rmtree(prop_folder_path)

In [None]:
# Drop the property folders under output_dir that did not get an expected_mappings.tsv.
remove_property_folders_without_mapping(output_dir)

Format of output:

- root
    - mutated_dd_0
        - original_gen3_dd_0
            - original_node.property
                - `*__ai_model_output.json`
                - `expected_mappings.tsv`
                - `harmonized_data_model.json`
        - original_gen3_dd_1
        - ...
    - mutated_dd_1
    - ...

The AI Model Output simulates output from our AI model: it is the source data model we start from. The Harmonized Data Model is the target data model we harmonize to. The expected mappings are the known links between source and target node properties (e.g. columns in tables). They are known because, for each of them, we used an LLM to "mutate" the name and description from the harmonized model in order to generate the AI Model Outputs.

## Construct Single Benchmark Test File

Each test consists of a source model, `*__ai_model_output.json`, that should be harmonized to the target model, `harmonized_data_model.json`. The expected result of that harmonization is `expected_mappings.tsv`.

Now let's create a JSONL file with a test per row.

Each row of the JSONL file should have 3 fields: `input_source_model`, `input_target_model`, `harmonized_mapping`

Those 3 columns should be populated by content of the files:

- `*__ai_model_output.json` == `input_source_model`
- `harmonized_data_model.json` == `input_target_model`
- `expected_mappings.tsv` == `harmonized_mapping`

In [None]:
def create_jsonl_from_structure(root_dir, output_jsonl_path):
    """
    Collect every complete test folder under `root_dir` into one JSONL file.

    A folder is a complete test when it contains at least one `*ai_model_output*`
    file, `expected_mappings.tsv` and `harmonized_data_model.json`. Each such folder
    becomes one JSON object (one line) with the raw file contents as strings:

    * `input_source_model`  <- the `*ai_model_output*` file (first one by name)
    * `input_target_model`  <- `harmonized_data_model.json`
    * `harmonized_mapping`  <- `expected_mappings.tsv`

    Folders are visited in sorted order so the produced file is reproducible
    across runs and machines.

    Args:
        root_dir: root folder of the generated benchmark dataset.
        output_jsonl_path: path of the JSONL file to write (overwritten if present).
    """
    records = []
    for dirpath, dirnames, _filenames in os.walk(root_dir):
        # Sorting in place fixes the order in which os.walk descends
        dirnames.sort()
        logging.debug(f"Current dir: {dirpath}")

        # glob.escape() keeps special characters in folder names ("[", "*", "?")
        # from being interpreted as pattern syntax; sorted() makes "first" stable
        ai_model_files = sorted(
            glob.glob(os.path.join(glob.escape(dirpath), "*ai_model_output*"))
        )
        expected_mappings_path = os.path.join(dirpath, "expected_mappings.tsv")
        target_harmonized_model_path = os.path.join(
            dirpath, "harmonized_data_model.json"
        )

        is_complete_test = (
            ai_model_files
            and os.path.isfile(expected_mappings_path)
            and os.path.isfile(target_harmonized_model_path)
        )
        if not is_complete_test:
            continue

        with open(ai_model_files[0], "r", encoding="utf-8") as f:
            input_source_model = f.read()
        with open(expected_mappings_path, "r", encoding="utf-8") as f:
            harmonized_mapping = f.read()
        with open(target_harmonized_model_path, "r", encoding="utf-8") as f:
            input_target_model = f.read()

        records.append(
            {
                "input_source_model": input_source_model,
                "input_target_model": input_target_model,
                "harmonized_mapping": harmonized_mapping,
            }
        )

    # Write to JSONL file
    print(f"Test count: {len(records)}")
    with open(output_jsonl_path, "w", encoding="utf-8") as fout:
        for record in records:
            fout.write(json.dumps(record) + "\n")

In [None]:
# Collect every complete test folder under output_dir into a single JSONL file stored in output_dir itself.
output_json_filepath = os.path.join(output_dir, "output.jsonl")
create_jsonl_from_structure(output_dir, output_json_filepath)

### Test ability to get original files from JSONL

In [None]:
def get_files_from_harmonization_benchmark_jsonl_row(row_dict, output_dir, row_index):
    """
    Restore the three benchmark files of one JSONL row.

    The files are written to `output_dir/row_<row_index>/` (created if needed):
    `restored__ai_model_output.json`, `expected_mappings.tsv` and
    `harmonized_data_model.json`.

    Returns:
        dict mapping each written file name to its content.

    Raises:
        KeyError: if the row lacks one of the three expected fields.
    """
    target_dir = os.path.join(output_dir, f"row_{row_index}")
    os.makedirs(target_dir, exist_ok=True)

    # (file name to write, JSONL field holding its content)
    file_to_field = (
        ("restored__ai_model_output.json", "input_source_model"),
        ("expected_mappings.tsv", "harmonized_mapping"),
        ("harmonized_data_model.json", "input_target_model"),
    )
    restored = {name: row_dict[field] for name, field in file_to_field}

    for name in restored:
        destination = os.path.join(target_dir, name)
        with open(destination, "w", encoding="utf-8") as handle:
            handle.write(restored[name])

    return restored


def process_harmonization_benchmark_jsonl(jsonl_path, output_dir):
    """
    Unpack every row of a benchmark JSONL file into its own sub-directory.

    Each line is parsed as JSON and handed, together with its zero-based line
    index, to get_files_from_harmonization_benchmark_jsonl_row().
    """
    with open(jsonl_path, "r", encoding="utf-8") as jsonl_file:
        row_index = 0
        for raw_line in jsonl_file:
            parsed_row = json.loads(raw_line)
            get_files_from_harmonization_benchmark_jsonl_row(
                parsed_row, output_dir, row_index
            )
            row_index += 1

In [None]:
# Round-trip check: unpack every JSONL row back into files under ../output/temp/harmonization/row_<index>/.
process_harmonization_benchmark_jsonl(
    output_json_filepath, "../output/temp/harmonization"
)