# Create Harmonization Benchmark

Requires the data from https://uchicago.app.box.com/folder/341601784541. Place its contents in a `real_benchmarks` folder, and put the "combined_data_models" into a separate folder, `./real_benchmarks/_combo_models`.


In [None]:
import copy
import csv
import glob
import io
import json
import os

In [None]:
# Obtain mutated real data dicts: https://uchicago.app.box.com/folder/320103463138?s=a58qiodbv0grnxq0h94ronz5qa5xlmpc
# and place them in the input directories configured below.
# Relative paths are resolved against the current working directory by os.path.abspath.
# Directory holding the combined (source + gen3_* annotated) data models.
input_combo_models_dir = os.path.abspath("../real_benchmarks/_combo_models")
# Directory holding the original source data models.
input_source_models_dir = os.path.abspath("../real_benchmarks/source_models")
# The single target model that every source model is paired with.
target_model_path = os.path.abspath("./examples/example_target_model_BDC.json")
# Root directory the benchmark files are written under.
output_dir = os.path.abspath(
    "../datasets/harmonization_benchmark_real_BIOLINCC_BDC_v0.0.1"
)

In [None]:
# Manually fill this out: maps each source model path to the path of its corresponding combined model.
# NOTE(review): the first key uses `source_model_phs...` (single underscore) while the
# others use `source_model__phs...` — confirm this matches the actual filename on disk.
source_model_to_combo_model = {
    f"{input_source_models_dir}/source_model_phs003738.v1.p1.c1_BioLINCC-BL_ARIC_HMB-NPU-MDS.json": f"{input_combo_models_dir}/combined_data_model__phs003738.v1.p1.c1_BioLINCC-BL_ARIC_HMB-NPU-MDS.json",
    f"{input_source_models_dir}/source_model__phs003948.v1.p1.c1_Individual_Study-PATH_HHT_DS-HHT-IRB-PUB-COL.json": f"{input_combo_models_dir}/combined_data_model__phs003948.v1.p1.c1_Individual_Study-PATH_HHT_DS-HHT-IRB-PUB-COL.json",
    f"{input_source_models_dir}/source_model__phs004055.v1.p1.c1_BioLINCC-BL_CONCERT_HF_GRU.json": f"{input_combo_models_dir}/combined_data_model__phs004055.v1.p1.c1_BioLINCC-BL_CONCERT_HF_GRU.json",
}

In [None]:
def _model_entity_name(entity):
    """Return a node's or property's name, tolerating the malformed ``"name:"`` key."""
    return entity.get("name", entity.get("name:", ""))


def _phs_id_from_path(source_model_path):
    """Derive the ``phs...`` study identifier from a source model filename."""
    filename = os.path.basename(source_model_path)
    # maxsplit=1 keeps the remainder intact should "phs" occur again later in the name
    return "phs" + filename.split("phs", 1)[1].split(".json")[0]


def parse_combo_model(source_model_to_combo_model, target_model_path, output_dir):
    """
    Write one benchmark folder per source model under ``output_dir``.

    For every (source model path, combined model path) pair, a subfolder named
    after the ``phs...`` identifier found in the source filename is created with:

    - ``source_model.json``: copy of the source model
    - ``target_model.json``: copy of the (single) target model
    - ``expected_mappings.tsv``: one row per source property matched to a
      combined-model property that carries ``gen3_name``, ``gen3_description``
      and ``gen3_type``

    A source property matches a combined property when name, description and
    type are all equal. Names are read through the same malformed-key fallback
    (``"name:"``) on both sides, so a property whose key is malformed in only
    one of the two files is still matched.

    Args:
        source_model_to_combo_model: dict mapping source model JSON paths to
            combined model JSON paths.
        target_model_path: path to the target model JSON file.
        output_dir: root directory for the per-study output folders.

    Raises:
        IndexError: if a source model filename does not contain ``phs``.
        KeyError: if a model lacks ``nodes``/``properties``, or a source
            property lacks ``type`` or ``description``.
    """
    # right now this assumes a single target
    with open(target_model_path, encoding="utf-8") as input_file:
        target_model = json.load(input_file)

    gen3_keys = ("gen3_name", "gen3_description", "gen3_type")

    for source_model_path, combo_model_path in source_model_to_combo_model.items():
        with open(source_model_path, encoding="utf-8") as input_file:
            source_model = json.load(input_file)
        with open(combo_model_path, encoding="utf-8") as input_file:
            combo_model = json.load(input_file)

        output_subdir = os.path.join(output_dir, _phs_id_from_path(source_model_path))
        os.makedirs(output_subdir, exist_ok=True)

        # write the source model to the output directory
        with open(
            os.path.join(output_subdir, "source_model.json"), "w", encoding="utf-8"
        ) as output_file:
            json.dump(source_model, output_file, indent=2)

        # write the target model to the output directory
        with open(
            os.path.join(output_subdir, "target_model.json"), "w", encoding="utf-8"
        ) as output_file:
            json.dump(target_model, output_file, indent=2)

        # Collect the combined-model properties that carry a gen3 mapping once,
        # instead of re-scanning and re-formatting them for every source property.
        mapped_combo_props = []
        for combo_node in combo_model["nodes"]:
            combo_node_name = _model_entity_name(combo_node)
            for combo_prop in combo_node["properties"]:
                if not all(key in combo_prop for key in gen3_keys):
                    continue
                match_key = (
                    _model_entity_name(combo_prop),
                    combo_prop.get("description"),
                    combo_prop.get("type"),
                )
                target_desc = f"{combo_node_name}.{combo_prop['gen3_name']} ({combo_prop['gen3_type']}): {combo_prop['gen3_description']}"
                mapped_combo_props.append((match_key, target_desc))

        # create a TSV file for the expected mappings
        # (UTF-8 so that it can be read back with an explicit utf-8 encoding)
        tsv_path = os.path.join(output_subdir, "expected_mappings.tsv")
        with open(tsv_path, "w", newline="", encoding="utf-8") as tsvfile:
            writer = csv.writer(tsvfile, delimiter="\t", quotechar='"')
            writer.writerow(
                ["source_node_prop_type_desc", "target_node_prop_type_desc"]
            )

            for source_node in source_model["nodes"]:
                # handle malformatted names in input
                source_node_name = _model_entity_name(source_node)
                for source_prop in source_node["properties"]:
                    source_prop_name = _model_entity_name(source_prop)
                    source_desc = f"{source_node_name}.{source_prop_name} ({source_prop['type']}): {source_prop['description']}"
                    source_key = (
                        source_prop_name,
                        source_prop.get("description"),
                        source_prop.get("type"),
                    )

                    for match_key, target_desc in mapped_combo_props:
                        if match_key == source_key:
                            writer.writerow([source_desc, target_desc])

In [None]:
# Build the per-study benchmark folders (source model, target model, expected mappings)
# under output_dir from the manually maintained source -> combined model mapping.
parse_combo_model(
    source_model_to_combo_model=source_model_to_combo_model,
    target_model_path=target_model_path,
    output_dir=output_dir,
)

Format of output:

- output_dir
    - source_target_folder_0
        - `source_model.json`
        - `expected_mappings.tsv`
        - `target_model.json`
    - ...

`target_model.json` holds the target data model to harmonize to. `expected_mappings.tsv` holds the known links between the source and target node properties (e.g. columns in tables).

## Construct Single Benchmark Test File

Each test should include a source model that is to be harmonized to a target model. The expected harmonization result is given by `expected_mappings.tsv`.

Now let's create a JSONL file with a test per row.

The JSONL file should have 3 columns: `input_source_model`, `input_target_model`, `harmonized_mapping`

Those 3 columns should be populated with the content of the following files:

- `source_model.json` == `input_source_model`
- `expected_mappings.tsv` == `harmonized_mapping`
- `target_model.json` == `input_target_model`

In [None]:
def create_jsonl_from_structure(root_dir, output_jsonl_path):
    """
    Iterates through subfolders under root_dir and writes a single JSONL file
    with input_source_model, input_target_model, harmonized_mapping fields.

    A directory contributes one record when it contains a file matching
    ``source_model*``, a file matching ``target_model*`` and a file named
    ``expected_mappings.tsv``. Directories are visited in sorted order and the
    alphabetically first match of each pattern is used, so the output does not
    depend on filesystem listing order.

    Args:
        root_dir: directory to walk recursively.
        output_jsonl_path: path of the JSONL file to write (one record per line).
    """
    records = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # sorting in place makes os.walk descend in a deterministic order
        dirnames.sort()
        print(f"Current dir: {dirpath}")
        print(f"Files in dir: {filenames}")

        # escape the directory so glob metacharacters in its name ("[", "*", "?")
        # are matched literally instead of being interpreted as a pattern
        escaped_dir = glob.escape(dirpath)
        source_model_files = sorted(
            glob.glob(os.path.join(escaped_dir, "source_model*"))
        )
        target_model_files = sorted(
            glob.glob(os.path.join(escaped_dir, "target_model*"))
        )
        expected_mappings_path = os.path.join(dirpath, "expected_mappings.tsv")

        if not (
            source_model_files
            and os.path.isfile(expected_mappings_path)
            and target_model_files
        ):
            continue

        # use the first source/target model file found in this directory
        with open(source_model_files[0], "r", encoding="utf-8") as input_file:
            input_source_model = json.load(input_file)
        with open(expected_mappings_path, "r", encoding="utf-8") as input_file:
            # kept as raw TSV text, header line included
            harmonized_mapping = input_file.read()
        with open(target_model_files[0], "r", encoding="utf-8") as input_file:
            input_target_model = json.load(input_file)

        records.append(
            {
                "input_source_model": input_source_model,
                "input_target_model": input_target_model,
                "harmonized_mapping": harmonized_mapping,
            }
        )

    print(f"Test count: {len(records)}")
    with open(output_jsonl_path, "w", encoding="utf-8") as output_file:
        for record in records:
            output_file.write(json.dumps(record) + "\n")

In [None]:
# Collect every benchmark folder under output_dir into one JSONL file (one test per line),
# written next to those folders as output.jsonl.
output_json_filepath = os.path.join(output_dir, "output.jsonl")
create_jsonl_from_structure(output_dir, output_json_filepath)

In [None]:
def harmonization_data_jsonl_to_csv(jsonl_file, csv_file, input_headers=None):
    """
    Converts a JSONL file to a CSV file.

    Headers must include: `harmonized_mapping`

    This denormalizes the harmonized mapping so each property mapped is its own row.

    The `harmonized_mapping` value is tab-separated text whose first line is a
    header. It is parsed with the csv module, so quoted fields containing tabs,
    newlines or quotes are handled. Every other header is copied unchanged into
    each output row, followed by the `source_node_prop_type_desc` and
    `target_node_prop_type_desc` columns.

    Args:
        jsonl_file: path of the JSONL file to read (one JSON object per line).
        csv_file: path of the CSV file to write.
        input_headers: keys to read from each JSON object; defaults to
            `input_source_model`, `input_target_model`, `harmonized_mapping`.
            The passed list is not modified.

    Raises:
        ValueError: if `harmonized_mapping` is not in the headers, or a mapping
            row does not have exactly two fields.
        KeyError: if a JSON object lacks one of the requested headers.
    """
    input_headers = input_headers or [
        "input_source_model",
        "input_target_model",
        "harmonized_mapping",
    ]

    if "harmonized_mapping" not in input_headers:
        raise ValueError("Headers must include: `harmonized_mapping`")

    # build a new list rather than removing in place, so the caller's list is untouched
    passthrough_headers = [
        header for header in input_headers if header != "harmonized_mapping"
    ]
    output_headers = passthrough_headers + [
        "source_node_prop_type_desc",
        "target_node_prop_type_desc",
    ]

    with open(jsonl_file, "r", encoding="utf-8") as f_in, open(
        csv_file, "w", newline="", encoding="utf-8"
    ) as f_out:
        writer = csv.writer(f_out)
        writer.writerow(output_headers)

        for line in f_in:
            if not line.strip():
                continue
            # only the JSON parsing is guarded; other errors should surface
            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid JSON line: {line.strip()} - {e}")
                continue
            if not data:
                continue

            passthrough_values = [data[header] for header in passthrough_headers]

            # newline="" hands line endings to the csv module untouched, which it
            # needs in order to parse quoted fields that span several lines
            mapping_reader = csv.reader(
                io.StringIO(data["harmonized_mapping"], newline=""),
                delimiter="\t",
                quotechar='"',
            )
            next(mapping_reader, None)  # skip the TSV header row
            for mapping_row in mapping_reader:
                if not mapping_row:
                    continue  # blank line
                if len(mapping_row) != 2:
                    raise ValueError(
                        "Expected 2 tab-separated fields per mapping row, "
                        f"got {len(mapping_row)}: {mapping_row}"
                    )
                writer.writerow(passthrough_values + mapping_row)

In [None]:
# Flatten output.jsonl into output.csv with one row per mapped property,
# using the function's default headers.
harmonization_data_jsonl_to_csv(
    f"{output_dir}/output.jsonl",
    f"{output_dir}/output.csv",
)