# Harmonization Approach Using Abstractions

## Prerequisites

Install the project's package manager and use it to sync the required packages before running this notebook.

In [None]:
# Optional developer convenience: if you are actively editing related *.py files,
# these magics reload your changes into this notebook automatically, so you do not
# have to restart the kernel after each edit.
# `%autoreload 2` reloads all imported modules before each cell is executed.
%load_ext autoreload
%autoreload 2

## Single Benchmark Test File

Each test includes a source model (`*__ai_model_output.json`) that we want to harmonize to a target model (`harmonized_data_model.json`). The expected harmonization is given in `expected_mappings.tsv`.

The benchmark is a single JSONL file with one test per row.

The JSONL file has 3 columns: `input_source_model`, `input_target_model`, `harmonized_mapping`

Those 3 columns should be populated by content of the files:

- `*__ai_model_output.json` == `input_source_model`
- `harmonized_data_model.json` == `input_target_model`
- `expected_mappings.tsv` == `harmonized_mapping`

In [None]:
import os
import json
import time

from harmonization.jsonl import (
    split_harmonization_jsonl_by_input_target_model,
    jsonl_to_csv,
)
from harmonization.harmonization_benchmark import get_metrics_for_approach
from harmonization.harmonization_approaches.similarity_inmem import (
    SimilaritySearchInMemoryVectorDb,
)
from harmonization.harmonization_approaches.embeddings import (
    MedGemmaEmbeddings,
    QwenEmbeddings
)
from langchain_huggingface import HuggingFaceEmbeddings

`output.jsonl` contains 710 lines; `limited_output.jsonl` contains only the first 10 lines of `output.jsonl`.
`limited_output.jsonl` is useful for quick local testing.

In [None]:
# Benchmark input file to evaluate.
# `limited_output.jsonl` is the small sample used by default; set the flag to
# True to run against the full `output.jsonl` benchmark instead.
USE_FULL_BENCHMARK = False

benchmark_dataset_dir = "../datasets/harmonization_benchmark_SDCs_27_Gen3_DMs_mutated_v0.0.2"
benchmark_filename = "output.jsonl" if USE_FULL_BENCHMARK else "limited_output.jsonl"

# Joined with "/" (not os.path.join) so the resulting string is the same on every platform.
output_json_filepath = f"{benchmark_dataset_dir}/{benchmark_filename}"

In [None]:
# Destination directory for the per-target-model JSONL files.
output_jsonls_per_target_model_dir_path = "../output/temp/harmonization/v0.0.2/per_target"

# Split the benchmark JSONL by `input_target_model`; the benchmark loops later in
# the notebook iterate over the files found in this directory.
split_harmonization_jsonl_by_input_target_model(
    output_json_filepath,
    output_jsonls_per_target_model_dir_path,
)

> Warning: The next cells will take **a very long time** and a lot of CPU/GPU the first time you run them (took me 32 minutes on an M3 Mac), and still **a long time** (took me 20 minutes on an M3 Mac) on future runs. The first run embeds every single target data model into a persistent vectorstore on disk (also loaded in memory) as it goes. After that, every run embeds each test case's `node.property` and performs a similarity search.

In [None]:
# Root directory under which benchmark metrics are written.
output_directory = "./output/harmonization/"

# Name of this run's subfolder: the current Unix timestamp as a float
# (for example 1755028259.3249412).
folder_name = time.time()

In [None]:
# Benchmark the default similarity-search approach against every per-target-model
# JSONL file, writing one metrics file per input file into this run's folder.
run_output_directory = os.path.join(output_directory, str(folder_name))
os.makedirs(run_output_directory, exist_ok=True)

# sorted() gives a deterministic processing order (os.listdir order is arbitrary).
for file in sorted(os.listdir(output_jsonls_per_target_model_dir_path)):
    full_file_path = os.path.join(output_jsonls_per_target_model_dir_path, file)

    # Skip sub-directories and hidden files (e.g. .DS_Store): they are not benchmark data
    # and would fail JSON parsing below.
    if file.startswith(".") or not os.path.isfile(full_file_path):
        print(f"Skipping {full_file_path} (not a benchmark file)")
        continue

    print(f"Opening {full_file_path}...")

    # Use a dedicated name for the metrics path so that `output_json_filepath`
    # (the benchmark input file) is not overwritten; this keeps the earlier
    # split cell safe to re-run.
    metrics_output_filepath = os.path.join(run_output_directory, file)

    # since these files are separated by target model already, just get the first row
    with open(full_file_path, "r", encoding="utf-8") as infile:
        first_line = infile.readline()

    # An empty file has no target model to build a vectorstore from.
    if not first_line.strip():
        print(f"Skipping {full_file_path} (no rows)")
        continue

    # `input_target_model` is stored as a JSON string inside the JSON row,
    # hence the two decoding steps.
    row = json.loads(first_line)
    input_target_model = json.loads(row["input_target_model"])

    # Truncate to 62 characters because of the length limit on chromadb collection names.
    # NOTE(review): files sharing the same first 62 characters would map to the
    # same persist directory name; confirm this cannot happen for this dataset.
    harmonization_approach = SimilaritySearchInMemoryVectorDb(
        vectordb_persist_directory_name=f"{file[:62]}",
        input_target_model=input_target_model,
    )

    output_filename = get_metrics_for_approach(
        full_file_path,
        harmonization_approach,
        metrics_output_filepath,
        metrics_column_name="custom_metrics",
    )
    print(f"Output metrics to {metrics_output_filepath}")

## Use of MedGemma and Qwen embeddings

Test embeddings on small text inputs

In [None]:
# Smoke-test an embedding model on a short query before running the full benchmark.
# MedGemmaEmbeddings and QwenEmbeddings are already imported in the imports cell
# at the top of the notebook, so they are not imported again here.

# MedGemma
# medgemma_embedder = MedGemmaEmbeddings()
# emb_m = medgemma_embedder.embed_query("heart disease")
# print(f"MedGemma embedding: {len(emb_m)} dimensions, first values: {emb_m[:5]}")

# Qwen3 (0.6B or 8B)
qwen_embedder = QwenEmbeddings(model_name="Qwen/Qwen3-Embedding-0.6B")
emb_q = qwen_embedder.embed_query("heart disease")

# Print a short summary instead of the whole vector to keep the cell output readable.
# NOTE(review): assumes embed_query returns a flat sequence of floats that
# supports len() and slicing - confirm against the embeddings implementation.
print(f"Qwen embedding: {len(emb_q)} dimensions, first values: {emb_q[:5]}")

Choose the desired embedding model by uncommenting exactly one line (and commenting out the others).

In [None]:
# Embedding model used by the benchmark loop that follows. Keep exactly one
# `embedding_fn` assignment active; the commented lines are the alternatives.
embedding_model_name = "Qwen/Qwen3-Embedding-0.6B"
embedding_fn = QwenEmbeddings(model_name=embedding_model_name)

# embedding_fn = QwenEmbeddings(model_name="Qwen/Qwen3-Embedding-4B")
# embedding_fn = MedGemmaEmbeddings(model_name="google/medgemma-4b-pt")
# embedding_fn = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

The only difference from the earlier benchmark loop (code cell 6) is that we pass `embedding_function` and `force_vectorstore_recreation=True` to `SimilaritySearchInMemoryVectorDb`.

In [None]:
# Benchmark the similarity-search approach with the selected `embedding_fn`
# against every per-target-model JSONL file, writing one metrics file per input.
#
# NOTE(review): the metrics paths and the vectorstore persist directory names are
# built exactly as in the default-embedding benchmark loop earlier in the notebook.
# If both loops run in the same session, confirm how get_metrics_for_approach
# treats an already existing output file, and that recreating the vectorstore
# here does not affect later runs of the default-embedding loop.
run_output_directory = os.path.join(output_directory, str(folder_name))
os.makedirs(run_output_directory, exist_ok=True)

# sorted() gives a deterministic processing order (os.listdir order is arbitrary).
for file in sorted(os.listdir(output_jsonls_per_target_model_dir_path)):
    full_file_path = os.path.join(output_jsonls_per_target_model_dir_path, file)

    # Skip sub-directories and hidden files (e.g. .DS_Store): they are not benchmark data
    # and would fail JSON parsing below.
    if file.startswith(".") or not os.path.isfile(full_file_path):
        print(f"Skipping {full_file_path} (not a benchmark file)")
        continue

    print(f"Opening {full_file_path}...")

    # Use a dedicated name for the metrics path so that `output_json_filepath`
    # (the benchmark input file) is not overwritten; this keeps the earlier
    # split cell safe to re-run.
    metrics_output_filepath = os.path.join(run_output_directory, file)

    # since these files are separated by target model already, just get the first row
    with open(full_file_path, "r", encoding="utf-8") as infile:
        first_line = infile.readline()

    # An empty file has no target model to build a vectorstore from.
    if not first_line.strip():
        print(f"Skipping {full_file_path} (no rows)")
        continue

    # `input_target_model` is stored as a JSON string inside the JSON row,
    # hence the two decoding steps.
    row = json.loads(first_line)
    input_target_model = json.loads(row["input_target_model"])

    # Truncate to 62 characters because of the length limit on chromadb collection names.
    # The vectorstore is recreated on every run (force_vectorstore_recreation=True)
    # and is built with the explicitly chosen embedding function.
    harmonization_approach = SimilaritySearchInMemoryVectorDb(
        vectordb_persist_directory_name=f"{file[:62]}",
        input_target_model=input_target_model,
        embedding_function=embedding_fn,
        force_vectorstore_recreation=True,
    )

    output_filename = get_metrics_for_approach(
        full_file_path,
        harmonization_approach,
        metrics_output_filepath,
        metrics_column_name="custom_metrics",
    )
    print(f"Output metrics to {metrics_output_filepath}")

### Example conversion to CSVs

In [None]:
# Optional example: convert every metrics JSONL file of a finished run to CSV.
# This cell is intentionally left commented out. To use it, uncomment the lines
# below and replace the timestamp with the name of the run folder to convert.
# output_directory = "./output/harmonization/"
# output_directory = os.path.join(
#     output_directory, "1755028259.3249412"
# )  # REPLACE with folder you want
# for file in os.listdir(output_directory):
#     full_file_path = os.path.abspath(os.path.join(output_directory, file))
#     csv_path = full_file_path.replace(".jsonl", ".csv")
#     jsonl_to_csv(jsonl_path=full_file_path, csv_path=csv_path)