# Create Harmonization Benchmark To GDC Data Dictionary

This notebook utilizes data from the following paper:

* Yurong Liu, Eduardo H. M. Pena, Aécio Santos, Eden Wu, and Juliana Freire. 2025. Magneto: Combining Small and Large Language Models for Schema Matching. Proc. VLDB Endow. 18, 8 (April 2025), 2681–2694. https://doi.org/10.14778/3742728.3742757

## Setup

In [None]:
# Optional developer convenience: load IPython's autoreload extension so that edits to
# related *.py files are picked up by this notebook without restarting the kernel.
# Mode 2 asks autoreload to re-import modules before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import os

## Get benchmark data

Set input and output directories

In [None]:
# Benchmark input and output locations. The relative paths are resolved to absolute
# ones against the kernel's current working directory at the time this cell runs.
input_dir = os.path.abspath("../datasets/harmonization_benchmark_real_GDC/inputs")
output_dir = os.path.abspath("../datasets/harmonization_benchmark_real_GDC/outputs")

Get the GDC Data Dictionary as the target model

In [None]:
url = "https://api.gdc.cancer.gov/v0/submission/_dictionary/_all"
target_model_path = os.path.abspath(input_dir) + "/target_model_GDC.json"
os.makedirs(os.path.dirname(target_model_path), exist_ok=True)
!wget -q -O "{target_model_path}" "{url}"

Get the 10 source CSVs from the paper to use as source tables

In [None]:
source_tables = [
    ("1MyQOryVm3S0iBz3-uqAC_bPqZjMtS6IA", "Cao.csv"), # pragma: allowlist secret
    ("1N3rbTHtnVDe19kMNei0opy_g-8Hr_Hl5", "Clark.csv"), # pragma: allowlist secret
    ("1Ml-lY2LnAwpFpgHGeE7R2qqWRxBVLso9", "Dou.csv"), # pragma: allowlist secret
    ("1Nac7mZR_reZPdK5zghI5Y3pEKq8VPTRQ", "Gilette.csv"), # pragma: allowlist secret
    ("1NIFT5dHcguZ1vzbQ_qz1tIhNDx-QENSe", "Huang.csv"), # pragma: allowlist secret
    ("1MjNgXn-peUUaSadqIcWlqszECgxw7-ux", "Krug.csv"), # pragma: allowlist secret
    ("1ND-qu_62kGtz98O23AMFId4SHQX5GPzJ", "McDermott.csv"), # pragma: allowlist secret
    ("1NE13PtlXR6w2wRXyZrY6ar2lUXeLUw1-", "Satpathy.csv"), # pragma: allowlist secret
    ("1MxEwZbz-31bQqM8ECIKnrClSQNTNwwpY", "Vasaikar.csv"), # pragma: allowlist secret
    ("1NgEsOT7jPdCll0Q3iQ_tuAMqe8L2XFBE", "Wang.csv") # pragma: allowlist secret
]

source_tables_path = os.path.abspath(input_dir) + "/source_tables"
os.makedirs(source_tables_path, exist_ok=True)


for id, name in source_tables:
    url = f"https://drive.google.com/uc?export=download&id={id}"
    !wget -q --no-check-certificate "{url}" -O "{source_tables_path}/{name}"

Get the 10 ground-truth mapping CSVs from the paper to use as source mappings

In [None]:
source_mappings = [
    ("1c64T1cq09T6WmOIIMGRO6yglIRBDDaYP", "Cao.csv"), # pragma: allowlist secret
    ("10pzRiZWuhE_jfNAm7D8XzKzM7ebJgbyj", "Clark.csv"), # pragma: allowlist secret
    ("1vqL5HhFT6SxptQu7FyLidJnn2VKb4UMg", "Dou.csv"), # pragma: allowlist secret
    ("1S0Fe2YlcqNhO1aFMwnePLjKVm1LPVDL8", "Gilette.csv"), # pragma: allowlist secret
    ("1Jy3FIE8jcrNiNlyXsoQIo86nfGVSeAsL", "Huang.csv"), # pragma: allowlist secret
    ("1VS27jhKjNjxPnxn4SJt3OcvbMItYSfG2", "Krug.csv"), # pragma: allowlist secret
    ("107WFZ_-kCY-Yh9MGn1Fx1N93b23be27D", "McDermott.csv"), # pragma: allowlist secret
    ("1JY5fo4Tg3b_bgp-6JHPqweCiunjpWqPe", "Satpathy.csv"), # pragma: allowlist secret
    ("1qZ_kOz9-iC8IzMSvdRHhZIc-mjrU-aSk", "Vasaikar.csv"), # pragma: allowlist secret
    ("1N8h2qwWBy8IO7QMx9ahkUE6vuhDdT6El", "Wang.csv") # pragma: allowlist secret
]

source_mappings_path = os.path.abspath(input_dir) + "/source_mappings"
os.makedirs(source_mappings_path, exist_ok=True)


for id, name in source_mappings:
    url = f"https://drive.google.com/uc?export=download&id={id}"
    !wget -q --no-check-certificate "{url}" -O "{source_mappings_path}/{name}"