In [2]:
import os
import json
import glob
import pandas as pd
from datasets import Dataset, DatasetDict, Features, Value, ClassLabel, load_dataset
from huggingface_hub import HfApi, create_repo

# Function to load JSON/JSONL files
def load_json_file(file_path):
    """Load a ``.json`` or ``.jsonl`` file from disk.

    Args:
        file_path: Path to the file, as a string or any ``os.PathLike``.
            The format is chosen from the (case-sensitive) file extension.

    Returns:
        For ``.json``, whatever ``json.load`` produces for the document.
        For ``.jsonl``, a list with one decoded value per non-blank line.

    Raises:
        ValueError: If the path ends in neither ``.json`` nor ``.jsonl``.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    path = os.fspath(file_path)
    if path.endswith('.json'):
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    if path.endswith('.jsonl'):
        with open(path, 'r', encoding='utf-8') as f:
            # Skip blank/whitespace-only lines (e.g. a trailing empty line);
            # json.loads('') would otherwise raise JSONDecodeError.
            return [json.loads(line) for line in f if line.strip()]
    raise ValueError(f"Unsupported file format: {file_path}")

# Benchmark directory name -> configuration name used when publishing.
# Built from ordered pairs: the insertion order is the order in which the
# benchmarks are loaded and uploaded.
dataset_map = dict([
    ('medqa', 'MedQA'),
    ('pubmedqa', 'PubMedQA'),
    ('medmcqa', 'MedMCQA'),
    ('medbullets', 'MedBullets'),
    ('mmlu', 'MMLU'),
    ('mmlu-pro', 'MMLU-Pro'),
    ('afrimedqa', 'AfrimedQA'),
    ('medexqa', 'MedExQA'),
    ('medxpertqa-r', 'MedXpertQA-R'),
    ('medxpertqa-u', 'MedXpertQA-U'),
])


# Maps each configuration name to a DatasetDict holding its three splits.
datasets = {}

# NOTE(review): never populated or read in the cells visible here; kept only
# in case another cell still refers to it.
hard_files = []


def _load_split(dataset_key, file_name):
    """Read ``<dataset_key>/<file_name>`` and wrap its records in a ``Dataset``."""
    records = load_json_file(f"{dataset_key}/{file_name}")
    return Dataset.from_pandas(pd.DataFrame(records))


for dataset_key, dataset_name in dataset_map.items():
    # Same load order as before: test_hard, test, then the left-out hard items.
    hard_split = _load_split(dataset_key, "test_hard.jsonl")
    test_split = _load_split(dataset_key, "test.jsonl")
    leftout_split = _load_split(dataset_key, "test_hard_reimp_leftout.jsonl")

    # The left-out file may carry extra columns; drop everything that the
    # regular test split does not have.
    kept_columns = test_split.column_names
    extra_columns = [
        col for col in leftout_split.column_names if col not in kept_columns
    ]
    leftout_split = leftout_split.remove_columns(extra_columns)

    dataset_dict = DatasetDict()
    dataset_dict['test_hard'] = hard_split
    dataset_dict['test'] = test_split
    dataset_dict['test_hard_leftout'] = leftout_split

    datasets[dataset_name] = dataset_dict

repo_name = "medagents-benchmark"

# Publish each benchmark as its own named configuration (subset) of the same
# repository, reporting progress after every upload.
for config_name, split_dict in datasets.items():
    split_dict.push_to_hub(repo_id=repo_name, config_name=config_name, private=False)
    print(f"Uploaded {config_name} configuration")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 277.51ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.28it/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 80.95ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.22s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 190.96ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.15s/it]


Uploaded MedQA configuration


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 407.17ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.13it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 138.20ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.21it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 641.63ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.15it/s]


Uploaded PubMedQA configuration


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 782.81ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.35it/s]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 279.01ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.40it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 276.47ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.09it/s]


Uploaded MedMCQA configuration


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 289.90ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.37it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 103.02ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.43it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 451.34ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.06it/s]


Uploaded MedBullets configuration


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 600.22ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 305.36ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.35it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 605.15ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.22it/s]


Uploaded MMLU configuration


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 452.31ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.44it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 142.35ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.11s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 319.54ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.03it/s]


Uploaded MMLU-Pro configuration


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 694.65ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.15it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 402.02ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.09it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 834.52ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.13it/s]


Uploaded AfrimedQA configuration


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 688.38ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.31it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 256.25ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.39it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 761.22ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.29it/s]


Uploaded MedExQA configuration


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 365.17ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.40it/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 64.33ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.22it/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 90.20ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.09s/it]


Uploaded MedXpertQA-R configuration


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 412.22ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.36it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 118.34ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.44it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 170.22ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.02s/it]


Uploaded MedXpertQA-U configuration


In [None]:
output_path = "full_hard_dataset.jsonl"


def _format_hard_item(item):
    """Flatten one record into ``question``, ``answer`` and one key per option."""
    formatted_item = {
        "question": item["question"],
        "answer": item["answer_idx"],
    }
    # `options` may be missing or null for a record; treat both as "no
    # options" instead of failing on `None.items()`.
    # NOTE(review): option values are copied as-is, including any nulls --
    # confirm that is what consumers of the export expect.
    options = item.get("options") or {}
    for key, value in options.items():
        formatted_item[key] = value
    return formatted_item


# Collect the test_hard split of every benchmark, in dataset_map order.
dataset_data = [
    _format_hard_item(item)
    for dataset_name in dataset_map.values()
    for item in datasets[dataset_name]['test_hard']
]

# Save to a JSONL file: one JSON object per line.
with open(output_path, "w", encoding="utf-8") as f:
    for item in dataset_data:
        f.write(json.dumps(item) + "\n")

print(f"Saved {len(dataset_data)} items to {output_path}")

Saved 894 items to full_hard_dataset.jsonl
