#### MedMCQA

In [1]:
import os
import json

from datasets import load_dataset

# Load the MedMCQA dataset; `ds` is indexed by split name in the cells below.
medmcqa_repo = "openlifescienceai/medmcqa"
ds = load_dataset(medmcqa_repo)

# Print the available splits with their columns and row counts.
print(ds)

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 182822/182822 [00:00<00:00, 703302.16 examples/s]
Generating test split: 100%|██████████| 6150/6150 [00:00<00:00, 1042768.71 examples/s]
Generating validation split: 100%|██████████| 4183/4183 [00:00<00:00, 639037.47 examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 182822
    })
    test: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 6150
    })
    validation: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 4183
    })
})





In [2]:
# Convert the MedMCQA validation split into one JSON record per question and
# write the records to a JSON Lines file.
data = ds['validation']

# 'cop' holds the correct option as a 0-based index; translate it to a letter.
option_letters = {0: "A", 1: "B", 2: "C", 3: "D"}

final_data = []
for item in data:
    # Question stem, then the four lettered options on a second line.
    question = (
        item['question']
        + "\nAnswer Choices: (A) " + item['opa']
        + " (B) " + item['opb']
        + " (C) " + item['opc']
        + " (D) " + item['opd']
    )

    cop = item['cop']
    if cop not in option_letters:
        # Anything outside 0-3 is rejected rather than written with a wrong letter.
        raise ValueError("Invalid label")
    # The label is stored as a single-element list.
    label = [option_letters[cop]]

    # Read item['id'] directly instead of binding it to a variable called `id`,
    # which would shadow the builtin for the rest of the kernel session.
    final_data.append({
        "id": item['id'],
        "question": question,
        "label": label,
        "label_rationale": item['exp'],
        "subject_name": item['subject_name'],
        "topic_name": item['topic_name'],
    })

output_path = "medmcqa/input/medmcqa_input.jsonl"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# ensure_ascii=False writes non-ASCII characters verbatim, so pin the file
# encoding to UTF-8 instead of relying on the platform default.
with open(output_path, 'w', encoding='utf-8') as f:
    for record in final_data:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

#### MedQA

In [3]:
# Load the 4-option MedQA (USMLE) dataset; this rebinds `ds` for the MedQA cells.
medqa_repo = "GBaker/MedQA-USMLE-4-options-hf"
ds = load_dataset(medqa_repo)

# Print the available splits with their columns and row counts.
print(ds)

Generating train split: 100%|██████████| 10178/10178 [00:00<00:00, 400755.01 examples/s]
Generating validation split: 100%|██████████| 1272/1272 [00:00<00:00, 383630.88 examples/s]
Generating test split: 100%|██████████| 1273/1273 [00:00<00:00, 406838.54 examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'sent1', 'sent2', 'ending0', 'ending1', 'ending2', 'ending3', 'label'],
        num_rows: 10178
    })
    validation: Dataset({
        features: ['id', 'sent1', 'sent2', 'ending0', 'ending1', 'ending2', 'ending3', 'label'],
        num_rows: 1272
    })
    test: Dataset({
        features: ['id', 'sent1', 'sent2', 'ending0', 'ending1', 'ending2', 'ending3', 'label'],
        num_rows: 1273
    })
})





In [5]:
# Convert the MedQA test split into one JSON record per question and write the
# records to a JSON Lines file.
data = ds['test']

# 'label' holds the correct ending as a 0-based index; translate it to a letter.
option_letters = {0: "A", 1: "B", 2: "C", 3: "D"}

final_data = []
for item in data:
    # Question text ('sent1'), then the four lettered endings on a second line.
    question = (
        item['sent1']
        + "\nAnswer Choices: (A) " + item['ending0']
        + " (B) " + item['ending1']
        + " (C) " + item['ending2']
        + " (D) " + item['ending3']
    )

    answer_index = item['label']
    if answer_index not in option_letters:
        # Anything outside 0-3 is rejected rather than written with a wrong letter.
        raise ValueError("Invalid label")
    # The label is stored as a single-element list.
    label = [option_letters[answer_index]]

    # Read item['id'] directly instead of binding it to a variable called `id`,
    # which would shadow the builtin for the rest of the kernel session.
    final_data.append({"id": item['id'], "question": question, "label": label})

output_path = "medqa/input/medqa_input.jsonl"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# ensure_ascii=False writes non-ASCII characters verbatim, so pin the file
# encoding to UTF-8 instead of relying on the platform default.
with open(output_path, 'w', encoding='utf-8') as f:
    for record in final_data:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

#### MMLU Medical

In [6]:
# MMLU configurations that make up the medical subset.
subject_list = [
    "anatomy",
    "clinical_knowledge",
    "college_biology",
    "college_medicine",
    "medical_genetics",
    "professional_medicine",
]

# One loaded dataset per subject, in the same order as subject_list.
data = []
for subject in subject_list:
    subject_dataset = load_dataset("cais/mmlu", subject)
    data.append(subject_dataset)

Generating test split: 100%|██████████| 135/135 [00:00<00:00, 39688.16 examples/s]
Generating validation split: 100%|██████████| 14/14 [00:00<00:00, 7168.00 examples/s]
Generating dev split: 100%|██████████| 5/5 [00:00<00:00, 2301.53 examples/s]
Generating test split: 100%|██████████| 265/265 [00:00<00:00, 78556.12 examples/s]
Generating validation split: 100%|██████████| 29/29 [00:00<00:00, 13677.59 examples/s]
Generating dev split: 100%|██████████| 5/5 [00:00<00:00, 3824.83 examples/s]
Generating test split: 100%|██████████| 144/144 [00:00<00:00, 69937.45 examples/s]
Generating validation split: 100%|██████████| 16/16 [00:00<00:00, 12252.85 examples/s]
Generating dev split: 100%|██████████| 5/5 [00:00<00:00, 3402.26 examples/s]
Generating test split: 100%|██████████| 173/173 [00:00<00:00, 88999.70 examples/s]
Generating validation split: 100%|██████████| 22/22 [00:00<00:00, 19903.94 examples/s]
Generating dev split: 100%|██████████| 5/5 [00:00<00:00, 4115.29 examples/s]
Generating te

In [7]:
# Merge the test split of every loaded MMLU subject into one list of JSON
# records and write the records to a JSON Lines file.

# 'answer' holds the correct choice as a 0-based index; translate it to a letter.
option_letters = {0: "A", 1: "B", 2: "C", 3: "D"}

final_data = []
# Use a dedicated loop variable: iterating as `ds` would overwrite the dataset
# bound by the earlier cells and break them if they are re-run afterwards.
for subject_ds in data:
    samples = subject_ds['test']
    for item in samples:
        choices = item['choices']
        # The prompt template below is hard-wired to exactly four options.
        assert len(choices) == 4
        question = (
            item['question']
            + "\nAnswer Choices: (A) " + choices[0]
            + " (B) " + choices[1]
            + " (C) " + choices[2]
            + " (D) " + choices[3]
        )

        answer_index = item['answer']
        if answer_index not in option_letters:
            # Anything outside 0-3 is rejected rather than written with a wrong letter.
            raise ValueError("Invalid label")
        # The label is stored as a single-element list.
        label = [option_letters[answer_index]]

        final_data.append({"question": question, "label": label, "subject": item['subject']})

output_path = "mmlu_medical/input/mmlu_medical_input.jsonl"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# ensure_ascii=False writes non-ASCII characters verbatim, so pin the file
# encoding to UTF-8 instead of relying on the platform default.
with open(output_path, 'w', encoding='utf-8') as f:
    for record in final_data:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")