In [1]:
from datasets import load_dataset, Features, Value
import random
import re

# Letter labels (A-Z) assigned to answer choices by their position in the list.
LETTER_INDICES = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")

# Seed the global RNG up front: choice shuffling during preprocessing relies on the
# `random` module, and without a fixed seed a fresh "Restart & Run All" would produce
# a different dataset each time.
SEED = 42
random.seed(SEED)

# Multiple-choice source corpora, fetched by repository id.
# The numbering skips ds7 on purpose: that dataset was judged unusable and stays disabled.
ds1 = load_dataset(path="openlifescienceai/medmcqa")
ds2 = load_dataset(path="Sangeetha/Kaggle-LLM-Science-Exam")
ds3 = load_dataset(path="allenai/sciq")
ds4 = load_dataset(path="allenai/openbookqa", name="additional")
ds5 = load_dataset(path="deepmind/aqua_rat", name="raw")
ds6 = load_dataset(path="mvujas/stem_mcqa_questions")
# ds7 = load_dataset("shredder-31/MCQ_Question_DataSets")  # Bad Dataset
ds8 = load_dataset(path="allenai/qasc")
ds9 = load_dataset(path="ibragim-bad/arc_easy")

# Explicit schema for the AM corpus: every record holds a list of chat messages, each
# with a role, a content string and a nested `info` struct made of string fields only.
am_features = Features({
    "messages": [{
        "role": Value("string"),
        "content": Value("string"),
        "info": {
            field: Value("string")
            for field in (
                "source",
                "reference_answer",
                "test_case",
                "think_content",
                "answer_content",
            )
        },
    }]
})
am = load_dataset(
    path='a-m-team/AM-DeepSeek-R1-Distilled-1.4M',
    name='am_0.5M',
    features=am_features,
)

am_0.5M.jsonl.zst:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/28 [00:00<?, ?it/s]

In [2]:
def preprocess_medmcqa(ex):
    """Convert a MedMCQA record to the unified MCQ schema.

    Args:
        ex: Record with "question", the four option texts "opa".."opd", the integer
            index "cop" of the correct option, and an explanation "exp" that may be None.

    Returns:
        dict with keys dataset, question, choices, question_type, answer, explanation.
        "answer" is the letter matching "cop", or None when "cop" does not index one
        of the four options. "explanation" is "" when "exp" is None.
    """
    choices = [ex["opa"], ex["opb"], ex["opc"], ex["opd"]]  # without the letters
    cop = ex["cop"]
    # Guard against out-of-range labels: a negative value such as -1 (presumably used
    # for unlabeled rows - TODO confirm) would otherwise index LETTER_INDICES from the
    # end and silently produce "Z".
    is_labeled = isinstance(cop, int) and 0 <= cop < len(choices)
    return {
        "dataset": "medmcqa",
        "question": ex["question"],
        "choices": choices,
        "question_type": "mcq",
        "answer": LETTER_INDICES[cop] if is_labeled else None,  # only letter
        "explanation": ex["exp"] if ex["exp"] is not None else ""
    }

def preprocess_kagglellmscienceexam(ex):
    """Convert a Kaggle LLM Science Exam record to the unified MCQ schema.

    The five option texts are read from the columns "A".."E" and the answer letter is
    taken as-is from "answer". This source provides no explanation, so it is left empty.
    """
    record = {
        "dataset": "kaggle_llm_science_exam",
        "question": ex["prompt"],
    }
    # Option texts only, without their letters.
    record["choices"] = [ex[column] for column in ("A", "B", "C", "D", "E")]
    record["question_type"] = "mcq"
    record["answer"] = ex["answer"]  # only letter
    record["explanation"] = ""
    return record

def shuffle_choices(choices, correct_index=3):
    """Randomly permute `choices` and track where the correct one ends up.

    Args:
        choices: Sequence of answer options.
        correct_index: Position of the correct option in `choices` (default 3).

    Returns:
        (shuffled, new_correct_index): the permuted list and the position of the
        originally-correct option inside it.

    Raises:
        ValueError: if `correct_index` is not a valid position in `choices`.
    """
    # order[j] is the original position of the option placed at slot j.
    order = list(range(len(choices)))
    random.shuffle(order)
    permuted = [choices[source] for source in order]
    return permuted, order.index(correct_index)

def preprocess_sciq(ex):
    """Convert a SciQ record to the unified MCQ schema.

    SciQ stores three distractors and the correct answer in separate columns, so the
    options are shuffled to avoid the correct answer always landing on the same letter.

    Args:
        ex: Record with "question", "distractor1".."distractor3", "correct_answer"
            and "support".

    Returns:
        dict with keys dataset, question, choices, question_type, answer, explanation.
        "choices" is the shuffled option list and "answer" is the letter of the
        correct option within that shuffled list.
    """
    # The correct answer is placed last before shuffling.
    choices = [ex["distractor1"], ex["distractor2"], ex["distractor3"], ex["correct_answer"]]
    shuffled_choices, correct_idx = shuffle_choices(choices, correct_index=len(choices) - 1)
    return {
        "dataset": "sciq",
        "question": ex["question"],
        # Must be the shuffled list: the answer letter below indexes into it.
        "choices": shuffled_choices,
        "question_type": "mcq",
        "answer": LETTER_INDICES[correct_idx],
        "explanation": ex['support']
    }

def preprocess_openbookqa(ex):
    """Convert an OpenBookQA record to the unified MCQ schema.

    The question comes from "question_stem", the option texts from
    ex["choices"]["text"], the answer from "answerKey", and "fact1" is used as
    the explanation.
    """
    record = dict(dataset="openbookqa")
    record["question"] = ex["question_stem"]
    record["choices"] = ex["choices"]["text"]
    record["question_type"] = "mcq"
    record["answer"] = ex["answerKey"]
    record["explanation"] = ex['fact1']
    return record

def preprocess_aquarat(ex):
    """Convert an AQuA-RAT record to the unified MCQ schema.

    Every entry of ex["options"] has its first two characters dropped (the "A)",
    "B)", ... prefix); "correct" is used as the answer and "rationale" as the
    explanation.
    """
    stripped_options = []
    for option in ex["options"]:
        # Drop the leading "A)" / "B)" / "C)" / "D)" marker.
        stripped_options.append(option[2:])
    return {
        "dataset": "aqua_rat",
        "question": ex["question"],
        "choices": stripped_options,
        "question_type": "mcq",
        "answer": ex["correct"],
        "explanation": ex['rationale'],
    }

def extract_answer_choices(prompt):
    """Extract the texts of the options marked "A." .. "D." from a prompt.

    Args:
        prompt: Full prompt text. When it contains the keyword "Options:", only the
            text after its first occurrence is scanned, so that the question stem
            cannot contribute spurious options; otherwise the whole prompt is scanned.

    Returns:
        List of option texts, in order of appearance, stripped of surrounding
        whitespace. Empty when no marker is found.
    """
    _, keyword, options_part = prompt.partition("Options:")
    text = options_part if keyword else prompt
    # A marker is a letter A-D followed by "." and whitespace, and it must start a
    # token: the (?<!\S) lookbehind stops "A." inside a word such as "DNA." from being
    # read as the start (or the end) of an option.
    pattern = r"(?<!\S)[A-D]\.\s+(.*?)\s*(?=(?<!\S)[A-D]\.\s|$)"
    matches = re.findall(pattern, text, re.DOTALL)
    return [match.strip() for match in matches]

def extract_question(prompt):
    """Return the question stem: everything before the first "Options:", stripped.

    If the keyword is absent, the whole prompt (stripped) is returned.
    """
    # Everything left of the first "Options:" keyword is the question itself.
    stem, _, _ = prompt.partition("Options:")
    return stem.strip()

def preprocess_stemmcqaquestions(ex):
    """Convert a stem_mcqa_questions record to the unified MCQ schema.

    The "question" column holds both the question stem and its options, so it is
    split with extract_question / extract_answer_choices. "answer" and "explanation"
    are copied through unchanged.
    """
    raw_prompt = ex["question"]
    stem = extract_question(raw_prompt)
    options = extract_answer_choices(raw_prompt)
    return {
        "dataset": "stem_mcqa_questions",
        "question": stem,
        "choices": options,
        "question_type": "mcq",
        "answer": ex["answer"],
        "explanation": ex['explanation'],
    }

# No longer used
# def preprocess_contextmcq(ex):
#     question = ex["Question"]
    
#     prompt = f"The following are multiple choice questions about {topic}.\n\n"
#     prompt = ex["Context"] + "\n"
#     prompt += question + "\n"
#     prompt += ex["Choices"] + "\n"
#     prompt += "Answer:"

#     return {
#         "dataset": "contextmcq",
#         "prompt": prompt,
#         "completion": f" {ex["Answer"]}."
#     }

def preprocess_qasc(ex):
    """Convert a QASC record to the unified MCQ schema.

    The explanation is built from "fact1", "fact2" and "combinedfact", one per line.
    """
    supporting_facts = "{}\n{}\n{}".format(ex['fact1'], ex['fact2'], ex['combinedfact'])
    record = {
        "dataset": "qasc",
        "question": ex["question"],
        "choices": ex["choices"]["text"],
        "question_type": "mcq",
        "answer": ex["answerKey"],
    }
    record["explanation"] = supporting_facts
    return record

def preprocess_arceasy(ex):
    """Convert an ARC-Easy record to the unified MCQ schema.

    Prompts built from this schema label options by position (A, B, C, ...), so the
    answer must be the positional letter of the correct option rather than whatever
    label the source uses.

    Args:
        ex: Record with "question", "choices" (a mapping holding "text" and, if
            available, "label") and "answerKey".

    Returns:
        dict with keys dataset, question, choices, question_type, answer, explanation.
        "answer" is the positional letter of the option whose label equals
        "answerKey"; when no labels are available or the key is not among them,
        "answerKey" is passed through unchanged.
    """
    choices = ex["choices"]
    answer = ex["answerKey"]
    labels = choices.get("label") if hasattr(choices, "get") else None
    # If the source labels its options with something other than positional letters
    # (e.g. "1".."4" - presumably present in ARC; TODO confirm on this mirror), the raw
    # key would not match the letters used in the prompt. Letter-labelled rows are
    # unaffected because label "B" at position 1 maps back to "B".
    if labels and answer in labels:
        answer = LETTER_INDICES[list(labels).index(answer)]
    return {
        "dataset": "arc_easy",
        "question": ex["question"],
        "choices": choices["text"],
        "question_type": "mcq",
        "answer": answer,
        "explanation": ""
    }


In [3]:
def get_question(line):
    """Return the content of the first message in line["messages"] whose role is "user".

    Raises:
        Exception: if no message has the role "user".
    """
    user_messages = (msg for msg in line["messages"] if msg["role"] == "user")
    first_user = next(user_messages, None)
    if first_user is None:
        raise Exception("No question found")
    return first_user["content"]

def get_question_source(line):
    """Return info["source"] of the first message in line["messages"] whose role is "user".

    Raises:
        Exception: if no message has the role "user".
    """
    user_messages = (msg for msg in line["messages"] if msg["role"] == "user")
    first_user = next(user_messages, None)
    if first_user is None:
        raise Exception("No source found")
    return first_user["info"]["source"]

def get_answer(line):
    """Return info["answer_content"] of the first assistant message where it is not None.

    Raises:
        Exception: if no assistant message carries a non-None answer_content.
    """
    not_found = object()
    answers = (
        msg["info"]["answer_content"]
        for msg in line["messages"]
        if msg["role"] == "assistant" and msg["info"]["answer_content"] is not None
    )
    answer = next(answers, not_found)
    if answer is not_found:
        raise Exception("No answer found")
    return answer

# After manual inspection, this is not perfect and some open questions are considered mcq by this function.
# Results must be taken with a grain of salt and filtering based on that might contain open questions considered as mcqs

# Compiled once at definition time instead of on every call.
# Lettered / numbered / roman-numeral option lines and "Option X:" lines.
_OPTION_MARKER = re.compile(r"""
    ^\s*
    (?:                             # one of:
      [\(\[]?[A-Za-z0-9IVXivx]{1,3}[\)\]\.\:]  # A. (A) A) A: 1. (ii)
      |
      Option\s+[A-Za-z0-9IVXivx]\:            # Option A:
    )
    \s+
    """, re.VERBOSE)
# Numeric-only list items: "1)" or "1."
_NUMERIC_MARKER = re.compile(r"^\s*\d+[\)\.]\s+")
# Bullet list items starting with "-", "*" or "•"
_BULLET_MARKER = re.compile(r"^\s*[-\*\u2022]\s+")
# Phrases that mark a question as multiple choice on their own (case-insensitive).
_MCQ_TRIGGERS = [
    re.compile(phrase, re.IGNORECASE)
    for phrase in (
        r"Which of the following",
        r"Choose (?:the )?correct",
        r"Select (?:the )?best answer",
        r"(?:All|None) of the above",
        r"True\s+or\s+False",
        r"Yes\s+or\s+No",
        r"Options?:",  # e.g. "Options: A. …"
        r"Pick\s+one",
        r"One of these",
    )
]


def is_mcq(text) -> bool:
    """
    Heuristic check for whether `text` is a multiple-choice question.
    Very inclusive: detects letters, digits, roman numerals, bullets, "Option X:",
    True/False, Yes/No, and trigger phrases.

    Returns True when any of the following holds:
      * a trigger phrase occurs anywhere in the text;
      * the text contains "?" and has at least 2 option-style lines, at least
        2 numbered lines, or at least 3 bullet lines;
      * at least 3 distinct lines start with any option/number/bullet marker.
    """
    # Trigger phrases decide on their own, so check them before scanning lines.
    if any(trigger.search(text) for trigger in _MCQ_TRIGGERS):
        return True

    option_count = numeric_count = bullet_count = marked_count = 0
    for line in text.splitlines():
        is_option = _OPTION_MARKER.match(line) is not None
        is_numeric = _NUMERIC_MARKER.match(line) is not None
        is_bullet = _BULLET_MARKER.match(line) is not None
        option_count += is_option
        numeric_count += is_numeric
        bullet_count += is_bullet
        # A numbered line usually matches both the option and the numeric marker;
        # count each line once so that two numbered lines do not pass for a cluster
        # of three or more.
        marked_count += is_option or is_numeric or is_bullet

    # Basic question-plus-options logic: a question mark plus enough option-style lines.
    if "?" in text and (option_count >= 2 or numeric_count >= 2 or bullet_count >= 3):
        return True

    # Pure option cluster: at least 3 lines carrying any option-style marker.
    return marked_count >= 3

def preprocess_am(ex):
    """Convert an AM chat record to the unified schema.

    The first user message is the question and the assistant's answer content is
    stored as the explanation. No separate choices or answer letter exist, so both
    are None; the question type is "open_mcq" when is_mcq flags the question text
    and "open" otherwise.
    """
    question_text = get_question(ex)
    source_tag = f"am0.5:{get_question_source(ex)}"
    if is_mcq(question_text):
        kind = "open_mcq"
    else:
        kind = "open"
    return {
        "dataset": source_tag,
        "question": question_text,
        "choices": None,
        "question_type": kind,
        "answer": None,
        "explanation": get_answer(ex),
    }

In [5]:
# MedMCQA -> unified schema, keeping only the shared columns.
ds1_processed = (
    ds1
    .map(preprocess_medmcqa)
    .select_columns(["dataset", "question", "choices", "question_type", "answer", "explanation"])
)

In [7]:
# Kaggle LLM Science Exam -> unified schema, keeping only the shared columns.
ds2_processed = (
    ds2
    .map(preprocess_kagglellmscienceexam)
    .select_columns(["dataset", "question", "choices", "question_type", "answer", "explanation"])
)


In [8]:
# SciQ -> unified schema, keeping only the shared columns.
ds3_processed = (
    ds3
    .map(preprocess_sciq)
    .select_columns(["dataset", "question", "choices", "question_type", "answer", "explanation"])
)

Map:   0%|          | 0/11679 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [9]:
# OpenBookQA -> unified schema, keeping only the shared columns.
ds4_processed = (
    ds4
    .map(preprocess_openbookqa)
    .select_columns(["dataset", "question", "choices", "question_type", "answer", "explanation"])
)

Map:   0%|          | 0/4957 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [10]:
# AQuA-RAT -> unified schema, keeping only the shared columns.
ds5_processed = (
    ds5
    .map(preprocess_aquarat)
    .select_columns(["dataset", "question", "choices", "question_type", "answer", "explanation"])
)

Map:   0%|          | 0/97467 [00:00<?, ? examples/s]

Map:   0%|          | 0/254 [00:00<?, ? examples/s]

Map:   0%|          | 0/254 [00:00<?, ? examples/s]

In [11]:
# stem_mcqa_questions -> unified schema, keeping only the shared columns.
ds6_processed = (
    ds6
    .map(preprocess_stemmcqaquestions)
    .select_columns(["dataset", "question", "choices", "question_type", "answer", "explanation"])
)

Map:   0%|          | 0/2361 [00:00<?, ? examples/s]

In [12]:
# ds7_processed = ds7.map(preprocess_contextmcq).select_columns(["dataset", "prompt", "completion"])

In [13]:
# QASC -> unified schema, keeping only the shared columns.
ds8_processed = (
    ds8
    .map(preprocess_qasc)
    .select_columns(["dataset", "question", "choices", "question_type", "answer", "explanation"])
)

Map:   0%|          | 0/8134 [00:00<?, ? examples/s]

Map:   0%|          | 0/920 [00:00<?, ? examples/s]

Map:   0%|          | 0/926 [00:00<?, ? examples/s]

In [14]:
# ARC-Easy -> unified schema, keeping only the shared columns.
ds9_processed = (
    ds9
    .map(preprocess_arceasy)
    .select_columns(["dataset", "question", "choices", "question_type", "answer", "explanation"])
)

Map:   0%|          | 0/2251 [00:00<?, ? examples/s]

Map:   0%|          | 0/570 [00:00<?, ? examples/s]

Map:   0%|          | 0/2376 [00:00<?, ? examples/s]

In [15]:
# AM chat corpus -> unified schema, keeping only the shared columns.
am_processed = (
    am
    .map(preprocess_am)
    .select_columns(["dataset", "question", "choices", "question_type", "answer", "explanation"])
)

Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

In [16]:
from datasets import concatenate_datasets

# Stack the train split of every processed source, in source order, into one dataset.
# ds7 is not part of the merge (its processing is disabled).
merged_ds = concatenate_datasets([
    processed["train"]
    for processed in (
        ds1_processed,
        ds2_processed,
        ds3_processed,
        ds4_processed,
        ds5_processed,
        ds6_processed,
        ds8_processed,
        ds9_processed,
        am_processed,
    )
])

In [22]:
def get_prompt_completion(line):
    """Render a unified-schema row into a prompt/completion pair.

    Args:
        line: Row with "question_type" ("mcq", "open_mcq" or "open"), "question",
            "choices", "answer" and "explanation".

    Returns:
        dict with:
          * "prompt": a header sentence, the stripped question, the lettered choices
            (only for "mcq"; for "open_mcq" they are already part of the question)
            and a trailing "Answer:".
          * "completion": " <answer>." optionally followed by the explanation for
            "mcq"; otherwise a space followed by the explanation.

    Raises:
        Exception: if an "mcq" row contains a None choice.
    """
    q_type = line["question_type"]
    topic = "knowledge and skills in advanced master-level STEM courses"
    # Computed outside the f-string: nesting the same quote character inside an
    # f-string expression is a SyntaxError before Python 3.12.
    kind = "open" if q_type == "open" else "multiple choice"
    prompt = f"The following are {kind} questions (with answers) about {topic.replace('_', ' ')}.\n\n"
    prompt += line["question"].strip() + "\n"
    # If question_type is open_mcq, the choices are already in the question
    if q_type == "mcq":
        for choice in line["choices"]:
            if choice is None:
                raise Exception(f"Did not expected None for choice: {line}")
        prompt += "".join(f"{key}. {choice.strip()}\n" for key, choice in zip(LETTER_INDICES, line["choices"]))
    prompt += "Answer:"

    # Treat a missing explanation like an empty one instead of failing on None.strip().
    explanation = (line["explanation"] or "").strip()
    if q_type == "mcq":
        completion = f" {line['answer']}."
        if explanation:
            completion += f" {explanation}"
    else:
        completion = f" {explanation}"

    return {
        "prompt": prompt,
        "completion": completion
    }

# Some sources (kaggle_llm_science_exam for instance) contain rows where a choice is None,
# which cannot be formatted into a prompt. Drop those rows first, then render the
# prompt/completion pair for every remaining row. Rows without choices are kept.
merged_ds = (
    merged_ds
    .filter(lambda ex: all(choice is not None for choice in (ex["choices"] or [])))
    .map(get_prompt_completion)
)

Filter:   0%|          | 0/816355 [00:00<?, ? examples/s]

Map:   0%|          | 0/816351 [00:00<?, ? examples/s]

In [23]:
# Display the merged dataset summary (feature names and row count).
merged_ds

Dataset({
    features: ['dataset', 'question', 'choices', 'question_type', 'answer', 'explanation', 'prompt', 'completion'],
    num_rows: 816351
})

In [24]:
from datasets import DatasetDict

# Wrap the merged data as the single "train" split and publish it to the Hub.
# NOTE(review): the recorded output of this cell reports repo_id 'brygotti/unified-0.8M',
# not the repository named here - the output looks stale; re-run to confirm the target.
final_ds = DatasetDict({"train": merged_ds})
final_ds.push_to_hub("brygotti/MNLP_M3_mcqa_dataset")

Uploading the dataset shards:   0%|          | 0/6 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/137 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/137 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/137 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/137 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/137 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/137 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/brygotti/unified-0.8M/commit/b92cb38cff2af91159deee64bb778b45bec145f2', commit_message='Upload dataset', commit_description='', oid='b92cb38cff2af91159deee64bb778b45bec145f2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/brygotti/unified-0.8M', endpoint='https://huggingface.co', repo_type='dataset', repo_id='brygotti/unified-0.8M'), pr_revision=None, pr_num=None)