# MMLU

MMLU is a question answering task where each question has four potential answers, one of which is correct. Questions come from 57 categories, including elementary mathematics, US history, computer science, law, and more.

In [None]:
from zeno_client import ZenoClient, ZenoMetric
import datasets
import pandas as pd
import numpy as np
import os

# Zeno API key, read from the ZENO_API_KEY environment variable.
# Raises KeyError right here if the variable is not set.
API_KEY = os.environ["ZENO_API_KEY"]

Feel free to change the list of models used.
You can go to the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) to check what models are available.
Some of them might not have associated data; you can check this by clicking on the little icon next to the model name.
If you get a 404 after clicking, we won't be able to fetch the model data and this notebook will crash.

In [None]:
# Hugging Face model ids whose leaderboard results will be uploaded.
# The first entry is also used to build the shared dataset.
models = [
    "meta-llama/Llama-2-70b-hf",
    "mistralai/Mistral-7B-v0.1",
    "tiiuae/falcon-40b",
    "Riiid/sheep-duck-llama-2-70b-v1.1",
    "AIDC-ai-business/Marcoroni-70B-v1",
    "ICBU-NPU/FashionGPT-70B-V1.1",
    "adonlee/LLaMA_2_70B_LoRA",
    "uni-tianyan/Uni-TianYan",
]

In [None]:
# All 57 tasks in the MMLU dataset.
# Every task name is the harness prefix "hendrycksTest-" followed by the
# subject name, so the list is built from the subjects.
tasks = [
    "hendrycksTest-" + subject
    for subject in (
        "abstract_algebra",
        "anatomy",
        "astronomy",
        "business_ethics",
        "clinical_knowledge",
        "college_biology",
        "college_chemistry",
        "college_computer_science",
        "college_mathematics",
        "college_medicine",
        "college_physics",
        "computer_security",
        "conceptual_physics",
        "econometrics",
        "electrical_engineering",
        "elementary_mathematics",
        "formal_logic",
        "global_facts",
        "high_school_biology",
        "high_school_chemistry",
        "high_school_computer_science",
        "high_school_european_history",
        "high_school_geography",
        "high_school_government_and_politics",
        "high_school_macroeconomics",
        "high_school_mathematics",
        "high_school_microeconomics",
        "high_school_physics",
        "high_school_psychology",
        "high_school_statistics",
        "high_school_us_history",
        "high_school_world_history",
        "human_aging",
        "human_sexuality",
        "international_law",
        "jurisprudence",
        "logical_fallacies",
        "machine_learning",
        "management",
        "marketing",
        "medical_genetics",
        "miscellaneous",
        "moral_disputes",
        "moral_scenarios",
        "nutrition",
        "philosophy",
        "prehistory",
        "professional_accounting",
        "professional_law",
        "professional_medicine",
        "professional_psychology",
        "public_relations",
        "security_studies",
        "sociology",
        "us_foreign_policy",
        "virology",
        "world_religions",
    )
]

In [None]:
def get_data_for_task(model: str, task: str):
    """Load the Open LLM Leaderboard detail dataset for one model and task.

    Args:
        model: Hugging Face model id, e.g. ``"org/name"``; the slash is
            replaced by ``__`` to form the details repository name.
        task: Task name, e.g. ``"hendrycksTest-anatomy"``; dashes are
            replaced by underscores to form the configuration name.

    Returns:
        Whatever ``datasets.load_dataset`` returns for that repository and
        configuration.
    """
    repo_id = "open-llm-leaderboard/" + "details_" + model.replace("/", "__")
    # NOTE(review): the trailing "_5" presumably selects the 5-shot run — confirm.
    config_name = "harness_{}_5".format(task.replace("-", "_"))
    return datasets.load_dataset(repo_id, config_name)

In [None]:
def get_data(model: str):
    """Fetch the results of every task in ``tasks`` for one model.

    Args:
        model: Hugging Face model id passed on to ``get_data_for_task``.

    Returns:
        A single DataFrame holding the ``'latest'`` split of every task,
        stacked in ``tasks`` order with a fresh 0..n-1 index and an extra
        ``task`` column naming the task each row came from.
    """
    per_task_frames = [
        get_data_for_task(model, task)["latest"].to_pandas().assign(task=task)
        for task in tasks
    ]
    return pd.concat(per_task_frames, ignore_index=True)

In [None]:
# Answer letters, indexed by the integer stored in the "gold" column.
labels = ["A", "B", "C", "D"]


def generate_dataset(df):
    """Build the dataset table (task, data, label, id) from raw result rows.

    Args:
        df: DataFrame with at least the columns ``example`` (prompt text),
            ``gold`` (integer index into ``labels``) and ``task``.

    Returns:
        A new DataFrame with the columns ``task``, ``data``, ``label`` and
        ``id``, sharing ``df``'s index. ``data`` is ``example`` with its last
        line removed, ``label`` is the answer letter for ``gold`` and ``id``
        is the row's index value. ``df`` itself is left untouched.
    """

    def strip_last_line(text):
        # Drop the final line of the prompt. A prompt without any newline is
        # kept whole instead of losing its last character (rfind() == -1).
        head, newline, _ = text.rpartition("\n")
        return head if newline else text

    # Build a fresh frame rather than assigning into a slice of ``df``; this
    # avoids pandas' SettingWithCopyWarning and also works for an empty frame.
    df_lim = pd.DataFrame({"task": df["task"]})
    df_lim["data"] = df["example"].map(strip_last_line)
    df_lim["label"] = df["gold"].map(lambda gold: labels[gold])
    df_lim["id"] = df_lim.index
    return df_lim

def generate_system(df):
    """Build the per-model output table (predictions, correct, id).

    Args:
        df: DataFrame with at least the columns ``predictions`` (a sequence of
            numeric scores per row — presumably one score per answer choice,
            verify against the source data) and ``acc``.

    Returns:
        A new DataFrame with the columns ``predictions``, ``correct`` and
        ``id``, sharing ``df``'s index. ``predictions`` is a display string:
        the letter of the highest-scoring entry, a blank line, then all
        scores rounded to two decimals. ``correct`` is True where ``acc`` is
        greater than zero. ``id`` is the row's index value. ``df`` itself is
        left untouched.
    """

    def format_prediction(scores):
        rounded = ", ".join(str(round(score, 2)) for score in scores)
        return labels[np.argmax(scores)] + "\n\n" + "Pred.: " + rounded

    # Build a fresh frame rather than assigning into a slice of ``df``; this
    # avoids pandas' SettingWithCopyWarning and also works for an empty frame.
    df_system = pd.DataFrame(
        {"predictions": df["predictions"].map(format_prediction)}
    )
    df_system["correct"] = df["acc"] > 0
    df_system["id"] = df_system.index
    return df_system

Make sure your Zeno API key is available in the `ZENO_API_KEY` environment variable.

In [None]:
# Zeno client authenticated with the API key read from the environment.
client = ZenoClient(API_KEY)

Let's create a project to hold the data for the MMLU task.

In [None]:
# Accuracy is the mean of the boolean "correct" column uploaded with each system.
accuracy_metric = ZenoMetric(name="accuracy", type="mean", columns=["correct"])

proj = client.create_project(
    name="MMLU",
    view="text-classification",
    description="MMLU (https://arxiv.org/abs/2009.03300) tasks in the Open-LLM-Leaderboard.",
    metrics=[accuracy_metric],
)

Let us now upload the data to the project we just created.

In [None]:
# The first model's rows define the dataset. num_rows is kept so that models
# with a different number of rows can be skipped when uploading systems.
df = generate_dataset(get_data(models[0]))
num_rows = len(df)
print("\nYour dataset has {} rows\n".format(num_rows))
proj.upload_dataset(
    df,
    id_column="id",
    label_column="label",
    data_column="data",
)

Finally, let us upload all the model outputs for the models we specified above.

In [None]:
# Upload each model's outputs as its own system, named after the model id
# with "/" replaced by "__".
for model in models:
    dataset = get_data(model)
    if len(dataset) == num_rows:
        df_system = generate_system(dataset)
        proj.upload_system(
            df_system,
            name=model.replace('/', "__"),
            output_column="predictions",
            id_column="id",
        )
    else:
        # A different row count means the ids would not match the uploaded dataset.
        print("Skipping {} because it has {} rows instead of {}".format(model, len(dataset), num_rows))