# HellaSwag

HellaSwag is a common-sense inference task.

In [None]:
from zeno_client import ZenoClient, ZenoMetric
import datasets
import numpy as np
import os

# Read the Zeno API key from the environment up front, so a missing
# ZENO_API_KEY fails here with a KeyError rather than later in the notebook.
API_KEY = os.environ["ZENO_API_KEY"]

Feel free to change the list of models used.
You can go to the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) to check what models are available.
Some of them might not have associated data; you can check this by clicking on the little icon next to the model name.
If you get a 404 after clicking, we won't be able to fetch the model data and this notebook will crash.

In [None]:
# Model identifiers in "organisation/name" form. The first entry is also used
# later in the notebook to build the shared dataset.
models = [
    "meta-llama/Llama-2-70b-hf",
    "mistralai/Mistral-7B-v0.1",
    "tiiuae/falcon-40b",
    "Riiid/sheep-duck-llama-2-70b-v1.1",
    "AIDC-ai-business/Marcoroni-70B-v1",
    "ICBU-NPU/FashionGPT-70B-V1.1",
    "adonlee/LLaMA_2_70B_LoRA",
    "uni-tianyan/Uni-TianYan",
]

In [None]:
def get_data(model: str):
    """Load the ``harness_hellaswag_10`` details published for ``model``.

    The repository id is ``open-llm-leaderboard/details_<model>``, with every
    "/" in ``model`` replaced by "__".

    Args:
        model: Model identifier, e.g. ``"org/name"``.

    Returns:
        Whatever ``datasets.load_dataset`` returns for that repository and
        configuration; the rest of the notebook reads its ``"latest"`` entry.
    """
    repo_id = f"open-llm-leaderboard/details_{model.replace('/', '__')}"
    return datasets.load_dataset(repo_id, "harness_hellaswag_10")

In [None]:
# Letter names for the answer options; labels[i] names the i-th choice.
labels = ["A", "B", "C", "D"]


def _format_prompt(example, choices):
    """Render one context plus its lettered options as a single display string."""
    options = "\n".join(
        f"{labels[i]}: {choice}" for i, choice in enumerate(choices)
    )
    return "\n" + example + "\n\nOptions:\n" + options


def generate_dataset(df):
    """Build the model-independent dataset table from an evaluation details frame.

    Args:
        df: DataFrame with at least the columns ``example`` (context text),
            ``choices`` (sequence of candidate endings) and ``gold`` (integer
            index of the correct ending). It is not modified.

    Returns:
        A new DataFrame with the columns ``data`` (context followed by the
        lettered options), ``label`` (letter of the gold ending) and ``id``
        (a copy of the frame's index).

    Raises:
        IndexError: if a row has more choices than there are ``labels``, or a
            ``gold`` index outside ``labels``.
    """
    # Work on an explicit copy: assigning columns to the result of df[[...]]
    # triggers pandas' SettingWithCopyWarning.
    df_lim = df[["example", "choices", "gold"]].copy()

    # Plain comprehensions instead of row-wise apply(): apply(axis=1) returns
    # a DataFrame (not a Series) for an empty frame, which breaks assignment.
    df_lim["data"] = [
        _format_prompt(example, choices)
        for example, choices in zip(df_lim["example"], df_lim["choices"])
    ]
    df_lim["label"] = [labels[gold] for gold in df_lim["gold"]]

    df_lim = df_lim.drop(columns=["example", "choices", "gold"])
    df_lim["id"] = df_lim.index
    return df_lim

def _format_scores(scores):
    """Join scores, each rounded to two decimals, into a comma-separated string."""
    return ", ".join(str(round(score, 2)) for score in scores)


def generate_system(df):
    """Build the per-model output table from an evaluation details frame.

    Args:
        df: DataFrame with at least the columns ``predictions`` (one score per
            choice), ``choices`` (the candidate endings; ``len()`` of each is
            used as its normaliser), ``acc_norm`` and ``acc``. It is not
            modified.

    Returns:
        A new DataFrame with the columns:
        ``predictions``: display text made of the normalised answer letter
        followed by the raw and length-normalised scores;
        ``answer_raw``: letter of the highest raw score;
        ``answer_norm``: letter of the highest length-normalised score;
        ``correct``: ``acc_norm > 0``;
        ``correct_raw``: ``acc > 0``;
        ``id``: a copy of the frame's index.
    """
    # Work on an explicit copy: assigning columns to the result of df[[...]]
    # triggers pandas' SettingWithCopyWarning.
    df_system = df[["predictions", "acc_norm", "choices", "acc"]].copy()

    answers_raw, answers_norm, outputs = [], [], []
    for raw_scores, choices in zip(df_system["predictions"], df_system["choices"]):
        raw_scores = np.asarray(raw_scores)
        # Divide each score by the length of its choice; computed once per
        # row and reused for both the answer and the display text.
        norm_scores = raw_scores / np.array([float(len(choice)) for choice in choices])
        answer_norm = labels[np.argmax(norm_scores)]

        answers_raw.append(labels[np.argmax(raw_scores)])
        answers_norm.append(answer_norm)
        outputs.append(
            answer_norm
            + "\n\n"
            + "Raw Pred.: "
            + _format_scores(raw_scores)
            + "\nNorm Pred.: "
            + _format_scores(norm_scores)
        )

    df_system["answer_raw"] = answers_raw
    df_system["answer_norm"] = answers_norm
    # Overwrites the numeric scores with the human-readable display string.
    df_system["predictions"] = outputs
    df_system["correct"] = df_system["acc_norm"] > 0
    df_system["correct_raw"] = df_system["acc"] > 0

    df_system = df_system.drop(columns=["acc_norm", "choices", "acc"])
    df_system["id"] = df_system.index
    return df_system

Make sure your Zeno API key is set in the `ZENO_API_KEY` environment variable.

In [None]:
# Create the Zeno client from API_KEY (read from the environment earlier in
# the notebook); it is used below to create the project.
client = ZenoClient(API_KEY)

Let's create a project to hold the data for the HellaSwag task.

In [None]:
# Create the Zeno project; the dataset and every model's outputs are uploaded
# to `proj` in the cells below.
proj = client.create_project(
    name="HellaSwag", 
    view="text-classification", 
    description="HellaSwag (https://arxiv.org/abs/1905.07830) task in the Open-LLM-Leaderboard.",
    metrics=[
        # "correct" is the boolean column generate_system derives from
        # acc_norm > 0; presumably Zeno averages it to report accuracy.
        ZenoMetric(name="accuracy", type="mean", columns=["correct"])
    ]
)

Let us now upload the data to the project we just created.

In [None]:
# The dataset is model-independent, so the first model's details are used to
# build it. Its row count is kept as the reference for the model uploads.
df = generate_dataset(get_data(models[0])["latest"].to_pandas())
num_rows = len(df)
print("\nYour dataset has {} rows\n".format(num_rows))
proj.upload_dataset(
    df,
    id_column="id",
    label_column="label",
    data_column="data",
)

Finally, let us upload all the model outputs for the models we specified above.

In [None]:
# Upload one system per model. Models whose details do not have the same
# number of rows as the uploaded dataset are reported and skipped.
for model in models:
    dataset = get_data(model)["latest"].to_pandas()
    if len(dataset) == num_rows:
        df_system = generate_system(dataset)
        proj.upload_system(
            df_system,
            name=model.replace('/', "__"),
            output_column="predictions",
            id_column="id",
        )
    else:
        print("Skipping {} because it has {} rows instead of {}".format(model, len(dataset), num_rows))