# WinoGrande Task

An adversarial and difficult Winograd benchmark at scale, for commonsense reasoning.

In [None]:
from zeno_client import ZenoClient, ZenoMetric
import datasets
import json
import os
import dotenv

# Pull settings from a .env file into the process environment before the API
# key is read below. override=True asks python-dotenv to let values from the
# .env file replace variables that are already set in the environment.
dotenv.load_dotenv(override=True)

In [None]:
# Read the Zeno API key from the environment; a missing ZENO_API_KEY raises
# KeyError right here rather than failing later inside the client.
API_KEY = os.environ["ZENO_API_KEY"]
# Client used below to create the Zeno project.
client = ZenoClient(API_KEY)

In [None]:
# Model identifiers to upload. Each entry is appended to
# "open-llm-leaderboard/details_" to form the results dataset name loaded in
# the upload loop, and is also used verbatim as the Zeno system name.
models = ["01-ai__Yi-34B_public"]

In [None]:
# Load the "validation" split of the "winogrande_m" configuration of the
# "winogrande" dataset and convert it to a pandas DataFrame. The cells below
# read its "sentence", "option1", "option2" and "answer" columns.
base_df = datasets.load_dataset("winogrande", "winogrande_m")["validation"].to_pandas()

In [None]:
# Create the Zeno project that the dataset and the model outputs below are
# uploaded to. A single metric named "acc" is registered with type "mean" over
# the "acc" column, which each uploaded system provides per example.
proj = client.create_project(
    name="WinoGrande OpenLLM Leaderboard", 
    view="text-classification", 
    description="WinoGrande task in the Open-LLM-Leaderboard (https://arxiv.org/abs/1907.10641).",
    metrics=[
        ZenoMetric(name="acc", type="mean", columns=["acc"])
    ]
)

In [None]:
# Derive the columns that get uploaded to Zeno from the raw WinoGrande fields.

# Row index doubles as the stable example id used to join system outputs later.
base_df["id"] = base_df.index
# Character count of the sentence, exposed as a metadata column.
base_df["input length"] = base_df["sentence"].str.len()

# Prompt text: the sentence, a blank line, then the two candidate fillers on
# separate lines.
base_df["prompt"] = [
    f"{sentence}\n\n{first}\n{second}"
    for sentence, first, second in zip(
        base_df["sentence"], base_df["option1"], base_df["option2"]
    )
]

# Gold label as text: "answer" converted with int() selects option1 when it
# equals 1, otherwise option2.
base_df["label"] = [
    first if int(answer) == 1 else second
    for first, second, answer in zip(
        base_df["option1"], base_df["option2"], base_df["answer"]
    )
]

In [None]:
# Upload only the columns Zeno needs: "id" identifies each example, "prompt" is
# the displayed data, "label" is the gold answer, and "input length" rides
# along as an extra metadata column.
proj.upload_dataset(
    base_df[["id", "prompt", "label", "input length"]],
    id_column="id",
    data_column="prompt",
    label_column="label",
)

In [None]:
# For every model, fetch its leaderboard detail rows for the WinoGrande task,
# align them with the base examples, and upload them as a Zeno system.
for m in models:
    output_df = datasets.load_dataset(
        f"open-llm-leaderboard/details_{m}", "harness_winogrande_5"
    )["latest"].to_pandas()

    # Join on the raw sentence text to recover the example "id" and options.
    # NOTE(review): merge() defaults to an inner join, so result rows whose
    # "example" text matches no "sentence" are dropped silently — confirm the
    # row counts line up if accuracy looks off.
    merged_df = output_df.merge(base_df, left_on="example", right_on="sentence")

    # Displayed output: the option with the higher prediction score (option2 on
    # ties), followed by the raw prediction values.
    merged_df["output"] = [
        f"{first if scores[0] > scores[1] else second} {scores}"
        for first, second, scores in zip(
            merged_df["option1"], merged_df["option2"], merged_df["predictions"]
        )
    ]

    # Per-example accuracy pulled out of each row's "metrics" mapping.
    merged_df["acc"] = [row_metrics["acc"] for row_metrics in merged_df["metrics"]]

    proj.upload_system(
        merged_df[["id", "output", "acc"]],
        name=m,
        id_column="id",
        output_column="output",
    )