# DROP Task

English reading comprehension benchmark requiring Discrete Reasoning Over the content of Paragraphs.


In [None]:
from zeno_client import ZenoClient, ZenoMetric
import datasets
import json
import os
import dotenv

# Load variables from a .env file into the process environment.
# override=True lets values from .env replace variables that are already set,
# so the key read from os.environ later in the notebook comes from .env when present.
dotenv.load_dotenv(override=True)

In [None]:
# Read the Zeno API key from the environment; indexing os.environ raises
# KeyError when ZENO_API_KEY is not set, so a missing key fails here.
API_KEY = os.environ["ZENO_API_KEY"]
# Client used to create the Zeno project later in the notebook.
client = ZenoClient(API_KEY)

In [None]:
# Models whose leaderboard results are uploaded. Each entry is the suffix of
# an "open-llm-leaderboard/details_<model>" dataset name.
models = [
    "01-ai__Yi-34B_public",
]

In [None]:
# Load the "drop" dataset and keep its validation split as a pandas DataFrame.
drop_splits = datasets.load_dataset("drop")
base_df = drop_splits["validation"].to_pandas()

In [None]:
def _render_input(row):
    """Return the markdown text for one row: the passage followed by the question."""
    return f"**Passage**: {row['passage']} \n\n**Question:** {row['question']}"


def _join_spans(answer):
    """Return all answer spans of one example as a single comma-separated string."""
    return ", ".join(answer['spans'])


def _first_type(answer):
    """Return the first entry of the example's answer `types` list."""
    return answer['types'][0]


# Keep a single row per query_id before deriving the display/metadata columns.
base_df = base_df.drop_duplicates(subset=["query_id"])

# Text columns: the rendered prompt and the joined reference answers.
base_df["input"] = base_df.apply(_render_input, axis=1)
base_df["answers"] = base_df["answers_spans"].apply(_join_spans)

# Metadata columns: answer type plus character lengths of passage and question.
base_df["answer type"] = base_df["answers_spans"].apply(_first_type)
base_df["passage length"] = base_df["passage"].str.len()
base_df["question length"] = base_df["question"].str.len()

In [None]:
# How each part of an instance is displayed: the data as markdown, the label
# and the system output as plain text.
project_view = {
    "data": {"type": "markdown"},
    "label": {"type": "text"},
    "output": {"type": "text"},
}

# One "mean" metric per score column, in the order em, f1.
project_metrics = [
    ZenoMetric(name=score, type="mean", columns=[score]) for score in ("em", "f1")
]

proj = client.create_project(
    name="DROP OpenLLM Leaderboard",
    view=project_view,
    description="DROP task in the Open-LLM-Leaderboard (https://arxiv.org/pdf/1903.00161.pdf).",
    metrics=project_metrics,
)

In [None]:
# Columns sent to Zeno: the id, the displayed data ("input"), the label
# ("answers"), and the remaining metadata columns.
dataset_columns = [
    "query_id",
    "input",
    "answers",
    "passage length",
    "question length",
    "answer type",
]
proj.upload_dataset(
    base_df[dataset_columns],
    id_column="query_id",
    data_column="input",
    label_column="answers",
)

In [None]:
# For every model: load its leaderboard details, align them with the dataset
# rows by query id, and upload the prediction together with its f1/em scores.
for m in models:
    details_name = f"open-llm-leaderboard/details_{m}"
    output_df = datasets.load_dataset(details_name, "harness_drop_3")["latest"].to_pandas()

    # Join on the example id, then keep one row per query_id.
    merged_df = output_df.merge(base_df, left_on="example", right_on="query_id")
    merged_df = merged_df.drop_duplicates(subset=["query_id"])

    # Use the first prediction as the output; fall back to '' when there is none.
    # The explicit length check is kept instead of a truthiness test.
    merged_df["output"] = merged_df["predictions"].apply(
        lambda preds: preds[0] if len(preds) > 0 else ''
    )

    # Pull each score out of the per-example metrics mapping into its own column.
    for score in ("f1", "em"):
        merged_df[score] = merged_df["metrics"].apply(lambda scores, key=score: scores[key])

    proj.upload_system(
        merged_df[["query_id", "output", "f1", "em"]],
        name=m,
        id_column="query_id",
        output_column="output",
    )