### Setup

Import libraries, set up model, and read input data.


In [None]:
import pandas as pd
import json
import os
from dotenv import load_dotenv

from zeno_client import ZenoClient, ZenoMetric

In [None]:
# Load the benchmark records. The context manager closes the file handle as
# soon as parsing finishes, and the explicit UTF-8 encoding keeps decoding
# independent of the platform's default locale.
with open("tax-benchmark.json", encoding="utf-8") as benchmark_file:
    data = json.load(benchmark_file)

In [None]:
def format_question(input):
    """Render one benchmark record as a question with numbered options.

    Reads ``input["source_question"]["description"]`` and
    ``input["source_question"]["options"]``. Every literal two-character
    backslash-n sequence in the description is replaced by a real newline,
    a blank line follows, and each option is written on its own line as
    ``"<1-based position>. <option>"``.

    Returns:
        The formatted question text, ending with a newline after the last
        option (or with the blank line when there are no options).
    """
    source = input["source_question"]
    description = source["description"].replace("\\n", "\n")
    numbered_options = [
        f"{position}. {option}\n"
        for position, option in enumerate(source["options"], start=1)
    ]
    return description + "\n\n" + "".join(numbered_options)


# Flatten every benchmark record into one row of the dataset table.
source_questions = [record["source_question"] for record in data]

columns = {
    # Question text with its numbered answer options appended.
    "question": [format_question(record) for record in data],
    # Answers are stored as strings so they can be compared with the
    # stringified model outputs later on.
    "answer": [str(question["correct_answer"]) for question in source_questions],
}
# These metadata fields are copied over unchanged.
for field in ("reference", "tag", "category"):
    columns[field] = [question[field] for question in source_questions]

df_input = pd.DataFrame(columns)

# Character count of the formatted question.
df_input["question length"] = df_input["question"].apply(len)
# The positional row index doubles as the instance id.
df_input["id"] = df_input.index

In [None]:
# Optional: cluster the questions into topics with BERTopic.
from bertopic import BERTopic

# Only the raw descriptions are clustered, not the formatted question text
# that also contains the numbered options.
descriptions = [record["source_question"]["description"] for record in data]

topic_model = BERTopic(language="english", min_topic_size=3)
topics, probs = topic_model.fit_transform(descriptions)

# Store the assigned topic ids as strings, one per question row.
df_input["topic"] = pd.Series(topics, index=df_input.index).astype(str)

## Create Zeno Project

Our view configuration renders the input data and the system output as markdown, and the label as plain text.
We'll add two metrics: accuracy and mean output length.


In [None]:
# Load variables from the .env file one directory up; override=True lets the
# file's values replace any that are already set in the environment.
load_dotenv("../.env", override=True)

api_key = os.environ.get("ZENO_API_KEY")
if not api_key:
    # Stop here with an actionable message instead of handing None to the
    # client and failing later in a less obvious place.
    raise RuntimeError(
        "ZENO_API_KEY is not set; add it to ../.env or export it before "
        "running this notebook."
    )
client = ZenoClient(api_key)

In [None]:
# Rendering type for each part of an instance: question and model output as
# markdown, the expected answer as plain text.
view_spec = {
    "data": {"type": "markdown"},
    "label": {"type": "text"},
    "output": {"type": "markdown"},
}

# Both metrics are means over the named columns ("correct" and
# "output length").
project_metrics = [
    ZenoMetric(name="accuracy", type="mean", columns=["correct"]),
    ZenoMetric(name="output length", type="mean", columns=["output length"]),
]

project = client.create_project(
    name="LLM Taxes Benchmark",
    view=view_spec,
    description="Tax questions for LLMs",
    public=True,
    metrics=project_metrics,
)

In [None]:
# Upload the question table: "id" identifies each row, "question" is the
# displayed data and "answer" is the label.
project.upload_dataset(
    df_input,
    id_column="id",
    data_column="question",
    label_column="answer",
)

In [None]:
# Upload one system per model. The model names come from the keys of the
# first record's "full" mapping; every record is then indexed by that name.
for model in data[0]["full"]:
    full_outputs = [record["full"][model] for record in data]
    simplified_outputs = [record["simplified"][model] for record in data]

    df_system = pd.DataFrame(
        {
            # Displayed output: the full answer followed by the simplified one.
            "output": [
                f"**Full:** {full}\n\n**Simplified**: {simplified}"
                for full, simplified in zip(full_outputs, simplified_outputs)
            ],
            "output length": [len(full) for full in full_outputs],
            "simplified output": [str(simplified) for simplified in simplified_outputs],
        }
    )
    # A row is correct when the stringified simplified output equals the
    # expected answer string exactly; ids are copied from the dataset table.
    df_system = df_system.assign(
        correct=df_input["answer"] == df_system["simplified output"],
        id=df_input["id"],
    )

    # Slashes in the model name are replaced with dashes for the system name.
    system_name = model.replace("/", "-")
    project.upload_system(
        df_system,
        name=system_name,
        id_column="id",
        output_column="output",
    )