# Exploring Web Arena Results with Zeno 


In [None]:
import pandas as pd
import json
import os
from dotenv import load_dotenv

import zeno_client

We first need to convert and combine the output `HTML` trajectories into a single `JSON` file using the `html2json` script:

```bash
python html2json.py --results_folder RESULTS_DIR --config_json ../config_files/test.raw.json
```

Run this once for each model you wish to compare, then add the resulting filenames to the `RESULT_JSONS` list and a matching display name, in the same order, to `RESULT_NAMES`.

In [None]:
# Paths to the combined trajectory dumps written by html2json, one per model.
RESULT_JSONS = [
    "../json_dump_4.json",
    "../json_dump_palm.json",
]

# Display names used for the systems in Zeno; entry i labels RESULT_JSONS[i].
RESULT_NAMES = [
    "gpt4-8k-cot",
    "palm-2",
]

## Obtaining Data

We use the first results file to build the base dataset that we'll upload to Zeno. It contains only the task metadata and the initial prompt intent for each example; the model outputs are uploaded separately later on.

In [None]:
# Load the first model's combined results. The mapping is keyed by example id;
# it is used here only as the source of task metadata and prompt intents.
# NOTE: the list is named RESULT_JSONS (no "S" after RESULT); the previous
# spelling RESULTS_JSONS was undefined and raised NameError.
with open(RESULT_JSONS[0], "r") as f:
    raw_json: dict = json.load(f)

In [None]:
# One row per task: identifying metadata plus the intent wrapped as a
# single-message chat and JSON-encoded into the "context" column.
df = pd.DataFrame(
    {
        "example_id": list(raw_json),
        "site": [", ".join(task["sites"]) for task in raw_json.values()],
        "eval_type": [", ".join(task["eval_types"]) for task in raw_json.values()],
        "achievable": [task["achievable"] for task in raw_json.values()],
        "context": [
            json.dumps([{"role": "system", "content": task["intent"]}])
            for task in raw_json.values()
        ],
    }
)

## Authenticate and Create a Project

We can now create a new [Zeno](https://zenoml.com) project and upload this data.

Create an account and API key by signing up at [Zeno Hub](https://hub.zenoml.com) and going to your [Account page](https://hub.zenoml.com/account). Save the API key in a `.env` file as `ZENO_API_KEY=<your key>` so that the next cell can load it.

In [None]:
# Pull variables from the local .env file into the process environment;
# override=True lets values from the file replace ones that are already set.
load_dotenv(override=True)

# ZENO_API_KEY authenticates the client (os.getenv yields None when it is unset).
client = zeno_client.ZenoClient(os.getenv("ZENO_API_KEY"))

In [None]:
# View spec handed to Zeno: the input ("data") and the model "output" are both
# lists of chat messages with markdown content; only output messages set
# "highlight". Labels are plain markdown.
project_view = {
    "data": {
        "type": "list",
        "elements": {"type": "message", "content": {"type": "markdown"}},
        "collapsible": "top",
    },
    "label": {"type": "markdown"},
    "output": {
        "type": "list",
        "elements": {
            "type": "message",
            "highlight": True,
            "content": {"type": "markdown"},
        },
        "collapsible": "top",
    },
}

# (metric display name, source column) pairs; every metric is a mean.
metric_columns = [
    ("success", "success"),
    ("# of go backs", "# of go_backs"),
    ("# of steps", "# of steps"),
]

project = client.create_project(
    name="Web Arena Exploration",
    view=project_view,
    metrics=[
        zeno_client.ZenoMetric(name=metric_name, type="mean", columns=[column])
        for metric_name, column in metric_columns
    ],
)

In [None]:
# Upload the base table: rows are identified by "example_id" and the
# JSON-encoded intent in "context" serves as the data column.
project.upload_dataset(
    df,
    id_column="example_id",
    data_column="context",
)

## Uploading Model Outputs

We can now upload the full trajectory outputs for our models.

In [None]:
def format_message(row, image_base_url="https://phontron.com/data/webarena_images"):
    """Convert one result's raw messages into role/content chat messages.

    Each entry of ``row["messages"]`` is a dict that stores its text under
    either a ``"user"`` or an ``"assistant"`` key. User entries also carry an
    ``"image"`` path: its last two path components are appended to
    ``image_base_url`` and the image is embedded as a clickable markdown image
    placed before the text. Assistant text is passed through unchanged.

    Args:
        row: Mapping with a ``"messages"`` list as described above.
        image_base_url: URL prefix under which the images are hosted. Trailing
            slashes are ignored. Defaults to the host that was previously
            hard-coded, so existing callers see identical output.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts in message order.
    """
    base = image_base_url.rstrip("/")
    formatted = []
    for message in row["messages"]:
        # Any entry without a "user" key is treated as an assistant turn.
        role = "user" if "user" in message else "assistant"
        content = message[role]
        if role == "user":
            # Keep only "<parent dir>/<file name>" of the stored image path.
            image_path = "/".join(message["image"].split("/")[-2:])
            image_url = f"{base}/{image_path}"
            # Markdown image that links to itself, followed by the user text.
            content = f"[![image]({image_url})]({image_url})\n{content}"
        formatted.append({"role": role, "content": content})
    return formatted

In [None]:
def get_system_df(result_path: str):
    """Load one model's results file and tabulate its per-example outputs.

    Args:
        result_path: Path to a JSON file mapping example ids to result dicts.
            Each result dict is read for its ``"messages"`` list and its
            ``"success"`` value.

    Returns:
        A DataFrame with one row per example: the example id, counts of
        assistant messages containing the ```` `click ````, ```` `type ```` and
        ```` `go_back ```` markers, the total number of messages, the formatted
        conversation as a JSON string (``"context"``), and ``"success"``.
    """
    with open(result_path, "r") as f:
        json_input: dict = json.load(f)

    def count_actions(result, marker):
        """Count assistant messages in ``result`` whose text contains ``marker``."""
        return sum(
            1
            for message in result["messages"]
            if "assistant" in message and marker in message["assistant"]
        )

    results = list(json_input.values())
    columns = {
        "example_id": list(json_input),
        "# of clicks": [count_actions(result, "`click") for result in results],
        "# of types": [count_actions(result, "`type") for result in results],
        "# of go_backs": [count_actions(result, "`go_back") for result in results],
        # Counts every message, user and assistant alike.
        "# of steps": [len(result["messages"]) for result in results],
        "context": [json.dumps(format_message(result)) for result in results],
        "success": [result["success"] for result in results],
    }
    return pd.DataFrame(columns)

In [None]:
# Every results file needs a display name. Check this before uploading
# anything: indexing RESULT_NAMES inside the loop would otherwise fail with
# IndexError only after earlier systems had already been uploaded.
if len(RESULT_NAMES) < len(RESULT_JSONS):
    raise ValueError(
        f"RESULT_NAMES has {len(RESULT_NAMES)} entries but RESULT_JSONS has "
        f"{len(RESULT_JSONS)}; add a name for every results file."
    )

# Upload each model's trajectories as its own Zeno system, pairing the i-th
# results file with the i-th name.
for system, system_name in zip(RESULT_JSONS, RESULT_NAMES):
    output_df = get_system_df(system)
    project.upload_system(
        output_df, name=system_name, id_column="example_id", output_column="context"
    )