# GSM8K Task

GSM8K is a dataset of diverse grade-school math word problems used to measure a model's ability to solve multi-step mathematical reasoning problems.

In [None]:
from zeno_client import ZenoClient, ZenoMetric
import datasets
import os
import dotenv

# Load settings through python-dotenv before the API key is read from the
# environment in the next cell.
# NOTE(review): override=True presumably lets values from the .env file replace
# variables already set in the process environment — confirm against the
# python-dotenv documentation.
dotenv.load_dotenv(override=True)

In [None]:
# Read the Zeno API key from the environment; a missing ZENO_API_KEY raises
# KeyError here rather than failing later inside the client.
API_KEY = os.environ["ZENO_API_KEY"]
# Client object used below to create the Zeno project.
client = ZenoClient(API_KEY)

In [None]:
# Model identifiers; each one is appended to "open-llm-leaderboard/details_"
# to locate its results and is also used as the uploaded system's name.
models = [
    "teknium__OpenHermes-2.5-Mistral-7B",
    "Weyaxi__OpenHermes-2.5-neural-chat-7b-v3-1-7B",
]

In [None]:
# Load the "main" configuration of the "gsm8k" dataset and keep its "test"
# split as a pandas DataFrame.
gsm8k_splits = datasets.load_dataset("gsm8k", "main")
initial_df = gsm8k_splits["test"].to_pandas()

In [None]:
# Expose the DataFrame index as an explicit "id" column; the dataset and
# system uploads below both pass id_column="id".
initial_df["id"] = initial_df.index

In [None]:
# Project metrics: the mean of the "acc" column and the mean of the
# "output length" column (both columns are supplied with each system upload).
project_metrics = [
    ZenoMetric(name="acc", type="mean", columns=["acc"]),
    ZenoMetric(name="avg. output length", type="mean", columns=["output length"]),
]

# Create the Zeno project that the dataset and model outputs are uploaded to.
proj = client.create_project(
    name="GSM8K OpenLLM",
    view="text-classification",
    description="Open LLM GSM8K dataset",
    metrics=project_metrics,
)

In [None]:
# Upload the base dataset: "question" is the displayed data, "answer" the
# label, and "id" the key that system uploads are matched on.
proj.upload_dataset(
    initial_df,
    id_column="id",
    data_column="question",
    label_column="answer",
)

In [None]:
# Upload one Zeno system per model, built from that model's leaderboard details.
for m in models:
    # Per-example results from the "latest" split of the model's
    # "harness_gsm8k_5" details configuration.
    details_repo = "open-llm-leaderboard/details_" + m
    output_df = datasets.load_dataset(details_repo, "harness_gsm8k_5")["latest"].to_pandas()

    # Join on the question text to attach each row's dataset "id". This is an
    # inner join, so detail rows whose "example" matches no "question" are
    # dropped.
    merged_df = output_df.merge(initial_df, left_on="example", right_on="question")

    # The first entry of "predictions" is used as the system output.
    merged_df["output"] = merged_df["predictions"].apply(lambda preds: preds[0])
    merged_df["output length"] = merged_df["output"].apply(len)
    merged_df["acc"] = merged_df["metrics"].apply(lambda scores: scores["acc"])

    # Only the id, the output and the two metric columns are sent to Zeno.
    system_df = merged_df[["id", "output", "output length", "acc"]]
    proj.upload_system(system_df, name=m, id_column="id", output_column="output")