In [1]:
from openai import OpenAI
from pydantic import BaseModel


# Schema for one dataset row: a set of facts about a single zip code.
# Used as the element type of `Rows` and as the structured-output format
# (`response_format=Row`) of the GPT-backed model functions in this notebook.
# Documented with `#` comments only, so the schema itself is left untouched.
class Row(BaseModel):
    zip_code: str  # the only field the model functions take as input
    city: str  # compared exactly by check_concrete_fields
    state: str  # compared exactly by check_concrete_fields
    avg_temp_f: float  # average temperature in Fahrenheit; scored by check_value_fields
    population: int  # scored by check_value_fields
    median_income: int  # scored by check_value_fields
    known_for: str  # free text; judged by check_subjective_fields

# Wrapper schema that lets one structured-output response carry several rows;
# generate_dataset_rows passes it as `response_format` and unpacks `rows`.
class Rows(BaseModel):
    rows: list[Row]  # read back via model_dump()['rows']

def generate_dataset_rows(location: str = "United States", count: int = 5, year: int = 2022):
    """Ask gpt-4o-mini to generate a small table of zip-code facts.

    The model is prompted for `count` rows about random zip codes in
    `location` for `year`, and the reply is parsed into the `Rows` schema.
    The rows are model-generated rather than looked up, and the number of
    rows returned is whatever the model produced (it is not checked against
    `count`).

    Args:
        location: Region named in the prompt.
        count: Number of rows requested in the prompt.
        year: Year named in the prompt.

    Returns:
        A list of plain dicts, one per generated row, with the `Row` fields
        as keys.

    Raises:
        ValueError: If the reply carries no parsed `Rows` object, e.g.
            because the model refused. The refusal text, when available, is
            included in the message.
    """
    client = OpenAI()

    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Please generate {count} rows of data for random zip codes in {location} for the year {year}."},
        ],
        response_format=Rows,
    )

    message = completion.choices[0].message
    # `parsed` is None when the model refuses; fail with a readable error
    # instead of "'NoneType' object has no attribute 'model_dump'".
    if message.parsed is None:
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"gpt-4o-mini returned no parsable rows for {location!r} ({year}): "
            f"{refusal or 'no refusal message provided'}"
        )

    return message.parsed.model_dump()['rows']

# Build the default dataset (5 rows, "United States", 2022) that the first
# `evaluation` below scores against. This calls the OpenAI API, so the rows
# presumably differ from run to run.
rows = generate_dataset_rows()

In [2]:
# Weave project that every weave.init(project_name) call in this notebook uses.
project_name = "leaderboard-demo-3"

In [3]:
import weave


@weave.op
def check_concrete_fields(city: str, state: str, model_output: dict):
    """Score the categorical fields by exact string equality.

    Args:
        city: Expected city from the dataset row.
        state: Expected state from the dataset row.
        model_output: The model's prediction; must contain "city" and "state".

    Returns:
        A dict with boolean "city_match" and "state_match" entries.
    """
    predicted_city = model_output["city"]
    predicted_state = model_output["state"]

    scores = {}
    scores["city_match"] = city == predicted_city
    scores["state_match"] = state == predicted_state
    return scores

@weave.op
def check_value_fields(avg_temp_f: float, population: int, median_income: int, model_output: dict):
    """Score the numeric fields by relative error against the dataset row.

    Each score is ``|expected - predicted| / |expected|``. The magnitude of
    the expected value is used as the denominator so that a negative
    expected value (for example a sub-zero average temperature) still yields
    a non-negative error.

    Relative error is undefined when the expected value is 0. In that case
    the absolute error is reported instead, which keeps the score finite and
    avoids a ZeroDivisionError.

    Args:
        avg_temp_f: Expected average temperature.
        population: Expected population.
        median_income: Expected median income.
        model_output: The model's prediction; must contain "avg_temp_f",
            "population" and "median_income".

    Returns:
        A dict with "avg_temp_f_err", "population_err" and
        "median_income_err", each a non-negative number.
    """
    def _relative_error(expected, predicted):
        # Non-negative relative error, falling back to absolute error at 0.
        deviation = abs(expected - predicted)
        if expected == 0:
            return float(deviation)
        return deviation / abs(expected)

    return {
        "avg_temp_f_err": _relative_error(avg_temp_f, model_output["avg_temp_f"]),
        "population_err": _relative_error(population, model_output["population"]),
        "median_income_err": _relative_error(median_income, model_output["median_income"]),
    }

@weave.op
def check_subjective_fields(zip_code: str, known_for: str, model_output: dict):
    """Use gpt-4o-mini as a judge for the free-text "known_for" field.

    The judge is shown the expected answer from the dataset row and the
    model's answer, and is asked whether the model's answer is correct.

    Args:
        zip_code: Zip code the question is about.
        known_for: Expected answer from the dataset row.
        model_output: The model's prediction; must contain "known_for".

    Returns:
        A dict with a single boolean "correct_known_for" entry.

    Raises:
        ValueError: If the judge's reply carries no parsed verdict, e.g.
            because it refused to answer.
    """
    client = OpenAI()

    # Structured-output schema for the judge's verdict.
    class Response(BaseModel):
        correct_known_for: bool

    # Wording fixed: the prompt previously read "is best known best for".
    question = (
        f"My student was asked what the zip code {zip_code} is best known for. "
        f"The right answer is '{known_for}', and they said '{model_output['known_for']}'. "
        "Is their answer correct?"
    )

    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": question},
        ],
        response_format=Response,
    )

    message = completion.choices[0].message
    # `parsed` is None when the judge refuses; surface that explicitly rather
    # than failing with an AttributeError on None.
    if message.parsed is None:
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"Judge returned no parsable verdict for zip code {zip_code}: "
            f"{refusal or 'no refusal message provided'}"
        )

    return message.parsed.model_dump()


# Evaluation over the generated `rows`, scored by all three scorers defined
# above. It is reused by the per-model cells that follow.
# NOTE(review): the name says "Concrete Fields" although the value and
# subjective scorers are attached as well — confirm whether the name is stale.
evaluation = weave.Evaluation(
    name="United States - 2022 - Concrete Fields",
    dataset=rows,
    scorers=[
        check_concrete_fields,
        check_value_fields,
        check_subjective_fields,
    ]
)

In [8]:
# Initialise Weave for `project_name` before defining and evaluating the op below.
weave.init(project_name)

@weave.op
def baseline_model(zip_code: str):
    """Constant baseline: ignores `zip_code` and always gives the same answer.

    Args:
        zip_code: Accepted to match the other models' signature; unused.

    Returns:
        A dict with hard-coded "city", "state", "avg_temp_f", "population",
        "median_income" and "known_for" values.
    """
    constant_answer = dict(
        city="California",
        state="NY",
        avg_temp_f=50.0,
        population=1_000_000,
        median_income=100_000,
        known_for="The Big Apple",
    )
    return constant_answer

# Score the constant baseline on `evaluation`. The top-level `await` relies on
# the notebook's support for awaiting directly in a cell.
await evaluation.evaluate(baseline_model)

In [None]:
# Repeated per cell, presumably so this cell can be run on its own.
weave.init(project_name)

@weave.op
def gpt_4o_mini_no_context(zip_code: str):
    """Predict a `Row` for `zip_code` from a bare prompt.

    The model only sees the text "Zip code <zip_code>"; the `Row` response
    format is what tells it which fields to fill in.

    Args:
        zip_code: Zip code to describe.

    Returns:
        The parsed `Row` as a plain dict.
    """
    client = OpenAI()

    bare_prompt = f"""Zip code {zip_code}"""
    response = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": bare_prompt}],
        response_format=Row,
    )

    structured_row = response.choices[0].message.parsed
    return structured_row.model_dump()

# Score the bare-prompt GPT model on the same `evaluation` as the baseline.
await evaluation.evaluate(gpt_4o_mini_no_context)

In [None]:
# Repeated per cell, presumably so this cell can be run on its own.
weave.init(project_name)

@weave.op
def gpt_4o_mini_with_context(zip_code: str):
    """Predict a `Row` for `zip_code` using an explicit list of questions.

    Unlike the bare-prompt variant, the prompt spells out each of the six
    facts the model is expected to provide.

    Args:
        zip_code: Zip code to describe.

    Returns:
        The parsed `Row` as a plain dict.
    """
    client = OpenAI()

    # Sent to the model verbatim, including the indentation of each line.
    question_prompt = f"""Please answer the following questions about the zip code {zip_code}:
                   1. What is the city?
                   2. What is the state?
                   3. What is the average temperature in Fahrenheit?
                   4. What is the population?
                   5. What is the median income?
                   6. What is the most well known thing about this zip code?
                   """
    response = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": question_prompt}],
        response_format=Row,
    )

    structured_row = response.choices[0].message.parsed
    return structured_row.model_dump()

await evaluation.evaluate(baseline_model)

In [None]:
# Scorers shared by every leaderboard evaluation built below.
scorers = [
    check_concrete_fields,
    check_value_fields,
    check_subjective_fields,
]
# One evaluation per (location, year) pair. Each gets a freshly generated
# 5-row dataset, and the dataset and evaluation share the name
# "<location> - <year>". Every pair triggers one generate_dataset_rows call.
evaluations = [
    weave.Evaluation(
        name=f"{location} - {year}",
        dataset=weave.Dataset(
            name=f"{location} - {year}",
            rows=generate_dataset_rows(location, 5, year),
        ),
        scorers=scorers,
    )
    for location, year in (
        ("United States", 2022),
        ("California", 2022),
        # ("United States", 2000),  # disabled
    )
]
# Models to run against every evaluation; all take `zip_code` and return a dict.
models = [
    baseline_model,
    gpt_4o_mini_no_context,
    gpt_4o_mini_with_context,
]

for evaluation in evaluations:
    for model in models:
        await evaluation.evaluate(model, __weave={"display_name": evaluation.name + ":" + model.__name__})

https://beta.wandb.ai/?betaVersion=234f24492fa0a1999a084aeb93d4d2128944512c
https://beta.wandb.ai/timssweeney/leaderboard-demo-1/weave/leaderboards/leaderboard-2024-10-23T07-43-07-604Z?edit=true