Messy code for counting tokens from the experiments.

In [None]:
from collections import Counter

from redel.utils import read_jsonl


# from the docs
def count_tokens(fp):
    """Tally prompt and completion tokens per ID from a JSONL event log.

    Every record yielded by ``read_jsonl(fp)`` is inspected; only records
    whose ``"type"`` is ``"tokens_used"`` contribute.

    Returns:
        A ``(prompt, output)`` pair of ``Counter`` objects keyed by each
        record's ``"id"``, holding the summed ``"prompt_tokens"`` and
        ``"completion_tokens"`` values respectively.
    """
    prompt_by_id, output_by_id = Counter(), Counter()
    for record in read_jsonl(fp):
        if record["type"] != "tokens_used":
            continue
        node_id = record["id"]
        prompt_by_id[node_id] += record["prompt_tokens"]
        output_by_id[node_id] += record["completion_tokens"]
    return prompt_by_id, output_by_id

In [None]:
from pathlib import Path

# define base experiments path
# NOTE(review): this is a machine-specific absolute path; point it at your own
# experiments directory before running the cells below.
EXPERIMENTS = Path("/Users/andrew/Desktop/Code/kanpai/experiments")

In [None]:
import json


def count_system(fp, root_input_cost_1m, root_output_cost_1m, del_input_cost_1m=0, del_output_cost_1m=0):
    """Print average token usage and cost per run for one system, and return its total cost.

    ``fp`` is a ``Path`` to a directory containing ``results.jsonl``. The stem
    of each result's ``"log_dir"`` names a subdirectory of ``fp`` that holds
    that run's ``events.jsonl`` and ``state.json``. Tokens recorded under the
    ID of the first entry in ``state.json``'s ``"state"`` list are counted as
    "root"; every other token in the run is counted as "del".

    Args:
        fp: Directory holding one system's results.
        root_input_cost_1m: Dollar price per 1M root prompt tokens.
        root_output_cost_1m: Dollar price per 1M root output tokens.
        del_input_cost_1m: Dollar price per 1M non-root prompt tokens.
        del_output_cost_1m: Dollar price per 1M non-root output tokens.

    Returns:
        ``(total_cost, n)`` where ``n`` is the number of runs listed in
        ``results.jsonl``. Returns ``(0, 0)`` without printing anything when
        ``fp`` does not exist or when no runs are listed.
    """
    if not fp.exists():
        return 0, 0

    # One events.jsonl per run; the run directory is named after the stem of the logged log_dir.
    event_paths = [
        fp / Path(result["log_dir"]).stem / "events.jsonl" for result in read_jsonl(fp / "results.jsonl")
    ]

    n = len(event_paths)
    if n == 0:
        # Nothing to average: the per-run means below would divide by zero.
        return 0, 0

    total_prompt_tokens_root = 0
    total_output_tokens_root = 0
    total_prompt_tokens_del = 0
    total_output_tokens_del = 0
    for ep in event_paths:
        # The root node is the first entry of the saved state.
        with open(ep.parent / "state.json", encoding="utf-8") as f:
            state = json.load(f)
        root_id = state["state"][0]["id"]

        # Split this run's tokens into root vs. everything else.
        prompt_tokens, output_tokens = count_tokens(ep)
        prompt_tokens_root = prompt_tokens[root_id]
        output_tokens_root = output_tokens[root_id]

        total_prompt_tokens_root += prompt_tokens_root
        total_output_tokens_root += output_tokens_root
        total_prompt_tokens_del += prompt_tokens.total() - prompt_tokens_root
        total_output_tokens_del += output_tokens.total() - output_tokens_root

    # Per-run averages.
    avg_prompt_tokens_root = total_prompt_tokens_root / n
    avg_output_tokens_root = total_output_tokens_root / n
    avg_prompt_tokens_del = total_prompt_tokens_del / n
    avg_output_tokens_del = total_output_tokens_del / n

    # Prices are quoted per 1M tokens, hence the division by 1,000,000.
    avg_prompt_cost_root = avg_prompt_tokens_root * root_input_cost_1m / 1000000
    avg_output_cost_root = avg_output_tokens_root * root_output_cost_1m / 1000000
    avg_prompt_cost_del = avg_prompt_tokens_del * del_input_cost_1m / 1000000
    avg_output_cost_del = avg_output_tokens_del * del_output_cost_1m / 1000000

    total_cost = (avg_prompt_cost_root + avg_output_cost_root + avg_prompt_cost_del + avg_output_cost_del) * n

    print(f"========== {fp} ==========")
    print(f"Avg prompt tokens root: {avg_prompt_tokens_root} (${avg_prompt_cost_root})")
    print(f"Avg output tokens root: {avg_output_tokens_root} (${avg_output_cost_root})")
    print(f"Avg prompt tokens del: {avg_prompt_tokens_del} (${avg_prompt_cost_del})")
    print(f"Avg output tokens del: {avg_output_tokens_del} (${avg_output_cost_del})")
    print(f"N: {n}")
    print(f"Total cost: ${total_cost}")

    return total_cost, n

In [None]:
# Report the small-leaf system on each of the three benchmarks.
# Root tokens are priced at $5 / $15 per 1M input / output tokens,
# non-root ("del") tokens at $0.5 / $1.5 per 1M.
count_system(
    EXPERIMENTS / Path("fanoutqa/dev/trial2/small-leaf"),
    root_input_cost_1m=5,
    root_output_cost_1m=15,
    del_input_cost_1m=0.5,
    del_output_cost_1m=1.5,
)
count_system(
    EXPERIMENTS / Path("travelplanner/validation/small-leaf"),
    root_input_cost_1m=5,
    root_output_cost_1m=15,
    del_input_cost_1m=0.5,
    del_output_cost_1m=1.5,
)
count_system(
    EXPERIMENTS / Path("webarena/test/small-leaf"),
    root_input_cost_1m=5,
    root_output_cost_1m=15,
    del_input_cost_1m=0.5,
    del_output_cost_1m=1.5,
)

In [None]:
# Report the baseline system on each of the three benchmarks.
# Only root prices are given, so non-root tokens use the default price of 0.
count_system(EXPERIMENTS / Path("fanoutqa/dev/trial2/baseline"), root_input_cost_1m=5, root_output_cost_1m=15)
count_system(EXPERIMENTS / Path("travelplanner/validation/baseline"), root_input_cost_1m=5, root_output_cost_1m=15)
count_system(EXPERIMENTS / Path("webarena/test/baseline"), root_input_cost_1m=5, root_output_cost_1m=15)

In [None]:
# Sum three per-benchmark costs for each system.
# NOTE(review): these literals are presumably copied by hand from the
# "Total cost" lines printed by count_system — re-run and update them if the
# logs or prices change.
total_cost_small_leaf = 9.200377 + 11.0021565 + 98.039051
total_cost_baseline = 102.65177 + 41.63858 + 184.745045
print(total_cost_small_leaf, total_cost_baseline)

In [None]:
# Ratio of the baseline total (329.035395) to the small-leaf total (118.2415845),
# i.e. the two hard-coded sums 102.65177 + 41.63858 + 184.745045 and
# 9.200377 + 11.0021565 + 98.039051.
329.035395 / 118.2415845

In [None]:
# Prices per system, in dollars per 1M tokens:
# (root input, root output, non-root input, non-root output).
costs_by_system = {
    "full": (5, 15, 5, 15),
    "root-fc": (5, 15, 5, 15),
    "baseline": (5, 15, 0, 0),
    "small-leaf": (5, 15, 0.5, 1.5),
    "small-all": (0.5, 1.5, 0.5, 1.5),
    "small-baseline": (0.5, 1.5, 0, 0),
    "short-context": (5, 15, 5, 15),
    "short-baseline": (5, 15, 0, 0),
}

# Iterate the price table directly (dicts preserve insertion order) so the set
# of systems reported can never drift from the set of systems that have prices.
for system, (root_in_cost, root_out_cost, del_in_cost, del_out_cost) in costs_by_system.items():
    # Accumulate this system's cost across all three benchmarks.
    benchmark_cost = 0
    for benchmark in ["fanoutqa/dev/trial2", "travelplanner/validation", "webarena/test"]:
        total_cost, _ = count_system(
            EXPERIMENTS / benchmark / system,
            root_input_cost_1m=root_in_cost,
            root_output_cost_1m=root_out_cost,
            del_input_cost_1m=del_in_cost,
            del_output_cost_1m=del_out_cost,
        )
        benchmark_cost += total_cost
    print(f"TOTAL SYSTEM COST: ${benchmark_cost}")