In [1]:
import os
os.chdir("../")
os.getcwd()

'/data1/xhuan192/codes/medagents-benchmark'

In [2]:
from pathlib import Path
import json

In [3]:
# 4o:  2.5 / 1_000_000, 10 / 1_000_000
def parse_result(data):
    num_samples = len(data)
    num_correct = 0
    prompt_tokens = 0
    completion_tokens = 0
    total_time = 0
    for sample in data:
        if sample["predicted_answer"].upper() == sample["answer_idx"].upper():
            num_correct += 1
        prompt_tokens += sample['token_usage']["prompt_tokens"]
        completion_tokens += sample['token_usage']["completion_tokens"]
        total_time += sample["time_elapsed"]
    total_tokens = prompt_tokens + completion_tokens
    avg_prompt_tokens = prompt_tokens / num_samples
    avg_completion_tokens = completion_tokens / num_samples
    avg_tokens = total_tokens / num_samples
    
    accuracy = num_correct / num_samples
    # 4o
    input_cost = prompt_tokens * 2.5 / 1_000_000
    output_cost = completion_tokens * 10 / 1_000_000
    total_cost = input_cost + output_cost

    average_input_cost = input_cost / num_samples
    average_output_cost = output_cost / num_samples
    average_total_cost = total_cost / num_samples

    average_time = total_time / num_samples
    return {
        "accuracy": accuracy,
        "avg_prompt_tokens": avg_prompt_tokens,
        "avg_completion_tokens": avg_completion_tokens,
        "avg_tokens": avg_tokens,
        "input_cost": input_cost,
        "output_cost": output_cost,
        "total_cost": total_cost,
        "average_input_cost": average_input_cost,
        "average_output_cost": average_output_cost,
        "average_total_cost": average_total_cost,
        "total_time": total_time,
        "average_time": average_time,
        "input_tokens": prompt_tokens,
        "output_tokens": completion_tokens,
        "num_samples": num_samples,
    }


In [4]:
output_dir = Path("outputs")
# get all json files recursively in the output_dir
json_files = list(output_dir.rglob("*.json"))
"""
├── mdagents
│   ├── medbullets
│   │   └── gpt-4o-1120-nofilter-global_medbullets_test_hard_adaptive.json
│   └── pubmedqa
│       └── gpt-4o-1120-nofilter-global_pubmedqa_test_hard_adaptive.json
├── medagents
│   ├── medbullets
│   │   └── gpt-4o-1120-nofilter-global-medbullets-test_hard-syn_verif.json
└── medprompt
    ├── cot
    │   ├── medbullets
    │   │   └── gpt-4o-1120-nofilter-global-medbullets-test_hard-cot.json
"""

method_dataset_result_mapping = {}
for json_file in json_files:
    # get the method and dataset from the file name
    method, dataset = json_file.parts[-3], json_file.parts[-2]
    model = json_file.parts[-1].split(dataset)[0]

    if method not in method_dataset_result_mapping:
        method_dataset_result_mapping[method] = {}

    # get the json data
    with open(json_file, "r") as f:
        data = json.load(f)
    method_dataset_result_mapping[method][dataset] = parse_result(data)

In [5]:
def parser_df(method_dataset_result_mapping, key):
    method_dataset_acc_mapping = {}
    for method, dataset_result_mapping in method_dataset_result_mapping.items():
        for dataset, result in dataset_result_mapping.items():
            if method not in method_dataset_acc_mapping:
                method_dataset_acc_mapping[method] = {}
            method_dataset_acc_mapping[method][dataset] = result[key]
    return method_dataset_acc_mapping


In [6]:
import pandas as pd


In [7]:
key = "accuracy"
print(key)
df = pd.DataFrame(parser_df(method_dataset_result_mapping,key))
df = df[["zero_shot",
"few_shot",
"cot",
"cot_sc",
"self_refine",
"multi_persona",
"medprompt",
"medagents",
"mdagents"]]
df = df.transpose()
df["avg"], df["sum"] = df.mean(axis=1), df.sum(axis=1)
df

accuracy


Unnamed: 0,medbullets,pubmedqa,medxpertqa-r,mmlu,medmcqa,medqa,medexqa,medxpertqa-u,mmlu-pro,avg,sum
zero_shot,0.179775,0.09,0.13,0.287671,0.28,0.42,0.15,0.11,0.37,0.224161,2.017447
few_shot,0.258427,0.16,0.14,0.246575,0.22,0.35,0.17,0.06,0.16,0.196111,1.765002
cot,0.314607,0.08,0.19,0.30137,0.29,0.55,0.25,0.18,0.39,0.282886,2.545977
cot_sc,0.292135,0.1,0.15,0.369863,0.29,0.52,0.22,0.18,0.38,0.278,2.501998
self_refine,0.337079,0.15,,0.333333,0.29,0.61,,,,0.344082,1.720412
multi_persona,0.235955,0.11,0.15,0.39726,0.31,0.4,0.25,0.12,0.45,0.269246,2.423215
medprompt,0.269663,0.13,0.21,0.39726,0.3,0.48,0.2,0.22,0.45,0.295214,2.656923
medagents,0.222222,0.16,0.17,,0.32,0.51,0.21,0.16,0.38,0.266528,2.132222
mdagents,0.269231,0.222222,0.105263,0.262295,0.342105,0.326923,0.202703,0.06383,0.268293,0.229207,2.062865


In [8]:
key = "num_samples"
print(key)
df = pd.DataFrame(parser_df(method_dataset_result_mapping,key))
df = df[["zero_shot",
"few_shot",
"cot",
"cot_sc",
"self_refine",
"multi_persona",
"medprompt",
"medagents",
"mdagents"]]
df = df.transpose()
df["avg"], df["sum"] = df.mean(axis=1), df.sum(axis=1)
df

num_samples


Unnamed: 0,medbullets,pubmedqa,medxpertqa-r,mmlu,medmcqa,medqa,medexqa,medxpertqa-u,mmlu-pro,avg,sum
zero_shot,89.0,100.0,100.0,73.0,100.0,100.0,100.0,100.0,100.0,95.777778,862.0
few_shot,89.0,100.0,100.0,73.0,100.0,100.0,100.0,100.0,100.0,95.777778,862.0
cot,89.0,100.0,100.0,73.0,100.0,100.0,100.0,100.0,100.0,95.777778,862.0
cot_sc,89.0,100.0,100.0,73.0,100.0,100.0,100.0,100.0,100.0,95.777778,862.0
self_refine,89.0,100.0,,6.0,100.0,100.0,,,,79.0,395.0
multi_persona,89.0,100.0,100.0,73.0,100.0,100.0,100.0,100.0,100.0,95.777778,862.0
medprompt,89.0,100.0,100.0,73.0,100.0,100.0,100.0,100.0,100.0,95.777778,862.0
medagents,63.0,100.0,100.0,,100.0,100.0,100.0,100.0,100.0,95.375,763.0
mdagents,52.0,54.0,38.0,61.0,76.0,52.0,74.0,47.0,41.0,55.0,495.0


In [9]:
key = "total_cost"
print(key)
df = pd.DataFrame(parser_df(method_dataset_result_mapping,key))
df = df[["zero_shot",
"few_shot",
"cot",
"cot_sc",
"self_refine",
"multi_persona",
"medprompt",
"medagents",
"mdagents"]]
df = df.transpose()
df["avg"], df["sum"] = df.mean(axis=1), df.sum(axis=1)
display(df)
print(f"total cost", df.sum().sum())

total_cost


Unnamed: 0,medbullets,pubmedqa,medxpertqa-r,mmlu,medmcqa,medqa,medexqa,medxpertqa-u,mmlu-pro,avg,sum
zero_shot,0.074565,0.105178,0.114522,0.03289,0.03456,0.076808,0.03595,0.099218,0.058078,0.070196,0.631768
few_shot,0.559813,0.537118,1.205852,0.208873,0.12159,0.555608,0.19417,1.039848,0.308457,0.525703,4.731327
cot,0.66207,0.508057,0.849172,0.349055,0.43508,0.677227,0.42497,0.756698,0.606838,0.585463,5.269168
cot_sc,3.529535,2.718037,4.424685,1.839425,2.30741,3.582337,2.28455,3.943985,3.279315,3.101031,27.90928
self_refine,2.110092,1.811352,,0.067845,1.45077,2.091292,,,,1.506271,7.531353
multi_persona,5.627782,4.823738,7.081045,3.149475,4.0749,5.731768,3.95419,6.439715,5.42428,5.14521,46.306893
medprompt,4.13768,3.71156,6.461752,1.90148,2.257532,4.087738,2.57215,6.0232,3.581277,3.859374,34.73437
medagents,8.96518,13.520955,15.979782,,11.091995,13.72993,10.277535,14.056593,11.695663,12.414704,99.317633
mdagents,15.74592,29.350395,16.9717,2.496965,2.766865,14.814553,2.023792,18.624862,4.32569,11.902305,107.120742


total cost 706.2153226180556


In [10]:
key = "total_time"
print(key)
df = pd.DataFrame(parser_df(method_dataset_result_mapping,key))
df = df[["zero_shot",
"few_shot",
"cot",
"cot_sc",
"self_refine",
"multi_persona",
"medprompt",
"medagents",
"mdagents"]]
df = df.transpose()
df["avg"], df["sum"] = df.mean(axis=1), df.sum(axis=1)
display(df)
print(f"total time (sequential)", df.sum().sum()/ 3600, "hours")

total_time


Unnamed: 0,medbullets,pubmedqa,medxpertqa-r,mmlu,medmcqa,medqa,medexqa,medxpertqa-u,mmlu-pro,avg,sum
zero_shot,117.423813,120.036392,130.322861,91.745115,122.114668,159.455,133.20715,120.110625,140.575729,126.11015,1134.991352
few_shot,431.87656,182.442526,497.295514,146.547857,149.204682,416.124951,189.426589,306.918099,223.121611,282.550932,2542.958389
cot,741.143786,577.05441,911.187562,432.599729,562.309358,775.587569,554.230422,833.257928,728.123208,679.49933,6115.493973
cot_sc,4086.096926,3069.240881,4610.132303,2296.453717,2999.074553,4261.197869,2885.214147,4114.840982,3948.115543,3585.596324,32270.36692
self_refine,2740.980017,1873.401011,,155.648026,1870.088043,2188.321566,,,,1765.687733,8828.438664
multi_persona,6354.858115,5487.516438,7836.027494,3972.571823,5140.38592,6927.533108,5104.391573,7163.115247,6645.460177,6070.206655,54631.859897
medprompt,2691.566365,2186.517701,3535.895233,1498.090172,1938.659024,2742.306511,2091.636876,3197.637639,2629.79886,2501.345376,22512.10838
medagents,6195.258976,7909.034654,10008.023817,,8386.952224,8603.323901,7670.366319,8575.908013,8216.173803,8195.630213,65565.041707
mdagents,5782.756743,9974.618224,4881.718329,1349.963703,1810.389738,5082.428447,1511.949758,4631.696717,1705.182806,4081.189385,36730.704467


total time (sequential) 135.5421509987782 hours


In [11]:
key = "total_time"
print(key)
df = pd.DataFrame(parser_df(method_dataset_result_mapping,key))
df = df[["zero_shot",
"few_shot",
"cot",
"cot_sc",
"self_refine",
"multi_persona",
"medprompt",
"medagents",
"mdagents"]]
df = df.transpose()
df["avg"], df["sum"] = df.mean(axis=1), df.sum(axis=1)
display(df)
print(f"total time (sequential)", df.sum().sum()/ 3600, "hours")

total_time


Unnamed: 0,medbullets,pubmedqa,medxpertqa-r,mmlu,medmcqa,medqa,medexqa,medxpertqa-u,mmlu-pro,avg,sum
zero_shot,117.423813,120.036392,130.322861,91.745115,122.114668,159.455,133.20715,120.110625,140.575729,126.11015,1134.991352
few_shot,431.87656,182.442526,497.295514,146.547857,149.204682,416.124951,189.426589,306.918099,223.121611,282.550932,2542.958389
cot,741.143786,577.05441,911.187562,432.599729,562.309358,775.587569,554.230422,833.257928,728.123208,679.49933,6115.493973
cot_sc,4086.096926,3069.240881,4610.132303,2296.453717,2999.074553,4261.197869,2885.214147,4114.840982,3948.115543,3585.596324,32270.36692
self_refine,2740.980017,1873.401011,,155.648026,1870.088043,2188.321566,,,,1765.687733,8828.438664
multi_persona,6354.858115,5487.516438,7836.027494,3972.571823,5140.38592,6927.533108,5104.391573,7163.115247,6645.460177,6070.206655,54631.859897
medprompt,2691.566365,2186.517701,3535.895233,1498.090172,1938.659024,2742.306511,2091.636876,3197.637639,2629.79886,2501.345376,22512.10838
medagents,6195.258976,7909.034654,10008.023817,,8386.952224,8603.323901,7670.366319,8575.908013,8216.173803,8195.630213,65565.041707
mdagents,5782.756743,9974.618224,4881.718329,1349.963703,1810.389738,5082.428447,1511.949758,4631.696717,1705.182806,4081.189385,36730.704467


total time (sequential) 135.5421509987782 hours


In [12]:
key = "avg_tokens"
print(key)
df = pd.DataFrame(parser_df(method_dataset_result_mapping,key))
df = df[["zero_shot",
"few_shot",
"cot",
"cot_sc",
"self_refine",
"multi_persona",
"medprompt",
"medagents",
"mdagents"]]
df = df.transpose()
df["avg"], df["sum"] = df.mean(axis=1), df.sum(axis=1)
display(df)
print(f"avg_tokens", df.sum().sum())

avg_tokens


Unnamed: 0,medbullets,pubmedqa,medxpertqa-r,mmlu,medmcqa,medqa,medexqa,medxpertqa-u,mmlu-pro,avg,sum
zero_shot,327.47191,413.63,449.39,172.986301,128.37,299.46,136.51,388.08,224.39,282.254246,2540.288211
few_shot,1504.94382,1997.57,3884.47,921.808219,402.15,1410.51,577.48,3672.46,982.52,1705.990227,15353.912039
cot,998.089888,827.01,1194.54,616.589041,539.39,910.38,536.03,1056.06,783.65,829.082103,7461.738929
cot_sc,5534.41573,4675.17,6540.39,3393.767123,2976.74,5024.76,2994.41,5783.66,4383.66,4589.66365,41306.972854
self_refine,3850.707865,3164.15,,1675.0,2136.45,3383.31,,,,2841.923573,14209.617865
multi_persona,8249.966292,7199.61,9647.47,5438.09589,4960.41,7512.26,4884.97,8683.34,6811.9,7043.113576,63388.022183
medprompt,11805.359551,10580.15,17922.24,6398.0,5492.32,10456.22,6270.85,16753.99,8687.93,10485.228839,94367.059551
medagents,34678.539683,34092.06,39613.37,,25692.26,33455.77,23995.98,34452.34,27710.72,31711.37996,253691.039683
mdagents,107502.173077,203644.055556,163944.815789,13325.540984,11608.355263,101831.519231,8243.675676,146962.276596,36705.121951,88196.39268,793767.534122


avg_tokens 2719857.3997266684
