In [1]:
import os
# Switch the working directory to the parent of the directory the kernel
# started in. The starting directory is recorded once per kernel session, so
# re-running this cell lands in the same place instead of climbing one more
# level on every execution (a bare os.chdir("../") is not idempotent).
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))
os.getcwd()

'/data1/xhuan192/codes/medagents-benchmark'

In [2]:
from pathlib import Path
import json
from dataclasses import dataclass

In [3]:
# Price of a single token per model, split by direction. Each value is
# written as <price per 1,000,000 tokens> / 1_000_000.
# NOTE(review): the currency is not stated in this notebook — presumably USD; confirm.
COST_PER_TOKEN_MAPPING = {
    "gpt-4o": dict(input=2.5 / 1_000_000, output=10 / 1_000_000),
}

@dataclass
class Result:
    """Schema of the statistics reported for one result file.

    The field names match the keys of the dict built by ``parse_result``.
    The display loop in this notebook iterates over
    ``Result.__dataclass_fields__`` to pick the metrics to show, so the
    declaration order below is also the display order.
    """
    # fraction of samples answered correctly (num_correct / num_samples)
    accuracy: float
    # key used to look up per-token prices
    model_name: str
    num_samples: int
    num_correct: int
    # token usage, summed over all samples
    input_tokens: int
    output_tokens: int
    total_tokens: int
    # cost: token totals multiplied by the per-token price
    input_cost: float
    output_cost: float
    total_cost: float
    # time: sum of the per-sample elapsed times
    total_time: float
    # per-sample averages: each total above divided by num_samples
    # token usage
    avg_input_tokens: float
    avg_output_tokens: float
    avg_tokens: float
    # cost
    avg_input_cost: float
    avg_output_cost: float
    avg_total_cost: float
    # time
    avg_time: float


def parse_result(data, model_name, cost_per_token_mapping=None):
    """Aggregate per-sample records into accuracy, token, cost and time statistics.

    Args:
        data: Sequence of per-sample dicts. Each sample is read for
            ``predicted_answer``, ``answer_idx``, ``token_usage`` (with
            ``prompt_tokens`` and ``completion_tokens``) and ``time_elapsed``.
        model_name: Key used to look up the per-token prices.
        cost_per_token_mapping: Optional
            ``{model_name: {"input": price, "output": price}}`` mapping.
            When omitted, the module-level ``COST_PER_TOKEN_MAPPING`` is used.

    Returns:
        A dict with the totals (``num_samples``, ``num_correct``, token
        counts, costs, ``total_time``), ``accuracy``, ``model_name`` and the
        per-sample averages (``avg_*``). When ``data`` is empty, ``accuracy``
        and every ``avg_*`` value is NaN instead of raising
        ``ZeroDivisionError``.

    Raises:
        KeyError: If ``model_name`` has no entry in the price mapping.
    """
    if cost_per_token_mapping is None:
        cost_per_token_mapping = COST_PER_TOKEN_MAPPING
    try:
        prices = cost_per_token_mapping[model_name]
    except KeyError:
        raise KeyError(
            f"no per-token prices configured for model {model_name!r}"
        ) from None

    num_correct = 0
    input_tokens = 0
    output_tokens = 0
    total_time = 0

    for sample in data:
        predicted = sample.get("predicted_answer")
        # A missing or non-string prediction (e.g. None) is counted as wrong
        # rather than crashing on `.upper()`.
        if isinstance(predicted, str) and predicted.upper() == sample["answer_idx"].upper():
            num_correct += 1
        input_tokens += sample['token_usage']["prompt_tokens"]
        output_tokens += sample['token_usage']["completion_tokens"]
        total_time += sample["time_elapsed"]

    num_samples = len(data)

    def _per_sample(total):
        # NaN (not 0) for an empty file, so it is skipped by pandas
        # mean/sum instead of dragging aggregates down.
        return total / num_samples if num_samples else float("nan")

    total_tokens = input_tokens + output_tokens

    input_cost = input_tokens * prices["input"]
    output_cost = output_tokens * prices["output"]
    total_cost = input_cost + output_cost

    # Key order is kept stable; it matches the Result field order.
    return {
        "accuracy": _per_sample(num_correct),
        "model_name": model_name,
        "num_samples": num_samples,
        "num_correct": num_correct,
        # token usage
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "total_tokens": total_tokens,
        # cost
        "input_cost": input_cost,
        "output_cost": output_cost,
        "total_cost": total_cost,
        # time
        "total_time": total_time,
        # average calculations
        # token usage
        "avg_input_tokens": _per_sample(input_tokens),
        "avg_output_tokens": _per_sample(output_tokens),
        "avg_tokens": _per_sample(total_tokens),
        # cost
        "avg_input_cost": _per_sample(input_cost),
        "avg_output_cost": _per_sample(output_cost),
        "avg_total_cost": _per_sample(total_cost),
        # time
        "avg_time": _per_sample(total_time),
    }


In [4]:
output_dir = Path("outputs")
# Collect every JSON file under output_dir, recursively. Sorting makes the
# processing order deterministic: if two files resolve to the same
# (method, dataset) pair, the same one wins on every run.
json_files = sorted(output_dir.rglob("*.json"))
# Example layout (.../<method>/<dataset>/<result file>.json):
# ├── mdagents
# │   ├── medbullets
# │   │   └── gpt-4o-1120-nofilter-global_medbullets_test_hard_adaptive.json
# │   └── pubmedqa
# │       └── gpt-4o-1120-nofilter-global_pubmedqa_test_hard_adaptive.json
# ├── medagents
# │   ├── medbullets
# │   │   └── gpt-4o-1120-nofilter-global-medbullets-test_hard-syn_verif.json
# └── medprompt
#     ├── cot
#     │   ├── medbullets
#     │   │   └── gpt-4o-1120-nofilter-global-medbullets-test_hard-cot.json

method_dataset_result_mapping = {}
model_name = "gpt-4o"
for json_file in json_files:
    # The two directories directly above the file give the method and dataset.
    method, dataset = json_file.parts[-3], json_file.parts[-2]
    # File-name prefix that precedes the dataset name.
    # NOTE(review): `model` is not read anywhere in the visible cells; every
    # file is priced with the fixed `model_name` above.
    model = json_file.parts[-1].split(dataset)[0]

    # Explicit encoding so decoding does not depend on the platform locale.
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    method_dataset_result_mapping.setdefault(method, {})[dataset] = parse_result(data, model_name)

In [5]:
def parser_df(method_dataset_result_mapping, key):
    """Project the nested results onto a single metric.

    Takes ``{method: {dataset: result}}`` and returns
    ``{method: {dataset: result[key]}}``. A method whose dataset mapping is
    empty does not appear in the output.
    """
    return {
        method: {dataset: result[key] for dataset, result in per_dataset.items()}
        for method, per_dataset in method_dataset_result_mapping.items()
        if per_dataset
    }


In [6]:
import pandas as pd


In [7]:
def display_df(df, key, methods=None, datasets=None):
    """Show one metric as a methods x datasets table with per-row avg and sum.

    Args:
        df: Despite its name, the nested ``{method: {dataset: result}}``
            mapping to display (the name is kept so existing calls keep
            working). It was previously ignored in favour of the global
            ``method_dataset_result_mapping``; it is now actually used.
        key: Name of the metric to extract from each result.
        methods: Optional row order. Defaults to the notebook's fixed list
            of methods.
        datasets: Optional column order. Defaults to the notebook's fixed
            list of datasets.

    Raises:
        KeyError: If a requested method or dataset is absent from the data.
    """
    if methods is None:
        methods = [
            "zero_shot",
            "few_shot",
            "cot",
            "cot_sc",
            "self_refine",
            "multi_persona",
            "medprompt",
            "medagents",
            "mdagents",
        ]
    if datasets is None:
        datasets = ["medqa", "pubmedqa", "medmcqa", "medbullets", "mmlu", "mmlu-pro", "medexqa", "medxpertqa-r", "medxpertqa-u"]

    # Outer keys (methods) become columns; select them, then flip so that
    # methods are rows and datasets are columns.
    table = pd.DataFrame(parser_df(df, key))
    table = table[list(methods)].transpose()
    table = table[list(datasets)]
    print(f"key: {key}")
    print(f"total: {table.sum().sum()}")

    # assign() evaluates both aggregates on the dataset columns only and
    # returns a new frame, so nothing is written into a sliced frame.
    table = table.assign(avg=table.mean(axis=1), sum=table.sum(axis=1))
    display(table)

In [8]:
# One table per Result field, in declaration order; model_name is skipped.
for result_key in Result.__dataclass_fields__:
    if result_key != "model_name":
        display_df(method_dataset_result_mapping, result_key)

key: accuracy
total: 21.433267661997846


Unnamed: 0,medqa,pubmedqa,medmcqa,medbullets,mmlu,mmlu-pro,medexqa,medxpertqa-r,medxpertqa-u,avg,sum
zero_shot,0.42,0.09,0.28,0.179775,0.287671,0.37,0.15,0.13,0.11,0.224161,2.017447
few_shot,0.35,0.16,0.22,0.258427,0.246575,0.16,0.17,0.14,0.06,0.196111,1.765002
cot,0.55,0.08,0.29,0.314607,0.30137,0.39,0.25,0.19,0.18,0.282886,2.545977
cot_sc,0.52,0.1,0.29,0.292135,0.369863,0.38,0.22,0.15,0.18,0.278,2.501998
self_refine,0.61,0.15,0.29,0.337079,0.356164,0.37,0.2,0.2,0.21,0.302583,2.723243
multi_persona,0.4,0.11,0.31,0.235955,0.39726,0.45,0.25,0.15,0.12,0.269246,2.423215
medprompt,0.48,0.13,0.3,0.269663,0.39726,0.45,0.2,0.21,0.22,0.295214,2.656923
medagents,0.51,0.16,0.32,0.235955,0.369863,0.38,0.21,0.17,0.16,0.279535,2.515818
mdagents,0.41,0.22,0.31,0.303371,0.260274,0.34,0.22,0.1,0.12,0.253738,2.283645


key: num_samples
total: 7758


Unnamed: 0,medqa,pubmedqa,medmcqa,medbullets,mmlu,mmlu-pro,medexqa,medxpertqa-r,medxpertqa-u,avg,sum
zero_shot,100,100,100,89,73,100,100,100,100,95.777778,862
few_shot,100,100,100,89,73,100,100,100,100,95.777778,862
cot,100,100,100,89,73,100,100,100,100,95.777778,862
cot_sc,100,100,100,89,73,100,100,100,100,95.777778,862
self_refine,100,100,100,89,73,100,100,100,100,95.777778,862
multi_persona,100,100,100,89,73,100,100,100,100,95.777778,862
medprompt,100,100,100,89,73,100,100,100,100,95.777778,862
medagents,100,100,100,89,73,100,100,100,100,95.777778,862
mdagents,100,100,100,89,73,100,100,100,100,95.777778,862


key: num_correct
total: 2036


Unnamed: 0,medqa,pubmedqa,medmcqa,medbullets,mmlu,mmlu-pro,medexqa,medxpertqa-r,medxpertqa-u,avg,sum
zero_shot,42,9,28,16,21,37,15,13,11,21.333333,192
few_shot,35,16,22,23,18,16,17,14,6,18.555556,167
cot,55,8,29,28,22,39,25,19,18,27.0,243
cot_sc,52,10,29,26,27,38,22,15,18,26.333333,237
self_refine,61,15,29,30,26,37,20,20,21,28.777778,259
multi_persona,40,11,31,21,29,45,25,15,12,25.444444,229
medprompt,48,13,30,24,29,45,20,21,22,28.0,252
medagents,51,16,32,21,27,38,21,17,16,26.555556,239
mdagents,41,22,31,27,19,34,22,10,12,24.222222,218


key: input_tokens
total: 115718040


Unnamed: 0,medqa,pubmedqa,medmcqa,medbullets,mmlu,mmlu-pro,medexqa,medxpertqa-r,medxpertqa-u,avg,sum
zero_shot,29687,41127,12508,28918,12452,22175,13408,44649,38515,27048.78,243439
few_shot,113987,194727,37408,103945,61873,89875,51108,357149,351015,151231.9,1361087
cot,31087,42527,13908,30164,13474,23575,14808,46049,39915,28389.67,255507
cot_sc,192323,260951,89244,186146,85070,147246,94648,282094,245290,175890.2,1583012
self_refine,172269,180373,91424,175605,72055,117076,88554,227180,185041,145508.6,1309577
multi_persona,237399,316783,118068,228625,109378,185016,124104,342190,299150,217857.0,1960713
medprompt,849131,915812,431305,849212,369208,680887,493160,1528065,1430772,838616.9,7547552
medagents,2630112,2742814,1946702,2539754,1483564,2135341,1829126,3151145,2719433,2353110.0,21177991
mdagents,12272249,21564064,1059802,10210705,1034952,3443872,745031,15741913,14206574,8919907.0,80279162


key: output_tokens
total: 18565752


Unnamed: 0,medqa,pubmedqa,medmcqa,medbullets,mmlu,mmlu-pro,medexqa,medxpertqa-r,medxpertqa-u,avg,sum
zero_shot,259,236,329,227,176,264,243,290,293,257.444444,2317
few_shot,27064,5030,2807,29995,5419,8377,6640,31298,16231,14762.333333,132861
cot,59951,40174,40031,58666,31537,54790,38795,73405,65691,51448.888889,463040
cot_sc,310153,206566,208430,306417,162675,291120,204793,371945,333076,266130.555556,2395175
self_refine,166062,136042,122221,167108,87482,137763,120098,198686,165263,144525.0,1300725
multi_persona,513827,403178,377973,505622,287603,496174,364393,622557,569184,460056.777778,4140511
medprompt,196491,142203,117927,201465,97846,187906,133925,264159,244627,176283.222222,1586549
medagents,715465,666392,622524,685175,449462,635731,570472,810192,725801,653468.222222,5881214
mdagents,451087,471014,103905,443977,81709,195637,92974,449399,373658,295928.888889,2663360


key: total_tokens
total: 134283792


Unnamed: 0,medqa,pubmedqa,medmcqa,medbullets,mmlu,mmlu-pro,medexqa,medxpertqa-r,medxpertqa-u,avg,sum
zero_shot,29946,41363,12837,29145,12628,22439,13651,44939,38808,27306.22,245756
few_shot,141051,199757,40215,133940,67292,98252,57748,388447,367246,165994.2,1493948
cot,91038,82701,53939,88830,45011,78365,53603,119454,105606,79838.56,718547
cot_sc,502476,467517,297674,492563,247745,438366,299441,654039,578366,442020.8,3978187
self_refine,338331,316415,213645,342713,159537,254839,208652,425866,350304,290033.6,2610302
multi_persona,751226,719961,496041,734247,396981,681190,488497,964747,868334,677913.8,6101224
medprompt,1045622,1058015,549232,1050677,467054,868793,627085,1792224,1675399,1014900.0,9134101
medagents,3345577,3409206,2569226,3224929,1933026,2771072,2399598,3961337,3445234,3006578.0,27059205
mdagents,12723336,22035078,1163707,10654682,1116661,3639509,838005,16191312,14580232,9215836.0,82942522


key: input_cost
total: 289.29510000000005


Unnamed: 0,medqa,pubmedqa,medmcqa,medbullets,mmlu,mmlu-pro,medexqa,medxpertqa-r,medxpertqa-u,avg,sum
zero_shot,0.074218,0.102818,0.03127,0.072295,0.03113,0.055438,0.03352,0.111623,0.096288,0.067622,0.608598
few_shot,0.284968,0.486818,0.09352,0.259862,0.154683,0.224688,0.12777,0.892873,0.877538,0.37808,3.402718
cot,0.077718,0.106318,0.03477,0.07541,0.033685,0.058938,0.03702,0.115123,0.099788,0.070974,0.638768
cot_sc,0.480808,0.652378,0.22311,0.465365,0.212675,0.368115,0.23662,0.705235,0.613225,0.439726,3.95753
self_refine,0.430673,0.450933,0.22856,0.439013,0.180138,0.29269,0.221385,0.56795,0.462603,0.363771,3.273942
multi_persona,0.593498,0.791958,0.29517,0.571563,0.273445,0.46254,0.31026,0.855475,0.747875,0.544643,4.901783
medprompt,2.122828,2.28953,1.078263,2.12303,0.92302,1.702218,1.2329,3.820163,3.57693,2.096542,18.86888
medagents,6.57528,6.857035,4.866755,6.349385,3.70891,5.338353,4.572815,7.877863,6.798583,5.882775,52.944978
mdagents,30.680623,53.91016,2.649505,25.526763,2.58738,8.60968,1.862578,39.354783,35.516435,22.299767,200.697905


key: output_cost
total: 185.65752000000003


Unnamed: 0,medqa,pubmedqa,medmcqa,medbullets,mmlu,mmlu-pro,medexqa,medxpertqa-r,medxpertqa-u,avg,sum
zero_shot,0.00259,0.00236,0.00329,0.00227,0.00176,0.00264,0.00243,0.0029,0.00293,0.002574,0.02317
few_shot,0.27064,0.0503,0.02807,0.29995,0.05419,0.08377,0.0664,0.31298,0.16231,0.147623,1.32861
cot,0.59951,0.40174,0.40031,0.58666,0.31537,0.5479,0.38795,0.73405,0.65691,0.514489,4.6304
cot_sc,3.10153,2.06566,2.0843,3.06417,1.62675,2.9112,2.04793,3.71945,3.33076,2.661306,23.95175
self_refine,1.66062,1.36042,1.22221,1.67108,0.87482,1.37763,1.20098,1.98686,1.65263,1.44525,13.00725
multi_persona,5.13827,4.03178,3.77973,5.05622,2.87603,4.96174,3.64393,6.22557,5.69184,4.600568,41.40511
medprompt,1.96491,1.42203,1.17927,2.01465,0.97846,1.87906,1.33925,2.64159,2.44627,1.762832,15.86549
medagents,7.15465,6.66392,6.22524,6.85175,4.49462,6.35731,5.70472,8.10192,7.25801,6.534682,58.81214
mdagents,4.51087,4.71014,1.03905,4.43977,0.81709,1.95637,0.92974,4.49399,3.73658,2.959289,26.6336


key: total_cost
total: 474.95262


Unnamed: 0,medqa,pubmedqa,medmcqa,medbullets,mmlu,mmlu-pro,medexqa,medxpertqa-r,medxpertqa-u,avg,sum
zero_shot,0.076808,0.105178,0.03456,0.074565,0.03289,0.058078,0.03595,0.114523,0.099218,0.070196,0.631768
few_shot,0.555608,0.537118,0.12159,0.559813,0.208873,0.308458,0.19417,1.205853,1.039848,0.525703,4.731328
cot,0.677228,0.508058,0.43508,0.66207,0.349055,0.606838,0.42497,0.849173,0.756698,0.585463,5.269168
cot_sc,3.582338,2.718038,2.30741,3.529535,1.839425,3.279315,2.28455,4.424685,3.943985,3.101031,27.90928
self_refine,2.091293,1.811353,1.45077,2.110093,1.054957,1.67032,1.422365,2.55481,2.115233,1.809021,16.281193
multi_persona,5.731768,4.823738,4.0749,5.627783,3.149475,5.42428,3.95419,7.081045,6.439715,5.14521,46.306892
medprompt,4.087738,3.71156,2.257532,4.13768,1.90148,3.581278,2.57215,6.461753,6.0232,3.859374,34.73437
medagents,13.72993,13.520955,11.091995,13.201135,8.20353,11.695663,10.277535,15.979783,14.056593,12.417457,111.757117
mdagents,35.191493,58.6203,3.688555,29.966533,3.40447,10.56605,2.792318,43.848773,39.253015,25.259056,227.331505


key: total_time
total: 289738.94581127167


Unnamed: 0,medqa,pubmedqa,medmcqa,medbullets,mmlu,mmlu-pro,medexqa,medxpertqa-r,medxpertqa-u,avg,sum
zero_shot,159.455,120.036392,122.114668,117.423813,91.745115,140.575729,133.20715,130.322861,120.110625,126.11015,1134.991352
few_shot,416.124951,182.442526,149.204682,431.87656,146.547857,223.121611,189.426589,497.295514,306.918099,282.550932,2542.958389
cot,775.587569,577.05441,562.309358,741.143786,432.599729,728.123208,554.230422,911.187562,833.257928,679.49933,6115.493973
cot_sc,4261.197869,3069.240881,2999.074553,4086.096926,2296.453717,3948.115543,2885.214147,4610.132303,4114.840982,3585.596324,32270.36692
self_refine,2188.321566,1873.401011,1870.088043,2740.980017,2263.562827,2857.887005,1992.322074,3092.517114,2496.268422,2375.038676,21375.348081
multi_persona,6927.533108,5487.516438,5140.38592,6354.858115,3972.571823,6645.460177,5104.391573,7836.027494,7163.115247,6070.206655,54631.859897
medprompt,2742.306511,2186.517701,1938.659024,2691.566365,1498.090172,2629.79886,2091.636876,3535.895233,3197.637639,2501.345376,22512.10838
medagents,8603.323901,7909.034654,8386.952224,10525.200796,6545.853068,8216.173803,7670.366319,10008.023817,8575.908013,8493.426288,76440.836595
mdagents,12274.937182,18150.758786,2536.321888,10450.468612,1786.278005,4872.619683,2159.521932,11246.272967,9237.803169,8079.442469,72714.982224


key: avg_input_tokens
total: 1186911.2094505157


Unnamed: 0,medqa,pubmedqa,medmcqa,medbullets,mmlu,mmlu-pro,medexqa,medxpertqa-r,medxpertqa-u,avg,sum
zero_shot,296.87,411.27,125.08,324.921348,170.575342,221.75,134.08,446.49,385.15,279.576299,2516.186691
few_shot,1139.87,1947.27,374.08,1167.921348,847.575342,898.75,511.08,3571.49,3510.15,1552.020743,13968.186691
cot,310.87,425.27,139.08,338.921348,184.575342,235.75,148.08,460.49,399.15,293.576299,2642.186691
cot_sc,1923.23,2609.51,892.44,2091.52809,1165.342466,1472.46,946.48,2820.94,2452.9,1819.425617,16374.830556
self_refine,1722.69,1803.73,914.24,1973.089888,987.054795,1170.76,885.54,2271.8,1850.41,1508.812742,13579.314682
multi_persona,2373.99,3167.83,1180.68,2568.820225,1498.328767,1850.16,1241.04,3421.9,2991.5,2254.916555,20294.248992
medprompt,8491.31,9158.12,4313.05,9541.707865,5057.643836,6808.87,4931.6,15280.65,14307.72,8654.519078,77890.671701
medagents,26301.12,27428.14,19467.02,28536.561798,20322.794521,21353.41,18291.26,31511.45,27194.33,24489.565146,220406.086318
mdagents,122722.49,215640.64,10598.02,114727.022472,14177.424658,34438.72,7450.31,157419.13,142065.74,91026.610792,819239.497129


key: avg_output_tokens
total: 193074.96017700474


Unnamed: 0,medqa,pubmedqa,medmcqa,medbullets,mmlu,mmlu-pro,medexqa,medxpertqa-r,medxpertqa-u,avg,sum
zero_shot,2.59,2.36,3.29,2.550562,2.410959,2.64,2.43,2.9,2.93,2.677947,24.101521
few_shot,270.64,50.3,28.07,337.022472,74.232877,83.77,66.4,312.98,162.31,153.969483,1385.725349
cot,599.51,401.74,400.31,659.168539,432.013699,547.9,387.95,734.05,656.91,535.505804,4819.552238
cot_sc,3101.53,2065.66,2084.3,3442.88764,2228.424658,2911.2,2047.93,3719.45,3330.76,2770.238033,24932.142298
self_refine,1660.62,1360.42,1222.21,1877.617978,1198.383562,1377.63,1200.98,1986.86,1652.63,1504.150171,13537.351539
multi_persona,5138.27,4031.78,3779.73,5681.146067,3939.767123,4961.74,3643.93,6225.57,5691.84,4788.197021,43093.773191
medprompt,1964.91,1422.03,1179.27,2263.651685,1340.356164,1879.06,1339.25,2641.59,2446.27,1830.709761,16476.38785
medagents,7154.65,6663.92,6225.24,7698.595506,6157.013699,6357.31,5704.72,8101.92,7258.01,6813.486578,61321.379204
mdagents,4510.87,4710.14,1039.05,4988.505618,1119.30137,1956.37,929.74,4493.99,3736.58,3053.838554,27484.546988


key: avg_tokens
total: 1379986.1696275205


Unnamed: 0,medqa,pubmedqa,medmcqa,medbullets,mmlu,mmlu-pro,medexqa,medxpertqa-r,medxpertqa-u,avg,sum
zero_shot,299.46,413.63,128.37,327.47191,172.986301,224.39,136.51,449.39,388.08,282.254246,2540.288211
few_shot,1410.51,1997.57,402.15,1504.94382,921.808219,982.52,577.48,3884.47,3672.46,1705.990227,15353.912039
cot,910.38,827.01,539.39,998.089888,616.589041,783.65,536.03,1194.54,1056.06,829.082103,7461.738929
cot_sc,5024.76,4675.17,2976.74,5534.41573,3393.767123,4383.66,2994.41,6540.39,5783.66,4589.66365,41306.972854
self_refine,3383.31,3164.15,2136.45,3850.707865,2185.438356,2548.39,2086.52,4258.66,3503.04,3012.962913,27116.666221
multi_persona,7512.26,7199.61,4960.41,8249.966292,5438.09589,6811.9,4884.97,9647.47,8683.34,7043.113576,63388.022183
medprompt,10456.22,10580.15,5492.32,11805.359551,6398.0,8687.93,6270.85,17922.24,16753.99,10485.228839,94367.059551
medagents,33455.77,34092.06,25692.26,36235.157303,26479.808219,27710.72,23995.98,39613.37,34452.34,31303.051725,281727.465523
mdagents,127233.36,220350.78,11637.07,119715.52809,15296.726027,36395.09,8380.05,161913.12,145802.32,94080.449346,846724.044117


key: avg_input_cost
total: 2.9672780236262892


Unnamed: 0,medqa,pubmedqa,medmcqa,medbullets,mmlu,mmlu-pro,medexqa,medxpertqa-r,medxpertqa-u,avg,sum
zero_shot,0.000742,0.001028,0.000313,0.000812,0.000426,0.000554,0.000335,0.001116,0.000963,0.000699,0.00629
few_shot,0.00285,0.004868,0.000935,0.00292,0.002119,0.002247,0.001278,0.008929,0.008775,0.00388,0.03492
cot,0.000777,0.001063,0.000348,0.000847,0.000461,0.000589,0.00037,0.001151,0.000998,0.000734,0.006605
cot_sc,0.004808,0.006524,0.002231,0.005229,0.002913,0.003681,0.002366,0.007052,0.006132,0.004549,0.040937
self_refine,0.004307,0.004509,0.002286,0.004933,0.002468,0.002927,0.002214,0.00568,0.004626,0.003772,0.033948
multi_persona,0.005935,0.00792,0.002952,0.006422,0.003746,0.004625,0.003103,0.008555,0.007479,0.005637,0.050736
medprompt,0.021228,0.022895,0.010783,0.023854,0.012644,0.017022,0.012329,0.038202,0.035769,0.021636,0.194727
medagents,0.065753,0.06857,0.048668,0.071341,0.050807,0.053384,0.045728,0.078779,0.067986,0.061224,0.551015
mdagents,0.306806,0.539102,0.026495,0.286818,0.035444,0.086097,0.018626,0.393548,0.355164,0.227567,2.048099


key: avg_output_cost
total: 1.9307496017700476


Unnamed: 0,medqa,pubmedqa,medmcqa,medbullets,mmlu,mmlu-pro,medexqa,medxpertqa-r,medxpertqa-u,avg,sum
zero_shot,2.6e-05,2.4e-05,3.3e-05,2.6e-05,2.4e-05,2.6e-05,2.4e-05,2.9e-05,2.9e-05,2.7e-05,0.000241
few_shot,0.002706,0.000503,0.000281,0.00337,0.000742,0.000838,0.000664,0.00313,0.001623,0.00154,0.013857
cot,0.005995,0.004017,0.004003,0.006592,0.00432,0.005479,0.00388,0.007341,0.006569,0.005355,0.048196
cot_sc,0.031015,0.020657,0.020843,0.034429,0.022284,0.029112,0.020479,0.037194,0.033308,0.027702,0.249321
self_refine,0.016606,0.013604,0.012222,0.018776,0.011984,0.013776,0.01201,0.019869,0.016526,0.015042,0.135374
multi_persona,0.051383,0.040318,0.037797,0.056811,0.039398,0.049617,0.036439,0.062256,0.056918,0.047882,0.430938
medprompt,0.019649,0.01422,0.011793,0.022637,0.013404,0.018791,0.013392,0.026416,0.024463,0.018307,0.164764
medagents,0.071546,0.066639,0.062252,0.076986,0.06157,0.063573,0.057047,0.081019,0.07258,0.068135,0.613214
mdagents,0.045109,0.047101,0.010391,0.049885,0.011193,0.019564,0.009297,0.04494,0.037366,0.030538,0.274845


key: avg_total_cost
total: 4.898027625396336


Unnamed: 0,medqa,pubmedqa,medmcqa,medbullets,mmlu,mmlu-pro,medexqa,medxpertqa-r,medxpertqa-u,avg,sum
zero_shot,0.000768,0.001052,0.000346,0.000838,0.000451,0.000581,0.00036,0.001145,0.000992,0.000726,0.006531
few_shot,0.005556,0.005371,0.001216,0.00629,0.002861,0.003085,0.001942,0.012059,0.010398,0.00542,0.048778
cot,0.006772,0.005081,0.004351,0.007439,0.004782,0.006068,0.00425,0.008492,0.007567,0.006089,0.054801
cot_sc,0.035823,0.02718,0.023074,0.039658,0.025198,0.032793,0.022845,0.044247,0.03944,0.032251,0.290258
self_refine,0.020913,0.018114,0.014508,0.023709,0.014451,0.016703,0.014224,0.025548,0.021152,0.018814,0.169322
multi_persona,0.057318,0.048237,0.040749,0.063234,0.043143,0.054243,0.039542,0.07081,0.064397,0.053519,0.481673
medprompt,0.040877,0.037116,0.022575,0.046491,0.026048,0.035813,0.025722,0.064618,0.060232,0.039943,0.359491
medagents,0.137299,0.13521,0.11092,0.148327,0.112377,0.116957,0.102775,0.159798,0.140566,0.129359,1.164229
mdagents,0.351915,0.586203,0.036886,0.336703,0.046637,0.105661,0.027923,0.438488,0.39253,0.258105,2.322944


key: avg_time
total: 3014.926933091091


Unnamed: 0,medqa,pubmedqa,medmcqa,medbullets,mmlu,mmlu-pro,medexqa,medxpertqa-r,medxpertqa-u,avg,sum
zero_shot,1.59455,1.200364,1.221147,1.319369,1.256782,1.405757,1.332071,1.303229,1.201106,1.314931,11.834375
few_shot,4.16125,1.824425,1.492047,4.852546,2.007505,2.231216,1.894266,4.972955,3.069181,2.945043,26.50539
cot,7.755876,5.770544,5.623094,8.327458,5.926024,7.281232,5.542304,9.111876,8.332579,7.074554,63.670987
cot_sc,42.611979,30.692409,29.990746,45.911201,31.45827,39.481155,28.852141,46.101323,41.14841,37.360848,336.247634
self_refine,21.883216,18.73401,18.70088,30.797528,31.00771,28.57887,19.923221,30.925171,24.962684,25.057032,225.513291
multi_persona,69.275331,54.875164,51.403859,71.4029,54.418792,66.454602,51.043916,78.360275,71.631152,63.207332,568.865992
medprompt,27.423065,21.865177,19.38659,30.242319,20.521783,26.297989,20.916369,35.358952,31.976376,25.998736,233.98862
medagents,86.033239,79.090347,83.869522,118.260683,89.66922,82.161738,76.703663,100.080238,85.75908,89.069748,801.627731
mdagents,122.749372,181.507588,25.363219,117.420996,24.469562,48.726197,21.595219,112.46273,92.378032,82.963657,746.672913
