In [1]:
def compute_stats(doremi, reference, include_averages=True):
    """Print a win / tie / loss tally of DoReMi accuracy against a reference.

    For every task in ``doremi`` the ``'acc'`` value is compared with the
    ``'acc'`` value of the same task in ``reference``:

    * higher  -> counted in ``succeed``; the margin is added to ``amount_succeed``
    * equal   -> counted in ``same`` (exact float equality, no tolerance)
    * lower   -> counted in ``failed``; the margin is added to ``amount_fail``

    Args:
        doremi: mapping of task name -> metrics dict with an ``'acc'`` entry.
        reference: mapping with the same layout; it must contain every task
            that is tallied from ``doremi``.
        include_averages: when False, tasks whose name ends in ``"_average"``
            (e.g. ``"arc:_average"``, ``"mmlu:_average"``) are left out so
            aggregate rows are not counted next to the tasks they summarise.
            Defaults to True, which tallies every key of ``doremi``.

    Raises:
        KeyError: if a tallied task is absent from ``reference`` (the message
            lists all missing tasks), or if a metrics dict has no ``'acc'``.

    Returns:
        None. The summary line is written to stdout.
    """
    # The short-circuit keeps the default path free of any string assumption
    # about the keys; ``endswith`` is only consulted when filtering is requested.
    names = [
        name for name in doremi
        if include_averages or not name.endswith("_average")
    ]

    # Fail before any counting, and name every offender at once, instead of
    # stopping at the first missing key halfway through the loop.
    missing = [name for name in names if name not in reference]
    if missing:
        raise KeyError(f"tasks missing from reference: {missing}")

    succeed = 0
    same = 0
    failed = 0
    amount_succeed = 0
    amount_fail = 0

    # Explicit running sums (rather than sum()) keep the float accumulation
    # order, and therefore the printed digits, stable across Python versions.
    for name in names:
        tuned_acc = doremi[name]['acc']
        ref_acc = reference[name]['acc']

        if tuned_acc > ref_acc:
            succeed += 1
            amount_succeed += tuned_acc - ref_acc
        elif tuned_acc == ref_acc:
            # Ties are reported separately; they are not broken on acc_stderr.
            same += 1
        else:
            failed += 1
            amount_fail += ref_acc - tuned_acc

    total = len(names)

    print(f"total={total}, succeed={succeed}, failed={failed}, same={same}, amount_succeed={amount_succeed}, amount_fail={amount_fail}")

In [2]:
def plot(doremi, reference):
    import pandas as pd
    
    pd.set_option('display.max_rows', None)
    
    tasks = list(doremi.keys())
    columns = ['Task', 'DoReMi ACC', 'Reference ACC', 'DoReMi AccNorm', 'Reference AccNorm']
    data = []
    
    for task in tasks:
        row = [
            task,
            doremi[task]['acc'],
            reference[task]['acc'],
            doremi[task].get('acc_norm', 'N/A'),  # Using 'N/A' for missing 'acc_norm' values
            reference[task].get('acc_norm', 'N/A')
        ]
        data.append(row)
    
    # Create DataFrame
    comparison_df = pd.DataFrame(data, columns=columns)

    pd.set_option('display.max_rows', None)

    def highlight_greater_doremi_acc(row):
        if row['DoReMi ACC'] > row['Reference ACC']:
            return ['background-color: #59cc0c'] * len(row)  # Apply yellow background to entire row
        elif row['DoReMi ACC'] == row['Reference ACC']:
            return ['background-color: #cc9c0c'] * len(row)  # Apply yellow background to entire row
        else:
            return [''] * len(row)  # No styling for rows that don't meet the condition
    
    # Apply the styling function to the DataFrame
    styled_df = comparison_df.style.apply(highlight_greater_doremi_acc, axis=1)
    return styled_df

### 40k proxy, 25k reference

In [1]:
# reference_data_25k = {
#     "arc:challenge": {
#     "acc": 0.181,
#     "acc_stderr": 0.012181436179177893,
#     "acc_norm": 0.24,
#     "acc_norm_stderr": 0.013512312258920843
#     },
#     "arc:easy": {
#     "acc": 0.372,
#     "acc_stderr": 0.015292149942040577,
#     "acc_norm": 0.371,
#     "acc_norm_stderr": 0.015283736211823187
#     },
#     "commonsense_qa": {
#     "acc": 0.233,
#     "acc_stderr": 0.013374972519220039,
#     "acc_norm": 0.259,
#     "acc_norm_stderr": 0.013860415257527911
#     },
#     "hellaswag": {
#     "acc": 0.289,
#     "acc_stderr": 0.014341711358296172,
#     "acc_norm": 0.291,
#     "acc_norm_stderr": 0.014370995982377944
#     },
#     "mmlu:abstract_algebra": {
#     "acc": 0.22,
#     "acc_stderr": 0.0416333199893227,
#     "acc_norm": 0.26,
#     "acc_norm_stderr": 0.04408440022768078
#     },
#     "mmlu:anatomy": {
#     "acc": 0.28888888888888886,
#     "acc_stderr": 0.03915450630414251,
#     "acc_norm": 0.2740740740740741,
#     "acc_norm_stderr": 0.03853254836552003
#     },
#     "mmlu:astronomy": {
#     "acc": 0.23684210526315788,
#     "acc_stderr": 0.034597776068105365,
#     "acc_norm": 0.27631578947368424,
#     "acc_norm_stderr": 0.03639057569952924
#     },
#     "mmlu:business_ethics": {
#     "acc": 0.36,
#     "acc_stderr": 0.048241815132442176,
#     "acc_norm": 0.26,
#     "acc_norm_stderr": 0.04408440022768079
#     },
#     "mmlu:clinical_knowledge": {
#     "acc": 0.2,
#     "acc_stderr": 0.02461829819586651,
#     "acc_norm": 0.33584905660377357,
#     "acc_norm_stderr": 0.02906722014664483
#     },
#     "mmlu:college_biology": {
#     "acc": 0.25,
#     "acc_stderr": 0.03621034121889507,
#     "acc_norm": 0.2361111111111111,
#     "acc_norm_stderr": 0.03551446610810826
#     },
#     "mmlu:college_chemistry": {
#     "acc": 0.29,
#     "acc_stderr": 0.045604802157206845,
#     "acc_norm": 0.26,
#     "acc_norm_stderr": 0.0440844002276808
#     },
#     "mmlu:college_computer_science": {
#     "acc": 0.24,
#     "acc_stderr": 0.042923469599092816,
#     "acc_norm": 0.23,
#     "acc_norm_stderr": 0.042295258468165044
#     },
#     "mmlu:college_mathematics": {
#     "acc": 0.15,
#     "acc_stderr": 0.03588702812826371,
#     "acc_norm": 0.24,
#     "acc_norm_stderr": 0.04292346959909282
#     },
#     "mmlu:college_medicine": {
#     "acc": 0.2138728323699422,
#     "acc_stderr": 0.03126511206173043,
#     "acc_norm": 0.24277456647398843,
#     "acc_norm_stderr": 0.0326926380614177
#     },
#     "mmlu:college_physics": {
#     "acc": 0.22549019607843138,
#     "acc_stderr": 0.041583075330832865,
#     "acc_norm": 0.24509803921568626,
#     "acc_norm_stderr": 0.04280105837364396
#     },
#     "mmlu:computer_security": {
#     "acc": 0.22,
#     "acc_stderr": 0.04163331998932269,
#     "acc_norm": 0.23,
#     "acc_norm_stderr": 0.042295258468165044
#     },
#     "mmlu:conceptual_physics": {
#     "acc": 0.3276595744680851,
#     "acc_stderr": 0.030683020843231,
#     "acc_norm": 0.2297872340425532,
#     "acc_norm_stderr": 0.027501752944412417
#     },
#     "mmlu:econometrics": {
#     "acc": 0.16666666666666666,
#     "acc_stderr": 0.03505859682597264,
#     "acc_norm": 0.21929824561403508,
#     "acc_norm_stderr": 0.038924311065187546
#     },
#     "mmlu:electrical_engineering": {
#     "acc": 0.2413793103448276,
#     "acc_stderr": 0.03565998174135302,
#     "acc_norm": 0.2827586206896552,
#     "acc_norm_stderr": 0.03752833958003337
#     },
#     "mmlu:elementary_mathematics": {
#     "acc": 0.2037037037037037,
#     "acc_stderr": 0.020742740560122663,
#     "acc_norm": 0.21164021164021163,
#     "acc_norm_stderr": 0.021037331505262886
#     },
#     "mmlu:formal_logic": {
#     "acc": 0.2857142857142857,
#     "acc_stderr": 0.0404061017820884,
#     "acc_norm": 0.23015873015873015,
#     "acc_norm_stderr": 0.03764950879790605
#     },
#     "mmlu:global_facts": {
#     "acc": 0.25,
#     "acc_stderr": 0.04351941398892446,
#     "acc_norm": 0.24,
#     "acc_norm_stderr": 0.042923469599092816
#     },
#     "mmlu:high_school_biology": {
#     "acc": 0.22903225806451613,
#     "acc_stderr": 0.023904914311782648,
#     "acc_norm": 0.29354838709677417,
#     "acc_norm_stderr": 0.02590608702131929
#     },
#     "mmlu:high_school_chemistry": {
#     "acc": 0.1625615763546798,
#     "acc_stderr": 0.0259603000646056,
#     "acc_norm": 0.19704433497536947,
#     "acc_norm_stderr": 0.027986724666736223
#     },
#     "mmlu:high_school_computer_science": {
#     "acc": 0.22,
#     "acc_stderr": 0.041633319989322695,
#     "acc_norm": 0.28,
#     "acc_norm_stderr": 0.045126085985421296
#     },
#     "mmlu:high_school_european_history": {
#     "acc": 0.18181818181818182,
#     "acc_stderr": 0.030117688929503582,
#     "acc_norm": 0.28484848484848485,
#     "acc_norm_stderr": 0.03524390844511782
#     },
#     "mmlu:high_school_geography": {
#     "acc": 0.25252525252525254,
#     "acc_stderr": 0.030954055470365907,
#     "acc_norm": 0.2777777777777778,
#     "acc_norm_stderr": 0.03191178226713546
#     },
#     "mmlu:high_school_government_and_politics": {
#     "acc": 0.23316062176165803,
#     "acc_stderr": 0.030516111371476008,
#     "acc_norm": 0.26424870466321243,
#     "acc_norm_stderr": 0.03182155050916646
#     },
#     "mmlu:high_school_macroeconomics": {
#     "acc": 0.26153846153846155,
#     "acc_stderr": 0.022282141204204426,
#     "acc_norm": 0.28205128205128205,
#     "acc_norm_stderr": 0.022815813098896603
#     },
#     "mmlu:high_school_mathematics": {
#     "acc": 0.12962962962962962,
#     "acc_stderr": 0.020479910253320705,
#     "acc_norm": 0.15185185185185185,
#     "acc_norm_stderr": 0.021881130957380476
#     },
#     "mmlu:high_school_microeconomics": {
#     "acc": 0.23949579831932774,
#     "acc_stderr": 0.02772206549336128,
#     "acc_norm": 0.31932773109243695,
#     "acc_norm_stderr": 0.030283995525884396
#     },
#     "mmlu:high_school_physics": {
#     "acc": 0.23841059602649006,
#     "acc_stderr": 0.034791855725996586,
#     "acc_norm": 0.26490066225165565,
#     "acc_norm_stderr": 0.03603038545360384
#     },
#     "mmlu:high_school_psychology": {
#     "acc": 0.29908256880733947,
#     "acc_stderr": 0.019630417285415175,
#     "acc_norm": 0.28073394495412846,
#     "acc_norm_stderr": 0.019266055045871616
#     },
#     "mmlu:high_school_statistics": {
#     "acc": 0.25925925925925924,
#     "acc_stderr": 0.02988691054762696,
#     "acc_norm": 0.28703703703703703,
#     "acc_norm_stderr": 0.03085199299325701
#     },
#     "mmlu:high_school_us_history": {
#     "acc": 0.23529411764705882,
#     "acc_stderr": 0.029771775228145652,
#     "acc_norm": 0.30392156862745096,
#     "acc_norm_stderr": 0.03228210387037894
#     },
#     "mmlu:high_school_world_history": {
#     "acc": 0.21940928270042195,
#     "acc_stderr": 0.026939106581553945,
#     "acc_norm": 0.2616033755274262,
#     "acc_norm_stderr": 0.028609516716994934
#     },
#     "mmlu:human_aging": {
#     "acc": 0.3094170403587444,
#     "acc_stderr": 0.03102441174057222,
#     "acc_norm": 0.2825112107623318,
#     "acc_norm_stderr": 0.030216831011508766
#     },
#     "mmlu:human_sexuality": {
#     "acc": 0.33587786259541985,
#     "acc_stderr": 0.041423137719966634,
#     "acc_norm": 0.31297709923664124,
#     "acc_norm_stderr": 0.04066962905677697
#     },
#     "mmlu:international_law": {
#     "acc": 0.11570247933884298,
#     "acc_stderr": 0.029199802455622783,
#     "acc_norm": 0.256198347107438,
#     "acc_norm_stderr": 0.03984979653302871
#     },
#     "mmlu:jurisprudence": {
#     "acc": 0.1574074074074074,
#     "acc_stderr": 0.03520703990517963,
#     "acc_norm": 0.23148148148148148,
#     "acc_norm_stderr": 0.04077494709252627
#     },
#     "mmlu:logical_fallacies": {
#     "acc": 0.27607361963190186,
#     "acc_stderr": 0.0351238528370505,
#     "acc_norm": 0.3558282208588957,
#     "acc_norm_stderr": 0.03761521380046734
#     },
#     "mmlu:machine_learning": {
#     "acc": 0.21428571428571427,
#     "acc_stderr": 0.03894641120044792,
#     "acc_norm": 0.21428571428571427,
#     "acc_norm_stderr": 0.038946411200447915
#     },
#     "mmlu:management": {
#     "acc": 0.22330097087378642,
#     "acc_stderr": 0.04123553189891431,
#     "acc_norm": 0.22330097087378642,
#     "acc_norm_stderr": 0.04123553189891431
#     },
#     "mmlu:marketing": {
#     "acc": 0.2948717948717949,
#     "acc_stderr": 0.029872577708891165,
#     "acc_norm": 0.32905982905982906,
#     "acc_norm_stderr": 0.030782321577688156
#     },
#     "mmlu:medical_genetics": {
#     "acc": 0.23,
#     "acc_stderr": 0.04229525846816506,
#     "acc_norm": 0.25,
#     "acc_norm_stderr": 0.04351941398892446
#     },
#     "mmlu:miscellaneous": {
#     "acc": 0.2720306513409962,
#     "acc_stderr": 0.015913367447500517,
#     "acc_norm": 0.26181353767560667,
#     "acc_norm_stderr": 0.015720838678445266
#     },
#     "mmlu:moral_disputes": {
#     "acc": 0.20520231213872833,
#     "acc_stderr": 0.021742519835276305,
#     "acc_norm": 0.1676300578034682,
#     "acc_norm_stderr": 0.020110579919734833
#     },
#     "mmlu:moral_scenarios": {
#     "acc": 0.23798882681564246,
#     "acc_stderr": 0.014242630070574915,
#     "acc_norm": 0.27262569832402234,
#     "acc_norm_stderr": 0.014893391735249588
#     },
#     "mmlu:nutrition": {
#     "acc": 0.20588235294117646,
#     "acc_stderr": 0.023152722439402307,
#     "acc_norm": 0.29411764705882354,
#     "acc_norm_stderr": 0.026090162504279056
#     },
#     "mmlu:philosophy": {
#     "acc": 0.21543408360128619,
#     "acc_stderr": 0.023350225475471414,
#     "acc_norm": 0.2829581993569132,
#     "acc_norm_stderr": 0.025583062489984834
#     },
#     "mmlu:prehistory": {
#     "acc": 0.27469135802469136,
#     "acc_stderr": 0.024836057868294688,
#     "acc_norm": 0.19753086419753085,
#     "acc_norm_stderr": 0.022152889927898975
#     },
#     "mmlu:professional_accounting": {
#     "acc": 0.25886524822695034,
#     "acc_stderr": 0.026129572527180848,
#     "acc_norm": 0.22695035460992907,
#     "acc_norm_stderr": 0.02498710636564296
#     },
#     "mmlu:professional_law": {
#     "acc": 0.232,
#     "acc_stderr": 0.013354937452281558,
#     "acc_norm": 0.254,
#     "acc_norm_stderr": 0.013772206565168544
#     },
#     "mmlu:professional_medicine": {
#     "acc": 0.23529411764705882,
#     "acc_stderr": 0.02576725201085596,
#     "acc_norm": 0.2536764705882353,
#     "acc_norm_stderr": 0.02643132987078954
#     },
#     "mmlu:professional_psychology": {
#     "acc": 0.26143790849673204,
#     "acc_stderr": 0.017776947157528037,
#     "acc_norm": 0.2679738562091503,
#     "acc_norm_stderr": 0.017917974069594726
#     },
#     "mmlu:public_relations": {
#     "acc": 0.33636363636363636,
#     "acc_stderr": 0.04525393596302506,
#     "acc_norm": 0.24545454545454545,
#     "acc_norm_stderr": 0.041220665028782834
#     },
#     "mmlu:security_studies": {
# "acc": 0.30612244897959184,
# "acc_stderr": 0.029504896454595968,
# "acc_norm": 0.20816326530612245,
# "acc_norm_stderr": 0.025991117672813296
# },
# "mmlu:sociology": {
# "acc": 0.21890547263681592,
# "acc_stderr": 0.029239174636647,
# "acc_norm": 0.23383084577114427,
# "acc_norm_stderr": 0.02992941540834839
# },
# "mmlu:us_foreign_policy": {
# "acc": 0.22,
# "acc_stderr": 0.04163331998932269,
# "acc_norm": 0.19,
# "acc_norm_stderr": 0.03942772444036623
# },
# "mmlu:virology": {
# "acc": 0.2469879518072289,
# "acc_stderr": 0.03357351982064536,
# "acc_norm": 0.3253012048192771,
# "acc_norm_stderr": 0.03647168523683228
# },
# "mmlu:world_religions": {
# "acc": 0.18128654970760233,
# "acc_stderr": 0.029547741687640027,
# "acc_norm": 0.24561403508771928,
# "acc_norm_stderr": 0.03301405946987251
# },
# "openbookqa": {
# "acc": 0.132,
# "acc_stderr": 0.015152927850580155,
# "acc_norm": 0.268,
# "acc_norm_stderr": 0.019827714859587578
# },
# "piqa": {
# "acc": 0.571,
# "acc_stderr": 0.015658997547870236,
# "acc_norm": 0.58,
# "acc_norm_stderr": 0.015615500115072959
# },
# "siqa": {
# "acc": 0.358,
# "acc_stderr": 0.015167928865407559,
# "acc_norm": 0.379,
# "acc_norm_stderr": 0.015349091002225352
# },
# "winogrande": {
# "acc": 0.512,
# "acc_stderr": 0.015814743314581818,
# "acc_norm": 0.509,
# "acc_norm_stderr": 0.015816736995005392
# },
# "arc:_average": {
# "acc": 0.27649999999999997,
# "acc_stderr": 0.013736793060609235,
# "acc_norm": 0.3055,
# "acc_norm_stderr": 0.014398024235372016
# },
# "mmlu:_average": {
# "acc": 0.23854149080775466,
# "acc_stderr": 0.03163789714288859,
# "acc_norm": 0.2567727066277369,
# "acc_norm_stderr": 0.03259073413274518
# },
#     "mmlu:professional_medicine": {
#     "acc": 0.23529411764705882,
#     "acc_stderr": 0.02576725201085597,
#     "acc_norm": 0.23529411764705882,
#     "acc_norm_stderr": 0.02576725201085598
#   },
#   "mmlu:professional_psychology": {
#     "acc": 0.2549019607843137,
#     "acc_stderr": 0.017630827375148383,
#     "acc_norm": 0.25,
#     "acc_norm_stderr": 0.01751781884501444
#   },
#   "mmlu:public_relations": {
#     "acc": 0.35454545454545455,
#     "acc_stderr": 0.04582004841505417,
#     "acc_norm": 0.20909090909090908,
#     "acc_norm_stderr": 0.038950910157241385
#   },
#   "mmlu:security_studies": {
#     "acc": 0.3142857142857143,
#     "acc_stderr": 0.02971932942241746,
#     "acc_norm": 0.19591836734693877,
#     "acc_norm_stderr": 0.025409301953225678
#   },
#   "mmlu:sociology": {
#     "acc": 0.21393034825870647,
#     "acc_stderr": 0.02899690969332891,
#     "acc_norm": 0.22388059701492538,
#     "acc_norm_stderr": 0.029475250236017173
#   },
#   "mmlu:us_foreign_policy": {
#     "acc": 0.24,
#     "acc_stderr": 0.04292346959909283,
#     "acc_norm": 0.23,
#     "acc_norm_stderr": 0.04229525846816505
#   },
#   "mmlu:virology": {
#     "acc": 0.22289156626506024,
#     "acc_stderr": 0.03240004825594687,
#     "acc_norm": 0.3253012048192771,
#     "acc_norm_stderr": 0.03647168523683227
#   },
#   "mmlu:world_religions": {
#     "acc": 0.15789473684210525,
#     "acc_stderr": 0.027966785859160907,
#     "acc_norm": 0.23391812865497075,
#     "acc_norm_stderr": 0.032467217651178264
#   },
#   "openbookqa": {
#     "acc": 0.14,
#     "acc_stderr": 0.015533272840269622,
#     "acc_norm": 0.27,
#     "acc_norm_stderr": 0.01987435483128749
#   },
#   "piqa": {
#     "acc": 0.568,
#     "acc_stderr": 0.0156723202373362,
#     "acc_norm": 0.588,
#     "acc_norm_stderr": 0.015572363292015091
#   },
#   "siqa": {
#     "acc": 0.342,
#     "acc_stderr": 0.015008706182121726,
#     "acc_norm": 0.375,
#     "acc_norm_stderr": 0.015316971293620996
#   },
#   "winogrande": {
#     "acc": 0.511,
#     "acc_stderr": 0.015815471195292693,
#     "acc_norm": 0.498,
#     "acc_norm_stderr": 0.015819173374302706
#   },
#   "arc:_average": {
#     "acc": 0.277,
#     "acc_stderr": 0.013694891076312077,
#     "acc_norm": 0.298,
#     "acc_norm_stderr": 0.014299871901289755
#   },
#   "mmlu:_average": {
#     "acc": 0.24486225766415484,
#     "acc_stderr": 0.03198621725172759,
#     "acc_norm": 0.26125950385266233,
#     "acc_norm_stderr": 0.032822076137567394
#   }
# }

In [10]:
reference_data_25k = {
    "arc:challenge": {
"acc": 0.181,
"acc_stderr": 0.012181436179177893,
"acc_norm": 0.24,
"acc_norm_stderr": 0.013512312258920843
},
"arc:easy": {
"acc": 0.372,
"acc_stderr": 0.015292149942040577,
"acc_norm": 0.371,
"acc_norm_stderr": 0.015283736211823187
},
"commonsense_qa": {
"acc": 0.233,
"acc_stderr": 0.013374972519220039,
"acc_norm": 0.259,
"acc_norm_stderr": 0.013860415257527911
},
"hellaswag": {
"acc": 0.289,
"acc_stderr": 0.014341711358296172,
"acc_norm": 0.291,
"acc_norm_stderr": 0.014370995982377944
},
"mmlu:abstract_algebra": {
"acc": 0.22,
"acc_stderr": 0.0416333199893227,
"acc_norm": 0.26,
"acc_norm_stderr": 0.04408440022768078
},
"mmlu:anatomy": {
"acc": 0.28888888888888886,
"acc_stderr": 0.03915450630414251,
"acc_norm": 0.2740740740740741,
"acc_norm_stderr": 0.03853254836552003
},
"mmlu:astronomy": {
"acc": 0.23684210526315788,
"acc_stderr": 0.034597776068105365,
"acc_norm": 0.27631578947368424,
"acc_norm_stderr": 0.03639057569952924
},
"mmlu:business_ethics": {
"acc": 0.36,
"acc_stderr": 0.048241815132442176,
"acc_norm": 0.26,
"acc_norm_stderr": 0.04408440022768079
},
"mmlu:clinical_knowledge": {
"acc": 0.2,
"acc_stderr": 0.02461829819586651,
"acc_norm": 0.33584905660377357,
"acc_norm_stderr": 0.02906722014664483
},
"mmlu:college_biology": {
"acc": 0.25,
"acc_stderr": 0.03621034121889507,
"acc_norm": 0.2361111111111111,
"acc_norm_stderr": 0.03551446610810826
},
"mmlu:college_chemistry": {
"acc": 0.29,
"acc_stderr": 0.045604802157206845,
"acc_norm": 0.26,
"acc_norm_stderr": 0.0440844002276808
},
"mmlu:college_computer_science": {
"acc": 0.24,
"acc_stderr": 0.042923469599092816,
"acc_norm": 0.23,
"acc_norm_stderr": 0.042295258468165044
},
"mmlu:college_mathematics": {
"acc": 0.15,
"acc_stderr": 0.03588702812826371,
"acc_norm": 0.24,
"acc_norm_stderr": 0.04292346959909282
},
"mmlu:college_medicine": {
"acc": 0.2138728323699422,
"acc_stderr": 0.03126511206173043,
"acc_norm": 0.24277456647398843,
"acc_norm_stderr": 0.0326926380614177
},
"mmlu:college_physics": {
"acc": 0.22549019607843138,
"acc_stderr": 0.041583075330832865,
"acc_norm": 0.24509803921568626,
"acc_norm_stderr": 0.04280105837364396
},
"mmlu:computer_security": {
"acc": 0.22,
"acc_stderr": 0.04163331998932269,
"acc_norm": 0.23,
"acc_norm_stderr": 0.042295258468165044
},
"mmlu:conceptual_physics": {
"acc": 0.3276595744680851,
"acc_stderr": 0.030683020843231,
"acc_norm": 0.2297872340425532,
"acc_norm_stderr": 0.027501752944412417
},
"mmlu:econometrics": {
"acc": 0.16666666666666666,
"acc_stderr": 0.03505859682597264,
"acc_norm": 0.21929824561403508,
"acc_norm_stderr": 0.038924311065187546
},
"mmlu:electrical_engineering": {
"acc": 0.2413793103448276,
"acc_stderr": 0.03565998174135302,
"acc_norm": 0.2827586206896552,
"acc_norm_stderr": 0.03752833958003337
},
"mmlu:elementary_mathematics": {
"acc": 0.2037037037037037,
"acc_stderr": 0.020742740560122663,
"acc_norm": 0.21164021164021163,
"acc_norm_stderr": 0.021037331505262886
},
"mmlu:formal_logic": {
"acc": 0.2857142857142857,
"acc_stderr": 0.0404061017820884,
"acc_norm": 0.23015873015873015,
"acc_norm_stderr": 0.03764950879790605
},
"mmlu:global_facts": {
"acc": 0.25,
"acc_stderr": 0.04351941398892446,
"acc_norm": 0.24,
"acc_norm_stderr": 0.042923469599092816
},
"mmlu:high_school_biology": {
"acc": 0.22903225806451613,
"acc_stderr": 0.023904914311782648,
"acc_norm": 0.29354838709677417,
"acc_norm_stderr": 0.02590608702131929
},
"mmlu:high_school_chemistry": {
"acc": 0.1625615763546798,
"acc_stderr": 0.0259603000646056,
"acc_norm": 0.19704433497536947,
"acc_norm_stderr": 0.027986724666736223
},
"mmlu:high_school_computer_science": {
"acc": 0.22,
"acc_stderr": 0.041633319989322695,
"acc_norm": 0.28,
"acc_norm_stderr": 0.045126085985421296
},
"mmlu:high_school_european_history": {
"acc": 0.18181818181818182,
"acc_stderr": 0.030117688929503582,
"acc_norm": 0.28484848484848485,
"acc_norm_stderr": 0.03524390844511782
},
"mmlu:high_school_geography": {
"acc": 0.25252525252525254,
"acc_stderr": 0.030954055470365907,
"acc_norm": 0.2777777777777778,
"acc_norm_stderr": 0.03191178226713546
},
"mmlu:high_school_government_and_politics": {
"acc": 0.23316062176165803,
"acc_stderr": 0.030516111371476008,
"acc_norm": 0.26424870466321243,
"acc_norm_stderr": 0.03182155050916646
},
"mmlu:high_school_macroeconomics": {
"acc": 0.26153846153846155,
"acc_stderr": 0.022282141204204426,
"acc_norm": 0.28205128205128205,
"acc_norm_stderr": 0.022815813098896603
},
"mmlu:high_school_mathematics": {
"acc": 0.12962962962962962,
"acc_stderr": 0.020479910253320705,
"acc_norm": 0.15185185185185185,
"acc_norm_stderr": 0.021881130957380476
},
"mmlu:high_school_microeconomics": {
"acc": 0.23949579831932774,
"acc_stderr": 0.02772206549336128,
"acc_norm": 0.31932773109243695,
"acc_norm_stderr": 0.030283995525884396
},
"mmlu:high_school_physics": {
"acc": 0.23841059602649006,
"acc_stderr": 0.034791855725996586,
"acc_norm": 0.26490066225165565,
"acc_norm_stderr": 0.03603038545360384
},
"mmlu:high_school_psychology": {
"acc": 0.29908256880733947,
"acc_stderr": 0.019630417285415175,
"acc_norm": 0.28073394495412846,
"acc_norm_stderr": 0.019266055045871616
},
"mmlu:high_school_statistics": {
"acc": 0.25925925925925924,
"acc_stderr": 0.02988691054762696,
"acc_norm": 0.28703703703703703,
"acc_norm_stderr": 0.03085199299325701
},
"mmlu:high_school_us_history": {
"acc": 0.23529411764705882,
"acc_stderr": 0.029771775228145652,
"acc_norm": 0.30392156862745096,
"acc_norm_stderr": 0.03228210387037894
},
"mmlu:high_school_world_history": {
"acc": 0.21940928270042195,
"acc_stderr": 0.026939106581553945,
"acc_norm": 0.2616033755274262,
"acc_norm_stderr": 0.028609516716994934
},
"mmlu:human_aging": {
"acc": 0.3094170403587444,
"acc_stderr": 0.03102441174057222,
"acc_norm": 0.2825112107623318,
"acc_norm_stderr": 0.030216831011508766
},
"mmlu:human_sexuality": {
"acc": 0.33587786259541985,
"acc_stderr": 0.041423137719966634,
"acc_norm": 0.31297709923664124,
"acc_norm_stderr": 0.04066962905677697
},
"mmlu:international_law": {
"acc": 0.11570247933884298,
"acc_stderr": 0.029199802455622783,
"acc_norm": 0.256198347107438,
"acc_norm_stderr": 0.03984979653302871
},
"mmlu:jurisprudence": {
"acc": 0.1574074074074074,
"acc_stderr": 0.03520703990517963,
"acc_norm": 0.23148148148148148,
"acc_norm_stderr": 0.04077494709252627
},
"mmlu:logical_fallacies": {
"acc": 0.27607361963190186,
"acc_stderr": 0.0351238528370505,
"acc_norm": 0.3558282208588957,
"acc_norm_stderr": 0.03761521380046734
},
"mmlu:machine_learning": {
"acc": 0.21428571428571427,
"acc_stderr": 0.03894641120044792,
"acc_norm": 0.21428571428571427,
"acc_norm_stderr": 0.038946411200447915
},
"mmlu:management": {
"acc": 0.22330097087378642,
"acc_stderr": 0.04123553189891431,
"acc_norm": 0.22330097087378642,
"acc_norm_stderr": 0.04123553189891431
},
"mmlu:marketing": {
"acc": 0.2948717948717949,
"acc_stderr": 0.029872577708891165,
"acc_norm": 0.32905982905982906,
"acc_norm_stderr": 0.030782321577688156
},
"mmlu:medical_genetics": {
"acc": 0.23,
"acc_stderr": 0.04229525846816506,
"acc_norm": 0.25,
"acc_norm_stderr": 0.04351941398892446
},
"mmlu:miscellaneous": {
"acc": 0.2720306513409962,
"acc_stderr": 0.015913367447500517,
"acc_norm": 0.26181353767560667,
"acc_norm_stderr": 0.015720838678445266
},
"mmlu:moral_disputes": {
"acc": 0.20520231213872833,
"acc_stderr": 0.021742519835276305,
"acc_norm": 0.1676300578034682,
"acc_norm_stderr": 0.020110579919734833
},
"mmlu:moral_scenarios": {
"acc": 0.23798882681564246,
"acc_stderr": 0.014242630070574915,
"acc_norm": 0.27262569832402234,
"acc_norm_stderr": 0.014893391735249588
},
"mmlu:nutrition": {
"acc": 0.20588235294117646,
"acc_stderr": 0.023152722439402307,
"acc_norm": 0.29411764705882354,
"acc_norm_stderr": 0.026090162504279056
},
"mmlu:philosophy": {
"acc": 0.21543408360128619,
"acc_stderr": 0.023350225475471414,
"acc_norm": 0.2829581993569132,
"acc_norm_stderr": 0.025583062489984834
},
"mmlu:prehistory": {
"acc": 0.27469135802469136,
"acc_stderr": 0.024836057868294688,
"acc_norm": 0.19753086419753085,
"acc_norm_stderr": 0.022152889927898975
},
"mmlu:professional_accounting": {
"acc": 0.25886524822695034,
"acc_stderr": 0.026129572527180848,
"acc_norm": 0.22695035460992907,
"acc_norm_stderr": 0.02498710636564296
},
"mmlu:professional_law": {
"acc": 0.232,
"acc_stderr": 0.013354937452281558,
"acc_norm": 0.254,
"acc_norm_stderr": 0.013772206565168544
},
"mmlu:professional_medicine": {
"acc": 0.23529411764705882,
"acc_stderr": 0.02576725201085596,
"acc_norm": 0.2536764705882353,
"acc_norm_stderr": 0.02643132987078954
},
"mmlu:professional_psychology": {
"acc": 0.26143790849673204,
"acc_stderr": 0.017776947157528037,
"acc_norm": 0.2679738562091503,
"acc_norm_stderr": 0.017917974069594726
},
"mmlu:public_relations": {
"acc": 0.33636363636363636,
"acc_stderr": 0.04525393596302506,
"acc_norm": 0.24545454545454545,
"acc_norm_stderr": 0.041220665028782834
},
    "mmlu:security_studies": {
"acc": 0.30612244897959184,
"acc_stderr": 0.029504896454595968,
"acc_norm": 0.20816326530612245,
"acc_norm_stderr": 0.025991117672813296
},
"mmlu:sociology": {
"acc": 0.21890547263681592,
"acc_stderr": 0.029239174636647,
"acc_norm": 0.23383084577114427,
"acc_norm_stderr": 0.02992941540834839
},
"mmlu:us_foreign_policy": {
"acc": 0.22,
"acc_stderr": 0.04163331998932269,
"acc_norm": 0.19,
"acc_norm_stderr": 0.03942772444036623
},
"mmlu:virology": {
"acc": 0.2469879518072289,
"acc_stderr": 0.03357351982064536,
"acc_norm": 0.3253012048192771,
"acc_norm_stderr": 0.03647168523683228
},
"mmlu:world_religions": {
"acc": 0.18128654970760233,
"acc_stderr": 0.029547741687640027,
"acc_norm": 0.24561403508771928,
"acc_norm_stderr": 0.03301405946987251
},
"openbookqa": {
"acc": 0.132,
"acc_stderr": 0.015152927850580155,
"acc_norm": 0.268,
"acc_norm_stderr": 0.019827714859587578
},
"piqa": {
"acc": 0.571,
"acc_stderr": 0.015658997547870236,
"acc_norm": 0.58,
"acc_norm_stderr": 0.015615500115072959
},
"siqa": {
"acc": 0.358,
"acc_stderr": 0.015167928865407559,
"acc_norm": 0.379,
"acc_norm_stderr": 0.015349091002225352
},
"winogrande": {
"acc": 0.512,
"acc_stderr": 0.015814743314581818,
"acc_norm": 0.509,
"acc_norm_stderr": 0.015816736995005392
},
"arc:_average": {
"acc": 0.27649999999999997,
"acc_stderr": 0.013736793060609235,
"acc_norm": 0.3055,
"acc_norm_stderr": 0.014398024235372016
},
"mmlu:_average": {
"acc": 0.23854149080775466,
"acc_stderr": 0.03163789714288859,
"acc_norm": 0.2567727066277369,
"acc_norm_stderr": 0.03259073413274518
}
}

In [7]:
doremi_data_25k = {
    "arc:_average": {
    "acc": 0.277,
    "acc_stderr": 0.013694891076312077,
    "acc_norm": 0.298,
    "acc_norm_stderr": 0.014299871901289755
    },
    "arc:challenge": {
    "acc": 0.176,
    "acc_stderr": 0.01204861689859751,
    "acc_norm": 0.234,
    "acc_norm_stderr": 0.013394902889660009
    },
    "arc:easy": {
    "acc": 0.378,
    "acc_stderr": 0.015341165254026644,
    "acc_norm": 0.362,
    "acc_norm_stderr": 0.015204840912919501
    },
    "commonsense_qa": {
    "acc": 0.209,
    "acc_stderr": 0.012864077288499344,
    "acc_norm": 0.256,
    "acc_norm_stderr": 0.013807775152234202
    },
    "hellaswag": {
    "acc": 0.281,
    "acc_stderr": 0.014221154708434925,
    "acc_norm": 0.287,
    "acc_norm_stderr": 0.014312087053809965
    },
    "mmlu:_average": {
    "acc": 0.24486225766415484,
    "acc_stderr": 0.03198621725172759,
    "acc_norm": 0.26125950385266233,
    "acc_norm_stderr": 0.032822076137567394
    },
    "mmlu:abstract_algebra": {
    "acc": 0.23,
    "acc_stderr": 0.04229525846816508,
    "acc_norm": 0.21,
    "acc_norm_stderr": 0.040936018074033256
    },
    "mmlu:anatomy": {
    "acc": 0.2814814814814815,
    "acc_stderr": 0.03885004245800253,
    "acc_norm": 0.2518518518518518,
    "acc_norm_stderr": 0.03749850709174022
    },
    "mmlu:astronomy": {
    "acc": 0.21052631578947367,
    "acc_stderr": 0.03317672787533156,
    "acc_norm": 0.28289473684210525,
    "acc_norm_stderr": 0.03665349695640767
    },
    "mmlu:business_ethics": {
    "acc": 0.38,
    "acc_stderr": 0.04878317312145633,
    "acc_norm": 0.28,
    "acc_norm_stderr": 0.04512608598542127
    },
    "mmlu:clinical_knowledge": {
    "acc": 0.21132075471698114,
    "acc_stderr": 0.025125766484827835,
    "acc_norm": 0.30943396226415093,
    "acc_norm_stderr": 0.028450154794118627
    },
    "mmlu:college_biology": {
    "acc": 0.2638888888888889,
    "acc_stderr": 0.03685651095897532,
    "acc_norm": 0.2777777777777778,
    "acc_norm_stderr": 0.03745554791462457
    },
    "mmlu:college_chemistry": {
    "acc": 0.3,
    "acc_stderr": 0.046056618647183814,
    "acc_norm": 0.3,
    "acc_norm_stderr": 0.046056618647183814
    },
    "mmlu:college_computer_science": {
    "acc": 0.22,
    "acc_stderr": 0.04163331998932269,
    "acc_norm": 0.21,
    "acc_norm_stderr": 0.040936018074033256
    },
    "mmlu:college_mathematics": {
    "acc": 0.15,
    "acc_stderr": 0.03588702812826371,
    "acc_norm": 0.23,
    "acc_norm_stderr": 0.04229525846816505
    },
    "mmlu:college_medicine": {
    "acc": 0.26011560693641617,
    "acc_stderr": 0.033450369167889925,
    "acc_norm": 0.2543352601156069,
    "acc_norm_stderr": 0.0332055644308557
    },
    "mmlu:college_physics": {
    "acc": 0.20588235294117646,
    "acc_stderr": 0.040233822736177476,
    "acc_norm": 0.20588235294117646,
    "acc_norm_stderr": 0.04023382273617747
    },
    "mmlu:computer_security": {
    "acc": 0.18,
    "acc_stderr": 0.03861229196653694,
    "acc_norm": 0.25,
    "acc_norm_stderr": 0.04351941398892446
    },
    "mmlu:conceptual_physics": {
    "acc": 0.3191489361702128,
    "acc_stderr": 0.030472973363380042,
    "acc_norm": 0.22127659574468084,
    "acc_norm_stderr": 0.027136349602424063
    },
    "mmlu:econometrics": {
    "acc": 0.20175438596491227,
    "acc_stderr": 0.037752050135836386,
    "acc_norm": 0.20175438596491227,
    "acc_norm_stderr": 0.03775205013583639
    },
    "mmlu:electrical_engineering": {
    "acc": 0.23448275862068965,
    "acc_stderr": 0.035306258743465914,
    "acc_norm": 0.2620689655172414,
    "acc_norm_stderr": 0.036646663372252565
    },
    "mmlu:elementary_mathematics": {
    "acc": 0.22486772486772486,
    "acc_stderr": 0.02150209607822914,
    "acc_norm": 0.22486772486772486,
    "acc_norm_stderr": 0.02150209607822914
    },
    "mmlu:formal_logic": {
    "acc": 0.2698412698412698,
    "acc_stderr": 0.03970158273235171,
    "acc_norm": 0.25396825396825395,
    "acc_norm_stderr": 0.03893259610604674
    },
    "mmlu:global_facts": {
    "acc": 0.29,
    "acc_stderr": 0.045604802157206845,
    "acc_norm": 0.32,
    "acc_norm_stderr": 0.04688261722621504
    },
    "mmlu:high_school_biology": {
    "acc": 0.24838709677419354,
    "acc_stderr": 0.024580028921480996,
    "acc_norm": 0.27419354838709675,
    "acc_norm_stderr": 0.025378139970885193
    },
    "mmlu:high_school_chemistry": {
    "acc": 0.2019704433497537,
    "acc_stderr": 0.028247350122180284,
    "acc_norm": 0.22167487684729065,
    "acc_norm_stderr": 0.029225575892489614
    },
    "mmlu:high_school_computer_science": {
    "acc": 0.25,
    "acc_stderr": 0.04351941398892446,
    "acc_norm": 0.3,
    "acc_norm_stderr": 0.046056618647183814
    },
    "mmlu:high_school_european_history": {
    "acc": 0.21818181818181817,
    "acc_stderr": 0.03225078108306289,
    "acc_norm": 0.3090909090909091,
    "acc_norm_stderr": 0.036085410115739666
    },
    "mmlu:high_school_geography": {
    "acc": 0.25757575757575757,
    "acc_stderr": 0.031156269519646836,
    "acc_norm": 0.29292929292929293,
    "acc_norm_stderr": 0.03242497958178818
    },
    "mmlu:high_school_government_and_politics": {
    "acc": 0.23316062176165803,
    "acc_stderr": 0.030516111371476008,
    "acc_norm": 0.2694300518134715,
    "acc_norm_stderr": 0.03201867122877793
    },
    "mmlu:high_school_macroeconomics": {
    "acc": 0.23333333333333334,
    "acc_stderr": 0.021444547301560476,
    "acc_norm": 0.26666666666666666,
    "acc_norm_stderr": 0.022421273612923714
    },
    "mmlu:high_school_mathematics": {
    "acc": 0.13333333333333333,
    "acc_stderr": 0.020726180448133867,
    "acc_norm": 0.16666666666666666,
    "acc_norm_stderr": 0.02272257846455052
    },
    "mmlu:high_school_microeconomics": {
    "acc": 0.23949579831932774,
    "acc_stderr": 0.02772206549336127,
    "acc_norm": 0.33613445378151263,
    "acc_norm_stderr": 0.030684737115135363
    },
    "mmlu:high_school_physics": {
    "acc": 0.23178807947019867,
    "acc_stderr": 0.03445406271987054,
    "acc_norm": 0.2847682119205298,
    "acc_norm_stderr": 0.03684881521389023
    },
    "mmlu:high_school_psychology": {
    "acc": 0.28256880733944956,
    "acc_stderr": 0.01930424349770715,
    "acc_norm": 0.26605504587155965,
    "acc_norm_stderr": 0.01894602232222559
    },
    "mmlu:high_school_statistics": {
    "acc": 0.2638888888888889,
    "acc_stderr": 0.030058202704309846,
    "acc_norm": 0.2962962962962963,
    "acc_norm_stderr": 0.031141447823536037
    },
    "mmlu:high_school_us_history": {
    "acc": 0.24509803921568626,
    "acc_stderr": 0.03019028245350194,
    "acc_norm": 0.28921568627450983,
    "acc_norm_stderr": 0.031822318676475524
    },
    "mmlu:high_school_world_history": {
    "acc": 0.24050632911392406,
    "acc_stderr": 0.027820781981149675,
    "acc_norm": 0.25738396624472576,
    "acc_norm_stderr": 0.02845882099146029
    },
    "mmlu:human_aging": {
    "acc": 0.3542600896860987,
    "acc_stderr": 0.032100621541349864,
    "acc_norm": 0.3004484304932735,
    "acc_norm_stderr": 0.030769352008229143
    },
    "mmlu:human_sexuality": {
    "acc": 0.32061068702290074,
    "acc_stderr": 0.04093329229834278,
    "acc_norm": 0.33587786259541985,
    "acc_norm_stderr": 0.04142313771996665
    },
    "mmlu:international_law": {
    "acc": 0.1487603305785124,
    "acc_stderr": 0.03248470083807194,
    "acc_norm": 0.21487603305785125,
    "acc_norm_stderr": 0.037494924487096994
    },
    "mmlu:jurisprudence": {
    "acc": 0.2037037037037037,
    "acc_stderr": 0.03893542518824847,
    "acc_norm": 0.25,
    "acc_norm_stderr": 0.04186091791394607
    },
    "mmlu:logical_fallacies": {
    "acc": 0.3006134969325153,
    "acc_stderr": 0.0360251131880677,
    "acc_norm": 0.3312883435582822,
    "acc_norm_stderr": 0.03697983910025588
    },
    "mmlu:machine_learning": {
    "acc": 0.2767857142857143,
    "acc_stderr": 0.04246624336697625,
    "acc_norm": 0.2767857142857143,
    "acc_norm_stderr": 0.042466243366976256
    },
    "mmlu:management": {
    "acc": 0.24271844660194175,
    "acc_stderr": 0.04245022486384495,
    "acc_norm": 0.30097087378640774,
    "acc_norm_stderr": 0.04541609446503947
    },
    "mmlu:marketing": {
    "acc": 0.3076923076923077,
    "acc_stderr": 0.03023638994217309,
    "acc_norm": 0.3162393162393162,
    "acc_norm_stderr": 0.030463656747340265
    },
    "mmlu:medical_genetics": {
    "acc": 0.21,
    "acc_stderr": 0.040936018074033256,
    "acc_norm": 0.27,
    "acc_norm_stderr": 0.044619604333847394
    },
    "mmlu:miscellaneous": {
    "acc": 0.2720306513409962,
    "acc_stderr": 0.015913367447500514,
    "acc_norm": 0.2707535121328225,
    "acc_norm_stderr": 0.015889888362560486
    },
    "mmlu:moral_disputes": {
    "acc": 0.1907514450867052,
    "acc_stderr": 0.021152676966575294,
    "acc_norm": 0.1936416184971098,
    "acc_norm_stderr": 0.021274230317515543
    },
    "mmlu:moral_scenarios": {
    "acc": 0.23798882681564246,
    "acc_stderr": 0.014242630070574915,
    "acc_norm": 0.27262569832402234,
    "acc_norm_stderr": 0.014893391735249588
    },
    "mmlu:nutrition": {
    "acc": 0.2222222222222222,
    "acc_stderr": 0.02380518652488813,
    "acc_norm": 0.29411764705882354,
    "acc_norm_stderr": 0.02609016250427905
    },
    "mmlu:philosophy": {
    "acc": 0.2090032154340836,
    "acc_stderr": 0.023093140398374224,
    "acc_norm": 0.26688102893890675,
    "acc_norm_stderr": 0.025122637608816636
    },
    "mmlu:prehistory": {
    "acc": 0.2623456790123457,
    "acc_stderr": 0.02447722285613512,
    "acc_norm": 0.2222222222222222,
    "acc_norm_stderr": 0.023132376234543353
    },
    "mmlu:professional_accounting": {
    "acc": 0.2553191489361702,
    "acc_stderr": 0.02601199293090202,
    "acc_norm": 0.23404255319148937,
    "acc_norm_stderr": 0.025257861359432414
    },
    "mmlu:professional_law": {
    "acc": 0.236,
    "acc_stderr": 0.013434451402438697,
    "acc_norm": 0.261,
    "acc_norm_stderr": 0.013895037677965126
    },
    "mmlu:professional_medicine": {
    "acc": 0.23529411764705882,
    "acc_stderr": 0.02576725201085597,
    "acc_norm": 0.23529411764705882,
    "acc_norm_stderr": 0.02576725201085598
    },
    "mmlu:professional_medicine": {
    "acc": 0.23529411764705882,
    "acc_stderr": 0.02576725201085597,
    "acc_norm": 0.23529411764705882,
    "acc_norm_stderr": 0.02576725201085598
  },
  "mmlu:professional_psychology": {
    "acc": 0.2549019607843137,
    "acc_stderr": 0.017630827375148383,
    "acc_norm": 0.25,
    "acc_norm_stderr": 0.01751781884501444
  },
  "mmlu:public_relations": {
    "acc": 0.35454545454545455,
    "acc_stderr": 0.04582004841505417,
    "acc_norm": 0.20909090909090908,
    "acc_norm_stderr": 0.038950910157241385
  },
  "mmlu:security_studies": {
    "acc": 0.3142857142857143,
    "acc_stderr": 0.02971932942241746,
    "acc_norm": 0.19591836734693877,
    "acc_norm_stderr": 0.025409301953225678
  },
  "mmlu:sociology": {
    "acc": 0.21393034825870647,
    "acc_stderr": 0.02899690969332891,
    "acc_norm": 0.22388059701492538,
    "acc_norm_stderr": 0.029475250236017173
  },
  "mmlu:us_foreign_policy": {
    "acc": 0.24,
    "acc_stderr": 0.04292346959909283,
    "acc_norm": 0.23,
    "acc_norm_stderr": 0.04229525846816505
  },
  "mmlu:virology": {
    "acc": 0.22289156626506024,
    "acc_stderr": 0.03240004825594687,
    "acc_norm": 0.3253012048192771,
    "acc_norm_stderr": 0.03647168523683227
  },
  "mmlu:world_religions": {
    "acc": 0.15789473684210525,
    "acc_stderr": 0.027966785859160907,
    "acc_norm": 0.23391812865497075,
    "acc_norm_stderr": 0.032467217651178264
  },
  "openbookqa": {
    "acc": 0.14,
    "acc_stderr": 0.015533272840269622,
    "acc_norm": 0.27,
    "acc_norm_stderr": 0.01987435483128749
  },
  "piqa": {
    "acc": 0.568,
    "acc_stderr": 0.0156723202373362,
    "acc_norm": 0.588,
    "acc_norm_stderr": 0.015572363292015091
  },
  "siqa": {
    "acc": 0.342,
    "acc_stderr": 0.015008706182121726,
    "acc_norm": 0.375,
    "acc_norm_stderr": 0.015316971293620996
  },
  "winogrande": {
    "acc": 0.511,
    "acc_stderr": 0.015815471195292693,
    "acc_norm": 0.498,
    "acc_norm_stderr": 0.015819173374302706
  },
  "arc:_average": {
    "acc": 0.277,
    "acc_stderr": 0.013694891076312077,
    "acc_norm": 0.298,
    "acc_norm_stderr": 0.014299871901289755
  },
  "mmlu:_average": {
    "acc": 0.24486225766415484,
    "acc_stderr": 0.03198621725172759,
    "acc_norm": 0.26125950385266233,
    "acc_norm_stderr": 0.032822076137567394
  },
}

### 40k proxy, 30k reference

In [8]:
# reference_data_30k = {
#     "arc:challenge": {
#     "acc": 0.177,
#     "acc_stderr": 0.012075463420375061,
#     "acc_norm": 0.242,
#     "acc_norm_stderr": 0.013550631705555954
#     },
#     "arc:easy": {
#     "acc": 0.309,
#     "acc_stderr": 0.014619600977206484,
#     "acc_norm": 0.319,
#     "acc_norm_stderr": 0.014746404865473479
#     },
#     "commonsense_qa": {
#     "acc": 0.197,
#     "acc_stderr": 0.012583693787968144,
#     "acc_norm": 0.227,
#     "acc_norm_stderr": 0.013253174964763923
#     },
#     "hellaswag": {
#     "acc": 0.265,
#     "acc_stderr": 0.013963164754809954,
#     "acc_norm": 0.259,
#     "acc_norm_stderr": 0.013860415257527911
#     },
#     "mmlu:abstract_algebra": {
#     "acc": 0.19,
#     "acc_stderr": 0.03942772444036625,
#     "acc_norm": 0.24,
#     "acc_norm_stderr": 0.04292346959909284
#     },
#     "mmlu:anatomy": {
#     "acc": 0.2074074074074074,
#     "acc_stderr": 0.035025531706783186,
#     "acc_norm": 0.24444444444444444,
#     "acc_norm_stderr": 0.037125378336148665
#     },
#     "mmlu:astronomy": {
#     "acc": 0.20394736842105263,
#     "acc_stderr": 0.032790004063100515,
#     "acc_norm": 0.28289473684210525,
#     "acc_norm_stderr": 0.03665349695640767
#     },
#     "mmlu:business_ethics": {
#     "acc": 0.38,
#     "acc_stderr": 0.04878317312145634,
#     "acc_norm": 0.25,
#     "acc_norm_stderr": 0.04351941398892446
#     },
#     "mmlu:clinical_knowledge": {
#     "acc": 0.18490566037735848,
#     "acc_stderr": 0.02389335183446432,
#     "acc_norm": 0.30566037735849055,
#     "acc_norm_stderr": 0.028353298073322666
#     },
#     "mmlu:college_biology": {
#     "acc": 0.2013888888888889,
#     "acc_stderr": 0.033536474697138406,
#     "acc_norm": 0.20833333333333334,
#     "acc_norm_stderr": 0.03396116205845335
#     },
#     "mmlu:college_chemistry": {
#     "acc": 0.22,
#     "acc_stderr": 0.041633319989322695,
#     "acc_norm": 0.26,
#     "acc_norm_stderr": 0.0440844002276808
#     },
#     "mmlu:college_computer_science": {
#     "acc": 0.26,
#     "acc_stderr": 0.0440844002276808,
#     "acc_norm": 0.18,
#     "acc_norm_stderr": 0.038612291966536955
#     },
#     "mmlu:college_mathematics": {
#     "acc": 0.18,
#     "acc_stderr": 0.03861229196653695,
#     "acc_norm": 0.23,
#     "acc_norm_stderr": 0.042295258468165044
#     },
#     "mmlu:college_medicine": {
#     "acc": 0.24277456647398843,
#     "acc_stderr": 0.0326926380614177,
#     "acc_norm": 0.27167630057803466,
#     "acc_norm_stderr": 0.0339175032232166
#     },
#     "mmlu:college_physics": {
#     "acc": 0.21568627450980393,
#     "acc_stderr": 0.04092563958237656,
#     "acc_norm": 0.23529411764705882,
#     "acc_norm_stderr": 0.04220773659171453
#     },
#     "mmlu:computer_security": {
#     "acc": 0.26,
#     "acc_stderr": 0.04408440022768078,
#     "acc_norm": 0.24,
#     "acc_norm_stderr": 0.042923469599092816
#     },
#     "mmlu:conceptual_physics": {
#     "acc": 0.2978723404255319,
#     "acc_stderr": 0.029896145682095462,
#     "acc_norm": 0.22127659574468084,
#     "acc_norm_stderr": 0.027136349602424063
#     },
#     "mmlu:econometrics": {
#     "acc": 0.23684210526315788,
#     "acc_stderr": 0.039994238792813344,
#     "acc_norm": 0.2719298245614035,
#     "acc_norm_stderr": 0.04185774424022056
#     },
#     "mmlu:electrical_engineering": {
#     "acc": 0.22758620689655173,
#     "acc_stderr": 0.03493950380131184,
#     "acc_norm": 0.2689655172413793,
#     "acc_norm_stderr": 0.036951833116502325
#     },
#     "mmlu:elementary_mathematics": {
#     "acc": 0.21693121693121692,
#     "acc_stderr": 0.02122708244944505,
#     "acc_norm": 0.22486772486772486,
#     "acc_norm_stderr": 0.02150209607822914
#     },
#     "mmlu:formal_logic": {
#     "acc": 0.2698412698412698,
#     "acc_stderr": 0.03970158273235172,
#     "acc_norm": 0.21428571428571427,
#     "acc_norm_stderr": 0.03670066451047182
#     },
#     "mmlu:global_facts": {
#     "acc": 0.23,
#     "acc_stderr": 0.04229525846816508,
#     "acc_norm": 0.2,
#     "acc_norm_stderr": 0.04020151261036846
#     },
#     "mmlu:high_school_biology": {
#     "acc": 0.22258064516129034,
#     "acc_stderr": 0.023664216671642525,
#     "acc_norm": 0.27419354838709675,
#     "acc_norm_stderr": 0.025378139970885196
#     },
#     "mmlu:high_school_chemistry": {
#     "acc": 0.18719211822660098,
#     "acc_stderr": 0.027444924966882618,
#     "acc_norm": 0.22167487684729065,
#     "acc_norm_stderr": 0.029225575892489614
#     },
#     "mmlu:high_school_computer_science": {
#     "acc": 0.18,
#     "acc_stderr": 0.038612291966536955,
#     "acc_norm": 0.21,
#     "acc_norm_stderr": 0.040936018074033256
#     },
#     "mmlu:high_school_european_history": {
#     "acc": 0.17575757575757575,
#     "acc_stderr": 0.02972094300622445,
#     "acc_norm": 0.23636363636363636,
#     "acc_norm_stderr": 0.03317505930009182
#     },
#     "mmlu:high_school_geography": {
#     "acc": 0.23737373737373738,
#     "acc_stderr": 0.0303137105381989,
#     "acc_norm": 0.31313131313131315,
#     "acc_norm_stderr": 0.03304205087813652
#     },
#     "mmlu:high_school_government_and_politics": {
#     "acc": 0.20725388601036268,
#     "acc_stderr": 0.02925282329180363,
#     "acc_norm": 0.24870466321243523,
#     "acc_norm_stderr": 0.031195840877700286
#     },
#     "mmlu:high_school_macroeconomics": {
#     "acc": 0.24102564102564103,
#     "acc_stderr": 0.021685546665333195,
#     "acc_norm": 0.25384615384615383,
#     "acc_norm_stderr": 0.022066054378726257
#     },
#     "mmlu:high_school_mathematics": {
#     "acc": 0.14444444444444443,
#     "acc_stderr": 0.0214337612741049,
#     "acc_norm": 0.2,
#     "acc_norm_stderr": 0.024388430433987664
#     },
#     "mmlu:high_school_microeconomics": {
#     "acc": 0.23949579831932774,
#     "acc_stderr": 0.02772206549336127,
#     "acc_norm": 0.31092436974789917,
#     "acc_norm_stderr": 0.030066761582977927
#     },
#     "mmlu:high_school_physics": {
#     "acc": 0.25165562913907286,
#     "acc_stderr": 0.035433042343899844,
#     "acc_norm": 0.23178807947019867,
#     "acc_norm_stderr": 0.03445406271987053
#     },
#     "mmlu:high_school_psychology": {
#     "acc": 0.27155963302752295,
#     "acc_stderr": 0.019069098363191442,
#     "acc_norm": 0.25137614678899084,
#     "acc_norm_stderr": 0.018599206360287415
#     },
#     "mmlu:high_school_statistics": {
#     "acc": 0.25925925925925924,
#     "acc_stderr": 0.02988691054762696,
#     "acc_norm": 0.30092592592592593,
#     "acc_norm_stderr": 0.031280390843298825
#     },
#     "mmlu:high_school_us_history": {
#     "acc": 0.24019607843137256,
#     "acc_stderr": 0.02998373305591362,
#     "acc_norm": 0.2647058823529412,
#     "acc_norm_stderr": 0.030964517926923413
#     },
#     "mmlu:high_school_world_history": {
#     "acc": 0.20675105485232068,
#     "acc_stderr": 0.026361651668389087,
#     "acc_norm": 0.28270042194092826,
#     "acc_norm_stderr": 0.02931281415395592
#     },
#     "mmlu:human_aging": {
#     "acc": 0.336322869955157,
#     "acc_stderr": 0.031708824268455,
#     "acc_norm": 0.2600896860986547,
#     "acc_norm_stderr": 0.029442495585857487
#     },
#     "mmlu:human_sexuality": {
#     "acc": 0.29770992366412213,
#     "acc_stderr": 0.04010358942462203,
#     "acc_norm": 0.35877862595419846,
#     "acc_norm_stderr": 0.04206739313864908
#     },
#     "mmlu:international_law": {
#     "acc": 0.11570247933884298,
#     "acc_stderr": 0.029199802455622783,
#     "acc_norm": 0.2066115702479339,
#     "acc_norm_stderr": 0.036959801280988254
#     },
#     "mmlu:jurisprudence": {
#     "acc": 0.1574074074074074,
#     "acc_stderr": 0.035207039905179635,
#     "acc_norm": 0.23148148148148148,
#     "acc_norm_stderr": 0.04077494709252628
#     },
#     "mmlu:logical_fallacies": {
#     "acc": 0.2822085889570552,
#     "acc_stderr": 0.03536117886664742,
#     "acc_norm": 0.31901840490797545,
#     "acc_norm_stderr": 0.03661997551073836
#     },
#     "mmlu:machine_learning": {
#     "acc": 0.26785714285714285,
#     "acc_stderr": 0.04203277291467763,
#     "acc_norm": 0.20535714285714285,
#     "acc_norm_stderr": 0.03834241021419072
#     },
#     "mmlu:management": {
#     "acc": 0.20388349514563106,
#     "acc_stderr": 0.03989139859531769,
#     "acc_norm": 0.24271844660194175,
#     "acc_norm_stderr": 0.04245022486384495
#     },
#     "mmlu:marketing": {
#     "acc": 0.2777777777777778,
#     "acc_stderr": 0.029343114798094476,
#     "acc_norm": 0.3076923076923077,
#     "acc_norm_stderr": 0.0302363899421731
#     },
#     "mmlu:medical_genetics": {
#     "acc": 0.25,
#     "acc_stderr": 0.04351941398892446,
#     "acc_norm": 0.26,
#     "acc_norm_stderr": 0.044084400227680794
#     },
#     "mmlu:miscellaneous": {
#     "acc": 0.24010217113665389,
#     "acc_stderr": 0.015274685213734191,
#     "acc_norm": 0.2656449553001277,
#     "acc_norm_stderr": 0.01579430248788872
#     },
#     "mmlu:moral_disputes": {
#     "acc": 0.22254335260115607,
#     "acc_stderr": 0.02239421566194282,
#     "acc_norm": 0.2138728323699422,
#     "acc_norm_stderr": 0.02207570925175718
#     },
#     "mmlu:moral_scenarios": {
#     "acc": 0.23798882681564246,
#     "acc_stderr": 0.014242630070574915,
#     "acc_norm": 0.27262569832402234,
#     "acc_norm_stderr": 0.014893391735249588
#     },
#     "mmlu:nutrition": {
#     "acc": 0.20261437908496732,
#     "acc_stderr": 0.02301544687798569,
#     "acc_norm": 0.2875816993464052,
#     "acc_norm_stderr": 0.02591780611714716
#     },
#     "mmlu:philosophy": {
#     "acc": 0.2379421221864952,
#     "acc_stderr": 0.024185150647818707,
#     "acc_norm": 0.2733118971061093,
#     "acc_norm_stderr": 0.02531176597542612
#     },
#     "mmlu:prehistory": {
#     "acc": 0.2654320987654321,
#     "acc_stderr": 0.024569223600460852,
#     "acc_norm": 0.22839506172839505,
#     "acc_norm_stderr": 0.023358211840626267
#     },
#     "mmlu:prehistory": {
#     "acc": 0.2654320987654321,
#     "acc_stderr": 0.024569223600460852,
#     "acc_norm": 0.22839506172839505,
#     "acc_norm_stderr": 0.023358211840626267
#     },
#     "mmlu:professional_accounting": {
#     "acc": 0.23049645390070922,
#     "acc_stderr": 0.025123739226872402,
#     "acc_norm": 0.22340425531914893,
#     "acc_norm_stderr": 0.02484792135806396
#     },
#     "mmlu:professional_law": {
#     "acc": 0.255,
#     "acc_stderr": 0.013790038620872828,
#     "acc_norm": 0.27,
#     "acc_norm_stderr": 0.014046255632633915
#     },
#     "mmlu:professional_medicine": {
#     "acc": 0.20955882352941177,
#     "acc_stderr": 0.024723110407677062,
#     "acc_norm": 0.24632352941176472,
#     "acc_norm_stderr": 0.02617343857052
#     },
#     "mmlu:professional_psychology": {
#     "acc": 0.24836601307189543,
#     "acc_stderr": 0.017479487001364764,
#     "acc_norm": 0.26633986928104575,
#     "acc_norm_stderr": 0.017883188134667178
#     },
#     "mmlu:public_relations": {
#     "acc": 0.3181818181818182,
#     "acc_stderr": 0.04461272175910508,
#     "acc_norm": 0.23636363636363636,
#     "acc_norm_stderr": 0.040693063197213775
#     },
#     "mmlu:security_studies": {
#     "acc": 0.30612244897959184,
#     "acc_stderr": 0.029504896454595968,
#     "acc_norm": 0.17142857142857143,
#     "acc_norm_stderr": 0.02412746346265013
#     },
#     "mmlu:sociology": {
#     "acc": 0.19900497512437812,
#     "acc_stderr": 0.028231365092758406,
#     "acc_norm": 0.24875621890547264,
#     "acc_norm_stderr": 0.03056767593891672
#     },
#     "mmlu:us_foreign_policy": {
#     "acc": 0.22,
#     "acc_stderr": 0.04163331998932269,
#     "acc_norm": 0.24,
#     "acc_norm_stderr": 0.042923469599092816
#     },
#     "mmlu:virology": {
#     "acc": 0.26506024096385544,
#     "acc_stderr": 0.03436024037944966,
#     "acc_norm": 0.3072289156626506,
#     "acc_norm_stderr": 0.03591566797824665
#     },
#     "mmlu:world_religions": {
#     "acc": 0.12280701754385964,
#     "acc_stderr": 0.025172984350155764,
#     "acc_norm": 0.2046783625730994,
#     "acc_norm_stderr": 0.03094445977853321
#     },
#     "openbookqa": {
#     "acc": 0.124,
#     "acc_stderr": 0.014754096608517583,
#     "acc_norm": 0.278,
#     "acc_norm_stderr": 0.020055833888070914
#     },
#     "piqa": {
#     "acc": 0.545,
#     "acc_stderr": 0.01575510149834709,
#     "acc_norm": 0.537,
#     "acc_norm_stderr": 0.015775927227262412
#     },
#     "siqa": {
#     "acc": 0.358,
#     "acc_stderr": 0.015167928865407559,
#     "acc_norm": 0.385,
#     "acc_norm_stderr": 0.01539519444541081
#     },
#     "winogrande": {
#     "acc": 0.489,
#     "acc_stderr": 0.015815471195292682,
#     "acc_norm": 0.502,
#     "acc_norm_stderr": 0.015819173374302706
#     },
#     "arc:_average": {
#     "acc": 0.243,
#     "acc_stderr": 0.013347532198790773,
#     "acc_norm": 0.28049999999999997,
#     "acc_norm_stderr": 0.014148518285514717
#     },
#     "mmlu:_average": {
#     "acc": 0.23262840760445191,
#     "acc_stderr": 0.031382594250348235,
#     "acc_norm": 0.2508362609452844,
#     "acc_norm_stderr": 0.03230638300974771
#     }
# }

### 100k proxy, 120k reference

In [3]:
# DoReMi evaluation metrics for the "100k proxy, 120k reference" section.
# Each task name maps to a dict holding "acc", "acc_stderr", "acc_norm" and
# "acc_norm_stderr".  The aggregate rows "arc:_average" and "mmlu:_average"
# are stored next to the individual tasks under their own keys.
# NOTE(review): presumably "100k"/"120k" are training-step counts of the proxy
# run and of the evaluated checkpoint -- confirm against the experiment logs.
doremi_data_100k_proxy_120k_ckp = {
    "arc:challenge": {
        "acc": 0.183,
        "acc_stderr": 0.012233587399477825,
        "acc_norm": 0.235,
        "acc_norm_stderr": 0.013414729030247124,
    },
    "arc:easy": {
        "acc": 0.418,
        "acc_stderr": 0.015605111967541946,
        "acc_norm": 0.419,
        "acc_norm_stderr": 0.015610338967577794,
    },
    "commonsense_qa": {
        "acc": 0.252,
        "acc_stderr": 0.013736254390651145,
        "acc_norm": 0.263,
        "acc_norm_stderr": 0.013929286594259719,
    },
    "hellaswag": {
        "acc": 0.301,
        "acc_stderr": 0.014512395033543143,
        "acc_norm": 0.33,
        "acc_norm_stderr": 0.014876872027456732,
    },
    "mmlu:abstract_algebra": {
        "acc": 0.23,
        "acc_stderr": 0.04229525846816508,
        "acc_norm": 0.24,
        "acc_norm_stderr": 0.04292346959909283,
    },
    "mmlu:anatomy": {
        "acc": 0.24444444444444444,
        "acc_stderr": 0.037125378336148665,
        "acc_norm": 0.2518518518518518,
        "acc_norm_stderr": 0.03749850709174022,
    },
    "mmlu:astronomy": {
        "acc": 0.24342105263157895,
        "acc_stderr": 0.034923496688842384,
        "acc_norm": 0.3026315789473684,
        "acc_norm_stderr": 0.037385206761196686,
    },
    "mmlu:business_ethics": {
        "acc": 0.41,
        "acc_stderr": 0.04943110704237102,
        "acc_norm": 0.35,
        "acc_norm_stderr": 0.047937248544110196,
    },
    "mmlu:clinical_knowledge": {
        "acc": 0.26037735849056604,
        "acc_stderr": 0.027008766090708094,
        "acc_norm": 0.30943396226415093,
        "acc_norm_stderr": 0.028450154794118627,
    },
    "mmlu:college_biology": {
        "acc": 0.2708333333333333,
        "acc_stderr": 0.03716177437566016,
        "acc_norm": 0.25,
        "acc_norm_stderr": 0.03621034121889507,
    },
    "mmlu:college_chemistry": {
        "acc": 0.29,
        "acc_stderr": 0.04560480215720684,
        "acc_norm": 0.27,
        "acc_norm_stderr": 0.0446196043338474,
    },
    "mmlu:college_computer_science": {
        "acc": 0.27,
        "acc_stderr": 0.044619604333847394,
        "acc_norm": 0.26,
        "acc_norm_stderr": 0.04408440022768079,
    },
    "mmlu:college_mathematics": {
        "acc": 0.16,
        "acc_stderr": 0.03684529491774709,
        "acc_norm": 0.25,
        "acc_norm_stderr": 0.04351941398892446,
    },
    "mmlu:college_medicine": {
        "acc": 0.2774566473988439,
        "acc_stderr": 0.034140140070440354,
        "acc_norm": 0.24277456647398843,
        "acc_norm_stderr": 0.0326926380614177,
    },
    "mmlu:college_physics": {
        "acc": 0.1568627450980392,
        "acc_stderr": 0.03618664819936246,
        "acc_norm": 0.1568627450980392,
        "acc_norm_stderr": 0.03618664819936244,
    },
    "mmlu:computer_security": {
        "acc": 0.26,
        "acc_stderr": 0.04408440022768078,
        "acc_norm": 0.31,
        "acc_norm_stderr": 0.04648231987117316,
    },
    "mmlu:conceptual_physics": {
        "acc": 0.35319148936170214,
        "acc_stderr": 0.031245325202761926,
        "acc_norm": 0.23829787234042554,
        "acc_norm_stderr": 0.027851252973889767,
    },
    "mmlu:econometrics": {
        "acc": 0.18421052631578946,
        "acc_stderr": 0.03646758875075566,
        "acc_norm": 0.22807017543859648,
        "acc_norm_stderr": 0.03947152782669415,
    },
    "mmlu:electrical_engineering": {
        "acc": 0.2827586206896552,
        "acc_stderr": 0.03752833958003336,
        "acc_norm": 0.2827586206896552,
        "acc_norm_stderr": 0.03752833958003336,
    },
    "mmlu:elementary_mathematics": {
        "acc": 0.23544973544973544,
        "acc_stderr": 0.02185150982203172,
        "acc_norm": 0.24867724867724866,
        "acc_norm_stderr": 0.022261817692400168,
    },
    "mmlu:formal_logic": {
        "acc": 0.2777777777777778,
        "acc_stderr": 0.04006168083848877,
        "acc_norm": 0.2698412698412698,
        "acc_norm_stderr": 0.03970158273235172,
    },
    "mmlu:global_facts": {
        "acc": 0.27,
        "acc_stderr": 0.04461960433384741,
        "acc_norm": 0.29,
        "acc_norm_stderr": 0.04560480215720684,
    },
    "mmlu:high_school_biology": {
        "acc": 0.24516129032258063,
        "acc_stderr": 0.024472243840895535,
        "acc_norm": 0.3032258064516129,
        "acc_norm_stderr": 0.02614868593067175,
    },
    "mmlu:high_school_chemistry": {
        "acc": 0.15763546798029557,
        "acc_stderr": 0.025639014131172404,
        "acc_norm": 0.2315270935960591,
        "acc_norm_stderr": 0.029678333141444437,
    },
    "mmlu:high_school_computer_science": {
        "acc": 0.26,
        "acc_stderr": 0.044084400227680794,
        "acc_norm": 0.32,
        "acc_norm_stderr": 0.046882617226215034,
    },
    "mmlu:high_school_european_history": {
        "acc": 0.20606060606060606,
        "acc_stderr": 0.03158415324047711,
        "acc_norm": 0.3090909090909091,
        "acc_norm_stderr": 0.036085410115739666,
    },
    "mmlu:high_school_geography": {
        "acc": 0.2727272727272727,
        "acc_stderr": 0.03173071239071724,
        "acc_norm": 0.31313131313131315,
        "acc_norm_stderr": 0.033042050878136525,
    },
    "mmlu:high_school_government_and_politics": {
        "acc": 0.22797927461139897,
        "acc_stderr": 0.030276909945178256,
        "acc_norm": 0.2538860103626943,
        "acc_norm_stderr": 0.03141024780565319,
    },
    "mmlu:high_school_macroeconomics": {
        "acc": 0.2282051282051282,
        "acc_stderr": 0.02127839386358628,
        "acc_norm": 0.258974358974359,
        "acc_norm_stderr": 0.02221110681006166,
    },
    "mmlu:high_school_mathematics": {
        "acc": 0.14814814814814814,
        "acc_stderr": 0.021659778422118036,
        "acc_norm": 0.1925925925925926,
        "acc_norm_stderr": 0.0240430751819452,
    },
    "mmlu:high_school_microeconomics": {
        "acc": 0.24369747899159663,
        "acc_stderr": 0.027886828078380582,
        "acc_norm": 0.3403361344537815,
        "acc_norm_stderr": 0.03077805742293167,
    },
    "mmlu:high_school_physics": {
        "acc": 0.2781456953642384,
        "acc_stderr": 0.03658603262763743,
        "acc_norm": 0.271523178807947,
        "acc_norm_stderr": 0.03631329803969654,
    },
    "mmlu:high_school_psychology": {
        "acc": 0.3155963302752294,
        "acc_stderr": 0.01992611751386967,
        "acc_norm": 0.29908256880733947,
        "acc_norm_stderr": 0.019630417285415175,
    },
    "mmlu:high_school_statistics": {
        "acc": 0.25,
        "acc_stderr": 0.029531221160930918,
        "acc_norm": 0.2638888888888889,
        "acc_norm_stderr": 0.030058202704309846,
    },
    "mmlu:high_school_us_history": {
        "acc": 0.24509803921568626,
        "acc_stderr": 0.030190282453501936,
        "acc_norm": 0.3088235294117647,
        "acc_norm_stderr": 0.03242661719827218,
    },
    "mmlu:high_school_world_history": {
        "acc": 0.24050632911392406,
        "acc_stderr": 0.027820781981149678,
        "acc_norm": 0.2489451476793249,
        "acc_norm_stderr": 0.028146970599422644,
    },
    "mmlu:human_aging": {
        "acc": 0.34080717488789236,
        "acc_stderr": 0.03181149747055359,
        "acc_norm": 0.27802690582959644,
        "acc_norm_stderr": 0.03006958487449403,
    },
    "mmlu:human_sexuality": {
        "acc": 0.35877862595419846,
        "acc_stderr": 0.04206739313864908,
        "acc_norm": 0.31297709923664124,
        "acc_norm_stderr": 0.04066962905677697,
    },
    "mmlu:international_law": {
        "acc": 0.11570247933884298,
        "acc_stderr": 0.029199802455622793,
        "acc_norm": 0.21487603305785125,
        "acc_norm_stderr": 0.03749492448709698,
    },
    "mmlu:jurisprudence": {
        "acc": 0.17592592592592593,
        "acc_stderr": 0.03680918141673881,
        "acc_norm": 0.26851851851851855,
        "acc_norm_stderr": 0.04284467968052191,
    },
    "mmlu:logical_fallacies": {
        "acc": 0.26380368098159507,
        "acc_stderr": 0.03462419931615624,
        "acc_norm": 0.3803680981595092,
        "acc_norm_stderr": 0.03814269893261835,
    },
    "mmlu:machine_learning": {
        "acc": 0.22321428571428573,
        "acc_stderr": 0.039523019677025116,
        "acc_norm": 0.23214285714285715,
        "acc_norm_stderr": 0.04007341809755805,
    },
    "mmlu:management": {
        "acc": 0.24271844660194175,
        "acc_stderr": 0.042450224863844956,
        "acc_norm": 0.33980582524271846,
        "acc_norm_stderr": 0.04689765937278134,
    },
    "mmlu:marketing": {
        "acc": 0.37606837606837606,
        "acc_stderr": 0.03173393632969481,
        "acc_norm": 0.3803418803418803,
        "acc_norm_stderr": 0.03180425204384099,
    },
    "mmlu:medical_genetics": {
        "acc": 0.22,
        "acc_stderr": 0.04163331998932269,
        "acc_norm": 0.26,
        "acc_norm_stderr": 0.0440844002276808,
    },
    "mmlu:miscellaneous": {
        "acc": 0.3065134099616858,
        "acc_stderr": 0.01648695289304151,
        "acc_norm": 0.30268199233716475,
        "acc_norm_stderr": 0.016428781581749367,
    },
    "mmlu:moral_disputes": {
        "acc": 0.2023121387283237,
        "acc_stderr": 0.021628077380196134,
        "acc_norm": 0.1936416184971098,
        "acc_norm_stderr": 0.02127423031751555,
    },
    "mmlu:moral_scenarios": {
        "acc": 0.23798882681564246,
        "acc_stderr": 0.014242630070574915,
        "acc_norm": 0.27262569832402234,
        "acc_norm_stderr": 0.014893391735249588,
    },
    "mmlu:nutrition": {
        "acc": 0.22875816993464052,
        "acc_stderr": 0.02405102973991226,
        "acc_norm": 0.2875816993464052,
        "acc_norm_stderr": 0.02591780611714716,
    },
    "mmlu:philosophy": {
        "acc": 0.2379421221864952,
        "acc_stderr": 0.024185150647818707,
        "acc_norm": 0.26688102893890675,
        "acc_norm_stderr": 0.02512263760881664,
    },
    "mmlu:prehistory": {
        "acc": 0.2777777777777778,
        "acc_stderr": 0.02492200116888634,
        "acc_norm": 0.21604938271604937,
        "acc_norm_stderr": 0.022899162918445803,
    },
    "mmlu:professional_accounting": {
        "acc": 0.24468085106382978,
        "acc_stderr": 0.025645553622266726,
        "acc_norm": 0.24822695035460993,
        "acc_norm_stderr": 0.025770015644290385,
    },
    "mmlu:professional_law": {
        "acc": 0.238,
        "acc_stderr": 0.013473586661967222,
        "acc_norm": 0.252,
        "acc_norm_stderr": 0.013736254390651145,
    },
    "mmlu:professional_medicine": {
        "acc": 0.24632352941176472,
        "acc_stderr": 0.02617343857052,
        "acc_norm": 0.28308823529411764,
        "acc_norm_stderr": 0.027365861131513805,
    },
    "mmlu:professional_psychology": {
        "acc": 0.24509803921568626,
        "acc_stderr": 0.01740181671142765,
        "acc_norm": 0.25980392156862747,
        "acc_norm_stderr": 0.017740899509177788,
    },
    "mmlu:public_relations": {
        "acc": 0.4,
        "acc_stderr": 0.0469237132203465,
        "acc_norm": 0.19090909090909092,
        "acc_norm_stderr": 0.03764425585984924,
    },
    "mmlu:security_studies": {
        "acc": 0.31020408163265306,
        "acc_stderr": 0.029613459872484378,
        "acc_norm": 0.20408163265306123,
        "acc_norm_stderr": 0.025801283475090506,
    },
    "mmlu:sociology": {
        "acc": 0.23880597014925373,
        "acc_stderr": 0.03014777593540922,
        "acc_norm": 0.23383084577114427,
        "acc_norm_stderr": 0.02992941540834839,
    },
    "mmlu:us_foreign_policy": {
        "acc": 0.27,
        "acc_stderr": 0.04461960433384741,
        "acc_norm": 0.26,
        "acc_norm_stderr": 0.04408440022768079,
    },
    "mmlu:virology": {
        "acc": 0.2469879518072289,
        "acc_stderr": 0.03357351982064537,
        "acc_norm": 0.27710843373493976,
        "acc_norm_stderr": 0.034843315926805875,
    },
    "mmlu:world_religions": {
        "acc": 0.18128654970760233,
        "acc_stderr": 0.029547741687640024,
        "acc_norm": 0.24561403508771928,
        "acc_norm_stderr": 0.03301405946987251,
    },
    "openbookqa": {
        "acc": 0.16,
        "acc_stderr": 0.01641154098050231,
        "acc_norm": 0.266,
        "acc_norm_stderr": 0.019780559675655493,
    },
    "piqa": {
        "acc": 0.606,
        "acc_stderr": 0.015459721957493379,
        "acc_norm": 0.633,
        "acc_norm_stderr": 0.015249378464171742,
    },
    "siqa": {
        "acc": 0.376,
        "acc_stderr": 0.015325105508898129,
        "acc_norm": 0.397,
        "acc_norm_stderr": 0.015480007449307994,
    },
    "winogrande": {
        "acc": 0.522,
        "acc_stderr": 0.01580397942816196,
        "acc_norm": 0.505,
        "acc_norm_stderr": 0.015818508944436656,
    },
    "arc:_average": {
        "acc": 0.3005,
        "acc_stderr": 0.013919349683509885,
        "acc_norm": 0.32699999999999996,
        "acc_norm_stderr": 0.014512533998912459,
    },
    "mmlu:_average": {
        "acc": 0.25307795098023184,
        "acc_stderr": 0.032216775724701545,
        "acc_norm": 0.26890191591111795,
        "acc_norm_stderr": 0.03308441024718641,
    },
}

# Second name for the very same dict object (an alias, not a copy): a mutation
# made through either name is visible through the other.
doremi_data = doremi_data_100k_proxy_120k_ckp

In [4]:
# Per-task evaluation metrics for the reference run, one task per line.
# Every task maps to the same four metrics: acc, acc_stderr, acc_norm, acc_norm_stderr.
# NOTE(review): "120k" in the name is presumably the checkpoint step — confirm.
reference_data_120k = {
    "commonsense_qa": {"acc": 0.275, "acc_stderr": 0.014127086556490528, "acc_norm": 0.258, "acc_norm_stderr": 0.013842963108656603},
    "hellaswag": {"acc": 0.297, "acc_stderr": 0.014456832294801098, "acc_norm": 0.326, "acc_norm_stderr": 0.014830507204541033},
    "openbookqa": {"acc": 0.128, "acc_stderr": 0.014955913837590672, "acc_norm": 0.27, "acc_norm_stderr": 0.019874354831287487},
    "piqa": {"acc": 0.594, "acc_stderr": 0.015537226438634592, "acc_norm": 0.611, "acc_norm_stderr": 0.015424555647308496},
    "siqa": {"acc": 0.365, "acc_stderr": 0.015231776226264909, "acc_norm": 0.381, "acc_norm_stderr": 0.015364734787007436},
    "winogrande": {"acc": 0.51, "acc_stderr": 0.01581613575277321, "acc_norm": 0.497, "acc_norm_stderr": 0.015819015179246724},
    "arc:challenge": {"acc": 0.196, "acc_stderr": 0.01255952792670737, "acc_norm": 0.238, "acc_norm_stderr": 0.013473586661967225},
    "arc:easy": {"acc": 0.422, "acc_stderr": 0.01562562511262066, "acc_norm": 0.408, "acc_norm_stderr": 0.015549205052920675},
    "mmlu:abstract_algebra": {"acc": 0.21, "acc_stderr": 0.04093601807403326, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909284},
    "mmlu:anatomy": {"acc": 0.21481481481481482, "acc_stderr": 0.035478541985608236, "acc_norm": 0.26666666666666666, "acc_norm_stderr": 0.038201699145179055},
    "mmlu:astronomy": {"acc": 0.23026315789473684, "acc_stderr": 0.03426059424403165, "acc_norm": 0.27631578947368424, "acc_norm_stderr": 0.03639057569952925},
    "mmlu:business_ethics": {"acc": 0.43, "acc_stderr": 0.04975698519562428, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814},
    "mmlu:clinical_knowledge": {"acc": 0.21509433962264152, "acc_stderr": 0.025288394502891363, "acc_norm": 0.3169811320754717, "acc_norm_stderr": 0.028637235639800925},
    "mmlu:college_biology": {"acc": 0.25, "acc_stderr": 0.03621034121889507, "acc_norm": 0.2152777777777778, "acc_norm_stderr": 0.03437079344106133},
    "mmlu:college_chemistry": {"acc": 0.27, "acc_stderr": 0.044619604333847394, "acc_norm": 0.26, "acc_norm_stderr": 0.0440844002276808},
    "mmlu:college_computer_science": {"acc": 0.26, "acc_stderr": 0.0440844002276808, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446},
    "mmlu:college_mathematics": {"acc": 0.14, "acc_stderr": 0.03487350880197771, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909282},
    "mmlu:college_medicine": {"acc": 0.24277456647398843, "acc_stderr": 0.0326926380614177, "acc_norm": 0.23699421965317918, "acc_norm_stderr": 0.03242414757483098},
    "mmlu:college_physics": {"acc": 0.20588235294117646, "acc_stderr": 0.04023382273617747, "acc_norm": 0.21568627450980393, "acc_norm_stderr": 0.04092563958237654},
    "mmlu:computer_security": {"acc": 0.26, "acc_stderr": 0.04408440022768078, "acc_norm": 0.26, "acc_norm_stderr": 0.04408440022768077},
    "mmlu:conceptual_physics": {"acc": 0.3191489361702128, "acc_stderr": 0.03047297336338004, "acc_norm": 0.225531914893617, "acc_norm_stderr": 0.02732107841738753},
    "mmlu:econometrics": {"acc": 0.22807017543859648, "acc_stderr": 0.03947152782669416, "acc_norm": 0.19298245614035087, "acc_norm_stderr": 0.037124548537213684},
    "mmlu:electrical_engineering": {"acc": 0.23448275862068965, "acc_stderr": 0.035306258743465914, "acc_norm": 0.25517241379310346, "acc_norm_stderr": 0.03632984052707842},
    "mmlu:elementary_mathematics": {"acc": 0.23015873015873015, "acc_stderr": 0.021679219663693152, "acc_norm": 0.2222222222222222, "acc_norm_stderr": 0.02141168439369419},
    "mmlu:formal_logic": {"acc": 0.29365079365079366, "acc_stderr": 0.04073524322147127, "acc_norm": 0.25396825396825395, "acc_norm_stderr": 0.038932596106046734},
    "mmlu:global_facts": {"acc": 0.2, "acc_stderr": 0.04020151261036846, "acc_norm": 0.22, "acc_norm_stderr": 0.041633319989322695},
    "mmlu:high_school_biology": {"acc": 0.22580645161290322, "acc_stderr": 0.023785577884181012, "acc_norm": 0.27419354838709675, "acc_norm_stderr": 0.025378139970885193},
    "mmlu:high_school_chemistry": {"acc": 0.18719211822660098, "acc_stderr": 0.027444924966882618, "acc_norm": 0.2561576354679803, "acc_norm_stderr": 0.030712730070982592},
    "mmlu:high_school_computer_science": {"acc": 0.22, "acc_stderr": 0.04163331998932269, "acc_norm": 0.26, "acc_norm_stderr": 0.0440844002276808},
    "mmlu:high_school_european_history": {"acc": 0.18787878787878787, "acc_stderr": 0.03050193405942914, "acc_norm": 0.296969696969697, "acc_norm_stderr": 0.03567969772268049},
    "mmlu:high_school_geography": {"acc": 0.25252525252525254, "acc_stderr": 0.030954055470365914, "acc_norm": 0.31313131313131315, "acc_norm_stderr": 0.03304205087813652},
    "mmlu:high_school_government_and_politics": {"acc": 0.21761658031088082, "acc_stderr": 0.02977866303775296, "acc_norm": 0.25906735751295334, "acc_norm_stderr": 0.0316187791793541},
    "mmlu:high_school_macroeconomics": {"acc": 0.23076923076923078, "acc_stderr": 0.02136202772522273, "acc_norm": 0.24871794871794872, "acc_norm_stderr": 0.021916957709213803},
    "mmlu:high_school_mathematics": {"acc": 0.14074074074074075, "acc_stderr": 0.0212029303435688, "acc_norm": 0.1814814814814815, "acc_norm_stderr": 0.023499264669407282},
    "mmlu:high_school_microeconomics": {"acc": 0.25210084033613445, "acc_stderr": 0.028205545033277726, "acc_norm": 0.3403361344537815, "acc_norm_stderr": 0.030778057422931673},
    "mmlu:high_school_physics": {"acc": 0.2781456953642384, "acc_stderr": 0.03658603262763743, "acc_norm": 0.26490066225165565, "acc_norm_stderr": 0.036030385453603826},
    "mmlu:high_school_psychology": {"acc": 0.28623853211009176, "acc_stderr": 0.019379436628919965, "acc_norm": 0.27889908256880735, "acc_norm_stderr": 0.01922746887646352},
    "mmlu:high_school_statistics": {"acc": 0.2361111111111111, "acc_stderr": 0.028963702570791037, "acc_norm": 0.2777777777777778, "acc_norm_stderr": 0.030546745264953195},
    "mmlu:high_school_us_history": {"acc": 0.23529411764705882, "acc_stderr": 0.02977177522814565, "acc_norm": 0.28431372549019607, "acc_norm_stderr": 0.031660096793998116},
    "mmlu:high_school_world_history": {"acc": 0.24050632911392406, "acc_stderr": 0.027820781981149675, "acc_norm": 0.26582278481012656, "acc_norm_stderr": 0.02875679962965834},
    "mmlu:human_aging": {"acc": 0.32286995515695066, "acc_stderr": 0.031381476375754995, "acc_norm": 0.2645739910313901, "acc_norm_stderr": 0.02960510321703832},
    "mmlu:human_sexuality": {"acc": 0.366412213740458, "acc_stderr": 0.04225875451969637, "acc_norm": 0.3282442748091603, "acc_norm_stderr": 0.04118438565806298},
    "mmlu:international_law": {"acc": 0.14049586776859505, "acc_stderr": 0.03172233426002159, "acc_norm": 0.256198347107438, "acc_norm_stderr": 0.03984979653302871},
    "mmlu:jurisprudence": {"acc": 0.16666666666666666, "acc_stderr": 0.036028141763926456, "acc_norm": 0.19444444444444445, "acc_norm_stderr": 0.03826076324884864},
    "mmlu:logical_fallacies": {"acc": 0.22699386503067484, "acc_stderr": 0.032910995786157686, "acc_norm": 0.34355828220858897, "acc_norm_stderr": 0.037311335196738925},
    "mmlu:machine_learning": {"acc": 0.24107142857142858, "acc_stderr": 0.04059867246952686, "acc_norm": 0.24107142857142858, "acc_norm_stderr": 0.04059867246952687},
    "mmlu:management": {"acc": 0.22330097087378642, "acc_stderr": 0.04123553189891431, "acc_norm": 0.30097087378640774, "acc_norm_stderr": 0.045416094465039476},
    "mmlu:marketing": {"acc": 0.3504273504273504, "acc_stderr": 0.031256108244218796, "acc_norm": 0.34615384615384615, "acc_norm_stderr": 0.0311669573672359},
    "mmlu:medical_genetics": {"acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.24, "acc_norm_stderr": 0.042923469599092816},
    "mmlu:miscellaneous": {"acc": 0.2950191570881226, "acc_stderr": 0.016308363772932724, "acc_norm": 0.28991060025542786, "acc_norm_stderr": 0.01622501794477095},
    "mmlu:moral_disputes": {"acc": 0.21965317919075145, "acc_stderr": 0.022289638852617904, "acc_norm": 0.18208092485549132, "acc_norm_stderr": 0.020776761102513},
    "mmlu:moral_scenarios": {"acc": 0.23798882681564246, "acc_stderr": 0.014242630070574915, "acc_norm": 0.27262569832402234, "acc_norm_stderr": 0.014893391735249588},
    "mmlu:nutrition": {"acc": 0.19934640522875818, "acc_stderr": 0.02287581699346406, "acc_norm": 0.2777777777777778, "acc_norm_stderr": 0.0256468630971379},
    "mmlu:philosophy": {"acc": 0.2282958199356913, "acc_stderr": 0.023839303311398212, "acc_norm": 0.2797427652733119, "acc_norm_stderr": 0.02549425935069489},
    "mmlu:prehistory": {"acc": 0.29012345679012347, "acc_stderr": 0.025251173936495022, "acc_norm": 0.20987654320987653, "acc_norm_stderr": 0.02265834408598138},
    "mmlu:professional_law": {"acc": 0.246, "acc_stderr": 0.013626065817750638, "acc_norm": 0.257, "acc_norm_stderr": 0.013825416526895019},
    "mmlu:professional_medicine": {"acc": 0.2610294117647059, "acc_stderr": 0.026679252270103117, "acc_norm": 0.2610294117647059, "acc_norm_stderr": 0.026679252270103117},
    "mmlu:professional_psychology": {"acc": 0.26143790849673204, "acc_stderr": 0.01777694715752804, "acc_norm": 0.24509803921568626, "acc_norm_stderr": 0.017401816711427653},
    "mmlu:public_relations": {"acc": 0.38181818181818183, "acc_stderr": 0.04653429807913509, "acc_norm": 0.2727272727272727, "acc_norm_stderr": 0.04265792110940588},
    "mmlu:security_studies": {"acc": 0.2938775510204082, "acc_stderr": 0.029162738410249765, "acc_norm": 0.22040816326530613, "acc_norm_stderr": 0.02653704531214529},
    "mmlu:sociology": {"acc": 0.208955223880597, "acc_stderr": 0.028748298931728655, "acc_norm": 0.22885572139303484, "acc_norm_stderr": 0.029705284056772426},
    "mmlu:us_foreign_policy": {"acc": 0.26, "acc_stderr": 0.04408440022768078, "acc_norm": 0.21, "acc_norm_stderr": 0.04093601807403326},
    "mmlu:virology": {"acc": 0.2469879518072289, "acc_stderr": 0.03357351982064536, "acc_norm": 0.3072289156626506, "acc_norm_stderr": 0.03591566797824665},
    "mmlu:world_religions": {"acc": 0.18128654970760233, "acc_stderr": 0.029547741687640027, "acc_norm": 0.23976608187134502, "acc_norm_stderr": 0.032744852119469564},
    "mmlu:_average": {"acc": 0.24539887197950047, "acc_stderr": 0.031920583010219564, "acc_norm": 0.2590726074431258, "acc_norm_stderr": 0.03270896508924344},
    "mmlu:professional_accounting": {"acc": 0.2624113475177305, "acc_stderr": 0.026244920349843, "acc_norm": 0.24822695035460993, "acc_norm_stderr": 0.025770015644290382},
    "arc:_average": {"acc": 0.309, "acc_stderr": 0.014092576519664016, "acc_norm": 0.32299999999999995, "acc_norm_stderr": 0.014511395857443949},
}

In [5]:
# Number of task entries (top-level keys) in the reference results.
len(reference_data_120k)

67

### Eval results

In [6]:
# Number of task entries in the DoReMi results. compute_stats() and plot() look up
# reference[name] for every key of this dict, so each key here must also exist in the reference.
len(doremi_data_100k_proxy_120k_ckp)

67

In [8]:
# NOTE(review): same expression as the earlier `len(reference_data_120k)` cell — one of the two can likely be dropped.
len(reference_data_120k)

67

In [54]:
# Lists the keys of `reference_120k` that are absent from `reference_data_25k` (empty list = none missing).
# NOTE(review): the dict assigned above is named `reference_data_120k`, not `reference_120k`, and this
# cell's execution count (54) is out of sequence — confirm both names exist after Restart & Run All.
[name for name in reference_120k.keys() if name not in reference_data_25k.keys()]

[]

In [10]:
# Per-task 'acc' comparison: counts tasks where DoReMi is above / equal to / below the reference
# and sums the margins in each direction (printed, not returned).
# NOTE(review): the "arc:_average" and "mmlu:_average" rows are keys of the input dict, so they are
# counted in these totals alongside the individual tasks — confirm that is intended.
compute_stats(doremi_data_100k_proxy_120k_ckp, reference_data_120k)

total=67, succeed=40, failed=21, same=6, amount_succeed=0.8524100556721435, amount_fail=0.38452347362972156


In [11]:
# Builds the per-task table of DoReMi vs. reference 'acc' and 'acc_norm' (one row per key of the DoReMi dict).
plot(doremi_data_100k_proxy_120k_ckp, reference_data_120k)

Unnamed: 0,Task,DoReMi ACC,Reference ACC,DoReMi AccNorm,Reference AccNorm
0,arc:challenge,0.183,0.196,0.235,0.238
1,arc:easy,0.418,0.422,0.419,0.408
2,commonsense_qa,0.252,0.275,0.263,0.258
3,hellaswag,0.301,0.297,0.33,0.326
4,mmlu:abstract_algebra,0.23,0.21,0.24,0.24
5,mmlu:anatomy,0.244444,0.214815,0.251852,0.266667
6,mmlu:astronomy,0.243421,0.230263,0.302632,0.276316
7,mmlu:business_ethics,0.41,0.43,0.35,0.3
8,mmlu:clinical_knowledge,0.260377,0.215094,0.309434,0.316981
9,mmlu:college_biology,0.270833,0.25,0.25,0.215278
