In [1]:
from collections import defaultdict
from itertools import combinations

import pandas as pd
from scipy.stats import ttest_rel
from statsmodels.stats.contingency_tables import mcnemar
from statsmodels.stats.multitest import multipletests

In [2]:
# model_type_results = {
#     "gpt2": "../data/metrics/gpt2-test-metrics.csv",
#     "bart": "../data/metrics/bart-test-metrics.csv",
#     "mistral": "../data/metrics/mistral-lora-test-metrics.csv",
# }

# data_mixture_results = {
#     "gpt2": "../data/metrics/gpt2-test-metrics.csv",
#     "gpt2-100k": "../data/metrics/gpt2-100k-test-metrics.csv",
#     "gpt2-1m": "../data/metrics/gpt2-1m-lora-test-metrics.csv",
# }

# Per-sample metric CSVs for the synthetic-query evaluation, keyed by model name.
comparison_results = dict(
    [
        ("gpt2-1m", "../data/metrics/gpt2-1m-test-metrics.csv"),
        ("gpt-4o", "../data/metrics/gpt-4o-2024-08-06-test-metrics.csv"),
    ]
)

# Per-sample metric CSVs for the manually annotated queries, keyed by model name.
# These samples are paired to the first 200 samples of the other evaluations.
annotated_results = dict(
    [
        ("gpt2-1m-annotated", "../data/metrics/gpt2-1m-annotated-test-metrics.csv"),
        ("gpt-4o-annotated", "../data/metrics/gpt-4o-2024-08-06-annotated-test-metrics.csv"),
    ]
)

In [3]:
# Metric column name -> paired significance test used to compare two models.
# Key order matters: it fixes the order of the entries returned by
# compute_pvals and of the columns returned by compute_means.
METRICS = {
    # Scores compared with a paired t-test.
    **dict.fromkeys(("tpr", "iou"), ttest_rel),
    # compute_pvals tabulates this column as 0/1 outcomes for McNemar's test.
    "exact": mcnemar,
    "bertscore": ttest_rel,
}

def compute_pvals(results, subset_idxs: None | list[int] = None):
    scores = {k: pd.read_csv(v) for k, v in results.items()}
    if subset_idxs is not None:
        scores = {k: v.loc[subset_idxs] for k, v in scores.items()}

    models = sorted(list(scores.keys()))
    combos = list(combinations(models, 2))

    ret = dict()
    for metric, test in METRICS.items():
        ps = []
        for m1, m2 in combos:
            x1 = scores[m1][metric]
            x2 = scores[m2][metric]

            if test == mcnemar:
                a = ((x1 == 0) & (x2 == 0)).sum()
                b = ((x1 == 0) & (x2 == 1)).sum()
                c = ((x1 == 1) & (x2 == 0)).sum()
                d = ((x1 == 1) & (x2 == 1)).sum()
                mat = [
                    [a, b],
                    [c, d],
                ]
                p = test(mat).pvalue
            elif test == ttest_rel:
                p = test(x1, x2).pvalue
            else:
                raise NotImplementedError(f"Unknown test: {test}")

            ps.append(p)
        _, corrected_ps, _, _ = multipletests(ps, method="bonferroni")
        ret[metric] = corrected_ps
    return ret, combos


def compute_means(results, subset_idxs: None | list[int] = None):
    scores = {k: pd.read_csv(v) for k, v in results.items()}
    if subset_idxs is not None:
        scores = {k: v.loc[subset_idxs] for k, v in scores.items()}
    means = defaultdict(dict)
    stds = defaultdict(dict)
    for model, df in scores.items():
        for metric in METRICS:
            means[model][metric] = df[metric].mean()
            stds[model][metric] = df[metric].std()
    return pd.DataFrame(means).T, pd.DataFrame(stds).T

## Main Results

In [4]:
# Per-model mean of each metric over all rows of the synthetic-query results.
means, stds = compute_means(comparison_results)
means

Unnamed: 0,tpr,iou,exact,bertscore
gpt2-1m,0.85542,0.832169,0.702,0.918739
gpt-4o,0.719936,0.697954,0.558,0.89394


In [5]:
# Same summary restricted to rows 0-199 of the synthetic-query results: the
# samples that the manually annotated queries are paired to.
means, stds = compute_means(comparison_results, subset_idxs=list(range(200)))
means

Unnamed: 0,tpr,iou,exact,bertscore
gpt2-1m,0.918684,0.887094,0.78,0.932567
gpt-4o,0.729883,0.718428,0.585,0.898993


In [6]:
# Per-model means on the manually annotated queries (rows 0-199).
means, stds = compute_means(annotated_results, subset_idxs=list(range(200)))
means

Unnamed: 0,tpr,iou,exact,bertscore
gpt2-1m-annotated,0.711426,0.664612,0.52,0.762498
gpt-4o-annotated,0.736271,0.706616,0.5,0.75469


## Statistical Tests

In [7]:
# Bonferroni-corrected p-values per metric over all rows of the
# synthetic-query results; `combos` lists the model pair each p-value refers to.
metrics, combos = compute_pvals(comparison_results)
display(combos)
metrics

[('gpt-4o', 'gpt2-1m')]

{'tpr': array([8.0131364e-37]),
 'iou': array([2.0763773e-36]),
 'exact': array([2.12272171e-37]),
 'bertscore': array([3.57388933e-26])}

In [8]:
# Same tests restricted to rows 0-199 of the synthetic-query results: the
# samples that the manually annotated queries are paired to.
metrics, combos = compute_pvals(comparison_results, subset_idxs=list(range(200)))
display(combos)
metrics

[('gpt-4o', 'gpt2-1m')]

{'tpr': array([2.34363464e-09]),
 'iou': array([4.25282132e-08]),
 'exact': array([1.83236288e-08]),
 'bertscore': array([5.31963606e-05])}

In [9]:
# Paired tests between the two models on the manually annotated queries (rows 0-199).
metrics, combos = compute_pvals(annotated_results, subset_idxs=list(range(200)))
display(combos)
metrics

[('gpt-4o-annotated', 'gpt2-1m-annotated')]

{'tpr': array([0.4816791]),
 'iou': array([0.23427558]),
 'exact': array([0.67780864]),
 'bertscore': array([0.1697562])}