In [None]:
from collections import defaultdict
from itertools import combinations

import pandas as pd
from scipy.stats import ttest_rel
from statsmodels.stats.contingency_tables import mcnemar
from statsmodels.stats.multitest import multipletests

In [None]:
# Each mapping below goes from a display label to the path of a CSV of
# test-set metrics, and has the shape expected by compute_means and
# compute_pvals (label -> CSV path).

# Comparison across model types (GPT-2, BART and a LoRA Mistral run, judging
# by the file names).
model_type_results = {
    "gpt2": "../data/metrics/gpt2-test-metrics.csv",
    "bart": "../data/metrics/bart-test-metrics.csv",
    "mistral": "../data/metrics/mistral-lora-test-metrics.csv",
}

# Comparison between the base "gpt2" run and the "gpt2-100k" run; the "gpt2-1m"
# entry is currently disabled.
data_mixture_results = {
    "gpt2": "../data/metrics/gpt2-test-metrics.csv",
    "gpt2-100k": "../data/metrics/gpt2-100k-test-metrics.csv",
    # "gpt2-1m": "../data/metrics/gpt2-1m-lora-test-metrics.csv",
}

# Comparison between the "gpt2-100k" run and GPT-4o (2024-08-06 snapshot, per
# the file name). This is the mapping the analysis cells below are run on.
comparison_results = {
    "gpt2-100k": "../data/metrics/gpt2-100k-test-metrics.csv",
    "gpt-4o": "../data/metrics/gpt-4o-2024-08-06-test-metrics.csv",
}

In [None]:
# Maps each metric column name to the paired significance test compute_pvals
# applies to it: a paired t-test (ttest_rel) for "tpr", "iou" and "bertscore",
# and McNemar's test for "exact", which compute_pvals treats as a 0/1 outcome.
# compute_means iterates the same keys to choose the columns it summarizes.
METRICS = {
    "tpr": ttest_rel,
    "iou": ttest_rel,
    "exact": mcnemar,
    "bertscore": ttest_rel,
}

def compute_pvals(results):
    """Run pairwise paired significance tests between models for every metric.

    Each CSV in ``results`` is loaded with ``pd.read_csv``. For every metric in
    ``METRICS`` and every unordered pair of model names (names are sorted
    first), the metric's test is applied to the two models' columns, which are
    treated as paired row by row. The p-values of all pairs for one metric are
    then Bonferroni-corrected together.

    Args:
        results: Mapping from model name to the path of that model's metrics
            CSV. Every CSV must contain one column per key of ``METRICS`` and
            all CSVs must have the same number of rows.

    Returns:
        A ``(pvals, combos)`` tuple. ``pvals`` maps each metric name to the
        array of corrected p-values, ordered like ``combos``; ``combos`` is
        the list of ``(model_a, model_b)`` pairs that were compared. With
        fewer than two models ``combos`` is empty and every array is empty.

    Raises:
        ValueError: If two models' CSVs have different numbers of rows.
        NotImplementedError: If ``METRICS`` names a test other than
            ``ttest_rel`` or ``mcnemar``.
    """
    scores = {name: pd.read_csv(path) for name, path in results.items()}

    models = sorted(scores)
    combos = list(combinations(models, 2))

    # Both tests are paired, so the rows of the two tables must correspond
    # one-to-one. Fail early with a clear message instead of relying on each
    # test to notice (the McNemar counts below are built with pandas boolean
    # ops, which do not reject unequal-length columns on their own).
    for m1, m2 in combos:
        n1, n2 = len(scores[m1]), len(scores[m2])
        if n1 != n2:
            raise ValueError(
                f"Paired tests need equally sized results: "
                f"{m1!r} has {n1} rows but {m2!r} has {n2}"
            )

    ret = {}
    for metric, test in METRICS.items():
        ps = []
        for m1, m2 in combos:
            x1 = scores[m1][metric]
            x2 = scores[m2][metric]

            if test is mcnemar:
                # 2x2 contingency table of the paired binary outcomes:
                # rows index model 1's outcome (0, 1), columns model 2's.
                a = ((x1 == 0) & (x2 == 0)).sum()
                b = ((x1 == 0) & (x2 == 1)).sum()
                c = ((x1 == 1) & (x2 == 0)).sum()
                d = ((x1 == 1) & (x2 == 1)).sum()
                mat = [
                    [a, b],
                    [c, d],
                ]
                p = test(mat).pvalue
            elif test is ttest_rel:
                p = test(x1, x2).pvalue
            else:
                raise NotImplementedError(f"Unknown test: {test}")

            ps.append(p)

        if ps:
            _, corrected_ps, _, _ = multipletests(ps, method="bonferroni")
        else:
            # No pairs to compare (fewer than two models): skip the
            # correction, which is not meaningful for zero tests, and return
            # an empty float array of the same kind it would produce.
            corrected_ps = pd.Series(ps, dtype="float64").to_numpy()
        ret[metric] = corrected_ps
    return ret, combos


def compute_means(results):
    """Summarize every metric in ``METRICS`` for each model.

    Args:
        results: Mapping from model name to the path of that model's metrics
            CSV; each CSV is loaded with ``pd.read_csv`` and must contain one
            column per key of ``METRICS``.

    Returns:
        A ``(means, stds)`` pair of DataFrames with one row per model and one
        column per metric, holding the column's ``mean()`` and ``std()``
        (pandas defaults) respectively.
    """
    # Load every table up front so a missing file surfaces before any
    # statistics are computed.
    frames = {name: pd.read_csv(path) for name, path in results.items()}

    mean_table = {}
    std_table = {}
    for name, frame in frames.items():
        for metric in METRICS:
            column = frame[metric]
            mean_table.setdefault(name, {})[metric] = column.mean()
            std_table.setdefault(name, {})[metric] = column.std()

    # The nested dicts are keyed model -> metric, which pandas reads as
    # columns -> rows; transpose so that models become the rows.
    mean_frame = pd.DataFrame(mean_table).T
    std_frame = pd.DataFrame(std_table).T
    return mean_frame, std_frame

In [None]:
# Per-model mean and standard deviation of every metric for the
# gpt2-100k vs. gpt-4o comparison.
means, stds = compute_means(comparison_results)

In [None]:
# Display the table of per-model metric means.
means

In [None]:
# Display the table of per-model metric standard deviations.
stds

In [None]:
# Pairwise significance tests for the same comparison. Despite its name,
# `metrics` holds the Bonferroni-corrected p-values keyed by metric name;
# `combos` lists the model pairs those p-values correspond to, in order.
metrics, combos = compute_pvals(comparison_results)

In [None]:
# Display the model pairs that were compared.
combos

In [None]:
# Display the corrected p-values for each metric.
metrics