## Bootstrapping Confidence Intervals

This notebook demonstrates how the size of confidence intervals for EC data depends on the number of sampled trials, and on the accuracy delta between the two classifiers.

In [None]:
# Make sure that updates to imported files (e.g. the local `utils` module) are
# immediately available without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

# not using this here because finer control needed
# font = {"size": 15}
# matplotlib.rc("font", **font)

sys.path.append(os.path.abspath(".."))
from utils import fast_cohen, simulate_trials_from_copy_model

In [None]:
def generate_ci_data(
    kappas=(0, 0.25, 0.5, 0.75, 0.95, 0.999),
    accuracies=(0.05, 0.25, 0.5, 0.75, 0.95),
    trial_counts=(100, 200, 400, 800, 1600),
    n_iterations=5000,
):
    """
    Simulates pairs of trial sequences over a parameter grid and records the
    empirical EC of every simulated pair.

    For each combination of ground-truth EC, accuracy and number of trials,
    ``n_iterations`` pairs are drawn with ``simulate_trials_from_copy_model``
    (both sequences use the same accuracy) and scored with ``fast_cohen``.
    Calling the function without arguments reproduces the original grid.

    :param kappas: ground-truth EC values to simulate
    :param accuracies: accuracy values, used for both simulated sequences
    :param trial_counts: numbers of trials per simulated sequence
    :param n_iterations: number of simulated pairs per grid point
    :return: DataFrame with the columns "True EC", "Accuracy", "Trials",
        "Iteration" and "Empirical EC"; rows whose empirical EC is missing
        are dropped
    """
    records = []
    for kappa in kappas:
        for acc in accuracies:
            for n_trials in trial_counts:
                for i in range(n_iterations):
                    trials1, trials2 = simulate_trials_from_copy_model(
                        kappa, acc, acc, n_trials
                    )

                    # A sequence that is entirely truthy or entirely falsy is
                    # recorded as NaN instead of being scored (presumably
                    # because EC is not meaningful without any variation --
                    # TODO confirm against fast_cohen).
                    degenerate = (
                        np.all(trials1)
                        or np.all(trials2)
                        or not np.any(trials1)
                        or not np.any(trials2)
                    )
                    ec = np.nan if degenerate else fast_cohen(trials1, trials2)

                    records.append((kappa, acc, n_trials, i, ec))

    # Build the frame once from the collected records.
    df = pd.DataFrame(
        records,
        columns=["True EC", "Accuracy", "Trials", "Iteration", "Empirical EC"],
    )

    # Drop the simulations for which no EC value was recorded.
    return df[~df["Empirical EC"].isnull()]

In [None]:
# Set to True to re-run the simulation and overwrite the cached CSV;
# leave at False to load the cached results from disk.
generate_data = False
if not generate_data:
    df = pd.read_csv("data/ecdf_equal.csv")
else:
    os.makedirs("data", exist_ok=True)
    df = generate_ci_data()
    df.to_csv("data/ecdf_equal.csv", na_rep="NULL", index=False)

display(df)

In [None]:
# # extracting the number of trials from Robert's data
# robert_raw_df = pd.read_parquet(
#     "../geirhos_analysis/data/roberts_raw_data.parquet", engine="pyarrow"
# )
# human_df = robert_raw_df[robert_raw_df["subject_type"] == "human"]
# human_df.groupby(["experiment", "condition", "subj"], observed=True)["category"].count().unique()

# Wiles et al.:
# words = [3727, 2729, 3901, 3304, 2792, 3298, 2549, 3472]
# print(np.mean(words))

In [None]:
# Display names of the reference works; they serve as dictionary keys and as
# legend labels.
geirhos = "Geirhos et al. 2021"
li = "Li et al. 2025"
wiles = "Wiles et al. 2024"
ollikka = "Ollikka et al. 2024"

# x-axis positions (trial counts) that are marked for each reference work
literature_sizes = {
    geirhos: [160, 320, 560, 640, 800, 1_280],
    li: [600],  # seems like the fairer comparison to Robert
    wiles: [3_222],
    ollikka: [147],
}

# matplotlib marker symbol per reference work
symbols = dict(
    [
        (geirhos, "v"),  # triangle down
        (li, "^"),  # triangle up
        (ollikka, "o"),  # circle
        (wiles, "p"),  # pentagon
    ]
)

# one "mako" color per reference work, in the key order of literature_sizes
cmap = sns.color_palette("mako", n_colors=len(literature_sizes), as_cmap=False)
colors = dict(zip(literature_sizes, cmap))

In [None]:
def put_markers_on_x_axis(ax, annotations):
    """
    Draws one marker per annotated x value on the lower edge of the axes and
    adds a "Reference Works" legend below the plot.

    Marker symbol and color are looked up in the module-level ``symbols`` and
    ``colors`` dictionaries; unknown labels fall back to a black circle.

    :param ax: the matplotlib axes to annotate
    :param annotations: dict mapping a label to the list of x values to mark
    """
    # The markers are placed at the current lower y-limit of the axes.
    lower_limit, _ = ax.get_ylim()
    y_marker = lower_limit

    marker_handles = []
    for name, positions in annotations.items():
        marker_lines = ax.plot(
            positions,
            [y_marker for _ in positions],
            symbols.get(name, "o"),
            color=colors.get(name, "black"),
            label=name,
            zorder=10,
            clip_on=False,  # markers sit on the axes edge and must not be cut off
        )
        marker_handles.append(marker_lines[0])
    marker_labels = list(annotations)

    # Horizontal legend with one column per reference work, centered below
    # the axes.
    reference_legend = ax.legend(
        marker_handles,
        marker_labels,
        loc="upper center",
        bbox_to_anchor=(0.5, -0.15),
        ncol=len(marker_handles),
        title="Reference Works",
    )
    # Register the new legend as an artist on the axes.
    ax.add_artist(reference_legend)


# plotting this as pointplots with 95% PIs
def plot_cis(df, save=False):
    """
    Draws one point plot per ground-truth EC value, showing the empirical EC
    with 95% percentile intervals against the number of trials, with one hue
    per accuracy level.

    The x axis is additionally annotated with the trial counts stored in the
    module-level ``literature_sizes`` dictionary.

    :param df: DataFrame with the columns "True EC", "Trials", "Accuracy"
        and "Empirical EC"
    :param save: if True, each figure is written to
        ``figures/pointplot_<True EC>_standard.pdf``
    """
    for gt_ec in df["True EC"].unique():
        # .assign() returns a new frame, so the numeric conversion neither
        # writes into a slice of the caller's DataFrame nor triggers pandas'
        # SettingWithCopyWarning.
        pdf = df[df["True EC"] == gt_ec].assign(
            Trials=lambda frame: pd.to_numeric(frame["Trials"], errors="coerce")
        )
        fig, ax = plt.subplots(1, 1, figsize=(12, 5))
        # ax.set_title(f"True EC: {gt_ec}")
        ax.grid(axis="y")
        sns.pointplot(
            data=pdf,
            errorbar=("pi", 95),
            capsize=0.1,
            x="Trials",
            y="Empirical EC",
            hue="Accuracy",
            dodge=0.4,
            linestyle="none",
            legend=True,
            log_scale=(True, False),
            native_scale=True,
            ax=ax,
        )
        ax.set_ylim(
            -0.2, 1.05
        )  # using -0.2 for the paper figure only, better value here is -0.4
        sns.despine()

        # manually setting x-tick labels to the simulated trial counts
        old_ticks = np.sort(pdf["Trials"].unique())
        ax.set_xticks(old_ticks, old_ticks)
        ax.tick_params(axis="x", labelsize=13)

        # Re-add the hue legend as a plain artist so that it stays visible
        # after put_markers_on_x_axis() creates a second legend.
        original_legend = ax.get_legend()
        if original_legend is not None:
            ax.add_artist(original_legend)

        ax.set_ylabel("Empirical EC", fontsize=15)
        ax.set_xlabel("Number of Trials", fontsize=15)

        # annotate x axis with literature results
        put_markers_on_x_axis(ax, literature_sizes)
        plt.tight_layout()
        # leave room below the axes for the reference legend
        fig.subplots_adjust(bottom=0.3)
        if save:
            os.makedirs("figures", exist_ok=True)
            fig.savefig(f"figures/pointplot_{gt_ec}_standard.pdf")
        plt.show()
        plt.close(fig)

In [None]:
# Make different versions of figure 3, relating CI size to number of trials:
# plot_cis draws one figure per ground-truth EC value and, with save=True,
# writes each of them to the "figures" directory.

# NOTE: the legend with references doesn't show up here, it's only in the pdf
plot_cis(df, save=True)

In [None]:
# Exploratory point plot, just to understand the data: ground-truth EC on the
# x axis, empirical EC with 95% percentile intervals on the y axis, one hue
# per number of trials.
fig, ax = plt.subplots(figsize=(8, 5))
ax.grid(axis="y")
sns.pointplot(
    data=df,
    x="True EC",
    y="Empirical EC",
    hue="Trials",
    ax=ax,
    errorbar=("pi", 95),
    capsize=0.2,
    dodge=0.4,
    linestyle="none",
)
sns.despine()

In [None]:
# Plot ground-truth EC against empirical EC with 95% percentile intervals,
# one line per number of trials.
fig, ax = plt.subplots(1, 1, figsize=(8, 5))
ax.grid(axis="y")
sns.lineplot(
    data=df,
    x="True EC",
    y="Empirical EC",
    hue="Trials",
    errorbar=("pi", 95),
    ax=ax,
)
ax.set_xlim(0.0, 0.95)
ax.set_ylim(-0.2, 1.0)
sns.despine()

plt.tight_layout()
# Make sure the output directory exists, so this cell also works when no
# earlier cell has created it.
os.makedirs("figures", exist_ok=True)
fig.savefig("figures/ci_size_plot.pdf")
plt.show()
plt.close(fig)

In [None]:
import matplotlib.lines as mlines

font = {"size": 15}
matplotlib.rc("font", **font)

# normalized version of this figure: deviation of the empirical EC from the
# ground-truth EC
norm_df = df.copy()
norm_df["delta"] = norm_df["Empirical EC"] - norm_df["True EC"]

# One explicit color per number of trials, sampled at evenly spaced positions
# of the colormap. The same mapping is handed to seaborn and to the manual
# legend, so that plot and legend agree by construction. (With a numeric hue
# and a palette *name*, seaborn instead maps colors continuously over the
# value range, which does not match evenly spaced legend colors.)
# sns.color_palette(..., as_cmap=True) is used for the lookup because
# matplotlib.cm.get_cmap was removed in Matplotlib 3.9.
palette_name = "crest"
hue_levels = sorted(norm_df["Trials"].unique().tolist())
n_levels = len(hue_levels)
colormap = sns.color_palette(palette_name, as_cmap=True)
palette = [tuple(rgba) for rgba in colormap(np.linspace(0, 1, n_levels))]
hue_colors = dict(zip(hue_levels, palette))

# plotting ground-truth EC against CI size, for different number of trials
fig, ax = plt.subplots(1, 1, figsize=(12, 5))
ax.grid(axis="y")
sns.lineplot(
    data=norm_df,
    x="True EC",
    y="delta",
    hue="Trials",
    palette=hue_colors,
    errorbar=("pi", 95),
    linestyle="none",  # only the interval bands are of interest here
    legend=False,
    ax=ax,
)

# Manually create legend handles, because the automatic legend is disabled
handles = [
    mlines.Line2D([], [], color=hue_colors[hue], marker="o", linestyle="None", label=hue)
    for hue in hue_levels
]

# Add the custom legend
ax.legend(handles=handles, title="Trials")

ax.set_ylabel("Delta between empirical and true EC")
ax.set_xlim(0.0, 1)
ax.set_ylim(-0.4, 0.4)
yticks = np.arange(-0.4, 0.41, 0.2)  # ticks from -0.4 to 0.4 in steps of 0.2
ax.set_yticks(yticks)
sns.despine()
# Fix confidence interval alpha
for collection in ax.collections:
    collection.set_alpha(0.8)
plt.tight_layout()
# Make sure the output directory exists, so this cell also works when no
# earlier cell has created it.
os.makedirs("figures", exist_ok=True)
fig.savefig("figures/ci_size_plot_centered.pdf")
plt.show()
plt.close(fig)

## Demonstrating how CI width changes as a function of M, the number of bootstraps

In [None]:
from tqdm import tqdm


def bootstrap(trials):
    """
    Draws one bootstrap sample of the trials and returns its EC value.

    The columns of ``trials`` are resampled with replacement (as many draws
    as there are columns), and ``fast_cohen`` is evaluated on the first two
    rows of the resampled array.

    :param trials: the trials (2, N)
    """
    _n_rows, n_columns = trials.shape

    # Resample the column indices with replacement; both rows use the same
    # indices, so the pairing between the two rows is preserved.
    picked = np.random.choice(n_columns, size=n_columns, replace=True)
    resampled = trials[:, picked]

    return fast_cohen(resampled[0, :], resampled[1, :])


def get_ci(ecs):
    """
    Summarizes EC values by their 2.5% quantile, mean and 97.5% quantile.

    :param ecs: list of EC values
    :return: list ``[lower, mean, upper]``
    """
    # 2.5% and 97.5% quantiles, i.e. the limits of the central 95% interval
    bounds = np.quantile(ecs, [0.025, 0.975])
    return [bounds[0], np.mean(ecs), bounds[1]]


# Settings of the single simulated experiment that is bootstrapped below.
kappa = 0.5  # passed as first argument to the copy model (the ground-truth EC)
acc1 = 0.8
acc2 = 0.7
n_trials = 1000
n_bootstraps = 10000
# Row 0 and row 1 hold the two simulated trial sequences.
trials = np.zeros((2, n_trials))
trials[0, :], trials[1, :] = simulate_trials_from_copy_model(
    kappa, acc1, acc2, n_trials
)

# Pool of n_bootstraps bootstrapped EC values, all computed from the same
# simulated trials.
ecs = np.array([bootstrap(trials) for i in range(n_bootstraps)])

# For every odd sample size j in [1, n_bootstraps), draw j values with
# replacement from the pool, n_sims times each, and record the 2.5% quantile,
# mean and 97.5% quantile of every draw.
lowers = []
means = []
uppers = []
bootstraps = []
n_sims = 100  # how many times to simulate each bootstrap
for j in tqdm(range(1, n_bootstraps, 2)):
    for i in range(n_sims):
        indices = np.array(np.random.choice(n_bootstraps, size=j, replace=True))
        resampled = ecs[indices]
        lower, mean, upper = get_ci(resampled)
        lowers.append(lower)
        means.append(mean)
        uppers.append(upper)
        bootstraps.append(j)

# NOTE: this rebinds `df`, which held the simulation results loaded earlier in
# the notebook; re-running earlier plotting cells after this cell will
# therefore operate on this frame instead.
df = pd.DataFrame(
    {
        "Number of Bootstraps": bootstraps,
        "Lower": lowers,
        "Mean": means,
        "Upper": uppers,
    }
)
display(df)

In [None]:
font = {"size": 16}
matplotlib.rc("font", **font)
fig, ax = plt.subplots(1, 1, figsize=(12, 6))

# One curve per summary statistic: (column, color, extra lineplot arguments).
# Only the first CI limit carries a label, so that "CI Limits" appears once
# in the legend.
curve_specs = [
    ("Mean", "maroon", {"label": "Mean"}),
    ("Lower", "navy", {"label": "CI Limits"}),
    ("Upper", "navy", {}),
]
for column, curve_color, extra_kwargs in curve_specs:
    sns.lineplot(
        data=df,
        x="Number of Bootstraps",
        y=column,
        c=curve_color,
        errorbar=("pi", 95),
        ax=ax,
        **extra_kwargs,
    )

ax.grid(axis="y")
ax.set_ylabel("Error Consistency [Kappa]")
ax.set_xscale("log")
ax.set_ylim(0.4, 0.6)
ax.set_xlim(10, 1e4)
sns.despine()
plt.tight_layout()
# Make sure the output directory exists, so this cell also works when no
# earlier cell has created it.
os.makedirs("figures", exist_ok=True)
fig.savefig("figures/bootstrap_convergence.pdf")
plt.show()
plt.close(fig)