In [None]:
# Re-import modified modules automatically before each cell is executed, so that
# edits to imported files are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append(os.path.abspath(".."))
from utils import (
    simulate_trials_from_copy_model,
    fast_cohen,
    calc_accuracy_bounds_from_kappa,
    filter_df,
)

In [None]:
# Load the benchmark score table. The cells below rely on a "model_name" column
# and on the columns whose name contains "error_consistency".
all_df = pd.read_csv("data/benchmark_scores.csv")
display(all_df)

In [None]:
# Focus on the error-consistency (EC) columns, drop all rows where we don't have
# values, remove the ceiling from the per-condition scores and sanity-check the
# result against the aggregate EC column.
mean_ec_col = "Geirhos2021-error_consistency"
relevant_cols = ["model_name"] + [
    col for col in all_df.columns if "error_consistency" in col
]
if mean_ec_col not in relevant_cols:
    # fail early with a readable message instead of deep inside the cleaning code
    raise ValueError(f"Column {mean_ec_col!r} is missing from the benchmark scores.")

# ceiling scores for each metric, taken from Brain-Score tests
ceilings = {
    "colour": 0.41543,
    "contrast": 0.43703,
    "cueconflict": 0.33105,
    "edge": 0.31844,
    "eidolonI": 0.38634,
    "eidolonII": 0.45402,
    "eidolonIII": 0.45953,
    "falsecolour": 0.44405,
    "highpass": 0.44014,
    "lowpass": 0.46888,
    "phasescrambling": 0.44667,
    "powerequalisation": 0.51063,
    "rotation": 0.43851,
    "silhouette": 0.47571,
    "sketch": 0.36962,
    "stylized": 0.50058,
    "uniformnoise": 0.43406,
}

# factor applied to the aggregate EC column in the sanity check at the bottom
# NOTE(review): presumably the ceiling of the aggregate benchmark - confirm the source
mean_ec_ceiling = 0.42899

# all score columns (aggregate + per-condition), in their original order
score_cols = relevant_cols[1:]

# per-condition EC columns mapped to their bare condition name,
# e.g. "Geirhos2021colour-error_consistency" -> "colour"
col_name_map = {
    col: col.split("Geirhos2021")[1].split("-error_consistency")[0]
    for col in score_cols
    if col != mean_ec_col
}

# making the names consistent with Robert's data (the experiment names of that
# data are used to index this frame later on)
col_name_map2 = {
    "cueconflict": "cue-conflict",
    "falsecolour": "false-colour",
    "phasescrambling": "phase-scrambling",
    "powerequalisation": "power-equalisation",
    "uniformnoise": "uniform-noise",
    "lowpass": "low-pass",
    "highpass": "high-pass",
}

# coerce to float and drop rows with NaNs anywhere; the explicit copy after the
# filtering keeps the assignments below from writing into a filtered slice,
# which can trigger pandas' SettingWithCopyWarning
df = all_df[relevant_cols].copy()
for col in score_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")
df = df.dropna(subset=score_cols).copy()

# the constituent EC metrics get their ceiling removed by multiplying with the
# ceiling of the respective condition; the aggregate column is left untouched
for col, condition in col_name_map.items():
    df[col] = df[col] * ceilings[condition]

df = df.rename(columns=col_name_map).rename(columns=col_name_map2)

# from here on, ec_cols holds the renamed per-condition columns
ec_cols = [col for col in df.columns if col not in ("model_name", mean_ec_col)]

# checking if the Geirhos2021-error_consistency column is the average over the other ones
df["raw_ec"] = df[ec_cols].mean(axis=1)
df["delta"] = (df[mean_ec_col] * mean_ec_ceiling - df["raw_ec"]).abs()

print(
    "Mean Delta between real and expected final value:", df["delta"].mean()
)  # 0.0017, good enough
print("There are", len(df), "models.")
display(df)

In [None]:
# First figure: raw error consistency per model (no ceiling correction applied),
# models sorted from highest to lowest value.
mean_raw_ec = df.groupby("model_name")["raw_ec"].mean()
order = mean_raw_ec.sort_values(ascending=False).index

fig, ax = plt.subplots(figsize=(25, 5))
ax.grid(axis="y")

# one point per model; colouring by model as well, following the same order
sns.pointplot(
    data=df,
    x="model_name",
    y="raw_ec",
    hue="model_name",
    order=order,
    hue_order=order,
    palette="mako",
    linestyle="none",
    legend=False,
    ax=ax,
)

# the individual model names are not shown on the x axis
ax.set_xticks([], [])
ax.set(
    xlabel="Model",
    ylabel="Error Consistency to Humans [kappa]",
    ylim=(0, 0.8),
)
ax.tick_params(axis="x", labelrotation=90)
sns.despine()

In [None]:
def estimate_accuracy(acc, kappa, method="lower", eps=0.001):
    """Pick an accuracy from the range that is compatible with `acc` and `kappa`.

    The range is obtained from `calc_accuracy_bounds_from_kappa(acc, kappa)`;
    `method` selects which point of that range is returned.

    Args:
        acc: Accuracy handed to the bounds helper (the caller in this notebook
            passes the average human accuracy).
        kappa: Error consistency handed to the bounds helper.
        method: "lower" returns the lower bound moved inwards by `eps`,
            "upper" returns the upper bound moved inwards by `eps`,
            "middle" returns the midpoint of the two bounds.
        eps: Margin used to move off the bounds; "lower" and "upper" results
            are additionally clipped to [eps, 1 - eps].

    Returns:
        The selected accuracy.

    Raises:
        AssertionError: If the bounds do not satisfy 0 <= lower <= upper.
        RuntimeError: If `method` is not one of the names listed above.
    """

    def clip(val):
        # keep the value at least eps away from 0 and 1
        return min(1 - eps, max(0 + eps, val))

    # get bounds
    lower, upper = calc_accuracy_bounds_from_kappa(acc, kappa)

    assert 0 <= lower <= upper, "Bounds were not sensible!"

    if method == "lower":
        return clip(lower + eps)
    if method == "upper":
        return clip(upper - eps)
    if method == "middle":
        # NOTE(review): unlike the other two methods, the midpoint is returned unclipped
        return lower + (upper - lower) / 2

    # The former "pick the bound that is closer to acc" code that followed here
    # could never run, because every branch above returns or raises.
    raise RuntimeError(f"Method not known! {method}")

In [None]:
# Load Robert's raw trial data and, for every Brain-Score model, simulate
# possible trial data for each experiment/condition.
robert_df = pd.read_parquet(
    "../geirhos_analysis/data/roberts_raw_data.parquet", engine="pyarrow"
)

# keep only humans, then only the relevant conditions
is_human = robert_df["subj"].str.contains("subject-")
robert_df = filter_df(robert_df[is_human])

# bootstrap consistency values, this takes 35 minutes
method = "lower"
n_bootstraps = 1000
res_dfs = []
excluded_models = []
for exp, exp_df in robert_df.groupby("experiment", observed=True):
    for con, con_df in exp_df.groupby("condition", observed=True):

        # the trial count is read off subject-01
        # NOTE(review): assumes subject-01 took part in every condition - confirm
        subject_01_trials = con_df[con_df["subj"] == "subject-01"]
        n_trials = len(subject_01_trials)
        n_subjects = len(con_df["subj"].unique())
        # mean over the trials of all humans
        avg_human_acc = con_df["correct"].mean()

        # loop over all Brain-Score models
        for _, row in df.iterrows():
            name = row["model_name"]
            gt_error_consistency = row[exp]

            # TODO: somewhat dirty fix - a negative EC makes us skip this
            # model/experiment/condition combination and record the model name;
            # only happens for vonenet
            if gt_error_consistency < 0:
                print(f"Excluding model {name} because {exp} {con} had negative EC.")
                excluded_models.append(name)
                continue

            model_acc = estimate_accuracy(avg_human_acc, gt_error_consistency, method)

            # entry 0 is the ground-truth value, the remaining entries are
            # means over one simulated EC per subject
            model_human_ecs = [gt_error_consistency]
            for _boot in range(n_bootstraps - 1):
                subject_ecs = []
                for _subj in range(n_subjects):
                    simulated_trials = simulate_trials_from_copy_model(
                        gt_error_consistency,
                        avg_human_acc,
                        model_acc,
                        n_trials,
                    )
                    subject_ecs.append(fast_cohen(*simulated_trials))

                model_human_ecs.append(np.mean(subject_ecs))

            # one frame of bootstrapped model-human error consistencies per
            # model/experiment/condition
            boot_df = pd.DataFrame(
                {
                    "model": name,
                    "bootstrap_id": np.arange(n_bootstraps),
                    "experiment": exp,
                    "condition": con,
                    "model-human-ec": model_human_ecs,
                }
            )
            res_dfs.append(boot_df)

# join all dfs and store them
res_df = pd.concat(res_dfs)
display(res_df)
res_df.to_parquet(
    f"data/brainscore_bootstrapped_ecs_{n_bootstraps}_{method}.parquet",
    engine="pyarrow",
)