## Model Differences

The point of this notebook is to evaluate to what extent error consistency (EC) is a suitable measure for finding differences between models. Are differences between DNN models large enough to support claims about their similarity to humans?

We will do this by first loading Robert's original df, and filtering for only those conditions that he actually includes in his analysis.

Then, we will loop over all models, and for every model, loop over all conditions. We then bootstrap and aggregate ECs just like we did for the main figure 4.

This is done in `bootstrap_models.py` and we plot the results here.


In [None]:
from os.path import join as pjoin
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Larger default font size for every figure drawn in this notebook.
font = dict(size=15)
matplotlib.rc("font", **font)

In [None]:
# Load our bootstrap results.
# `method` selects the input file here and is also read by the plotting
# helpers further down to name the saved figures.
method = "lower"
# Number of best-scoring models (by mean model-human EC) kept for the plots.
TOP_K = 30

standard_df = pd.read_parquet(
    f"data/brainscore_bootstrapped_ecs_1000_{method}.parquet",
    engine="pyarrow",
)
display(standard_df)

# Robert takes the average EC by first averaging within each experiment, then averaging across them.
# (how you average within each experiment doesn't matter, because first conditions then humans = first humans then conditions = all at once)

# take the average within each of the 12 experiments, like Robert does
exp_mean_df = standard_df.groupby(
    ["bootstrap_id", "experiment", "model"], observed=True, as_index=False
).mean(numeric_only=True)

# take the average across the experiments, like Robert does
mean_df = exp_mean_df.groupby(
    ["bootstrap_id", "model"], observed=True, as_index=False
).mean(numeric_only=True)

# find the top-k models: rank models by their mean EC over all bootstrap samples
topk_models = (
    mean_df.groupby(["model"], observed=True)["model-human-ec"]
    .mean()
    .reset_index()
    .nlargest(n=TOP_K, columns=["model-human-ec"])["model"]
    .tolist()
)

# retain only the top-k models
# drop=True: discard the stale row labels instead of keeping them around as
# a meaningless extra "index" column in the filtered frames
topk_df = mean_df[mean_df["model"].isin(topk_models)].reset_index(drop=True)
standard_df = standard_df[standard_df["model"].isin(topk_models)].reset_index(
    drop=True
)

In [None]:
# Repeat the same two-stage aggregation for the human-to-human ECs.
human_df = pd.read_parquet(
    pjoin("../geirhos_analysis/data", "bootstrapped_human_ecs_standard_10000.parquet"),
    engine="pyarrow",
)

# Stage 1: average within each of the 12 experiments, like Robert does.
# The "name" column provides the x-axis category used when plotting.
human_mean_df = (
    human_df.groupby(["bootstrap_id", "experiment"], observed=True, as_index=False)
    .mean(numeric_only=True)
    .assign(name="Humans")
)

# Stage 2: average across the experiments, like Robert does.
# The label is re-attached because the numeric-only mean drops it.
human_final_df = (
    human_mean_df.groupby(["bootstrap_id"], observed=True, as_index=False)
    .mean(numeric_only=True)
    .assign(name="Humans")
)
print("Mean human-human EC:", human_final_df["human-human-ec"].mean())

In [None]:
def plot_df(df, hdf, ylim_min, ylim_max, name, show_original=True):
    """Plot each model's EC to humans next to the human-human baseline.

    For every model the mean "model-human-ec" is drawn together with a 95%
    percentile interval; models are ordered by decreasing mean. The human
    baseline from ``hdf`` is drawn the same way in maroon. The figure is
    written to ``figures/brainscore_model_comparison_{name}_{method}.pdf``
    (``method`` is the notebook-level variable) and then shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Model data; must contain the columns "model" and "model-human-ec".
    hdf : pandas.DataFrame
        Human data; must contain the columns "name" and "human-human-ec".
    ylim_min, ylim_max : float
        Lower and upper limit of the y-axis.
    name : str
        Tag inserted into the output file name.
    show_original : bool, default True
        If True, additionally mark the rows with ``bootstrap_id == 0`` of the
        notebook-level frames ``topk_df`` and ``human_final_df`` with a blue
        "x" (regardless of what was passed as ``df`` / ``hdf``).
        NOTE(review): presumably bootstrap id 0 holds the original,
        non-resampled values -- confirm against ``bootstrap_models.py``.
    """
    order = (
        df.groupby("model", observed=True)["model-human-ec"]
        .mean()
        .sort_values(ascending=False)
        .index
    )
    fig, ax = plt.subplots(
        1, 1, figsize=(12, 5)
    )  # 12, 5 if you're not plotting names at 90° rotation
    ax.grid(axis="y")

    # Every seaborn call below gets ax=ax explicitly so that the function
    # never depends on which axes pyplot currently considers "active".

    # plot humans
    sns.pointplot(
        data=hdf,
        estimator=np.mean,
        errorbar=("pi", 95),
        capsize=0.4,
        x="name",
        y="human-human-ec",
        legend=False,
        color="maroon",
        ax=ax,
    )

    # plot CIs
    sns.pointplot(
        data=df,
        palette="mako",
        estimator=np.mean,
        errorbar=("pi", 95),
        x="model",
        y="model-human-ec",
        hue="model",
        legend=False,
        linestyle="none",
        capsize=0.4,
        order=order,
        hue_order=order,
        ax=ax,
    )

    if show_original:
        # plot Geirhos-datapoints for models
        sns.pointplot(
            data=topk_df[topk_df["bootstrap_id"] == 0],
            x="model",
            y="model-human-ec",
            estimator=np.mean,
            errorbar=None,
            markers="x",
            color="blue",
            legend=False,
            linestyle="none",
            order=order,
            ax=ax,
        )

        # plot Geirhos-datapoints for humans
        # The frame is stacked onto itself before filtering, so every selected
        # row reaches seaborn twice; the plotted mean is unaffected.
        # NOTE(review): presumably a workaround for single-row input -- confirm.
        plt_df = human_final_df.copy()
        plt_df = pd.concat([plt_df, plt_df], ignore_index=True)
        sns.pointplot(
            data=plt_df[plt_df["bootstrap_id"] == 0],
            x="name",
            y="human-human-ec",
            estimator=np.mean,
            errorbar=None,
            markers="x",
            color="blue",
            legend=False,
            linestyle="none",
            ax=ax,
        )

    ax.set_ylim(ylim_min, ylim_max)
    ax.set_xlabel("Models")

    # use either top for paper-version or bottom for debugging
    ax.set_xticklabels([])
    # ax.tick_params(axis="x", labelrotation=90)

    ax.set_ylabel("Error Consistency to Humans [Kappa]")
    sns.despine(ax=ax)

    fig.tight_layout()
    fig.savefig(
        pjoin("figures", f"brainscore_model_comparison_{name}_{method}.pdf"),
        bbox_inches="tight",
    )
    plt.show()

## Aggregating like Robert does

If one defines the Geirhos Error Consistency as the mean EC over all conditions and bootstraps all the way through (i.e., obtains a final mean for every bootstrap sample and then reports a confidence interval over those means), the picture already looks a bit dubious: the CIs of the first 15 models overlap, and so do the CIs of a long run of models in the middle, as shown here:

In [None]:
# Fully aggregated view: one grand-mean EC per model and bootstrap sample.
plot_df(
    df=topk_df,
    hdf=human_final_df,
    ylim_min=0.2,
    ylim_max=0.45,
    name="mean",
    show_original=False,
)

## Without hiding the variance

But what this aggregation hides is that these means themselves were obtained by averaging over values with a very wide spread, so the true variance in the data looks more like this:

In [None]:
# Un-aggregated view: the spread of the values that the means were built from.
plot_df(
    df=standard_df,
    hdf=human_mean_df,
    ylim_min=-0.1,
    ylim_max=0.8,
    name="all",
    show_original=False,
)

## Plotting without humans

Including the humans stretches the y-axis, which makes the models' CIs seem smaller than they are. We therefore also plot the models on their own.

In [None]:
def plot_df_wo_humans(df, hdf, ylim_min, ylim_max, name, show_original=True):
    """Plot each model's EC to humans without drawing any human data.

    For every model the mean "model-human-ec" is drawn together with a 95%
    percentile interval; models are ordered by decreasing mean. The figure is
    written to ``figures/brainscore_model_comparison_{name}_{method}.pdf``
    (``method`` is the notebook-level variable) and then shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Model data; must contain the columns "model" and "model-human-ec".
    hdf : pandas.DataFrame
        Ignored. Accepted only so that the signature stays identical to
        ``plot_df`` and existing calls keep working.
    ylim_min, ylim_max : float
        Lower and upper limit of the y-axis.
    name : str
        Tag inserted into the output file name.
    show_original : bool, default True
        If True, additionally mark the rows with ``bootstrap_id == 0`` of the
        notebook-level frame ``topk_df`` with a blue "x". Unlike ``plot_df``,
        no human marker is added: it would put a "Humans" category back on
        the x-axis of a plot whose purpose is to leave humans out.
        NOTE(review): presumably bootstrap id 0 holds the original,
        non-resampled values -- confirm against ``bootstrap_models.py``.
    """
    order = (
        df.groupby("model", observed=True)["model-human-ec"]
        .mean()
        .sort_values(ascending=False)
        .index
    )
    fig, ax = plt.subplots(
        1, 1, figsize=(12, 5)
    )  # 12, 5 if you're not plotting names at 90° rotation
    ax.grid(axis="y")

    # Every seaborn call below gets ax=ax explicitly so that the function
    # never depends on which axes pyplot currently considers "active".

    # plot CIs
    sns.pointplot(
        data=df,
        palette="mako",
        estimator=np.mean,
        errorbar=("pi", 95),
        x="model",
        y="model-human-ec",
        hue="model",
        legend=False,
        linestyle="none",
        capsize=0.4,
        order=order,
        hue_order=order,
        ax=ax,
    )

    if show_original:
        # plot Geirhos-datapoints for models
        sns.pointplot(
            data=topk_df[topk_df["bootstrap_id"] == 0],
            x="model",
            y="model-human-ec",
            estimator=np.mean,
            errorbar=None,
            markers="x",
            color="blue",
            legend=False,
            linestyle="none",
            order=order,
            ax=ax,
        )

    ax.set_ylim(ylim_min, ylim_max)
    ax.set_xlabel("Models")

    # use either top for paper-version or bottom for debugging
    ax.set_xticklabels([])
    # ax.tick_params(axis="x", labelrotation=90)

    ax.set_ylabel("Error Consistency to Humans [Kappa]")
    sns.despine(ax=ax)

    fig.tight_layout()
    fig.savefig(
        pjoin("figures", f"brainscore_model_comparison_{name}_{method}.pdf"),
        bbox_inches="tight",
    )
    plt.show()

In [None]:
# Same aggregated data as the "mean" plot, zoomed in on the model range.
plot_df_wo_humans(
    df=topk_df,
    hdf=human_final_df,
    ylim_min=0.2,
    ylim_max=0.35,
    name="meanwohumans",
    show_original=False,
)