## Model Differences

The point of this notebook is to evaluate to what extent EC is a suitable measure for finding differences between models. Are the differences between DNN models large enough to support claims about their similarity to humans?

We will do this by first loading Robert's original df, and filtering for only those conditions that he actually includes in his analysis.

Then, we will loop over all models, and for every model, loop over all conditions. We then bootstrap and aggregate ECs just like we did for the main figure 4.

This is done in `bootstrap_models.py` and we plot the results here.


In [None]:
from os.path import join as pjoin
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List
import matplotlib

# Raise the default font size for every matplotlib figure drawn below.
font = dict(size=15)
matplotlib.rc("font", **font)

In [None]:
def get_mean_df(fname: str, experiments: List[str]):
    """
    Load a bootstrap-result-df and aggregate it in two stages, like Robert does:
    first the mean within each experiment, then the mean across experiments.
    (Author's note: the order of averaging *within* an experiment is irrelevant,
    "first conditions then humans" == "first humans then conditions" == "all at once".)

    :param fname: path to parquet file
    :param experiments: list of experiment names to include, or None if all should be included

    :return: tuple ``(mean_df, standard_df, exp_mean_df)``:
        ``standard_df`` is the loaded frame restricted to the requested experiments,
        ``exp_mean_df`` holds the numeric-column means per (bootstrap_id, experiment, model),
        ``mean_df`` holds the means of ``exp_mean_df`` per (bootstrap_id, model).
    """

    def _group_mean(frame, keys):
        # mean of all numeric columns per group, with the keys kept as regular columns
        return frame.groupby(keys, observed=True, as_index=False).mean(
            numeric_only=True
        )

    # read the raw bootstrap results
    loaded_df = pd.read_parquet(fname, engine="pyarrow")

    # None means "keep every experiment that occurs in the file"
    wanted = loaded_df["experiment"].unique() if experiments is None else experiments
    keep_rows = loaded_df["experiment"].isin(wanted)
    standard_df = loaded_df[keep_rows]

    # stage 1: average within each experiment
    exp_mean_df = _group_mean(standard_df, ["bootstrap_id", "experiment", "model"])

    # stage 2: average across the experiments
    mean_df = _group_mean(exp_mean_df, ["bootstrap_id", "model"])

    return mean_df, standard_df, exp_mean_df

In [None]:
# Bootstrapped model-human ECs aggregated over all experiments
# (experiments=None keeps every experiment found in the file).
# get_mean_df returns (mean_df, standard_df, exp_mean_df).
mean_df, standard_df, exp_mean_df = get_mean_df(
    pjoin("data", "model_wise_bootstrapped_ecs_standard_10000.parquet"), None
)

In [None]:
# Bootstrapped model-human ECs aggregated over only the core experiments.
# Only the final per-(bootstrap, model) means are kept; the other two return
# values of get_mean_df are discarded.
core_mean_df, _, _ = get_mean_df(
    pjoin("data", "model_wise_bootstrapped_ecs_standard_10000.parquet"),
    [
        "sketch",
        "eidolonII",
        "power-equalisation",
        "cue-conflict",
        "colour",
        "high-pass",
        "false-colour",
        "phase-scrambling",
        "rotation",
    ],
)

In [None]:
# Now do the same two-stage aggregation for the human-human ECs.
human_df = pd.read_parquet(
    pjoin("data", "bootstrapped_human_ecs_standard_10000.parquet"),
    engine="pyarrow",
)

# stage 1: average within each experiment, like Robert does
human_mean_df = human_df.groupby(
    ["bootstrap_id", "experiment"], observed=True, as_index=False
).mean(numeric_only=True)
# constant label column, used as the x-category when plotting humans
human_mean_df["name"] = "Humans"

# stage 2: average across the experiments, like Robert does
human_final_df = human_mean_df.groupby(
    ["bootstrap_id"], observed=True, as_index=False
).mean(numeric_only=True)
human_final_df["name"] = "Humans"
print("Mean human-human EC:", human_final_df["human-human-ec"].mean())

In [None]:
def plot_df(
    df: pd.DataFrame,
    hdf: pd.DataFrame,
    ylim_min: float,
    ylim_max: float,
    name: str,
    show_original: bool = True,
    show_models: bool = True,
    name_order=None,
    original_df=None,
    original_hdf=None,
):
    """
    Plot the ECs with CIs of models and humans based on bootstrapped data.

    The figure is saved to ``figures/model_comparison_{name}.pdf`` and shown.

    :param df: the df with columns bootstrap_id, model, and model-human-ec
    :param hdf: the corresponding df for humans, with columns name and human-human-ec
    :param ylim_min: lower y-limit of the plot
    :param ylim_max: upper y-limit of the plot
    :param name: only used in the filename of the pdf
    :param show_original: whether to show the values of bootstrap 0 as x's
    :param show_models: whether to label the x-axis with model indices
    :param name_order: sequence of model names; a model's position in it is used as
        its x-tick label. If None, models are labelled by their rank in this plot.
    :param original_df: model frame from which the bootstrap-0 markers are taken when
        show_original is True. If None, the notebook-level ``mean_df`` is used
        (this was the previous, implicit behaviour).
    :param original_hdf: human frame from which the bootstrap-0 marker is taken when
        show_original is True. If None, the notebook-level ``human_final_df`` is used.

    :return: index of model names, sorted by descending mean model-human-ec in ``df``
    """

    if name_order is not None:
        name_map = {o: i for i, o in enumerate(name_order)}

    # order models by mean of the bootstraps
    order = (
        df.groupby("model", observed=True)["model-human-ec"]
        .mean()
        .sort_values(ascending=False)
        .index
    )

    fig, ax = plt.subplots(1, 1, figsize=(12, 5))
    ax.grid(axis="y")

    # plot humans first; the tick labels below rely on humans being the first category
    sns.pointplot(
        data=hdf,
        estimator=np.mean,
        errorbar=("pi", 95),
        capsize=0.4,
        x="name",
        y="human-human-ec",
        legend=False,
        color="maroon",
        ax=ax,
    )

    # plot the bootstrap mean and 95% percentile interval for every model
    sns.pointplot(
        data=df,
        palette="mako",
        estimator=np.mean,
        errorbar=("pi", 95),
        x="model",
        y="model-human-ec",
        hue="model",
        legend=False,
        linestyle="none",
        capsize=0.4,
        order=order,
        hue_order=order,
        ax=ax,
    )

    if show_original:
        # Explicit reference frames take precedence; otherwise fall back to the
        # notebook-level frames that this function has always used here.
        ref_df = mean_df if original_df is None else original_df
        ref_hdf = human_final_df if original_hdf is None else original_hdf

        # plot Geirhos-datapoints (bootstrap 0) for models
        sns.pointplot(
            data=ref_df[ref_df["bootstrap_id"] == 0],
            x="model",
            y="model-human-ec",
            estimator=np.mean,
            errorbar=None,
            markers="x",
            color="blue",
            legend=False,
            linestyle="none",
            order=order,
            ax=ax,
        )

        # plot Geirhos-datapoints (bootstrap 0) for humans
        sns.pointplot(
            data=ref_hdf[ref_hdf["bootstrap_id"] == 0],
            x="name",
            y="human-human-ec",
            estimator=np.mean,
            errorbar=None,
            markers="x",
            color="blue",
            legend=False,
            linestyle="none",
            ax=ax,
        )

    ax.set_ylim(ylim_min, ylim_max)
    # NOTE: ticks are fixed, so limits outside [0.1, 0.5] get no extra ticks/grid lines
    ax.set_yticks([0.1, 0.2, 0.3, 0.4, 0.5])
    ax.set_xlabel("Models")

    if show_models:
        if name_order is not None:
            model_labels = [name_map[o] for o in order]
        else:
            model_labels = list(range(len(order)))
        # "H" labels the human point, followed by one label per model
        ax.set_xticklabels(labels=["H"] + model_labels, fontdict={"size": 10})
    else:
        ax.set_xticklabels([])

    ax.set_ylabel("Error Consistency to Humans [Kappa]")
    sns.despine()

    plt.tight_layout()
    plt.savefig(pjoin("figures", f"model_comparison_{name}.pdf"), bbox_inches="tight")
    plt.show()

    return order

## Aggregating like Robert does

If one defines Geirhos Error Consistency as the mean EC over all conditions and bootstraps all the way through, i.e. obtains a final mean for every bootstrap and then reports a confidence interval over these means, the picture already looks a bit dubious: the CIs of the first 15 models overlap, and the CIs of a long sequence of models in the middle overlap as well, as shown here:

In [None]:
# Plot the fully aggregated means; the returned model ranking is reused below
# as the reference numbering of the models.
robert_mean_order = plot_df(
    df=mean_df,
    hdf=human_final_df,
    ylim_min=0.1,
    ylim_max=0.5,
    name="mean",
    show_original=False,
    show_models=True,
)

# lookup table: index shown on the x-axis -> model name
model_appdx_df = pd.DataFrame(
    {"Index": np.arange(len(robert_mean_order)), "Model Name": robert_mean_order}
)


def format_func(x: str) -> str:
    """
    Escape underscores so that a string can be used as LaTeX text.

    :param x: raw string, e.g. a model name
    :return: x with every underscore preceded by a backslash
    """
    # "\\_" is a literal backslash followed by an underscore; the unescaped
    # form "\_" is an invalid escape sequence and triggers a warning.
    return x.replace("_", "\\_")


# Print the index -> model table as LaTeX source (underscores in model names
# are escaped by format_func), then show the same table inline.
latex_options = dict(
    index=False,
    formatters={"Model Name": format_func},
    float_format="{:.1f}".format,
)
latex = model_appdx_df.to_latex(**latex_options)
print(latex)
display(model_appdx_df)
print("Robert's order:", robert_mean_order)

## Only plotting the core experiments

Here we plot the means obtained from filtering experiments, but still plot the reference values we'd be getting by using all experiments. 

In [None]:
# Model means come from the core experiments only; the human frame passed here
# is the one computed in the human-aggregation cell above.
plot_df(
    df=core_mean_df,
    hdf=human_final_df,
    ylim_min=0,
    ylim_max=0.5,
    name="mean_core",
    show_original=False,
    show_models=False,
)

## Without hiding the variance

But what this aggregation hides is that these means themselves were obtained by averaging over values with a very wide spread, so the true variance in the data looks more like this:

In [None]:
# Same plot, but fed with the per-experiment means instead of the final means,
# so the intervals reflect the spread across experiments. Models keep the
# index labels they were given by robert_mean_order.
all_agg_order = plot_df(
    df=exp_mean_df,
    hdf=human_mean_df,
    ylim_min=-0.1,
    ylim_max=0.6,
    name="all",
    show_original=False,
    show_models=True,
    name_order=robert_mean_order,
)

# compare the two rankings
print(robert_mean_order)
print(all_agg_order)

## Root Cause

By plotting the distributions of EC values obtained by the different experiments, we see where the variance is coming from. We do this for one exemplary model.

In [None]:
# rows of standard_df belonging to the one exemplary model
model_df = standard_df[(standard_df["model"] == "clip")].reset_index()

# one density curve of model-human EC per experiment
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
sns.kdeplot(
    data=model_df, x="model-human-ec", hue="experiment", fill=False, legend=True, ax=ax
)
sns.despine()
ax.set_xlabel("Error Consistency to Humans [kappa]")
ax.set_xlim(-0.2, 0.8)
plt.tight_layout()
plt.savefig(pjoin("figures", "experiment_distributions.pdf"), bbox_inches="tight")
plt.show()

We can now make the same point that we make in the Alignment of Alignment paper: If you want to aggregate over different distributions that are not centered at the same location, you cannot simply take their mean. It would probably be better to z-transform values first.

We can also take a look at the distribution for humans, which is noticeably tighter.

In [None]:
# one density curve of human-human EC per experiment, drawn on the same x-range
# as the model plot so that the two figures can be compared directly
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
sns.kdeplot(
    data=human_mean_df,
    x="human-human-ec",
    hue="experiment",
    fill=False,
    legend=True,
    ax=ax,
)
sns.despine()
ax.set_xlabel("Error Consistency to Humans [kappa]")
ax.set_xlim(-0.2, 0.8)
plt.tight_layout()
plt.savefig(
    pjoin("figures", "experiment_human_distributions.pdf"), bbox_inches="tight"
)
plt.show()

## Checking the distributions

Earlier, we had a bug in the plotting code which made it look like Robert's value differs from the mean value of the bootstraps. It's fixed now, but in the process I looked at the shape of the distributions of EC values, finding that they are more or less normally distributed.

In [None]:
# Compare, per model, the distribution of bootstrapped mean ECs (blue) with a
# fitted Gaussian (red); the vertical line marks the value of bootstrap 0.

from scipy.stats import norm

ncols = 4
nrows = int(np.ceil(len(mean_df["model"].unique()) / ncols))
fig, axes = plt.subplots(
    nrows=nrows,
    ncols=ncols,
    figsize=(ncols * 3.5, nrows * 3.5),
    sharex=True,
    sharey=True,
)
flatax = axes.flatten()

# fixed seed so that the sampled reference Gaussians are identical on every run
rng = np.random.default_rng(0)

# the loop variable is deliberately not called `model_df`, so the frame of the
# same name defined in an earlier cell is not overwritten
for ax, (model, model_boot_df) in zip(
    flatax, mean_df.groupby("model", observed=True)
):

    # plot the distribution of original data
    sns.kdeplot(data=model_boot_df, x="model-human-ec", color="blue", fill=True, ax=ax)

    # plot the best fitting gaussian (as a KDE over samples drawn from it)
    mu, sigma = norm.fit(model_boot_df["model-human-ec"].values)
    gaussian = rng.normal(loc=mu, scale=sigma, size=1000)
    sns.kdeplot(data=gaussian, color="red", ax=ax)
    ax.set_title(model)

    # mark the bootstrap-0 value, if this model has one
    original_value = model_boot_df.loc[
        model_boot_df["bootstrap_id"] == 0, "model-human-ec"
    ]
    if not original_value.empty:
        ax.axvline(x=original_value.iloc[0])

sns.despine(fig=fig)

## Plotting CIs for all individual experiments

I think it might be cool to be able to say "look, every individual experiment has CIs like this, only by averaging all of them do you get something with small CIs".

In [None]:
# per-(bootstrap, experiment, model) means of the numeric columns of standard_df
exp_mean_df = standard_df.groupby(
    ["bootstrap_id", "experiment", "model"], observed=True, as_index=False
).mean(numeric_only=True)

show_models = True
# reuse the model numbering of the overall ranking so that labels are comparable
# across figures; set to None to label models by their rank within each plot
name_order = robert_mean_order
name_map = None if name_order is None else {o: i for i, o in enumerate(name_order)}

for exp, exp_df in exp_mean_df.groupby("experiment", observed=True):

    # order models by mean of the bootstraps
    order = (
        exp_df.groupby("model", observed=True)["model-human-ec"]
        .mean()
        .sort_values(ascending=False)
        .index
    )

    fig, ax = plt.subplots(1, 1, figsize=(12, 5))
    ax.grid(axis="y")

    # plot CIs
    sns.pointplot(
        data=exp_df,
        palette="mako",
        estimator=np.mean,
        errorbar=("pi", 95),
        x="model",
        y="model-human-ec",
        hue="model",
        legend=False,
        linestyle="none",
        capsize=0.4,
        order=order,
        hue_order=order,
        ax=ax,
    )

    ax.set_xlabel("Models")
    n_conditions = len(
        standard_df[standard_df["experiment"] == exp]["condition"].unique()
    )
    ax.set_title(f"{exp} ({n_conditions} condition{'s' if n_conditions > 1 else ''})")
    if show_models:
        # Only models are drawn in these figures (no human point), so there must be
        # exactly one label per model; a leading "H" would shift every label by one.
        if name_map is not None:
            model_labels = [name_map[o] for o in order]
        else:
            model_labels = list(range(len(order)))
        ax.set_xticklabels(labels=model_labels, fontdict={"size": 10})
    else:
        ax.set_xticklabels([])

    ax.set_ylabel("Error Consistency to Humans [Kappa]")
    sns.despine()

    plt.tight_layout()
    plt.savefig(pjoin("figures", f"model_comparison_{exp}.pdf"), bbox_inches="tight")
    plt.show()
    # release the figure; one is created per experiment
    plt.close(fig)