## Aggregating Robert's data

The point of this notebook is to aggregate Robert's data into one joined df for easy access, which we save to parquet.

Next, the `bootstrap.py` script should be run to bootstrap confidence intervals.

Then, `plot_roberts_data.ipynb` should be run to recreate the plots.

Robert has two sets of human data:
- For the 2018 NeurIPS paper, they didn't yet plan to calculate EC, so the humans didn't all see the same images.
- For the closing-the-gap paper, they re-ran the experiment, this time showing the same images to all humans.

So we can't use the 2018 data at all, and the model data for the closing-the-gap paper is not in the public repo.

Robert gave us this data. This notebook assumes that it is located in a folder called `modelvshuman-raw-data`, which should exist at the same top-level location as our repo; the code below loads it from `../../../modelvshuman-raw-data/`, relative to this notebook.

In [None]:
import os
import sys
from os.path import join as pjoin
import numpy as np
import pandas as pd

sys.path.append(os.path.abspath(".."))
from utils import filter_df

In [None]:
def get_eidolon_condition(eid_condition: str) -> str:
    """
    Map an eidolon condition string onto its log2 level.

    The text before the first "-" is parsed as an integer; the truncated
    base-2 logarithm of that integer is returned as a string,
    e.g. "8-10-10" -> "3".

    :param eid_condition: raw condition string whose first "-"-separated
        field is an integer
    :return: the log2 level, as a string
    """
    leading_field, _, _ = eid_condition.partition("-")
    level = int(np.log2(int(leading_field)))
    return str(level)


def fix_conditions(row):
    """
    Normalise the ``condition`` value of one row.

    For experiments whose name contains "eidolon", the condition string is
    converted to its log2 level via ``get_eidolon_condition``. Afterwards,
    any missing condition (NaN, None, pd.NA or the literal string "nan") is
    replaced by ``0``. All other conditions are returned unchanged.

    :param row: one row of the dataframe; must provide "experiment" and
        "condition"
    :return: the cleaned condition
    """
    exp = row["experiment"]
    condition = row["condition"]

    # first, convert eidolon condition strings to their log2 level
    if "eidolon" in exp:
        condition = get_eidolon_condition(condition)

    # second, remove nans.
    # pd.isna is checked first and covers float NaN, None and pd.NA. A direct
    # `condition == np.nan` comparison is never True (NaN does not compare
    # equal to anything) and, for pd.NA, evaluates to pd.NA, whose truth
    # value raises a TypeError.
    if pd.isna(condition) or condition == "nan":
        condition = 0

    return condition


def get_img_identifier(row: pd.Series) -> str:
    """
    Build an image identifier of the form
    ``<experiment>_<condition>_<image id>`` for one row.

    For the "cue-conflict", "edge", "silhouette", "sketch" and "stylized"
    experiments, the image id is the last "_"-separated field of the image
    name with the ".png" suffix removed. For every other experiment the
    image name must consist of exactly 8 "_"-separated fields, and the image
    id is ``<field 7>-<field 8 without ".png">``.

    :param row: one row of the dataframe; must provide "imagename",
        "experiment" and "condition"
    :return: the identifier string
    """
    last_field_experiments = ("cue-conflict", "edge", "silhouette", "sketch", "stylized")

    name = row["imagename"]
    experiment = row["experiment"]
    condition = row["condition"]

    if experiment in last_field_experiments:
        tail = name.rsplit("_", 1)[-1]
        image_key = tail.split(".png", 1)[0]
        return f"{experiment}_{condition}_{image_key}"

    # standard experiments: only the last two of the 8 fields are needed
    fields = name.split("_")
    assert len(fields) == 8, "img_name should have 8 parts"
    cls_id = fields[6]
    stem = fields[7].split(".png", 1)[0]
    image_key = f"{cls_id}-{stem}"

    return f"{experiment}_{condition}_{image_key}"


def aggregate_data(root: str) -> pd.DataFrame:
    """
    Read all per-experiment CSV files below ``root`` into one dataframe.

    Only the direct sub-folders of ``root`` that are named after a known
    experiment are read. Every ``.csv`` file in such a folder is loaded and
    gets two extra columns: "experiment" (the folder name) and "correct"
    (whether "object_response" equals "category"). After concatenation, the
    "condition" column is cleaned with ``fix_conditions``, an
    "img_identifier" column is built with ``get_img_identifier``, and a
    "subject_type" column ("human" / "machine") is derived from "subj".
    The columns "Session", "session", "trial", "rt" and "imagename" are
    dropped. Finally, only humans and the subjects that appear in the
    "colour" experiment are kept.

    Folders and files are visited in sorted order so that the row order of
    the result does not depend on the filesystem's listing order.

    :param root: folder that contains one sub-folder per experiment
    :return: the aggregated dataframe
    :raises FileNotFoundError: if ``root`` is not a readable directory
    :raises ValueError: if no CSV file was found in any experiment folder
    """
    experiments = (
        "colour",
        "contrast",
        "eidolonI",
        "eidolonII",
        "eidolonIII",
        "false-colour",
        "high-pass",
        "low-pass",
        "power-equalisation",
        "rotation",
        "phase-scrambling",
        "uniform-noise",
        "cue-conflict",
        "edge",
        "silhouette",
        "sketch",
        "stylized",
    )

    # os.walk yields nothing for a missing or unreadable folder, which would
    # otherwise surface as a bare StopIteration.
    top_level = next(os.walk(root), None)
    if top_level is None:
        raise FileNotFoundError(
            f"Raw-data folder {root!r} does not exist or is not a readable directory."
        )
    _, subdirs, _ = top_level

    dfs = []
    for exp_dir in sorted(subdirs):
        if exp_dir not in experiments:
            continue
        _, _, filenames = next(os.walk(pjoin(root, exp_dir)))
        for fname in sorted(filenames):
            if not fname.endswith(".csv"):
                continue
            sub_df = pd.read_csv(pjoin(root, exp_dir, fname))
            sub_df["experiment"] = exp_dir
            sub_df["correct"] = sub_df["object_response"] == sub_df["category"]
            dfs.append(sub_df)

    if not dfs:
        raise ValueError(f"No experiment CSV files found below {root!r}.")

    df = pd.concat(dfs)

    df["condition"] = df.apply(fix_conditions, axis=1)
    df["img_identifier"] = df.apply(get_img_identifier, axis=1)
    df["subject_type"] = df["subj"].apply(
        lambda x: "human" if "subject" in x else "machine"
    )
    df = df.drop(columns=["Session", "session", "trial", "rt", "imagename"])

    # On silhouette data and potentially elsewhere, more models were evaluated.
    # Here, I make sure that we have exactly the same set of models in all experiments
    # NOTE(review): humans are matched with "subject-" here but with "subject"
    # for subject_type above -- confirm that both patterns select the same rows.
    color_models = df[df["experiment"] == "colour"]["subj"].unique()
    df = df[(df["subj"].isin(color_models)) | (df["subj"].str.contains("subject-"))]

    return df

In [None]:
# Aggregate all raw CSV files into one dataframe.
df = aggregate_data("../../../modelvshuman-raw-data/")

# properly setting the column types because otherwise pd defaults to using slow comparisons
# "condition" is cast to str first, then to category like the other columns
df["condition"] = df["condition"].astype(str)
for column in [
    "subj",
    "object_response",
    "category",
    "condition",
    "experiment",
    "subject_type",
]:
    df[column] = df[column].astype("category")
df["img_identifier"] = df["img_identifier"].astype("string")

print("Created df with", len(df), "lines!")
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()

# using parquet files to store this, because it respects data types
# make sure the output folder exists before writing into it
os.makedirs("data", exist_ok=True)
df.to_parquet("data/roberts_raw_data.parquet", engine="pyarrow")

In [None]:
# Compute the mean accuracy per experiment, condition and subject (model or human),
# so that the values can be added to the contour plot.
import pandas as pd

df = pd.read_parquet("data/roberts_raw_data.parquet")
df = filter_df(df)

# cast "correct" to float so that its mean is the fraction of correct trials
df["correct"] = df["correct"].astype(float)

group_keys = ["experiment", "condition", "subj"]
# observed=True: only key combinations that actually occur in the data are kept
correct_by_group = df.groupby(group_keys, observed=True)["correct"]
accuracies = correct_by_group.mean().reset_index()

accuracies.to_csv("data/accuracies.csv")