## Plot Rankings implied by different experiments

Robert simply averages across experiments, but it would be interesting to show that different experiments imply different rankings of the models.

In [None]:
from os.path import join as pjoin
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def get_ranking_df(fname, colname):
    """Build a model-by-experiment table of per-experiment mean scores.

    The parquet file ``data/<fname>`` is loaded, every numeric column is
    averaged within each (experiment, model, bootstrap_id) group, and the
    averages of ``colname`` are pivoted into wide form::

        model  exp_1  exp_2  exp_3 ...
            A    0.4    0.2    0.2
            B    0.1    0.3    0.2

    Rank correlations between the experiment columns can then be computed
    directly with ``DataFrame.corr``.

    Parameters
    ----------
    fname : str
        File name of the parquet file inside the ``data`` directory. It must
        provide the columns ``experiment``, ``model``, ``bootstrap_id`` and
        ``colname``.
    colname : str
        Name of the numeric score column to place in the cells of the table.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``model`` with one column per ``experiment``.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.pivot`` if an (experiment, model) pair occurs
        more than once after averaging, e.g. when the file holds several
        bootstrap ids.
    """
    standard_df = pd.read_parquet(pjoin("data", fname), engine="pyarrow")

    # Average over conditions within each experiment. bootstrap_id is only a
    # grouping key here, so it is dropped again right after the aggregation.
    experiment_df = (
        standard_df.groupby(
            ["experiment", "model", "bootstrap_id"], observed=True, as_index=False
        )
        .mean(numeric_only=True)
        .drop(columns=["bootstrap_id"])
    )

    # Pivot on the requested column rather than a hard-coded one, so the
    # colname argument is actually honoured.
    return experiment_df.pivot(columns=["experiment"], index="model", values=colname)

In [None]:
# Kendall rank correlation between experiments: every column of rankings_df
# is one experiment and every row one model, so corr() compares the model
# rankings implied by each pair of experiments.
rankings_df = get_ranking_df(
    "model_wise_bootstrapped_ecs_standard_1.parquet", "model-human-ec"
)
corr_matrix = rankings_df.corr(method="kendall", numeric_only=True)

# Plot heatmap; the colour scale is pinned to [-1, 1], the full range of a
# correlation coefficient, so that 0 sits at the centre of the colormap.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", vmin=-1, vmax=1, fmt=".2f", ax=ax)
ax.set_xlabel("Experiment")
ax.set_ylabel("Experiment")

plt.tight_layout()
plt.savefig(pjoin("figures", "model_rankings.pdf"), bbox_inches="tight")
plt.show()

## Plot Rankings implied by different bootstraps

In [None]:
# Load the bootstrap results.
bootstrap_path = pjoin("data", "model_wise_bootstrapped_ecs_standard_10000.parquet")
standard_df = pd.read_parquet(bootstrap_path, engine="pyarrow")

# Robert's average EC is a two-stage mean: first within each experiment,
# then across experiments.
# (How the within-experiment mean is taken should not matter: conditions
# first, humans first, or everything at once give the same value.
# NOTE(review): that equivalence requires equally sized groups - confirm.)

# Stage 1: mean of every numeric column within each experiment, computed
# separately for each bootstrap and model.
exp_mean_df = (
    standard_df
    .groupby(["bootstrap_id", "experiment", "model"], observed=True, as_index=False)
    .mean(numeric_only=True)
)

# Stage 2: mean of the per-experiment means across experiments.
mean_df = (
    exp_mean_df
    .groupby(["bootstrap_id", "model"], observed=True, as_index=False)
    .mean(numeric_only=True)
)

In [None]:
from scipy.stats import spearmanr, kendalltau

# Canonical model list: the models of bootstrap 0, in order of first appearance.
is_reference = mean_df["bootstrap_id"] == 0
models = mean_df.loc[is_reference, "model"].unique().tolist()


# Extract the model-human-ec scores of a frame in a reproducible model order.
def get_sorted_values_from_df(df):
    """Return the ``model-human-ec`` values of ``df`` ordered by ``model``.

    The input frame is left untouched; the values are taken from a copy that
    has been sorted on the ``model`` column.
    """
    ordered = df.sort_values(by="model")
    scores = ordered["model-human-ec"]
    return scores.values


# Bootstrap 0 provides the reference scores that every other bootstrap is
# compared against.
reference_df = mean_df[mean_df["bootstrap_id"] == 0]
ranking_0 = get_sorted_values_from_df(reference_df)

# Scores are paired purely by position after sorting on "model".
# NOTE(review): this assumes every bootstrap contains exactly the same set of
# models as bootstrap 0 - confirm.
records = []
for bootstrap_id, rdf in mean_df[mean_df["bootstrap_id"] > 0].groupby("bootstrap_id"):
    ranking = get_sorted_values_from_df(rdf)
    # Both tests fail loudly on NaN scores instead of yielding NaN statistics.
    r = spearmanr(ranking_0, ranking, nan_policy="raise")
    tau = kendalltau(ranking_0, ranking, nan_policy="raise")
    records.append(
        {
            "id": bootstrap_id,
            "r": r.statistic,
            "r_p": r.pvalue,
            "t": tau.statistic,
            "t_p": tau.pvalue,
        }
    )

# Passing the columns explicitly keeps the schema stable even when there are
# no bootstraps with id > 0.
res_df = pd.DataFrame(records, columns=["id", "r", "r_p", "t", "t_p"])

In [None]:
def prepare_df(df, method):
    """Label each row by significance and report summary statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Correlation results with the columns ``r`` / ``r_p`` (Spearman
        statistic and p-value) and ``t`` / ``t_p`` (Kendall statistic and
        p-value). The frame is not modified.
    method : str
        Either ``"spearman"`` or ``"kendall"``; selects which statistic and
        p-value column are used.

    Returns
    -------
    tuple
        ``(labelled, stat)``, where ``labelled`` is a copy of ``df`` with an
        added ``Significance`` column holding a text label per row, and
        ``stat`` is the name of the column with the selected statistic.

    Raises
    ------
    RuntimeError
        If ``method`` is neither ``"spearman"`` nor ``"kendall"``.
    """
    columns_by_method = {"spearman": ("r", "r_p"), "kendall": ("t", "t_p")}
    if method not in columns_by_method:
        raise RuntimeError("Method unknown")
    stat, pval = columns_by_method[method]

    # Work on a copy so the caller's frame does not silently gain a column.
    labelled = df.copy()
    significant = labelled[pval] < 0.05
    label_map = {False: "p > 0.05", True: "p < 0.05 (significant)"}
    labelled["Significance"] = significant.map(label_map)

    method_str = "Kendall's Tau" if method == "kendall" else "Spearman's r"
    # An empty frame has no meaningful share; report NaN instead of dividing
    # by zero.
    if len(labelled):
        share = significant.sum() / len(labelled) * 100
    else:
        share = float("nan")
    print(f"{share:.2f}% of {method_str} are significant at alpha = 0.05.")
    print(f"The average {method_str} is {labelled[stat].mean()}")

    return labelled, stat

In [None]:
method = "kendall"
plot_df, xval = prepare_df(res_df, method)

# Tie each significance label to a fixed colour. With a plain list palette the
# colours are handed out in order of first appearance in the data, so the
# mapping would change with the row order (or when only one label occurs).
hue_colors = {"p > 0.05": "maroon", "p < 0.05 (significant)": "blue"}

fig, ax = plt.subplots(1, 1, figsize=(12, 5))
sns.kdeplot(
    data=plot_df,
    x=xval,
    hue="Significance",
    hue_order=list(hue_colors),
    legend=True,
    palette=hue_colors,
    fill=True,
    ax=ax,
)
sns.despine()
ax.set_xlabel("Kendall's tau" if method == "kendall" else "Spearman's rho")


plt.tight_layout()
plt.savefig(pjoin("figures", f"bootstrap_rankings_{method}.pdf"), bbox_inches="tight")
plt.show()