In [1]:
import pandas as pd
from scipy.stats import (mannwhitneyu, wilcoxon)

In [2]:
# Run configuration: which experiment results the notebook analyses.

# Dataset options: ["cochrane","medeasi","asset","xsum","cnn"]
DATASET_NAME = "xsum"
# Model options: ["baseline","trained"]
MODEL_NAME = "trained"
# Suffix appended to the experiment CSV file name.
SUFFIX = "_finetune"
# Path of the logits file for this dataset/model pair (not referenced in the
# cells visible in this notebook section).
LOGITS_PATH = "logits/{}_{}_logits.pt".format(DATASET_NAME, MODEL_NAME)

In [3]:
# Load the experiment results for the configured dataset/model and discard the
# "nli_flag_gpt_label" column, which is not used by the analysis cells below.
df = (
    pd.read_csv(f"experiment_{DATASET_NAME}_{MODEL_NAME}{SUFFIX}.csv")
    .drop(columns="nli_flag_gpt_label")
)

### NLL
* Mean NLL, grouped by whether the sentence is flagged as entailed or not.
* For sentences with BOTH entailed and non-entailed entities, are the NLLs of the two kinds of entity similar?
* If the sentence has non-entailed entities, is the NLL different when there are also entailed entities vs. none?

### MI
* Mean MI, grouped by whether the sentence is flagged as entailed or not.
* For sentences with BOTH entailed and non-entailed entities, are the MIs of the two kinds of entity similar?
* If the sentence has non-entailed entities, is the MI different when there are also entailed entities vs. none?


In [14]:
# df.groupby("nli_flag_ent_label").aggregate({"mi_mean":["mean","std"], "mi_ent_0":["mean","std"], "mi_ent_1":["mean","std"], "mi_ent_-1":["mean","std"]})

In [16]:
# Mean and standard deviation of every NLL column within each value of the
# "nli_flag_ent_label" flag.
nll_columns = ["nll_mean", "nll_ent_-1", "nll_ent_1", "nll_ent_0"]
df.groupby("nli_flag_ent_label").agg(
    {column: ["mean", "std"] for column in nll_columns}
)

Unnamed: 0_level_0,nll_mean,nll_mean,nll_ent_-1,nll_ent_-1,nll_ent_1,nll_ent_1,nll_ent_0,nll_ent_0
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std
nli_flag_ent_label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
-1,0.048788,0.02297,1.177732,2.190173,1.354346,2.454466,0.046018,0.022894
1,0.043194,0.018394,,,1.776455,2.745833,0.04298,0.018423


In [6]:
# One-sided Mann-Whitney U test on "nll_mean": is the distribution for rows
# labelled 1 stochastically smaller than for rows labelled -1?
nll_mean_label_pos = df.loc[df["nli_flag_ent_label"] == 1, "nll_mean"]
nll_mean_label_neg = df.loc[df["nli_flag_ent_label"] == -1, "nll_mean"]
mannwhitneyu(nll_mean_label_pos, nll_mean_label_neg, alternative="less")

MannwhitneyuResult(statistic=982606811.0, pvalue=9.232137131491471e-197)

In [7]:
# Column means restricted to rows where both "nll_ent_1" and "nll_ent_-1" are
# present (non-null).
(
    df.dropna(subset=["nll_ent_1", "nll_ent_-1"])
    [["nll_ent_-1", "nll_ent_1", "nll_ent_0"]]
    .mean()
)

nll_ent_-1    0.945795
nll_ent_1     1.354346
nll_ent_0     0.047574
dtype: float64

In [8]:
# Paired, one-sided Wilcoxon signed-rank test on rows that have both an
# "nll_ent_1" and an "nll_ent_-1" value: is "nll_ent_1" smaller than
# "nll_ent_-1" within the same row?
nll_paired = df.dropna(subset=["nll_ent_1", "nll_ent_-1"])
wilcoxon(nll_paired["nll_ent_1"], nll_paired["nll_ent_-1"], alternative="less")

WilcoxonResult(statistic=208382.0, pvalue=0.19080187124133813)

In [17]:
# Compare "nll_ent_1" between rows that also have an "nll_ent_-1" value and
# rows that do not.
# NOTE(review): presumably 1 = entailed and -1 = non-entailed entities, so the
# printed labels describe the row as a whole rather than the column - confirm.
has_nll_ent_pos = df["nll_ent_1"].notna()
has_nll_ent_neg = df["nll_ent_-1"].notna()

nll_ent_1_with_neg = df.loc[has_nll_ent_pos & has_nll_ent_neg, "nll_ent_1"]
nll_ent_1_without_neg = df.loc[has_nll_ent_pos & ~has_nll_ent_neg, "nll_ent_1"]

print("Non-entailed", nll_ent_1_with_neg.mean())
print("Entailed", nll_ent_1_without_neg.mean())

# Two-sided Mann-Whitney U test between the two independent groups of rows.
mannwhitneyu(nll_ent_1_with_neg, nll_ent_1_without_neg)

Non-entailed 1.354345666298286
Entailed 1.7764545757297614


MannwhitneyuResult(statistic=5486044.0, pvalue=0.0014710887429618454)

In [10]:
# Mean of "mi_mean" within each value of the "nli_flag_ent_output" flag.
mi_mean_by_flag = df.groupby("nli_flag_ent_output").agg({"mi_mean": "mean"})
mi_mean_by_flag

Unnamed: 0_level_0,mi_mean
nli_flag_ent_output,Unnamed: 1_level_1
-1,
1,


In [11]:
# One-sided Mann-Whitney U test on "mi_mean": is the distribution for rows
# whose "nli_flag_ent_output" is 1 stochastically smaller than for rows whose
# flag is -1?
mi_mean_output_pos = df.loc[df["nli_flag_ent_output"] == 1, "mi_mean"]
mi_mean_output_neg = df.loc[df["nli_flag_ent_output"] == -1, "mi_mean"]

# With the default nan_policy="propagate", a single missing "mi_mean" value
# makes the whole result (statistic=nan, pvalue=nan). Omitting NaNs runs the
# test on the rows that actually have an MI value.
mannwhitneyu(
    mi_mean_output_pos,
    mi_mean_output_neg,
    alternative="less",
    nan_policy="omit",
)

MannwhitneyuResult(statistic=nan, pvalue=nan)

In [12]:
# Per-group summary of the MI columns, grouped by "nli_flag_ent_output":
# the mean of "mi_mean", plus mean and standard deviation of each "mi_ent_*"
# column.
mi_agg_spec = {"mi_mean": "mean"}
mi_agg_spec.update(
    {column: ["mean", "std"] for column in ("mi_ent_-1", "mi_ent_1", "mi_ent_0")}
)
df.groupby("nli_flag_ent_output").agg(mi_agg_spec)

Unnamed: 0_level_0,mi_mean,mi_ent_-1,mi_ent_-1,mi_ent_1,mi_ent_1,mi_ent_0,mi_ent_0
Unnamed: 0_level_1,mean,mean,std,mean,std,mean,std
nli_flag_ent_output,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
-1,,,,,,,
1,,,,,,,


In [13]:
# Column means restricted to rows where both "mi_ent_1" and "mi_ent_-1" are
# present (non-null).
(
    df.dropna(subset=["mi_ent_1", "mi_ent_-1"])
    [["mi_ent_-1", "mi_ent_1", "mi_ent_0"]]
    .mean()
)

mi_ent_-1   NaN
mi_ent_1    NaN
mi_ent_0    NaN
dtype: float64

In [13]:
# Paired, two-sided Wilcoxon signed-rank test between "mi_ent_-1" and
# "mi_ent_1" on rows where both values are present.
mi_paired = df.dropna(subset=["mi_ent_1", "mi_ent_-1"])
wilcoxon(mi_paired["mi_ent_-1"], mi_paired["mi_ent_1"])

WilcoxonResult(statistic=436893.0, pvalue=7.101690918539725e-111)

In [25]:
# Compare "mi_ent_1" between rows that also have an "mi_ent_-1" value and rows
# that do not.
# NOTE(review): presumably 1 = entailed and -1 = non-entailed entities, so the
# printed labels describe the row as a whole rather than the column - confirm.
has_mi_ent_pos = df["mi_ent_1"].notna()
has_mi_ent_neg = df["mi_ent_-1"].notna()

mi_ent_1_with_neg = df.loc[has_mi_ent_pos & has_mi_ent_neg, "mi_ent_1"]
mi_ent_1_without_neg = df.loc[has_mi_ent_pos & ~has_mi_ent_neg, "mi_ent_1"]

print("Non-entailed", mi_ent_1_with_neg.mean())
print("Entailed", mi_ent_1_without_neg.mean())

# Two-sided Mann-Whitney U test between the two independent groups of rows.
mannwhitneyu(mi_ent_1_with_neg, mi_ent_1_without_neg)

Non-entailed 21.177354659479768
Entailed 21.362212979576775


MannwhitneyuResult(statistic=9925761.5, pvalue=0.3061356857479055)