# Analyze the baseline results
We run word-level RoBERTa coreference model on the movie scripts, on both the dev and train set. \
The model has been trained on OntoNotes v5 dataset.

In [1]:
# Import modules
import pandas as pd
import numpy as np
import collections
import os
import tqdm
import scipy.stats

pd.set_option("display.max_columns", 100)

In [2]:
# Read train and dev baseline csv files
train_baseline_df = pd.read_csv(
    os.path.join(os.getenv("DATA_DIR"),
                 "mica_text_coref/movie_coref/results/coreference/baselines/train.genre_wb.baseline.tsv"),
    sep="\t", index_col=None)
dev_baseline_df = pd.read_csv(
    os.path.join(os.getenv("DATA_DIR"), "mica_text_coref/movie_coref/results/coreference/baselines/dev.baseline.tsv"),
    sep="\t", index_col=None)

In [3]:
# Summarize baseline dataframes
print("train baseline (6 full-length screenplays):")
print(f"columns = \n{train_baseline_df.columns.tolist()}")
print(f"shape = {train_baseline_df.shape}")
for column, dtype in zip(train_baseline_df.columns, train_baseline_df.dtypes):
    if dtype != float:
        unique_vals = train_baseline_df[column].unique().tolist()
        print(f"{column:25s}: {unique_vals}")
print()

print("dev baseline (3 short script excerpts)")
print(f"columns = \n{dev_baseline_df.columns.tolist()}")
print(f"shape = {dev_baseline_df.shape}")
for column, dtype in zip(dev_baseline_df.columns, dev_baseline_df.dtypes):
    if dtype != float:
        unique_vals = dev_baseline_df[column].unique().tolist()
        print(f"{column:25s}: {unique_vals}")

train baseline (6 full-length screenplays):
columns = 
['preprocess', 'genre', 'entity', 'merge_speakers', 'provide_gold_mentions', 'remove_gold_singletons', 'split_len', 'overlap_len', 'merge_strategy', 'movie', 'metric', 'P', 'R', 'F']
shape = (108864, 14)
preprocess               : ['addsays', 'nocharacters', 'none']
genre                    : ['wb']
entity                   : ['all', 'person', 'speaker']
merge_speakers           : [False, True]
provide_gold_mentions    : [False, True]
remove_gold_singletons   : [False, True]
split_len                : [2048, 3072, 4096, 5120]
overlap_len              : [128, 256, 512]
merge_strategy           : ['after', 'average', 'before', 'max', 'min', 'none']
movie                    : ['avengers_endgame', 'dead_poets_society', 'john_wick', 'prestige', 'quiet_place', 'zootopia', 'all']
metric                   : ['muc', 'bcub', 'ceafe']

dev baseline (3 short script excerpts)
columns = 
['preprocess', 'genre', 'entity', 'merge_speakers', 'provi

In [96]:
# Find the performance for each hyperparam setting and the best performing hyperparameter for the train baseline
train_pred_baseline_df = train_baseline_df[~train_baseline_df["provide_gold_mentions"]
                                           & ~train_baseline_df["remove_gold_singletons"]]
hyperparams = ["preprocess", "genre", "entity", "merge_speakers", "provide_gold_mentions",
               "remove_gold_singletons", "split_len", "overlap_len", "merge_strategy"]
header = ([f"macro_{metric}_{score}_{agg}" for metric in ["muc", "bcub", "ceafe"]
                                          for score in ["p", "r", "f"]
                                          for agg in ["mean", "std"]]
          + ["macro_conll_f_mean", "macro_conll_f_std"]
          + [f"micro_{metric}_{score}" for metric in ["muc", "bcub", "ceafe"]
                                     for score in ["p", "r", "f"]]
          + ["micro_conll_f"])
rows = []
groups = train_pred_baseline_df.groupby(hyperparams)
for hyperparam_values, hyperparam_df in tqdm.tqdm(groups, total=groups.ngroups, unit="hyperparam"):
    row = list(hyperparam_values)

    # Macro value
    movie_muc_df = hyperparam_df.loc[(hyperparam_df["movie"] != "all")
                                     & (hyperparam_df["metric"] == "muc"), ["P", "R", "F"]]
    movie_bcub_df = hyperparam_df.loc[(hyperparam_df["movie"] != "all")
                                       & (hyperparam_df["metric"] == "bcub"), ["P", "R", "F"]]
    movie_ceafe_df = hyperparam_df.loc[(hyperparam_df["movie"] != "all")
                                        & (hyperparam_df["metric"] == "ceafe"), ["P", "R", "F"]]
    movie_conll_F1 = (hyperparam_df.loc[hyperparam_df["movie"] != "all", ["movie", "F"]].groupby("movie").mean()
                                  .agg(["mean", "std"]).values.flatten().tolist())
    row += movie_muc_df.agg(["mean", "std"]).values.T.flatten().tolist()
    row += movie_bcub_df.agg(["mean", "std"]).values.T.flatten().tolist()
    row += movie_ceafe_df.agg(["mean", "std"]).values.T.flatten().tolist()
    row += movie_conll_F1

    # Micro value
    row += hyperparam_df.loc[hyperparam_df["movie"] == "all", ["P", "R", "F"]].values.flatten().tolist()
    row.append(hyperparam_df.loc[hyperparam_df["movie"] == "all", "F"].mean())
    rows.append(row)

# Create column index
index = pd.MultiIndex.from_product([["macro"], ["muc", "bcub", "ceafe"], ["p", "r", "f"], ["mean", "std"]],
                                   sortorder=None)
index = index.union(pd.MultiIndex.from_tuples([("macro", "conll", "f", "mean"), ("macro", "conll", "f", "std")],
                                              sortorder=None), sort=False)
index = index.union(pd.MultiIndex.from_product([["micro"], ["muc", "bcub", "ceafe"], ["p", "r", "f"], ["value"]],
                                              sortorder=None), sort=False)
index = index.union(pd.MultiIndex.from_tuples([("micro", "conll", "f", "value")]), sort=False)
index.names = ["agg", "metric", "score", "func"]

# Create dataframe for train baseline
train_hyperparam_df = pd.DataFrame(rows, columns=hyperparams + header)
train_hyperparam_df = train_hyperparam_df.set_index(hyperparams)
train_hyperparam_df.columns = index

display(train_hyperparam_df)

100%|██████████| 1296/1296 [00:20<00:00, 62.92hyperparam/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,agg,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,micro,micro,micro,micro,micro,micro,micro,micro,micro,micro
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,metric,muc,muc,muc,muc,muc,muc,bcub,bcub,bcub,bcub,bcub,bcub,ceafe,ceafe,ceafe,ceafe,ceafe,ceafe,conll,conll,muc,muc,muc,bcub,bcub,bcub,ceafe,ceafe,ceafe,conll
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,score,p,p,r,r,f,f,p,p,r,r,f,f,p,p,r,r,f,f,f,f,p,r,f,p,r,f,p,r,f,f
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,func,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,value,value,value,value,value,value,value,value,value,value
preprocess,genre,entity,merge_speakers,provide_gold_mentions,remove_gold_singletons,split_len,overlap_len,merge_strategy,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4,Unnamed: 23_level_4,Unnamed: 24_level_4,Unnamed: 25_level_4,Unnamed: 26_level_4,Unnamed: 27_level_4,Unnamed: 28_level_4,Unnamed: 29_level_4,Unnamed: 30_level_4,Unnamed: 31_level_4,Unnamed: 32_level_4,Unnamed: 33_level_4,Unnamed: 34_level_4,Unnamed: 35_level_4,Unnamed: 36_level_4,Unnamed: 37_level_4,Unnamed: 38_level_4
addsays,wb,all,False,False,False,2048,128,after,84.704428,3.510968,67.569224,1.932926,75.123055,1.579729,23.906690,8.539546,51.959970,2.208573,32.157988,7.383917,52.900082,10.057755,4.021637,2.607343,7.260816,4.290337,38.180620,3.648889,84.166887,67.565941,74.958267,22.794145,51.767727,31.651595,50.647364,3.934755,7.302208,37.970690
addsays,wb,all,False,False,False,2048,128,average,84.686564,3.508866,67.596401,1.924081,75.133477,1.590378,23.889096,8.548903,51.973466,2.208483,32.146607,7.415512,52.894563,10.062336,4.025306,2.610868,7.267240,4.297565,38.182441,3.674206,84.149246,67.587673,74.964639,22.777977,51.775760,31.637503,50.647854,3.934793,7.302278,37.968140
addsays,wb,all,False,False,False,2048,128,before,84.686564,3.508866,67.596401,1.924081,75.133477,1.590378,23.889096,8.548903,51.973466,2.208483,32.146607,7.415512,52.894563,10.062336,4.025306,2.610868,7.267240,4.297565,38.182441,3.674206,84.149246,67.587673,74.964639,22.777977,51.775760,31.637503,50.647854,3.934793,7.302278,37.968140
addsays,wb,all,False,False,False,2048,128,max,84.718997,3.498933,67.547279,1.912026,75.115906,1.572913,23.912984,8.537684,51.859560,2.222960,32.147594,7.387931,52.803522,10.094063,4.008322,2.594607,7.238884,4.273542,38.167461,3.646231,84.184529,67.546622,74.953370,22.801621,51.644817,31.635779,50.557087,3.923834,7.282461,37.957204
addsays,wb,all,False,False,False,2048,128,min,84.678523,3.518965,67.625502,1.940787,75.147532,1.590451,23.884102,8.550059,52.011713,2.223470,32.148383,7.416120,52.911102,10.044332,4.036476,2.626197,7.284836,4.319433,38.193584,3.678680,84.140425,67.616516,74.978876,22.772246,51.814393,31.639181,50.685808,3.943634,7.317895,37.978651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
none,wb,speaker,True,False,False,5120,512,average,78.925721,7.294623,89.350540,1.655350,83.599021,3.833931,62.266165,8.675425,48.651359,15.012258,52.930082,8.509630,35.092679,9.117064,59.266051,5.681856,43.570027,7.821554,60.033043,5.249406,79.800653,89.093505,84.191425,63.123561,44.943142,52.504076,32.676780,60.326364,42.391499,59.695666
none,wb,speaker,True,False,False,5120,512,before,78.925721,7.294623,89.350540,1.655350,83.599021,3.833931,62.266165,8.675425,48.651359,15.012258,52.930082,8.509630,35.092679,9.117064,59.266051,5.681856,43.570027,7.821554,60.033043,5.249406,79.800653,89.093505,84.191425,63.123561,44.943142,52.504076,32.676780,60.326364,42.391499,59.695666
none,wb,speaker,True,False,False,5120,512,max,78.951570,7.357232,89.245925,1.708058,83.562584,3.839512,62.344053,8.751624,49.062019,13.441808,53.408207,7.438240,34.982782,8.313858,59.205636,5.546237,43.507008,7.216865,60.159266,4.721184,79.835935,88.935836,84.140560,63.183478,45.740850,53.065574,32.811421,60.218609,42.477857,59.894663
none,wb,speaker,True,False,False,5120,512,min,78.884879,7.274302,89.400178,1.691510,83.598329,3.819809,62.264049,8.677034,47.684086,16.131158,52.069726,9.427955,34.567680,9.070954,59.168199,5.417400,43.125615,7.694457,59.597890,5.359643,79.752139,89.153478,84.191168,63.124440,43.846473,51.748349,32.087981,60.309941,41.888913,59.276143


In [104]:
# best baseline performance on train set
train_hyperparam_df.loc[train_hyperparam_df["macro"]["conll"]["f"]["mean"] == (
                        train_hyperparam_df["macro"]["conll"]["f"]["mean"].max()),
                        ("macro", slice(None), "f")]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,agg,macro,macro,macro,macro,macro,macro,macro,macro
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,metric,muc,muc,bcub,bcub,ceafe,ceafe,conll,conll
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,score,f,f,f,f,f,f,f,f
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,func,mean,std,mean,std,mean,std,mean,std
preprocess,genre,entity,merge_speakers,provide_gold_mentions,remove_gold_singletons,split_len,overlap_len,merge_strategy,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4
addsays,wb,speaker,True,False,False,3072,512,min,84.057317,4.080106,59.603098,6.616547,49.407743,11.40428,64.356053,6.285581


The baseline achieves 64.36 conll F1 on train set

In [98]:
dev_pred_baseline_df = dev_baseline_df[~dev_baseline_df["provide_gold_mentions"]
                                       & ~dev_baseline_df["remove_gold_singletons"]]
hyperparams = ["preprocess", "genre", "entity", "merge_speakers", "provide_gold_mentions", "remove_gold_singletons"]
header = ([f"macro_{metric}_{score}_{agg}" for metric in ["muc", "bcub", "ceafe"]
                                          for score in ["p", "r", "f"]
                                          for agg in ["mean", "std"]]
          + ["macro_conll_f_mean", "macro_conll_f_std"]
          + [f"micro_{metric}_{score}" for metric in ["muc", "bcub", "ceafe"]
                                     for score in ["p", "r", "f"]]
          + ["micro_conll_f"])
rows = []
groups = dev_pred_baseline_df.groupby(hyperparams)
for hyperparam_values, hyperparam_df in tqdm.tqdm(groups, total=groups.ngroups, unit="hyperparam"):
    row = list(hyperparam_values)

    # Macro value
    movie_muc_df = hyperparam_df.loc[(hyperparam_df["movie"] != "all")
                                     & (hyperparam_df["metric"] == "muc"), ["precision", "recall", "f1"]]
    movie_bcub_df = hyperparam_df.loc[(hyperparam_df["movie"] != "all")
                                       & (hyperparam_df["metric"] == "bcub"), ["precision", "recall", "f1"]]
    movie_ceafe_df = hyperparam_df.loc[(hyperparam_df["movie"] != "all")
                                        & (hyperparam_df["metric"] == "ceafe"), ["precision", "recall", "f1"]]
    movie_conll_F1 = (hyperparam_df.loc[hyperparam_df["movie"] != "all", ["movie", "f1"]].groupby("movie").mean()
                                   .agg(["mean", "std"]).values.flatten().tolist())
    row += movie_muc_df.agg(["mean", "std"]).values.T.flatten().tolist()
    row += movie_bcub_df.agg(["mean", "std"]).values.T.flatten().tolist()
    row += movie_ceafe_df.agg(["mean", "std"]).values.T.flatten().tolist()
    row += movie_conll_F1

    # Micro value
    row += hyperparam_df.loc[hyperparam_df["movie"] == "all", ["precision", "recall", "f1"]].values.flatten().tolist()
    row.append(hyperparam_df.loc[hyperparam_df["movie"] == "all", "f1"].mean())
    rows.append(row)

# Create column index
index = pd.MultiIndex.from_product([["macro"], ["muc", "bcub", "ceafe"], ["p", "r", "f"], ["mean", "std"]],
                                   sortorder=None)
index = index.union(pd.MultiIndex.from_tuples([("macro", "conll", "f", "mean"), ("macro", "conll", "f", "std")],
                                              sortorder=None), sort=False)
index = index.union(pd.MultiIndex.from_product([["micro"], ["muc", "bcub", "ceafe"], ["p", "r", "f"], ["value"]],
                                              sortorder=None), sort=False)
index = index.union(pd.MultiIndex.from_tuples([("micro", "conll", "f", "value")]), sort=False)
index.names = ["agg", "metric", "score", "func"]

# Create dataframe for dev baseline
dev_hyperparam_df = pd.DataFrame(rows, columns=hyperparams + header)
dev_hyperparam_df = dev_hyperparam_df.set_index(hyperparams)
dev_hyperparam_df.columns = index

display(dev_hyperparam_df)

100%|██████████| 126/126 [00:02<00:00, 57.00hyperparam/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,agg,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,micro,micro,micro,micro,micro,micro,micro,micro,micro,micro
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,metric,muc,muc,muc,muc,muc,muc,bcub,bcub,bcub,bcub,bcub,bcub,ceafe,ceafe,ceafe,ceafe,ceafe,ceafe,conll,conll,muc,muc,muc,bcub,bcub,bcub,ceafe,ceafe,ceafe,conll
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,score,p,p,r,r,f,f,p,p,r,r,f,f,p,p,r,r,f,f,f,f,p,r,f,p,r,f,p,r,f,f
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,func,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,value,value,value,value,value,value,value,value,value,value
preprocess,genre,entity,merge_speakers,provide_gold_mentions,remove_gold_singletons,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4,Unnamed: 23_level_4,Unnamed: 24_level_4,Unnamed: 25_level_4,Unnamed: 26_level_4,Unnamed: 27_level_4,Unnamed: 28_level_4,Unnamed: 29_level_4,Unnamed: 30_level_4,Unnamed: 31_level_4,Unnamed: 32_level_4,Unnamed: 33_level_4,Unnamed: 34_level_4,Unnamed: 35_level_4
addsays,bc,all,False,False,False,60.863333,4.300492,87.596667,1.201763,71.773333,3.269103,41.833333,6.037254,70.346667,7.232167,51.976667,3.067839,10.960000,2.219820,51.983333,10.269490,17.970000,3.112411,47.240000,1.106682,60.76,87.56,71.74,42.03,70.02,52.53,11.08,50.73,18.19,47.486667
addsays,bc,all,True,False,False,61.063333,4.503791,87.873333,1.108708,72.006667,3.435598,40.996667,5.828828,72.006667,6.319116,51.830000,3.297514,10.653333,2.445574,49.473333,10.790942,17.413333,3.579027,47.083333,1.439147,60.97,87.85,71.98,41.18,71.72,52.32,10.81,48.44,17.67,47.323333
addsays,bc,person,False,False,False,87.713333,3.910298,74.453333,7.401651,80.280000,3.110193,62.433333,13.073608,61.280000,5.250552,61.453333,8.180320,58.613333,8.170155,23.476667,17.152496,30.856667,15.321672,57.530000,7.496806,87.41,74.60,80.50,63.91,61.21,62.53,57.22,19.97,29.61,57.546667
addsays,bc,person,True,False,False,87.630000,3.789076,74.936667,7.124958,80.543333,3.043687,61.020000,12.807955,62.353333,5.487771,61.280000,8.303849,58.020000,8.601517,21.160000,14.487571,28.896667,13.678046,56.906667,6.876455,87.37,75.08,80.76,62.41,62.27,62.34,56.82,18.22,27.59,56.896667
addsays,bc,speaker,False,False,False,85.530000,2.982901,81.180000,2.149860,83.266667,1.615714,62.143333,10.211926,64.873333,4.816185,62.920000,4.377979,52.560000,14.913816,29.753333,10.993063,36.096667,9.327863,60.761111,2.241191,85.41,81.30,83.30,62.81,64.66,63.72,51.65,28.74,36.93,61.316667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
none,wb,all,True,False,False,62.293333,4.298399,86.236667,0.401290,72.283333,3.019674,40.666667,6.770970,65.940000,12.206658,49.356667,2.850550,11.436667,1.500411,55.560000,14.088957,18.786667,1.830911,46.808889,0.603511,62.18,86.26,72.27,40.95,65.40,50.37,11.48,52.98,18.87,47.170000
none,wb,person,False,False,False,88.063333,3.981574,70.840000,9.491243,78.113333,4.217183,66.933333,12.712708,53.063333,14.073565,57.943333,9.724635,59.223333,7.291161,26.390000,17.937277,33.500000,14.860178,56.518889,6.485416,87.67,70.86,78.37,68.28,52.60,59.42,57.24,22.68,32.48,56.756667
none,wb,person,True,False,False,87.833333,3.747323,73.236667,7.033991,79.623333,2.443611,60.023333,13.601597,56.216667,14.186474,56.430000,8.413014,61.653333,4.517348,25.603333,18.468704,33.480000,16.709366,56.511111,6.051085,87.54,73.15,79.70,61.01,55.69,58.23,60.79,21.79,32.08,56.670000
none,wb,speaker,False,False,False,85.673333,3.298429,76.780000,6.047446,80.900000,4.126148,66.140000,8.719794,58.433333,14.666269,60.683333,7.488473,54.466667,20.120503,32.996667,12.942354,38.966667,11.912352,60.183333,4.930346,85.57,76.45,80.75,66.36,57.77,61.77,53.47,31.78,39.87,60.796667


In [99]:
# best baseline performance on dev set
dev_hyperparam_df[dev_hyperparam_df["macro"]["conll"]["f"]["mean"] == (
                        dev_hyperparam_df["macro"]["conll"]["f"]["mean"].max())]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,agg,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,macro,micro,micro,micro,micro,micro,micro,micro,micro,micro,micro
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,metric,muc,muc,muc,muc,muc,muc,bcub,bcub,bcub,bcub,bcub,bcub,ceafe,ceafe,ceafe,ceafe,ceafe,ceafe,conll,conll,muc,muc,muc,bcub,bcub,bcub,ceafe,ceafe,ceafe,conll
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,score,p,p,r,r,f,f,p,p,r,r,f,f,p,p,r,r,f,f,f,f,p,r,f,p,r,f,p,r,f,f
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,func,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,value,value,value,value,value,value,value,value,value,value
preprocess,genre,entity,merge_speakers,provide_gold_mentions,remove_gold_singletons,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4,Unnamed: 23_level_4,Unnamed: 24_level_4,Unnamed: 25_level_4,Unnamed: 26_level_4,Unnamed: 27_level_4,Unnamed: 28_level_4,Unnamed: 29_level_4,Unnamed: 30_level_4,Unnamed: 31_level_4,Unnamed: 32_level_4,Unnamed: 33_level_4,Unnamed: 34_level_4,Unnamed: 35_level_4
addsays,wb,speaker,True,False,False,86.26,3.15073,80.753333,2.638301,83.393333,2.273551,63.993333,12.329932,65.923333,3.97827,64.306667,5.77313,60.716667,13.577958,29.946667,10.318965,38.946667,9.915938,62.215556,2.469893,86.19,80.89,83.46,64.6,65.75,65.17,61.47,28.99,39.4,62.676667


The baseline achieves 62.68 conll F1 on dev set

In [100]:
# Find how baseline performance varies with preprocess, entity, merge_speakers, split_len, overlap_len, merge_strategy
hyperparams = ["preprocess", "entity", "merge_speakers", "split_len", "overlap_len", "merge_strategy"]
for hyperparam in hyperparams:
    # Create paired data for t-test
    levels = sorted(train_hyperparam_df.index.get_level_values(hyperparam).unique().astype("<U18").tolist())
    print(f"{hyperparam:15s}: {levels}")
    group_hyperparams = hyperparams.copy()
    group_hyperparams.remove(hyperparam)
    paired_values = []
    for _, df in train_hyperparam_df.groupby(group_hyperparams, axis=0):
        assert len(df) == len(levels)
        df.sort_values(hyperparam, inplace=True)
        paired_values.append(tuple(df["macro"]["conll"]["f"]["mean"].values.tolist()))
    paired = np.array(paired_values)
    means = np.round(np.mean(paired, axis=0), 3).tolist()
    print(f"{'means':15s}: {means}")

    # Conduct t-tests and correct p-value
    n_comparisons = len(levels)*(len(levels) - 1)/2
    for i in range(len(levels)):
        for j in range(i + 1, len(levels)):
            statistic = scipy.stats.ttest_rel(paired[:, i], paired[:, j], alternative="two-sided")
            print(f"\t{levels[i]:12s} (\u03bc = {np.mean(paired[:, i]):.2f}) vs {levels[j]:12s} "
                  f"(\u03bc = {np.mean(paired[:, j]):.2f}): pvalue = {n_comparisons * statistic.pvalue:.2e} "
                  f"statistic = {statistic.statistic:.3f}")
    print()

preprocess     : ['addsays', 'nocharacters', 'none']
means          : [49.315, 29.424, 48.062]
	addsays      (μ = 49.31) vs nocharacters (μ = 29.42): pvalue = 5.97e-64 statistic = 20.221
	addsays      (μ = 49.31) vs none         (μ = 48.06): pvalue = 8.65e-55 statistic = 18.187
	nocharacters (μ = 29.42) vs none         (μ = 48.06): pvalue = 2.85e-60 statistic = -19.406

entity         : ['all', 'person', 'speaker']
means          : [40.219, 46.252, 40.33]
	all          (μ = 40.22) vs person       (μ = 46.25): pvalue = 3.60e-158 statistic = -43.183
	all          (μ = 40.22) vs speaker      (μ = 40.33): pvalue = 2.73e+00 statistic = -0.113
	person       (μ = 46.25) vs speaker      (μ = 40.33): pvalue = 1.97e-09 statistic = 6.319

merge_speakers : ['False', 'True']
means          : [39.956, 44.578]
	False        (μ = 39.96) vs True         (μ = 44.58): pvalue = 1.70e-115 statistic = -28.341

split_len      : ['2048', '3072', '4096', '5120']
means          : [40.372, 41.855, 42.828, 44.012

In [101]:
# Find how baseline performance varies with preprocess, entity, and merge_speakers for dev set
hyperparams = ["preprocess", "entity", "genre", "merge_speakers"]
for hyperparam in hyperparams:
    # Create paired data for t-test
    levels = sorted(dev_hyperparam_df.index.get_level_values(hyperparam).unique().astype("<U18").tolist())
    print(f"{hyperparam:15s}: {levels}")
    group_hyperparams = hyperparams.copy()
    group_hyperparams.remove(hyperparam)
    paired_values = []
    for _, df in dev_hyperparam_df.groupby(group_hyperparams, axis=0):
        assert len(df) == len(levels)
        df.sort_values(hyperparam, inplace=True)
        paired_values.append(tuple(df["macro"]["conll"]["f"]["mean"].values.tolist()))
    paired = np.array(paired_values)
    means = np.round(np.mean(paired, axis=0), 3).tolist()
    print(f"{'means':15s}: {means}")

    # Conduct t-tests and correct p-value
    n_comparisons = len(levels)*(len(levels) - 1)/2
    for i in range(len(levels)):
        for j in range(i + 1, len(levels)):
            statistic = scipy.stats.ttest_rel(paired[:, i], paired[:, j], alternative="two-sided")
            print(f"\t{levels[i]:12s} (\u03bc = {np.mean(paired[:, i]):.2f}) vs {levels[j]:12s} "
                  f"(\u03bc = {np.mean(paired[:, j]):.2f}): pvalue = {n_comparisons * statistic.pvalue:.2e} "
                  f"statistic = {statistic.statistic:.3f}")
    print()

preprocess     : ['addsays', 'nocharacters', 'none']
means          : [55.119, 38.921, 54.702]
	addsays      (μ = 55.12) vs nocharacters (μ = 38.92): pvalue = 2.10e-07 statistic = 6.556
	addsays      (μ = 55.12) vs none         (μ = 54.70): pvalue = 3.56e-06 statistic = 5.692
	nocharacters (μ = 38.92) vs none         (μ = 54.70): pvalue = 3.27e-07 statistic = -6.421

entity         : ['all', 'person', 'speaker']
means          : [45.694, 55.193, 47.855]
	all          (μ = 45.69) vs person       (μ = 55.19): pvalue = 7.32e-52 statistic = -110.253
	all          (μ = 45.69) vs speaker      (μ = 47.86): pvalue = 1.16e+00 statistic = -0.878
	person       (μ = 55.19) vs speaker      (μ = 47.86): pvalue = 1.47e-02 statistic = 2.975

genre          : ['bc', 'bn', 'mz', 'nw', 'pt', 'tc', 'wb']
means          : [49.631, 49.639, 49.492, 49.572, 49.595, 49.665, 49.472]
	bc           (μ = 49.63) vs bn           (μ = 49.64): pvalue = 1.92e+01 statistic = -0.112
	bc           (μ = 49.63) vs mz       

In [103]:
for _, df in dev_hyperparam_df.groupby(["preprocess", "genre", "merge_speakers"], axis=0):
    display(df.loc[:, ("macro", "conll", "f", "mean")])

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
addsays     bc     all      False           False                  False                     47.240000
                   person   False           False                  False                     57.530000
                   speaker  False           False                  False                     60.761111
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
addsays     bc     all      True            False                  False                     47.083333
                   person   True            False                  False                     56.906667
                   speaker  True            False                  False                     60.614444
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
addsays     bn     all      False           False                  False                     47.340000
                   person   False           False                  False                     57.473333
                   speaker  False           False                  False                     60.704444
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
addsays     bn     all      True            False                  False                     47.186667
                   person   True            False                  False                     56.848889
                   speaker  True            False                  False                     60.558889
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
addsays     mz     all      False           False                  False                     47.793333
                   person   False           False                  False                     56.348889
                   speaker  False           False                  False                     60.926667
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
addsays     mz     all      True            False                  False                     47.811111
                   person   True            False                  False                     56.155556
                   speaker  True            False                  False                     61.282222
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
addsays     nw     all      False           False                  False                     47.431111
                   person   False           False                  False                     57.413333
                   speaker  False           False                  False                     60.675556
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
addsays     nw     all      True            False                  False                     47.283333
                   person   True            False                  False                     56.805556
                   speaker  True            False                  False                     60.538889
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
addsays     pt     all      False           False                  False                     47.472222
                   person   False           False                  False                     57.496667
                   speaker  False           False                  False                     60.771111
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
addsays     pt     all      True            False                  False                     47.313333
                   person   True            False                  False                     56.871111
                   speaker  True            False                  False                     60.610000
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
addsays     tc     all      False           False                  False                     47.592222
                   person   False           False                  False                     57.580000
                   speaker  False           False                  False                     60.772222
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
addsays     tc     all      True            False                  False                     47.575556
                   person   True            False                  False                     57.308889
                   speaker  True            False                  False                     61.044444
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
addsays     wb     all      False           False                  False                     47.793333
                   person   False           False                  False                     56.253333
                   speaker  False           False                  False                     60.811111
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
addsays     wb     all      True            False                  False                     48.143333
                   person   True            False                  False                     56.645556
                   speaker  True            False                  False                     62.215556
Name: (macro, conll, f, mean), dtype: float64

preprocess    genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
nocharacters  bc     all      False           False                  False                     42.171111
                     person   False           False                  False                     51.895556
                     speaker  False           False                  False                     23.036667
Name: (macro, conll, f, mean), dtype: float64

preprocess    genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
nocharacters  bc     all      True            False                  False                     42.171111
                     person   True            False                  False                     51.895556
                     speaker  True            False                  False                     23.036667
Name: (macro, conll, f, mean), dtype: float64

preprocess    genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
nocharacters  bn     all      False           False                  False                     42.335556
                     person   False           False                  False                     52.328889
                     speaker  False           False                  False                     22.540000
Name: (macro, conll, f, mean), dtype: float64

preprocess    genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
nocharacters  bn     all      True            False                  False                     42.335556
                     person   True            False                  False                     52.328889
                     speaker  True            False                  False                     22.540000
Name: (macro, conll, f, mean), dtype: float64

preprocess    genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
nocharacters  mz     all      False           False                  False                     42.791111
                     person   False           False                  False                     52.026667
                     speaker  False           False                  False                     21.201111
Name: (macro, conll, f, mean), dtype: float64

preprocess    genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
nocharacters  mz     all      True            False                  False                     42.791111
                     person   True            False                  False                     52.026667
                     speaker  True            False                  False                     21.201111
Name: (macro, conll, f, mean), dtype: float64

preprocess    genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
nocharacters  nw     all      False           False                  False                     42.403333
                     person   False           False                  False                     52.162222
                     speaker  False           False                  False                     22.960000
Name: (macro, conll, f, mean), dtype: float64

preprocess    genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
nocharacters  nw     all      True            False                  False                     42.403333
                     person   True            False                  False                     52.162222
                     speaker  True            False                  False                     22.960000
Name: (macro, conll, f, mean), dtype: float64

preprocess    genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
nocharacters  pt     all      False           False                  False                     42.138889
                     person   False           False                  False                     51.800000
                     speaker  False           False                  False                     23.034444
Name: (macro, conll, f, mean), dtype: float64

preprocess    genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
nocharacters  pt     all      True            False                  False                     42.138889
                     person   True            False                  False                     51.800000
                     speaker  True            False                  False                     23.034444
Name: (macro, conll, f, mean), dtype: float64

preprocess    genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
nocharacters  tc     all      False           False                  False                     42.203333
                     person   False           False                  False                     51.697778
                     speaker  False           False                  False                     23.002222
Name: (macro, conll, f, mean), dtype: float64

preprocess    genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
nocharacters  tc     all      True            False                  False                     42.203333
                     person   True            False                  False                     51.697778
                     speaker  True            False                  False                     23.002222
Name: (macro, conll, f, mean), dtype: float64

preprocess    genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
nocharacters  wb     all      False           False                  False                     43.071111
                     person   False           False                  False                     51.972222
                     speaker  False           False                  False                     20.575556
Name: (macro, conll, f, mean), dtype: float64

preprocess    genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
nocharacters  wb     all      True            False                  False                     43.071111
                     person   True            False                  False                     51.972222
                     speaker  True            False                  False                     20.575556
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
none        bc     all      False           False                  False                     47.065556
                   person   False           False                  False                     57.615556
                   speaker  False           False                  False                     60.630000
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
none        bc     all      True            False                  False                     46.564444
                   person   True            False                  False                     56.948889
                   speaker  True            False                  False                     60.187778
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
none        bn     all      False           False                  False                     47.292222
                   person   False           False                  False                     57.201111
                   speaker  False           False                  False                     60.817778
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
none        bn     all      True            False                  False                     46.775556
                   person   True            False                  False                     56.531111
                   speaker  True            False                  False                     60.354444
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
none        mz     all      False           False                  False                     47.744444
                   person   False           False                  False                     56.367778
                   speaker  False           False                  False                     60.601111
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
none        mz     all      True            False                  False                     47.343333
                   person   True            False                  False                     56.018889
                   speaker  True            False                  False                     60.417778
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
none        nw     all      False           False                  False                     47.251111
                   person   False           False                  False                     56.608889
                   speaker  False           False                  False                     60.258889
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
none        nw     all      True            False                  False                     46.816667
                   person   True            False                  False                     56.162222
                   speaker  True            False                  False                     59.997778
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
none        pt     all      False           False                  False                     47.211111
                   person   False           False                  False                     57.172222
                   speaker  False           False                  False                     60.498889
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
none        pt     all      True            False                  False                     46.724444
                   person   True            False                  False                     56.502222
                   speaker  True            False                  False                     60.122222
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
none        tc     all      False           False                  False                     47.531111
                   person   False           False                  False                     56.494444
                   speaker  False           False                  False                     60.691111
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
none        tc     all      True            False                  False                     47.096667
                   person   True            False                  False                     56.047778
                   speaker  True            False                  False                     60.431111
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
none        wb     all      False           False                  False                     47.617778
                   person   False           False                  False                     56.518889
                   speaker  False           False                  False                     60.183333
Name: (macro, conll, f, mean), dtype: float64

preprocess  genre  entity   merge_speakers  provide_gold_mentions  remove_gold_singletons
none        wb     all      True            False                  False                     46.808889
                   person   True            False                  False                     56.511111
                   speaker  True            False                  False                     59.747778
Name: (macro, conll, f, mean), dtype: float64