In [2]:
import yaml
import pandas as pd
import numpy as np
import os
import scipy.stats
import collections

pd.set_option("display.min_rows", 50)

### Create cross validation dataframe

In [3]:
results_dir = os.path.join(os.getenv("DATA_DIR"),
                           "mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec19-21")
header = ["preprocess", "bert_lr", "model_lr", "warmup", "movie", "dev_lea", "train_lea", "best_epoch", "dev_scores",
          "train_scores"]
rows = []

for dir_ in os.listdir(results_dir):
    result_file1 = os.path.join(results_dir, dir_, "result.yaml")
    result_file2 = os.path.join(results_dir, dir_, "result2.yaml")
    if os.path.exists(result_file1):
        with open(result_file1) as f:
            result = yaml.load(f, Loader=yaml.FullLoader)
    elif os.path.exists(result_file2):
        with open(result_file2) as f:
            result = yaml.load(f, Loader=yaml.FullLoader)
    else:
        continue
    preprocess = result["preprocess"]
    bert_lr = result["bert_lr"]
    model_lr = result["coref_lr"]
    warmup = result["warmup_steps"]
    movie = result["test_movie"]
    dev_lea = result["dev_metric"]["span"]["lea"]["f1"]
    epoch = result["best_epoch"]
    dev_scores = result["dev_scores"]
    train_scores = result["train_scores"]
    train_lea = train_scores[epoch - 1]
    rows.append([preprocess, bert_lr, model_lr, warmup, movie, dev_lea, train_lea, epoch, dev_scores, train_scores])

df = pd.DataFrame(rows, columns=header)
print(df.shape)
print()

for col, dtype in zip(df.columns, df.dtypes):
    print(f"{col} ({dtype})")

(648, 10)

preprocess (object)
bert_lr (float64)
model_lr (float64)
warmup (float64)
movie (object)
dev_lea (float64)
train_lea (float64)
best_epoch (int64)
dev_scores (object)
train_scores (object)


### Find the macro scores (averaged across movies) for cross validation

In [None]:
print("macro average:")
macro_df = (df.groupby(["preprocess", "bert_lr", "model_lr", "warmup"])
            .agg({"dev_lea": "mean", "train_lea": "mean", "best_epoch": "mean"})
            .sort_values(by="dev_lea", ascending=False))
display(macro_df)

macro average:


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,dev_lea,train_lea,best_epoch
preprocess,bert_lr,model_lr,warmup,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
regular,0.00002,0.0002,50.0,61.600000,89.835683,6.833333
addsays,0.00002,0.0001,-1.0,61.592167,89.766367,6.333333
addsays,0.00001,0.0002,-1.0,61.572500,90.609750,8.166667
regular,0.00002,0.0002,100.0,61.405833,88.221367,8.166667
addsays,0.00002,0.0002,50.0,61.316833,86.292750,5.666667
addsays,0.00002,0.0001,0.0,61.287667,88.543567,7.166667
regular,0.00005,0.0002,50.0,61.252167,89.076450,7.666667
addsays,0.00002,0.0005,-1.0,61.202667,90.920317,6.833333
addsays,0.00005,0.0001,50.0,60.993833,93.758217,11.500000
regular,0.00002,0.0005,0.0,60.984667,90.512150,7.000000


### Save the cross validation dataframe

In [9]:
df.to_csv(os.path.join(os.getenv("DATA_DIR"), "mica_text_coref/movie_coref/results/coreference/cross_val_1.csv"),
          index=False)

In [6]:
# mean per movie
df.groupby("movie").agg({"dev_lea": "mean", "train_lea": "mean", "best_epoch": "mean"})

Unnamed: 0_level_0,dev_lea,train_lea,best_epoch
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
avengers_endgame,43.708306,88.734183,8.472222
dead_poets_society,55.264519,86.943589,8.212963
john_wick,67.745519,87.336787,9.342593
prestige,55.560056,83.633989,6.314815
quiet_place,60.651444,84.949249,7.87037
zootopia,61.659074,86.566273,8.444444


### Read baseline dataframe for full-length scripts

In [7]:
baseline_df = pd.read_csv(os.path.join(os.getenv("DATA_DIR"),
                                       "mica_text_coref/movie_coref/results/coreference/full_length.baseline.tsv"),
                          sep="\t", index_col=None)
print(baseline_df.shape)
print()

for col, dtype in zip(baseline_df.columns, baseline_df.dtypes):
    print(f"{col} ({dtype})")

(254016, 14)

preprocess (object)
genre (object)
split_len (int64)
overlap_len (int64)
merge_strategy (object)
merge_speakers (bool)
entity (object)
remove_gold_singletons (bool)
provide_gold_mentions (bool)
movie (object)
metric (object)
P (float64)
R (float64)
F (float64)


In [8]:
# summary of columns
print(baseline_df["preprocess"].unique())
print(baseline_df["genre"].unique())
print(baseline_df["entity"].unique())
print(baseline_df["split_len"].unique())
print(baseline_df["overlap_len"].unique())
print(baseline_df["merge_strategy"].unique())

['nocharacters' 'addsays' 'none']
['mz' 'tc' 'wb' 'bc' 'pt' 'nw' 'bn']
['all' 'speaker' 'person']
[3072 2048 5120 4096]
[256 128 512]
['pre' 'post' 'avg' 'max' 'min' 'none']


### Find the macro baseline scores (averaged across movies)

In [9]:
macro_baseline_df = (baseline_df[(baseline_df["movie"] != "all")
                                 & ~baseline_df["remove_gold_singletons"]
                                 & ~baseline_df["provide_gold_mentions"]
                                 & (baseline_df["metric"] == "lea")]
                     .groupby(["preprocess", "entity", "genre", "split_len", "overlap_len", "merge_strategy",
                               "merge_speakers"])
                     .agg({"F": "mean"})
                     .sort_values(by="F", ascending=False))
display(macro_baseline_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,F
preprocess,entity,genre,split_len,overlap_len,merge_strategy,merge_speakers,Unnamed: 7_level_1
addsays,speaker,wb,3072,512,avg,True,59.432041
addsays,speaker,wb,3072,512,min,True,59.413063
addsays,speaker,wb,3072,512,post,True,59.381315
addsays,speaker,wb,5120,128,none,True,59.241314
addsays,speaker,tc,3072,512,avg,True,59.165600
addsays,speaker,mz,5120,128,none,True,59.056600
addsays,speaker,tc,3072,512,post,True,59.033369
addsays,speaker,wb,5120,128,max,True,58.976455
addsays,speaker,wb,3072,512,max,True,58.968410
addsays,speaker,wb,5120,128,avg,True,58.966080


In [10]:
hyperparam_to_movie_scores = {}
for (preprocess, bert_lr, model_lr, warmup), gdf in df.groupby(["preprocess", "bert_lr", "model_lr", "warmup"]):
    gdf.sort_values(by="movie", inplace=True)
    hyperparam_to_movie_scores[(preprocess, bert_lr, model_lr, warmup)] = gdf["dev_lea"].tolist()

In [12]:
hyperparams = sorted(hyperparam_to_movie_scores.keys(), key=lambda hp: np.mean(hyperparam_to_movie_scores[hp]),
                     reverse=True)
print(hyperparams[:10])

[('regular', 2e-05, 0.0002, 50.0), ('addsays', 2e-05, 0.0001, -1.0), ('addsays', 1e-05, 0.0002, -1.0), ('regular', 2e-05, 0.0002, 100.0), ('addsays', 2e-05, 0.0002, 50.0), ('addsays', 2e-05, 0.0001, 0.0), ('regular', 5e-05, 0.0002, 50.0), ('addsays', 2e-05, 0.0005, -1.0), ('addsays', 5e-05, 0.0001, 50.0), ('regular', 2e-05, 0.0005, 0.0)]


In [18]:
rows = [list(hyperparams[0]) + [np.mean(hyperparam_to_movie_scores[hyperparams[0]]), 0, 0]]
rank = 0
for i in range(len(hyperparams) - 1):
    arr1 = hyperparam_to_movie_scores[hyperparams[0]]
    arr2 = hyperparam_to_movie_scores[hyperparams[i + 1]]
    ttest_result = scipy.stats.ttest_rel(arr1, arr2, alternative="greater")
    if ttest_result.pvalue < 0.05:
        rank += 1
    rows.append(list(hyperparams[i + 1]) + [np.mean(arr2), ttest_result.pvalue, rank])
rank_df = pd.DataFrame(rows, columns=["preprocess", "bert_lr", "model_lr", "warmup", "macro_score", "pvalue", "rank"])
rank_df

Unnamed: 0,preprocess,bert_lr,model_lr,warmup,macro_score,pvalue,rank
0,regular,0.00002,0.0002,50.0,61.600000,0.000000,0
1,addsays,0.00002,0.0001,-1.0,61.592167,0.499146,0
2,addsays,0.00001,0.0002,-1.0,61.572500,0.496678,0
3,regular,0.00002,0.0002,100.0,61.405833,0.409886,0
4,addsays,0.00002,0.0002,50.0,61.316833,0.463974,0
5,addsays,0.00002,0.0001,0.0,61.287667,0.462817,0
6,regular,0.00005,0.0002,50.0,61.252167,0.416443,0
7,addsays,0.00002,0.0005,-1.0,61.202667,0.452464,0
8,addsays,0.00005,0.0001,50.0,60.993833,0.430795,0
9,regular,0.00002,0.0005,0.0,60.984667,0.287324,0


In [19]:
rank_df.to_csv(os.path.join(os.getenv("DATA_DIR"), "mica_text_coref/temp/rank.csv"), index=False)

In [26]:
collections.Counter(rank_df.loc[rank_df["rank"] == 0, "bert_lr"])
collections.Counter(rank_df.loc[rank_df["rank"] == 0, "model_lr"])

Counter({0.0002: 12, 0.0001: 9, 0.0005: 15})

In [4]:
results_dir = os.path.join(os.getenv("DATA_DIR"),
                           "mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec25-26")
header = ["weight_decay", "dropout", "movie", "dev_lea", "best_epoch"]
rows = []

for dir_ in os.listdir(results_dir):
    result_file = os.path.join(results_dir, dir_, "result.yaml")
    if os.path.exists(result_file):
        with open(result_file) as f:
            result = yaml.load(f, Loader=yaml.FullLoader)
    else:
        continue
    weight_decay = result["weight_decay"]
    dropout = result["dropout"]
    movie = result["test_movie"]
    dev_lea = result["dev_metric"]["span"]["lea"]["f1"]
    epoch = result["best_epoch"]
    rows.append([weight_decay, dropout, movie, dev_lea, epoch])

df2 = pd.DataFrame(rows, columns=header)
print(df2.shape)
print()

for col, dtype in zip(df2.columns, df2.dtypes):
    print(f"{col} ({dtype})")

(194, 5)

weight_decay (float64)
dropout (float64)
movie (object)
dev_lea (float64)
best_epoch (int64)


In [5]:
macro_df2 = df2.groupby(["weight_decay", "dropout"]).agg({"movie": "count", "dev_lea": "mean"})
macro_df2 = macro_df2[macro_df2["movie"] == 6].sort_values(by="dev_lea", ascending=False)
macro_df2

Unnamed: 0_level_0,Unnamed: 1_level_0,movie,dev_lea
weight_decay,dropout,Unnamed: 2_level_1,Unnamed: 3_level_1
0.001,0.0,6,61.6
0.01,0.0,6,61.085833
1.0,0.0,6,61.080167
0.1,0.0,6,60.1195
0.0,0.0,6,59.836667
0.0001,0.0,6,59.836667
10.0,0.0,6,59.632
10.0,0.2,6,58.728167
0.1,0.2,6,57.957
1.0,0.2,6,57.753333
