In [81]:
from mica_text_coref.coref.movie_coref import rules

import collections
import os
import yaml
import re
import pandas as pd
from scipy import stats
import numpy as np
import jsonlines
from scorch import scores

In [2]:
# Root directory of the coreference experiment outputs; the DATA_DIR prefix is
# taken from the environment.
data_dir = os.path.join(
    os.getenv("DATA_DIR"),
    "mica_text_coref/movie_coref/results/coreference",
)

# Correct result files with missing values

In [3]:
# Walk over the run directories, count how many of them contain a result.yaml,
# and gather the keys of every result file so that gaps can be detected later.
n_dirs = 0
n_result_files = 0
dir_with_no_result_files = []
keys = []

for dir in os.listdir(data_dir):
    # "baselines", "logs" and hidden entries are excluded from the tally.
    if dir in ("baselines", "logs") or dir.startswith("."):
        continue
    n_dirs += 1
    result_file = os.path.join(data_dir, dir, "result.yaml")
    if not os.path.exists(result_file):
        dir_with_no_result_files.append(dir)
        continue
    n_result_files += 1
    with open(result_file) as fr:
        result = yaml.load(fr, Loader=yaml.FullLoader)
    keys.extend(result.keys())

print(f"Number of directories = {n_dirs}")
print(f"Number of result files = {n_result_files}")
print(f"Directories with no result file = {dir_with_no_result_files}\n")

# Count in how many result files each key occurs. A key that occurs in fewer
# files than the most frequent key is recorded as missing from some runs.
key_distribution = collections.Counter(keys)
# default=0 avoids a ValueError from max() when no result file was found at all.
max_count = max(key_distribution.values(), default=0)
missing_keys = []
for key, count in key_distribution.items():
    if count < max_count:
        print(f"key={key}, occurs in {count}/{max_count} files")
        missing_keys.append(key)

# Back-fill "preprocess" and "warmup" in result files that lack any of the
# missing keys, recovering the values from the run's train.log. Every file is
# closed before the next one is opened, so result.yaml is never truncated for
# writing while it is still open for reading.
for dir in os.listdir(data_dir):
    if dir in ("baselines", "logs") or dir.startswith("."):
        continue
    result_file = os.path.join(data_dir, dir, "result.yaml")
    if not os.path.exists(result_file):
        continue

    # NOTE(review): FullLoader can construct more than plain scalars; if the
    # result files only hold plain values, yaml.safe_load would be the safer
    # choice -- confirm before switching.
    with open(result_file) as result_reader:
        result = yaml.load(result_reader, Loader=yaml.FullLoader)
    if all(key in result for key in missing_keys):
        continue

    log_file = os.path.join(data_dir, dir, "train.log")
    assert os.path.exists(log_file), dir
    with open(log_file, "r") as log_reader:
        content = log_reader.read()

    if "preprocess" not in result:
        # The preprocess name is the directory component right before
        # train.jsonlines in the logged train_file path.
        match = re.search(r"train_file\s+=\s[\w/]+/movie_coref/results/([a-z]+)/train\.jsonlines", content)
        if match:
            result["preprocess"] = match.group(1)
    if "warmup" not in result:
        match = re.search(r"warmup_epochs\s+=\s(.+)", content)
        # -1 is stored when the log has no warmup_epochs entry.
        result["warmup"] = float(match.group(1)) if match else -1

    with open(result_file, "w") as fw:
        yaml.dump(result, fw)
            

Number of directories = 220
Number of result files = 218
Directories with no result file = ['Nov25_05:11:22AM', 'Nov25_05:41:58AM']



# Create dataframe from result files

In [9]:
# Build one dataframe row per run that has a result file: the run directory
# name followed by its hyperparameters and scores.
result_columns = ["dir", "preprocess", "bert_lr", "coref_lr", "character_lr", "warmup", "weight_decay", "dropout", "epoch", "dev_score", "train_score"]
rows = []
for dir in os.listdir(data_dir):
    if dir in ("baselines", "logs") or dir.startswith("."):
        continue
    result_file = os.path.join(data_dir, dir, "result.yaml")
    if not os.path.exists(result_file):
        continue
    with open(result_file) as fr:
        result = yaml.load(fr, Loader=yaml.FullLoader)
    # Every column except the leading "dir" is read straight from the result file.
    rows.append([dir] + [result[column] for column in result_columns[1:]])
df = pd.DataFrame(rows, columns=result_columns)

# Best dev performance

In [10]:
# Run(s) reaching the highest dev score.
best_dev_score = df["dev_score"].max()
df[df["dev_score"] == best_dev_score]

Unnamed: 0,dir,preprocess,bert_lr,coref_lr,character_lr,warmup,weight_decay,dropout,epoch,dev_score,train_score
53,Nov24_11:50:00AM,regular,2e-05,0.0002,0.0002,-1.0,0.001,0.0,11,74.057,84.647


# Best train performance

In [56]:
# Run(s) reaching the highest train score.
best_train_score = df["train_score"].max()
df[df["train_score"] == best_train_score]

Unnamed: 0,dir,preprocess,bert_lr,coref_lr,character_lr,warmup,weight_decay,dropout,epoch,dev_score,train_score
160,Nov24_01:52:20PM,addsays,2e-05,0.0002,0.0002,1.0,0.0,0.0,13,71.877,92.83


# Variation around best dev performance

In [59]:
# Hyperparameters that define a configuration here (character_lr is not part
# of the comparison). Named `hparams` so the builtin `vars` is not shadowed.
hparams = ["preprocess", "bert_lr", "coref_lr", "warmup", "weight_decay", "dropout"]
best_dev_runs = df[df["dev_score"] == df["dev_score"].max()]
best_dev_vars = best_dev_runs.iloc[0].to_dict()

# For every run, count the hyperparameters whose value differs from the best
# run. The comparison is done column by column, so it does not rely on the
# index labels of `df` being usable as positions.
n_diff = sum((df[name] != best_dev_vars[name]).astype(int) for name in hparams)
# Keep the best configuration itself and every configuration one change away.
index = (n_diff <= 1).to_numpy()

display(best_dev_runs)
df[index].groupby(hparams)[["dev_score", "train_score", "epoch"]].mean()

Unnamed: 0,dir,preprocess,bert_lr,coref_lr,character_lr,warmup,weight_decay,dropout,epoch,dev_score,train_score
53,Nov24_11:50:00AM,regular,2e-05,0.0002,0.0002,-1.0,0.001,0.0,11,74.057,84.647


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,dev_score,train_score,epoch
preprocess,bert_lr,coref_lr,warmup,weight_decay,dropout,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
addsays,2e-05,0.0002,-1.0,0.001,0.0,71.737,90.157,7.0
nocharacters,2e-05,0.0002,-1.0,0.001,0.0,66.653,75.693,6.0
regular,1e-05,0.0002,-1.0,0.001,0.0,70.287,72.453,4.0
regular,2e-05,0.0001,-1.0,0.001,0.0,69.567,75.157,5.0
regular,2e-05,0.0002,-1.0,0.0,0.0,70.353,71.34,3.0
regular,2e-05,0.0002,-1.0,0.001,0.0,74.057,84.647,11.0
regular,2e-05,0.0002,-1.0,0.001,0.3,64.877,64.463,2.0
regular,2e-05,0.0002,0.0,0.001,0.0,72.25,82.84,7.0
regular,2e-05,0.0002,1.0,0.001,0.0,71.853,84.863,10.0
regular,5e-05,0.0002,-1.0,0.001,0.0,69.72,79.233,5.0


# Test preprocess

In [22]:
# Runs that share every listed hyperparameter value, so only `preprocess`
# (and character_lr, which is not filtered) can vary between the rows.
same_hparams_except_preprocess = (
    df["bert_lr"].eq(2e-5)
    & df["coref_lr"].eq(2e-4)
    & df["warmup"].eq(-1)
    & df["weight_decay"].eq(1e-3)
    & df["dropout"].eq(0)
)
df[same_hparams_except_preprocess]

Unnamed: 0,dir,preprocess,bert_lr,coref_lr,character_lr,warmup,weight_decay,dropout,epoch,dev_score,train_score
51,Nov24_07:54:17AM,nocharacters,2e-05,0.0002,0.0002,-1.0,0.001,0.0,6,66.653,75.693
53,Nov24_11:50:00AM,regular,2e-05,0.0002,0.0002,-1.0,0.001,0.0,11,74.057,84.647
212,Nov24_11:16:00AM,addsays,2e-05,0.0002,0.0002,-1.0,0.001,0.0,7,71.737,90.157


In [28]:
# Pair the mean dev scores of the three preprocess variants. A configuration of
# the remaining hyperparameters contributes one value to each list, and only
# when it has three distinct preprocess values.
regular_arr, nocharacters_arr, addsays_arr = [], [], []
preprocess_targets = (
    ("regular", regular_arr),
    ("nocharacters", nocharacters_arr),
    ("addsays", addsays_arr),
)
for _, group in df.groupby(["bert_lr", "coref_lr", "warmup", "weight_decay", "dropout"]):
    if len(group["preprocess"].unique()) != 3:
        continue
    for preprocess_name, target in preprocess_targets:
        target.append(group.loc[group["preprocess"] == preprocess_name, "dev_score"].mean())

In [32]:
# Paired t-tests between the preprocess variants, in the order
# regular/nocharacters, regular/addsays, addsays/nocharacters.
preprocess_pairs = [
    (regular_arr, nocharacters_arr),
    (regular_arr, addsays_arr),
    (addsays_arr, nocharacters_arr),
]
for first_arr, second_arr in preprocess_pairs:
    print(stats.ttest_rel(first_arr, second_arr))

Ttest_relResult(statistic=15.263022247251326, pvalue=8.270439515480995e-24)
Ttest_relResult(statistic=-2.1985598411997276, pvalue=0.03126536534398492)
Ttest_relResult(statistic=19.461034458275336, pvalue=9.916310764718426e-30)


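nocharacters < regular ≈ addsays (regular vs. addsays: t = -2.20, p = 0.031, i.e. addsays is slightly higher on average; both comparisons against nocharacters have p < 1e-23)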

# Test dropout

In [49]:
# Runs that share every listed hyperparameter value, so only `dropout`
# (and character_lr, which is not filtered) can vary between the rows.
same_hparams_except_dropout = (
    df["preprocess"].eq("regular")
    & df["bert_lr"].eq(2e-5)
    & df["coref_lr"].eq(2e-4)
    & df["warmup"].eq(-1)
    & df["weight_decay"].eq(1e-3)
)
df[same_hparams_except_dropout]

Unnamed: 0,dir,preprocess,bert_lr,coref_lr,character_lr,warmup,weight_decay,dropout,epoch,dev_score,train_score
53,Nov24_11:50:00AM,regular,2e-05,0.0002,0.0002,-1.0,0.001,0.0,11,74.057,84.647
167,Nov24_12:49:03PM,regular,2e-05,0.0002,0.0002,-1.0,0.001,0.3,2,64.877,64.463


In [39]:
# Pair the mean dev scores with and without dropout. A configuration of the
# remaining hyperparameters contributes only when it has two distinct dropout
# values.
nodropout_arr, dropout_arr = [], []
for _, group in df.groupby(["preprocess", "bert_lr", "coref_lr", "warmup", "weight_decay"]):
    if len(group["dropout"].unique()) != 2:
        continue
    for dropout_value, target in ((0, nodropout_arr), (0.3, dropout_arr)):
        target.append(group.loc[group["dropout"] == dropout_value, "dev_score"].mean())

In [40]:
# Paired t-test of the dev score: no dropout versus 0.3 dropout.
dropout_ttest = stats.ttest_rel(nodropout_arr, dropout_arr)
print(dropout_ttest)

Ttest_relResult(statistic=15.816380621388703, pvalue=1.5136944788478756e-29)


0.3 dropout < zero dropout

# Test bert_lr

In [70]:
# Paired t-tests of the dev score between the three BERT learning rates, first
# over all runs and then within each warmup setting. One loop replaces four
# copy-pasted copies of the same analysis; the printed output is unchanged.
bert_lrs = (1e-5, 2e-5, 5e-5)
# `warmup` is constant inside each per-schedule subset, so keeping it in the
# grouping keys yields the same groups as grouping without it.
bert_lr_group_keys = ["preprocess", "coref_lr", "warmup", "weight_decay", "dropout"]
bert_lr_subsets = [
    ("all schedules", df),
    ("no schedule", df[df["warmup"] == -1]),
    ("schedule with no warmup", df[df["warmup"] == 0]),
    ("schedule with 1 epoch warmup", df[df["warmup"] == 1]),
]

for position, (label, runs) in enumerate(bert_lr_subsets):
    # Mean dev score per learning rate; a configuration of the other
    # hyperparameters contributes only if it has three distinct bert_lr values.
    dev_by_bert_lr = {lr: [] for lr in bert_lrs}
    for _, _df in runs.groupby(bert_lr_group_keys):
        if _df["bert_lr"].unique().size == len(bert_lrs):
            for lr in bert_lrs:
                dev_by_bert_lr[lr].append(_df.loc[_df["bert_lr"] == lr, "dev_score"].mean())
    bert_1eminus5_arr, bert_2eminus5_arr, bert_5eminus5_arr = (dev_by_bert_lr[lr] for lr in bert_lrs)

    if position > 0:
        print()  # blank line between consecutive sections
    print(label)
    print(stats.ttest_rel(bert_1eminus5_arr, bert_2eminus5_arr))
    print(stats.ttest_rel(bert_2eminus5_arr, bert_5eminus5_arr))
    print(stats.ttest_rel(bert_5eminus5_arr, bert_1eminus5_arr))

all schedules
Ttest_relResult(statistic=-7.8190532065541305, pvalue=4.282835688950486e-11)
Ttest_relResult(statistic=3.9504100043621757, pvalue=0.00018585459464098706)
Ttest_relResult(statistic=1.5372265351848184, pvalue=0.1288111778336569)

no schedule
Ttest_relResult(statistic=-3.575218660571907, pvalue=0.0016045039622772208)
Ttest_relResult(statistic=2.2235320439592994, pvalue=0.03628478137232081)
Ttest_relResult(statistic=0.1415415007288095, pvalue=0.8886746438759797)

schedule with no warmup
Ttest_relResult(statistic=-5.799541919010412, pvalue=6.572239376443888e-06)
Ttest_relResult(statistic=2.743894121747891, pvalue=0.011564218008498238)
Ttest_relResult(statistic=0.8819148945785081, pvalue=0.38694698847148434)

schedule with 1 epoch warmup
Ttest_relResult(statistic=-4.304604688611293, pvalue=0.0003136206264990181)
Ttest_relResult(statistic=1.815614447345321, pvalue=0.08373685935248419)
Ttest_relResult(statistic=1.763896882076014, pvalue=0.09229367535305728)


for all schedules, bert_lr 2e-5 > 1e-5, 5e-5

for no schedule, bert_lr 2e-5 > 1e-5

for schedule with 0 warmup, 2e-5 > 1e-5

for schedule with 1 epoch warmup, 2e-5 > 1e-5

# Test coref/character_lr

In [69]:
# Paired t-test of the dev score between the two coref learning rates, first
# over all runs and then within each warmup setting. One loop replaces four
# copy-pasted copies of the same analysis; the printed output is unchanged.
coref_lrs = (1e-4, 2e-4)
coref_lr_group_keys = ["preprocess", "bert_lr", "warmup", "weight_decay", "dropout"]
coref_lr_subsets = [
    ("all schedules", df),
    ("no schedule", df[df["warmup"] == -1]),
    ("schedule with no warmup", df[df["warmup"] == 0]),
    ("schedule with 1 epoch warmup", df[df["warmup"] == 1]),
]

for label, runs in coref_lr_subsets:
    # Mean dev score per learning rate; a configuration of the other
    # hyperparameters contributes only if it has two distinct coref_lr values.
    dev_by_coref_lr = {lr: [] for lr in coref_lrs}
    for _, _df in runs.groupby(coref_lr_group_keys):
        if _df["coref_lr"].unique().size == len(coref_lrs):
            for lr in coref_lrs:
                dev_by_coref_lr[lr].append(_df.loc[_df["coref_lr"] == lr, "dev_score"].mean())
    coref_1eminus4_arr, coref_2eminus4_arr = (dev_by_coref_lr[lr] for lr in coref_lrs)
    print(label, stats.ttest_rel(coref_2eminus4_arr, coref_1eminus4_arr))

all schedules Ttest_relResult(statistic=-0.6535535711576492, pvalue=0.5148281437947139)
no schedule Ttest_relResult(statistic=0.044817163586460096, pvalue=0.9645077746396036)
schedule with no warmup Ttest_relResult(statistic=-0.37071766044614485, pvalue=0.7130821910911166)
schedule with 1 epoch warmup Ttest_relResult(statistic=-0.9971205120639133, pvalue=0.3259617579514006)


for all schedules, coref_lr 1e-4 = 2e-4

# Test warmup

In [72]:
# Paired t-tests of the dev score between the three warmup settings (-1, 0, 1),
# first over all runs and then for each BERT learning rate. One loop replaces
# four copy-pasted copies of the same analysis; the printed output is unchanged.
warmup_values = (-1, 0, 1)
warmup_group_keys = ["preprocess", "bert_lr", "coref_lr", "weight_decay", "dropout"]
warmup_subsets = [
    ("All learning rates", df),
    ("bert_lr = 1e-5", df[df["bert_lr"] == 1e-5]),
    ("bert_lr = 2e-5", df[df["bert_lr"] == 2e-5]),
    ("bert_lr = 5e-5", df[df["bert_lr"] == 5e-5]),
]

for position, (label, runs) in enumerate(warmup_subsets):
    # Mean dev score per warmup setting; a configuration of the other
    # hyperparameters contributes only if it has three distinct warmup values.
    dev_by_warmup = {warmup: [] for warmup in warmup_values}
    for _, _df in runs.groupby(warmup_group_keys):
        if _df["warmup"].unique().size == len(warmup_values):
            for warmup in warmup_values:
                dev_by_warmup[warmup].append(_df.loc[_df["warmup"] == warmup, "dev_score"].mean())
    noschedule_arr, schedulezerowarmup_arr, scheduleonewarmup_arr = (
        dev_by_warmup[warmup] for warmup in warmup_values
    )

    if position > 0:
        print()  # blank line between consecutive sections
    print(label)
    print(stats.ttest_rel(noschedule_arr, schedulezerowarmup_arr))
    print(stats.ttest_rel(schedulezerowarmup_arr, scheduleonewarmup_arr))
    print(stats.ttest_rel(scheduleonewarmup_arr, noschedule_arr))

All learning rates
Ttest_relResult(statistic=0.24825556530949816, pvalue=0.8046740418482301)
Ttest_relResult(statistic=1.3777702391129951, pvalue=0.17272647347293318)
Ttest_relResult(statistic=-1.9390689959803296, pvalue=0.05658330061898119)

bert_lr = 1e-5
Ttest_relResult(statistic=1.5741549216795436, pvalue=0.1291080469314441)
Ttest_relResult(statistic=1.2633848340927663, pvalue=0.21910777894732586)
Ttest_relResult(statistic=-2.7829520790948465, pvalue=0.010575865179234076)

bert_lr = 2e-5
Ttest_relResult(statistic=-0.34667574960968883, pvalue=0.7319866516876962)
Ttest_relResult(statistic=2.272780511286195, pvalue=0.0326996084395385)
Ttest_relResult(statistic=-1.8588312081624148, pvalue=0.07589421506888916)

bert_lr = 5e-5
Ttest_relResult(statistic=-0.0703899917077399, pvalue=0.9445492794941016)
Ttest_relResult(statistic=0.020829289782804426, pvalue=0.9835784781766992)
Ttest_relResult(statistic=0.059389741488134896, pvalue=0.9532032285565804)


for all learning rates, no schedule = schedule with 0 warmup = schedule with 1 epoch warmup

# Test whether removing characters from addsays/nocharacters decreases the score

In [75]:
# Load the dev documents saved by the run with the best dev score.
dev_file = os.path.join(data_dir, "Nov24_11:50:00AM", "dev.jsonlines")
with jsonlines.open(dev_file, "r") as reader:
    dev_data = list(reader)
print(len(dev_data))

3


In [88]:
# Score every dev document with scorch's conll2012 metric (scaled by 100) and
# report the mean over documents. The disabled variants below alter the
# clusters before scoring; enable at most one of them at a time.
f1s = []
for doc in dev_data:
    # Each cluster becomes a set of 2-tuples; only the first two entries of a
    # gold mention are used (presumably start/end token indices -- confirm).
    gold_clusters = [{(mention[0], mention[1]) for mention in cluster} for cluster in doc["gold"]]
    pred_clusters = [{tuple(span) for span in cluster} for cluster in doc["pred_span"]]
    parse = doc["parse"]  # only read by the disabled variants below

    # Variant (disabled): remove speakers from both gold and predicted clusters.
    # The outer index is named `k` here so that the inner span loop over
    # (i, j) does not overwrite it before the `k == 0` check.
    # for k, clusters in enumerate([gold_clusters, pred_clusters]):
    #     _clusters = []
    #     for cluster in clusters:
    #         _cluster = set()
    #         for i, j in cluster:
    #             if set(parse[i: j + 1]).pop() != "C":
    #                 _cluster.add((i, j))
    #         if _cluster:
    #             _clusters.append(_cluster)
    #     if k == 0:
    #         gold_clusters = _clusters
    #     else:
    #         pred_clusters = _clusters

    # Variant (disabled): merge speakers.
    # pred_clusters = rules.merge_speakers(doc["token"], doc["parse"], pred_clusters)

    # Variant (disabled): keep speakers.
    # pred_clusters = rules.keep_speakers(doc["parse"], pred_clusters)

    f1s.append(100 * scores.conll2012(gold_clusters, pred_clusters))
print(np.mean(f1s))

74.02178441484585
