In [1]:
import pandas as pd
import yaml
import os
import re
import numpy as np

Create cross validation dataframe for variation in

1. bert learning rate
2. model learning rate
3. preprocess
4. warmup

In [2]:
results_dir = os.path.join(os.getenv("DATA_DIR"),
                           "mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec19-21")
header = ["preprocess", "bert_lr", "model_lr", "warmup", "movie", "dev_lea", "train_lea", "best_epoch", "dev_scores",
          "train_scores"]
rows = []

for dir_ in os.listdir(results_dir):
    result_file1 = os.path.join(results_dir, dir_, "result.yaml")
    result_file2 = os.path.join(results_dir, dir_, "result2.yaml")
    if os.path.exists(result_file1):
        with open(result_file1) as f:
            result = yaml.load(f, Loader=yaml.FullLoader)
    elif os.path.exists(result_file2):
        with open(result_file2) as f:
            result = yaml.load(f, Loader=yaml.FullLoader)
    else:
        continue
    preprocess = result["preprocess"]
    bert_lr = result["bert_lr"]
    model_lr = result["coref_lr"]
    warmup = result["warmup_steps"]
    movie = result["test_movie"]
    dev_lea = result["dev_metric"]["span"]["lea"]["f1"]
    epoch = result["best_epoch"]
    dev_scores = result["dev_scores"]
    train_scores = result["train_scores"]
    train_lea = train_scores[epoch - 1]
    rows.append([preprocess, bert_lr, model_lr, warmup, movie, dev_lea, train_lea, epoch, dev_scores, train_scores])

df = pd.DataFrame(rows, columns=header)
print(df.shape)
print()

for col, dtype in zip(df.columns, df.dtypes):
    print(f"{col} ({dtype})")

(648, 10)

preprocess (object)
bert_lr (float64)
model_lr (float64)
warmup (float64)
movie (object)
dev_lea (float64)
train_lea (float64)
best_epoch (int64)
dev_scores (object)
train_scores (object)


In [3]:
df.to_csv(os.path.join(os.getenv("DATA_DIR"), "mica_text_coref/movie_coref/results/coreference/cross_val_1.csv"),
          index=False)

Create cross validation dataframe for variation in

1. weight decay
2. dropout

In [9]:
results_dir = os.path.join(os.getenv("DATA_DIR"),
                           "mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec25")
dirs_with_no_result_file = []

for dir_ in os.listdir(results_dir):
    result_file = os.path.join(results_dir, dir_, "result.yaml")
    if not os.path.exists(result_file):
        dir__ = os.path.join(results_dir, dir_)
        print(dir__)
        dirs_with_no_result_file.append(dir__)

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec25/Dec25_12:40:52PM_prestige_excerpts_20332
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec25/Dec25_12:59:59PM_quiet_place_excerpts_20768
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec25/Dec25_02:01:20PM_dead_poets_society_excerpts_22143
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec25/Dec25_12:03:21PM_dead_poets_society_excerpts_19492
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec25/Dec25_03:24:52PM_zootopia_excerpts_23961
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec25/Dec25_03:19:32PM_dead_poets_society_excerpts_16196
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec25/Dec25_05:06:05PM_quiet_place_excerpts_26186
/proj/sbaruah/data/mica_text_core

In [19]:
header = ["weight_decay", "dropout", "movie", "dev_lea", "best_epoch", "dev_scores"]
rows = []
epoch_pattern = re.compile(r"Epoch = (\d+)\n"
                           r"dev:: loss=[0-9\.]+, metric:Word=[0-9\.]+, Span=([0-9\.]+), Character=[0-9\.]+")

for dir_ in dirs_with_no_result_file:
    log_file = os.path.join(dir_, "train.log")
    with open(log_file) as f:
        content = f.read()
    match = re.search(r"weight_decay\s+= ([0-9\.]+)", content)
    assert match is not None
    weight_decay = float(match.group(1))
    match = re.search(r"dropout\s+= ([0-9\.]+)", content)
    assert match is not None
    dropout = float(match.group(1))
    match = re.search(r"test_movie\s+= (\w+)", content)
    assert match is not None
    movie = match.group(1)
    
    epochs = []
    dev_scores = []

    for match in re.finditer(epoch_pattern, content):
        epochs.append(int(match.group(1)))
        dev_scores.append(float(match.group(2)))
    
    best_epoch_index = np.argmax(dev_scores).item()
    max_dev_score = dev_scores[best_epoch_index]
    best_epoch = epochs[best_epoch_index]

    print(dir_)
    print(weight_decay, dropout, movie, max_dev_score, best_epoch, dev_scores)
    rows.append([weight_decay, dropout, movie, max_dev_score, best_epoch, dev_scores])
    print()

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec25/Dec25_12:40:52PM_prestige_excerpts_20332
100.0 0.0 prestige 46.4 1 [46.4, 38.5, 17.0, 5.0, 1.8, 6.4]

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec25/Dec25_12:59:59PM_quiet_place_excerpts_20768
100.0 0.0 quiet_place 55.3 2 [41.0, 55.3, 32.7, 6.3, 0.0, 0.1, 0.0]

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec25/Dec25_02:01:20PM_dead_poets_society_excerpts_22143
100.0 0.2 dead_poets_society 41.9 1 [41.9, 28.2, 23.0, 3.4, 4.6, 4.3]

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec25/Dec25_12:03:21PM_dead_poets_society_excerpts_19492
100.0 0.0 dead_poets_society 37.9 1 [37.9, 0.9, 11.9, 3.7, 2.3, 1.7]

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec25/Dec25_03:24:52PM_zootopia_excerpts_23961
100.0 0.2 zootopia 61.3 1 [61.3, 57.1, 21

In [20]:
results_dir = os.path.join(os.getenv("DATA_DIR"),
                           "mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec25")

for dir_ in os.listdir(results_dir):
    result_file = os.path.join(results_dir, dir_, "result.yaml")
    if os.path.exists(result_file):
        with open(result_file) as f:
            result = yaml.load(f, Loader=yaml.FullLoader)
    else:
        continue
    weight_decay = result["weight_decay"]
    dropout = result["dropout"]
    movie = result["test_movie"]
    dev_lea = result["dev_metric"]["span"]["lea"]["f1"]
    epoch = result["best_epoch"]
    dev_scores = result["dev_scores"]
    rows.append([weight_decay, dropout, movie, dev_lea, epoch, dev_scores])

df = pd.DataFrame(rows, columns=header)
print(df.shape)
print()

for col, dtype in zip(df.columns, df.dtypes):
    print(f"{col} ({dtype})")

(240, 6)

weight_decay (float64)
dropout (float64)
movie (object)
dev_lea (float64)
best_epoch (int64)
dev_scores (object)


In [21]:
df.to_csv(os.path.join(os.getenv("DATA_DIR"), "mica_text_coref/movie_coref/results/coreference/cross_val_2.csv"),
          index=False)

Create cross validation dataframe for variation in

1. dev document len
2. dev overlap len

In [22]:
results_dir = os.path.join(os.getenv("DATA_DIR"),
                           "mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec26")
dirs_with_no_result_file = []

for dir_ in os.listdir(results_dir):
    result_file = os.path.join(results_dir, dir_, "result.yaml")
    if not os.path.exists(result_file):
        dir__ = os.path.join(results_dir, dir_)
        print(dir__)
        dirs_with_no_result_file.append(dir__)

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec26/Dec26_07:56:43AM_dead_poets_society_excerpts_21719
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec26/Dec26_05:13:00AM_avengers_endgame_excerpts_18334
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec26/Dec26_04:04:30PM_quiet_place_excerpts_31725
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec26/Dec26_11:11:20AM_john_wick_excerpts_25606
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec26/Dec26_07:14:21PM_zootopia_excerpts_3041
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec26/Dec26_01:33:32PM_prestige_excerpts_28638


In [25]:
results_dir = os.path.join(os.getenv("DATA_DIR"),
                           "mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec26")
header = ["dev_document_len", "dev_overlap_len", "movie", "dev_lea", "best_epoch", "dev_scores"]
rows = []

for dir_ in os.listdir(results_dir):
    result_file = os.path.join(results_dir, dir_, "result.yaml")
    if os.path.exists(result_file):
        with open(result_file) as f:
            result = yaml.load(f, Loader=yaml.FullLoader)
    else:
        continue
    dev_document_len = result["dev_document_len"]
    dev_overlap_len = result["dev_overlap_len"]
    movie = result["test_movie"]
    dev_lea = result["dev_metric"]["span"]["lea"]["f1"]
    epoch = result["best_epoch"]
    dev_scores = result["dev_scores"]
    rows.append([dev_document_len, dev_overlap_len, movie, dev_lea, epoch, dev_scores])

df = pd.DataFrame(rows, columns=header)
print(df.shape)
print()

for col, dtype in zip(df.columns, df.dtypes):
    print(f"{col} ({dtype})")

(204, 6)

dev_document_len (int64)
dev_overlap_len (int64)
movie (object)
dev_lea (float64)
best_epoch (int64)
dev_scores (object)


In [26]:
df.to_csv(os.path.join(os.getenv("DATA_DIR"), "mica_text_coref/movie_coref/results/coreference/cross_val_3.csv"),
          index=False)

Create cross validation dataframe for variation in (second run)

1. dev document len
2. dev overlap len

In [1]:
results_dir = os.path.join(os.getenv("DATA_DIR"),
                           "mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec28")
dirs_with_no_result_file = []

for dir_ in os.listdir(results_dir):
    result_file = os.path.join(results_dir, dir_, "result.yaml")
    if not os.path.exists(result_file):
        dir__ = os.path.join(results_dir, dir_)
        print(dir__)
        dirs_with_no_result_file.append(dir__)

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec28/Dec28_07:17:56AM_prestige_excerpts_25218
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec28/Dec28_04:34:13AM_prestige_excerpts_21742
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec28/Dec28_12:39:38AM_avengers_endgame_excerpts_16828
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec28/Dec28_03:17:09AM_quiet_place_excerpts_13198
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec28/Dec28_06:38:41AM_quiet_place_excerpts_24463
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec28/Dec28_08:00:39AM_zootopia_excerpts_26199
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec28/Dec28_02:08:56AM_prestige_excerpts_11679
/proj/sbaruah/data/mica_text_coref/movie_coref/results/

In [4]:
results_dir = os.path.join(os.getenv("DATA_DIR"),
                           "mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec28")
header = ["dev_document_len", "dev_overlap_len", "movie", "dev_lea", "best_epoch", "dev_scores"]
rows = []

for dir_ in os.listdir(results_dir):
    result_file = os.path.join(results_dir, dir_, "result.yaml")
    if os.path.exists(result_file):
        with open(result_file) as f:
            result = yaml.load(f, Loader=yaml.FullLoader)
    else:
        continue
    dev_document_len = result["dev_document_len"]
    dev_overlap_len = result["dev_overlap_len"]
    movie = result["test_movie"]
    dev_lea = result["dev_metric"]["span"]["lea"]["f1"]
    epoch = result["best_epoch"]
    dev_scores = result["dev_scores"]
    rows.append([dev_document_len, dev_overlap_len, movie, dev_lea, epoch, dev_scores])

df = pd.DataFrame(rows, columns=header)
print(df.shape)
print()

for col, dtype in zip(df.columns, df.dtypes):
    print(f"{col} ({dtype})")

(60, 6)

dev_document_len (int64)
dev_overlap_len (int64)
movie (object)
dev_lea (float64)
best_epoch (int64)
dev_scores (object)


In [5]:
macro_df = df.groupby(["dev_document_len", "dev_overlap_len"]).agg({"dev_lea": "mean", "movie": "count",
                                                                    "best_epoch": "mean"})
macro_df = macro_df[macro_df["movie"] == 6]
macro_df

Unnamed: 0_level_0,Unnamed: 1_level_0,dev_lea,movie,best_epoch
dev_document_len,dev_overlap_len,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5120,2048,75.227333,6,10.0
8192,2048,76.658,6,8.166667
8192,3072,77.310667,6,6.666667
10240,2048,76.955833,6,10.5
10240,3072,78.889333,6,7.666667
10240,4096,79.530167,6,7.666667
20480,2048,78.874833,6,8.666667
20480,3072,79.197833,6,7.5
20480,4096,79.5035,6,8.333333
20480,5120,79.380667,6,8.0


In [6]:
old_df = pd.read_csv(os.path.join(os.getenv("DATA_DIR"),
                                  "mica_text_coref/movie_coref/results/coreference/cross_val_3.csv"), index_col=None)
new_df = pd.concat([old_df, df])
new_df.to_csv(os.path.join(os.getenv("DATA_DIR"), "mica_text_coref/movie_coref/results/coreference/cross_val_4.csv"),
              index=False)

Create cross validation data frame for hierarchical model
Variation in dev document len

In [2]:
results_dir = os.path.join(os.getenv("DATA_DIR"),
                           "mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec29")
dirs_with_no_result_file = []

for dir_ in os.listdir(results_dir):
    result_file = os.path.join(results_dir, dir_, "result.yaml")
    if not os.path.exists(result_file):
        dir__ = os.path.join(results_dir, dir_)
        print(dir__)
        dirs_with_no_result_file.append(dir__)
print()
print(f"{len(dirs_with_no_result_file)} directories have no result files")

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec29/Dec30_01:03:59AM_avengers_endgame_excerpts_hi_21693
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec29/Dec29_11:42:25PM_dead_poets_society_excerpts_hi_2374
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec29/Dec29_09:20:40PM_dead_poets_society_excerpts_hi_17243
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec29/Dec30_02:07:39AM_avengers_endgame_excerpts_hi_21361
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec29/Dec30_02:20:24AM_avengers_endgame_excerpts_hi_12698
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec29/Dec30_08:18:30AM_avengers_endgame_excerpts_hi_12692
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec29/Dec30_09:40:51AM_ave

In [5]:
for dir_ in dirs_with_no_result_file:
    log_file = os.path.join(dir_, "train.log")
    with open(log_file) as fr:
        content = fr.read()
    print(dir_)
    print(content.rstrip().endswith("Testing"))
    print("")

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec29/Dec30_01:03:59AM_avengers_endgame_excerpts_hi_21693
True

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec29/Dec29_11:42:25PM_dead_poets_society_excerpts_hi_2374
True

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec29/Dec29_09:20:40PM_dead_poets_society_excerpts_hi_17243
True

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec29/Dec30_02:07:39AM_avengers_endgame_excerpts_hi_21361
False

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec29/Dec30_02:20:24AM_avengers_endgame_excerpts_hi_12698
False

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec29/Dec30_08:18:30AM_avengers_endgame_excerpts_hi_12692
False

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val

In [11]:
results_dir = os.path.join(os.getenv("DATA_DIR"),
                           "mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec29")
epoch_pattern = re.compile(r"Epoch = (\d+)\n"
                           r"dev:: loss=[0-9\.]+, metric:Word=[0-9\.]+, Span=([0-9\.]+), Character=[0-9\.]+\n"
                           r"([0-9\.]+) GB GPU memory allocated during dev inference")
header = ["dev_document_len", "repk", "movie", "dev_lea", "best_epoch", "dev_scores", "dev_gpu_memory"]
rows = []

def search_regex(pattern, content):
    match = re.search(pattern, content)
    assert match is not None
    return match.group(1)

for dir_ in os.listdir(results_dir):
    result_file = os.path.join(results_dir, dir_, "result.yaml")
    log_file = os.path.join(results_dir, dir_, "train.log")
    with open(log_file) as fr:
        content = fr.read()
    if os.path.exists(result_file):
        with open(result_file) as f:
            result = yaml.load(f, Loader=yaml.FullLoader)
        dev_document_len = result["dev_document_len"]
        movie = result["test_movie"]
        dev_lea = result["dev_metric"]["span"]["lea"]["f1"]
        epoch = result["best_epoch"]
        dev_scores = result["dev_scores"]
        dev_gpu_memory = result["dev_gpu_memory"][epoch - 1]
    else:
        if re.search(r"\nTesting", content) is not None:
            dev_document_len = int(search_regex(r"dev_document_len\s+= (\d+)", content))
            movie = search_regex(r"test_movie\s+= (\w+)", content)
            dev_scores, epochs, dev_gpu_memory = [], [], []
            for match in re.finditer(epoch_pattern, content):
                dev_scores.append(float(match.group(2)))
                epochs.append(int(match.group(1)))
                dev_gpu_memory.append(float(match.group(3)))
            i = np.argmax(dev_scores)
            dev_lea = dev_scores[i]
            epoch = epochs[i]
            dev_gpu_memory = dev_gpu_memory[i]
        else:
            continue
    repk = int(search_regex(r"n_representative_mentions\s+= (\d+)", content))
    rows.append([dev_document_len, repk, movie, dev_lea, epoch, dev_scores, dev_gpu_memory])

df = pd.DataFrame(rows, columns=header)
print(df.shape)
print()

for col, dtype in zip(df.columns, df.dtypes):
    print(f"{col} ({dtype})")

(165, 7)

dev_document_len (int64)
repk (int64)
movie (object)
dev_lea (float64)
best_epoch (int64)
dev_scores (object)
dev_gpu_memory (float64)


In [12]:
macro_df = df.groupby(["dev_document_len", "repk"]).agg({"dev_lea": "mean", "movie": "count", "best_epoch": "mean",
                                                 "dev_gpu_memory": "mean"})
macro_df = macro_df[macro_df["movie"] == 6]
macro_df

Unnamed: 0_level_0,Unnamed: 1_level_0,dev_lea,movie,best_epoch,dev_gpu_memory
dev_document_len,repk,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2048,1,65.067333,6,9.333333,8.86
2048,2,67.722833,6,8.0,12.94
2048,3,68.848333,6,7.833333,19.486667
3072,1,66.0215,6,7.666667,8.533333
3072,2,68.347333,6,9.0,11.503333
3072,3,70.927333,6,8.666667,16.506667
4096,1,67.065333,6,9.166667,8.381667
4096,2,70.519833,6,10.5,10.873333
4096,3,70.619833,6,7.5,15.518333
5120,1,61.7085,6,6.166667,8.405


In [16]:
df.to_csv(os.path.join(os.getenv("DATA_DIR"), "mica_text_coref/movie_coref/results/coreference/cross_val_5.csv"),
          index=False)

Create cross validation data frame for hierarchical model
Variation in dev document len (2nd run)

In [37]:
results_dir = os.path.join(os.getenv("DATA_DIR"),
                           "mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec31")
dirs_with_no_result_file = []

for dir_ in os.listdir(results_dir):
    result_file = os.path.join(results_dir, dir_, "result.yaml")
    if not os.path.exists(result_file):
        dir__ = os.path.join(results_dir, dir_)
        print(dir__)
        dirs_with_no_result_file.append(dir__)
print()
print(f"{len(dirs_with_no_result_file)} directories have no result files")

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec31/Dec31_11:59:20AM_avengers_endgame_excerpts_hi_19148
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec31/Dec31_04:00:25PM_avengers_endgame_excerpts_hi_18756
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec31/Dec31_11:59:32AM_avengers_endgame_excerpts_hi_13795
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec31/Dec31_03:24:19PM_avengers_endgame_excerpts_hi_17946
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec31/Dec31_11:59:32AM_avengers_endgame_excerpts_hi_13768
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec31/Dec31_03:50:53PM_avengers_endgame_excerpts_hi_23712
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec31/Dec31_03:37:40PM_avenge

In [39]:
for dir_ in dirs_with_no_result_file:
    log_file = os.path.join(dir_, "train.log")
    with open(log_file) as fr:
        content = fr.read()
    print(dir_)
    print(content.rstrip().endswith("Testing"))
    # print(content)
    print("")

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec31/Dec31_11:59:20AM_avengers_endgame_excerpts_hi_19148
False

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec31/Dec31_04:00:25PM_avengers_endgame_excerpts_hi_18756
False

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec31/Dec31_11:59:32AM_avengers_endgame_excerpts_hi_13795
False

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec31/Dec31_03:24:19PM_avengers_endgame_excerpts_hi_17946
False

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec31/Dec31_11:59:32AM_avengers_endgame_excerpts_hi_13768
False

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec31/Dec31_03:50:53PM_avengers_endgame_excerpts_hi_23712
False

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val

In [40]:
results_dir = os.path.join(os.getenv("DATA_DIR"),
                           "mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_hi_Dec31")
epoch_pattern = re.compile(r"Epoch = (\d+)\n"
                           r"dev:: loss=[0-9\.]+, metric:Word=[0-9\.]+, Span=([0-9\.]+), Character=[0-9\.]+\n"
                           r"([0-9\.]+) GB GPU memory allocated during dev inference")
header = ["dev_document_len", "repk", "movie", "dev_lea", "best_epoch", "dev_scores", "dev_gpu_memory"]
rows = []

def search_regex(pattern, content):
    match = re.search(pattern, content)
    assert match is not None
    return match.group(1)

for dir_ in os.listdir(results_dir):
    result_file = os.path.join(results_dir, dir_, "result.yaml")
    log_file = os.path.join(results_dir, dir_, "train.log")
    with open(log_file) as fr:
        content = fr.read()
    if os.path.exists(result_file):
        with open(result_file) as f:
            result = yaml.load(f, Loader=yaml.FullLoader)
        dev_document_len = result["dev_document_len"]
        repk = result["repk"]
        movie = result["test_movie"]
        dev_lea = result["dev_metric"]["span"]["lea"]["f1"]
        epoch = result["best_epoch"]
        dev_scores = result["dev_scores"]
        dev_gpu_memory = result["dev_gpu_memory"][epoch - 1]
    else:
        if re.search(r"\nTesting", content) is not None:
            dev_document_len = int(search_regex(r"dev_document_len\s+= (\d+)", content))
            repk = int(search_regex(r"n_representative_mentions\s+= (\d+)", content))
            movie = search_regex(r"test_movie\s+= (\w+)", content)
            dev_scores, epochs, dev_gpu_memory = [], [], []
            for match in re.finditer(epoch_pattern, content):
                dev_scores.append(float(match.group(2)))
                epochs.append(int(match.group(1)))
                dev_gpu_memory.append(float(match.group(3)))
            i = np.argmax(dev_scores)
            dev_lea = dev_scores[i]
            epoch = epochs[i]
            dev_gpu_memory = dev_gpu_memory[i]
        else:
            continue
    rows.append([dev_document_len, repk, movie, dev_lea, epoch, dev_scores, dev_gpu_memory])

df = pd.DataFrame(rows, columns=header)
print(df.shape)
print()

for col, dtype in zip(df.columns, df.dtypes):
    print(f"{col} ({dtype})")

(41, 7)

dev_document_len (int64)
repk (int64)
movie (object)
dev_lea (float64)
best_epoch (int64)
dev_scores (object)
dev_gpu_memory (float64)


In [41]:
df[df["movie"] == "avengers_endgame"]

Unnamed: 0,dev_document_len,repk,movie,dev_lea,best_epoch,dev_scores,dev_gpu_memory
4,20480,5,avengers_endgame,67.904,8,"[54.7656, 55.5278, 30.723, 47.3549, 47.2504, 5...",26.56


In [9]:
macro_df = df.groupby(["dev_document_len", "repk"]).agg({"dev_lea": "mean", "movie": "count", "best_epoch": "mean",
                                                 "dev_gpu_memory": "mean"})
# macro_df = macro_df[macro_df["movie"] == 6]
macro_df

Unnamed: 0_level_0,Unnamed: 1_level_0,dev_lea,movie,best_epoch,dev_gpu_memory
dev_document_len,repk,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5120,5,78.1036,5,10.6,17.25
5120,6,77.8622,5,9.2,21.184
8192,5,81.1418,5,11.6,14.296
8192,6,81.2606,5,12.2,17.03
10240,5,78.5274,5,9.2,15.268
10240,6,79.1442,5,8.4,18.68
20480,5,76.946667,6,8.333333,17.008333
20480,6,79.4044,5,10.0,16.754


In [None]:
df.to_csv(os.path.join(os.getenv("DATA_DIR"), "mica_text_coref/movie_coref/results/coreference/cross_val_5.csv"),
          index=False)

Create cross validation dataframe for the less important hyperparameters

In [2]:
results_dir = os.path.join(os.getenv("DATA_DIR"),
                           "mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec30")
dirs_with_no_result_file = []

for dir_ in os.listdir(results_dir):
    result_file = os.path.join(results_dir, dir_, "result.yaml")
    if not os.path.exists(result_file):
        dir__ = os.path.join(results_dir, dir_)
        print(dir__)
        dirs_with_no_result_file.append(dir__)
print()
print(f"{len(dirs_with_no_result_file)} directories have no result files")

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec30/Dec30_06:55:20AM_avengers_endgame_excerpts_17977
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec30/Dec30_02:20:25PM_dead_poets_society_excerpts_12519
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec30/Dec31_11:55:13PM_zootopia_excerpts_27639
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec30/Dec30_11:03:52AM_dead_poets_society_excerpts_16957
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec30/Dec30_05:53:44AM_avengers_endgame_excerpts_2365
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec30/Dec31_06:24:50AM_prestige_excerpts_7317
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec30/Dec31_10:38:15PM_zootopia_excerpts_26231
/proj/sbaruah/data/mica_text_coref/

In [3]:
for dir_ in dirs_with_no_result_file:
    log_file = os.path.join(dir_, "train.log")
    with open(log_file) as fr:
        content = fr.read()
    if not content.rstrip().endswith("Testing"):
        print(dir_)
        # print(content)
        print("")

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec30/Jan01_02:45:35AM_zootopia_excerpts_31029

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec30/Jan01_02:08:42AM_zootopia_excerpts_21677

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec30/Jan01_02:31:54AM_zootopia_excerpts_30742

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec30/Jan01_02:33:48AM_zootopia_excerpts_22159

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec30/Jan01_02:29:51AM_quiet_place_excerpts_3742



In [4]:
results_dir = os.path.join(os.getenv("DATA_DIR"),
                           "mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Dec30")
epoch_pattern = re.compile(r"Epoch = (\d+)\n"
                           r"dev:: loss=[0-9\.]+, metric:Word=[0-9\.]+, Span=([0-9\.]+), Character=[0-9\.]+")
header = ["dev_document_len", "dev_overlap_len", "merge_strategy", "load_bert", "freeze_bert", "add_cr_to_coarse",
          "filter_by_cr", "remove_singleton", "movie", "dev_lea", "best_epoch"]
rows = []

def search_regex(pattern, content):
    match = re.search(pattern, content)
    assert match is not None
    return match.group(1)

for dir_ in os.listdir(results_dir):
    result_file = os.path.join(results_dir, dir_, "result.yaml")
    log_file = os.path.join(results_dir, dir_, "train.log")
    with open(log_file) as fr:
        content = fr.read()
    if os.path.exists(result_file):
        with open(result_file) as f:
            result = yaml.load(f, Loader=yaml.FullLoader)
        dev_document_len = result["dev_document_len"]
        dev_overlap_len = result["dev_overlap_len"]
        merge_strategy = result["dev_merge_strategy"]
        freeze_bert = result["freeze_bert"]
        add_cr_to_coarse = result["add_cr_to_coarse"]
        filter_by_cr = result["filter_mentions_by_cr"]
        remove_singleton = result["remove_singleton_cr"]
        movie = result["test_movie"]
        dev_lea = result["dev_metric"]["span"]["lea"]["f1"]
        epoch = result["best_epoch"]
    else:
        if re.search(r"\nTesting", content) is not None:
            dev_document_len = int(search_regex(r"dev_document_len\s+= (\d+)", content))
            dev_overlap_len = int(search_regex(r"dev_overlap_len\s+= (\d+)", content))
            merge_strategy = search_regex(r"dev_merge_strategy\s+= (\w+)", content)
            freeze_bert = search_regex(r"freeze_bert\s+= (\w+)", content) == "True"
            add_cr_to_coarse = search_regex(r"add_cr_to_coarse\s+= (\w+)", content) == "True"
            filter_by_cr = search_regex(r"filter_mentions_by_cr\s+= (\w+)", content) == "True"
            remove_singleton = search_regex(r"remove_singleton_cr\s+= (\w+)", content) == "True"
            movie = search_regex(r"test_movie\s+= (\w+)", content)
            dev_scores, epochs = [], []
            for match in re.finditer(epoch_pattern, content):
                dev_scores.append(float(match.group(2)))
                epochs.append(int(match.group(1)))
            i = np.argmax(dev_scores)
            dev_lea = dev_scores[i]
            epoch = epochs[i]
        else:
            continue
    load_bert = search_regex(r"load_bert\s+= (\w+)", content) == "True"
    rows.append([dev_document_len, dev_overlap_len, merge_strategy, load_bert, freeze_bert, add_cr_to_coarse,
                 filter_by_cr, remove_singleton, movie, dev_lea, epoch])

df = pd.DataFrame(rows, columns=header)
print(df.shape)
print()

for col, dtype in zip(df.columns, df.dtypes):
    print(f"{col} ({dtype})")

(306, 11)

dev_document_len (int64)
dev_overlap_len (int64)
merge_strategy (object)
load_bert (bool)
freeze_bert (bool)
add_cr_to_coarse (bool)
filter_by_cr (bool)
remove_singleton (bool)
movie (object)
dev_lea (float64)
best_epoch (int64)


In [32]:
df_ = df.copy()
df_["movie_"] = df["movie"].str.slice(0, 1)
macro_df = (df_.groupby(["dev_document_len", "dev_overlap_len", "load_bert", "freeze_bert",
                        "add_cr_to_coarse", "filter_by_cr", "remove_singleton", "merge_strategy"])
               .agg({"dev_lea": "mean", "movie_": ["count", ",".join], "best_epoch": "mean"}))
# macro_df = macro_df[macro_df["movie"] == 6]
pd.set_option("display.max_rows", 100)
macro_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,dev_lea,movie_,movie_,best_epoch
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,mean,count,join,mean
dev_document_len,dev_overlap_len,load_bert,freeze_bert,add_cr_to_coarse,filter_by_cr,remove_singleton,merge_strategy,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
5120,512,False,False,True,False,True,avg,54.886167,6,"q,z,p,d,a,j",18.333333
5120,512,True,False,False,False,True,avg,60.1705,6,"z,p,d,q,a,j",7.0
5120,512,True,False,True,False,False,avg,16.933333,6,"z,d,q,p,a,j",9.333333
5120,512,True,False,True,False,True,avg,58.4578,5,"j,p,a,d,q",10.0
5120,512,True,False,True,False,True,max,60.500333,6,"d,z,p,q,j,a",10.333333
5120,512,True,False,True,False,True,min,59.871833,6,"p,j,a,q,d,z",6.666667
5120,512,True,False,True,False,True,none,60.0165,6,"z,d,q,a,j,p",7.166667
5120,512,True,False,True,False,True,post,59.878,6,"p,z,j,d,q,a",6.666667
5120,512,True,False,True,False,True,pre,60.295333,6,"q,a,p,z,d,j",9.166667
5120,512,True,False,True,True,True,avg,60.697167,6,"j,q,d,a,z,p",10.166667


In [36]:
for index, row in macro_df.iterrows():
     if row["movie_"]["count"] < 6:
          missing_movies = set(["a","p","q","z","j","d"]).difference(set(row["movie_"]["join"].split(",")))
          # print(index, missing_movies)
          (dev_document_len, dev_overlap_len, load_bert, freeze_bert, add_cr_to_coarse, filter_by_cr,
           remove_singleton, merge_strategy) = index
          load_bert_ = "" if load_bert else "no"
          freeze_bert_ = "" if freeze_bert else "no"
          add_cr_to_coarse_ = "" if add_cr_to_coarse else "no"
          filter_by_cr_ = "" if filter_by_cr else "no"
          remove_singleton_ = "" if remove_singleton else "no"
          for movie in missing_movies:
               if movie == "z":
                    movie_ = "zootopia"
               elif movie == "q":
                    movie_ = "quiet_place"
               else: assert False
               print(f"$py --dev_document_len={dev_document_len} --dev_overlap_len={dev_overlap_len} "
                    f"--dev_merge_strategy={merge_strategy} --test_document_lens={dev_document_len} "
                    f"--test_overlap_lens={dev_overlap_len} --test_merge_strategies={merge_strategy} "
                    f"--{load_bert_}load_bert --{freeze_bert_}freeze_bert --{add_cr_to_coarse_}add_cr_to_coarse "
                    f"--{filter_by_cr_}filter_by_cr --{remove_singleton_}remove_singleton_cr "
                    f"--test_movie={movie_}")

$py --dev_document_len=5120 --dev_overlap_len=512 --dev_merge_strategy=avg --test_document_lens=5120 --test_overlap_lens=512 --test_merge_strategies=avg --load_bert --nofreeze_bert --add_cr_to_coarse --nofilter_by_cr --remove_singleton_cr --test_movie=zootopia
$py --dev_document_len=5120 --dev_overlap_len=1024 --dev_merge_strategy=avg --test_document_lens=5120 --test_overlap_lens=1024 --test_merge_strategies=avg --load_bert --nofreeze_bert --add_cr_to_coarse --nofilter_by_cr --remove_singleton_cr --test_movie=zootopia
$py --dev_document_len=5120 --dev_overlap_len=1024 --dev_merge_strategy=max --test_document_lens=5120 --test_overlap_lens=1024 --test_merge_strategies=max --load_bert --nofreeze_bert --add_cr_to_coarse --nofilter_by_cr --remove_singleton_cr --test_movie=zootopia
$py --dev_document_len=5120 --dev_overlap_len=1024 --dev_merge_strategy=min --test_document_lens=5120 --test_overlap_lens=1024 --test_merge_strategies=min --load_bert --nofreeze_bert --add_cr_to_coarse --nofilter_

In [5]:
results_dir = os.path.join(os.getenv("DATA_DIR"),
                           "mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Jan01")
dirs_with_no_result_file = []

for dir_ in os.listdir(results_dir):
    result_file = os.path.join(results_dir, dir_, "result.yaml")
    if not os.path.exists(result_file):
        dir__ = os.path.join(results_dir, dir_)
        print(dir__)
        dirs_with_no_result_file.append(dir__)
print()
print(f"{len(dirs_with_no_result_file)} directories have no result files")

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Jan01/Jan02_05:26:39PM_zootopia_excerpts_15084
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Jan01/Jan02_07:16:21AM_zootopia_excerpts_3127

2 directories have no result files


In [6]:
for dir_ in dirs_with_no_result_file:
    log_file = os.path.join(dir_, "train.log")
    with open(log_file) as fr:
        content = fr.read()
    if not content.rstrip().endswith("Testing"):
        print(dir_)
        # print(content)
        print("")

In [7]:
results_dir = os.path.join(os.getenv("DATA_DIR"),
                           "mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Jan01")
epoch_pattern = re.compile(r"Epoch = (\d+)\n"
                           r"dev:: loss=[0-9\.]+, metric:Word=[0-9\.]+, Span=([0-9\.]+), Character=[0-9\.]+")
header = ["dev_document_len", "dev_overlap_len", "merge_strategy", "load_bert", "freeze_bert", "add_cr_to_coarse",
          "filter_by_cr", "remove_singleton", "movie", "dev_lea", "best_epoch"]
rows = []

def search_regex(pattern, content):
    match = re.search(pattern, content)
    assert match is not None
    return match.group(1)

for dir_ in os.listdir(results_dir):
    result_file = os.path.join(results_dir, dir_, "result.yaml")
    log_file = os.path.join(results_dir, dir_, "train.log")
    with open(log_file) as fr:
        content = fr.read()
    if os.path.exists(result_file):
        with open(result_file) as f:
            result = yaml.load(f, Loader=yaml.FullLoader)
        dev_document_len = result["dev_document_len"]
        dev_overlap_len = result["dev_overlap_len"]
        merge_strategy = result["dev_merge_strategy"]
        freeze_bert = result["freeze_bert"]
        add_cr_to_coarse = result["add_cr_to_coarse"]
        filter_by_cr = result["filter_mentions_by_cr"]
        remove_singleton = result["remove_singleton_cr"]
        movie = result["test_movie"]
        dev_lea = result["dev_metric"]["span"]["lea"]["f1"]
        epoch = result["best_epoch"]
    else:
        if re.search(r"\nTesting", content) is not None:
            dev_document_len = int(search_regex(r"dev_document_len\s+= (\d+)", content))
            dev_overlap_len = int(search_regex(r"dev_overlap_len\s+= (\d+)", content))
            merge_strategy = search_regex(r"dev_merge_strategy\s+= (\w+)", content)
            freeze_bert = search_regex(r"freeze_bert\s+= (\w+)", content) == "True"
            add_cr_to_coarse = search_regex(r"add_cr_to_coarse\s+= (\w+)", content) == "True"
            filter_by_cr = search_regex(r"filter_mentions_by_cr\s+= (\w+)", content) == "True"
            remove_singleton = search_regex(r"remove_singleton_cr\s+= (\w+)", content) == "True"
            movie = search_regex(r"test_movie\s+= (\w+)", content)
            dev_scores, epochs = [], []
            for match in re.finditer(epoch_pattern, content):
                dev_scores.append(float(match.group(2)))
                epochs.append(int(match.group(1)))
            i = np.argmax(dev_scores)
            dev_lea = dev_scores[i]
            epoch = epochs[i]
        else:
            continue
    load_bert = search_regex(r"load_bert\s+= (\w+)", content) == "True"
    rows.append([dev_document_len, dev_overlap_len, merge_strategy, load_bert, freeze_bert, add_cr_to_coarse,
                 filter_by_cr, remove_singleton, movie, dev_lea, epoch])

df1 = pd.DataFrame(rows, columns=header)
print(df1.shape)
print()

for col, dtype in zip(df1.columns, df1.dtypes):
    print(f"{col} ({dtype})")

(24, 11)

dev_document_len (int64)
dev_overlap_len (int64)
merge_strategy (object)
load_bert (bool)
freeze_bert (bool)
add_cr_to_coarse (bool)
filter_by_cr (bool)
remove_singleton (bool)
movie (object)
dev_lea (float64)
best_epoch (int64)


In [8]:
df2 = pd.concat([df, df1])

In [9]:
print(df2.shape)
for col, dtype in zip(df2.columns, df2.dtypes):
    print(f"{col} ({dtype})")

(330, 11)
dev_document_len (int64)
dev_overlap_len (int64)
merge_strategy (object)
load_bert (bool)
freeze_bert (bool)
add_cr_to_coarse (bool)
filter_by_cr (bool)
remove_singleton (bool)
movie (object)
dev_lea (float64)
best_epoch (int64)


In [10]:
df_ = df2.copy()
df_["movie_"] = df["movie"].str.slice(0, 1)
macro_df = (df_.groupby(["dev_document_len", "dev_overlap_len", "load_bert", "freeze_bert",
                        "add_cr_to_coarse", "filter_by_cr", "remove_singleton", "merge_strategy"])
               .agg({"dev_lea": "mean", "movie_": ["count", ",".join], "best_epoch": "mean"}))
# macro_df = macro_df[macro_df["movie"] == 6]
pd.set_option("display.max_rows", 100)
macro_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,dev_lea,movie_,movie_,best_epoch
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,mean,count,join,mean
dev_document_len,dev_overlap_len,load_bert,freeze_bert,add_cr_to_coarse,filter_by_cr,remove_singleton,merge_strategy,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
5120,512,False,False,True,False,True,avg,54.886167,6,"q,z,p,d,a,j",18.333333
5120,512,True,False,False,False,True,avg,60.1705,6,"z,p,d,q,a,j",7.0
5120,512,True,False,True,False,False,avg,16.933333,6,"z,d,q,p,a,j",9.333333
5120,512,True,False,True,False,True,avg,60.682,6,"j,p,a,d,q,q",10.166667
5120,512,True,False,True,False,True,max,60.500333,6,"d,z,p,q,j,a",10.333333
5120,512,True,False,True,False,True,min,59.871833,6,"p,j,a,q,d,z",6.666667
5120,512,True,False,True,False,True,none,60.0165,6,"z,d,q,a,j,p",7.166667
5120,512,True,False,True,False,True,post,59.878,6,"p,z,j,d,q,a",6.666667
5120,512,True,False,True,False,True,pre,60.295333,6,"q,a,p,z,d,j",9.166667
5120,512,True,False,True,True,True,avg,60.697167,6,"j,q,d,a,z,p",10.166667


In [13]:
df2.to_csv(os.path.join(os.getenv("DATA_DIR"), "mica_text_coref/movie_coref/results/coreference/cross_val_6.csv"),
          index=False)

Create cross validation dataframe for other less important hyperparameters

In [14]:
results_dir = os.path.join(os.getenv("DATA_DIR"),
                           "mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Jan02")
dirs_with_no_result_file = []

for dir_ in os.listdir(results_dir):
    result_file = os.path.join(results_dir, dir_, "result.yaml")
    if not os.path.exists(result_file):
        dir__ = os.path.join(results_dir, dir_)
        print(dir__)
        dirs_with_no_result_file.append(dir__)
print()
print(f"{len(dirs_with_no_result_file)} directories have no result files")

for dir_ in dirs_with_no_result_file:
    log_file = os.path.join(dir_, "train.log")
    with open(log_file) as fr:
        content = fr.read()
    if not content.rstrip().endswith("Testing"):
        print(dir_)
        # print(content)
        print("")

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Jan02/Jan04_01:41:34AM_zootopia_excerpts_18761
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Jan02/Jan04_01:15:55AM_zootopia_excerpts_22109
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Jan02/Jan04_01:21:53AM_quiet_place_excerpts_18820
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Jan02/Jan03_06:22:00AM_prestige_excerpts_29431
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Jan02/Jan04_01:40:37AM_zootopia_excerpts_11380
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Jan02/Jan04_01:30:17AM_zootopia_excerpts_19022

6 directories have no result files
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Jan02/Jan04_01:41:34AM_zootopia_excerpts_18761

/proj/sbaruah/data/mica_text_

In [15]:
results_dir = os.path.join(os.getenv("DATA_DIR"),
                           "mica_text_coref/movie_coref/results/coreference/cross_val_excerpts_Jan02")
epoch_pattern = re.compile(r"Epoch = (\d+)\n"
                           r"dev:: loss=[0-9\.]+, metric:Word=[0-9\.]+, Span=([0-9\.]+), Character=[0-9\.]+")
header = ["dev_document_len", "dev_overlap_len", "genre", "bce_weight", "movie", "dev_lea", "best_epoch"]
rows = []

def search_regex(pattern, content):
    match = re.search(pattern, content)
    assert match is not None
    return match.group(1)

for dir_ in os.listdir(results_dir):
    result_file = os.path.join(results_dir, dir_, "result.yaml")
    log_file = os.path.join(results_dir, dir_, "train.log")
    with open(log_file) as fr:
        content = fr.read()
    if os.path.exists(result_file):
        with open(result_file) as f:
            result = yaml.load(f, Loader=yaml.FullLoader)
        dev_document_len = result["dev_document_len"]
        dev_overlap_len = result["dev_overlap_len"]
        genre = result["genre"]
        bce_weight = result["bce_weight"]
        movie = result["test_movie"]
        dev_lea = result["dev_metric"]["span"]["lea"]["f1"]
        epoch = result["best_epoch"]
    else:
        if re.search(r"\nTesting", content) is not None:
            dev_document_len = int(search_regex(r"dev_document_len\s+= (\d+)", content))
            dev_overlap_len = int(search_regex(r"dev_overlap_len\s+= (\d+)", content))
            genre = search_regex(r"genre\s+= (\w+)", content)
            bce_weight = float(search_regex(r"bce_weight\s+= ([0-9\.]+)", content))
            movie = search_regex(r"test_movie\s+= (\w+)", content)
            dev_scores, epochs = [], []
            for match in re.finditer(epoch_pattern, content):
                dev_scores.append(float(match.group(2)))
                epochs.append(int(match.group(1)))
            i = np.argmax(dev_scores)
            dev_lea = dev_scores[i]
            epoch = epochs[i]
        else:
            continue
    rows.append([dev_document_len, dev_overlap_len, genre, bce_weight, movie, dev_lea, epoch])

df = pd.DataFrame(rows, columns=header)
print(df.shape)
print()

for col, dtype in zip(df.columns, df.dtypes):
    print(f"{col} ({dtype})")

(328, 7)

dev_document_len (int64)
dev_overlap_len (int64)
genre (object)
bce_weight (float64)
movie (object)
dev_lea (float64)
best_epoch (int64)


In [16]:
macro_df = (df.groupby(["dev_document_len", "dev_overlap_len", "genre", "bce_weight"])
              .agg({"dev_lea": "mean", "best_epoch": "mean", "movie": "count"}))
macro_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,dev_lea,best_epoch,movie
dev_document_len,dev_overlap_len,genre,bce_weight,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5120,512,bc,0.5,60.925333,6.166667,6
5120,512,bn,0.5,62.362167,11.333333,6
5120,512,mz,0.5,60.3535,6.5,6
5120,512,nw,0.5,61.142333,9.833333,6
5120,512,pt,0.5,60.9035,7.666667,6
5120,512,tc,0.5,60.374667,7.666667,6
5120,512,wb,0.0,59.612167,6.166667,6
5120,512,wb,0.25,58.745,8.0,6
5120,512,wb,0.5,60.682,10.166667,12
5120,512,wb,0.75,61.3325,10.0,6


In [17]:
macro_df.loc[(5120, 512, slice(None), slice(None))]

Unnamed: 0_level_0,Unnamed: 1_level_0,dev_lea,best_epoch,movie
genre,bce_weight,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bc,0.5,60.925333,6.166667,6
bn,0.5,62.362167,11.333333,6
mz,0.5,60.3535,6.5,6
nw,0.5,61.142333,9.833333,6
pt,0.5,60.9035,7.666667,6
tc,0.5,60.374667,7.666667,6
wb,0.0,59.612167,6.166667,6
wb,0.25,58.745,8.0,6
wb,0.5,60.682,10.166667,12
wb,0.75,61.3325,10.0,6


Final cross validation dataframe

In [2]:
results_dir = os.path.join(os.getenv("DATA_DIR"),
                           "mica_text_coref/movie_coref/results/coreference/final_Jan12")
dirs_with_no_result_file = []

for dir_ in os.listdir(results_dir):
    result_file = os.path.join(results_dir, dir_, "result.yaml")
    if not os.path.exists(result_file):
        dir__ = os.path.join(results_dir, dir_)
        print(dir__)
        dirs_with_no_result_file.append(dir__)
print()
print(f"{len(dirs_with_no_result_file)} directories have no result files")

for dir_ in dirs_with_no_result_file:
    log_file = os.path.join(dir_, "train.log")
    with open(log_file) as fr:
        content = fr.read()
    if not content.rstrip().endswith("Testing"):
        print(dir_)
        # print(content)
        print("")

/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/final_Jan12/Jan12_10:00:10PM_john_wick_excerpts_20363
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/final_Jan12/Jan12_01:30:02PM_avengers_endgame_excerpts_hi_13274
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/final_Jan12/Jan13_12:28:18AM_prestige_excerpts_23496
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/final_Jan12/Jan12_03:41:47PM_avengers_endgame_excerpts_12809
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/final_Jan12/Jan12_08:33:57PM_prestige_excerpts_28606
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/final_Jan12/Jan12_01:38:17PM_avengers_endgame_excerpts_hi_23831
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/final_Jan12/Jan13_06:05:22AM_zootopia_excerpts_30139
/proj/sbaruah/data/mica_text_coref/movie_coref/results/coreference/final_Jan12/Jan12_01:41:22PM_avengers_endgame_excerpts_hi_2398

In [12]:
results_dir = os.path.join(os.getenv("DATA_DIR"),
                           "mica_text_coref/movie_coref/results/coreference/final_Jan12")
header = (["document_len", "overlap_len", "hierarchical", "repk", "movie"]
          + ["muc_p", "muc_r", "muc_f1"]
          + ["bcub_p", "bcub_r", "bcub_f1"]
          + ["ceafe_p", "ceafe_r", "ceafe_f1"]
          + ["conll_f1"]
          + ["lea_p", "lea_r", "lea_f1"]
          + ["max_gpu", "best_epoch"])
small_header = ["document_len", "overlap_len", "hierarchical", "repk", "movie"]
rows = []

for dir_ in os.listdir(results_dir):
    result_file = os.path.join(results_dir, dir_, "result.yaml")
    if os.path.exists(result_file):
        with open(result_file) as f:
            result = yaml.load(f, Loader=yaml.FullLoader)
        document_len = result["dev_document_len"]
        overlap_len = result["dev_overlap_len"]
        hierarchical = result["hierarchical"]
        repk = result["repk"]
        movie = result["test_movie"]

        muc_p = result["dev_metric"]["span"]["muc"]["precision"]
        muc_r = result["dev_metric"]["span"]["muc"]["recall"]
        muc_f1 = result["dev_metric"]["span"]["muc"]["f1"]

        bcub_p = result["dev_metric"]["span"]["bcub"]["precision"]
        bcub_r = result["dev_metric"]["span"]["bcub"]["recall"]
        bcub_f1 = result["dev_metric"]["span"]["bcub"]["f1"]

        ceafe_p = result["dev_metric"]["span"]["ceafe"]["precision"]
        ceafe_r = result["dev_metric"]["span"]["ceafe"]["recall"]
        ceafe_f1 = result["dev_metric"]["span"]["ceafe"]["f1"]

        lea_p = result["dev_metric"]["span"]["lea"]["precision"]
        lea_r = result["dev_metric"]["span"]["lea"]["recall"]
        lea_f1 = result["dev_metric"]["span"]["lea"]["f1"]

        conll_f1 = (muc_f1 + bcub_f1 + ceafe_f1)/3

        max_gpu = max(result["dev_gpu_memory"])
        best_epoch = result["best_epoch"]

        rows.append([document_len, overlap_len, hierarchical, repk, movie,
                     muc_p, muc_r, muc_f1,
                     bcub_p, bcub_r, bcub_f1,
                     ceafe_p, ceafe_r, ceafe_f1,
                     conll_f1,
                     lea_p, lea_r, lea_f1,
                     max_gpu, best_epoch])

df = pd.DataFrame(rows, columns=header)
print(df.shape)
print()

for col, dtype in zip(df.columns, df.dtypes):
    print(f"{col} ({dtype})")

(322, 20)

document_len (int64)
overlap_len (int64)
hierarchical (bool)
repk (int64)
movie (object)
muc_p (float64)
muc_r (float64)
muc_f1 (float64)
bcub_p (float64)
bcub_r (float64)
bcub_f1 (float64)
ceafe_p (float64)
ceafe_r (float64)
ceafe_f1 (float64)
conll_f1 (float64)
lea_p (float64)
lea_r (float64)
lea_f1 (float64)
max_gpu (float64)
best_epoch (int64)


In [19]:
macro_df = (df.groupby(["document_len", "overlap_len", "hierarchical", "repk"])
              .agg({"movie": "count", "max_gpu": ["max", "mean"], "best_epoch": "mean",
                    "muc_p": "mean", "muc_r": "mean", "muc_f1": "mean",
                    "bcub_p": "mean", "bcub_r": "mean", "bcub_f1": "mean",
                    "ceafe_p": "mean", "ceafe_r": "mean", "ceafe_f1": "mean",
                    "lea_p": "mean", "lea_r": "mean", "lea_f1": "mean",
                    "conll_f1": "mean"}))

In [20]:
_df = macro_df[macro_df["movie"]["count"] == 6]
display(_df.loc[(slice(None), slice(None), False, 3)])
display(_df.loc[(slice(None), 512, True, slice(None))])

Unnamed: 0_level_0,Unnamed: 1_level_0,movie,max_gpu,max_gpu,best_epoch,muc_p,muc_r,muc_f1,bcub_p,bcub_r,bcub_f1,ceafe_p,ceafe_r,ceafe_f1,lea_p,lea_r,lea_f1,conll_f1
Unnamed: 0_level_1,Unnamed: 1_level_1,count,max,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
document_len,overlap_len,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
2048,128,6,7.63,7.59,9.833333,93.267167,90.9305,92.0735,79.905333,31.604667,44.825167,18.278333,54.448167,26.0065,79.440333,31.1405,44.273333,54.301722
2048,256,6,7.63,7.593333,7.5,93.4275,90.908167,92.1485,76.921833,39.867333,51.9775,19.392,55.595833,27.2455,76.491667,39.4075,51.485667,57.123833
2048,512,6,7.63,7.595,7.5,93.415333,91.279833,92.325333,77.815167,42.004833,53.605167,20.586,58.853333,29.153167,77.385333,41.554667,53.125167,58.361222
3072,256,6,7.78,7.718333,8.666667,93.155167,91.1915,92.1545,78.482833,40.899167,52.94,21.425333,57.254167,29.903667,78.042,40.436,52.431167,58.332722
3072,512,6,7.78,7.721667,10.666667,93.487667,92.148833,92.805833,78.898167,48.965833,59.639167,24.382333,59.978333,33.5445,78.519167,48.511167,59.174333,61.9965
3072,1024,6,7.79,7.731667,9.0,93.601167,92.033833,92.798667,80.2245,54.154,64.212833,24.301167,58.728667,33.416333,79.863,53.729833,63.786333,63.475944
4096,128,6,7.91,7.826667,7.0,93.252667,91.7785,92.496,79.864333,42.5575,54.001333,22.697833,57.777333,31.09,79.44,42.109333,53.515833,59.195778
4096,256,6,7.92,7.828333,6.5,93.114167,92.109333,92.595167,73.9415,50.172,59.060667,24.540833,58.425833,33.328667,73.5545,49.7545,58.633333,61.6615
4096,512,6,7.92,7.828333,5.166667,93.285833,92.0615,92.654667,76.856667,47.992167,58.800333,23.316167,57.857833,32.159667,76.4975,47.559667,58.3675,61.204889
4096,1024,6,7.92,7.833333,8.333333,93.857,92.084167,92.952833,82.209833,60.058667,69.128333,27.8625,61.619,37.199,81.881167,59.667833,68.746,66.426722


Unnamed: 0_level_0,Unnamed: 1_level_0,movie,max_gpu,max_gpu,best_epoch,muc_p,muc_r,muc_f1,bcub_p,bcub_r,bcub_f1,ceafe_p,ceafe_r,ceafe_f1,lea_p,lea_r,lea_f1,conll_f1
Unnamed: 0_level_1,Unnamed: 1_level_1,count,max,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
document_len,repk,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
2048,1,6,11.95,9.153333,5.333333,94.414667,91.580333,92.950333,54.696,77.691667,61.937,31.852,49.1445,36.965,54.451167,77.413,61.661167,63.950778
2048,2,6,24.64,14.008333,5.833333,94.394333,91.6115,92.9545,64.284333,75.9095,67.7545,32.0325,54.355833,38.959167,64.019167,75.617333,67.460667,66.556056
3072,1,6,11.15,8.79,7.833333,94.358333,91.765833,93.0145,58.582333,76.318667,63.894833,33.079833,50.093167,37.9065,58.338,76.056667,63.629833,64.938611
3072,2,6,21.54,12.646667,7.333333,94.3155,91.7875,93.006833,70.348167,76.77,72.340167,34.058167,56.912333,39.602667,70.096167,76.492833,72.067167,68.316556
3072,3,6,38.47,18.846667,7.333333,94.282833,91.677,92.9335,72.539167,76.186167,73.786167,33.973833,58.772667,40.092333,72.263833,75.899,73.500833,68.937333
4096,1,6,10.78,8.623333,9.333333,94.302333,92.275833,93.258333,66.676167,76.703667,70.059833,40.594833,54.341833,45.435833,66.431333,76.435,69.799,69.584667
4096,2,6,20.13,11.995,8.5,94.361333,92.2005,93.246,69.634833,79.271,73.217833,37.102833,58.0655,43.716833,69.3815,79.011333,72.957833,70.060222
4096,3,6,35.35,17.421667,7.5,94.269833,92.080167,93.138333,73.111167,78.084,75.170667,36.715833,58.214833,43.2925,72.834667,77.819333,74.897667,70.533833
5120,1,6,10.2,8.465,6.5,94.276333,91.954333,93.080333,71.304667,77.501167,73.184333,34.75,59.0565,41.914167,71.071833,77.224833,72.927833,69.392944
5120,2,6,17.93,11.253333,6.166667,94.249333,91.7485,92.959333,70.052833,77.0875,72.5295,35.060833,57.9055,41.5535,69.7805,76.818667,72.255833,69.014111
