# From csv to conll / jsonlines

For the eval.ipynb, we need to get individual conll files for evaluation
1. Run `Prepare` and `Step 1`

----

For training the fast-coref model, we need to generate aggregated .conll and .jsonlines files

The manually labeled training data folders are:
- round4_100_1
- round4_200_1x2
- round4_300_123
- round4_400_1234
- round4_500_1234r3

Remove docs that have 0 coref:
1. Set `base_output_dir_name=gold_no0coref_all`, set `keep_0_coref_docs=False`; 
2. Set `source_csv_dir_name` to be the above 5 folder names
3. Run `Prepare` and `Step 1 & 2 & 3`

Keep docs that have 0 coref:
1. Set `base_output_dir_name=gold_keep0coref_all`, set `keep_0_coref_docs=True`; 
2. Set `source_csv_dir_name` to be the above 5 folder names and repeat the script 5 times
3. Run `Prepare` and `Step 1 & 2 & 3`

Make `keep0coref` and `no0coref` datasets identical (organize the data in the same order):
1. Run `Step 4`

Create `unsplit` dataset for the experiment regarding number of validation:
1. Run `Step 5`

In [1]:
# Name of the sub-folder (under `config.output.base_dir`) into which the Step 3 "move" cell
# places the generated `conll` and `longformer` folders.
base_output_dir_name = "gold_keep0coref_new_all"
# If False, then the 0 coref docs will not be included in the train/dev set
keep_0_coref_docs = True

## Prepare

In [2]:
import sys
sys.path.append("../../src")
sys.path.append("../../../../git_clone_repos/fast-coref/src/")

import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import json
import logging

from IPython.display import display, HTML
# display(HTML(df.to_html()))

from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import Event
from common_utils.data_loader_utils import load_mimic_cxr_bySection
from common_utils.coref_utils import resolve_mention_and_group_num
from common_utils.file_checker import FileChecker
from common_utils.common_utils import check_and_create_dirs, check_and_remove_dirs
from data_preprocessing import mimic_cxr_csv2conll, mimic_cxr_conll2jsonlines

# Set before any project helper runs.
# NOTE(review): presumably silences the HuggingFace tokenizers parallelism/fork warning - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Module-level objects; none of them is referenced by the cells visible in this notebook.
# NOTE(review): possibly leftovers from a multiprocessing variant - confirm before removing.
FILE_CHECKER = FileChecker()
START_EVENT = Event()
logger = logging.getLogger()  # called without a name, so this is the root logger

In [3]:
from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose the Hydra configuration used by every step below. `config` is pre-set to None
# so the name exists even if the composition fails.
config = None
config_overrides = ["+coreference_resolution/data_preprocessing@_global_=mimic_cxr"]
with initialize(version_base=None, config_path="../config", job_name="coreference_resolution"):
    config = compose(config_name="coreference_resolution", overrides=config_overrides)

## Step 1: Generate individual conll files

In [4]:
# Folder name of the manually labelled csv set to convert. The same name is reused for the
# individual-conll output folder below and for the final output folder in Step 3.
source_csv_dir_name = "round4_500_1234r3_new"
# Source input dir (csv)
config.input_gt.base_dir = "../../output/mimic_cxr/manual_training_set/" + source_csv_dir_name
# Target output dir (conll)
config.temp_gt.base_dir = "../../output/mimic_cxr/coref/individual_conll_ground_truth/" + source_csv_dir_name
config.temp_gt.force_run = False # Force to delete and recreate

# Alternative configuration for the manual test set (disabled; kept for reference).
# source_csv_dir_name = "round1x2_new"
# # Source input dir (csv)
# config.input_gt.base_dir = "../../output/mimic_cxr/manual_test_set/" + source_csv_dir_name
# # Target output dir (conll)
# config.temp_gt.base_dir = "../../output/mimic_cxr/coref/individual_conll_ground_truth/" + source_csv_dir_name
# config.temp_gt.force_run = False # Force to delete and recreate

In [5]:
# When `force_run` is set, previously generated individual conll files are expected to be
# removed here (behaviour of the project helper - see its `force_run` flag).
check_and_remove_dirs(config.temp_gt.base_dir, config.temp_gt.force_run)

if not os.path.exists(config.temp_gt.base_dir):
    # Nothing to reuse: convert the csv files and append the returned statistics,
    # prefixed by the target folder, to the run-statistic file.
    log_out = mimic_cxr_csv2conll.prepare_conll(config, config.input_gt, config.temp_gt, keep_0_coref_docs)
    with open(config.output.run_statistic, "a", encoding="UTF-8") as f:
        print(f"Source: {config.temp_gt.base_dir} ", file=f)
        print(json.dumps(log_out, indent=2), file=f)
else:
    print("Individual conll files found and will be reused.")

100%|██████████| 250/250 [00:00<00:00, 5679.43it/s]
100%|██████████| 250/250 [00:09<00:00, 26.60it/s]
100%|██████████| 250/250 [00:00<00:00, 8557.78it/s]
100%|██████████| 250/250 [00:08<00:00, 28.38it/s]


## Step 2: Generate aggregated conll files

In [6]:
# Optional override of the test-set locations (disabled).
# NOTE(review): with these lines commented out, `test_gt` presumably uses the paths from
# the Hydra config - confirm.
# config.data_split.test_gt.target_doc_dir = "../../output/mimic_cxr/manual_test_set/round1x2_new"
# config.data_split.test_gt.source_dir = "../../output/mimic_cxr/coref/individual_conll_ground_truth/round1x2_new"

# Splits to aggregate in this run (assumption from the option name - confirm against `aggregrate_conll`).
config.data_split.activate = ["train_manual", "test_gt"]

# The docs in `target_doc_dir` dir is the docs we want to get from the `source_dir`
config.data_split.train_manual.target_doc_dir = config.input_gt.base_dir # csv file path
config.data_split.train_manual.source_dir = config.temp_gt.base_dir # Individual conll file path

# Older test-set locations without the `_new` suffix (disabled; kept for reference).
# config.data_split.test_gt.target_doc_dir = "../../output/mimic_cxr/manual_test_set/round1x2" 
# config.data_split.test_gt.source_dir = "../../output/mimic_cxr/coref/individual_conll_ground_truth/round1x2"

In [7]:
# Aggregate the individual conll files, then append one JSON record per returned entry
# (its key is stored as "output_folder") to the log file.
log_out = mimic_cxr_csv2conll.aggregrate_conll(config, keep_0_coref_docs)
with open(config.output.log_file, "a", encoding="UTF-8") as f:
    for output_folder, split_details in log_out.items():
        record = {"output_folder": output_folder, "details": split_details}
        print(json.dumps(record, indent=2), file=f)

## Step 3: Generate jsonlines files

In [8]:
# Source data set for each jsonlines split: `train` and `dev` are both taken from
# `train_manual`, `test` from `test_gt`.
# NOTE(review): how `train_manual` is divided between train and dev is decided inside
# `mimic_cxr_conll2jsonlines` and is not visible here.
config.longformer.source = [{'train': 'train_manual'}, {'dev': 'train_manual'}, {'test': 'test_gt'}]

In [9]:
# Run the conll -> jsonlines conversion; the returned messages are displayed in the next cell.
log_msg = mimic_cxr_conll2jsonlines.invoke(config)

Model: longformer, Segment length: 4096


In [10]:
# Show the messages returned by the conversion (in the recorded run: one per written jsonlines file).
log_msg

['Wrote 400 documents to /home/yuxiangliao/PhD/workspace/VSCode_workspace/sr_coref/output/mimic_cxr/coref/longformer/train.4096.jsonlines',
 'Wrote 100 documents to /home/yuxiangliao/PhD/workspace/VSCode_workspace/sr_coref/output/mimic_cxr/coref/longformer/dev.4096.jsonlines',
 'Wrote 200 documents to /home/yuxiangliao/PhD/workspace/VSCode_workspace/sr_coref/output/mimic_cxr/coref/longformer/test.4096.jsonlines']

In [11]:
import shutil

# Move the freshly generated `conll` and `longformer` folders from the shared output root
# into `<config.output.base_dir>/<base_output_dir_name>/<source_csv_dir_name>/`.
run_output_dir = os.path.join(config.output.base_dir, base_output_dir_name, source_csv_dir_name)
moved_dir_names = ("conll", "longformer")

# Validate both moves before touching the file system, so that a failure cannot leave the
# output half-moved.
for dir_name in moved_dir_names:
    src = os.path.join(config.output.base_dir, dir_name)
    dst = os.path.join(run_output_dir, dir_name)
    if not os.path.isdir(src):
        raise FileNotFoundError(f"Nothing to move: {src} does not exist. Run Step 2 and Step 3 first.")
    if os.path.exists(dst):
        # shutil.move() would otherwise place `src` *inside* the existing folder
        # (e.g. .../conll/conll) instead of replacing it, silently corrupting the layout.
        raise FileExistsError(f"{dst} already exists. Remove it or change `base_output_dir_name` / `source_csv_dir_name`.")

os.makedirs(run_output_dir, exist_ok=True)
for dir_name in moved_dir_names:
    shutil.move(os.path.join(config.output.base_dir, dir_name), os.path.join(run_output_dir, dir_name))

print("Output to: ", run_output_dir)

Output to:  /home/yuxiangliao/PhD/workspace/VSCode_workspace/sr_coref/output/mimic_cxr/coref/gold_keep0coref_new_all/round4_500_1234r3_new


## Step 4: Align keep0coref data to no0coref data

In [14]:
import sys
sys.path.append("../../src")
sys.path.append("../../../../git_clone_repos/fast-coref/src/")

import shutil, os
from data_preprocessing import mimic_cxr_csv2conll

# Datasets WITHOUT the 0-coref docs: their train and dev doc-id lists are read and kept
# separately (per split) in the next cell.
remove0coref_base_dir = "../../output/mimic_cxr/coref/gold_no0coref_all"
remove0coref_dirs = [
    # os.path.join(remove0coref_base_dir,"mimic_manual_100"),
    # os.path.join(remove0coref_base_dir,"mimic_manual_200"),
    # os.path.join(remove0coref_base_dir,"mimic_manual_300"),
    # os.path.join(remove0coref_base_dir,"mimic_manual_400"),
    os.path.join(remove0coref_base_dir,"mimic_manual_500")
]

# Datasets WITH the 0-coref docs: their train and dev doc-id lists are read and merged
# into a single list in the next cell.
keep0coref_base_dir = "../../output/mimic_cxr/coref/gold_keep0coref_new_all"
keep0coref_dirs = [
    # os.path.join(keep0coref_base_dir,"round4_100_1"),
    # os.path.join(keep0coref_base_dir,"round4_200_1x2"),
    # os.path.join(keep0coref_base_dir,"round4_300_123"),
    # os.path.join(keep0coref_base_dir,"round4_400_1234"),
    os.path.join(keep0coref_base_dir,"round4_500_1234r3_new"),
]

# Individual conll files (one per doc) used to rebuild the aligned `<split>.conll` files.
# The three lists are zipped position by position, so they must stay in the same order.
# NOTE(review): this entry is "round4_500_1234r3" while the keep0coref entry above (and the
# Step 1 output folder) uses "round4_500_1234r3_new" - confirm this is intended.
individual_conll_gt_base_dir = "../../output/mimic_cxr/coref/individual_conll_ground_truth"
individual_conll_gt_dirs = [
    # os.path.join(individual_conll_gt_base_dir,"round4_100_1"),
    # os.path.join(individual_conll_gt_base_dir,"round4_200_1x2"),
    # os.path.join(individual_conll_gt_base_dir,"round4_300_123"),
    # os.path.join(individual_conll_gt_base_dir,"round4_400_1234"),
    os.path.join(individual_conll_gt_base_dir,"round4_500_1234r3"),
]

# Plan: copy the no0coref data as the keep0coref data, then add the extra 0-coref docs.
# (Original note said "append to the end"; the next cell inserts them at random positions.)
# mimic_cxr_csv2conll.copy_and_paste_conll(input_conll_file, output_conll_file)


In [15]:
import json
import os
import random
import shutil

# Imported here as well so that Step 4 can be run without the `Prepare` cell
# (the previous cell already extends `sys.path`).
from common_utils.common_utils import check_and_create_dirs
from data_preprocessing import mimic_cxr_csv2conll

# NOTE(review): Step 5 reads from ".../gold_keep0coref_new_all_aligned/round4_500_1234r3_new",
# which is not the folder written here - confirm whether the output is renamed manually.
aligned_output_base_dir = "../../output/mimic_cxr/coref/gold_keep0coref_all_aligned"

# Fixed seed: the positions at which the 0-coref docs are inserted are random, and without
# a seed every re-run would produce a differently ordered "aligned" dataset.
ALIGN_RANDOM_SEED = 42
rng = random.Random(ALIGN_RANDOM_SEED)


def _read_jsonlines(dataset_dir, split):
    """Return the non-blank raw lines of `<dataset_dir>/longformer/<split>.4096.jsonlines`."""
    jsonlines_path = os.path.join(dataset_dir, "longformer", split + ".4096.jsonlines")
    with open(jsonlines_path, "r", encoding="utf-8") as f:
        return [line for line in f if line.strip()]


# zip() would silently drop the extra entries if the lists got out of sync.
if not (len(remove0coref_dirs) == len(keep0coref_dirs) == len(individual_conll_gt_dirs)):
    raise ValueError("remove0coref_dirs, keep0coref_dirs and individual_conll_gt_dirs must have the same length.")

for remove0coref_dir, keep0coref_dir, individual_conll_gt_dir in zip(remove0coref_dirs, keep0coref_dirs, individual_conll_gt_dirs):
    aligned_output_dir = os.path.join(aligned_output_base_dir, os.path.basename(remove0coref_dir))

    no_0_coref_doc_list_dict = {name: _read_jsonlines(remove0coref_dir, name) for name in ("train", "dev")}
    has_0_coref_doc_list_dict = {name: _read_jsonlines(keep0coref_dir, name) for name in ("train", "dev")}

    # Find the docs that are not included in the no0coref dataset (i.e. the 0-coref docs).
    keep0coref_lines = [*has_0_coref_doc_list_dict["train"], *has_0_coref_doc_list_dict["dev"]]
    keep0coref_doc_ids = [json.loads(line)["doc_key"] for line in keep0coref_lines]
    # doc_key -> raw jsonlines line; setdefault keeps the first occurrence, like list.index() did.
    doc_line_by_id = {}
    for doc_id, line in zip(keep0coref_doc_ids, keep0coref_lines):
        doc_line_by_id.setdefault(doc_id, line)

    unused_doc_id_list = keep0coref_doc_ids.copy()
    for no_0coref_list in no_0_coref_doc_list_dict.values():
        for line in no_0coref_list:
            doc_id = json.loads(line)["doc_key"]
            if doc_id in unused_doc_id_list:
                unused_doc_id_list.remove(doc_id)

    for split, no_0coref_list in no_0_coref_doc_list_dict.items():
        # Each aligned split gets the same size as the corresponding keep0coref split.
        needed_doc_num = len(has_0_coref_doc_list_dict[split]) - len(no_0coref_list)
        if needed_doc_num < 0 or needed_doc_num > len(unused_doc_id_list):
            raise ValueError(
                f"Cannot align split '{split}' of {remove0coref_dir}: {needed_doc_num} extra docs needed, "
                f"{len(unused_doc_id_list)} unused docs available."
            )

        # Start from the no0coref order and insert the missing docs at random positions
        # (randint is inclusive on both ends, so inserting at the very end is possible).
        output_lines = no_0coref_list.copy()
        for _ in range(needed_doc_num):
            doc_id = unused_doc_id_list.pop(0)
            output_lines.insert(rng.randint(0, len(output_lines)), doc_line_by_id[doc_id])

        # Generate /longformer/<split>.4096.jsonlines. The file is written WITHOUT a trailing
        # newline; Step 5 adds the separator itself when concatenating train and dev.
        longformer_dir = os.path.join(aligned_output_dir, "longformer")
        check_and_create_dirs(longformer_dir)
        with open(os.path.join(longformer_dir, split + ".4096.jsonlines"), "w", encoding="utf-8") as f:
            f.write("\n".join(line.strip() for line in output_lines))

        # Generate /conll/<split>.conll from the individual conll files, in the same order.
        conll_dir = os.path.join(aligned_output_dir, "conll")
        check_and_create_dirs(conll_dir)
        output_conll_file = os.path.join(conll_dir, split + ".conll")
        # The helper is called once per doc with the same target, so it presumably appends;
        # drop a stale file first so that re-running this cell does not duplicate docs.
        if os.path.exists(output_conll_file):
            os.remove(output_conll_file)
        for line in output_lines:
            # doc_key is expected to look like "<doc_id>_<section_name>_<suffix>".
            doc_id, section_name, _ = json.loads(line)["doc_key"].split("_")
            input_conll_file = os.path.join(individual_conll_gt_dir, section_name, doc_id + ".conll")
            mimic_cxr_csv2conll.copy_and_paste_conll(input_conll_file, output_conll_file)

    # The test set is not re-aligned: copy its jsonlines and conll files from the keep0coref dataset.
    shutil.copy(os.path.join(keep0coref_dir, "longformer", "test.4096.jsonlines"),
                os.path.join(aligned_output_dir, "longformer", "test.4096.jsonlines"))
    shutil.copy(os.path.join(keep0coref_dir, "conll", "test.conll"),
                os.path.join(aligned_output_dir, "conll", "test.conll"))
    

## Step 5: Flexible train and dev dataset

For round4_500_1234r3_unsplit, we aggregate the train and dev data into the same .jsonlines file.
So, we can experiment with different numbers of train/dev documents for model training.

The .jsonlines file should be named `train_dev.4096.jsonlines`.

Notice that the conll files do not change.

In [18]:
import os
import shutil
import sys

sys.path.append("../../src")
from common_utils.common_utils import check_and_remove_dirs

# NOTE(review): Step 4 writes its output to ".../gold_keep0coref_all_aligned/<name of the
# no0coref folder>", which differs from this source path - confirm the folder is renamed manually.
source_input_dir = os.path.abspath("../../output/mimic_cxr/coref/gold_keep0coref_new_all_aligned/round4_500_1234r3_new")
output_dir = os.path.abspath("../../output/mimic_cxr/coref/gold_keep0coref_new_all_aligned/round4_500_1234r3_new_unsplit")

# Start from a fresh copy of the aligned dataset (the second argument presumably forces
# removal of an existing output folder - copytree requires that the target does not exist).
check_and_remove_dirs(output_dir, True)
shutil.copytree(source_input_dir, output_dir)

longformer_dir = os.path.join(output_dir, "longformer")
train_dev_file = os.path.join(longformer_dir, "train_dev.4096.jsonlines")
dev_file = os.path.join(longformer_dir, "dev.4096.jsonlines")

# The train file becomes the combined train_dev file ...
os.rename(os.path.join(longformer_dir, "train.4096.jsonlines"), train_dev_file)

# ... and the dev docs are appended to it.
with open(dev_file, "r", encoding="utf-8") as f_in:
    dev_lines = f_in.readlines()
with open(train_dev_file, "r", encoding="utf-8") as f_in:
    train_text = f_in.read()

with open(train_dev_file, "a", encoding="utf-8") as f_out:
    # Only add a separator when the train part does not already end with a newline;
    # an unconditional "\n" would create a blank (invalid) jsonlines record when the
    # train file is empty or already newline-terminated.
    if train_text and not train_text.endswith("\n"):
        f_out.write("\n")
    f_out.writelines(dev_lines)

# The dev docs now live in train_dev; the conll files are left untouched.
os.remove(dev_file)