# From csv to conll / jsonlines

For `eval.ipynb`, we need individual conll files for evaluation:
1. Run `Prepare` and `Step 1`

For training the fast-coref model:
1. Run `Prepare`
2. Run `Step 1 & 2 & 3`

## Prepare

In [1]:
import sys
sys.path.append("../../src")
sys.path.append("../../../../git_clone_repos/fast-coref/src/")

import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import json
import logging

from IPython.display import display, HTML
# display(HTML(df.to_html()))

from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import Event
from common_utils.data_loader_utils import load_mimic_cxr_bySection
from common_utils.coref_utils import resolve_mention_and_group_num
from common_utils.file_checker import FileChecker
from common_utils.common_utils import check_and_create_dirs, check_and_remove_dirs
from data_preprocessing import mimic_cxr_csv2conll, mimic_cxr_conll2jsonlines

# Set TOKENIZERS_PARALLELISM to "false" for this kernel and any child processes.
# NOTE(review): presumably read by the HuggingFace tokenizers library to avoid
# fork-related warnings when worker processes are used — TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Notebook-wide objects created once at start-up.
# NOTE(review): FILE_CHECKER and START_EVENT are not referenced by any cell
# visible in this notebook; confirm whether they are still needed.
FILE_CHECKER = FileChecker()
START_EVENT = Event()
# getLogger() without a name returns the root logger.
logger = logging.getLogger()

In [2]:
from hydra import compose, initialize
from omegaconf import OmegaConf

# Build the Hydra configuration used by every step below.
# `config` is reset first so a failed compose never leaves a stale object behind.
config = None
with initialize(
    version_base=None,
    config_path="../config",
    job_name="coreference_resolution",
):
    config = compose(
        config_name="coreference_resolution",
        overrides=[
            # Add the mimic_cxr data_preprocessing group at the global package level.
            "+coreference_resolution/data_preprocessing@_global_=mimic_cxr",
        ],
    )

## Step 1: Generate individual conll files

In [3]:
# Source input dir (csv): passed to `prepare_conll` as `config.input_gt` in the next cell.
config.input_gt.base_dir = "../../output/mimic_cxr/manual_training_set/round4_500_1234r3"
# Target output dir (conll): where the individual conll files are written, or reused if present.
# NOTE: the trailing round name is repeated in both paths; keep them in sync when changing it.
config.temp_gt.base_dir = "../../output/mimic_cxr/coref/individual_conll_ground_truth/round4_500_1234r3"
config.temp_gt.force_run = False # Set to True to force deleting and recreating the target dir

In [4]:
# Hand the target dir and the force flag to the cleanup helper first
# (presumably it removes the dir only when `force_run` is True — TODO confirm).
check_and_remove_dirs(config.temp_gt.base_dir, config.temp_gt.force_run)

if not os.path.exists(config.temp_gt.base_dir):
    # No cached output: convert the csv input into individual conll files
    # and append the returned statistics to the run-statistic file.
    log_out = mimic_cxr_csv2conll.prepare_conll(config, config.input_gt, config.temp_gt)
    with open(config.output.run_statistic, "a", encoding="UTF-8") as stats_file:
        stats_file.write(f"Source: {config.temp_gt.base_dir} \n")
        stats_file.write(json.dumps(log_out, indent=2) + "\n")
else:
    # Target dir already exists: skip the conversion and reuse its content.
    print("Individual test conll files found and will be reused.")

Individual test conll files found and will be reused.


## Step 2: Generate aggregated conll files

In [5]:
# Data splits enabled for the aggregation step.
# NOTE(review): "test_gt" is activated, but its directories are not overridden in
# this cell (the two assignments at the bottom are commented out), so it uses
# whatever the loaded config already provides — confirm those defaults are intended.
config.data_split.activate = ["train_manual", "test_gt"]

# The docs in the `target_doc_dir` dir are the docs we want to get from the `source_dir`.
config.data_split.train_manual.target_doc_dir = config.input_gt.base_dir # csv file path
config.data_split.train_manual.source_dir = config.temp_gt.base_dir # Individual conll file path

# Optional overrides for the test split (currently disabled):
# config.data_split.test_gt.target_doc_dir = "../../output/mimic_cxr/manual_test_set/round1x2"
# config.data_split.test_gt.source_dir = "../../output/mimic_cxr/coref/individual_conll_ground_truth/round1x2"

In [6]:
# Aggregate the individual conll files; the result is iterated below as a
# mapping from split name to its details.
log_out = mimic_cxr_csv2conll.aggregrate_conll(config)

# Append one pretty-printed JSON record per split to the log file.
with open(config.output.log_file, "a", encoding="UTF-8") as log_file:
    for split_mode, details in log_out.items():
        record = {"output_folder": split_mode, "details": details}
        log_file.write(json.dumps(record, indent=2) + "\n")

## Step 3: Generate jsonlines files

In [7]:
# One single-entry dict per output file: the key names the jsonlines split and the
# value names the data split it is built from (presumably the aggregated conll
# splits from Step 2 — TODO confirm).
# NOTE(review): both 'train' and 'dev' point at 'train_manual'; how that split is
# divided between them is decided inside mimic_cxr_conll2jsonlines — confirm there.
config.longformer.source = [{'train': 'train_manual'}, {'dev': 'train_manual'}, {'test': 'test_gt'}]

In [8]:
# Run the conll -> jsonlines conversion with the settings in `config`.
# The returned messages are kept in `log_msg` so they can be inspected afterwards.
log_msg = mimic_cxr_conll2jsonlines.invoke(config)

Model: longformer, Segment length: 4096


In [9]:
# Show the messages returned by `mimic_cxr_conll2jsonlines.invoke`.
log_msg

['Wrote 371 documents to /home/yuxiangliao/PhD/workspace/VSCode_workspace/structured_reporting/output/mimic_cxr/coref/longformer/train.4096.jsonlines',
 'Wrote 94 documents to /home/yuxiangliao/PhD/workspace/VSCode_workspace/structured_reporting/output/mimic_cxr/coref/longformer/dev.4096.jsonlines',
 'Wrote 156 documents to /home/yuxiangliao/PhD/workspace/VSCode_workspace/structured_reporting/output/mimic_cxr/coref/longformer/test.4096.jsonlines']