# Generate fast-coref model output as csv format

## Prepare

We dont want to process all the documents, instead, we only need 1000/343/348 data for train/dev/test

In [39]:
import sys
sys.path.append("../../src")
sys.path.append("../../../../git_clone_repos/fast-coref/src")

import os
import ast
import json
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from collections import defaultdict
from IPython.display import display, HTML
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import Event
from common_utils.data_loader_utils import load_mimic_cxr_bySection
from common_utils.coref_utils import resolve_mention_and_group_num, shuffle_list, ConllToken, check_and_make_dir, get_data_split, get_file_name_prefix, get_porportion_and_name, remove_all, resolve_mention_and_group_num, shuffle_list
from common_utils.file_checker import FileChecker
from common_utils.common_utils import check_and_create_dirs, check_and_remove_dirs

os.environ["TOKENIZERS_PARALLELISM"] = "false"
FILE_CHECKER = FileChecker()
START_EVENT = Event()

SEED_NUM = 42

mpl.style.use("default")

SMALL_SIZE = 12
MEDIUM_SIZE = 14
BIGGER_SIZE = 16

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [40]:
from hydra import compose, initialize
from omegaconf import OmegaConf

config = None
with initialize(version_base=None, config_path="../config", job_name="nlp_ensemble"):
        config = compose(config_name="data_preprocessing", overrides=["+nlp_ensemble@_global_=mimic_cxr"])
        
section_name_cfg = config.name_style.mimic_cxr.section_name
output_section_cfg = config.output.section
input_path = config.input.path
data_size, pid_list, sid_list, section_list = load_mimic_cxr_bySection(input_path, output_section_cfg, section_name_cfg)

In [41]:
# Sort
s_list, f_list, i_list, pfi_list, fai_list = zip(*sorted(zip(sid_list, section_list[0][1], section_list[1][1], section_list[2][1], section_list[3][1])))
sid_list = s_list
section_list = [
        ("findings", f_list),
        ("impression", i_list),
        ("provisional_findings_impression", pfi_list),
        ("findings_and_impression",fai_list)
]

In [42]:
section_corefNum_docs_dict_file = "../../output/mimic_cxr/nlp_ensemble/fast_coref_joint_(stripped_input).statistic"

### Modify ###
scoref_dir = "../../output/mimic_cxr/nlp_ensemble/corenlp/scoref"
dcoref_dir = "../../output/mimic_cxr/nlp_ensemble/corenlp/dcoref"
fcoref_dir = "../../output/mimic_cxr/nlp_ensemble/fast_coref_joint_(stripped_input)"

def batch_processing(section_name, sid, spacy_input_path):
    START_EVENT.wait()
    # df_spacy = pd.read_csv(spacy_input_path, index_col=0, na_filter=False)
    # df_scoref = pd.read_csv(os.path.join(scoref_dir,section_name,sid+".csv"), index_col=0, na_filter=False)
    # df_dcoref = pd.read_csv(os.path.join(dcoref_dir,section_name,sid+".csv"), index_col=0, na_filter=False)
    df_fcoref = pd.read_csv(os.path.join(fcoref_dir,section_name,sid+".csv"), index_col=0, na_filter=False)

    # token_list = df_spacy.loc[:,"[sp]token"].to_list()
    # token_num = len(token_list)

    # _, scoref_group_num = resolve_mention_and_group_num(df_scoref, "[co][ml]coref_group_conll")
    # _, dcoref_group_num = resolve_mention_and_group_num(df_dcoref, "[co][rb]coref_group_conll")
    _, fcoref_group_num = resolve_mention_and_group_num(df_fcoref, "[fj]coref_group_conll")
    token_num,scoref_group_num,dcoref_group_num = 0,0,0

    return sid, token_num, scoref_group_num, dcoref_group_num, fcoref_group_num

section_corefNum_docs_dict = {}

if not os.path.exists(section_corefNum_docs_dict_file):
    section_doc_numData_dict:dict[str,dict[str,dict[str,int]]] = {}
    section_scatter_data_list = {}
    for section_entry in os.scandir("../../output/mimic_cxr/nlp_ensemble/spacy"):
        if section_entry.is_dir():
            print("Processing section:", section_entry.name)
            section_doc_numData_dict[section_entry.name]:dict[str,dict[str,int]] = {}

            tasks = []
            scatter_data_list:list[dict] = []
            with ProcessPoolExecutor(max_workers=14) as executor:
                for report_entry in tqdm(os.scandir(section_entry.path)):
                    if FILE_CHECKER.ignore(os.path.abspath(report_entry.path)):
                        continue
                    sid = report_entry.name.rstrip(".csv")
                    tasks.append(executor.submit(batch_processing,section_entry.name, sid, report_entry.path))

                START_EVENT.set()

                # Receive results from multiprocessing.
                for future in tqdm(as_completed(tasks), total=len(tasks)):
                    sid, token_num, scoref_group_num, dcoref_group_num, fcoref_group_num = future.result()
                    numData = {
                        "tokNum":token_num,
                        "sNum": scoref_group_num,
                        "dNum": dcoref_group_num,
                        "fNum": fcoref_group_num,
                        "avgNum": (scoref_group_num + dcoref_group_num + fcoref_group_num) / 3
                    }
                    # For later statistic
                    section_doc_numData_dict[section_entry.name][sid]:dict[str,int] = numData
                    # For scatter plot
                    scatter_data_list.append(numData)

                START_EVENT.clear()

            section_scatter_data_list[section_entry.name] = scatter_data_list
    
    for section_name, doc_numData_dict in section_doc_numData_dict.items():
        section_corefNum_docs_dict[section_name] = defaultdict(list)
        for doc_id, numData_dict in doc_numData_dict.items():
            section_corefNum_docs_dict[section_name][numData_dict["fNum"]].append(doc_id)
    
    with open(section_corefNum_docs_dict_file,"w") as f:
        f.write(json.dumps(section_corefNum_docs_dict))
else:
    with open(section_corefNum_docs_dict_file,"r") as f:
        a = f.readlines()
    temp = json.loads("".join(a))
    for section_name, corefNum_docs_dict in temp.items():
        section_corefNum_docs_dict[section_name] = {}
        for corefNumStr, docList in corefNum_docs_dict.items():
            section_corefNum_docs_dict[section_name][int(corefNumStr)] = docList

In [43]:
section_corefNum_docNum_dict = {}
for section_name in ["findings", "impression"]:
    corefNum_docs_dict = section_corefNum_docs_dict[section_name]
    section_corefNum_docNum_dict[section_name] = {}
    corefNum = 0
    while True:
        if corefNum not in corefNum_docs_dict:
            break
        section_corefNum_docNum_dict[section_name][corefNum] = len(corefNum_docs_dict[corefNum])
        corefNum += 1
print(json.dumps(section_corefNum_docNum_dict, indent=2))

{
  "findings": {
    "0": 135615,
    "1": 16800,
    "2": 2804,
    "3": 590,
    "4": 154,
    "5": 27,
    "6": 13,
    "7": 6,
    "8": 2
  },
  "impression": {
    "0": 173650,
    "1": 13016,
    "2": 2155,
    "3": 506,
    "4": 100,
    "5": 28,
    "6": 8,
    "7": 1
  }
}


In [32]:
# 846/846
expected_sampling = {
  "findings": 846,
  "impression": 846
}

target_sampling = {
  "findings": {
    1: None, # 846 - rest
    2: 214,
    3: 214,
    4: 154,
    5: 27,
    6: 13,
    7: 6,
    8: 2
  },
  "impression": {
    1: None, # 846 - rest
    2: 236,
    3: 236,
    4: 100,
    5: 28,
    6: 8,
    7: 1
  }
}

In [33]:
from hydra import compose, initialize
from omegaconf import OmegaConf
from nlp_ensemble.nlp_menbers import play_fastcoref
config = None
with initialize(version_base=None, config_path="../config", job_name="coreference_resolution"):
        config = compose(config_name="coreference_resolution", overrides=["+coreference_resolution/data_preprocessing@_global_=mimic_cxr"])
shuffle_seed = config.shuffle_seed

In [34]:
# Choose docs for model training, excluded docs that used in gt
docId_testset_list = [i.rstrip(".csv") for i in FILE_CHECKER.filter(os.listdir(os.path.join("../../output/mimic_cxr/manual_test_set/round1x2", section_name)))]
trainset_docs_list = {}
for section_name in ["findings", "impression"]:
    trainset_docs_list[section_name] = defaultdict(list)
    section_all_docNum = 0
    for groupNum in sorted(target_sampling[section_name], reverse=True):
        docNum = target_sampling[section_name][groupNum]
        if groupNum == 1:
            docNum = expected_sampling[section_name] - section_all_docNum
        candidate_docId_list = section_corefNum_docs_dict[section_name][groupNum]
        candidate_docId_list_exclude = [x for x in candidate_docId_list if x not in docId_testset_list]
        candidate_docId_list_shuffle = shuffle_list(candidate_docId_list_exclude, shuffle_seed)
        trainset_docs_list[section_name][groupNum] = candidate_docId_list_shuffle[0:docNum]
        section_all_docNum += len(trainset_docs_list[section_name][groupNum])

The actual sampling details

In [35]:
for section_name, data_dict in trainset_docs_list.items():
    for group_num, doc_list in data_dict.items():
        print(section_name, group_num, len(doc_list))

findings 8 2
findings 7 6
findings 6 13
findings 5 26
findings 4 154
findings 3 214
findings 2 214
findings 1 217
impression 7 1
impression 6 8
impression 5 26
impression 4 99
impression 3 236
impression 2 236
impression 1 240


## Opt 1: Generate ensemble csv files

In [10]:
from common_utils.coref_utils import shuffle_list
import coref_voting
from coref_voting import DocClass, MentionClass, compute_voting_result, get_output_df
from hydra import compose, initialize
from omegaconf import OmegaConf
from nlp_ensemble.nlp_menbers import play_fastcoref


config = None
with initialize(version_base=None, config_path="../config", job_name="majority_voting"):
        config = compose(config_name="coreference_resolution", overrides=["+coreference_resolution/coref_voting@_global_=mimic_cxr"])

In [11]:
### Modify ###
config.input.source.coref_models.fj.dir = "../../output/mimic_cxr/nlp_ensemble/fast_coref_joint_(stripped_input)"

config.input.source.in_use = ["ml","rb","fj"] # ml, rb, fj, gt, fj_x, fj_x2
mv_output_base_dir = os.path.join("../../output/mimic_cxr/coref_voting/temp_for_silver/ml_rb_fj_stripped_1k")
for section_name in ["findings","impression"]:
    mv_output_dir = os.path.join(mv_output_base_dir, section_name)
    check_and_create_dirs(mv_output_dir)

In [12]:
def do_majority_voting(config, spacy_file_path, section_name, file_name):
    """ Voting on one document """
    START_EVENT.wait()

    # Read spacy output as alignment base
    df_spacy = pd.read_csv(spacy_file_path, index_col=0, na_filter=False)
    # Some of the i2b2 raw files are utf-8 start with DOM, but we didn't remove the DOM character, thus we fix it here.
    df_spacy.iloc[0] = df_spacy.iloc[0].apply(lambda x: x.replace("\ufeff", "").replace("\xef\xbb\xbf", "") if isinstance(x, str) else x)

    docObj: DocClass = coref_voting.resolve_voting_info(config, df_spacy, section_name, file_name)
    valid_mention_group: list[set[MentionClass]] = compute_voting_result(config, docObj)
    df_out = get_output_df(config, df_spacy, valid_mention_group, docObj)
    
    output_file_path = os.path.join(mv_output_base_dir, section_name, file_name)
    df_out.to_csv(output_file_path)

    return file_name

for section_name in ["findings","impression"]:
    with ProcessPoolExecutor(max_workers=config.thread.workers) as executor:
        all_task = []
        doc_list = [docid for _, docs in trainset_docs_list[section_name].items() for docid in docs]
        
        for doc_id in doc_list:
            file_name = doc_id + ".csv"
            spacy_out_dir = os.path.join(config.input.source.baseline_model.dir, section_name)
            spacy_file_path = os.path.join(spacy_out_dir, file_name)
            all_task.append(executor.submit(do_majority_voting, config, spacy_file_path, section_name, file_name))
        
         # Notify tasks to start
        START_EVENT.set()

        if all_task:
            for future in tqdm(as_completed(all_task), total=len(all_task)):
                file_name = future.result()

        executor.shutdown(wait=True, cancel_futures=False)
        START_EVENT.clear()

100%|██████████| 846/846 [03:46<00:00,  3.73it/s]
100%|██████████| 846/846 [03:21<00:00,  4.21it/s]


statistics 

(re-run the following scripts wihtin this section, if not getting enough docs for silver-ensemble)

In [18]:
section_corefNum_docs_dict_file = "../../output/mimic_cxr/nlp_ensemble/fast_coref_joint_(stripped_input).statistic"

### Modify ###
ensemble_dir = mv_output_base_dir

def batch_processing(section_name, sid):
    START_EVENT.wait()
    df_fcoref = pd.read_csv(os.path.join(ensemble_dir,section_name,sid+".csv"), index_col=0, na_filter=False)
    _, fcoref_group_num = resolve_mention_and_group_num(df_fcoref, "[mv]coref_group_conll")
    return sid,  fcoref_group_num

section_corefNum_ensembleDoc_dict = {}

section_scatter_data_list = {}
for section_name in ["findings", "impression"]:
    section_corefNum_ensembleDoc_dict[section_name] = defaultdict(list)
    
    tasks = []
    with ProcessPoolExecutor(max_workers=14) as executor:
        for doc_filename in FILE_CHECKER.filter(os.listdir(os.path.join(ensemble_dir, section_name))):
            sid = doc_filename.rstrip(".csv")
            tasks.append(executor.submit(batch_processing, section_name, sid))

        START_EVENT.set()

        # Receive results from multiprocessing.
        for future in tqdm(as_completed(tasks), total=len(tasks)):
            sid, fcoref_group_num = future.result()
            # For later statistic
            section_corefNum_ensembleDoc_dict[section_name][fcoref_group_num].append(sid)

        START_EVENT.clear()

for section_name, data_dict in section_corefNum_ensembleDoc_dict.items():
    for group_num, doc_list in data_dict.items():
        print(section_name, group_num, len(doc_list))
        
for section_name, data_dict in section_corefNum_ensembleDoc_dict.items():
    a = 0
    for group_num, doc_list in data_dict.items():
        a += len(doc_list)
    print(section_name, a)

100%|██████████| 1007/1007 [00:00<00:00, 4812.06it/s]
100%|██████████| 1035/1035 [00:00<00:00, 4964.25it/s]

findings 1 406
findings 2 241
findings 3 126
findings 0 161
findings 5 16
findings 4 48
findings 7 2
findings 6 7
impression 1 442
impression 2 285
impression 0 189
impression 3 84
impression 4 26
impression 5 6
impression 6 3
findings 1007
impression 1035





After doing ensemble, some of the target docs might become no_coref, and will be ignored later. 

Therefore, we need to add more docs that have >1_coref to make the final training set to be 1000 docs.

In [19]:
import math

section_rest_sampling = {}
for section_name in ["findings", "impression"]:
    num_needed = expected_sampling[section_name] - len([doc for coref_num, doc_list in section_corefNum_ensembleDoc_dict[section_name].items() if coref_num != 0 for doc in doc_list])
    print(num_needed)
    section_rest_sampling[section_name] = {1: math.ceil(num_needed/3), 2: math.ceil(num_needed/3), 3: num_needed - math.ceil(num_needed/3)*2}
section_rest_sampling

0
0


{'findings': {1: 0, 2: 0, 3: 0}, 'impression': {1: 0, 2: 0, 3: 0}}

In [16]:
def do_majority_voting2(config, spacy_file_path, section_name, file_name):
    """ Voting on one document """
    START_EVENT.wait()
    mv_output_dir = os.path.join(mv_output_base_dir, section_name)
    check_and_create_dirs(mv_output_dir)
    output_file_path = os.path.join(mv_output_dir, file_name)
    
    if os.path.exists(output_file_path):
        df_out = pd.read_csv(output_file_path, index_col=0, na_filter=False)
    else:
        # Read spacy output as alignment base
        df_spacy = pd.read_csv(spacy_file_path, index_col=0, na_filter=False)
        # Some of the i2b2 raw files are utf-8 start with DOM, but we didn't remove the DOM character, thus we fix it here.
        df_spacy.iloc[0] = df_spacy.iloc[0].apply(lambda x: x.replace("\ufeff", "").replace("\xef\xbb\xbf", "") if isinstance(x, str) else x)

        docObj: DocClass = coref_voting.resolve_voting_info(config, df_spacy, section_name, file_name)
        valid_mention_group: list[set[MentionClass]] = compute_voting_result(config, docObj)
        df_out = get_output_df(config, df_spacy, valid_mention_group, docObj)

    _, fcoref_group_num = resolve_mention_and_group_num(df_out, "[mv]coref_group_conll")
    
    return file_name, fcoref_group_num, df_out, output_file_path

docId_testset_list = [i.rstrip(".csv") for i in FILE_CHECKER.filter(os.listdir(os.path.join("../../output/mimic_cxr/manual_test_set/round1x2", section_name)))]

doc_added = {}
for section_name in ["findings", "impression"]:
    doc_added[section_name] = defaultdict(list)
    for group_num, doc_num_needed in section_rest_sampling[section_name].items():
        
        # all docs under this groupNum
        candidate_docId_list = section_corefNum_docs_dict[section_name][group_num]
        # Exclude docs that used in test set
        candidate_docId_list_exclude = [x for x in candidate_docId_list if x+".csv" not in docId_testset_list]
        # exclude docs that already in target dir
        exising_docs = os.listdir(os.path.join(mv_output_base_dir,section_name))
        candidate_docId_list_exclude = [x for x in candidate_docId_list if x+".csv" not in exising_docs]
        
        with ProcessPoolExecutor(max_workers=config.thread.workers) as executor:
            all_task = []
            
            for doc_id in candidate_docId_list_exclude:
                file_name = doc_id + ".csv"
                spacy_out_dir = os.path.join(config.input.source.baseline_model.dir, section_name)
                spacy_file_path = os.path.join(spacy_out_dir, file_name)
                all_task.append(executor.submit(do_majority_voting2, config, spacy_file_path, section_name, file_name))
                    
            # Notify tasks to start
            START_EVENT.set()

            if all_task:
                doc_num_added = 0
                for future in tqdm(as_completed(all_task), total=len(all_task)):
                    if future.cancelled():
                        continue
                    file_name, fcoref_group_num, df_out, output_file_path = future.result()
                    if fcoref_group_num > 0:
                        doc_num_added += 1
                        if doc_num_added <= doc_num_needed:
                            df_out.to_csv(output_file_path)
                            doc_added[section_name][group_num].append(file_name)
                        else:
                            for fu in all_task:
                                fu.cancel()
                            
            START_EVENT.clear()

100%|██████████| 16583/16583 [00:22<00:00, 725.31it/s]  
100%|██████████| 2590/2590 [00:21<00:00, 118.74it/s] 
100%|██████████| 376/376 [00:26<00:00, 14.30it/s] 
100%|██████████| 12776/12776 [00:24<00:00, 517.38it/s]  
100%|██████████| 1919/1919 [00:21<00:00, 88.42it/s]  
100%|██████████| 270/270 [00:26<00:00, 10.16it/s] 


In [17]:
for section_name, data_dict in doc_added.items():
    targetList = os.listdir(os.path.join(mv_output_base_dir,section_name))
    for group_num, doc_list in data_dict.items():
        print(section_name, group_num, len(doc_list))
        for doccsv in doc_list:
            if doccsv not in targetList:
                print(doccsv)

findings 1 54
findings 2 54
findings 3 53
impression 1 63
impression 2 63
impression 3 63


## Given csv files, generate individual conll files

In [20]:
from hydra import compose, initialize
from omegaconf import OmegaConf
from nlp_ensemble.nlp_menbers import play_fastcoref
config = None
config = None
with initialize(version_base=None, config_path="../config", job_name="coreference_resolution"):
        config = compose(config_name="coreference_resolution", overrides=["+coreference_resolution/data_preprocessing@_global_=mimic_cxr"])

In [21]:
### Modify ###
# Choose to use "model direct output" or "ensemble output"
# config.input_pred.base_dir = "../../output/mimic_cxr/nlp_ensemble/fast_coref_joint_(stripped_input)" # source csv dir
# config.input_pred.column_name.coref_group_conll = "[fj]coref_group_conll"
# config.input_pred.column_name.sentence_group = "[fj]sentence_group"
# config.input_pred.column_name.token = "[fj]token_from_spacy"
# config.temp_pred.base_dir = "../../output/mimic_cxr/coref/temp_individual_conll/joint_best_(stripped_input)" # target output dir

config.input_pred.base_dir = "../../output/mimic_cxr/coref_voting/temp_for_silver/ml_rb_fj_stripped_1k" # source csv dir
config.temp_pred.base_dir = "../../output/mimic_cxr/coref/temp_individual_conll/majority_voting_(ml_rb_fj_stripped)" # target output dir

target_doc_files = {}
for section_name in ["findings", "impression"]:
    # For model direct output
    # target_doc_files[section_name] = os.listdir(os.path.join(config.input_pred.base_dir, section_name))

    # For ensemble output
    target_doc_files[section_name] = [doc for _, docs in trainset_docs_list[section_name].items() for doc in docs]

    
config.input_pred.section = ["findings", "impression"]
check_and_remove_dirs(config.temp_pred.base_dir,True)

In [22]:
def batch_processing(input_cfg, temp_cfg, section_name, input_file_path) -> int:
    """ All whitespces like "\n", "\n " and " " are skipped. 
    Return:
        True if this doc has at least one coref group. Otherwise False
    """
    START_EVENT.wait()

    doc_id = get_file_name_prefix(input_file_path, input_cfg.suffix)
    BEGIN = f"#begin document ({doc_id}_{section_name}); part 0\n"
    SENTENCE_SEPARATOR = "\n"
    END = "#end document\n"
    output_file_path = os.path.join(temp_cfg.base_dir, section_name, f"{doc_id}.conll")

    # Resolve CSV file
    sentenc_list: list[list[ConllToken]] = []
    df = pd.read_csv(input_file_path, index_col=0, na_filter=False)
    _, coref_group_num = resolve_mention_and_group_num(df, input_cfg.column_name.coref_group_conll)

    # Write .conll file only if doc has at least one coref group
    if coref_group_num > 0:
        sentence_id = 0
        while True:
            token_list: list[ConllToken] = []
            df_sentence = df[df.loc[:, input_cfg.column_name.sentence_group] == sentence_id].reset_index()
            if df_sentence.empty:
                break
            for _idx, data in df_sentence.iterrows():
                # Skip all whitespces like "\n", "\n " and " ".
                if str(data[input_cfg.column_name.token]).strip() == "":
                    continue
                conllToken = ConllToken(doc_id+"_"+section_name, sentence_id, _idx, data[input_cfg.column_name.token])
                coref_col_cell = data[input_cfg.column_name.coref_group_conll]
                if isinstance(coref_col_cell, str) and coref_col_cell != "-1":
                    conllToken.add_coref_label("|".join(ast.literal_eval(coref_col_cell)))
                token_list.append(conllToken)
            sentenc_list.append(token_list)
            sentence_id += 1
        with open(output_file_path, "w", encoding="UTF-8") as out:
            out.write(BEGIN)
            for sent in sentenc_list:
                # Skip empty sentence
                if len(sent) == 1 and sent[0].tokenStr == "":
                    continue
                for tok in sent:
                    out.write(tok.get_conll_str() + "\n")
                out.write(SENTENCE_SEPARATOR)
            out.write(END)

    return doc_id, coref_group_num

for section_name in ["findings", "impression"]:
    section_temp_conll_dir = os.path.join(config.temp_pred.base_dir, section_name)
    check_and_create_dirs(section_temp_conll_dir)
    
    doc_files = target_doc_files[section_name]
    with ProcessPoolExecutor(max_workers=config.thread.workers) as executor:
        all_task = []
        for file_name in tqdm(doc_files):
            # if len(all_task) > 100:
            #     break
            input_file_path = os.path.join(config.input_pred.base_dir, section_name, file_name)
            all_task.append(executor.submit(batch_processing, config.input_pred, config.temp_pred, section_name, input_file_path))

        # Notify tasks to start
        START_EVENT.set()

        corefGroupNum_docId_dict = defaultdict(list)
        if all_task:
            for future in tqdm(as_completed(all_task), total=len(all_task)):
                doc_id, coref_group_num = future.result()

        executor.shutdown(wait=True, cancel_futures=False)
        START_EVENT.clear()

100%|██████████| 1007/1007 [00:00<00:00, 5383.59it/s]
100%|██████████| 1007/1007 [00:40<00:00, 24.67it/s]
100%|██████████| 1035/1035 [00:00<00:00, 5566.75it/s]
100%|██████████| 1035/1035 [00:36<00:00, 27.99it/s]


## Given individual conll files, generate aggregrated conll

In [23]:
# 1000,343,348
data_split_num_list = {
    "findings": {
        "train": 500,
        "dev": 172,
        "test": 174
    },
    "impression": {
        "train": 500,
        "dev": 172,
        "test": 174
    }
}
### Modify ###
output_conll_aggregrate_dir = "../../output/mimic_cxr/coref/aggregrate_conll/train_silver_mv_1k"

In [24]:
from data_preprocessing.mimic_cxr_csv2conll import copy_and_paste_conll

check_and_remove_dirs(output_conll_aggregrate_dir, True)
check_and_create_dirs(output_conll_aggregrate_dir)
print("Source input:",config.temp_pred.base_dir)
for section_name in ["findings", "impression"]:
    section_temp_conll_dir = os.path.join(config.temp_pred.base_dir, section_name)
    docId_list_shuffle = shuffle_list(FILE_CHECKER.filter(os.listdir(section_temp_conll_dir)), config.shuffle_seed)
    split_start = 0
    for split in ["train","dev","test"]:
        split_end = split_start + data_split_num_list[section_name][split]
        print(section_name, split, split_start, split_end)
        # Aggregrate one by one
        for doc_filename in docId_list_shuffle[split_start:split_end]:
            input_conll_file = os.path.join(config.temp_pred.base_dir, section_name, doc_filename)
            output_conll_file = os.path.join(output_conll_aggregrate_dir,f"{split}.conll")
            copy_and_paste_conll(input_conll_file, output_conll_file)
            
        split_start = split_end

Source input: ../../output/mimic_cxr/coref/temp_individual_conll/majority_voting_(ml_rb_fj_stripped)
findings train 0 500
findings dev 500 672
findings test 672 846
impression train 0 500
impression dev 500 672
impression test 672 846


## Given aggregrated conll files, generate jsonlines files

In [25]:
config.data_split.train_silver.dir_name = "train_silver_mv_1k"
config.longformer.source = [{'train': 'train_silver'}, {'dev': 'train_silver'}, {'test': 'train_silver'}]

In [26]:
from data_preprocessing import mimic_cxr_conll2jsonlines

log_msg = mimic_cxr_conll2jsonlines.invoke(config)

Model: longformer, Segment length: 4096


2022-12-13 21:46:00,751 - Wrote 1000 documents to /home/yuxiangliao/PhD/workspace/VSCode_workspace/structured_reporting/output/mimic_cxr/coref/longformer/train.4096.jsonlines
2022-12-13 21:46:01,718 - Wrote 344 documents to /home/yuxiangliao/PhD/workspace/VSCode_workspace/structured_reporting/output/mimic_cxr/coref/longformer/dev.4096.jsonlines
2022-12-13 21:46:02,719 - Wrote 348 documents to /home/yuxiangliao/PhD/workspace/VSCode_workspace/structured_reporting/output/mimic_cxr/coref/longformer/test.4096.jsonlines
