In [8]:
import sys
sys.path.append("../../src")

import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import Event
from common_utils.data_loader_utils import load_mimic_cxr_bySection
from common_utils.coref_utils import resolve_mention_and_group_num
from common_utils.file_checker import FileChecker
from common_utils.common_utils import check_and_create_dirs
from preprocess_i2b2 import aggregrate_files, I2b2Token, get_file_name_prefix, clean_and_split_line

FILE_CHECKER = FileChecker()
START_EVENT = Event()

In [9]:
from hydra import compose, initialize
from omegaconf import OmegaConf

config = None
with initialize(version_base=None, config_path="../config", job_name="i2b2_for_brat"):
        config = compose(config_name="data_preprocessing", overrides=["data_preprocessing@_global_=i2b2","machine=mac", "data_dir=/Users/liao/Desktop/DBMI_c2b2_2011_coref"])

# Aggregrate files

In [10]:
temp_dir = os.path.join("/Users/liao/Desktop/DBMI_c2b2_2011_coref","temp")
check_and_create_dirs(temp_dir)
docs_dir, chains_dir = aggregrate_files(config, temp_dir)

# Check that the files are matched.
doc_files = os.listdir(docs_dir)
chain_files = os.listdir(chains_dir)
assert len(doc_files) == len(chain_files)

# Resolve files

In [15]:
# Process each files
def batch_processing(doc_file_path, chain_file_path) -> tuple[str, str, list[list[I2b2Token]]]:
    """ Resolve a single i2b2 document, including a .txt file and a .chains file. """
    doc_id = get_file_name_prefix(doc_file_path, ".txt")

    # Resolve doc file
    sentence_list: list[list[I2b2Token]] = []
    with open(doc_file_path, "r", encoding="UTF-8-sig") as doc:
        tokenId_docwise = 0
        # for sentence_id, doc_line in enumerate(doc.readlines()):
        #     token_list: list[I2b2Token] = []
        #     for tokenId_sentencewise, token_str in enumerate(clean_and_split_line(doc_line, debug_doc=doc_id, debug_sent=sentence_id)):
        #         token_list.append(I2b2Token(doc_id, sentence_id, tokenId_sentencewise, tokenId_docwise, token_str))
        #         tokenId_docwise += 1
        #     sentence_list.append(token_list)

    # Resolve chain file (coref cluster)
    with open(chain_file_path, "r", encoding="UTF-8-sig") as chain:
        for cluster_id, cluster in enumerate(chain.readlines()):
            for coref in cluster.split("||")[0:-1]:  # Drop the last one, which is the type of the coref
                token_range: list[str, str] = coref.split(" ")[-2:]
                start = token_range[0]
                end = token_range[1]
                # if start == end:
                #     sentId, tokId = start.split(":")
                #     mark = f"({cluster_id})"
                #     sentence_list[int(sentId) - 1][int(tokId)].add_coref_conllmark(mark)
                # else:
                #     sentId, tokId = start.split(":")
                #     startMark = f"({cluster_id}"
                #     sentence_list[int(sentId) - 1][int(tokId)].add_coref_conllmark(startMark)

                #     sentId, tokId = end.split(":")
                #     endMark = f"{cluster_id})"
                #     sentence_list[int(sentId) - 1][int(tokId)].add_coref_conllmark(endMark)

    return doc_file_path, doc_id, sentence_list

all_task = []
with ProcessPoolExecutor(max_workers=1) as executor:
    # Submit task
    for _file_name in tqdm(doc_files):
        # Input files
        doc_file_path = os.path.join(docs_dir, _file_name)
        chain_file_path = os.path.join(chains_dir, _file_name + config.input.chain_suffix)
        all_task.append(executor.submit(batch_processing, doc_file_path, chain_file_path))

    # Notify tasks to start
    START_EVENT.set()

    # When a submitted task finished, the output is received here.
    if all_task:
        for future in tqdm(as_completed(all_task), total=len(all_task)):
            doc_file_path, doc_id, sentence_list = future.result()
    START_EVENT.clear()

100%|██████████| 424/424 [00:00<00:00, 8636.29it/s]
  0%|          | 0/424 [00:00<?, ?it/s]Process SpawnProcess-51:
Traceback (most recent call last):
  File "/Users/liao/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/liao/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/liao/opt/anaconda3/lib/python3.9/concurrent/futures/process.py", line 237, in _process_worker
    call_item = call_queue.get(block=True)
  File "/Users/liao/opt/anaconda3/lib/python3.9/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'batch_processing' on <module '__main__' (built-in)>
  0%|          | 0/424 [00:00<?, ?it/s]


BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.