# Create unlabeled pool

Our MIMIC-CXR dataset (split by section) has too many docs that have no coreference clusters.

We need to create a subset for the unlabeled pool in which docs are more likely to have coreference clusters.

The preliminary statistics are computed with the original fast-coref model.

The subset must exclude the gt_test files.

The subset is sampled from the candidate dataset used for generating the other silver/manual/semi-manual datasets. For efficiency, we restrict the size of the subset to 5k docs.

The `unlabeled pool` will be generated at `output/mimic_cxr/active_learning/`

In [11]:
import sys

sys.path.append("../../src")
sys.path.append("../../../../git_clone_repos/fast-coref/src")

import os
import ast
import json
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from collections import defaultdict
from IPython.display import display, HTML
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import Event
from common_utils.data_loader_utils import load_mimic_cxr_bySection
from common_utils.coref_utils import (
    resolve_mention_and_group_num,
    shuffle_list,
    ConllToken,
    check_and_make_dir,
    get_data_split,
    get_file_name_prefix,
    get_porportion_and_name,
    remove_all,
    resolve_mention_and_group_num,
    shuffle_list,
)
from common_utils.file_checker import FileChecker
from common_utils.common_utils import check_and_create_dirs, check_and_remove_dirs

# Set before any tokenizer is used.
# NOTE(review): presumably this disables HuggingFace tokenizers' internal parallelism
# (and its fork warning) — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Shared FileChecker instance; a later cell calls FILE_CHECKER.filter(...) on a directory listing.
FILE_CHECKER = FileChecker()
# multiprocessing Event; not referenced by the cells visible in this notebook section.
START_EVENT = Event()

# NOTE(review): the sampling cell shuffles with `config.shuffle_seed`, not SEED_NUM —
# confirm whether this constant is still needed.
SEED_NUM = 42

In [12]:
from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose the Hydra "data_preprocessing" config with the mimic_cxr nlp_ensemble override.
# `config` is reset first so a failed compose cannot leave a stale config from a previous run.
config = None
with initialize(version_base=None, config_path="../config", job_name="nlp_ensemble"):
    config = compose(config_name="data_preprocessing", overrides=["+nlp_ensemble@_global_=mimic_cxr"])

# Settings consumed by the section loader below.
section_name_cfg = config.name_style.mimic_cxr.section_name
output_section_cfg = config.output.section
input_path = config.input.path
# The next cell indexes `section_list` as a sequence of (section_name, values) pairs and
# zips the values with `sid_list`.
# NOTE(review): presumably pid_list/sid_list are patient/study ids aligned with the
# section values — confirm against load_mimic_cxr_bySection.
data_size, pid_list, sid_list, section_list = load_mimic_cxr_bySection(input_path, output_section_cfg, section_name_cfg)

In [13]:
# Re-order the study ids and the four section columns together so that they stay
# aligned after sorting. Rows are compared as whole tuples, i.e. by study id first.
# NOTE(review): `pid_list` is not re-ordered here, so it no longer lines up with
# `sid_list` after this cell — confirm that nothing downstream relies on it.
sorted_rows = sorted(zip(sid_list, *(section_list[idx][1] for idx in range(4))))
s_list, f_list, i_list, pfi_list, fai_list = zip(*sorted_rows)

sid_list = s_list
# Section names are assigned by position: the loader is assumed to return the sections
# in exactly this order.
section_list = list(
    zip(
        ("findings", "impression", "provisional_findings_impression", "findings_and_impression"),
        (f_list, i_list, pfi_list, fai_list),
    )
)

`fcoref_dir` is the folder that our unlabeled pool data comes from. It contains the section reports in CSV format, in which the content is tokenized by spaCy and annotated with the coreference results predicted by the fast-coref model.

`section_corefNum_docs_dict_file` holds the statistics on the number of coref clusters for each document in `fcoref_dir`. This file is a by-product of `generate_silver_trainset.ipynb`-`#Prepare`.

In [14]:
# Locations of the fast-coref outputs: the per-document CSV folder and the statistic
# file that maps section -> number of coref clusters -> list of doc ids.
fcoref_dir = "../../output/mimic_cxr/nlp_ensemble/fast_coref_joint_(stripped_input)"
section_corefNum_docs_dict_file = "../../output/mimic_cxr/nlp_ensemble/fast_coref_joint_(stripped_input).statistic"

with open(section_corefNum_docs_dict_file, "r") as f:
    raw_statistic = json.load(f)

# JSON object keys are always strings, so turn the cluster counts back into ints.
section_corefNum_docs_dict = {
    section_name: {int(corefNumStr): docList for corefNumStr, docList in corefNum_docs_dict.items()}
    for section_name, corefNum_docs_dict in raw_statistic.items()
}

In [15]:
# For "findings" and "impression", count how many docs have 0, 1, 2, ... coref clusters.
section_corefNum_docNum_dict = {}
for section_name in ["findings", "impression"]:
    corefNum_docs_dict = section_corefNum_docs_dict[section_name]
    docNum_by_corefNum = {}
    corefNum = 0
    # Walk consecutive cluster counts starting at 0 and stop at the first count that
    # has no entry (any counts after a gap are not reported).
    while corefNum in corefNum_docs_dict:
        docNum_by_corefNum[corefNum] = len(corefNum_docs_dict[corefNum])
        corefNum += 1
    section_corefNum_docNum_dict[section_name] = docNum_by_corefNum
print(json.dumps(section_corefNum_docNum_dict))

{"findings": {"0": 135615, "1": 16800, "2": 2804, "3": 590, "4": 154, "5": 27, "6": 13, "7": 6, "8": 2}, "impression": {"0": 173650, "1": 13016, "2": 2155, "3": 506, "4": 100, "5": 28, "6": 8, "7": 1}}


In [16]:
# Number of docs wanted in the unlabeled pool for each section.
expected_sampling = {"findings": 2500, "impression": 2500}

# Docs to draw per coref-cluster count. Counts >= 3 take every available doc
# (792 for findings, 643 for impression); `None` marks the counts that share what
# is left of the section budget.
target_sampling = {
    "findings": {1: None, 2: None, 3: 590, 4: 154, 5: 27, 6: 13, 7: 6, 8: 2},
    "impression": {1: None, 2: None, 3: 506, 4: 100, 5: 28, 6: 8, 7: 1},
}

# Split the remaining budget evenly across the unset entries.
for section_name in ["findings", "impression"]:
    quota = target_sampling[section_name]
    budget = expected_sampling[section_name]

    unset_keys = [corefNum for corefNum in quota if not quota[corefNum]]
    even_share = (budget - sum(filter(None, quota.values()))) // len(unset_keys)
    quota.update(dict.fromkeys(unset_keys, even_share))

    # Integer division can leave the total short of the budget; the (non-positive)
    # surplus is subtracted from the 1-cluster quota so the total matches exactly.
    surplus = sum(filter(None, quota.values())) - budget
    quota[1] -= surplus
    print(sum(filter(None, quota.values())))

print(target_sampling)

2500
2500
{'findings': {1: 854, 2: 854, 3: 590, 4: 154, 5: 27, 6: 13, 7: 6, 8: 2}, 'impression': {1: 929, 2: 928, 3: 506, 4: 100, 5: 28, 6: 8, 7: 1}}


In [17]:
from hydra import compose, initialize
from omegaconf import OmegaConf
from nlp_ensemble.nlp_menbers import play_fastcoref

# Compose the Hydra "coreference_resolution" config with the mimic_cxr data_preprocessing
# override. This rebinds `config`, replacing the data_preprocessing config loaded earlier
# in the notebook.
config = None
with initialize(version_base=None, config_path="../config", job_name="coreference_resolution"):
    config = compose(
        config_name="coreference_resolution",
        overrides=["+coreference_resolution/data_preprocessing@_global_=mimic_cxr"],
    )
# Seed passed to shuffle_list() when sampling docs for the unlabeled pool.
shuffle_seed = config.shuffle_seed

In [18]:
# Choose docs for the unlabeled pool, excluding docs that are used in the ground-truth test set.
TESTSET_ROOT_DIR = "../../output/mimic_cxr/manual_test_set/round1x2"
CSV_SUFFIX = ".csv"


def _load_testset_doc_ids(section_name):
    """Return the doc ids (file names without ``.csv``) of the manual test set of one section."""
    doc_ids = []
    for file_name in FILE_CHECKER.filter(os.listdir(os.path.join(TESTSET_ROOT_DIR, section_name))):
        # Strip the suffix explicitly: str.rstrip(".csv") removes any trailing run of the
        # characters '.', 'c', 's', 'v' and can therefore eat into the doc id itself.
        if file_name.endswith(CSV_SUFFIX):
            file_name = file_name[: -len(CSV_SUFFIX)]
        doc_ids.append(file_name)
    return doc_ids


unlabeled_pool_docs_list = {}
for section_name in ["findings", "impression"]:
    # Load the test set of the section being sampled. Reading it once before the loop
    # silently depended on whatever value `section_name` had been left with by an
    # earlier cell, so only one section's test docs were ever excluded.
    docId_testset_list = _load_testset_doc_ids(section_name)
    docId_testset_set = set(docId_testset_list)  # O(1) membership tests

    unlabeled_pool_docs_list[section_name] = defaultdict(list)
    section_all_docNum = 0
    # Sample the requested number of docs for each coref-cluster count.
    for groupNum in sorted(target_sampling[section_name], reverse=True):
        docNum = target_sampling[section_name][groupNum]
        candidate_docId_list = section_corefNum_docs_dict[section_name][groupNum]
        candidate_docId_list_exclude = [x for x in candidate_docId_list if x not in docId_testset_set]
        candidate_docId_list_shuffle = shuffle_list(candidate_docId_list_exclude, shuffle_seed)
        unlabeled_pool_docs_list[section_name][groupNum] = candidate_docId_list_shuffle[0:docNum]
        section_all_docNum += len(unlabeled_pool_docs_list[section_name][groupNum])

    # After removing test files from the unlabeled pool,
    # we need a few extra documents to meet the expected sampling number.
    # They are taken from the end of the 1-cluster candidates, skipping test-set docs
    # and docs that are already in the pool.
    missing_docNum = expected_sampling[section_name] - section_all_docNum
    if missing_docNum > 0:
        single_cluster_pool = unlabeled_pool_docs_list[section_name][1]
        unavailable_docIds = docId_testset_set | set(single_cluster_pool)
        for doc in reversed(section_corefNum_docs_dict[section_name][1]):
            if missing_docNum == 0:
                break
            if doc in unavailable_docIds:
                continue
            single_cluster_pool.append(doc)
            unavailable_docIds.add(doc)
            section_all_docNum += 1
            missing_docNum -= 1
        if missing_docNum > 0:
            # Fail with a clear message instead of an IndexError from an empty pop().
            raise ValueError(
                f"Not enough candidate docs for section '{section_name}': "
                f"{missing_docNum} more docs are needed to reach {expected_sampling[section_name]}."
            )

In [19]:
# Sanity check: print the total number of docs selected for each section.
for section_name, corefGroupNum_docId_dict in unlabeled_pool_docs_list.items():
    print(sum(len(doc_list) for doc_list in corefGroupNum_docId_dict.values()))

2500
2500


`unlabeled_pool_dir` is the folder where the unlabeled pool data will be placed.

In [20]:
import shutil

# NOTE(review): the files are copied from the `spacy` output folder, while the markdown
# above names `fcoref_dir` as the source of the pool data — confirm which one is intended.
source_csv_dir = "../../output/mimic_cxr/nlp_ensemble/spacy"
unlabeled_pool_dir = "../../output/mimic_cxr/active_learning/unlabeled_pool_5k"

# Copy every selected doc's CSV into <unlabeled_pool_dir>/<section_name>/.
for section_name, data_dict in unlabeled_pool_docs_list.items():
    csv_source_dir = os.path.join(source_csv_dir, section_name)
    csv_target_dir = os.path.join(unlabeled_pool_dir, section_name)
    # Project helper; presumably creates the target folder if it is missing — confirm.
    check_and_create_dirs(csv_target_dir)

    # Flatten the per-cluster-count doc lists; the copy order is unchanged.
    csv_file_names = [doc_id + ".csv" for doc_list in data_dict.values() for doc_id in doc_list]
    for csv_file_name in csv_file_names:
        shutil.copy(
            os.path.join(csv_source_dir, csv_file_name),
            os.path.join(csv_target_dir, csv_file_name),
        )