# Statistics for the manuscript Table 1.

In [43]:
# Root directory of the per-report CSV files; the driver cell scans it for one sub-directory per section.
csv_file_path = "../../output/mimic_cxr/coref_voting/temp_for_silver/ensemble_1k"
# Name of the CSV column whose cells hold the CoNLL-style coreference markers used for the statistics.
csv_col_name = "[mv]coref_group_conll"

In [44]:
import sys
sys.path.append("../../src")
sys.path.append("../../../../git_clone_repos/fast-coref/src")

import os
import ast
import json
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from collections import defaultdict
from IPython.display import display, HTML
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import Event
from common_utils.file_checker import FileChecker
from common_utils.common_utils import check_and_create_dirs, check_and_remove_dirs
import ast
from collections import Counter
import logging
import os
import random
import re
import shutil
import pandas as pd
import numpy as np
from natsort import natsorted

# NOTE(review): presumably disables tokenizer-level parallelism in a downstream library;
# nothing in the visible cells reads this variable — confirm it is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Project helper; its ignore(path) method is used by the driver cell to skip report files.
FILE_CHECKER = FileChecker()
# Start gate for the worker processes: batch_processing blocks on it, and the driver cell
# sets it once every task of a section has been submitted, then clears it again.
START_EVENT = Event()

# NOTE(review): not referenced in the visible cells — confirm whether a later cell seeds with it.
SEED_NUM = 42


In [45]:
def resolve_mention_and_group_num(df: pd.DataFrame, conll_colName: str, omit_singleton=True) -> tuple[int, int, int, int]:
    """Count coreference mentions, clusters and mention tokens of one document.

    Every retained cell of ``conll_colName`` is a list (or the string repr of a list) of
    CoNLL-style markers: ``(N`` opens a mention of cluster ``N``, ``N)`` closes the most
    recently opened mention of cluster ``N``, and ``(N)`` does both on the same row.

    Args:
        df: The dataframe resolved from csv file. A mention's length is computed as
            ``closing_index - opening_index + 1``, so the index is assumed to hold
            consecutive integer token positions.
        conll_colName: The name of the column with conll format elements.
        omit_singleton: When True, clusters with a single mention (and that mention) are
            excluded from every returned figure. When False, all clusters are counted.

    Return:
        A 4-tuple ``(coref_mention_num, coref_group_num, mention_tok_num,
        max_cluster_mention_num)``: the number of counted mentions, the number of counted
        clusters, the total number of tokens covered by the counted mentions, and the
        largest mention count of any counted cluster (0 when there is none).

    Raises:
        ValueError: If a closing marker has no matching open mention of the same cluster.
    """
    # Only the cells that carry markers are kept; "-1", -1.0 and NaN mean "no coreference".
    has_marker = ~df.loc[:, conll_colName].isin(["-1", -1.0, np.nan])
    marked_cells = df.loc[has_marker, conll_colName]

    # Parallel stacks of the currently open mentions (most recently opened first).
    open_cluster_ids = []
    open_start_rows = []

    mention_count_by_cluster = Counter()
    mention_lengths_by_cluster = defaultdict(list)
    for row_idx, cell in zip(marked_cells.index.tolist(), marked_cells.to_list()):
        # Cells read from csv are strings such as "['(1', '2)']"; in-memory frames may hold lists.
        markers = cell if isinstance(cell, list) else ast.literal_eval(cell)
        for marker in markers:
            opened = re.search(r"\((\d+)", marker)
            closed = re.search(r"(\d+)\)", marker)
            if opened:
                open_cluster_ids.insert(0, int(opened.group(1)))
                open_start_rows.insert(0, row_idx)
            if closed:
                cluster_id = int(closed.group(1))
                try:
                    # First hit is the innermost (latest opened) mention of this cluster.
                    stack_pos = open_cluster_ids.index(cluster_id)
                except ValueError:
                    raise ValueError(
                        f"Closing marker {marker!r} at index {row_idx!r} has no open mention of cluster {cluster_id}"
                    ) from None
                del open_cluster_ids[stack_pos]
                start_row = open_start_rows.pop(stack_pos)
                mention_lengths_by_cluster[cluster_id].append(row_idx - start_row + 1)
                mention_count_by_cluster[cluster_id] += 1

    # Previously the results were only computed when omit_singleton was True, so passing
    # False raised UnboundLocalError; both modes now share the same aggregation.
    min_mentions = 2 if omit_singleton else 1
    counted_clusters = {
        cluster_id: mention_num
        for cluster_id, mention_num in mention_count_by_cluster.items()
        if mention_num >= min_mentions
    }

    coref_mention_num = sum(counted_clusters.values())
    coref_group_num = len(counted_clusters)
    mention_tok_num = sum(sum(mention_lengths_by_cluster[cluster_id]) for cluster_id in counted_clusters)
    max_cluster_mention_num = max(counted_clusters.values(), default=0)

    return coref_mention_num, coref_group_num, mention_tok_num, max_cluster_mention_num

In [46]:
def batch_processing(src_dir, section_name, sid, spacy_input_path):
    """Compute the per-document statistics for one report CSV (runs in a worker process).

    Args:
        src_dir: Root directory that contains the section sub-directories.
        section_name: Name of the section sub-directory holding the report.
        sid: File name of the report without the ".csv" extension.
        spacy_input_path: Accepted for the caller's convenience; not read here.

    Return:
        ``(sid, token_num, mention_num, group_num, mention_tok_num, max_cluster_mention_num)``.
    """
    # Hold until the driver signals that all tasks have been queued.
    START_EVENT.wait()

    report_csv = os.path.join(src_dir, section_name, sid + ".csv")
    df = pd.read_csv(report_csv, index_col=0, na_filter=False)

    # One row per token, so the length of the token column is the document's token count.
    token_num = len(df.loc[:, "[sp]token"].to_list())

    coref_stats = resolve_mention_and_group_num(df, csv_col_name)
    mention_num, group_num, mention_tok_num, max_cluster_mention_num = coref_stats

    return sid, token_num, mention_num, group_num, mention_tok_num, max_cluster_mention_num



In [47]:
src_dir = csv_file_path

# Per-document results, appended in completion order; the lists stay aligned by position.
doc_list = []
token_num_list = []
mention_num_list = []
cluster_num_list = []
mention_tok_num_list = []
max_cluster_mention_num_list = []

for section_entry in os.scandir(src_dir):
    if section_entry.is_dir():
        print("Processing section:", section_entry.name)

        tasks = []
        scatter_data_list:list[dict] = []
        with ProcessPoolExecutor(max_workers=8) as executor:
            for report_entry in tqdm(os.scandir(section_entry.path)):
                if FILE_CHECKER.ignore(os.path.abspath(report_entry.path)):
                    continue
                # Drop only the literal ".csv" suffix. str.rstrip(".csv") removes any trailing
                # run of the characters '.', 'c', 's', 'v' and would mangle stems such as "abc".
                sid = report_entry.name.removesuffix(".csv")
                tasks.append(executor.submit(batch_processing, src_dir, section_entry.name, sid, report_entry.path))

            # Release the workers only after every task of this section has been submitted.
            START_EVENT.set()

            # Receive results from multiprocessing.
            for future in tqdm(as_completed(tasks), total=len(tasks)):
                sid, token_num, mention_num, group_num, mention_tok_num, max_cluster_mention_num = future.result()
                doc_list.append(sid)
                token_num_list.append(token_num)
                mention_num_list.append(mention_num)
                cluster_num_list.append(group_num)
                mention_tok_num_list.append(mention_tok_num)
                max_cluster_mention_num_list.append(max_cluster_mention_num)
            # Re-arm the gate so the next section's workers wait again.
            START_EVENT.clear()


Processing section: impression


846it [00:00, 15413.47it/s]
100%|██████████| 846/846 [00:01<00:00, 667.17it/s] 


Processing section: findings


846it [00:00, 11065.59it/s]
100%|██████████| 846/846 [00:01<00:00, 651.45it/s]


Number of documents

In [48]:
# One entry per processed report CSV; the same report id in two sections counts twice.
len(doc_list)

1692

Tokens / Document

In [49]:
# Mean number of "[sp]token" rows per document.
sum(token_num_list) / len(doc_list)

117.46926713947991

Mentions / Document

In [50]:
# Mean number of mentions per document; mentions of singleton clusters are not counted.
sum(mention_num_list) / len(doc_list)

3.186761229314421

Clusters / Document

In [51]:
# Mean number of coreference clusters per document; singleton clusters are not counted.
sum(cluster_num_list) / len(doc_list)

1.4781323877068557

Tokens / Mention

In [52]:
# Mean mention length in tokens, pooled over all counted mentions of all documents.
sum(mention_tok_num_list) / sum(mention_num_list)

2.841246290801187

Mentions / Cluster

In [53]:
# Mean number of mentions per counted cluster, pooled over all documents.
sum(mention_num_list) / sum(cluster_num_list)

2.15593762495002

Max Number of Mentions in Clusters

In [54]:
# Largest mention count of any single cluster across all documents.
max(max_cluster_mention_num_list)

7