# Init

In [33]:
import datasets
from datasets import load_dataset, Sequence, Image, DatasetDict, concatenate_datasets, Dataset
import os
import json
from tqdm import tqdm
import re
import copy
import pandas as pd
import numpy as np
from typing import Union, List
import ast
import linecache
from collections import defaultdict, Counter

In [21]:
def load_jsonline_from_file(file_path, line_idx):
    line = linecache.getline(file_path, line_idx + 1)
    return json.loads(line.strip()) if line else None

In [22]:
temp_dir = "/home/yuxiang/liao/workspace/arrg_preprocessing/outputs/interpret_sents/combined_results"


def save_to_temp(ds, version):
    temp_path = os.path.join(temp_dir, f"temp_v{version}")
    ds.save_to_disk(temp_path)
    return temp_path

# Load spacy results for reports

In [None]:
report_file = "/home/yuxiang/liao/workspace/arrg_preprocessing/outputs/interpret_reports/raw_reports.json"
with open(report_file, "r") as f:
    print(next(f))
    print(next(f))

In [None]:
new_ds = Dataset.from_json(report_file)

In [None]:
new_ds

# Load llm-sent-gen results

In [None]:
llm_file_dir = "/home/yuxiang/liao/workspace/arrg_preprocessing/outputs/interpret_sents/llm_split_sents"

with open(os.path.join(llm_file_dir, "llm_split_sents_1_of_3.json"), "r") as f:
    print(next(f))
    print(next(f))

In [None]:
doc_map = defaultdict(list)

for file_idx in range(1, 4):
    target_file_path = os.path.join(llm_file_dir, f"llm_split_sents_{file_idx}_of_3.json")
    with open(target_file_path, "r") as f:
        for line_idx, line in enumerate(tqdm(f)):
            doc = json.loads(line.strip())
            doc_map[doc["doc_key"]].append({"doc_key": doc["doc_key"], "split_sent_idx": int(doc["sent_idx"]), "file_path": target_file_path, "line_idx": line_idx})

In [None]:
def update_dataset(element):
    doc_key = element["doc_key"]

    sorted_doc_info_list = sorted(doc_map[doc_key], key=lambda x: x["split_sent_idx"])

    element["split_sents"] = []
    element["sent_idx_split_idx"] = []
    for info_dict in sorted_doc_info_list:

        # file_doc = {"doc_key":"train#0#impression","sent_idx":1,"original_sent":"STABLE SMALL LEFT PLEURAL EFFUSION.","split_sents":["Stable small left pleural effusion."]}
        file_doc = load_jsonline_from_file(info_dict["file_path"], info_dict["line_idx"])
        assert element["doc_key"] == file_doc["doc_key"]
        assert element["sents"][file_doc["sent_idx"]] == file_doc["original_sent"]

        for split_idx, split_sent in enumerate(file_doc["split_sents"]):
            element["split_sents"].append(split_sent)
            element["sent_idx_split_idx"].append((file_doc["sent_idx"], split_idx))

    return element


# temp_ds = new_ds.select(range(10))
new_ds = new_ds.map(update_dataset)

In [None]:
new_ds[1]

In [None]:
temp_path = save_to_temp(new_ds, version=1)
temp_path

# Load spacy results for sentences

In [4]:
temp_path = "/home/yuxiang/liao/workspace/arrg_preprocessing/outputs/interpret_sents/combined_results/temp_v1"
new_ds = Dataset.load_from_disk(temp_path)
new_ds

Dataset({
    features: ['doc_key', 'sent_toks', 'tok_char_indices', 'sents', 'sent_char_indices', 'split_sents', 'sent_idx_split_idx'],
    num_rows: 1136366
})

In [5]:
spacy_sent_file = "/home/yuxiang/liao/workspace/arrg_preprocessing/outputs/interpret_sents/raw/raw_sents.json"
with open(spacy_sent_file, "r") as f:
    print(next(f))
    print(next(f))

{"doc_key": "train#0#impression#0#0", "split_sent_text": "1.", "split_sent_toks": [["1", "."]], "tok_char_indices": [[[0, 1], [1, 2]]]}

{"doc_key": "train#0#impression#0#1", "split_sent_text": "Decreased bibasilar parenchymal opacities are now minimal.", "split_sent_toks": [["Decreased", "bibasilar", "parenchymal", "opacities", "are", "now", "minimal", "."]], "tok_char_indices": [[[0, 9], [10, 19], [20, 31], [32, 41], [42, 45], [46, 49], [50, 57], [57, 58]]]}



In [6]:
doc_map = defaultdict(list)

with open(spacy_sent_file, "r") as f:
    for line_idx, line in enumerate(tqdm(f)):
        doc = json.loads(line.strip())
        data_split, row_idx, section_name, orig_sent_idx, split_sent_idx = doc["doc_key"].split("#")
        doc_key = f"{data_split}#{row_idx}#{section_name}"

        doc_map[doc_key].append({"doc_key": doc_key, "sent_idx": int(orig_sent_idx), "split_sent_idx": int(split_sent_idx), "file_path": spacy_sent_file, "line_idx": line_idx})

5330591it [00:19, 267243.72it/s]


In [10]:
new_ds[0]

{'doc_key': 'train#0#findings',
 'sent_toks': [],
 'tok_char_indices': [],
 'sents': [],
 'sent_char_indices': [],
 'split_sents': [],
 'sent_idx_split_idx': []}

In [14]:
def update_dataset(element):
    element["split_sent_toks"] = []
    element["split_tok_char_indices"] = []
    if len(element["split_sents"]) == 0:
        return element

    sorted_doc_info_list = sorted(doc_map[element["doc_key"]], key=lambda x: (x["sent_idx"], x["split_sent_idx"]))
    for info_dict in sorted_doc_info_list:
        # file_doc = {"doc_key": "train#0#impression#0#1", "split_sent_text": "Decreased bibasilar parenchymal opacities are now minimal.", "split_sent_toks": [["Decreased", "bibasilar", "parenchymal", "opacities", "are", "now", "minimal", "."]], "tok_char_indices": [[[0, 9], [10, 19], [20, 31], [32, 41], [42, 45], [46, 49], [50, 57], [57, 58]]]}
        file_doc = load_jsonline_from_file(info_dict["file_path"], info_dict["line_idx"])
        data_split, row_idx, section_name, orig_sent_idx, split_sent_idx = file_doc["doc_key"].split("#")
        orig_sent_idx = int(orig_sent_idx)
        split_sent_idx = int(split_sent_idx)
        assert info_dict["sent_idx"] == orig_sent_idx and info_dict["split_sent_idx"] == split_sent_idx
        _doc_key = f"{data_split}#{row_idx}#{section_name}"
        assert element["doc_key"] == _doc_key
        _idx = element["sent_idx_split_idx"].index([orig_sent_idx, split_sent_idx])
        assert element["split_sents"][_idx] == file_doc["split_sent_text"]

        assert len(file_doc["split_sent_toks"]) == 1
        assert len(file_doc["tok_char_indices"]) == 1

        element["split_sent_toks"].append(file_doc["split_sent_toks"][0])
        element["split_tok_char_indices"].append(file_doc["tok_char_indices"][0])

    return element


# temp_ds = new_ds.select(range(10))
new_ds = new_ds.map(update_dataset)

Map: 100%|██████████| 1136366/1136366 [03:30<00:00, 5408.68 examples/s]


In [16]:
temp_path = save_to_temp(new_ds, version=2)
temp_path

Saving the dataset (0/6 shards):   0%|          | 0/1136366 [00:00<?, ? examples/s]

Saving the dataset (6/6 shards): 100%|██████████| 1136366/1136366 [00:01<00:00, 716053.02 examples/s]


'/home/yuxiang/liao/workspace/arrg_preprocessing/outputs/interpret_sents/combined_results/temp_v2'

# Load radlex results

## Load radlex ontology

In [23]:
class OntologyNode:
    def __init__(self, row_idx, class_id, class_name, df_row):
        self.row_idx = row_idx
        self.class_id = class_id
        self.class_name = class_name
        self.synonyms = [] if df_row["Synonyms"] == "" else df_row["Synonyms"].split("|")
        self.df_row = df_row

        # The tree structure is maintained by the parent and children attributes. Only one level of parent-child relationship is maintained.
        self.parent = []
        self.children = []
        self.is_root = False
        self.tree_level = None

        # It's parents from all levels
        self._all_parents = []

    def add_child(self, child):
        self.children.append(child)

    def add_parent(self, parent):
        self.parent.append(parent)

    @property
    def all_parents(self):
        if self.is_root:
            return []
        elif self._all_parents:
            return self._all_parents
        else:
            for parent in self.parent:
                # 避免父节点重复
                self._all_parents = set(parent.all_parents + [parent])
                self._all_parents = list(self._all_parents)
            return self._all_parents

    def __eq__(self, other):
        if isinstance(other, OntologyNode):
            return self.class_id == other.class_id
        else:
            return self.class_id == other

    def __hash__(self):
        return hash(self.class_id)

    def __str__(self):
        return f"{self.class_id}: {self.class_name}"

    def __repr__(self):
        return self.__str__()


def set_tree_level(curr_node, tree_level):
    curr_node.tree_level = tree_level
    for child in curr_node.children:
        set_tree_level(child, tree_level + 1)
    if not curr_node.children:
        return

In [24]:
def build_radlex_tree(df_csv):
    # Build a RadLex node list
    node_list = []
    root_node = None
    for idx, row in tqdm(df_csv.iterrows(), total=df_csv.shape[0], desc="Building RadLex tree"):
        ontology_node = OntologyNode(row_idx=idx, class_id=row["Class ID"], class_name=row["Preferred Label"], df_row=row)
        if row["Preferred Label"] in row["Class ID"]:
            ontology_node.class_name = row["http://radlex.org/RID/Preferred_Name_for_Obsolete"]
        node_list.append(ontology_node)

    # Resolve the node list and build a RadLex tree
    for node in tqdm(node_list, total=len(node_list), desc="Building RadLex tree"):
        df_row = node.df_row
        parent_ids = df_row["Parents"].split("|")
        for parent_id in parent_ids:
            parent_row_indices = df_csv.loc[df_csv["Class ID"] == parent_id].index
            if not parent_row_indices.empty:
                parent_row_idx = parent_row_indices[0]
                parent_node = node_list[parent_row_idx]
                assert parent_node.class_id == parent_id
                node.add_parent(parent_node)
                parent_node.add_child(node)
            else:
                # In radlex, http://radlex.org/RID/RID0 has parent http://www.w3.org/2002/07/owl#Thing.
                # However, the RID0 is already the root node in the RadLex ontology. We can safely ignore the owl#Thing.
                root_node = node
                node.is_root = True
                node.tree_level = 0

    return node_list, root_node

In [25]:
radlex_csv_path = "/home/yuxiang/liao/resources/bioportal/radlex/RADLEX.csv"
df_radlex_csv = pd.read_csv(radlex_csv_path, keep_default_na=False)
radlex_nodes, radlex_root_node = build_radlex_tree(df_radlex_csv)
radlex_nodes_dict = {node.class_id: node for node in radlex_nodes}
print(f"Number of RadLex nodes: {len(radlex_nodes)}")

# Tracing all parents of nodes
for node in radlex_nodes:
    node.all_parents

set_tree_level(radlex_root_node, tree_level=0)

Building RadLex tree: 100%|██████████| 46761/46761 [00:03<00:00, 14912.58it/s]
Building RadLex tree: 100%|██████████| 46761/46761 [01:10<00:00, 662.42it/s]


Number of RadLex nodes: 46761


## Analyse

In [18]:
radlex_file = "/home/yuxiang/liao/workspace/arrg_preprocessing/outputs/interpret_sents/radlex_annotate/radlex_ann.json"
with open(radlex_file, "r") as f:
    print(next(f))
    print(next(f))

{"doc_key": "train#0#impression#0#0", "sent_text": "1.", "radlex": []}

{"doc_key": "train#0#impression#0#1", "sent_text": "Decreased bibasilar parenchymal opacities are now minimal.", "radlex": [{"match_type": "lemma", "radlex_id": "http://radlex.org/RID/RID5733", "radlex_name": "decreasing", "matched_text": "Decreased", "char_indices": [0, 9], "tok_indices": [0, 1]}, {"match_type": "lower_text", "radlex_id": "http://radlex.org/RID/RID36044", "radlex_name": "decreased", "matched_text": "Decreased", "char_indices": [0, 9], "tok_indices": [0, 1]}, {"match_type": "fuzzy_lemma", "radlex_id": "http://radlex.org/RID/RID5978", "radlex_name": "parenchyma", "matched_text": "parenchymal", "char_indices": [20, 31], "tok_indices": [2, 3]}, {"match_type": "lemma", "radlex_id": "http://radlex.org/RID/RID28530", "radlex_name": "opacity", "matched_text": "opacities", "char_indices": [32, 41], "tok_indices": [3, 4]}, {"match_type": "lower_text", "radlex_id": "http://radlex.org/RID/RID5670", "radlex_na

In [57]:
fuzzy_match_dict = defaultdict(set)
fuzzy_match_count = Counter()

with open(radlex_file, "r") as f:
    for line_idx, line in enumerate(tqdm(f)):
        doc = json.loads(line.strip())
        data_split, row_idx, section_name, orig_sent_idx, split_sent_idx = doc["doc_key"].split("#")
        doc_key = f"{data_split}#{row_idx}#{section_name}"

        position_matches = defaultdict(list)
        for matched_info in doc["radlex"]:
            # matched_info = {"match_type": "fuzzy_lemma", "radlex_id": "http://radlex.org/RID/RID5978", "radlex_name": "parenchyma", "matched_text": "parenchymal", "char_indices": [20, 31], "tok_indices": [2, 3]}
            posi_id = "_".join(map(str, matched_info["tok_indices"]))
            position_matches[posi_id].append(matched_info)

        for matched_info in doc["radlex"]:
            posi_id = "_".join(map(str, matched_info["tok_indices"]))
            # 匹配逻辑：id = radlex_id+start+end 如果有exact match，就忽略fuzzy match。但没有考虑不同id的match情况。
            # 比如 hemithorax，即能exact match到 hemithorax，也能fuzzy match到 hemothorax
            # 我们这里仅分析某个span的所有match都是fuzzy_match
            if matched_info["match_type"] == "fuzzy_lemma" and all([i["match_type"] == "fuzzy_lemma" for i in position_matches[posi_id]]):
                fuzzy_match_dict[(matched_info["radlex_id"], matched_info["radlex_name"])].add(matched_info["matched_text"])
                fuzzy_match_count.update([(matched_info["radlex_id"], matched_info["radlex_name"])])

5330591it [00:44, 120335.25it/s]


In [60]:
len(fuzzy_match_count)

507

In [58]:
for k, v in fuzzy_match_count.most_common():
    print(k, v)
    print(fuzzy_match_dict[k])

('http://radlex.org/RID/RID38667', 'thinning') 100054
{'things', 'THINK', 'think', 'thinks', 'Then', 'this', 'thing', 'Thought', 'thought', 'THIS', 'This', 'Than', 'then', 'than', 'THAN', 'twin'}
('http://radlex.org/RID/RID10345', 'projection radiography') 91291
{'radiographs', 'Radiographs', 'radiographed', 'RADIOGRAPH', 'Radiograph', 'radiograph', 'projection radiograph', 'RADIOGRAPHS'}
('http://radlex.org/RID/RID9889', 'frontal belly of occipitofrontalis muscle') 39670
{'Frontal', 'FRONTAL', 'frontal'}
('http://radlex.org/RID/RID5978', 'parenchyma') 28131
{'PARENCHYMAL', 'parenchymal', 'parenchymas', 'Parenchymal'}
('http://radlex.org/RID/RID5045', 'degeneration') 20595
{'Degenerative', 'DEGENERATIVE', 'degenerative'}
('http://radlex.org/RID/RID34', 'possibly') 15753
{'possible', 'POSSIBLE', 'Possible'}
('http://radlex.org/RID/RID33', 'probably') 11808
{'probable', 'Protable', 'PROBABLE', 'Probable'}
('http://radlex.org/RID/RID5663', 'developmental') 10751
{'development', 'Developme

In [None]:
invalid_radlex_ids = set(
    [
        "http://radlex.org/RID/RID38667",  # thinning
        "http://radlex.org/RID/RID5022",  # stricture
        "http://radlex.org/RID/RID9889",  # frontalis
        "http://radlex.org/RID/RID28842",  # injection
    ]
)

invalid_radlex_text_pairs = {
    "": "",
}

## Process

In [None]:
temp_path = "/home/yuxiang/liao/workspace/arrg_preprocessing/outputs/interpret_sents/combined_results/temp_v2"
new_ds = Dataset.load_from_disk(temp_path)
new_ds

# Load cxrgraph results

# Load radcoref results