# Init

In [1]:
import datasets
from datasets import load_dataset, Sequence, Image, DatasetDict, concatenate_datasets, Dataset
import os
import json
from tqdm import tqdm
import re
import copy
import pandas as pd
import numpy as np
from typing import Union, List
import ast
import linecache

  from .autonotebook import tqdm as notebook_tqdm


# Load Ontology

In [2]:
class OntologyNode:
    def __init__(self, row_idx, class_id, class_name, df_row):
        self.row_idx = row_idx
        self.class_id = class_id
        self.class_name = class_name
        self.synonyms = [] if df_row["Synonyms"] == "" else df_row["Synonyms"].split("|")
        self.df_row = df_row

        # The tree structure is maintained by the parent and children attributes. Only one level of parent-child relationship is maintained.
        self.parent = []
        self.children = []
        self.is_root = False
        self.tree_level = None

        # It's parents from all levels
        self._all_parents = []

    def add_child(self, child):
        self.children.append(child)

    def add_parent(self, parent):
        self.parent.append(parent)

    @property
    def all_parents(self):
        if self.is_root:
            return []
        elif self._all_parents:
            return self._all_parents
        else:
            for parent in self.parent:
                # 避免父节点重复
                self._all_parents = set(parent.all_parents + [parent])
                self._all_parents = list(self._all_parents)
            return self._all_parents

    def __eq__(self, other):
        if isinstance(other, OntologyNode):
            return self.class_id == other.class_id
        else:
            return self.class_id == other

    def __hash__(self):
        return hash(self.class_id)

    def __str__(self):
        return f"{self.class_id}: {self.class_name}"

    def __repr__(self):
        return self.__str__()


def set_tree_level(curr_node, tree_level):
    curr_node.tree_level = tree_level
    for child in curr_node.children:
        set_tree_level(child, tree_level + 1)
    if not curr_node.children:
        return

In [3]:
def build_radlex_tree(df_csv):
    # Build a RadLex node list
    node_list = []
    root_node = None
    for idx, row in tqdm(df_csv.iterrows(), total=df_csv.shape[0], desc="Building RadLex tree"):
        ontology_node = OntologyNode(row_idx=idx, class_id=row["Class ID"], class_name=row["Preferred Label"], df_row=row)
        if row["Preferred Label"] in row["Class ID"]:
            ontology_node.class_name = row["http://radlex.org/RID/Preferred_Name_for_Obsolete"]
        node_list.append(ontology_node)

    # Resolve the node list and build a RadLex tree
    for node in tqdm(node_list, total=len(node_list), desc="Building RadLex tree"):
        df_row = node.df_row
        parent_ids = df_row["Parents"].split("|")
        for parent_id in parent_ids:
            parent_row_indices = df_csv.loc[df_csv["Class ID"] == parent_id].index
            if not parent_row_indices.empty:
                parent_row_idx = parent_row_indices[0]
                parent_node = node_list[parent_row_idx]
                assert parent_node.class_id == parent_id
                node.add_parent(parent_node)
                parent_node.add_child(node)
            else:
                # In radlex, http://radlex.org/RID/RID0 has parent http://www.w3.org/2002/07/owl#Thing.
                # However, the RID0 is already the root node in the RadLex ontology. We can safely ignore the owl#Thing.
                root_node = node
                node.is_root = True
                node.tree_level = 0

    return node_list, root_node

In [None]:
radlex_csv_path = "/home/yuxiang/liao/resources/bioportal/radlex/RADLEX.csv"
df_radlex_csv = pd.read_csv(radlex_csv_path, keep_default_na=False)
radlex_nodes, radlex_root_node = build_radlex_tree(df_radlex_csv)
radlex_nodes_dict = {node.class_id: node for node in radlex_nodes}
print(f"Number of RadLex nodes: {len(radlex_nodes)}")

# Tracing all parents of nodes
for node in radlex_nodes:
    node.all_parents

set_tree_level(radlex_root_node, tree_level=0)

Building RadLex tree: 100%|██████████| 46761/46761 [00:01<00:00, 23456.20it/s]
Building RadLex tree: 100%|██████████| 46761/46761 [01:11<00:00, 654.77it/s]


Number of RadLex nodes: 46761


In [6]:
print(radlex_nodes[0])
print(radlex_nodes[0].parent)
print(radlex_nodes[0]._all_parents)

http://radlex.org/RID/RID35591: string-of-pearls sign of bowel
[http://radlex.org/RID/RID29023: imaging sign]
[http://radlex.org/RID/RID1: RadLex entity, http://radlex.org/RID/RID29023: imaging sign, http://radlex.org/RID/RID5: imaging observation, http://radlex.org/RID/RID0: RadLex ontology entity]


# Analyse

In [8]:
data_path = "/home/yuxiang/liao/workspace/arrg_preprocessing/outputs/interpret_sents/combined_results/interpret-mimic-cxr-sent"
ds = datasets.load_from_disk(data_path)

In [11]:
ds = ds.remove_columns(column_names=["original_text", "sent_toks", "tok_char_indices", "sents", "sent_char_indices"])

In [None]:
ds[]

{'doc_key': 'train#0#impression',
 'split_sents': ['1.',
  'Decreased bibasilar parenchymal opacities are now minimal.',
  'Stable small left pleural effusion.',
  '2.',
  'Feeding tube is again seen.',
  'Sternal plates are again seen.'],
 'split_sent_toks': [['1', '.'],
  ['Decreased',
   'bibasilar',
   'parenchymal',
   'opacities',
   'are',
   'now',
   'minimal',
   '.'],
  ['Stable', 'small', 'left', 'pleural', 'effusion', '.'],
  ['2', '.'],
  ['Feeding', 'tube', 'is', 'again', 'seen', '.'],
  ['Sternal', 'plates', 'are', 'again', 'seen', '.']],
 'split_sent_tok_char_indices': [[[0, 1], [1, 2]],
  [[0, 9],
   [10, 19],
   [20, 31],
   [32, 41],
   [42, 45],
   [46, 49],
   [50, 57],
   [57, 58]],
  [[0, 6], [7, 12], [13, 17], [18, 25], [26, 34], [34, 35]],
  [[0, 1], [1, 2]],
  [[0, 7], [8, 12], [13, 15], [16, 21], [22, 26], [26, 27]],
  [[0, 7], [8, 14], [15, 18], [19, 24], [25, 29], [29, 30]]],
 'sent_idx_split_idx': [[0, 0], [0, 1], [1, 0], [2, 0], [2, 1], [2, 2]],
 'radlex