In [1]:
import os
import sys
from pathlib import Path

# The notebook lives one level below the repository root; put the root on sys.path
# so the project-local `utils` and `dataset` packages import.
PROJ_ROOT = Path.cwd().resolve().parent
# Guarded so that re-running this cell does not pile up duplicate sys.path entries.
if str(PROJ_ROOT) not in sys.path:
    sys.path.append(str(PROJ_ROOT))

import pandas as pd
import numpy as np
import pydicom
pydicom.config.settings.reading_validation_mode = pydicom.config.IGNORE

import plistlib

%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
from ipywidgets import interact

from utils.functions import load_config, inspect_xml_file
from dataset.common import load_patient_data, load_scan
from dataset.helper import scan_unique_roi_names

In [2]:
# Project configuration; later cells read cfg.paths.raw_data and cfg.paths.processed_data.
cfg = load_config()

# Make Patient Image-XML Matching Table

## Dicom Files

### Generate Structured Data from DICOM

In [3]:
# Run this cell to generate patient data
# patient_df = GeneratePatientData(cfg.paths.raw_data, saveFileName='patient_data_w_metadata.csv')

In [4]:
# Load the matched patient table. os.path.join builds a valid path whether or not
# cfg.paths.processed_data ends with a path separator, unlike bare string concatenation.
patient_df = load_patient_data(os.path.join(cfg.paths.processed_data, 'coca_matched_data.csv'))
patient_df.columns
# patient_df

Index(['PID', 'SID', 'FolderList', 'Status', 'NumSlices', 'RescaleSlope',
       'RescaleIntercept', 'PixelRepresentation', 'MassCalibrationFactor',
       'PixelSpacing_X', 'PixelSpacing_Y', 'Z_Spacing', 'SliceThickness',
       'Origin_X', 'Origin_Y', 'Origin_Z', 'Direction', 'ConvolutionKernel',
       'CardiacPhase', 'Manufacturer', 'KVP', 'HasAnnotation', 'LabelPath',
       'TargetClasses', 'NumAnnotatedSlices'],
      dtype='object')

In [5]:
# PIDs that occur on more than one row: `duplicated()` flags every repeat after the
# first occurrence, and `unique()` collapses those flags to one entry per PID.
is_repeat_pid = patient_df['PID'].duplicated()
duplicated_patients = patient_df.loc[is_repeat_pid, 'PID'].unique()
print("Number of duplicated patients:", len(duplicated_patients))
print(duplicated_patients)

Number of duplicated patients: 30
[ 78 120 135 146 155 156 165 189 192 194 228 276 358 388 398 417 435 453
 493 513 545 607 638 641 684 685 700 726 741 763]


In [6]:
# one_dicom = pydicom.dcmread(cfg.paths.raw_data + patient_df['FolderList'][0][0])
# one_dicom

### Example - CT Images

In [7]:
# Load one example scan (entry 3 of `FolderList`). `temp` is the module-level series
# that `browse_images` and the slider cell below read, so the name must stay in sync.
# NOTE(review): elements are presumably pydicom datasets (they expose `.pixel_array`
# and `.InstanceNumber`) — confirm against `load_scan`.
temp = load_scan(patient_df['FolderList'][3])

In [8]:
def browse_images(idx):
    """Display slice ``idx`` of the module-level scan ``temp``.

    Draws the slice's ``pixel_array`` in grayscale on a 5x5-inch figure with the
    axes hidden, titled with the slice's ``InstanceNumber`` and the total number
    of slices in ``temp``.

    Args:
        idx: Position of the slice within ``temp``.
    """
    current_slice = temp[idx]
    _, ax = plt.subplots(figsize=(5, 5))
    ax.imshow(current_slice.pixel_array, cmap='gray')
    ax.set_title(f"Instance: {current_slice.InstanceNumber} || Total: {len(temp)}")
    ax.axis('off')
    plt.show()

In [9]:
# Slider over every valid slice index of `temp`. The trailing `;` stops the cell from
# echoing the function object that `interact` returns; the widget itself still renders.
interact(browse_images, idx=(0, len(temp)-1));

interactive(children=(IntSlider(value=28, description='idx', max=56), Output()), _dom_classes=('widget-interac…

<function __main__.browse_images(idx)>

## XML Files

In [10]:
# Index every annotation XML under <raw_data>/calcium_xml by the integer encoded in
# its file name (the part before the first '.').
XMLBASEPATH = os.path.join(cfg.paths.raw_data, 'calcium_xml')
# os.listdir returns entries in arbitrary order; sort so that anything iterating
# all_xml_paths sees the files in the same order on every run and machine.
all_xml_paths = [
    os.path.join(XMLBASEPATH, xml_name)
    for xml_name in sorted(os.listdir(XMLBASEPATH))
    if xml_name.endswith('.xml')
]
# basename already strips the directory part, so no further '/' split is needed.
patient_xml_dict = {
    int(os.path.basename(xml_path).split('.')[0]): xml_path
    for xml_path in all_xml_paths
}
len(patient_xml_dict)
# tempXML = patient_xml_dict[786]

451

In [12]:
# Scan every XML for ROI names and list the distinct names found.
# NOTE(review): `roi_name_counter` is presumably a Counter/dict keyed by ROI name — confirm
# against `scan_unique_roi_names`.
roi_name_counter = scan_unique_roi_names(all_xml_paths, verbose=True)
list(roi_name_counter)

  0%|          | 0/451 [00:00<?, ?it/s]


✅ Scan Complete.
❌ Failed files: 0


['Right Coronary Artery',
 'Left Anterior Descending Artery',
 'Left Circumflex Artery',
 'Left Coronary Artery',
 '1',
 '555614876',
 '555831064',
 'Unnamed']

In [None]:
# for pid, file in patient_xml_dict.items():
#     with open(file, 'rb') as f:
#         xml = plistlib.load(f)

#     for image_idx, image in enumerate(xml['Images']):
#         ROIs = image['ROIs']
#         for r_idx, roi in enumerate(ROIs):
#             num_points = roi.get('NumberOfPoints', 0)
#             points_px_raw = roi.get('Point_px', [])

#             if num_points == 0:
#                 print("[num] patient", pid, 'Image', image_idx, 'ROI', r_idx)
#             if len(points_px_raw) == 0:
#                 print("[px] patient", pid, 'Image', image_idx, 'ROI', r_idx)

# Load Metadata

In [14]:
# Read the matched table as a plain DataFrame. os.path.join builds a valid path whether
# or not cfg.paths.processed_data ends with a path separator.
metadata = pd.read_csv(os.path.join(cfg.paths.processed_data, 'coca_matched_data.csv'))
# metadata

In [15]:
# Count rows of the matched table per `HasAnnotation` value.
metadata['HasAnnotation'].value_counts()

HasAnnotation
True     466
False    352
Name: count, dtype: int64

In [16]:
# Sum of `NumAnnotatedSlices` over every row of the matched table.
metadata['NumAnnotatedSlices'].sum()

np.int64(3775)