### Extracting annotations from XML file

In [1]:
import os
import xml.etree.ElementTree as ET
import numpy as np

# Directory that is walked recursively to find the ".xml" annotation files.
xml_root_dir = "LIDC-XML-only"

def extract_annotations(xml_file):
    """Collect nodule outlines from one annotation XML file, keyed by image.

    Every ``unblindedReadNodule`` inside every ``readingSession`` (both in
    the ``http://www.nih.gov`` namespace) is visited.  Nodules that have no
    ``characteristics`` element are skipped entirely.

    Args:
        xml_file: Path or file object accepted by ``ET.parse``.

    Returns:
        A dict mapping each ``imageSOP_UID`` text to a list of records.
        Each record has ``'nodule_id'`` (str) and ``'edge_map'`` (list of
        ``(x, y)`` int tuples); ``'characteristics'`` (tag name -> int) is
        added only when the nodule's characteristics element has children.

    Raises:
        AttributeError: If a required child element (``noduleID``,
            ``imageSOP_UID``, ``xCoord``, ``yCoord``) is missing.
        TypeError, ValueError: If a characteristic or coordinate text
            cannot be converted with ``int``.
    """
    ns = {'ns': 'http://www.nih.gov'}
    document_root = ET.parse(xml_file).getroot()
    by_image = {}

    for session in document_root.findall('.//ns:readingSession', ns):
        for nodule in session.findall('.//ns:unblindedReadNodule', ns):
            nodule_id = nodule.find('ns:noduleID', ns).text

            traits_node = nodule.find('ns:characteristics', ns)
            if traits_node is None:
                continue

            # Tags look like "{namespace}name"; keep only the bare name.
            traits = {
                child.tag.split('}')[1]: int(child.text)
                for child in traits_node
            }

            for roi in nodule.findall('.//ns:roi', ns):
                sop_uid = roi.find('ns:imageSOP_UID', ns).text
                outline = [
                    (
                        int(point.find('ns:xCoord', ns).text),
                        int(point.find('ns:yCoord', ns).text),
                    )
                    for point in roi.findall('.//ns:edgeMap', ns)
                ]

                record = {
                    'nodule_id': nodule_id,
                    'edge_map': outline
                }
                if traits:
                    record['characteristics'] = traits
                by_image.setdefault(sop_uid, []).append(record)

    return by_image

# Merged lookup across every XML file: image SOP UID -> list of records.
all_annotations = {}

for subdir, _, files in os.walk(xml_root_dir):
    for file in files:
        if not file.endswith(".xml"):
            continue
        xml_file_path = os.path.join(subdir, file)
        annotations = extract_annotations(xml_file_path)
        for uid, annotation in annotations.items():
            # `annotation` is the list of records this file holds for `uid`.
            all_annotations.setdefault(uid, []).extend(annotation)

### Converting DICOM to JPEG

In [2]:
def apply_windowing(dicom_img, window_level, window_width):
    """Window a DICOM image's rescaled pixel values into the 0-255 range.

    Args:
        dicom_img: Object exposing ``pixel_array``, ``RescaleSlope`` and
            ``RescaleIntercept``.
        window_level: Centre of the intensity window.
        window_width: Full width of the intensity window.

    Returns:
        The rescaled pixel values clipped to
        ``[window_level - window_width / 2, window_level + window_width / 2]``
        and mapped linearly onto 0.0-255.0 (not yet cast to an integer type).
    """
    # Raw stored values -> Hounsfield Units via the rescale slope/intercept.
    raw_pixels = dicom_img.pixel_array
    hounsfield = raw_pixels * dicom_img.RescaleSlope + dicom_img.RescaleIntercept

    # Bounds of the window.
    half_width = window_width / 2
    lower_bound = window_level - half_width
    upper_bound = window_level + half_width

    clipped = np.clip(hounsfield, lower_bound, upper_bound)

    # Stretch the window onto the 0-255 grayscale range.
    span = upper_bound - lower_bound
    return (clipped - lower_bound) / span * 255.0

In [3]:
import os
import json
import pydicom
import numpy as np
from PIL import Image
import random

# Root of the DICOM tree.  Built with os.path.join rather than a literal
# containing "\L": that backslash is an invalid escape sequence in a normal
# string literal and hard-codes the Windows separator.  On Windows the
# resulting path is the same as before.
root_dir = os.path.join("manifest-1600709154662", "LIDC-IDRI")
output_dir = "CT"

# One output folder per dataset split.
train_dir = os.path.join(output_dir, "train")
val_dir = os.path.join(output_dir, "val")
test_dir = os.path.join(output_dir, "test")

os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Gather every ".dcm" file found anywhere below root_dir.
dcm_files = []
for subdir, _, files in os.walk(root_dir):
    for file in files:
        if file.endswith(".dcm"):
            dcm_files.append(os.path.join(subdir, file))

# Shuffle, then split 80% / 10% / remainder into train / val / test.
# NOTE(review): the shuffle is unseeded, so the split differs between runs,
# and it is done per file, so files from the same folder can end up in
# different splits -- confirm both are intended.
random.shuffle(dcm_files)
total_files = len(dcm_files)
train_split = int(0.8 * total_files)
val_split = int(0.1 * total_files)

train_files = dcm_files[:train_split]
val_files = dcm_files[train_split:train_split + val_split]
test_files = dcm_files[train_split + val_split:]


def process_and_save(files, output_subdir):
    """Export the annotated DICOM files in *files* as grayscale JPEGs.

    A file is exported only when the dataset read from it has ``PixelData``
    and its ``SOPInstanceUID`` is a key of the module-level
    ``all_annotations``.  Pixels are windowed with level -500 and width
    1500; if the resulting array has more than two dimensions only its
    first entry is kept.  The image is written to
    ``<output_subdir>/<SOPInstanceUID>.jpg``.

    Args:
        files: Iterable of DICOM file paths.
        output_subdir: Existing directory that receives the JPEG files.
    """
    for dicom_path in files:
        dataset = pydicom.dcmread(dicom_path)
        if not hasattr(dataset, 'PixelData'):
            continue

        uid = dataset.SOPInstanceUID
        if uid not in all_annotations:
            continue

        windowed = apply_windowing(dataset, -500, 1500)
        if windowed.ndim > 2:
            # Keep only the first entry of a multi-dimensional array.
            windowed = windowed[0]

        grayscale = Image.fromarray(np.uint8(windowed), 'L')
        target_path = os.path.join(output_subdir, f"{uid}.jpg")
        grayscale.save(target_path)


# Export each split into its own output folder, in train / val / test order.
for split_files, split_dir in (
    (train_files, train_dir),
    (val_files, val_dir),
    (test_files, test_dir),
):
    process_and_save(split_files, split_dir)