### Extracting annotations from XML file

In [1]:
import os
import xml.etree.ElementTree as ET
import numpy as np

# Directory tree that is walked recursively below to find ".xml" annotation files.
xml_root_dir = "LIDC-XML-only"

def extract_annotations(xml_file):
    """Read nodule outlines from one XML file, grouped by image SOP UID.

    Only elements in the ``http://www.nih.gov`` namespace are considered.
    Every ``unblindedReadNodule`` that has a ``characteristics`` element
    contributes one record per ``roi``; nodules without that element are
    skipped entirely.

    Args:
        xml_file: Anything accepted by ``ET.parse`` (a path or a file object).

    Returns:
        A dict mapping each ``imageSOP_UID`` to a list of records. Each record
        has ``'nodule_id'`` and ``'edge_map'`` (a list of ``(x, y)`` integer
        tuples) and, when at least one characteristic was present, a
        ``'characteristics'`` dict of integer scores keyed by tag name.
    """
    ns = {'ns': 'http://www.nih.gov'}
    by_image = {}
    document = ET.parse(xml_file).getroot()

    for session in document.findall('.//ns:readingSession', ns):
        for nodule in session.findall('.//ns:unblindedReadNodule', ns):
            nodule_id = nodule.find('ns:noduleID', ns).text
            ratings_node = nodule.find('ns:characteristics', ns)
            if ratings_node is None:
                # No characteristics element: ignore this nodule's outlines.
                continue

            # Strip the "{namespace}" prefix from each child tag to get its name.
            ratings = {
                child.tag.split('}')[1]: int(child.text)
                for child in ratings_node
            }

            for roi in nodule.findall('.//ns:roi', ns):
                sop_uid = roi.find('ns:imageSOP_UID', ns).text
                outline = [
                    (
                        int(point.find('ns:xCoord', ns).text),
                        int(point.find('ns:yCoord', ns).text),
                    )
                    for point in roi.findall('.//ns:edgeMap', ns)
                ]
                record = {'nodule_id': nodule_id, 'edge_map': outline}
                if ratings:
                    # The same ratings dict is shared by all ROIs of the nodule.
                    record['characteristics'] = ratings
                by_image.setdefault(sop_uid, []).append(record)

    return by_image

# Merge the per-file results into a single mapping: image UID -> list of
# annotation records gathered from every XML file under xml_root_dir.
all_annotations = {}

for subdir, _, files in os.walk(xml_root_dir):
    for file in files:
        if not file.endswith(".xml"):
            continue
        xml_file_path = os.path.join(subdir, file)
        annotations = extract_annotations(xml_file_path)
        for uid, annotation in annotations.items():
            # The same image UID may appear in several files; keep all records.
            all_annotations.setdefault(uid, []).extend(annotation)

### Sphericity Annotations

In [2]:
import os
import json
from PIL import Image

def process_dataset_sphericity(root_dir, output_file, all_annotations):
    """Write a COCO-style JSON file that labels nodules by sphericity score.

    Walks ``root_dir`` recursively and registers every ``.jpg`` file as an
    image. The file name without its extension is used as the key into
    ``all_annotations``; each matching record becomes one bounding-box
    annotation whose ``category_id`` is the record's ``'sphericity'`` score.

    Records with an empty ``'edge_map'`` or without a ``'sphericity'`` score
    (including records that carry no ``'characteristics'`` at all) are skipped
    instead of aborting the whole export.

    Args:
        root_dir: Directory tree to scan for ``.jpg`` images.
        output_file: Path of the JSON file to write.
        all_annotations: Mapping from image UID to a list of records holding
            an ``'edge_map'`` list of ``(x, y)`` points and, optionally, a
            ``'characteristics'`` dict of integer scores.
    """
    coco_dataset = {
        "images": [],
        "annotations": [],
        # NOTE(review): other LIDC references name score 5 "Round"; confirm
        # that "NA" is the intended label before relying on it.
        "categories": [
            {"id": 1, "name": "Linear"},
            {"id": 2, "name": "Ovoid/Linear"},
            {"id": 3, "name": "Ovoid"},
            {"id": 4, "name": "Ovoid/Round"},
            {"id": 5, "name": "NA"},
        ]
    }

    annotation_id = 0
    image_id = 0

    for subdir, _, files in os.walk(root_dir):
        for file in files:
            if not file.endswith(".jpg"):
                continue

            uid = os.path.splitext(file)[0]

            # Only the dimensions are needed, so close the image right away.
            with Image.open(os.path.join(subdir, file)) as img:
                width, height = img.size

            coco_dataset["images"].append({
                "id": image_id,
                "file_name": file,
                "width": width,
                "height": height
            })

            for annotation in all_annotations.get(uid, ()):
                edge_points = annotation.get('edge_map')
                score = annotation.get('characteristics', {}).get('sphericity')
                # Without outline points there is no box; without a score
                # there is no category. Skip rather than crash.
                if not edge_points or score is None:
                    continue

                xs, ys = zip(*edge_points)
                x_min, y_min = min(xs), min(ys)
                box_w, box_h = max(xs) - x_min, max(ys) - y_min
                coco_dataset["annotations"].append({
                    "id": annotation_id,
                    "image_id": image_id,
                    "category_id": score,
                    "bbox": [x_min, y_min, box_w, box_h],
                    "area": box_w * box_h,
                    "segmentation": [],
                    "iscrowd": 0
                })
                annotation_id += 1

            image_id += 1

    with open(output_file, 'w') as f:
        json.dump(coco_dataset, f)

# Build the split directories with os.path.join so the paths also resolve on
# platforms whose separator is not a backslash.
train_dir = os.path.join("CT", "train")
val_dir = os.path.join("CT", "val")
test_dir = os.path.join("CT", "test")

# Each split's COCO file is written next to that split's images.
train_output_file = os.path.join(train_dir, 'coco_dataset_train_sphericity.json')
val_output_file = os.path.join(val_dir, 'coco_dataset_val_sphericity.json')
test_output_file = os.path.join(test_dir, 'coco_dataset_test_sphericity.json')

process_dataset_sphericity(train_dir, train_output_file, all_annotations)
process_dataset_sphericity(val_dir, val_output_file, all_annotations)
process_dataset_sphericity(test_dir, test_output_file, all_annotations)

### Margin Annotations

In [3]:
import os
import json
from PIL import Image

def process_dataset_margin(root_dir, output_file, all_annotations):
    """Write a COCO-style JSON file that labels nodules by margin score.

    Every ``.jpg`` found under ``root_dir`` is listed as an image. Its file
    name without the extension is looked up in ``all_annotations``, and each
    record found there yields one bounding-box annotation whose
    ``category_id`` is the record's ``'margin'`` score.

    Records that have no outline points or no ``'margin'`` score (including
    records with no ``'characteristics'`` entry) are skipped so that one
    incomplete record does not abort the export.

    Args:
        root_dir: Directory tree to scan for ``.jpg`` images.
        output_file: Path of the JSON file to write.
        all_annotations: Mapping from image UID to a list of records holding
            an ``'edge_map'`` list of ``(x, y)`` points and, optionally, a
            ``'characteristics'`` dict of integer scores.
    """
    coco_dataset = {
        "images": [],
        "annotations": [],
        "categories": [
            {"id": 1, "name": "Poorly Defined"},
            {"id": 2, "name": "Near Poorly Defined"},
            {"id": 3, "name": "Medium Margin"},
            {"id": 4, "name": "Near Sharp"},
            {"id": 5, "name": "Sharp"},
        ]
    }

    annotation_id = 0
    image_id = 0

    for subdir, _, files in os.walk(root_dir):
        for file in files:
            if not file.endswith(".jpg"):
                continue

            uid = os.path.splitext(file)[0]

            # The image is opened only to read its size.
            with Image.open(os.path.join(subdir, file)) as img:
                width, height = img.size

            coco_dataset["images"].append({
                "id": image_id,
                "file_name": file,
                "width": width,
                "height": height
            })

            for annotation in all_annotations.get(uid, ()):
                edge_points = annotation.get('edge_map')
                score = annotation.get('characteristics', {}).get('margin')
                # A box needs at least one point and a category needs a score.
                if not edge_points or score is None:
                    continue

                xs, ys = zip(*edge_points)
                x_min, y_min = min(xs), min(ys)
                box_w, box_h = max(xs) - x_min, max(ys) - y_min
                coco_dataset["annotations"].append({
                    "id": annotation_id,
                    "image_id": image_id,
                    "category_id": score,
                    "bbox": [x_min, y_min, box_w, box_h],
                    "area": box_w * box_h,
                    "segmentation": [],
                    "iscrowd": 0
                })
                annotation_id += 1

            image_id += 1

    with open(output_file, 'w') as f:
        json.dump(coco_dataset, f)

# os.path.join keeps these paths valid on systems where "\\" is not the
# path separator; on Windows the resulting strings are unchanged.
train_dir = os.path.join("CT", "train")
val_dir = os.path.join("CT", "val")
test_dir = os.path.join("CT", "test")

# One margin-labelled COCO file per split, stored inside the split directory.
train_output_file = os.path.join(train_dir, 'coco_dataset_train_margin.json')
val_output_file = os.path.join(val_dir, 'coco_dataset_val_margin.json')
test_output_file = os.path.join(test_dir, 'coco_dataset_test_margin.json')

process_dataset_margin(train_dir, train_output_file, all_annotations)
process_dataset_margin(val_dir, val_output_file, all_annotations)
process_dataset_margin(test_dir, test_output_file, all_annotations)

### Texture Annotations

In [4]:
import os
import json
from PIL import Image

def process_dataset_annotation(root_dir, output_file, all_annotations):
    """Write a COCO-style JSON file that labels nodules by texture score.

    Despite the generic name, this function reads the ``'texture'``
    characteristic. It lists every ``.jpg`` under ``root_dir`` as an image,
    uses the file name without its extension as the key into
    ``all_annotations``, and emits one bounding-box annotation per record
    with the texture score as ``category_id``.

    Records whose ``'edge_map'`` is empty, or which have no ``'texture'``
    score (or no ``'characteristics'`` at all), are skipped instead of
    raising.

    Args:
        root_dir: Directory tree to scan for ``.jpg`` images.
        output_file: Path of the JSON file to write.
        all_annotations: Mapping from image UID to a list of records holding
            an ``'edge_map'`` list of ``(x, y)`` points and, optionally, a
            ``'characteristics'`` dict of integer scores.
    """
    coco_dataset = {
        "images": [],
        "annotations": [],
        "categories": [
            {"id": 1, "name": "Non-Solid/GGO"},
            {"id": 2, "name": "Non-Solid/Mixed"},
            {"id": 3, "name": "Part Solid/Mixed"},
            {"id": 4, "name": "Solid/Mixed"},
            {"id": 5, "name": "Solid"},
        ]
    }

    annotation_id = 0
    image_id = 0

    for subdir, _, files in os.walk(root_dir):
        for file in files:
            if not file.endswith(".jpg"):
                continue

            uid = os.path.splitext(file)[0]

            # Read the dimensions and release the file before doing more work.
            with Image.open(os.path.join(subdir, file)) as img:
                width, height = img.size

            coco_dataset["images"].append({
                "id": image_id,
                "file_name": file,
                "width": width,
                "height": height
            })

            for annotation in all_annotations.get(uid, ()):
                edge_points = annotation.get('edge_map')
                score = annotation.get('characteristics', {}).get('texture')
                # Incomplete records cannot form a labelled box; skip them.
                if not edge_points or score is None:
                    continue

                xs, ys = zip(*edge_points)
                x_min, y_min = min(xs), min(ys)
                box_w, box_h = max(xs) - x_min, max(ys) - y_min
                coco_dataset["annotations"].append({
                    "id": annotation_id,
                    "image_id": image_id,
                    "category_id": score,
                    "bbox": [x_min, y_min, box_w, box_h],
                    "area": box_w * box_h,
                    "segmentation": [],
                    "iscrowd": 0
                })
                annotation_id += 1

            image_id += 1

    with open(output_file, 'w') as f:
        json.dump(coco_dataset, f)

# Join the path components instead of hard-coding a backslash, so the split
# directories are found regardless of the platform's path separator.
train_dir = os.path.join("CT", "train")
val_dir = os.path.join("CT", "val")
test_dir = os.path.join("CT", "test")

# Texture-labelled COCO files; the "annotation" suffix in the names is kept.
train_output_file = os.path.join(train_dir, 'coco_dataset_train_annotation.json')
val_output_file = os.path.join(val_dir, 'coco_dataset_val_annotation.json')
test_output_file = os.path.join(test_dir, 'coco_dataset_test_annotation.json')

process_dataset_annotation(train_dir, train_output_file, all_annotations)
process_dataset_annotation(val_dir, val_output_file, all_annotations)
process_dataset_annotation(test_dir, test_output_file, all_annotations)

### Malignancy Annotations

In [5]:
import os
import json
from PIL import Image

def process_dataset_malignancy(root_dir, output_file, all_annotations):
    """Write a COCO-style JSON file that labels nodules by malignancy score.

    Scans ``root_dir`` recursively for ``.jpg`` files and records each as an
    image. The file name minus its extension selects the records in
    ``all_annotations``; each record produces one bounding-box annotation
    whose ``category_id`` is its ``'malignancy'`` score.

    A record is skipped, not treated as an error, when its ``'edge_map'`` is
    empty or when it has no ``'malignancy'`` score (which includes having no
    ``'characteristics'`` entry).

    Args:
        root_dir: Directory tree to scan for ``.jpg`` images.
        output_file: Path of the JSON file to write.
        all_annotations: Mapping from image UID to a list of records holding
            an ``'edge_map'`` list of ``(x, y)`` points and, optionally, a
            ``'characteristics'`` dict of integer scores.
    """
    coco_dataset = {
        "images": [],
        "annotations": [],
        "categories": [
            {"id": 1, "name": "Highly Unlikely"},
            {"id": 2, "name": "Moderately Unlikely"},
            {"id": 3, "name": "Indeterminate"},
            {"id": 4, "name": "Moderately Suspicious"},
            {"id": 5, "name": "Highly Suspicious"},
        ]
    }

    annotation_id = 0
    image_id = 0

    for subdir, _, files in os.walk(root_dir):
        for file in files:
            if not file.endswith(".jpg"):
                continue

            uid = os.path.splitext(file)[0]

            # Keep the file open only long enough to read width and height.
            with Image.open(os.path.join(subdir, file)) as img:
                width, height = img.size

            coco_dataset["images"].append({
                "id": image_id,
                "file_name": file,
                "width": width,
                "height": height
            })

            for annotation in all_annotations.get(uid, ()):
                edge_points = annotation.get('edge_map')
                score = annotation.get('characteristics', {}).get('malignancy')
                # No points means no box; no score means no category.
                if not edge_points or score is None:
                    continue

                xs, ys = zip(*edge_points)
                x_min, y_min = min(xs), min(ys)
                box_w, box_h = max(xs) - x_min, max(ys) - y_min
                coco_dataset["annotations"].append({
                    "id": annotation_id,
                    "image_id": image_id,
                    "category_id": score,
                    "bbox": [x_min, y_min, box_w, box_h],
                    "area": box_w * box_h,
                    "segmentation": [],
                    "iscrowd": 0
                })
                annotation_id += 1

            image_id += 1

    with open(output_file, 'w') as f:
        json.dump(coco_dataset, f)

# A literal backslash is only a separator on Windows; os.path.join produces
# the right form for whichever platform runs this.
train_dir = os.path.join("CT", "train")
val_dir = os.path.join("CT", "val")
test_dir = os.path.join("CT", "test")

# One malignancy-labelled COCO file per split, written into the split folder.
train_output_file = os.path.join(train_dir, 'coco_dataset_train_malignancy.json')
val_output_file = os.path.join(val_dir, 'coco_dataset_val_malignancy.json')
test_output_file = os.path.join(test_dir, 'coco_dataset_test_malignancy.json')

process_dataset_malignancy(train_dir, train_output_file, all_annotations)
process_dataset_malignancy(val_dir, val_output_file, all_annotations)
process_dataset_malignancy(test_dir, test_output_file, all_annotations)

### Spiculation Annotations

In [6]:
import os
import json
from PIL import Image

def process_dataset_spiculation(root_dir, output_file, all_annotations):
    """Write a COCO-style JSON file that labels nodules by spiculation score.

    All ``.jpg`` files under ``root_dir`` are listed as images. For each one,
    the file name without its extension is looked up in ``all_annotations``
    and every record found is converted to a bounding-box annotation whose
    ``category_id`` is the record's ``'spiculation'`` score.

    Records with an empty ``'edge_map'``, or lacking a ``'spiculation'`` score
    (for instance because they carry no ``'characteristics'``), are skipped
    so the export can finish.

    Args:
        root_dir: Directory tree to scan for ``.jpg`` images.
        output_file: Path of the JSON file to write.
        all_annotations: Mapping from image UID to a list of records holding
            an ``'edge_map'`` list of ``(x, y)`` points and, optionally, a
            ``'characteristics'`` dict of integer scores.
    """
    coco_dataset = {
        "images": [],
        "annotations": [],
        "categories": [
            {"id": 1, "name": "No Spiculation"},
            {"id": 2, "name": "Nearly No Spiculation"},
            {"id": 3, "name": "Medium Spiculation"},
            {"id": 4, "name": "Near Marked Spiculation"},
            {"id": 5, "name": "Marked Spiculation"},
        ]
    }

    annotation_id = 0
    image_id = 0

    for subdir, _, files in os.walk(root_dir):
        for file in files:
            if not file.endswith(".jpg"):
                continue

            uid = os.path.splitext(file)[0]

            # Nothing but the size is used from the image, so close it early.
            with Image.open(os.path.join(subdir, file)) as img:
                width, height = img.size

            coco_dataset["images"].append({
                "id": image_id,
                "file_name": file,
                "width": width,
                "height": height
            })

            for annotation in all_annotations.get(uid, ()):
                edge_points = annotation.get('edge_map')
                score = annotation.get('characteristics', {}).get('spiculation')
                # Skip records that cannot yield both a box and a category.
                if not edge_points or score is None:
                    continue

                xs, ys = zip(*edge_points)
                x_min, y_min = min(xs), min(ys)
                box_w, box_h = max(xs) - x_min, max(ys) - y_min
                coco_dataset["annotations"].append({
                    "id": annotation_id,
                    "image_id": image_id,
                    "category_id": score,
                    "bbox": [x_min, y_min, box_w, box_h],
                    "area": box_w * box_h,
                    "segmentation": [],
                    "iscrowd": 0
                })
                annotation_id += 1

            image_id += 1

    with open(output_file, 'w') as f:
        json.dump(coco_dataset, f)

# Compose the split paths with os.path.join rather than an embedded
# backslash, which is treated as part of the name on non-Windows systems.
train_dir = os.path.join("CT", "train")
val_dir = os.path.join("CT", "val")
test_dir = os.path.join("CT", "test")

# One spiculation-labelled COCO file per split, saved beside the images.
train_output_file = os.path.join(train_dir, 'coco_dataset_train_spiculation.json')
val_output_file = os.path.join(val_dir, 'coco_dataset_val_spiculation.json')
test_output_file = os.path.join(test_dir, 'coco_dataset_test_spiculation.json')

process_dataset_spiculation(train_dir, train_output_file, all_annotations)
process_dataset_spiculation(val_dir, val_output_file, all_annotations)
process_dataset_spiculation(test_dir, test_output_file, all_annotations)