<a href="https://colab.research.google.com/github/zaaachos/Thesis-Diagnostic-Captioning/blob/main/notebooks/IUxray_load.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import os
import random
import subprocess
import xml.etree.ElementTree as ET
from shutil import rmtree

import numpy

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Root folder for all dataset files; later cells build paths by plain string
# concatenation (baseDir + "iu_xray/..."), so the trailing slash is required.
baseDir = "/content/drive/MyDrive/datasets/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def split_cases(reports_images, reports_text, keys, filename):
    """Write the image/caption pairs of the selected reports to a TSV file.

    Args:
        reports_images: mapping of report key -> iterable of image names.
        reports_text: mapping of report key -> caption text of that report.
        keys: report keys to include, in the order they should be processed.
        filename: path of the output file; it is created or overwritten.

    Each output line is ``<image>\\t<caption>``. An image that occurs under
    several selected reports is written once, at the position of its first
    occurrence, with the caption of the last report that listed it.
    """
    # Built before the file is opened, so a missing key fails without
    # creating or truncating the output file.
    caption_by_image = {
        image_name: reports_text[report_key]
        for report_key in keys
        for image_name in reports_images[report_key]
    }

    with open(filename, "w") as output_file:
        output_file.writelines(
            image_name + "\t" + caption + "\n"
            for image_name, caption in caption_by_image.items()
        )

In [None]:
# (Re)create the dataset folder so that every run starts from an empty tree.
try:
    rmtree(baseDir+"iu_xray/")
except FileNotFoundError:
    # First run: there is nothing to delete. Any other failure (permissions,
    # I/O error, a user interrupt) is allowed to propagate, because the
    # makedirs call below would fail on the leftover folder anyway and would
    # hide the real cause.
    pass
os.makedirs(baseDir+"iu_xray/")

# create folder for images
os.makedirs(baseDir+"iu_xray/iu_xray_images/")

In [None]:
# Download the two NLMCXR archives into ./iu_xray/ (relative to the current
# working directory, not under baseDir); the extraction cell reads them from there.
# check=True raises CalledProcessError when wget exits with a non-zero status,
# so a failed download stops the notebook here instead of surfacing later as a
# missing or truncated archive.
for archive_url in (
    "https://openi.nlm.nih.gov/imgs/collections/NLMCXR_png.tgz",      # PNG images
    "https://openi.nlm.nih.gov/imgs/collections/NLMCXR_reports.tgz",  # reports
):
    subprocess.run(["wget", "-P", "iu_xray/", archive_url], check=True)

0

In [None]:
# Extract the downloaded archives into the dataset folder under baseDir.
# Arguments are passed as a list (no shell), so a baseDir containing spaces or
# shell metacharacters is handled correctly; check=True raises
# CalledProcessError if tar fails instead of silently continuing.
for archive_path, target_dir in (
    ("./iu_xray/NLMCXR_png.tgz", baseDir+"iu_xray/iu_xray_images/"),
    ("./iu_xray/NLMCXR_reports.tgz", baseDir+"iu_xray/"),
):
    subprocess.run(["tar", "-xzf", archive_path, "-C", target_dir], check=True)

0

In [None]:
# Folder holding the extracted report XML files, and its entries in sorted
# order; the following cells parse these files and build the dataset TSV.
reports_path = baseDir+"iu_xray/ecgen-radiology"
reports = sorted(os.listdir(reports_path))

In [None]:
# Bookkeeping: reports that are skipped or only partially usable.
reports_with_no_image = []
reports_with_empty_sections = []
reports_with_no_impression = []
reports_with_no_findings = []

# Per-image results, keyed by "<image id>.png".
images_captions = {}
images_major_tags = {}
images_auto_tags = {}
# Per-report results, keyed by the report file name.
reports_with_images = {}
text_of_reports = {}

for report in reports:
    tree = ET.parse(os.path.join(reports_path, report))
    root = tree.getroot()
    img_ids = []
    # find the images of the report
    images = root.findall("parentImage")
    # if there aren't any ignore the report
    if len(images) == 0:
        reports_with_no_image.append(report)
        continue

    # Reset for every report. Without this, a report lacking a FINDINGS or
    # IMPRESSION section would silently reuse the text left over from the
    # previous report (or raise NameError if it is the first one processed).
    findings = None
    impression = None

    sections = root.find("MedlineCitation").find("Article").find("Abstract").findall("AbstractText")
    # find impression and findings sections
    for section in sections:
        label = section.get("Label")
        if label == "FINDINGS":
            findings = section.text
        if label == "IMPRESSION":
            impression = section.text

    if impression is None and findings is None:
        reports_with_empty_sections.append(report)
        continue

    # Build the caption from whichever sections carry text; when both do,
    # the impression comes first.
    if impression is None:
        reports_with_no_impression.append(report)
        caption = findings
    elif findings is None:
        reports_with_no_findings.append(report)
        caption = impression
    else:
        caption = impression + " " + findings

    # get the MESH tags
    tags = root.find("MeSH")
    major_tags = []
    auto_tags = []
    if tags is not None:
        major_tags = [t.text for t in tags.findall("major")]
        auto_tags = [t.text for t in tags.findall("automatic")]

    # Every image of the report shares the report's caption and tags.
    for image in images:
        iid = image.get("id") + ".png"
        images_captions[iid] = caption
        img_ids.append(iid)
        images_major_tags[iid] = major_tags
        images_auto_tags[iid] = auto_tags

    reports_with_images[report] = img_ids
    text_of_reports[report] = caption

print("Found", len(reports_with_no_image), "reports with no associated image")
print("Found", len(reports_with_empty_sections), "reports with empty Impression and Findings sections")
print("Found", len(reports_with_no_impression), "reports with no Impression section")
print("Found", len(reports_with_no_findings), "reports with no Findings section")

print("Collected", len(images_captions), "image-caption pairs")


Found 104 reports with no associated image
Found 25 reports with empty Impression and Findings sections
Found 6 reports with no Impression section
Found 489 reports with no Findings section
Collected 7430 image-caption pairs


In [None]:
# Dump every image/caption pair as one tab-separated line.
with open(baseDir+"iu_xray/iu_xray.tsv", "w") as output_file:
    for image_id, caption in images_captions.items():
        output_file.write(image_id + "\t" + caption + "\n")

# Safer JSON storing
for json_relpath, mapping in (
    ("iu_xray/iu_xray_captions.json", images_captions),
    ("iu_xray/iu_xray_major_tags.json", images_major_tags),
    ("iu_xray/iu_xray_auto_tags.json", images_auto_tags),
):
    with open(baseDir+json_relpath, "w") as output_file:
        output_file.write(json.dumps(mapping))

# perform a case based split: reports (not individual images) are shuffled
# with a fixed seed, so all images of one report end up in the same split
random.seed(42)
keys = list(reports_with_images)
random.shuffle(keys)

# first 90% of the shuffled reports -> train, remainder -> test
train_split = int(numpy.floor(len(reports_with_images) * 0.9))
train_keys, test_keys = keys[:train_split], keys[train_split:]

train_path = baseDir+"iu_xray/train_images.tsv"
test_path = baseDir+"iu_xray/test_images.tsv"

for split_keys, split_path in ((train_keys, train_path), (test_keys, test_path)):
    split_cases(reports_with_images, text_of_reports, split_keys, split_path)

In [None]:
# Show what ended up in the dataset folder (displayed as the cell's output).
os.listdir(os.path.join(baseDir, "iu_xray"))

['iu_xray_images',
 'ecgen-radiology',
 'iu_xray.tsv',
 'iu_xray_captions.json',
 'iu_xray_major_tags.json',
 'iu_xray_auto_tags.json',
 'train_images.tsv',
 'test_images.tsv']