In [2]:
from bs4 import BeautifulSoup
from collections import defaultdict

import numpy as np
import os
import pandas as pd
from posixpath import join
from tqdm.notebook import tqdm


import iuchest.reports as iureports


In [3]:
# Root folder of the dataset; the parsing cell below reads its "text" (XML reports)
# and "image" subfolders.
base_path = "/mnt/datasets/uc5/std-dataset"  # subfolders image, text
# Output folder for the tables derived from the reports (reports_raw.tsv is written here).
out_fld = "/mnt/datasets/uc5/meta/iuchest"

In [None]:
def filename_from_path(path, keep_extension=True):
    """Return the last component of ``path``.

    Args:
        path: file path (absolute or relative).
        keep_extension: when False, the extension (as split by
            ``os.path.splitext``) is removed from the returned name.

    Returns:
        str: the file name, with or without its extension.
    """
    name = os.path.basename(path)
    return name if keep_extension else os.path.splitext(name)[0]
    
def parse_id(soup):
    """Extract the report identifier from a parsed report.

    The identifier is the ``id`` attribute of the first ``pmcid``, ``iuxrid`` or
    ``uid`` element, tried in that order. ``pmcid`` and ``iuxrid`` are preferred
    because they are plain integers, while ``uid`` is prefixed with 'CXR'
    (example: pmcid=3700, iuxrid=3700, uid=CXR3700). An element whose ``id``
    attribute is missing or empty is skipped and the next key is tried.

    Args:
        soup: parsed report. Calling ``soup(tag_name)`` must return the list of
            matching elements, each exposing ``get('id')``.

    Returns:
        dict: ``{"id": <identifier>}``.

    Raises:
        AssertionError: if none of the candidate elements has a non-empty id.
    """
    for key in ('pmcid', 'iuxrid', 'uid'):
        matches = soup(key)
        if not matches:
            continue
        # matches looks like [<pmcid id="3315"></pmcid>]:
        # take the first element and read the value of its 'id' attribute (3315)
        candidate = matches[0].get('id')
        if candidate:
            return {"id": candidate}
    # raised explicitly (instead of a bare assert) so the check also runs under `python -O`
    raise AssertionError(
        "no report id found: none of pmcid, iuxrid, uid has a non-empty 'id' attribute"
    )

def parse_medical_texts(soup):
    """Collect the textual sections of a report.

    Only the ``abstracttext`` elements whose ``label`` attribute (compared
    case-insensitively) is one of impression, indication, findings or
    comparison are kept.

    Args:
        soup: parsed report exposing ``soup.abstract``.

    Returns:
        dict: lowercase label -> text of the section. Empty when the report has
        no ``abstract`` element. Elements without a ``label`` are ignored.
    """
    valid_labels = {"impression", "indication", "findings", "comparison"}
    abstract = soup.abstract
    if abstract is None:
        # report without an abstract: nothing to extract
        return {}

    sections = {}
    for abstract_text in abstract.find_all('abstracttext'):
        # get() returns None when the attribute is absent: treat it as an empty label
        label = (abstract_text.get('label') or '').lower()
        if label in valid_labels:
            sections[label] = abstract_text.text
    return sections

def parse_mesh_terms(soup):
    """Read the manually assigned MeSH terms of a report.

    Args:
        soup: parsed report exposing ``soup.mesh``.

    Returns:
        dict: may contain "major_mesh" and/or "minor_mesh", each a list of the
        non-empty texts of the ``major`` / ``minor`` elements. A key is present
        only when at least one element of that kind exists; the dict is empty
        when the report has no ``mesh`` element.
    """
    mesh = soup.mesh
    if not mesh:
        return {}

    terms = {}
    for key, tag_name in (("major_mesh", "major"), ("minor_mesh", "minor")):
        elements = mesh.find_all(tag_name)
        if elements:
            terms[key] = [element.text for element in elements if element.text]
    return terms

def parse_automatic_terms(soup):
    """Read the automatically assigned terms of a report.

    Args:
        soup: parsed report exposing ``soup.mesh``.

    Returns:
        dict: ``{"auto_term": [...]}`` with the non-empty texts of the
        ``automatic`` elements found under ``mesh``. The list is empty when the
        report has no ``mesh`` element or no ``automatic`` element.
    """
    mesh = soup.mesh
    automatic = mesh.find_all('automatic') if mesh else []
    return {"auto_term": [element.text for element in automatic if element.text]}

def parse_images(soup):
    """Describe the images referenced by a report.

    Args:
        soup: parsed report; every ``parentimage`` element is one image.

    Returns:
        list[dict]: one dict per image, with "image_caption" (only when the
        element has a caption) and "image_filename".

    Raises:
        ValueError: if a ``parentimage`` element has no ``url`` child, since the
            image file name cannot be derived.
    """
    images = []
    for img in soup.find_all('parentimage'):
        if not img.url:
            # Fail with an exception rather than exit(): inside an IPython/Jupyter
            # kernel exit() requests a shutdown instead of reliably stopping the
            # running code, so a record without "image_filename" would be returned.
            raise ValueError('FATAL: NO img.url')

        record = {}
        if img.caption:
            record["image_caption"] = img.caption.text
        url_path = img.url.text  # this is an absolute path
        stem = filename_from_path(url_path, keep_extension=False)
        # dataset contains png images, but paths in reports point to (old) jpeg versions
        record["image_filename"] = stem + '.png'
        images.append(record)
    return images


def parse_single_report(filepath, verbose=False):
    """Parse one XML report file into a flat record.

    Args:
        filepath: path of the XML report, read as UTF-8.
        verbose: currently unused.

    Returns:
        dict: the merged outputs of parse_id, parse_medical_texts,
        parse_mesh_terms and parse_automatic_terms (in this order), plus
        "image_filename" (list of image file names) and "filename" (base name
        of ``filepath``).
    """
    with open(filepath, "r", encoding="utf-8") as report_file:
        content = report_file.read()
    soup = BeautifulSoup(content, "lxml")

    record = {}
    # later parsers overwrite keys of earlier ones, so the order matters
    for parser in (parse_id, parse_medical_texts, parse_mesh_terms, parse_automatic_terms):
        record.update(parser(soup))

    record["image_filename"] = [image["image_filename"] for image in parse_images(soup)]
    record["filename"] = os.path.basename(filepath)
    return record

def parse_reports(txt_fld, ext="xml", verbose=False, dev=False):
    """Parse all the report files found in a folder.

    Args:
        txt_fld: folder containing the report files.
        ext: only file names ending with this string are parsed; None selects
            every file in the folder.
        verbose: currently unused.
        dev: currently unused.

    Returns:
        list[dict]: one record per file, as returned by parse_single_report, in
        the order given by ``os.listdir``.
    """
    report_paths = [
        join(txt_fld, name)
        for name in os.listdir(txt_fld)
        if ext is None or name.endswith(ext)
    ]
    # a list (not a generator) is handed to tqdm so that the total is known
    return [parse_single_report(path) for path in tqdm(report_paths)]


reports_fld = join(base_path, "text")
images_fld = join(base_path, "image")

# parse every report, then build a table indexed (and sorted) by report id
reports = parse_reports(reports_fld)
reports = pd.DataFrame.from_records(reports).set_index("id").sort_index()

reports["n_images"] = reports["image_filename"].apply(len)
# parse_mesh_terms omits "major_mesh" for reports without major terms, which leaves
# NaN in the column: count those reports as having 0 terms instead of failing in len()
reports["n_mesh_terms"] = reports["major_mesh"].apply(
    lambda terms: len(terms) if isinstance(terms, list) else 0
)
reports["n_auto_terms"] = reports["auto_term"].apply(len)

# to_csv does not create missing folders
os.makedirs(out_fld, exist_ok=True)
reports.to_csv(join(out_fld, "reports_raw.tsv"), sep="\t")

READ THE RAW REPORTS AND CLEAN THEM

In [7]:
# reload the table exported by the parsing step
raw_reports_path = join(out_fld, "reports_raw.tsv")
# na_filter=False: missing fields are read as empty strings instead of NaN
raw_reports = pd.read_csv(raw_reports_path, sep="\t", na_filter=False)
display(raw_reports.T)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3945,3946,3947,3948,3949,3950,3951,3952,3953,3954
id,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
comparison,None.,Chest radiographs XXXX.,None.,XXXX PA and lateral chest radiographs,,,,Two views of the chest dated XXXX.,None.,,...,Two-view chest from XXXX.,Rib radiographs dated XXXX.,,None.,XXXX,"XXXX, XXXX.",,Chest x-XXXX dated XXXX at XXXX hours.,None.,
indication,Positive TB test,"XXXX-year-old male, chest pain.",,"XXXX-year-old male, XXXX.","dyspnea, subjective fevers, arthritis, immigra...",History of chest pain,Acute bronchitis.,XXXX-year-old with XXXX for one month. History...,Pruritic.,",786.05 XXXX XXXX to XXXX",...,XXXX-year-old male with altered mental status.,XXXX-year-old with pneumonia. Shortness of bre...,MELANOMA,possible tuberculosis,XXXX for one XXXX,SP CABG SOB NO RALES,chest pain.,"XXXX-year-old female, transplant workup.",XXXX-year-old XXXX with pain.,Chest pain
findings,The cardiac silhouette and mediastinum size ar...,The cardiomediastinal silhouette is within nor...,Both lungs are clear and expanded. Heart and m...,There is XXXX increased opacity within the rig...,Interstitial markings are diffusely prominent ...,,Heart size and pulmonary vascularity appear wi...,"The heart, pulmonary XXXX and mediastinum are ...",Cardiac and mediastinal contours are within no...,The lungs appear clear. There are no focal air...,...,Heart size within normal limits. No focal airs...,The heart size is upper limits of normal. The ...,There are bilateral pulmonary nodules whose ap...,Heart size normal. Lungs are clear. XXXX are n...,The lungs are clear. The cardiomediastinal sil...,Sternotomy sutures and bypass grafts have been...,,Calcified mediastinal XXXX. No focal areas of ...,Cardiomediastinal silhouette demonstrates norm...,Lungs are clear bilaterally. There is no focal...
impression,Normal chest x-XXXX.,No acute cardiopulmonary process.,No active disease.,1. Increased opacity in the right upper lobe w...,Diffuse fibrosis. No visible focal acute disease.,Status post left mastectomy. Heart size normal...,1. Retrocardiac soft tissue density. The appea...,No acute cardiopulmonary disease.,No acute findings.,No acute cardiopulmonary disease.,...,No acute cardiopulmonary findings.,Minimal perihilar opacity which could indicate...,1. Bilateral pulmonary nodules suggesting pulm...,Normal chest No evidence of tuberculosis,Negative chest .,Post operative chest with no acute disease.,Heart size is normal and lungs are clear. No p...,No acute cardiopulmonary abnormality. .,No acute cardiopulmonary abnormality.,No acute cardiopulmonary abnormality.
major_mesh,['normal'],['Calcified Granuloma/lung/upper lobe/right'],['normal'],"['Opacity/lung/upper lobe/right', 'Pulmonary A...",['Markings/lung/bilateral/interstitial/diffuse...,['Mastectomy/left'],"['Density/retrocardiac', 'Calcinosis/blood ves...","['Aorta/tortuous', 'Shoulder/bilateral/degener...",['normal'],['normal'],...,['normal'],['Opacity/lung/hilum/streaky/mild'],"['Nodule/lung/bilateral/multiple', 'Surgical I...",['normal'],['normal'],['Sutures/sternum'],['normal'],"['Calcinosis/mediastinum', 'Thoracic Vertebrae...","['Atherosclerosis/aorta, thoracic', 'Aorta, Th...",['normal']
auto_term,[],['calcified granuloma'],[],"['atelectases', 'mass lesion', 'opacity', 'Ate...","['diffuse fibrosis', 'Fibrosis', 'Pulmonary Fi...","['mastectomies', 'Mastectomy', 'surgery']","['atelectases', 'calcified granuloma', 'hiatal...",['degenerative change'],[],[],...,[],"['atypical pneumonias', 'opacity']","['metastatic disease', 'nodule', 'opacity', 'p...",[],[],"['bypass grafts', 'sternotomy', 'Coronary Arte...",[],['degenerative change'],"['atheroscleroses', 'degenerative disc disease...",[]
image_filename,"['CXR1_1_IM-0001-3001.png', 'CXR1_1_IM-0001-40...","['CXR10_IM-0002-1001.png', 'CXR10_IM-0002-2001...","['CXR100_IM-0002-1001.png', 'CXR100_IM-0002-20...","['CXR1000_IM-0003-1001.png', 'CXR1000_IM-0003-...","['CXR1001_IM-0004-1001.png', 'CXR1001_IM-0004-...","['CXR1002_IM-0004-1001.png', 'CXR1002_IM-0004-...",['CXR1003_IM-0005-2002.png'],"['CXR1004_IM-0005-1001.png', 'CXR1004_IM-0005-...","['CXR1005_IM-0006-1001.png', 'CXR1005_IM-0006-...","['CXR1006_IM-0007-1001.png', 'CXR1006_IM-0007-...",...,"['CXR990_IM-2476-1001.png', 'CXR990_IM-2476-20...","['CXR991_IM-2476-1001.png', 'CXR991_IM-2476-20...","['CXR992_IM-2477-0001-0001.png', 'CXR992_IM-24...","['CXR993_IM-2478-1001.png', 'CXR993_IM-2478-10...","['CXR994_IM-2478-1001.png', 'CXR994_IM-2478-20...","['CXR995_IM-2478-1001.png', 'CXR995_IM-2478-10...","['CXR996_IM-2479-1001.png', 'CXR996_IM-2479-20...","['CXR997_IM-2479-1001.png', 'CXR997_IM-2479-20...","['CXR998_IM-2479-1001.png', 'CXR998_IM-2479-20...","['CXR999_IM-2480-1001.png', 'CXR999_IM-2480-20..."
filename,1.xml,10.xml,100.xml,1000.xml,1001.xml,1002.xml,1003.xml,1004.xml,1005.xml,1006.xml,...,990.xml,991.xml,992.xml,993.xml,994.xml,995.xml,996.xml,997.xml,998.xml,999.xml
n_images,2,2,2,3,2,2,1,2,2,2,...,2,2,2,2,2,2,2,2,2,3


In [10]:
# columns whose content is converted to lowercase
columns = [
    "comparison",
    "indication",
    "findings",
    "impression",
    "major_mesh",
    "auto_term",
]

def to_lowercase(s):
    """Lowercase a string, or (recursively) every item of a list.

    Args:
        s: a string or a (possibly nested) list of strings.

    Returns:
        The lowercased string, or a new list with every item lowercased. A value
        of any other type is reported on stdout and returned unchanged, so that
        it is not silently replaced by None.
    """
    if isinstance(s, str):
        return s.lower()
    if isinstance(s, list):
        return [to_lowercase(item) for item in s]
    print("errors, unexpected type:", type(s))
    return s

for c in columns:
    raw_reports[c] = raw_reports[c].apply(to_lowercase)

# drop the reports without images; copy() makes the result an independent frame,
# so that adding columns to it later does not raise SettingWithCopyWarning
raw_reports = raw_reports[raw_reports["n_images"] > 0].copy()

# split text in sentences


def join_findings_impression(findings, impression):
    """Build the text of a report from its findings and impression sections.

    Each section is split into sentences, and the sentences of the findings
    followed by those of the impression are joined with single spaces.

    Args:
        findings: text of the findings section; may be empty or None.
        impression: text of the impression section; may be empty or None.

    Returns:
        str: the joined sentences, or an empty string when both sections are
        missing.
    """
    # NOTE(review): sent_tokenize needs the NLTK sentence tokenizer data to be installed
    from nltk import sent_tokenize

    sentences = []
    for section in (findings, impression):
        # missing sections are empty strings (or None): skip them
        if isinstance(section, str) and section.strip():
            sentences.extend(sentence.strip() for sentence in sent_tokenize(section))
    return " ".join(sentences)


# A plain list is assigned row by row; a new pd.Series would carry a fresh 0..n-1
# index and would be misaligned with raw_reports once rows have been filtered out.
raw_reports["text"] = [
    join_findings_impression(f, i)
    for f, i in zip(raw_reports["findings"], raw_reports["impression"])
]

In [None]:
# apply the corrections from the Excel file provided by the dataset authors


In [None]:

for r in reports.itertuples():
