<font color="yellow">IU-Chest X-Ray dataset</font>

This notebook contains pre-processing and plotting functions.

The first set of cells processes the raw files (as they are downloaded).

In [1]:
from bs4 import BeautifulSoup
from collections import defaultdict
import numpy as np
import os
import numpy as np
from numpy import count_nonzero as nnz
import pandas as pd
import pickle
from posixpath import join
from tqdm.notebook import tqdm
import os
import lightning.reports as iureports
from lightning.vocabulary_light import Vocabulary

%load_ext autoreload
%autoreload 2
# Root folder of the dataset; per the inline note it has the subfolders "image" and "text".
base_path = "/mnt/datasets/uc5/std-dataset"  # subfolders image, text
reports_fld = join(base_path, "text")  # folder with the report files
images_fld = join(base_path, "image")  # folder with the image files

# Folder that receives the files written by this notebook; created when missing.
meta_fld = "/mnt/datasets/uc5/meta/iuchest_light"
os.makedirs(meta_fld, exist_ok=True)

In [2]:
# process dataset and produce a first, raw csv file with all the available information

def filename_from_path(path, keep_extension=True):
    """Return the last component of ``path``.

    Args:
        path: file path, absolute or relative.
        keep_extension: when False, the extension (as split by
            ``os.path.splitext``) is removed from the returned name.

    Returns:
        The file name, with or without its extension.
    """
    name = os.path.basename(path)
    return name if keep_extension else os.path.splitext(name)[0]
    
def parse_id(soup):
    """Extract the identifier of a report.

    The tags 'pmcid', 'iuxrid' and 'uid' are looked up in this order and the
    'id' attribute of the first tag found is used. 'pmcid' and 'iuxrid' are
    preferred because they are simple integers, while 'uid' starts with 'CXR'
    (example: pmcid=3700, iuxrid=3700, uid=CXR3700).

    Args:
        soup: parsed report; calling ``soup(name)`` must return the list of
            tags called ``name``.

    Returns:
        dict with the single key "id".

    Raises:
        AssertionError: if no identifier is found or the identifier is empty.
    """
    selected_id = None
    for key in ('pmcid', 'iuxrid', 'uid'):
        # soup(key) returns a result set such as [<pmcid id="3315"></pmcid>]
        found = soup(key)
        if found:
            # the first element of the result set carries the value in its 'id' attribute
            selected_id = found[0].get('id')
            break
    assert selected_id, "report without a valid id"  # fail when None or empty
    return {"id": selected_id}

def parse_medical_texts(soup):
    """Extract the labelled text sections of a report.

    Only the sections labelled "impression", "indication", "findings" or
    "comparison" (case-insensitive) are kept. Sections without a 'label'
    attribute are skipped, and a report without an <abstract> element yields
    an empty dict, instead of raising AttributeError.

    Args:
        soup: parsed report exposing ``soup.abstract``.

    Returns:
        dict mapping the lowercase label to the text of the section.
    """
    valid_labels = ("impression", "indication", "findings", "comparison")
    res = {}
    abstract = soup.abstract
    if abstract is None:
        return res
    for section in abstract.find_all('abstracttext'):
        label = section.get('label')
        if label is None:
            # unlabelled section: nothing to key it on
            continue
        label = label.lower()
        if label in valid_labels:
            res[label] = section.text
    return res

def parse_mesh_terms(soup):
    """Extract the manually assigned MeSH terms of a report.

    Args:
        soup: parsed report exposing ``soup.mesh``.

    Returns:
        dict that contains "orig_mesh_terms" when <major> tags are present and
        "minor_mesh" when <minor> tags are present; each value is the list of
        the non-empty tag texts. The dict is empty when there is no mesh node.
    """
    mesh = soup.mesh
    if not mesh:
        return {}

    out = {}
    for tag_name, key in (("major", "orig_mesh_terms"), ("minor", "minor_mesh")):
        tags = mesh.find_all(tag_name)
        if tags:
            out[key] = [tag.text for tag in tags if tag.text]
    return out

def parse_automatic_terms(soup):
    """Extract the automatically assigned terms of a report.

    Args:
        soup: parsed report exposing ``soup.mesh``.

    Returns:
        dict with the single key "orig_auto_term": the list of the non-empty
        texts of the <automatic> tags (empty list when there are none).
    """
    mesh = soup.mesh
    auto_tags = mesh.find_all('automatic') if mesh else []
    return {"orig_auto_term": [tag.text for tag in auto_tags if tag.text]}

def parse_images(soup):
    """Extract the images referenced by a report.

    Args:
        soup: parsed report; every <parentimage> tag describes one image.

    Returns:
        list with one dict per image. Each dict contains "image_filename" and,
        when the image has a caption, "image_caption".

    Raises:
        ValueError: if an image has no <url>. The original code printed a
            message and called ``exit()``, which shuts down the notebook
            kernel instead of reporting an error.
    """
    res = []
    for img in soup.find_all('parentimage'):
        if not img.url:
            raise ValueError('FATAL: NO img.url')
        d = {}
        if img.caption:
            d["image_caption"] = img.caption.text
        p = img.url.text  # this is an absolute path
        fn = filename_from_path(p, keep_extension=False)
        # dataset contains png images, but paths in reports point to (old) jpeg versions
        d["image_filename"] = fn + '.png'
        res.append(d)
    return res


def parse_single_report(filepath, verbose=False):
    """Parse one XML report into a flat dict.

    Args:
        filepath: path of the XML file, read as UTF-8.
        verbose: accepted but not used.

    Returns:
        dict that merges the outputs of parse_id, parse_medical_texts,
        parse_mesh_terms and parse_automatic_terms, plus "image_filename"
        (list of image file names) and "filename" (base name of ``filepath``).
    """
    with open(filepath, "r", encoding="utf-8") as fin:
        content = fin.read()
    soup = BeautifulSoup(content, "lxml")

    record = {}
    for parser in (parse_id, parse_medical_texts, parse_mesh_terms, parse_automatic_terms):
        record.update(parser(soup))
    record["image_filename"] = [image["image_filename"] for image in parse_images(soup)]
    record["filename"] = os.path.basename(filepath)
    return record

def parse_reports(txt_fld, ext="xml", verbose=False, dev=False):
    """Parse all the reports found in a folder.

    Args:
        txt_fld: folder containing the report files.
        ext: only file names ending with this string are parsed; None selects
            every file.
        verbose: accepted but not used.
        dev: accepted but not used.

    Returns:
        list of dicts, one per report, as built by parse_single_report.
    """
    paths = [
        join(txt_fld, name)
        for name in os.listdir(txt_fld)
        if ext is None or name.endswith(ext)
    ]
    return [parse_single_report(path) for path in tqdm(paths)]




# parse every report, then build a table indexed (and sorted) by report id
reports = parse_reports(reports_fld)
reports = pd.DataFrame.from_records(reports).set_index("id").sort_index()

# number of elements of the list-valued columns
for count_col, list_col in (
    ("n_images", "image_filename"),
    ("n_orig_mesh_terms", "orig_mesh_terms"),
    ("n_orig_auto_terms", "orig_auto_term"),
):
    reports[count_col] = reports[list_col].apply(len)

out_fn = join(meta_fld, "reports_raw.pkl")
reports.to_pickle(out_fn)
print(f"saved {out_fn}, all done.")

  0%|          | 0/3955 [00:00<?, ?it/s]

saved /mnt/datasets/uc5/meta/iuchest_light/reports_raw.pkl, all done.


# <font color="red">STEP</font>
<font color="yellow">Previously (nb_iuchsts.nb): the reports were processed and saved in the file reports_raw.pkl.</font>
The next cells perform basic operations that are needed for any subsequent processing pipeline:
1. remove reports without images;
2. all text to lowercase;
3. add some columns with text length and number of sentences;
4. clean MeSH and automatic terms. In case of automatic terms, there is a step for term normalization using term equivalences provided by the dataset authors (and available on the website);
5. text in impression, findings, etc. is CLEANED (see the cell for details)

In [3]:
from bs4 import BeautifulSoup
from collections import defaultdict
import numpy as np
import os
import numpy as np
from numpy import count_nonzero as nnz
import pandas as pd
import pickle
from posixpath import join
from tqdm.notebook import tqdm

import lightning.reports as iureports
from lightning.vocabulary_light import Vocabulary
from nltk import sent_tokenize


%load_ext autoreload
%autoreload 2
# Root folder of the dataset; per the inline note it has the subfolders "image" and "text".
base_path = "/mnt/datasets/uc5/std-dataset"  # subfolders image, text
reports_fld = join(base_path, "text")  # folder with the report files
images_fld = join(base_path, "image")  # folder with the image files

# Folder holding the files read and written by the following cells.
meta_fld = "/mnt/datasets/uc5/meta/iuchest_light"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# reload the table saved by the first processing step
raw_reports_fn = join(meta_fld, "reports_raw.pkl")
df = pd.read_pickle(raw_reports_fn)
print(f"raw csv, shape {df.shape}")
display(df.T)

raw csv, shape (3955, 11)


id,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
comparison,None.,Chest radiographs XXXX.,None.,XXXX PA and lateral chest radiographs,,,,Two views of the chest dated XXXX.,None.,,...,Two-view chest from XXXX.,Rib radiographs dated XXXX.,,None.,XXXX,"XXXX, XXXX.",,Chest x-XXXX dated XXXX at XXXX hours.,None.,
indication,Positive TB test,"XXXX-year-old male, chest pain.",,"XXXX-year-old male, XXXX.","dyspnea, subjective fevers, arthritis, immigra...",History of chest pain,Acute bronchitis.,XXXX-year-old with XXXX for one month. History...,Pruritic.,",786.05 XXXX XXXX to XXXX",...,XXXX-year-old male with altered mental status.,XXXX-year-old with pneumonia. Shortness of bre...,MELANOMA,possible tuberculosis,XXXX for one XXXX,SP CABG SOB NO RALES,chest pain.,"XXXX-year-old female, transplant workup.",XXXX-year-old XXXX with pain.,Chest pain
findings,The cardiac silhouette and mediastinum size ar...,The cardiomediastinal silhouette is within nor...,Both lungs are clear and expanded. Heart and m...,There is XXXX increased opacity within the rig...,Interstitial markings are diffusely prominent ...,,Heart size and pulmonary vascularity appear wi...,"The heart, pulmonary XXXX and mediastinum are ...",Cardiac and mediastinal contours are within no...,The lungs appear clear. There are no focal air...,...,Heart size within normal limits. No focal airs...,The heart size is upper limits of normal. The ...,There are bilateral pulmonary nodules whose ap...,Heart size normal. Lungs are clear. XXXX are n...,The lungs are clear. The cardiomediastinal sil...,Sternotomy sutures and bypass grafts have been...,,Calcified mediastinal XXXX. No focal areas of ...,Cardiomediastinal silhouette demonstrates norm...,Lungs are clear bilaterally. There is no focal...
impression,Normal chest x-XXXX.,No acute cardiopulmonary process.,No active disease.,1. Increased opacity in the right upper lobe w...,Diffuse fibrosis. No visible focal acute disease.,Status post left mastectomy. Heart size normal...,1. Retrocardiac soft tissue density. The appea...,No acute cardiopulmonary disease.,No acute findings.,No acute cardiopulmonary disease.,...,No acute cardiopulmonary findings.,Minimal perihilar opacity which could indicate...,1. Bilateral pulmonary nodules suggesting pulm...,Normal chest No evidence of tuberculosis,Negative chest .,Post operative chest with no acute disease.,Heart size is normal and lungs are clear. No p...,No acute cardiopulmonary abnormality. .,No acute cardiopulmonary abnormality.,No acute cardiopulmonary abnormality.
orig_mesh_terms,[normal],[Calcified Granuloma/lung/upper lobe/right],[normal],"[Opacity/lung/upper lobe/right, Pulmonary Atel...",[Markings/lung/bilateral/interstitial/diffuse/...,[Mastectomy/left],"[Density/retrocardiac, Calcinosis/blood vessel...","[Aorta/tortuous, Shoulder/bilateral/degenerati...",[normal],[normal],...,[normal],[Opacity/lung/hilum/streaky/mild],"[Nodule/lung/bilateral/multiple, Surgical Inst...",[normal],[normal],[Sutures/sternum],[normal],"[Calcinosis/mediastinum, Thoracic Vertebrae/de...","[Atherosclerosis/aorta, thoracic, Aorta, Thora...",[normal]
orig_auto_term,[],[calcified granuloma],[],"[atelectases, mass lesion, opacity, Atelectasi...","[diffuse fibrosis, Fibrosis, Pulmonary Fibrosis]","[mastectomies, Mastectomy, surgery]","[atelectases, calcified granuloma, hiatal hern...",[degenerative change],[],[],...,[],"[atypical pneumonias, opacity]","[metastatic disease, nodule, opacity, pulmonar...",[],[],"[bypass grafts, sternotomy, Coronary Artery By...",[],[degenerative change],"[atheroscleroses, degenerative disc diseases, ...",[]
image_filename,"[CXR1_1_IM-0001-3001.png, CXR1_1_IM-0001-4001....","[CXR10_IM-0002-1001.png, CXR10_IM-0002-2001.png]","[CXR100_IM-0002-1001.png, CXR100_IM-0002-2001....","[CXR1000_IM-0003-1001.png, CXR1000_IM-0003-200...","[CXR1001_IM-0004-1001.png, CXR1001_IM-0004-100...","[CXR1002_IM-0004-1001.png, CXR1002_IM-0004-200...",[CXR1003_IM-0005-2002.png],"[CXR1004_IM-0005-1001.png, CXR1004_IM-0005-200...","[CXR1005_IM-0006-1001.png, CXR1005_IM-0006-300...","[CXR1006_IM-0007-1001.png, CXR1006_IM-0007-300...",...,"[CXR990_IM-2476-1001.png, CXR990_IM-2476-2001....","[CXR991_IM-2476-1001.png, CXR991_IM-2476-2001....","[CXR992_IM-2477-0001-0001.png, CXR992_IM-2477-...","[CXR993_IM-2478-1001.png, CXR993_IM-2478-1002....","[CXR994_IM-2478-1001.png, CXR994_IM-2478-2001....","[CXR995_IM-2478-1001.png, CXR995_IM-2478-1002....","[CXR996_IM-2479-1001.png, CXR996_IM-2479-2001....","[CXR997_IM-2479-1001.png, CXR997_IM-2479-2001....","[CXR998_IM-2479-1001.png, CXR998_IM-2479-2001....","[CXR999_IM-2480-1001.png, CXR999_IM-2480-2001...."
filename,1.xml,10.xml,100.xml,1000.xml,1001.xml,1002.xml,1003.xml,1004.xml,1005.xml,1006.xml,...,990.xml,991.xml,992.xml,993.xml,994.xml,995.xml,996.xml,997.xml,998.xml,999.xml
n_images,2,2,2,3,2,2,1,2,2,2,...,2,2,2,2,2,2,2,2,2,3
n_orig_mesh_terms,1,1,1,3,2,1,5,4,1,1,...,1,1,3,1,1,1,1,2,3,1


In [5]:
# REMOVE REPORTS WITHOUT IMAGES
# distribution of the number of images per report
print("num images - reports with num images")
print(df["n_images"].value_counts())

# sanity checks on the major MeSH terms
print("null major mesh", df["orig_mesh_terms"].isnull().sum())
print("empty major mesh", nnz(df["orig_mesh_terms"] == ""))

# boolean mask of the reports that reference no image, then drop them by label
iii = df["n_images"] == 0
print("reports without images:", nnz(iii))
df = df.drop(index=df.index[iii])
print("removed reports without images, new shape", df.shape)


num images - reports with num images
2    3208
1     446
3     181
0     104
4      15
5       1
Name: n_images, dtype: int64
null major mesh 0
empty major mesh 0
reports without images: 104
removed reports without images, new shape (3851, 11)


In [9]:
# Text normalization: the columns listed here are converted to lowercase later in this cell
# (string values directly, list values element by element).
columns = ["comparison", "indication", "findings", "impression", "orig_mesh_terms", "orig_auto_term"]

def strip_field(s):
    """Strip a text field and make sure it ends with a full stop.

    Args:
        s: text of the field, or None.

    Returns:
        None when ``s`` is None, the empty string when ``s`` is blank,
        otherwise the stripped text terminated by ".".
    """
    # test for None before calling .strip(): the original code stripped first,
    # so its None check could never be reached
    if s is None:
        return s
    s = s.strip()
    if s == "" or s.endswith("."):
        return s
    return s + "."



# normalize the two free-text sections: stripped, and terminated by a full stop
for c in ("impression", "findings"):
    df[c] = df[c].apply(strip_field)

def to_lowercase(s):
    """Return a stripped, lowercase copy of a string or of a list of strings.

    Args:
        s: a string, or a list whose items are processed recursively.

    Returns:
        The converted string or list. For any other type a message is
        printed and None is returned.
    """
    if isinstance(s, list):
        return [to_lowercase(item.strip()) for item in s]
    if not isinstance(s, str):
        print("errors, unexpected type:", type(s))
        return None
    return s.strip().lower()

# lowercase every selected column (strings and lists of strings)
for c in columns:
    df[c] = df[c].apply(to_lowercase)


# from here on only the two free-text sections are processed;
# the loops are kept separate so that the new columns are appended group by group
columns = ["impression", "findings"]

# split text into sentences
for c in columns:
    df[f"{c}_sents"] = df[c].apply(sent_tokenize)

# number of characters, surrounding whitespace excluded
for c in columns:
    df[f"len_{c}"] = df[c].apply(lambda text: len(text.strip()))

# number of sentences
for c in columns:
    df[f"nsents_{c}"] = df[f"{c}_sents"].apply(len)

# distribution of the number of sentences
for c in columns:
    print("column:", c)
    print(df[f"nsents_{c}"].value_counts())

# some reports have 0 sentences either in "findings" or "impression"

def concat_columns(row):
    """Join the stripped "findings" and "impression" of ``row`` with one space."""
    sections = (row["findings"].strip(), row["impression"].strip())
    return " ".join(sections)

def concat_columns2(row):
    """Concatenate "findings" and "impression" of ``row``, skipping empty sections.

    Args:
        row: mapping with the string values "findings" and "impression".

    Returns:
        The stripped, non-empty sections joined by one space (findings first).
        When both are empty, "EMPTY" and the row are printed and "" is returned.
    """
    stripped = (row["findings"].strip(), row["impression"].strip())
    parts = [section for section in stripped if len(section) > 0]
    if not parts:
        print("EMPTY")
        print(row)
        return ""
    return " ".join(parts)

# raw_text is the concatenation of the non-empty "findings" and "impression" sections
df["raw_text"] = df.apply(concat_columns2, axis=1)
df["len_raw_text"] = df.raw_text.apply(len)
df["nsents_raw_text"] = df["nsents_findings"] + df["nsents_impression"]

assert (df.nsents_raw_text == df.nsents_findings + df.nsents_impression).all()

# Series.str.len is a method and has to be called: comparing the bound method
# itself to 0 is always False, so the count was always 0
iii = df["raw_text"].str.len() == 0
print("empty text:", nnz(iii))

eee = df["len_raw_text"] == 0
print("empty text:", nnz(eee))

# compare the values (not the .str accessor) to find texts made of a single dot
iii = df["raw_text"] == "."
print("dot:", nnz(iii))

if nnz(eee) > 0:
    display(df.loc[eee])

print("text to lowercase, done")

column: impression
1     2391
2      692
3      296
4      211
5      104
6       63
7       32
0       31
8       15
9       12
10       3
13       1
Name: nsents_impression, dtype: int64
column: findings
4     938
3     768
5     746
0     514
6     436
7     171
2      96
8      88
9      47
10     17
11     10
1       8
12      7
13      3
17      1
18      1
Name: nsents_findings, dtype: int64
EMPTY
comparison                                                            
indication                                                            
findings                                                              
impression                                                            
orig_mesh_terms                                               [normal]
orig_auto_term                                                      []
image_filename       [CXR1137_IM-0093-12012.png, CXR1137_IM-0093-40...
filename                                                      1137.xml
n_images                

Unnamed: 0_level_0,comparison,indication,findings,impression,orig_mesh_terms,orig_auto_term,image_filename,filename,n_images,n_orig_mesh_terms,n_orig_auto_terms,impression_sents,findings_sents,len_impression,len_findings,nsents_impression,nsents_findings,raw_text,len_raw_text,nsents_raw_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1137,,,,,[normal],[],"[CXR1137_IM-0093-12012.png, CXR1137_IM-0093-40...",1137.xml,2,1,0,[],[],0,0,0,0,,0,0
1142,,,,,[normal],[],"[CXR1142_IM-0096-1001.png, CXR1142_IM-0096-200...",1142.xml,2,1,0,[],[],0,0,0,0,,0,0
1147,,,,,[normal],[],[CXR1147_IM-0099-4004.png],1147.xml,1,1,0,[],[],0,0,0,0,,0,0
1293,,,,,[normal],[],[CXR1293_IM-0192-4004.png],1293.xml,1,1,0,[],[],0,0,0,0,,0,0
1297,,,,,[normal],[],"[CXR1297_IM-0195-1001.png, CXR1297_IM-0195-400...",1297.xml,2,1,0,[],[],0,0,0,0,,0,0
1536,,,,,[normal],[],"[CXR1536_IM-0347-0001-0001.png, CXR1536_IM-034...",1536.xml,3,1,0,[],[],0,0,0,0,,0,0
1566,,,,,[normal],[],[CXR1566_IM-0369-1001.png],1566.xml,1,1,0,[],[],0,0,0,0,,0,0
16,,,,,[normal],[],"[CXR16_IM-0389-1001.png, CXR16_IM-0389-2001.png]",16.xml,2,1,0,[],[],0,0,0,0,,0,0
1615,,,,,[normal],[],[CXR1615_IM-0398-4004.png],1615.xml,1,1,0,[],[],0,0,0,0,,0,0
1690,,,,,[normal],[],"[CXR1690_IM-0452-1001-0001.png, CXR1690_IM-045...",1690.xml,2,1,0,[],[],0,0,0,0,,0,0


text to lowercase, done


In [None]:
# count the reports whose "impression" or "findings" section is an empty string
n_empty_impression = nnz(df["impression"].str.len() == 0)
n_empty_findings = nnz(df["findings"].str.len() == 0)
print("n rows with empty impression:", n_empty_impression)
print("n rows with empty findings:", n_empty_findings)

In [10]:
# clean MeSH terms
def clean_mesh_terms(terms):
    """Reduce MeSH terms to their distinct, lowercase leading element.

    Args:
        terms: list of strings such as "major/minor/minor" (the part before
            the first "/" is kept), or a single string taken as one term.

    Returns:
        Sorted list of the distinct cleaned terms. The order is sorted so that
        the output is reproducible across runs, like perform_subst; a plain
        ``list(set(...))`` depends on string hashing.

    Raises:
        AssertionError: if ``terms`` is neither a list nor a string.
    """
    # e.g., terms = [ "major/minor", "major/minor/minor", ... ]
    if isinstance(terms, list):
        # take first element in group t1/t2/t3, make it lowercase
        heads = [x.split("/")[0].strip().lower() for x in terms]
    elif isinstance(terms, str):
        heads = [terms.strip().lower()]
    else:
        # explicit raise: a bare `assert False` is removed when running with -O
        raise AssertionError("unexpected type: " + str(type(terms)))

    # NOTE: terms that contain a comma (e.g. "aorta, thoracic") are kept whole;
    # only duplicates are removed
    return sorted(set(heads))
        
#
# cleaned major MeSH terms and their number, per report
df["major_mesh"] = df["orig_mesh_terms"].apply(clean_mesh_terms)
df["n_major_mesh"] = df["major_mesh"].apply(len)
print(df["n_major_mesh"].value_counts())
display(df.head().T)

1     2267
2      596
3      388
4      285
5      144
6       93
7       44
8       20
9        9
10       4
12       1
Name: n_major_mesh, dtype: int64


id,1,10,100,1000,1001
comparison,none.,chest radiographs xxxx.,none.,xxxx pa and lateral chest radiographs,none
indication,positive tb test,"xxxx-year-old male, chest pain.",,"xxxx-year-old male, xxxx.","dyspnea, subjective fevers, arthritis, immigra..."
findings,the cardiac silhouette and mediastinum size ar...,the cardiomediastinal silhouette is within nor...,both lungs are clear and expanded. heart and m...,there is xxxx increased opacity within the rig...,interstitial markings are diffusely prominent ...
impression,normal chest x-xxxx.,no acute cardiopulmonary process.,no active disease.,1. increased opacity in the right upper lobe w...,diffuse fibrosis. no visible focal acute disease.
orig_mesh_terms,[normal],[calcified granuloma/lung/upper lobe/right],[normal],"[opacity/lung/upper lobe/right, pulmonary atel...",[markings/lung/bilateral/interstitial/diffuse/...
orig_auto_term,[],[calcified granuloma],[],"[atelectases, mass lesion, opacity, atelectasi...","[diffuse fibrosis, fibrosis, pulmonary fibrosis]"
image_filename,"[CXR1_1_IM-0001-3001.png, CXR1_1_IM-0001-4001....","[CXR10_IM-0002-1001.png, CXR10_IM-0002-2001.png]","[CXR100_IM-0002-1001.png, CXR100_IM-0002-2001....","[CXR1000_IM-0003-1001.png, CXR1000_IM-0003-200...","[CXR1001_IM-0004-1001.png, CXR1001_IM-0004-100..."
filename,1.xml,10.xml,100.xml,1000.xml,1001.xml
n_images,2,2,2,3,2
n_orig_mesh_terms,1,1,1,3,2


In [12]:
# clean auto terms
# a report without automatic terms is labelled "normal"
df["orig_auto_term"] = df["orig_auto_term"].apply(lambda x: x if len(x) != 0 else ["normal"])

# distinct automatic terms over all the reports
list_of_auto = df.orig_auto_term.tolist()
u_auto = {term for report_terms in list_of_auto for term in report_terms}

print("n unique auto terms:", len(u_auto))

# normalize terms
# each non-blank line of the file is split on ":"; the first non-empty field is
# the term to replace and the second one is its replacement
with open(join("lightning", "auto_term_norm.txt"), "r", encoding="utf-8") as fin:
    lines = [line for line in fin if len(line.strip()) > 0]

norm_terms = {}
for line in lines:
    fields = [field.strip() for field in line.split(":")]
    fields = [field for field in fields if len(field) > 0]
    norm_terms[fields[0]] = fields[1]
print(f"{len(norm_terms)} substitutions")

def perform_subst(terms):
    """Normalize a collection of terms using the ``norm_terms`` table.

    Args:
        terms: iterable of terms.

    Returns:
        Sorted list of the distinct terms, where every term found in
        ``norm_terms`` is replaced by its mapped value.
    """
    return sorted({norm_terms.get(term, term) for term in terms})

# normalized automatic terms, per report
df["auto_term"] = df["orig_auto_term"].apply(perform_subst)
display(df.T)
print("auto terms, done")

n unique auto terms: 572
49 substitutions


id,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
comparison,none.,chest radiographs xxxx.,none.,xxxx pa and lateral chest radiographs,none,,,two views of the chest dated xxxx.,none.,,...,two-view chest from xxxx.,rib radiographs dated xxxx.,none,none.,xxxx,"xxxx, xxxx.",,chest x-xxxx dated xxxx at xxxx hours.,none.,none
indication,positive tb test,"xxxx-year-old male, chest pain.",,"xxxx-year-old male, xxxx.","dyspnea, subjective fevers, arthritis, immigra...",history of chest pain,acute bronchitis.,xxxx-year-old with xxxx for one month. history...,pruritic.,",786.05 xxxx xxxx to xxxx",...,xxxx-year-old male with altered mental status.,xxxx-year-old with pneumonia. shortness of bre...,melanoma,possible tuberculosis,xxxx for one xxxx,sp cabg sob no rales,chest pain.,"xxxx-year-old female, transplant workup.",xxxx-year-old xxxx with pain.,chest pain
findings,the cardiac silhouette and mediastinum size ar...,the cardiomediastinal silhouette is within nor...,both lungs are clear and expanded. heart and m...,there is xxxx increased opacity within the rig...,interstitial markings are diffusely prominent ...,,heart size and pulmonary vascularity appear wi...,"the heart, pulmonary xxxx and mediastinum are ...",cardiac and mediastinal contours are within no...,the lungs appear clear. there are no focal air...,...,heart size within normal limits. no focal airs...,the heart size is upper limits of normal. the ...,there are bilateral pulmonary nodules whose ap...,heart size normal. lungs are clear. xxxx are n...,the lungs are clear. the cardiomediastinal sil...,sternotomy sutures and bypass grafts have been...,,calcified mediastinal xxxx. no focal areas of ...,cardiomediastinal silhouette demonstrates norm...,lungs are clear bilaterally. there is no focal...
impression,normal chest x-xxxx.,no acute cardiopulmonary process.,no active disease.,1. increased opacity in the right upper lobe w...,diffuse fibrosis. no visible focal acute disease.,status post left mastectomy. heart size normal...,1. retrocardiac soft tissue density. the appea...,no acute cardiopulmonary disease.,no acute findings.,no acute cardiopulmonary disease.,...,no acute cardiopulmonary findings.,minimal perihilar opacity which could indicate...,1. bilateral pulmonary nodules suggesting pulm...,normal chest no evidence of tuberculosis.,negative chest .,post operative chest with no acute disease.,heart size is normal and lungs are clear. no p...,no acute cardiopulmonary abnormality. .,no acute cardiopulmonary abnormality.,no acute cardiopulmonary abnormality.
orig_mesh_terms,[normal],[calcified granuloma/lung/upper lobe/right],[normal],"[opacity/lung/upper lobe/right, pulmonary atel...",[markings/lung/bilateral/interstitial/diffuse/...,[mastectomy/left],"[density/retrocardiac, calcinosis/blood vessel...","[aorta/tortuous, shoulder/bilateral/degenerati...",[normal],[normal],...,[normal],[opacity/lung/hilum/streaky/mild],"[nodule/lung/bilateral/multiple, surgical inst...",[normal],[normal],[sutures/sternum],[normal],"[calcinosis/mediastinum, thoracic vertebrae/de...","[atherosclerosis/aorta, thoracic, aorta, thora...",[normal]
orig_auto_term,[normal],[calcified granuloma],[normal],"[atelectases, mass lesion, opacity, atelectasi...","[diffuse fibrosis, fibrosis, pulmonary fibrosis]","[mastectomies, mastectomy, surgery]","[atelectases, calcified granuloma, hiatal hern...",[degenerative change],[normal],[normal],...,[normal],"[atypical pneumonias, opacity]","[metastatic disease, nodule, opacity, pulmonar...",[normal],[normal],"[bypass grafts, sternotomy, coronary artery by...",[normal],[degenerative change],"[atheroscleroses, degenerative disc diseases, ...",[normal]
image_filename,"[CXR1_1_IM-0001-3001.png, CXR1_1_IM-0001-4001....","[CXR10_IM-0002-1001.png, CXR10_IM-0002-2001.png]","[CXR100_IM-0002-1001.png, CXR100_IM-0002-2001....","[CXR1000_IM-0003-1001.png, CXR1000_IM-0003-200...","[CXR1001_IM-0004-1001.png, CXR1001_IM-0004-100...","[CXR1002_IM-0004-1001.png, CXR1002_IM-0004-200...",[CXR1003_IM-0005-2002.png],"[CXR1004_IM-0005-1001.png, CXR1004_IM-0005-200...","[CXR1005_IM-0006-1001.png, CXR1005_IM-0006-300...","[CXR1006_IM-0007-1001.png, CXR1006_IM-0007-300...",...,"[CXR990_IM-2476-1001.png, CXR990_IM-2476-2001....","[CXR991_IM-2476-1001.png, CXR991_IM-2476-2001....","[CXR992_IM-2477-0001-0001.png, CXR992_IM-2477-...","[CXR993_IM-2478-1001.png, CXR993_IM-2478-1002....","[CXR994_IM-2478-1001.png, CXR994_IM-2478-2001....","[CXR995_IM-2478-1001.png, CXR995_IM-2478-1002....","[CXR996_IM-2479-1001.png, CXR996_IM-2479-2001....","[CXR997_IM-2479-1001.png, CXR997_IM-2479-2001....","[CXR998_IM-2479-1001.png, CXR998_IM-2479-2001....","[CXR999_IM-2480-1001.png, CXR999_IM-2480-2001...."
filename,1.xml,10.xml,100.xml,1000.xml,1001.xml,1002.xml,1003.xml,1004.xml,1005.xml,1006.xml,...,990.xml,991.xml,992.xml,993.xml,994.xml,995.xml,996.xml,997.xml,998.xml,999.xml
n_images,2,2,2,3,2,2,1,2,2,2,...,2,2,2,2,2,2,2,2,2,3
n_orig_mesh_terms,1,1,1,3,2,1,5,4,1,1,...,1,1,3,1,1,1,1,2,3,1


auto terms, done


In [13]:
# text cleaning
import re
from nltk import word_tokenize
def clean_text_v1(text, verbose=False):
    """Clean the text of a report and replace numeric content with placeholders.

    Steps, in order: double dots are collapsed; the symbols ``,;:?)(!`` become
    spaces; numbered list markers (" 1. ") are removed; dashes become
    underscores; percentages become "_PERC_"; runs of "xxxx" are collapsed;
    ordinals become "_N_TH_"; then, sentence by sentence, tokens starting with
    a number become "_NUM_" and measures become "_MEAS_".

    Args:
        text: text to clean.
        verbose: when True, print input and output if the text was modified.

    Returns:
        The cleaned sentences joined by ". " and terminated by ".".
    """

    def subst_numbers(token):
        # replace a leading decimal number (2.5 or 2,5), then a leading integer
        s = re.sub(r"\A\d+(,|\.)\d+", "_NUM_", token)  # _DEC_ for finer texts
        s = re.sub(r"\A\d+", "_NUM_", s)
        return s

    def subst_meas(text):
        # substitute measures: a number followed by a unit, and products of measures.
        # The third-to-last alternative used to end with "_MEAS" (no trailing
        # underscore), which turned "_MEAS_ x _MEAS_ x _MEAS_" into "_MEAS__".
        e = r"(_NUM_|_DEC_)\s?(cm|mm|in|xxxx)|_NUM_ x _MEAS_|_DEC_ x _MEAS_|_MEAS_ x _MEAS_ x _MEAS_|_MEAS_ x _MEAS_"
        t1 = text
        # repeat until nothing changes: a substitution can enable another one
        while True:
            t2 = re.sub(e, "_MEAS_", t1)
            if t1 == t2:
                break
            else:
                t1 = t2
        return t1

    # NOTE(review): as rendered, both arguments below are a plain space (a no-op);
    # presumably one of them was meant to be a different character - confirm
    text2 = text.replace(" ", " ")
    text2 = text2.replace("..", ".")


    symbols = ",;:?)(!"

    e = "|".join([re.escape(s) for s in symbols])
    text2 = re.sub(e, " ", text2)
    # numbered list items
    text2 = re.sub(r"\s\d+\. ", " ", text2)
    # dash
    text2 = re.sub(r"-", "_", text2)
    # percentages
    text2 = re.sub(r"\d+%\s", "_PERC_ ", text2)
    # xxxx xxxx ... -> xxxx
    text2 = re.sub(r"xxxx(\sxxxx)+", "xxxx", text2)
    # ordinals
    text2 = re.sub(r"1st|2nd|3rd|[0-9]+th ", "_N_TH_ ", text2)


    sentences = []
    for sent in sent_tokenize(text2):
        # [:-1]: the last token is dropped, on the assumption that it is the final dot
        new_tokens = [subst_numbers(token) for token in word_tokenize(sent)[:-1]]

        sent = " ".join(new_tokens)
        sent = subst_meas(sent)
        sentences.append(sent)

    # the dots dropped above are added back here, including the final one
    text2 = ". ".join(sentences) + "."

    if verbose and text != text2:   # and "_MEAS_" in text2:
        print("* IN (it has been modified):")
        print(text)
        print("* OUT:")
        print(text2)
        print(10 * "*** ")

    return text2


# cleaned version of raw_text (verbose keeps its default, False)
df["text"] = df["raw_text"].apply(clean_text_v1)

display(df.head().T)

# persist the result of this processing step
out_fn = join(meta_fld, "reports_raw2.pkl")
df.to_pickle(out_fn)
print(f"saved raw reports, step2, in: {out_fn}")

id,1,10,100,1000,1001
comparison,none.,chest radiographs xxxx.,none.,xxxx pa and lateral chest radiographs,none
indication,positive tb test,"xxxx-year-old male, chest pain.",,"xxxx-year-old male, xxxx.","dyspnea, subjective fevers, arthritis, immigra..."
findings,the cardiac silhouette and mediastinum size ar...,the cardiomediastinal silhouette is within nor...,both lungs are clear and expanded. heart and m...,there is xxxx increased opacity within the rig...,interstitial markings are diffusely prominent ...
impression,normal chest x-xxxx.,no acute cardiopulmonary process.,no active disease.,1. increased opacity in the right upper lobe w...,diffuse fibrosis. no visible focal acute disease.
orig_mesh_terms,[normal],[calcified granuloma/lung/upper lobe/right],[normal],"[opacity/lung/upper lobe/right, pulmonary atel...",[markings/lung/bilateral/interstitial/diffuse/...
orig_auto_term,[normal],[calcified granuloma],[normal],"[atelectases, mass lesion, opacity, atelectasi...","[diffuse fibrosis, fibrosis, pulmonary fibrosis]"
image_filename,"[CXR1_1_IM-0001-3001.png, CXR1_1_IM-0001-4001....","[CXR10_IM-0002-1001.png, CXR10_IM-0002-2001.png]","[CXR100_IM-0002-1001.png, CXR100_IM-0002-2001....","[CXR1000_IM-0003-1001.png, CXR1000_IM-0003-200...","[CXR1001_IM-0004-1001.png, CXR1001_IM-0004-100..."
filename,1.xml,10.xml,100.xml,1000.xml,1001.xml
n_images,2,2,2,3,2
n_orig_mesh_terms,1,1,1,3,2


saved raw reports, step2, in: /mnt/datasets/uc5/meta/iuchest_light/reports_raw2.pkl


# <font color="red">STEP</font>

<font color="yellow">reports_raw2.pkl available here</font>

<font color="green">use reports_raw2.pkl for building the vocabulary with AUTOMATIC TERMS</font>

The content of reports_raw.pkl, as processed in the previous cells, is saved in `reports_raw2.pkl`.

We now produce a binary encoding of the labels.

We are still using all the data available in the dataset:

1. MeSH terms: two terms are removed, `["technical quality of image unsatisfactory", "no indexing"]`.
After removal of the tags, the reports and the images left with zero tags are removed
    - 98 reports, 186 images are removed

In [None]:
# BINARY ENCODING OF THE LABELS
# M E S H 

#     #          #####  #     #
##   ##  ###### #     # #     #
# # # #  #      #       #     #
#  #  #  #####   #####  #######
#     #  #            # #     #
#     #  #      #     # #     #
#     #  ######  #####  #     #


from bs4 import BeautifulSoup
from collections import defaultdict
import numpy as np
import os
import numpy as np
from numpy import count_nonzero as nnz
import pandas as pd
import pickle
from posixpath import join
from tqdm.notebook import tqdm

import iuchest.reports as iureports
from utils.vocabulary import Vocabulary

%load_ext autoreload
%autoreload 2
base_path = "/mnt/datasets/uc5/std-dataset"  # subfolders image, text
reports_fld = join(base_path, "text")
images_fld = join(base_path, "image")


# 1: ALL AVAILABLE DATA
meta_fld = "/mnt/datasets/uc5/meta/iuchest_light"
in_fn = join(meta_fld, "reports_raw2.pkl")
df = pd.read_pickle(in_fn)
print(f"read {in_fn}, shape: {df.shape}")

# encode data as binary matrix with 1-hot encoding of labels - still using all available data
# every report gets the "/"-joined string of its original mesh terms;
# the set of these strings gives the unique combinations
df["mesh_term_s"] = df["orig_mesh_terms"].str.join("/").tolist()
mesh_terms_s = set(df["mesh_term_s"].tolist())
print("unique combos of mesh terms:", len(mesh_terms_s))

# unique mesh terms, collected over the cleaned terms of all the reports
umesh = set()


def add_terms(l: list):
    """Add every term of the list ``l`` to the module-level set ``umesh``."""
    umesh.update(l)


for report_terms in df.major_mesh:
    add_terms(report_terms)
print("unique mesh terms:", len(umesh))

# encode image based binary matrix
# column order of the encoding: "normal" first, then the other terms sorted
umesh = ["normal"] + sorted(term for term in umesh if term != "normal")

# step build another file indexed by image filenames, this file will be used in training
rep_matrix = []  # rows correspond to reports
rep_index = []  # report ids, aligned with rep_matrix
matrix = []  # rows correspond to images
img_index = []  # image file names, aligned with matrix
paths = []  # image paths, aligned with matrix

df["labels"] = df.major_mesh
df2 = df.reset_index()
for rep_id, labels, filenames in zip(df2["id"], df2["labels"], df2["image_filename"]):
    # one boolean per term: is the term among the labels of the report?
    enc = [term in labels for term in umesh]
    rep_matrix.append(enc)
    rep_index.append(rep_id)
    # every image inherits the encoding of its report
    for fn in filenames:
        img_index.append(fn)
        paths.append(join(images_fld, fn))
        matrix.append(enc)

# one row per report, one 0/1 column per term, indexed by report id
rep_ds = (
    pd.DataFrame(data=np.array(rep_matrix).astype(int), columns=umesh)
    .assign(id=rep_index)
    .set_index("id")
)
print(f"dataframe, index is report id: {rep_ds.shape}")

# one row per image, same columns, indexed by image file name
img_ds = (
    pd.DataFrame(data=np.array(matrix).astype(int), columns=umesh)
    .assign(filename=img_index)
    .set_index("filename")
)

#
# r e m o v e    s o m e     l a b e l s 
#
# some reports with "No Indxing" are actually considered to be of class "NORMAL" in the AUTOMATIC TERMS field
def remove_tags(tags, dataf):
    """Drop the label columns in `tags`, then drop the rows left with no label.

    Args:
        tags: column names to remove from `dataf`.
        dataf: DataFrame of 0/1 label indicators, one column per label.

    Returns:
        A new DataFrame without the `tags` columns and without the rows whose
        remaining values sum to 0. The input frame is not modified.

    Raises:
        AssertionError: if the number of rows actually removed differs from
            the number of rows found empty.
    """
    print("removing columns:", tags)
    n_before = dataf.shape[0]
    dataf = dataf.drop(columns=tags)
    # rows whose remaining labels are all zero
    empty_rows = dataf.sum(axis=1) == 0
    n_empty = nnz(empty_rows)
    print(f"removing {n_empty} rows, shape is {dataf.shape}")
    dataf = dataf.drop(index=empty_rows[empty_rows].index)
    # sanity check: exactly the empty rows were dropped
    assert n_before == dataf.shape[0] + n_empty
    print(f"removed {n_empty} rows: ok, shape is {dataf.shape}")
    return dataf

tags = ["technical quality of image unsatisfactory", "no indexing"]
img_ds2 = remove_tags(tags, img_ds)
rep_ds = remove_tags(tags, rep_ds)
print(f"after tag removal, img_ds, shape is {img_ds2.shape}")
print(f"after tag removal, shape is {rep_ds.shape}")

removed_reports = [id for id in df.index if id not in rep_ds.index]

# text associated to the reports with tags ["technical quality of image unsatisfactory", "no indexing"]
# Many of them seem normal
# for t in df.loc[removed_reports].raw_text:
#     print(t)
# automatic tags associated to removed reports
# print(df.loc[removed_reports, "auto_term"].value_counts())

print(f"id of removed reports {len(removed_reports)}:", removed_reports)
df3 = df.drop(labels=removed_reports)
# reports_raw3 used for pipeline using MeSH terms
df3.to_pickle(join(meta_fld, "reports_raw3.pkl"))
print("new dataframe, rows wrt previous ds2:", df3.shape[0] - df2.shape[0])
print(f"saved: {join(meta_fld, 'reports_raw3.pkl')}")

n_removed_images = img_ds.shape[0] - img_ds2.shape[0]  # [id for id in img_ds.index if id not in img2_ds.index]
print("|removed images|:", n_removed_images)
img_ds = img_ds2

def save_pkl_csv(dataframe, fn):
    """Save `dataframe` in `meta_fld` as a pickle and as a tab-separated CSV.

    Args:
        dataframe: pandas DataFrame to persist.
        fn: output file name without extension; ".pkl" and ".csv" are appended.
    """
    # Write the frame received as argument. The previous body always wrote the
    # global `img_ds`, so save_pkl_csv(rep_ds, "rep_dataset") stored a copy of
    # the image dataset instead of the report dataset.
    out_fn = join(meta_fld, fn + ".pkl")
    dataframe.to_pickle(out_fn)
    print(f"saved {out_fn}")
    out_fn = join(meta_fld, fn + ".csv")
    dataframe.to_csv(out_fn, sep="\t")
    print(f"saved` {out_fn}")

save_pkl_csv(img_ds, "img_dataset")
save_pkl_csv(rep_ds, "rep_dataset")
print("cell MeSH terms, completed")


In [22]:
# BINARY ENCODING OF THE LABELS
# A U T O T E R M

   ##    #    #   #####   ####
  #  #   #    #     #    #    #
 #    #  #    #     #    #    #
 ######  #    #     #    #    #
 #    #  #    #     #    #    #
 #    #   ####      #     ####



from bs4 import BeautifulSoup
from collections import defaultdict
import numpy as np
import os
import numpy as np
from numpy import count_nonzero as nnz
import pandas as pd
import pickle
from posixpath import join
from tqdm.notebook import tqdm

from lightning.vocabulary_light import Vocabulary

%load_ext autoreload
%autoreload 2
base_path = "/mnt/datasets/uc5/std-dataset"  # subfolders image, text
reports_fld = join(base_path, "text")
images_fld = join(base_path, "image")


# 1: ALL AVAILABLE DATA
meta_fld =  "/mnt/datasets/uc5/meta/iuchest_light"
in_fn = join(meta_fld, "reports_raw2.pkl")
df = pd.read_pickle(in_fn)
print(f"read {in_fn}, shape: {df.shape}")
# display(df.T)

# encode data as binary matrix with 1-hot encoding of labels - still using all available data
# unique combinations of auto terms
auto_term_s = df.auto_term.str.join("/").tolist()
df["auto_term_s"] = auto_term_s
auto_term_s = set(auto_term_s)
print("unique combos of auto terms:", len(auto_term_s))

# unique auto terms
uterms = set()
def add_terms(l: list):
    """Add every term found in `l` to the module-level `uterms` collection.

    Written to be applied row by row to a column holding lists of terms.
    Returns None; the only effect is the update of `uterms`.
    """
    uterms.update(l)
    #<

# A U T O T E R M


df.auto_term.apply(lambda l: add_terms(l))
print("unique auto terms:", len(uterms))

# encode image based binary matrix
uterms = ["normal"] + sorted([l for l in uterms if l != "normal"])
print("number of unique auto terms:", len(uterms))
for i, t in enumerate(uterms):
    print(f"{i}) {t}")

# step build another file indexed by image filenames, this file will be used in training
# Two aligned label matrices are built in one pass over the reports:
#   rep_matrix / rep_index        -> one row per report
#   matrix / img_index / paths    -> one row per image
matrix = []  # rows correspond to images
img_index = []  # image filename of each row of `matrix`
rep_index = []  # report id of each row of `rep_matrix`
paths = []  # image path (under images_fld) of each row of `matrix`
rep_matrix = []  # rows correspond to reports

df["labels"] = df.auto_term
df2 = df.reset_index()
for t in df2.itertuples():
    # membership vector of this report over the ordered term list `uterms`
    enc = [term in t.labels for term in uterms]
    rep_matrix.append(enc)
    rep_index.append(t.id)
    # every image of the report receives the report's encoding
    img_index.extend(t.image_filename)
    paths.extend(join(images_fld, fn) for fn in t.image_filename)
    matrix.extend(enc for _ in t.image_filename)

rep_ds = pd.DataFrame(data=np.array(rep_matrix).astype(int), columns=uterms)
rep_ds["id"] = rep_index
rep_ds.set_index("id", inplace=True)
print(f"dataframe, index is report id: {rep_ds.shape}")
display(rep_ds.head())

img_ds = pd.DataFrame(data=np.array(matrix).astype(int), columns=uterms)
img_ds["filename"] = img_index
img_ds.set_index("filename", inplace=True)
display(img_ds.head())

# remove some labels
def remove_tags(tags, dataf):
    """Drop the label columns in `tags`, then drop the rows left with no label.

    Args:
        tags: column names to remove from `dataf`.
        dataf: DataFrame of 0/1 label indicators, one column per label.

    Returns:
        A new DataFrame without the `tags` columns and without the rows whose
        remaining values sum to 0. The input frame is not modified.

    Raises:
        AssertionError: if the number of rows actually removed differs from
            the number of rows found empty.
    """
    print("removing columns:", tags)
    n_before = dataf.shape[0]
    dataf = dataf.drop(columns=tags)
    # rows whose remaining labels are all zero
    empty_rows = dataf.sum(axis=1) == 0
    n_empty = nnz(empty_rows)
    print(f"removing {n_empty} rows, shape is {dataf.shape}")
    dataf = dataf.drop(index=empty_rows[empty_rows].index)
    # sanity check: exactly the empty rows were dropped
    assert n_before == dataf.shape[0] + n_empty
    print(f"removed {n_empty} rows: ok, shape is {dataf.shape}")
    return dataf

#tags = ["technical quality of image unsatisfactory", "no indexing"]
#img_ds = remove_tags(tags, img_ds)
#rep_ds = remove_tags(tags, rep_ds)

#print(f"img_ds, shape is {img_ds.shape}")
#print(f"rep_ds, shape is {rep_ds.shape}")

#removed_reports = [id for id in df.index if id not in rep_ds.index]
#print(f"id of removed reports {len(removed_reports)}:", removed_reports)
#df3 = df.drop(labels=removed_reports)
#print("new dataframe, rows wrt previous ds2:", df3.shape[0] - df2.shape[0])
#df3.to_pickle(join(meta_fld, "reports_raw3.pkl"))
#print(f"saved: {join(meta_fld, 'reports_raw3.pkl')}")


def save_pkl_csv(dataframe, fn):
    """Save `dataframe` in `meta_fld` as a pickle and as a tab-separated CSV.

    Args:
        dataframe: pandas DataFrame to persist.
        fn: output file name without extension; ".pkl" and ".csv" are appended.
    """
    # Write the frame received as argument. The previous body always wrote the
    # global `img_ds`, so save_pkl_csv(rep_ds, "rep_dataset_auto_light") stored
    # a copy of the image dataset instead of the report dataset.
    out_fn = join(meta_fld, fn + ".pkl")
    dataframe.to_pickle(out_fn)
    print(f"saved {out_fn}")
    out_fn = join(meta_fld, fn + ".csv")
    dataframe.to_csv(out_fn, sep="\t")
    print(f"saved` {out_fn}")

save_pkl_csv(img_ds, "img_dataset_auto_light")
save_pkl_csv(rep_ds, "rep_dataset_auto_light")

print("cell AUTO terms completed")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
read /mnt/datasets/uc5/meta/iuchest_light/reports_raw2.pkl, shape: (3851, 24)
unique combos of auto terms: 1270
unique auto terms: 529
number of unique auto terms: 529
0) normal
1) abdomen
2) abdominal surgery
3) absence of right pulmonary artery
4) absorptiometry, photon
5) acromioclavicular separation
6) acute pneumonia
7) adenopathy
8) adipose tissue
9) air
10) air trapping
11) alveolar edema
12) amputation
13) anchors
14) aneurysm
15) aneurysm, dissecting
16) anterior mediastinal masses
17) anterolisthesis
18) aorta
19) aorta tortuous
20) aorta, abdominal
21) aorta, thoracic
22) aortic aneurysm
23) aortic aneurysm, thoracic
24) aortic atherosclerosis
25) aortic calcifications
26) aortic disease
27) aortic dissection
28) aortic ectasia
29) aortic valve
30) aortic valve replacement
31) apical granuloma
32) artefact
33) arterial abnormality
34) arthritic changes
35) arthritis
36) arthroplasties
37)

Unnamed: 0_level_0,normal,abdomen,abdominal surgery,absence of right pulmonary artery,"absorptiometry, photon",acromioclavicular separation,acute pneumonia,adenopathy,adipose tissue,air,...,venous congestion,venous engorgement,venous hypertension,ventricular hypertrophy,vertebral fracture,vertebroplasty,viral bronchiolitis,viral pneumonias,volume overload,"wounds, gunshot"
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,normal,abdomen,abdominal surgery,absence of right pulmonary artery,"absorptiometry, photon",acromioclavicular separation,acute pneumonia,adenopathy,adipose tissue,air,...,venous congestion,venous engorgement,venous hypertension,ventricular hypertrophy,vertebral fracture,vertebroplasty,viral bronchiolitis,viral pneumonias,volume overload,"wounds, gunshot"
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CXR1_1_IM-0001-3001.png,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CXR1_1_IM-0001-4001.png,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CXR10_IM-0002-1001.png,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CXR10_IM-0002-2001.png,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CXR100_IM-0002-1001.png,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


saved /mnt/datasets/uc5/meta/iuchest_light/img_dataset_auto_light.pkl
saved` /mnt/datasets/uc5/meta/iuchest_light/img_dataset_auto_light.csv
saved /mnt/datasets/uc5/meta/iuchest_light/rep_dataset_auto_light.pkl
saved` /mnt/datasets/uc5/meta/iuchest_light/rep_dataset_auto_light.csv
cell AUTO terms completed


In [15]:
# build vocabulary
# 1) if using auto terms, then consider all text in raw_reports2
# 2) if using mesh terms, then consider all text in raw_reports3. 
#       Raw_reports3 does not contain images with the tags "no indexing" and "technical quality of image unsatisfactory"


 #    #   ####    ####     ##    #####
 #    #  #    #  #    #   #  #   #    #
 #    #  #    #  #       #    #  #####
 #    #  #    #  #       ######  #    #
  #  #   #    #  #    #  #    #  #    #
   ##     ####    ####   #    #  #####

# 1) if using auto terms, then consider all text in raw_reports2
# 2) if using mesh terms, then consider all text in raw_reports3. 



# !   I M P O R T A N T
vocab_for_terms = "auto"  # "mesh" or "auto"


from bs4 import BeautifulSoup
from collections import defaultdict
import numpy as np
import os
import numpy as np
from numpy import count_nonzero as nnz
import pandas as pd
import pickle
from posixpath import join
from tqdm.notebook import tqdm
from nltk import sent_tokenize

from lightning.vocabulary_light import Vocabulary

%load_ext autoreload
%autoreload 2
base_path = "/mnt/datasets/uc5/std-dataset"  # subfolders image, text
reports_fld = join(base_path, "text")
images_fld = join(base_path, "image")


# 1:  reports_raw3: all data - mesh "technical quality of image unsatisfactory", "no indexing"

meta_fld =  "/mnt/datasets/uc5/meta/iuchest_light"

# ! IMPORTANT
in_filename = "reports_raw3.pkl" if vocab_for_terms == "mesh" else "reports_raw2.pkl"
out_filename = "all_vocab.pkl" if vocab_for_terms == "mesh" else "all_vocab_auto.pkl"


in_fn = join(meta_fld, in_filename)
df = pd.read_pickle(in_fn)
display(df.T)
print(f"{in_fn}, shape: {df.shape}")

print("--")

# step, build vocabulary
# Every report text is split into sentences with nltk's sent_tokenize and each
# sentence is passed to vocab.add_sentence.
text_col = df.text
vocab = Vocabulary()
# Series.items() replaces Series.iteritems(), which was deprecated in pandas 1.5
# and removed in pandas 2.0; both yield (index label, value) pairs.
# The loop variable is named rep_id so that the builtin `id` is not shadowed
# for the rest of the kernel session.
for rep_id, text in text_col.items():
    for sentence in sent_tokenize(text):
        if len(sentence) == 0:
            print("ERROR, sentence length == 0")
        vocab.add_sentence(sentence)

print("number of distinct words:", len(vocab.word2idx))
print("total number of words:", vocab.word_count)

out_fn = join(meta_fld, out_filename)
with open(out_fn, "wb") as fout:
    pickle.dump(vocab, fout)


# # step
# # now encode image labels as a matrix of binary vectors
# in_fn = join(meta_fld, "reports_raw2.pkl")
# df = pd.read_pickle(in_fn)
# print(f"read {in_fn}, shape: {df.shape}")
# display(df.head().T)


# label2idx = {}
# idx2label = {}
# for i, t in enumerate(umesh):
#     label2idx[t] = i
#     idx2label[i] = t

# import yaml
# with open(join(out_fld, "label2idx.yaml"), "w") as fout:
#     yaml.dump(label2idx, fout)

# with open(join(out_fld, "idx2label.yaml"), "w") as fout:
#     yaml.dump(idx2label, fout)

# # test
# with open(join(out_fld, "label2idx.yaml"), "r") as fin:
#     label2idx = yaml.safe_load(fin)

# with open(join(out_fld, "idx2label.yaml"), "r") as fin:
#     idx2label = yaml.safe_load(fin)

# for lbl, idx in label2idx.items():
#     print(f"{lbl}: {idx}, {idx2label[idx]}")


print(f"all done, saved vocabulary: {out_fn}")






The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


id,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
comparison,none.,chest radiographs xxxx.,none.,xxxx pa and lateral chest radiographs,none,,,two views of the chest dated xxxx.,none.,,...,two-view chest from xxxx.,rib radiographs dated xxxx.,none,none.,xxxx,"xxxx, xxxx.",,chest x-xxxx dated xxxx at xxxx hours.,none.,none
indication,positive tb test,"xxxx-year-old male, chest pain.",,"xxxx-year-old male, xxxx.","dyspnea, subjective fevers, arthritis, immigra...",history of chest pain,acute bronchitis.,xxxx-year-old with xxxx for one month. history...,pruritic.,",786.05 xxxx xxxx to xxxx",...,xxxx-year-old male with altered mental status.,xxxx-year-old with pneumonia. shortness of bre...,melanoma,possible tuberculosis,xxxx for one xxxx,sp cabg sob no rales,chest pain.,"xxxx-year-old female, transplant workup.",xxxx-year-old xxxx with pain.,chest pain
findings,the cardiac silhouette and mediastinum size ar...,the cardiomediastinal silhouette is within nor...,both lungs are clear and expanded. heart and m...,there is xxxx increased opacity within the rig...,interstitial markings are diffusely prominent ...,,heart size and pulmonary vascularity appear wi...,"the heart, pulmonary xxxx and mediastinum are ...",cardiac and mediastinal contours are within no...,the lungs appear clear. there are no focal air...,...,heart size within normal limits. no focal airs...,the heart size is upper limits of normal. the ...,there are bilateral pulmonary nodules whose ap...,heart size normal. lungs are clear. xxxx are n...,the lungs are clear. the cardiomediastinal sil...,sternotomy sutures and bypass grafts have been...,,calcified mediastinal xxxx. no focal areas of ...,cardiomediastinal silhouette demonstrates norm...,lungs are clear bilaterally. there is no focal...
impression,normal chest x-xxxx.,no acute cardiopulmonary process.,no active disease.,1. increased opacity in the right upper lobe w...,diffuse fibrosis. no visible focal acute disease.,status post left mastectomy. heart size normal...,1. retrocardiac soft tissue density. the appea...,no acute cardiopulmonary disease.,no acute findings.,no acute cardiopulmonary disease.,...,no acute cardiopulmonary findings.,minimal perihilar opacity which could indicate...,1. bilateral pulmonary nodules suggesting pulm...,normal chest no evidence of tuberculosis.,negative chest .,post operative chest with no acute disease.,heart size is normal and lungs are clear. no p...,no acute cardiopulmonary abnormality. .,no acute cardiopulmonary abnormality.,no acute cardiopulmonary abnormality.
orig_mesh_terms,[normal],[calcified granuloma/lung/upper lobe/right],[normal],"[opacity/lung/upper lobe/right, pulmonary atel...",[markings/lung/bilateral/interstitial/diffuse/...,[mastectomy/left],"[density/retrocardiac, calcinosis/blood vessel...","[aorta/tortuous, shoulder/bilateral/degenerati...",[normal],[normal],...,[normal],[opacity/lung/hilum/streaky/mild],"[nodule/lung/bilateral/multiple, surgical inst...",[normal],[normal],[sutures/sternum],[normal],"[calcinosis/mediastinum, thoracic vertebrae/de...","[atherosclerosis/aorta, thoracic, aorta, thora...",[normal]
orig_auto_term,[normal],[calcified granuloma],[normal],"[atelectases, mass lesion, opacity, atelectasi...","[diffuse fibrosis, fibrosis, pulmonary fibrosis]","[mastectomies, mastectomy, surgery]","[atelectases, calcified granuloma, hiatal hern...",[degenerative change],[normal],[normal],...,[normal],"[atypical pneumonias, opacity]","[metastatic disease, nodule, opacity, pulmonar...",[normal],[normal],"[bypass grafts, sternotomy, coronary artery by...",[normal],[degenerative change],"[atheroscleroses, degenerative disc diseases, ...",[normal]
image_filename,"[CXR1_1_IM-0001-3001.png, CXR1_1_IM-0001-4001....","[CXR10_IM-0002-1001.png, CXR10_IM-0002-2001.png]","[CXR100_IM-0002-1001.png, CXR100_IM-0002-2001....","[CXR1000_IM-0003-1001.png, CXR1000_IM-0003-200...","[CXR1001_IM-0004-1001.png, CXR1001_IM-0004-100...","[CXR1002_IM-0004-1001.png, CXR1002_IM-0004-200...",[CXR1003_IM-0005-2002.png],"[CXR1004_IM-0005-1001.png, CXR1004_IM-0005-200...","[CXR1005_IM-0006-1001.png, CXR1005_IM-0006-300...","[CXR1006_IM-0007-1001.png, CXR1006_IM-0007-300...",...,"[CXR990_IM-2476-1001.png, CXR990_IM-2476-2001....","[CXR991_IM-2476-1001.png, CXR991_IM-2476-2001....","[CXR992_IM-2477-0001-0001.png, CXR992_IM-2477-...","[CXR993_IM-2478-1001.png, CXR993_IM-2478-1002....","[CXR994_IM-2478-1001.png, CXR994_IM-2478-2001....","[CXR995_IM-2478-1001.png, CXR995_IM-2478-1002....","[CXR996_IM-2479-1001.png, CXR996_IM-2479-2001....","[CXR997_IM-2479-1001.png, CXR997_IM-2479-2001....","[CXR998_IM-2479-1001.png, CXR998_IM-2479-2001....","[CXR999_IM-2480-1001.png, CXR999_IM-2480-2001...."
filename,1.xml,10.xml,100.xml,1000.xml,1001.xml,1002.xml,1003.xml,1004.xml,1005.xml,1006.xml,...,990.xml,991.xml,992.xml,993.xml,994.xml,995.xml,996.xml,997.xml,998.xml,999.xml
n_images,2,2,2,3,2,2,1,2,2,2,...,2,2,2,2,2,2,2,2,2,3
n_orig_mesh_terms,1,1,1,3,2,1,5,4,1,1,...,1,1,3,1,1,1,1,2,3,1


/mnt/datasets/uc5/meta/iuchest_light/reports_raw2.pkl, shape: (3851, 24)
--
number of distinct words: 2674
total number of words: 143372
all done, saved vocabulary: /mnt/datasets/uc5/meta/iuchest_light/all_vocab_auto.pkl


In [17]:
# independent
# process vocabulary MAX WORDS

#     #    #    #     #         #     # ####### ######  ######   #####
##   ##   # #    #   #          #  #  # #     # #     # #     # #     #
# # # #  #   #    # #           #  #  # #     # #     # #     # #
#  #  # #     #    #            #  #  # #     # ######  #     #  #####
#     # #######   # #           #  #  # #     # #   #   #     #       #
#     # #     #  #   #          #  #  # #     # #    #  #     # #     #
#     # #     # #     #          ## ##  ####### #     # ######   #####


# ! important
vocab_for_terms = "auto"  # "mesh" or "auto"


from bs4 import BeautifulSoup
from collections import defaultdict
import numpy as np
import os
import numpy as np
from numpy import count_nonzero as nnz
import pandas as pd
import pickle
from posixpath import join
from tqdm.notebook import tqdm

from lightning.vocabulary_light import Vocabulary

%load_ext autoreload
%autoreload 2
base_path = "/mnt/datasets/uc5/std-dataset"  # subfolders image, text
reports_fld = join(base_path, "text")
images_fld = join(base_path, "image")

# dataset
meta_fld =  "/mnt/datasets/uc5/meta/iuchest_light"
in_filename = "reports_raw3.pkl" if vocab_for_terms == "mesh" else "reports_raw2.pkl"
vocab_filename = "all_vocab" if vocab_for_terms == "mesh" else "all_vocab_auto"

in_fn = join(meta_fld, in_filename)
df = pd.read_pickle(in_fn)
print(f"read {in_fn}, shape: {df.shape}")
# display(df.T)
# read vocabulary
with open( join(meta_fld, vocab_filename + ".pkl"), "rb") as fin:
    vocab = pickle.load(fin)

print("(all) num words:", len(vocab.word2idx))
print("(all) word count:", vocab.word_count)

sel_words = []
# Number of most frequent words to keep. This single value now drives the
# coverage estimate, the truncation and the output file name (it was previously
# repeated as a literal 1000). Cells that load "<vocab_filename>_1000.pkl"
# expect it to stay at 1000.
n_words = 1000
# (word, count) pairs sorted by decreasing absolute frequency in the dataset
wc = sorted(vocab.word2count.items(), key=lambda elem: -elem[1])

# total number of word occurrences before filtering
n = sum(count for _, count in wc)
print("(all) total number of words (summed):", n)

# occurrences covered by the n_words most frequent words
wc2 = wc[:n_words]
n2 = sum(count for _, count in wc2)

print("(all) n:", n)
print("(filt) n2:", n2)

print("coverage after filtering:", n2 / n)
print("diff in word count:", n2 - n)
vocab.keep_n_words(n_words)
print("saved word count:", vocab.word_count)
out_vocab_fn = join(meta_fld, f"{vocab_filename}_{n_words}.pkl")
with open(out_vocab_fn, "wb") as fout:
    pickle.dump(vocab, fout)
print("all done, saved:", out_vocab_fn)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
read /mnt/datasets/uc5/meta/iuchest_light/reports_raw2.pkl, shape: (3851, 24)
(all) num words: 2674
(all) word count: 143372
(all) total number of words (summed): 143372
(all) n: 143372
(filt) n2: 140148
coverage after filtering: 0.9775130429930531
diff in word count: -3224
(vocabulary) initial word count (total): 143372
(vocabulary) initial number of words: 2670
(vocabulary) after iterating with add_word number of words: 1000
(vocabulary) final word_count (total):  140148
(vocabulary) final number of words: 1000
saved word count: 140148
all done, saved: /mnt/datasets/uc5/meta/iuchest_light/all_vocab_auto_1000.pkl


<font color="yellow">experiments</font>

In [25]:
# independent: this cell can be run on its own
# 2: keep only label with a minimum frequency, select words and encode text
from bs4 import BeautifulSoup
from collections import defaultdict
import numpy as np
import os
import numpy as np
from numpy import count_nonzero as nnz
import pandas as pd
import pickle
from posixpath import join
from tqdm.notebook import tqdm
import yaml

from lightning.vocabulary_light import Vocabulary

#! IMPORTANT labels
LABELS = "auto"  # "mesh" or "auto"
# min frequency
# min_freq = 130 for mesh
min_freq = 100
# ! important exp_fld
exp_fld = "/opt/uc5/results/experiments_lightning/wp6_lt"
import os
os.makedirs(exp_fld, exist_ok=True)



meta_fld = "/mnt/datasets/uc5/meta/iuchest_light"
reports_fn = "reports_raw2.pkl" if LABELS == "auto" else "reports_raw3.pkl"
# * dataset
ds = pd.read_pickle(join(meta_fld, reports_fn))
print(f"read {reports_fn}, shape: {ds.shape}")
display(ds.head())


# Input files for the selected label set.
# For "mesh" the image dataset is the one written by the MeSH-terms cell
# (save_pkl_csv(img_ds, "img_dataset")); the previous fallback pointed at an
# auto-term file ("img_dataset_auto.pkl"), which would pair MeSH reports with
# auto-term labels.
img_ds_fn = "img_dataset_auto_light.pkl" if LABELS == "auto" else "img_dataset.pkl"
in_vocab_fn = "all_vocab_auto_1000.pkl" if LABELS == "auto" else "all_vocab_1000.pkl"


img_ds = pd.read_pickle(join(meta_fld, img_ds_fn))
# rep_ds = pd.read_pickle(join(meta_fld, "rep_dataset.pkl"))
display(img_ds.head().T)

# Keep only the labels that appear on at least `min_freq` images; all the other
# labels are collapsed into a single catch-all "misc" column.
# per-label sum over all images
counts = img_ds.sum(axis=0)
iii = counts >= min_freq
print(f"{nnz(iii)} labels have at least {min_freq} freq")

# both selections are taken before "misc" is added, so they cover original labels only
keep_labels = img_ds.columns[iii]
drop_labels = img_ds.columns[~iii]
img_ds["misc"] = 0
# images carrying at least one of the labels about to be dropped
misc_iii = img_ds[drop_labels].sum(axis=1) > 0
img_ds.loc[misc_iii, "misc"] = 1
# NOTE: img_ds is modified in place; re-running this cell requires reloading it
img_ds.drop(columns=drop_labels, inplace=True)
# display(img_ds)

def save_label_indexes(df, out_fld):
    """Write the label <-> column-position mappings of `df` as YAML files.

    Prints one "<position>) <label>" line per column of `df`, then saves
    `label2idx.yaml` (label -> position) and `idx2label.yaml`
    (position -> label) inside `out_fld`.
    """
    numbered = list(enumerate(df.columns))
    for pos, label in numbered:
        print(f"{pos}) {label}")
    lab2idx = {label: pos for pos, label in numbered}
    idx2lab = dict(numbered)

    with open(join(out_fld, "label2idx.yaml"), "w") as fout:
        yaml.safe_dump(lab2idx, fout)
    print(f"saved {join(out_fld, 'label2idx.yaml')}")
    print(f"lab2idx with {len(lab2idx)} labels")

    with open(join(out_fld, "idx2label.yaml"), "w") as fout:
        yaml.safe_dump(idx2lab, fout)
    print(f"saved {join(out_fld, 'idx2label.yaml')}")
#<

save_label_indexes(img_ds, exp_fld)
img_ds.to_pickle(join(exp_fld, "img_dataset.pkl"))
print(f"saved {join(exp_fld, 'img_dataset.pkl')}")

# now the text
with open(join(meta_fld, in_vocab_fn), "rb") as fin:
    vocab = pickle.load(fin)
with open(join(exp_fld, "vocab.pkl"), "wb") as fout:
    pickle.dump(vocab, fout)
print(f"vocabulary saved in exp folder, {join(exp_fld, 'vocab.pkl')}")
print(f"num words: {len(vocab.word2idx)}, including special token")

# build aux dataset with columns: report, image filename, text, encoded text


def encode_text(sentences, vocab):
    """Encode a text as a list of sentences, each a list of word indexes.

    The text is split on "." and every chunk is split on whitespace. Tokens
    missing from `vocab.word2idx` are mapped to `Vocabulary.OOV`. Chunks
    without tokens (e.g. what follows the final ".") yield an empty list.

    Args:
        sentences: the text to encode, as a single string.
        vocab: object exposing a `word2idx` mapping from token to index.

    Returns:
        A list with one list of indexes per "."-separated chunk.
    """
    return [
        [vocab.word2idx.get(token, Vocabulary.OOV) for token in chunk.strip().split()]
        for chunk in sentences.split(".")
    ]


# Expand the report-level table to one row per image: each image filename is
# paired with the id and the text of the report it belongs to.
rep_ids = []
image_filenames = []
texts = []
for row in ds.reset_index().itertuples():
    for fn in row.image_filename:
        rep_ids.append(row.id)
        image_filenames.append(fn)
        texts.append(row.text)
# Column names and order are fixed by the constructor. The former
# `pd.columns = [...]` statement only attached a stray attribute to the pandas
# module and never touched this frame, so it has been dropped.
img_text_ds = pd.DataFrame(
    {"id": rep_ids, "image_filename": image_filenames, "text": texts}
)
display(img_text_ds.head())

enc_text = img_text_ds.text.apply(encode_text, args=(vocab,))
img_text_ds["enc_text"] = enc_text

display(img_text_ds.head())
out_fn = "img_text_dataset.pkl"
img_text_ds.to_pickle(join(exp_fld, out_fn))
print(f"saved {join(exp_fld, out_fn)}")


print("done")

read reports_raw2.pkl, shape: (3851, 24)


Unnamed: 0_level_0,comparison,indication,findings,impression,orig_mesh_terms,orig_auto_term,image_filename,filename,n_images,n_orig_mesh_terms,...,len_findings,nsents_impression,nsents_findings,raw_text,len_raw_text,nsents_raw_text,major_mesh,n_major_mesh,auto_term,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,none.,positive tb test,the cardiac silhouette and mediastinum size ar...,normal chest x-xxxx.,[normal],[normal],"[CXR1_1_IM-0001-3001.png, CXR1_1_IM-0001-4001....",1.xml,2,1,...,210,1,5,the cardiac silhouette and mediastinum size ar...,231,6,[normal],1,[normal],the cardiac silhouette and mediastinum size ar...
10,chest radiographs xxxx.,"xxxx-year-old male, chest pain.",the cardiomediastinal silhouette is within nor...,no acute cardiopulmonary process.,[calcified granuloma/lung/upper lobe/right],[calcified granuloma],"[CXR10_IM-0002-1001.png, CXR10_IM-0002-2001.png]",10.xml,2,1,...,273,1,4,the cardiomediastinal silhouette is within nor...,307,5,[calcified granuloma],1,[calcified granuloma],the cardiomediastinal silhouette is within nor...
100,none.,,both lungs are clear and expanded. heart and m...,no active disease.,[normal],[normal],"[CXR100_IM-0002-1001.png, CXR100_IM-0002-2001....",100.xml,2,1,...,64,1,2,both lungs are clear and expanded. heart and m...,83,3,[normal],1,[normal],both lungs are clear and expanded. heart and m...
1000,xxxx pa and lateral chest radiographs,"xxxx-year-old male, xxxx.",there is xxxx increased opacity within the rig...,1. increased opacity in the right upper lobe w...,"[opacity/lung/upper lobe/right, pulmonary atel...","[atelectases, mass lesion, opacity, atelectasi...","[CXR1000_IM-0003-1001.png, CXR1000_IM-0003-200...",1000.xml,3,3,...,357,3,5,there is xxxx increased opacity within the rig...,626,8,"[pulmonary atelectasis, opacity]",2,"[atelectasis, mass lesion, opacity, rib]",there is xxxx increased opacity within the rig...
1001,none,"dyspnea, subjective fevers, arthritis, immigra...",interstitial markings are diffusely prominent ...,diffuse fibrosis. no visible focal acute disease.,[markings/lung/bilateral/interstitial/diffuse/...,"[diffuse fibrosis, fibrosis, pulmonary fibrosis]","[CXR1001_IM-0004-1001.png, CXR1001_IM-0004-100...",1001.xml,2,2,...,113,2,3,interstitial markings are diffusely prominent ...,163,5,"[markings, fibrosis]",2,"[diffuse fibrosis, fibrosis, pulmonary fibrosis]",interstitial markings are diffusely prominent ...


filename,CXR1_1_IM-0001-3001.png,CXR1_1_IM-0001-4001.png,CXR10_IM-0002-1001.png,CXR10_IM-0002-2001.png,CXR100_IM-0002-1001.png
normal,1,1,0,0,1
abdomen,0,0,0,0,0
abdominal surgery,0,0,0,0,0
absence of right pulmonary artery,0,0,0,0,0
"absorptiometry, photon",0,0,0,0,0
...,...,...,...,...,...
vertebroplasty,0,0,0,0,0
viral bronchiolitis,0,0,0,0,0
viral pneumonias,0,0,0,0,0
volume overload,0,0,0,0,0


20 labels have at least 100 freq
0) normal
1) atelectasis
2) calcified granuloma
3) cardiomegaly
4) deformity
5) degenerative change
6) diaphragm
7) emphysema
8) granuloma
9) granulomatous disease
10) infiltrates
11) nodule
12) opacity
13) osteophyte
14) pleural effusion
15) pneumonia
16) pulmonary emphysema
17) scarring
18) sternotomy
19) thoracic aorta
20) misc
saved /opt/uc5/results/experiments_lightning/auto_terms/label2idx.yaml
lab2idx with 21 labels
saved /opt/uc5/results/experiments_lightning/auto_terms/idx2label.yaml
saved /opt/uc5/results/experiments_lightning/auto_terms/img_dataset.pkl
vocabulary saved in exp folder, /opt/uc5/results/experiments_lightning/auto_terms/vocab.pkl
num words: 1004, including special token


Unnamed: 0,id,image_filename,text
0,1,CXR1_1_IM-0001-3001.png,the cardiac silhouette and mediastinum size ar...
1,1,CXR1_1_IM-0001-4001.png,the cardiac silhouette and mediastinum size ar...
2,10,CXR10_IM-0002-1001.png,the cardiomediastinal silhouette is within nor...
3,10,CXR10_IM-0002-2001.png,the cardiomediastinal silhouette is within nor...
4,100,CXR100_IM-0002-1001.png,both lungs are clear and expanded. heart and m...


Unnamed: 0,id,image_filename,text,enc_text
0,1,CXR1_1_IM-0001-3001.png,the cardiac silhouette and mediastinum size ar...,"[[5, 61, 39, 9, 53, 19, 7, 22, 14, 66], [15, 6..."
1,1,CXR1_1_IM-0001-4001.png,the cardiac silhouette and mediastinum size ar...,"[[5, 61, 39, 9, 53, 19, 7, 22, 14, 66], [15, 6..."
2,10,CXR10_IM-0002-1001.png,the cardiomediastinal silhouette is within nor...,"[[5, 41, 39, 6, 22, 14, 66, 50, 19, 9, 87], [5..."
3,10,CXR10_IM-0002-2001.png,the cardiomediastinal silhouette is within nor...,"[[5, 41, 39, 6, 22, 14, 66, 50, 19, 9, 87], [5..."
4,100,CXR100_IM-0002-1001.png,both lungs are clear and expanded. heart and m...,"[[138, 17, 7, 40, 9, 434], [16, 9, 53, 14], [4..."


saved /opt/uc5/results/experiments_lightning/auto_terms/img_text_dataset.pkl
done


In [None]:
# EXP DEPENDENT



In [6]:
# ! BINARY NORMAL VS REST - UNBALANCED
# 2 labels
# independent: this cell can be run on its own
# 2: keep only label with a minimum frequency, select words and encode text
from bs4 import BeautifulSoup
from collections import defaultdict
import numpy as np
import os
import numpy as np
from numpy import count_nonzero as nnz
import pandas as pd
import pickle
from posixpath import join
from tqdm.notebook import tqdm
import yaml
import os
import iuchest.reports as iureports
from utils.vocabulary import Vocabulary

#! IMPORTANT
LABELS = "mesh"  # "mesh" or "auto"
EXP_FLD = "/opt/uc5/results/sicaai/normal_vs_rest_unbal"
os.makedirs(EXP_FLD, exist_ok=True)

meta_fld = "/mnt/datasets/uc5/meta/iuchest"
reports_fn = "reports_raw3.pkl" if LABELS == "mesh" else "reports_raw2.pkl"
img_ds_fn = "img_dataset.pkl" if LABELS == "mesh" else "img_dataset_auto.pkl"
in_vocab_fn = "all_vocab_1000.pkl" if LABELS == "mesh" else "all_vocab_auto_1000.pkl"

# * dataset
ds = pd.read_pickle(join(meta_fld, reports_fn))
print(f"read {reports_fn}, shape: {ds.shape}")
display(ds.head())


# !
# ! IMAGES
# !

img_ds = pd.read_pickle(join(meta_fld, img_ds_fn))
# rep_ds = pd.read_pickle(join(meta_fld, "rep_dataset.pkl"))
display(img_ds.head().T)

# normal images
# normal_ds = img_ds[img_ds.normal == True]

drop_cols = [c for c in img_ds.columns if c != "normal"]
img_ds = img_ds.drop(columns=drop_cols)
img_ds["other"] = (img_ds.normal != True).astype(int)

display(img_ds.head().T)
ofn = join(EXP_FLD, "img_dataset.pkl")
img_ds.to_pickle( ofn )
print(f"saved image-based ds: {ofn}, {img_ds.shape}")

def save_label_indexes(df, out_fld):
    """Write the label <-> index mappings derived from the columns of `df`.

    Each column of `df` is treated as a label and its position as the label
    index. Two YAML files are written into `out_fld`:
    "label2idx.yaml" (label -> index) and "idx2label.yaml" (index -> label).
    The mapping and the written paths are also printed.

    Args:
        df: object exposing a `columns` iterable (e.g. a DataFrame).
        out_fld: existing folder where the two YAML files are created.
    """
    idx2lab = dict(enumerate(df.columns))
    lab2idx = {label: idx for idx, label in idx2lab.items()}
    for idx, label in idx2lab.items():
        print(f"{idx}) {label}")

    lab2idx_path = join(out_fld, "label2idx.yaml")
    with open(lab2idx_path, "w") as fout:
        yaml.safe_dump(lab2idx, fout)
    print(f"saved {lab2idx_path}")
    print(f"lab2idx with {len(lab2idx)} labels")

    idx2lab_path = join(out_fld, "idx2label.yaml")
    with open(idx2lab_path, "w") as fout:
        yaml.safe_dump(idx2lab, fout)
    print(f"saved {idx2lab_path}")
#<

# Persist the label <-> index mappings of the two-column image dataset.
save_label_indexes(img_ds, EXP_FLD)

# !
# !  TEXT
# !
# NOTE: pickle.load can execute arbitrary code; only load vocabulary files
# that were produced by this project.
with open(join(meta_fld, in_vocab_fn), "rb") as fin:
    vocab = pickle.load(fin)
# copy vocab to EXP_FLD
with open(join(EXP_FLD, "vocab.pkl"), "wb") as fout:
    pickle.dump(vocab, fout)
print(f"num words: {len(vocab.word2idx)}, including special tokens")

# Show the first entries of the vocabulary.
for i in range(5):
    print(f"word {i}: {vocab.idx2word[i]}")

# Expand the report-level table to one row per image: every image filename
# listed in a report is paired with that report's id and text.
rep_ids = []
image_filenames = []
texts = []
for row in ds.reset_index().itertuples():
    for fn in row.image_filename:
        rep_ids.append(row.id)
        image_filenames.append(fn)
        texts.append(row.text)

# Build the frame in one step with an explicit column order. The previous
# code ended with `pd.columns = [...]`, which set an attribute on the pandas
# module itself and had no effect on the DataFrame.
img_text_ds = pd.DataFrame(
    {"id": rep_ids, "image_filename": image_filenames, "text": texts},
    columns=["id", "image_filename", "text"],
)

def encode_text(sentences, vocab):
    """Encode a report as a list of sentences, each a list of word indexes.

    The text is split into sentences on "." and each sentence into
    whitespace-separated tokens. Every token is looked up in
    `vocab.word2idx`; tokens that are missing map to `Vocabulary.OOV`.
    A sentence without tokens (for instance the piece after a trailing ".")
    is kept as an empty list.

    Args:
        sentences: the report text as a single string.
        vocab: object exposing a `word2idx` mapping.

    Returns:
        A list with one list of encoded tokens per sentence.
    """
    return [
        [vocab.word2idx.get(token, Vocabulary.OOV) for token in sentence.split()]
        for sentence in sentences.split(".")
    ]
# Encode every report text with the vocabulary and store the result next to
# the raw text.
enc_text = img_text_ds["text"].apply(lambda report: encode_text(report, vocab))
img_text_ds["enc_text"] = enc_text
display(img_text_ds.head())

# Save the per-image text dataset into the experiment folder.
ofn = join(EXP_FLD, "img_text_dataset.pkl")
img_text_ds.to_pickle(ofn)
print("saved aux dataset for text:", ofn)
print("done with 'normal vs others', unbalanced")

read reports_raw3.pkl, shape: (3753, 26)


Unnamed: 0_level_0,comparison,indication,findings,impression,orig_mesh_terms,orig_auto_term,image_filename,filename,n_images,n_orig_mesh_terms,...,nsents_findings,raw_text,len_raw_text,nsents_raw_text,major_mesh,n_major_mesh,auto_term,text,mesh_term_s,labels
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,none.,positive tb test,the cardiac silhouette and mediastinum size ar...,normal chest x-xxxx.,[normal],[normal],"[CXR1_1_IM-0001-3001.png, CXR1_1_IM-0001-4001....",1.xml,2,1,...,5,the cardiac silhouette and mediastinum size ar...,231,6,[normal],1,[normal],the cardiac silhouette and mediastinum size ar...,normal,[normal]
10,chest radiographs xxxx.,"xxxx-year-old male, chest pain.",the cardiomediastinal silhouette is within nor...,no acute cardiopulmonary process.,[calcified granuloma/lung/upper lobe/right],[calcified granuloma],"[CXR10_IM-0002-1001.png, CXR10_IM-0002-2001.png]",10.xml,2,1,...,4,the cardiomediastinal silhouette is within nor...,307,5,[calcified granuloma],1,[calcified granuloma],the cardiomediastinal silhouette is within nor...,calcified granuloma/lung/upper lobe/right,[calcified granuloma]
100,none.,,both lungs are clear and expanded. heart and m...,no active disease.,[normal],[normal],"[CXR100_IM-0002-1001.png, CXR100_IM-0002-2001....",100.xml,2,1,...,2,both lungs are clear and expanded. heart and m...,83,3,[normal],1,[normal],both lungs are clear and expanded. heart and m...,normal,[normal]
1000,xxxx pa and lateral chest radiographs,"xxxx-year-old male, xxxx.",there is xxxx increased opacity within the rig...,1. increased opacity in the right upper lobe w...,"[opacity/lung/upper lobe/right, pulmonary atel...","[atelectases, mass lesion, opacity, atelectasi...","[CXR1000_IM-0003-1001.png, CXR1000_IM-0003-200...",1000.xml,3,3,...,5,there is xxxx increased opacity within the rig...,626,8,"[pulmonary atelectasis, opacity]",2,"[atelectasis, mass lesion, opacity, rib]",there is xxxx increased opacity within the rig...,opacity/lung/upper lobe/right/pulmonary atelec...,"[pulmonary atelectasis, opacity]"
1001,none,"dyspnea, subjective fevers, arthritis, immigra...",interstitial markings are diffusely prominent ...,diffuse fibrosis. no visible focal acute disease.,[markings/lung/bilateral/interstitial/diffuse/...,"[diffuse fibrosis, fibrosis, pulmonary fibrosis]","[CXR1001_IM-0004-1001.png, CXR1001_IM-0004-100...",1001.xml,2,2,...,3,interstitial markings are diffusely prominent ...,163,5,"[fibrosis, markings]",2,"[diffuse fibrosis, fibrosis, pulmonary fibrosis]",interstitial markings are diffusely prominent ...,markings/lung/bilateral/interstitial/diffuse/p...,"[fibrosis, markings]"


filename,CXR1_1_IM-0001-3001.png,CXR1_1_IM-0001-4001.png,CXR10_IM-0002-1001.png,CXR10_IM-0002-2001.png,CXR100_IM-0002-1001.png
normal,1,1,0,0,1
abdomen,0,0,0,0,0
adipose tissue,0,0,0,0,0
airspace disease,0,0,0,0,0
aorta,0,0,0,0,0
...,...,...,...,...,...
trachea,0,0,0,0,0
"trachea, carina",0,0,0,0,0
"tube, inserted",0,0,0,0,0
tuberculosis,0,0,0,0,0


filename,CXR1_1_IM-0001-3001.png,CXR1_1_IM-0001-4001.png,CXR10_IM-0002-1001.png,CXR10_IM-0002-2001.png,CXR100_IM-0002-1001.png
normal,1,1,0,0,1
other,0,0,1,1,0


saved image-based ds: /opt/uc5/results/sicaai/normal_vs_rest_unbal/img_dataset.pkl, (7284, 2)
0) normal
1) other
saved /opt/uc5/results/sicaai/normal_vs_rest_unbal/label2idx.yaml
lab2idx with 2 labels
saved /opt/uc5/results/sicaai/normal_vs_rest_unbal/idx2label.yaml
num words: 1004, including special tokens
word 0: <pad>
word 1: <oov>
word 2: <bos>
word 3: <eos>
word 4: no


Unnamed: 0,id,image_filename,text,enc_text
0,1,CXR1_1_IM-0001-3001.png,the cardiac silhouette and mediastinum size ar...,"[[5, 62, 39, 9, 55, 19, 7, 21, 13, 66], [15, 6..."
1,1,CXR1_1_IM-0001-4001.png,the cardiac silhouette and mediastinum size ar...,"[[5, 62, 39, 9, 55, 19, 7, 21, 13, 66], [15, 6..."
2,10,CXR10_IM-0002-1001.png,the cardiomediastinal silhouette is within nor...,"[[5, 40, 39, 6, 21, 13, 66, 50, 19, 9, 87], [5..."
3,10,CXR10_IM-0002-2001.png,the cardiomediastinal silhouette is within nor...,"[[5, 40, 39, 6, 21, 13, 66, 50, 19, 9, 87], [5..."
4,100,CXR100_IM-0002-1001.png,both lungs are clear and expanded. heart and m...,"[[186, 17, 7, 41, 9, 432], [16, 9, 55, 13], [4..."


saved aux dataset for text: /opt/uc5/results/sicaai/normal_vs_rest_unbal/img_text_dataset.pkl
done with 'normal vs others', unbalanced


In [7]:
#! BINARY NORMAL VS REST - BALANCED

# independent: can be run on its own
# 2: keep only label with a minimum frequency, select words and encode text
from bs4 import BeautifulSoup
from collections import defaultdict
import numpy as np
import os
import numpy as np
from numpy import count_nonzero as nnz
import pandas as pd
import pickle
from posixpath import join
from tqdm.notebook import tqdm
import yaml
import os
import iuchest.reports as iureports
from utils.vocabulary import Vocabulary

#! IMPORTANT
# Experiment configuration: LABELS selects which set of input files is read,
# EXP_FLD is where every artifact produced by this cell is written.
LABELS = "mesh"  # "mesh" or "auto"
EXP_FLD = "/opt/uc5/results/sicaai/normal_vs_rest_bal"
# Seed for the class-balancing subsample below, so that re-running the cell
# regenerates the same img_dataset.pkl instead of a new random subset.
BALANCE_SEED = 42
os.makedirs(EXP_FLD, exist_ok=True)

meta_fld = "/mnt/datasets/uc5/meta/iuchest"
reports_fn = "reports_raw3.pkl" if LABELS == "mesh" else "reports_raw2.pkl"
img_ds_fn = "img_dataset.pkl" if LABELS == "mesh" else "img_dataset_auto.pkl"
in_vocab_fn = "all_vocab_1000.pkl" if LABELS == "mesh" else "all_vocab_auto_1000.pkl"

# * dataset: report-level table
ds = pd.read_pickle(join(meta_fld, reports_fn))
print(f"read {reports_fn}, shape: {ds.shape}")
display(ds.head())


# !
# ! IMAGES
# !

# Image-level label table; shown transposed so that labels appear as rows.
img_ds = pd.read_pickle(join(meta_fld, img_ds_fn))
display(img_ds.head().T)

# Count the two classes and use the size of the smaller one for both.
iii = img_ds.normal == True
print(f"num normal: {nnz(iii)}")
print(f"num rest: {nnz(~iii)}")
half = min([nnz(iii), nnz(~iii)])
print(f"balanced, {half} examples per class")

# Draw `half` rows from each class. The sample is seeded: without
# random_state a different subset was saved on every execution.
# The result is grouped by class (not shuffled).
balanced = img_ds.groupby("normal").sample(half, random_state=BALANCE_SEED)
print(f"balanced, {len(balanced)} examples")

assert nnz(balanced.normal == True) == nnz(balanced.normal == False)
img_ds = balanced

# Collapse the label set to two columns: keep "normal" as it is and add
# "other" as its complement (1 wherever "normal" is not True).
drop_cols = [c for c in img_ds.columns if c != "normal"]
img_ds = img_ds.drop(columns=drop_cols)
img_ds["other"] = (img_ds.normal != True).astype(int)
display(img_ds.head().T)
ofn = join(EXP_FLD, "img_dataset.pkl")
img_ds.to_pickle(ofn)
print(f"saved image-based ds: {ofn}")

def save_label_indexes(df, out_fld):
    """Write the label <-> index mappings derived from the columns of `df`.

    Each column of `df` is treated as a label and its position as the label
    index. Two YAML files are written into `out_fld`:
    "label2idx.yaml" (label -> index) and "idx2label.yaml" (index -> label).
    The mapping and the written paths are also printed.

    Args:
        df: object exposing a `columns` iterable (e.g. a DataFrame).
        out_fld: existing folder where the two YAML files are created.
    """
    idx2lab = dict(enumerate(df.columns))
    lab2idx = {label: idx for idx, label in idx2lab.items()}
    for idx, label in idx2lab.items():
        print(f"{idx}) {label}")

    lab2idx_path = join(out_fld, "label2idx.yaml")
    with open(lab2idx_path, "w") as fout:
        yaml.safe_dump(lab2idx, fout)
    print(f"saved {lab2idx_path}")
    print(f"lab2idx with |labels|: {len(lab2idx)}")

    idx2lab_path = join(out_fld, "idx2label.yaml")
    with open(idx2lab_path, "w") as fout:
        yaml.safe_dump(idx2lab, fout)
    print(f"saved {idx2lab_path}")
#<

# Persist the label <-> index mappings of the two-column image dataset.
save_label_indexes(img_ds, EXP_FLD)

# !
# !  TEXT
# !
# NOTE: pickle.load can execute arbitrary code; only load vocabulary files
# that were produced by this project.
with open(join(meta_fld, in_vocab_fn), "rb") as fin:
    vocab = pickle.load(fin)
# copy vocab to EXP_FLD
with open(join(EXP_FLD, "vocab.pkl"), "wb") as fout:
    pickle.dump(vocab, fout)
print(f"num words: {len(vocab.word2idx)}, including special tokens")

# Show the first entries of the vocabulary.
for i in range(5):
    print(f"word {i}: {vocab.idx2word[i]}")

# Expand the report-level table to one row per image: every image filename
# listed in a report is paired with that report's id and text.
rep_ids = []
image_filenames = []
texts = []
for row in ds.reset_index().itertuples():
    for fn in row.image_filename:
        rep_ids.append(row.id)
        image_filenames.append(fn)
        texts.append(row.text)

# Build the frame in one step with an explicit column order. The previous
# code ended with `pd.columns = [...]`, which set an attribute on the pandas
# module itself and had no effect on the DataFrame.
img_text_ds = pd.DataFrame(
    {"id": rep_ids, "image_filename": image_filenames, "text": texts},
    columns=["id", "image_filename", "text"],
)

def encode_text(sentences, vocab):
    """Encode a report as a list of sentences, each a list of word indexes.

    The text is split into sentences on "." and each sentence into
    whitespace-separated tokens. Every token is looked up in
    `vocab.word2idx`; tokens that are missing map to `Vocabulary.OOV`.
    A sentence without tokens (for instance the piece after a trailing ".")
    is kept as an empty list.

    Args:
        sentences: the report text as a single string.
        vocab: object exposing a `word2idx` mapping.

    Returns:
        A list with one list of encoded tokens per sentence.
    """
    return [
        [vocab.word2idx.get(token, Vocabulary.OOV) for token in sentence.split()]
        for sentence in sentences.split(".")
    ]
# Encode every report text with the vocabulary and store the result next to
# the raw text.
enc_text = img_text_ds["text"].apply(lambda report: encode_text(report, vocab))
img_text_ds["enc_text"] = enc_text
display(img_text_ds.head())

# Save the per-image text dataset into the experiment folder.
ofn = join(EXP_FLD, "img_text_dataset.pkl")
img_text_ds.to_pickle(ofn)
print("saved aux dataset for text:", ofn)

print("done with 'normal vs rest', balanced")

read reports_raw3.pkl, shape: (3753, 26)


Unnamed: 0_level_0,comparison,indication,findings,impression,orig_mesh_terms,orig_auto_term,image_filename,filename,n_images,n_orig_mesh_terms,...,nsents_findings,raw_text,len_raw_text,nsents_raw_text,major_mesh,n_major_mesh,auto_term,text,mesh_term_s,labels
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,none.,positive tb test,the cardiac silhouette and mediastinum size ar...,normal chest x-xxxx.,[normal],[normal],"[CXR1_1_IM-0001-3001.png, CXR1_1_IM-0001-4001....",1.xml,2,1,...,5,the cardiac silhouette and mediastinum size ar...,231,6,[normal],1,[normal],the cardiac silhouette and mediastinum size ar...,normal,[normal]
10,chest radiographs xxxx.,"xxxx-year-old male, chest pain.",the cardiomediastinal silhouette is within nor...,no acute cardiopulmonary process.,[calcified granuloma/lung/upper lobe/right],[calcified granuloma],"[CXR10_IM-0002-1001.png, CXR10_IM-0002-2001.png]",10.xml,2,1,...,4,the cardiomediastinal silhouette is within nor...,307,5,[calcified granuloma],1,[calcified granuloma],the cardiomediastinal silhouette is within nor...,calcified granuloma/lung/upper lobe/right,[calcified granuloma]
100,none.,,both lungs are clear and expanded. heart and m...,no active disease.,[normal],[normal],"[CXR100_IM-0002-1001.png, CXR100_IM-0002-2001....",100.xml,2,1,...,2,both lungs are clear and expanded. heart and m...,83,3,[normal],1,[normal],both lungs are clear and expanded. heart and m...,normal,[normal]
1000,xxxx pa and lateral chest radiographs,"xxxx-year-old male, xxxx.",there is xxxx increased opacity within the rig...,1. increased opacity in the right upper lobe w...,"[opacity/lung/upper lobe/right, pulmonary atel...","[atelectases, mass lesion, opacity, atelectasi...","[CXR1000_IM-0003-1001.png, CXR1000_IM-0003-200...",1000.xml,3,3,...,5,there is xxxx increased opacity within the rig...,626,8,"[pulmonary atelectasis, opacity]",2,"[atelectasis, mass lesion, opacity, rib]",there is xxxx increased opacity within the rig...,opacity/lung/upper lobe/right/pulmonary atelec...,"[pulmonary atelectasis, opacity]"
1001,none,"dyspnea, subjective fevers, arthritis, immigra...",interstitial markings are diffusely prominent ...,diffuse fibrosis. no visible focal acute disease.,[markings/lung/bilateral/interstitial/diffuse/...,"[diffuse fibrosis, fibrosis, pulmonary fibrosis]","[CXR1001_IM-0004-1001.png, CXR1001_IM-0004-100...",1001.xml,2,2,...,3,interstitial markings are diffusely prominent ...,163,5,"[fibrosis, markings]",2,"[diffuse fibrosis, fibrosis, pulmonary fibrosis]",interstitial markings are diffusely prominent ...,markings/lung/bilateral/interstitial/diffuse/p...,"[fibrosis, markings]"


filename,CXR1_1_IM-0001-3001.png,CXR1_1_IM-0001-4001.png,CXR10_IM-0002-1001.png,CXR10_IM-0002-2001.png,CXR100_IM-0002-1001.png
normal,1,1,0,0,1
abdomen,0,0,0,0,0
adipose tissue,0,0,0,0,0
airspace disease,0,0,0,0,0
aorta,0,0,0,0,0
...,...,...,...,...,...
trachea,0,0,0,0,0
"trachea, carina",0,0,0,0,0
"tube, inserted",0,0,0,0,0
tuberculosis,0,0,0,0,0


num normal: 2696
num rest: 4588
balanced, 2696 examples per class
balanced, 5392 examples


filename,CXR3013_IM-1391-0001-0001.png,CXR3448_IM-1671-3001.png,CXR1767_IM-0501-0001-0002.png,CXR2121_IM-0747-2001.png,CXR1819_IM-0530-1001-0001.png
normal,0,0,0,0,0
other,1,1,1,1,1


saved image-based ds: /opt/uc5/results/sicaai/normal_vs_rest_bal/img_dataset.pkl
0) normal
1) other
saved /opt/uc5/results/sicaai/normal_vs_rest_bal/label2idx.yaml
lab2idx with |labels|: 2
saved /opt/uc5/results/sicaai/normal_vs_rest_bal/idx2label.yaml
num words: 1004, including special tokens
word 0: <pad>
word 1: <oov>
word 2: <bos>
word 3: <eos>
word 4: no


Unnamed: 0,id,image_filename,text,enc_text
0,1,CXR1_1_IM-0001-3001.png,the cardiac silhouette and mediastinum size ar...,"[[5, 62, 39, 9, 55, 19, 7, 21, 13, 66], [15, 6..."
1,1,CXR1_1_IM-0001-4001.png,the cardiac silhouette and mediastinum size ar...,"[[5, 62, 39, 9, 55, 19, 7, 21, 13, 66], [15, 6..."
2,10,CXR10_IM-0002-1001.png,the cardiomediastinal silhouette is within nor...,"[[5, 40, 39, 6, 21, 13, 66, 50, 19, 9, 87], [5..."
3,10,CXR10_IM-0002-2001.png,the cardiomediastinal silhouette is within nor...,"[[5, 40, 39, 6, 21, 13, 66, 50, 19, 9, 87], [5..."
4,100,CXR100_IM-0002-1001.png,both lungs are clear and expanded. heart and m...,"[[186, 17, 7, 41, 9, 432], [16, 9, 55, 13], [4..."


saved aux dataset for text: /opt/uc5/results/sicaai/normal_vs_rest_bal/img_text_dataset.pkl
done with 'normal vs rest', balanced


In [1]:
#! BINARY NORMAL VS REST - BALANCED - 1 output unit

# independent: can be run on its own
# 2: keep only label with a minimum frequency, select words and encode text
from bs4 import BeautifulSoup
from collections import defaultdict
import numpy as np
import os
import numpy as np
from numpy import count_nonzero as nnz
import pandas as pd
import pickle
from posixpath import join
from tqdm.notebook import tqdm
import yaml
import os
import iuchest.reports as iureports
from utils.vocabulary import Vocabulary

#! IMPORTANT
# Experiment configuration: LABELS selects which set of input files is read,
# EXP_FLD is where every artifact produced by this cell is written.
LABELS = "mesh"  # "mesh" or "auto"
EXP_FLD = "/mnt/datasets/uc5/EXPS/siccai/normal_vs_rest_bal_1unit"
# Seed for the class-balancing subsample below, so that re-running the cell
# regenerates the same img_dataset.pkl instead of a new random subset.
BALANCE_SEED = 42
os.makedirs(EXP_FLD, exist_ok=True)

meta_fld = "/mnt/datasets/uc5/meta/iuchest"
reports_fn = "reports_raw3.pkl" if LABELS == "mesh" else "reports_raw2.pkl"
img_ds_fn = "img_dataset.pkl" if LABELS == "mesh" else "img_dataset_auto.pkl"
in_vocab_fn = "all_vocab_1000.pkl" if LABELS == "mesh" else "all_vocab_auto_1000.pkl"

# * dataset: report-level table
ds = pd.read_pickle(join(meta_fld, reports_fn))
print(f"read {reports_fn}, shape: {ds.shape}")
display(ds.head())


# !
# ! IMAGES
# !

# Image-level label table; shown transposed so that labels appear as rows.
img_ds = pd.read_pickle(join(meta_fld, img_ds_fn))
display(img_ds.head().T)

# Count the two classes and use the size of the smaller one for both.
iii = img_ds.normal == True
print(f"num normal: {nnz(iii)}")
print(f"num rest: {nnz(~iii)}")
half = min([nnz(iii), nnz(~iii)])
print(f"balanced, {half} examples per class")

# Draw `half` rows from each class. The sample is seeded: without
# random_state a different subset was saved on every execution.
# The result is grouped by class (not shuffled).
balanced = img_ds.groupby("normal").sample(half, random_state=BALANCE_SEED)
print(f"balanced, {len(balanced)} examples")

assert nnz(balanced.normal == True) == nnz(balanced.normal == False)
img_ds = balanced

# Collapse the label set to two columns: keep "normal" as it is and add
# "other" as its complement (1 wherever "normal" is not True).
# NOTE(review): the cell title says "1 output unit", yet two label columns
# ("normal" and "other") are still written here - confirm this is what the
# training code expects.
drop_cols = [c for c in img_ds.columns if c != "normal"]
img_ds = img_ds.drop(columns=drop_cols)
img_ds["other"] = (img_ds.normal != True).astype(int)
display(img_ds.head().T)
ofn = join(EXP_FLD, "img_dataset.pkl")
img_ds.to_pickle(ofn)
print(f"saved image-based ds: {ofn}")

def save_label_indexes(df, out_fld):
    """Write the label <-> index mappings derived from the columns of `df`.

    Each column of `df` is treated as a label and its position as the label
    index. Two YAML files are written into `out_fld`:
    "label2idx.yaml" (label -> index) and "idx2label.yaml" (index -> label).
    The mapping and the written paths are also printed.

    Args:
        df: object exposing a `columns` iterable (e.g. a DataFrame).
        out_fld: existing folder where the two YAML files are created.
    """
    idx2lab = dict(enumerate(df.columns))
    lab2idx = {label: idx for idx, label in idx2lab.items()}
    for idx, label in idx2lab.items():
        print(f"{idx}) {label}")

    lab2idx_path = join(out_fld, "label2idx.yaml")
    with open(lab2idx_path, "w") as fout:
        yaml.safe_dump(lab2idx, fout)
    print(f"saved {lab2idx_path}")
    print(f"lab2idx with |labels|: {len(lab2idx)}")

    idx2lab_path = join(out_fld, "idx2label.yaml")
    with open(idx2lab_path, "w") as fout:
        yaml.safe_dump(idx2lab, fout)
    print(f"saved {idx2lab_path}")
#<

# Persist the label <-> index mappings of the two-column image dataset.
save_label_indexes(img_ds, EXP_FLD)

# !
# !  TEXT
# !
# NOTE: pickle.load can execute arbitrary code; only load vocabulary files
# that were produced by this project.
with open(join(meta_fld, in_vocab_fn), "rb") as fin:
    vocab = pickle.load(fin)
# copy vocab to EXP_FLD
with open(join(EXP_FLD, "vocab.pkl"), "wb") as fout:
    pickle.dump(vocab, fout)
print(f"num words: {len(vocab.word2idx)}, including special tokens")

# Show the first entries of the vocabulary.
for i in range(5):
    print(f"word {i}: {vocab.idx2word[i]}")

# Expand the report-level table to one row per image: every image filename
# listed in a report is paired with that report's id and text.
rep_ids = []
image_filenames = []
texts = []
for row in ds.reset_index().itertuples():
    for fn in row.image_filename:
        rep_ids.append(row.id)
        image_filenames.append(fn)
        texts.append(row.text)

# Build the frame in one step with an explicit column order. The previous
# code ended with `pd.columns = [...]`, which set an attribute on the pandas
# module itself and had no effect on the DataFrame.
img_text_ds = pd.DataFrame(
    {"id": rep_ids, "image_filename": image_filenames, "text": texts},
    columns=["id", "image_filename", "text"],
)

def encode_text(sentences, vocab):
    """Encode a report as a list of sentences, each a list of word indexes.

    The text is split into sentences on "." and each sentence into
    whitespace-separated tokens. Every token is looked up in
    `vocab.word2idx`; tokens that are missing map to `Vocabulary.OOV`.
    A sentence without tokens (for instance the piece after a trailing ".")
    is kept as an empty list.

    Args:
        sentences: the report text as a single string.
        vocab: object exposing a `word2idx` mapping.

    Returns:
        A list with one list of encoded tokens per sentence.
    """
    return [
        [vocab.word2idx.get(token, Vocabulary.OOV) for token in sentence.split()]
        for sentence in sentences.split(".")
    ]
# Encode every report text with the vocabulary and store the result next to
# the raw text.
enc_text = img_text_ds["text"].apply(lambda report: encode_text(report, vocab))
img_text_ds["enc_text"] = enc_text
display(img_text_ds.head())

# Save the per-image text dataset into the experiment folder.
ofn = join(EXP_FLD, "img_text_dataset.pkl")
img_text_ds.to_pickle(ofn)
print("saved aux dataset for text:", ofn)

print("done with 'normal vs rest', balanced")

read reports_raw3.pkl, shape: (3753, 26)


Unnamed: 0_level_0,comparison,indication,findings,impression,orig_mesh_terms,orig_auto_term,image_filename,filename,n_images,n_orig_mesh_terms,...,nsents_findings,raw_text,len_raw_text,nsents_raw_text,major_mesh,n_major_mesh,auto_term,text,mesh_term_s,labels
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,none.,positive tb test,the cardiac silhouette and mediastinum size ar...,normal chest x-xxxx.,[normal],[normal],"[CXR1_1_IM-0001-3001.png, CXR1_1_IM-0001-4001....",1.xml,2,1,...,5,the cardiac silhouette and mediastinum size ar...,231,6,[normal],1,[normal],the cardiac silhouette and mediastinum size ar...,normal,[normal]
10,chest radiographs xxxx.,"xxxx-year-old male, chest pain.",the cardiomediastinal silhouette is within nor...,no acute cardiopulmonary process.,[calcified granuloma/lung/upper lobe/right],[calcified granuloma],"[CXR10_IM-0002-1001.png, CXR10_IM-0002-2001.png]",10.xml,2,1,...,4,the cardiomediastinal silhouette is within nor...,307,5,[calcified granuloma],1,[calcified granuloma],the cardiomediastinal silhouette is within nor...,calcified granuloma/lung/upper lobe/right,[calcified granuloma]
100,none.,,both lungs are clear and expanded. heart and m...,no active disease.,[normal],[normal],"[CXR100_IM-0002-1001.png, CXR100_IM-0002-2001....",100.xml,2,1,...,2,both lungs are clear and expanded. heart and m...,83,3,[normal],1,[normal],both lungs are clear and expanded. heart and m...,normal,[normal]
1000,xxxx pa and lateral chest radiographs,"xxxx-year-old male, xxxx.",there is xxxx increased opacity within the rig...,1. increased opacity in the right upper lobe w...,"[opacity/lung/upper lobe/right, pulmonary atel...","[atelectases, mass lesion, opacity, atelectasi...","[CXR1000_IM-0003-1001.png, CXR1000_IM-0003-200...",1000.xml,3,3,...,5,there is xxxx increased opacity within the rig...,626,8,"[pulmonary atelectasis, opacity]",2,"[atelectasis, mass lesion, opacity, rib]",there is xxxx increased opacity within the rig...,opacity/lung/upper lobe/right/pulmonary atelec...,"[pulmonary atelectasis, opacity]"
1001,none,"dyspnea, subjective fevers, arthritis, immigra...",interstitial markings are diffusely prominent ...,diffuse fibrosis. no visible focal acute disease.,[markings/lung/bilateral/interstitial/diffuse/...,"[diffuse fibrosis, fibrosis, pulmonary fibrosis]","[CXR1001_IM-0004-1001.png, CXR1001_IM-0004-100...",1001.xml,2,2,...,3,interstitial markings are diffusely prominent ...,163,5,"[fibrosis, markings]",2,"[diffuse fibrosis, fibrosis, pulmonary fibrosis]",interstitial markings are diffusely prominent ...,markings/lung/bilateral/interstitial/diffuse/p...,"[fibrosis, markings]"


filename,CXR1_1_IM-0001-3001.png,CXR1_1_IM-0001-4001.png,CXR10_IM-0002-1001.png,CXR10_IM-0002-2001.png,CXR100_IM-0002-1001.png
normal,1,1,0,0,1
abdomen,0,0,0,0,0
adipose tissue,0,0,0,0,0
airspace disease,0,0,0,0,0
aorta,0,0,0,0,0
...,...,...,...,...,...
trachea,0,0,0,0,0
"trachea, carina",0,0,0,0,0
"tube, inserted",0,0,0,0,0
tuberculosis,0,0,0,0,0


num normal: 2696
num rest: 4588
balanced, 2696 examples per class
balanced, 5392 examples


filename,CXR2695_IM-1166-1001.png,CXR938_IM-2434-1001.png,CXR3581_IM-1761-3001.png,CXR396_IM-2024-2002.png,CXR3417_IM-1652-2001.png
normal,0,0,0,0,0
other,1,1,1,1,1


saved image-based ds: /mnt/datasets/uc5/EXPS/siccai/normal_vs_rest_bal_1unit/img_dataset.pkl
0) normal
1) other
saved /mnt/datasets/uc5/EXPS/siccai/normal_vs_rest_bal_1unit/label2idx.yaml
lab2idx with |labels|: 2
saved /mnt/datasets/uc5/EXPS/siccai/normal_vs_rest_bal_1unit/idx2label.yaml
num words: 1004, including special tokens
word 0: <pad>
word 1: <oov>
word 2: <bos>
word 3: <eos>
word 4: no


Unnamed: 0,id,image_filename,text,enc_text
0,1,CXR1_1_IM-0001-3001.png,the cardiac silhouette and mediastinum size ar...,"[[5, 62, 39, 9, 55, 19, 7, 21, 13, 66], [15, 6..."
1,1,CXR1_1_IM-0001-4001.png,the cardiac silhouette and mediastinum size ar...,"[[5, 62, 39, 9, 55, 19, 7, 21, 13, 66], [15, 6..."
2,10,CXR10_IM-0002-1001.png,the cardiomediastinal silhouette is within nor...,"[[5, 40, 39, 6, 21, 13, 66, 50, 19, 9, 87], [5..."
3,10,CXR10_IM-0002-2001.png,the cardiomediastinal silhouette is within nor...,"[[5, 40, 39, 6, 21, 13, 66, 50, 19, 9, 87], [5..."
4,100,CXR100_IM-0002-1001.png,both lungs are clear and expanded. heart and m...,"[[186, 17, 7, 41, 9, 432], [16, 9, 55, 13], [4..."


saved aux dataset for text: /mnt/datasets/uc5/EXPS/siccai/normal_vs_rest_bal_1unit/img_text_dataset.pkl
done with 'normal vs rest', balanced


In [27]:
# EXPERIMENT 
# experiment: labels with minimum frequency 130, balanced on normal across splits

import numpy as np
import pandas as pd
import pickle
from posixpath import join
import os
from sklearn.model_selection import train_test_split

# Build `n_bootstrap` train/valid/test splits of the image dataset, each
# stratified on the "normal" label, and save one split file per bootstrap.
in_fld = "/opt/uc5/results/experiments_lightning/wp6last_lt"
df = pd.read_pickle(join(in_fld, "img_dataset.pkl"))
print("dataset read, shape: ", df.shape)

# Overview: first rows, then absolute and relative class counts for "normal".
display(df.head())
display(df.reset_index().groupby("normal").count())
display( df.reset_index()[["filename", "normal"]].groupby("normal").count() / len(df))

n_bootstrap = 3
shuffle_seed = 10  # bootstrap i uses random_state = shuffle_seed + i

train_p = 0.7
valid_p = 0.1
# NOTE: in floating point this is slightly above 0.2 (see the
# "expected |test|" output); presumably this is why the test split ends up
# with one more example than expected and validation with one less.
# Left unchanged so that existing split files can be regenerated identically.
test_p = 1 - train_p - valid_p
print(f"expected |train| = {train_p * len(df)}")
print(f"expected |valid| = {valid_p * len(df)}")
print(f"expected |test| = {test_p * len(df)}")


filenames = np.array(df.index.values)
labels = np.array(df.normal)
# Not used below; kept because the name was defined by the original cell.
df_splits = pd.DataFrame(columns=["filename", "split"])

for i in range(n_bootstrap):
    # Stratified train/test split; the fractions do not sum to 1, so the
    # examples assigned to neither part form the validation set.
    x_train, x_test, y_train, y_test = train_test_split(
        filenames, labels, stratify=labels,
        train_size=train_p, test_size=test_p,
        random_state=shuffle_seed + i, shuffle=True)

    # Set-based membership test: the previous `x not in <array>` scanned
    # the whole train and test arrays for every filename.
    assigned = set(x_train)
    assigned.update(x_test)
    x_valid = np.array([x for x in filenames if x not in assigned])
    y_valid = np.array([df.loc[x, "normal"] for x in x_valid])
    assert len(x_train) + len(x_valid) + len(x_test) == len(filenames)

    print(f"bootstrap {i}/{n_bootstrap}")
    print(f"\ttrain: {len(x_train)}, normal {sum(y_train)/len(y_train)}")
    print(f"\tvalid: {len(x_valid)}, normal {sum(y_valid)/len(y_valid)}")
    print(f"\ttest: {len(x_test)}, normal {sum(y_test)/len(y_test)}")

    # One row per filename with the name of the split it belongs to.
    # NOTE: the index restarts at 0 for each of the three parts (no
    # ignore_index), which is how existing split files were written.
    dfs = [pd.DataFrame({"filename": x_train, "split": "train"}),
        pd.DataFrame({"filename": x_valid, "split": "valid"}),
        pd.DataFrame({"filename": x_test, "split": "test"})]
    out_df = pd.concat(dfs, axis=0)
    display(out_df.head())
    print(out_df.shape)
    out_df.to_pickle(join(in_fld, f"split_{i}.pkl"))
    print(f"saved to {join(in_fld, f'split_{i}.pkl')}")


dataset read, shape:  (7470, 21)


Unnamed: 0_level_0,normal,atelectasis,calcified granuloma,cardiomegaly,deformity,degenerative change,diaphragm,emphysema,granuloma,granulomatous disease,...,nodule,opacity,osteophyte,pleural effusion,pneumonia,pulmonary emphysema,scarring,sternotomy,thoracic aorta,misc
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CXR1_1_IM-0001-3001.png,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CXR1_1_IM-0001-4001.png,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CXR10_IM-0002-1001.png,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CXR10_IM-0002-2001.png,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CXR100_IM-0002-1001.png,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,filename,atelectasis,calcified granuloma,cardiomegaly,deformity,degenerative change,diaphragm,emphysema,granuloma,granulomatous disease,...,nodule,opacity,osteophyte,pleural effusion,pneumonia,pulmonary emphysema,scarring,sternotomy,thoracic aorta,misc
normal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4062,4062,4062,4062,4062,4062,4062,4062,4062,4062,...,4062,4062,4062,4062,4062,4062,4062,4062,4062,4062
1,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,...,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408


Unnamed: 0_level_0,filename
normal,Unnamed: 1_level_1
0,0.543775
1,0.456225


expected |train| = 5229.0
expected |valid| = 747.0
expected |test| = 1494.0000000000002
bootstrap 0/3
	train: 5229, normal 0.4563013960604322
	valid: 746, normal 0.45576407506702415
	test: 1495, normal 0.4561872909698997


Unnamed: 0,filename,split
0,CXR1575_IM-0374-2001.png,train
1,CXR1448_IM-0289-2001.png,train
2,CXR3729_IM-1864-1002.png,train
3,CXR1971_IM-0633-1001.png,train
4,CXR2944_IM-1344-2002.png,train


(7470, 2)
saved to /opt/uc5/results/experiments_lightning/wp6last_lt/split_0.pkl
bootstrap 1/3
	train: 5229, normal 0.4563013960604322
	valid: 746, normal 0.45576407506702415
	test: 1495, normal 0.4561872909698997


Unnamed: 0,filename,split
0,CXR59_IM-2184-2001.png,train
1,CXR657_IM-2233-0001-0001.png,train
2,CXR207_IM-0703-2001.png,train
3,CXR1677_IM-0446-1001.png,train
4,CXR1403_IM-0258-1001.png,train


(7470, 2)
saved to /opt/uc5/results/experiments_lightning/wp6last_lt/split_1.pkl
bootstrap 2/3
	train: 5229, normal 0.4563013960604322
	valid: 746, normal 0.45576407506702415
	test: 1495, normal 0.4561872909698997


Unnamed: 0,filename,split
0,CXR881_IM-2396-2001.png,train
1,CXR3078_IM-1438-2001.png,train
2,CXR3628_IM-1796-1001.png,train
3,CXR3551_IM-1740-1001.png,train
4,CXR842_IM-2366-2001.png,train


(7470, 2)
saved to /opt/uc5/results/experiments_lightning/wp6last_lt/split_2.pkl


In [16]:
import pandas as pd
import pickle
from posixpath import join

# Sanity check: load one saved split and use it to select the training rows
# of the image-level label dataset.
# NOTE(security): pd.read_pickle unpickles arbitrary objects; only point it at
# files produced by this project, never at untrusted input.

# Single definition of the experiment folder, so both pickles below are
# guaranteed to come from the same place.
forked_exp_fld = "/mnt/datasets/uc5/UC5_pipeline_forked/experiments_lightning/wp6last_lt"

# Split table: one row per image, read here through its "filename" and
# "split" columns.
fn = join(forked_exp_fld, "split_0.pkl")
split = pd.read_pickle(fn)
display(split.head())

# Label table; it is looked up by filename below, so its index must hold
# the image filenames.
ds = pd.read_pickle(join(forked_exp_fld, "img_dataset.pkl"))
display(ds.head())

# Bracket access instead of `split.split`: attribute-style column access is
# ambiguous when the frame and the column share a name, and it breaks if the
# column name ever collides with a DataFrame attribute.
# The filter is computed once and reused for both the array and the printout.
train_filenames = split.loc[split["split"] == "train", "filename"]
train_ids = train_filenames.values
print(train_filenames)

# Label-based lookup: rows come back in the order given by train_ids and a
# KeyError is raised if a filename is missing from the dataset index.
train = ds.loc[train_ids]

display(train.head())

Unnamed: 0,filename,split
0,CXR1575_IM-0374-2001.png,train
1,CXR1448_IM-0289-2001.png,train
2,CXR3729_IM-1864-1002.png,train
3,CXR1971_IM-0633-1001.png,train
4,CXR2944_IM-1344-2002.png,train


Unnamed: 0_level_0,normal,atelectasis,calcified granuloma,cardiomegaly,deformity,degenerative change,diaphragm,emphysema,granuloma,granulomatous disease,...,nodule,opacity,osteophyte,pleural effusion,pneumonia,pulmonary emphysema,scarring,sternotomy,thoracic aorta,misc
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CXR1_1_IM-0001-3001.png,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CXR1_1_IM-0001-4001.png,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CXR10_IM-0002-1001.png,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CXR10_IM-0002-2001.png,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CXR100_IM-0002-1001.png,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


0            CXR1575_IM-0374-2001.png
1            CXR1448_IM-0289-2001.png
2            CXR3729_IM-1864-1002.png
3            CXR1971_IM-0633-1001.png
4            CXR2944_IM-1344-2002.png
                    ...              
5224    CXR3892_IM-1974-0001-0002.png
5225         CXR3775_IM-1893-1001.png
5226         CXR2884_IM-1286-2001.png
5227          CXR780_IM-2323-2001.png
5228          CXR227_IM-0859-4004.png
Name: filename, Length: 5229, dtype: object


Unnamed: 0_level_0,normal,atelectasis,calcified granuloma,cardiomegaly,deformity,degenerative change,diaphragm,emphysema,granuloma,granulomatous disease,...,nodule,opacity,osteophyte,pleural effusion,pneumonia,pulmonary emphysema,scarring,sternotomy,thoracic aorta,misc
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CXR1575_IM-0374-2001.png,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
CXR1448_IM-0289-2001.png,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CXR3729_IM-1864-1002.png,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CXR1971_IM-0633-1001.png,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CXR2944_IM-1344-2002.png,0,0,1,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
