In [1]:
import pandas as pd
from posixpath import join
import numpy as np
import random
from numpy import count_nonzero as nnz
from collections import defaultdict
from IPython.display import display
from IPython.display import Image
import os

# ds_home is the base folder that image filenames are joined to when images are displayed.
ds_home = "/mnt/datasets/uc5/std-dataset/image"

# Folder with some preprocessed files.
# NOTE(review): the segment "mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl" appears twice
# in this path — confirm whether the duplication is intentional.
base_path = "/mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl/mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl/eddl_ext_CNN_20tags"
# Raw processed reports, tab-separated. na_filter=False disables NaN detection on load;
# the cells below test for empty strings (len(...) == 0) accordingly.
ds = pd.read_csv( "/mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl/reports_raw.tsv", sep="\t", na_filter=False )


In [2]:
# Drop the reports that have no image, then summarise what is left.

iii = ds.n_images == 0
print(f"*** number of reports without images {nnz(iii)}, removed")
ds = ds.loc[~iii].reset_index()

num_reports, num_images = len(ds), ds.n_images.sum()
print(f"reports {num_reports}, images {num_images}")


# Number of reports for each value of n_images / n_major_mesh / n_auto_term.
img_g, mm_g, at_g = (
    ds[["id", column]].groupby([column]).agg(["count"])
    for column in ("n_images", "n_major_mesh", "n_auto_term")
)
for table in (img_g, mm_g, at_g):
    display(table)

# Reports whose MeSH annotation is exactly "normal".
normal_ids = ds.major_mesh == "normal"
print(f"normal reports (mesh): {nnz(normal_ids)}")

# Reports without automatic terms, split by whether MeSH says "normal".
empty_auto_ids = ds.n_auto_term == 0
normal_auto_ids = empty_auto_ids & normal_ids
print(f"empty auto terms that are normal according to mesh: {nnz(normal_auto_ids)}/{nnz(empty_auto_ids)}")

# From here on empty_auto_ids only keeps the reports that are NOT normal
# (empty & ~(empty & normal) is the same mask as empty & ~normal).
empty_auto_ids = empty_auto_ids & ~normal_ids
print(f"empty auto terms that are not normal according to mesh: {nnz(empty_auto_ids)}")

for caption, column in (("major mesh", ds.major_mesh), ("auto term", ds.auto_term)):
    print(f"{caption}, sample:\n\t", random.sample(column.tolist(), 5))

def get_unique_set(col):
    """Return the set of distinct terms found in an iterable of ";"-separated strings.

    Empty strings in `col` are skipped. Every other string is split on ";" and
    each piece is stripped of surrounding whitespace and lower-cased before
    being added to the result.
    """
    return {
        piece.strip().lower()
        for terms in col
        if len(terms) != 0
        for piece in terms.split(";")
    }

# Distinct (lower-cased) entries found in the two annotation columns.
u_mesh_combos = get_unique_set(ds.major_mesh)
u_auto_combos = get_unique_set(ds.auto_term)

print(f"unique mesh combos: {len(u_mesh_combos)}")
print(f"unique auto combos: {len(u_auto_combos)}")

# random.sample() requires a sequence: passing a set was deprecated in Python 3.9
# and raises TypeError since 3.11, so sample from a sorted list instead.
# The sample size is clamped so that a small vocabulary cannot raise ValueError.
print("unique mesh, sample:", random.sample(sorted(u_mesh_combos), min(10, len(u_mesh_combos))))
print("unique auto, sample:", random.sample(sorted(u_auto_combos), min(10, len(u_auto_combos))))

# MeSH entries of the reports selected by empty_auto_ids
# (no automatic terms and not "normal" according to MeSH).
mesh_in_empty_auto = get_unique_set(ds.major_mesh.loc[empty_auto_ids])
print(f"auto tags associated to non-normal empty auto terms: {len(mesh_in_empty_auto)} terms")

*** number of reports without images 104, removed
reports 3851, images 7470


Unnamed: 0_level_0,id
Unnamed: 0_level_1,count
n_images,Unnamed: 1_level_2
1,446
2,3208
3,181
4,15
5,1


Unnamed: 0_level_0,id
Unnamed: 0_level_1,count
n_major_mesh,Unnamed: 1_level_2
1,2233
2,585
3,378
4,286
5,161
6,95
7,61
8,29
9,10
10,6


Unnamed: 0_level_0,id
Unnamed: 0_level_1,count
n_auto_term,Unnamed: 1_level_2
0,1751
1,597
2,414
3,346
4,251
5,186
6,123
7,69
8,45
9,20


normal reports (mesh): 1379
empty auto terms that are normal according to mesh: 1379/1751
empty auto terms that are not normal according to mesh: 372
major mesh, sample:
	 ['normal', 'normal', 'Sutures/sternum', 'Cardiomegaly/mild;Opacity/lung/base/bilateral/reticular/multiple/mild;Pulmonary Edema;Pulmonary Congestion/mild', 'Nodule/lung/upper lobe/bilateral/multiple;Nodule/lung/lower lobe/bilateral/multiple;Diaphragm/left/elevated/mild']
auto term, sample:
	 ['', 'opacity;pneumonia;Pneumonia', '', 'degenerative change', '']
unique mesh combos: 1679
unique auto combos: 571
unique mesh, sample: ['airspace disease/lung/left', 'pleural effusion/right/severe', 'diaphragmatic eventration/mild', 'density/costophrenic angle/anterior/round', 'nodule/lung/apex/bilateral/round', 'cysts/humerus/left', 'sutures/lung/apex/right', 'airspace disease/lung/base', 'opacity/lung/bilateral/scattered', 'markings/lung/upper lobe/right/interstitial/prominent']
unique auto, sample: ['hemothorax', 'cysts', 'pa

<font color="red">MeSH terms only</font>

In [3]:
# There are no empty major_mesh fields (the n_major_mesh counts above start at 1).
# Expected syntax of a major_mesh value, as seen in the printed samples:
#     heading/subheading/...;heading/subheading/...
import re
print("sample of mesh terms")
print(random.sample(ds.major_mesh.tolist(), 3))

# Some headings appear more than once in the same field, e.g.:
#      Calcified Granuloma/mediastinum/large;Calcified Granuloma/lung/hilum/right/large
# Such repetitions collapse to a single label: ['calcified granuloma'].
def simplify_terms(terms):
    """Reduce a major_mesh string to the sorted list of its unique headings.

    Parameters
    ----------
    terms : str
        ";"-separated entries, each of the form "heading/subheading/...".

    Returns
    -------
    list of str
        The distinct headings, lower-cased, with runs of whitespace collapsed
        to one space, in alphabetical order.
    """
    # split on ";" -> [heading/subheadings, heading/subheadings, ...], keep only the heading
    headings = [entry.split("/")[0].strip().lower() for entry in terms.split(";")]

    # in some multi-word headings, words are separated by multiple spaces
    unique_headings = {re.sub(r"\s+", " ", heading) for heading in headings}

    # Sort the result: the iteration order of a set of strings is not stable, so
    # list(set(...)) could serialise the same combination of headings in different
    # orders and inflate any count based on the joined string.
    return sorted(unique_headings)

# One list of simplified headings per report.
simplified_mesh = ds.major_mesh.apply(simplify_terms)

ds["labels"] = simplified_mesh
# labels_s: the labels joined into a single ";"-separated string
ds["labels_s"] = simplified_mesh.apply(";".join)
ds["n_labels"] = ds.labels.apply(len)


# The two tables below differ because a field may repeat the same heading with
# different subheadings: n_major_mesh counts every entry, whereas n_labels
# counts each heading only once.
mm_g = ds[["id", "n_major_mesh"]].groupby(["n_major_mesh"]).agg(["count"])
gnl = ds[["id", "n_labels"]].groupby(["n_labels"]).agg(["count"])
display(gnl, mm_g)

# Vocabulary: every heading used by at least one report.
u_mesh = {heading for headings in ds.labels for heading in headings}

print("unique mesh terms: ", len(u_mesh))
del mm_g, gnl


# Number of distinct label combinations (one joined string per report).
unique_mesh_combos = ds.labels_s.value_counts()
print("unique mesh combinations:", len(unique_mesh_combos))

sample of mesh terms
['normal', 'Lung/hyperdistention;Opacity/lung/bilateral/scattered;Opacity/lung/upper lobe/left/scattered;Cicatrix/lung/upper lobe/left;Aorta, Thoracic/tortuous/mild;Atherosclerosis/aorta, thoracic;Spine/degenerative;Emphysema', 'Granulomatous Disease;Thoracic Vertebrae/degenerative']


Unnamed: 0_level_0,id
Unnamed: 0_level_1,count
n_labels,Unnamed: 1_level_2
1,2267
2,596
3,388
4,285
5,144
6,93
7,44
8,20
9,9
10,4


Unnamed: 0_level_0,id
Unnamed: 0_level_1,count
n_major_mesh,Unnamed: 1_level_2
1,2233
2,585
3,378
4,286
5,161
6,95
7,61
8,29
9,10
10,6


unique mesh terms:  118
unique mesh combinations: 1316


In [None]:
# Display the images of some randomly chosen studies that have exactly n_images images.
n_images = 4
sds = ds.loc[ds.n_images == n_images, ["id", "image_filename"]].set_index("id")
print(f"subset with exactly {n_images} images: {sds.shape}")
display(sds)

# Select up to n_samples random studies. random.sample raises ValueError when asked
# for more items than the population holds, so the request is clamped to the subset size.
n_samples = 10
idx = random.sample(list(sds.index.values), min(n_samples, len(sds)))

for row in sds.loc[idx].itertuples():
    print(f"*** {row[0]} ***")  # row[0] is the index value, i.e. the report id
    filenames = sorted([join(ds_home, fn) for fn in row.image_filename.split(";")])
    # 1-based counter, so the progress reads 1/4 ... 4/4 rather than 0/4 ... 3/4
    for i, fn in enumerate(filenames, start=1):
        print(f"{i}/{len(filenames)}")
        display(Image(fn, width=224, height=224))



In [15]:
# Build a binary occurrence matrix: one column per MeSH heading, 1 when the
# heading is among the labels of the report, 0 otherwise.
terms = sorted(u_mesh)
print(f"|mesh terms| = {len(terms)}")
assert "normal" in terms, "normal tag missing"

matrix = []  # one row per image: the encoding of its report, repeated for each image
rep_matrix = []  # one row per report
index = []  # image filename for each row of `matrix`
image_report = []  # report id for each row of `matrix`
for t in ds.itertuples():
    enc = [term in t.labels for term in terms]

    rep_matrix.append(enc)
    for image_fn in t.image_filename.split(";"):
        index.append(image_fn)
        matrix.append(enc)
        image_report.append(t.id)

rep_ds = pd.DataFrame(data=np.array(rep_matrix).astype(int), columns=terms)
# Rows were appended in the iteration order of ds, so the ids are assigned by
# position. Assigning the Series itself would align on index labels and silently
# produce NaN / mismatched ids whenever ds does not carry a default 0..n-1 index.
rep_ds["id"] = ds.id.to_numpy()
print(f"dataframe 'reports', index is report id: {rep_ds.shape}")
display(rep_ds)


img_ds = pd.DataFrame(data=np.array(matrix).astype(int), columns = terms)
img_ds["image_filename"] = index
img_ds["report"] = image_report
img_ds = img_ds.set_index(["image_filename"])
print(f"dataframe images, index is image_filename: {img_ds.shape}")
display(img_ds)

display(img_ds.T)

# Label statistics are computed on the label columns only (the report id is excluded).
label_columns = img_ds.drop(columns=["report"])
label_counts = label_columns.sum(axis=0)  # number of images per label
n_labels = label_columns.sum(axis=1)  # number of labels per image
print(n_labels.value_counts())

img_ds_filename = "/mnt/datasets/uc5/std-dataset/img_ds_no_text.tsv"
img_ds.to_csv(img_ds_filename, sep="\t")
print(f"image dataset saved in {img_ds_filename}")
print("all done.")

|mesh terms| = 118
dataframe 'reports', index is report id: (3851, 119)


Unnamed: 0,abdomen,adipose tissue,airspace disease,aorta,"aorta, thoracic",aortic aneurysm,arthritis,atherosclerosis,blister,blood vessels,...,technical quality of image unsatisfactory,thickening,thoracic vertebrae,thorax,trachea,"trachea, carina","tube, inserted",tuberculosis,volume loss,id
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2122
1,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,2502
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,435
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3718
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3846,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1926
3847,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,670
3848,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3580
3849,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1522


dataframe images, index is image_filename: (7470, 119)


Unnamed: 0_level_0,abdomen,adipose tissue,airspace disease,aorta,"aorta, thoracic",aortic aneurysm,arthritis,atherosclerosis,blister,blood vessels,...,technical quality of image unsatisfactory,thickening,thoracic vertebrae,thorax,trachea,"trachea, carina","tube, inserted",tuberculosis,volume loss,report
image_filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CXR2122_IM-0747-1001.png,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2122
CXR2502_IM-1027-1001-0001.png,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,2502
CXR2502_IM-1027-1001-0002.png,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,2502
CXR435_IM-2075-1001.png,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,435
CXR435_IM-2075-2001.png,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,435
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CXR3580_IM-1760-1001.png,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3580
CXR3580_IM-1760-2001.png,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3580
CXR1522_IM-0338-0001-0002.png,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1522
CXR2316_IM-0889-1001.png,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2316


image_filename,CXR2122_IM-0747-1001.png,CXR2502_IM-1027-1001-0001.png,CXR2502_IM-1027-1001-0002.png,CXR435_IM-2075-1001.png,CXR435_IM-2075-2001.png,CXR3718_IM-1856-2001.png,CXR1847_IM-0550-1001.png,CXR1847_IM-0550-2001.png,CXR372_IM-1858-0001-0001.png,CXR372_IM-1858-0001-0002.png,...,CXR3954_IM-2021-1002.png,CXR1926_IM-0600-1001.png,CXR1926_IM-0600-2001.png,CXR670_IM-2244-85049001.png,CXR670_IM-2244-85049002.png,CXR3580_IM-1760-1001.png,CXR3580_IM-1760-2001.png,CXR1522_IM-0338-0001-0002.png,CXR2316_IM-0889-1001.png,CXR2316_IM-0889-2001.png
abdomen,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
adipose tissue,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
airspace disease,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
aorta,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"aorta, thoracic",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"trachea, carina",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"tube, inserted",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tuberculosis,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
volume loss,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


1     4404
2     1162
3      761
4      549
5      266
6      176
7       84
8       39
9       19
10       8
12       2
dtype: int64
image dataset saved in /mnt/datasets/uc5/std-dataset/img_ds_no_text.tsv
all done.


inspect some random samples

In [None]:
# these are the columns used in this notebook
display(ds[["id", "major_mesh", "image_filename", "n_images"]])

- when there is a single image, it can be either FRONTAL or LATERAL;
- when a report has exactly two images, the first one corresponds to a FRONTAL view and the second one to a LATERAL view;
- when a report is associated with more than two images, we cannot say anything about the views of the images.


<font color="red">select images for preparing various ecvl datasets</font>

- lateral vs frontal
- only frontal: normal vs rest
- normal vs rest
- classify rest - labels

<font color="yellow"> frontal vs lateral: begin </font>

In [None]:
# prepare dataset: keep only the reports with exactly two images
sub = ds[ds.n_images == 2]
print(sub.shape)
# filled by separate_images(): first / second filename of every report
frontal, lateral = [], []

def separate_images(filenames):
    """Split a ";"-separated pair of filenames into the frontal and lateral lists.

    The first filename is appended to the module-level list `frontal` and the
    second one to `lateral`. An AssertionError is raised when the string does
    not contain exactly two filenames.
    """
    parts = filenames.split(";")
    assert len(parts) == 2
    first, second = parts
    frontal.append(first)
    lateral.append(second)

# Distribute the two filenames of every report over the frontal / lateral lists.
for filenames in sub.image_filename:
    separate_images(filenames)

for view_name, view_files in (("frontal", frontal), ("lateral", lateral)):
    print(f"{view_name}: {len(view_files)}")



In [None]:
# check: show one randomly chosen image from each of the two lists
for view_name, view_files in (("frontal", frontal), ("lateral", lateral)):
    print(view_name)
    fn = join(ds_home, random.sample(view_files, 1)[0])
    display(Image(fn, width=224, height=224))

In [None]:
def prepare_ecvl_dlds(train, valid, test, y_train, y_valid, y_test, description=""):
    """Build the skeleton of the dataset description dictionary for a train/valid/test split.

    Parameters
    ----------
    train, valid, test : sized collections
        Samples of the three partitions; only their lengths are used here.
    y_train, y_valid, y_test : sized collections
        Labels of the three partitions. Currently unused.
    description : str, optional
        Free-text description stored in the result (default: empty string).

    Returns
    -------
    dict
        Keys "name", "description", "classes", "images" and "split". "split"
        holds consecutive index ranges: training first, then validation, then test.
    """
    # Partition sizes come from the arguments instead of undefined globals.
    n_train = len(train)
    n_valid = len(valid)
    n_total = n_train + n_valid + len(test)
    d = {
        "name"        : "ECVL dataset for UC5",
        "description" : description,
        # TODO: "classes" and "images" are still empty placeholders; the split
        # indices presumably refer to positions in "images" once it is populated
        # in train, valid, test order — confirm against the consumer of this dict.
        "classes"     : [],
        "images"      : [],
        "split"       : dict(training = list(range(n_train)),
                            validation = list(range(n_train, n_train + n_valid)),
                            test=list(range(n_train + n_valid, n_total)))
    }
    return d


import random
from sklearn.model_selection import train_test_split, StratifiedKFold

# one-hot encodings of the two classes (not used by the split below)
frontal_lab = [1, 0]
lateral_lab = [0, 1]

shuffle_seed = 11
train_p = 0.7
valid_p = 0.1
test_p = 1 - train_p - valid_p

# Samples and targets are built once: 1 = frontal, 0 = lateral.
# (A previous, immediately overwritten version labelled lateral images as 2.)
X = np.array(frontal + lateral)
y = np.array([1] * len(frontal) + [0] * len(lateral))
print(f"X={len(X)}, Y={len(y)}")
print(f"expected train={len(X)*train_p}, val={len(X)*valid_p}, test={len(X)*test_p}")
assert X.shape[0] == 2 * sub.shape[0]
assert y.shape[0] == 2 * sub.shape[0]

skf= StratifiedKFold(n_splits=5, shuffle=True, random_state=shuffle_seed)
# Every fold holds out 1/n_splits of the data as test set; this does not change
# between iterations, so it is computed once.
test_p = 1 / skf.n_splits
for i, (others, test) in enumerate(skf.split(X, y)):
    print(15 * "=" + f" {i+1}/{skf.n_splits}")
    print(f"train/test: label distribution, others -  {np.bincount(y[others])}   |   test -  {np.bincount(y[test])}")
    # `others` is a fraction (1 - test_p) of the data: rescale valid_p so that the
    # validation set is valid_p of the WHOLE dataset.
    train, valid = train_test_split(others, test_size=valid_p/(1-test_p), shuffle=True, stratify=y[others], random_state=shuffle_seed+i)
    print(f"train/val:  label distribution, train -  {np.bincount(y[train])}   |   validation -  {np.bincount(y[valid])}")
    assert len(train) + len(valid) == len(others)
    #prepare_ecvl_dlds(X_train, X_valid, X[test], y_train, y_valid, y[test])


<font color="yellow">frontal vs lateral: end</font>

<font color="yellow"> normal vs rest: begin </font>

In [29]:
# Reload the per-image label matrix saved earlier in this notebook.
ds = pd.read_csv("/mnt/datasets/uc5/std-dataset/img_ds_no_text.tsv", sep="\t")
ds = ds.set_index("image_filename")
display(ds.head().T)
ds = ds.drop(columns=["report"])
print(ds.columns)
assert "normal" in ds.columns, "normal label NOT found"

# Two complementary masks: images tagged "normal" and all the others.
normal_ids = ds["normal"] == 1
rest_ids = ~normal_ids
n_normal, n_rest = nnz(normal_ids), nnz(rest_ids)
print(f"|normal|= {n_normal}")
print(f"|rest|= {n_rest}")
assert n_normal + n_rest == ds.shape[0], "normal + rest != number of rows in dataset"

# Binary target stored as float: 1.0 for normal, 0.0 for the rest.
ds["target"] = np.where(normal_ids, 1.0, 0.0)
assert nnz(ds["target"].isna()) == 0, "this should never be printed: nan in target column"
ds2 = ds[["target"]]
display(ds2.T)

out_dir = "/mnt/datasets/mimic-cxr/training_data/iuchest"
os.makedirs(out_dir, exist_ok=True)
ofn = join(out_dir, "iu_normal_rest.tsv")
ds2.to_csv(ofn, sep="\t")
print(f"saved {ofn}")

# now use NB_dataset_2_ecvl_ds.ipynb to turn this dataset into a YAML file for EDDL
print("all done.")

image_filename,CXR2122_IM-0747-1001.png,CXR2502_IM-1027-1001-0001.png,CXR2502_IM-1027-1001-0002.png,CXR435_IM-2075-1001.png,CXR435_IM-2075-2001.png
abdomen,0,0,0,0,0
adipose tissue,0,0,0,0,0
airspace disease,0,0,0,0,0
aorta,0,0,0,0,0
"aorta, thoracic",0,0,0,0,0
...,...,...,...,...,...
"trachea, carina",0,0,0,0,0
"tube, inserted",0,0,0,0,0
tuberculosis,0,0,0,0,0
volume loss,0,0,0,0,0


Index(['abdomen', 'adipose tissue', 'airspace disease', 'aorta',
       'aorta, thoracic', 'aortic aneurysm', 'arthritis', 'atherosclerosis',
       'blister', 'blood vessels',
       ...
       'sutures', 'technical quality of image unsatisfactory', 'thickening',
       'thoracic vertebrae', 'thorax', 'trachea', 'trachea, carina',
       'tube, inserted', 'tuberculosis', 'volume loss'],
      dtype='object', length=118)
|normal|= 2696
|rest|= 4774


image_filename,CXR2122_IM-0747-1001.png,CXR2502_IM-1027-1001-0001.png,CXR2502_IM-1027-1001-0002.png,CXR435_IM-2075-1001.png,CXR435_IM-2075-2001.png,CXR3718_IM-1856-2001.png,CXR1847_IM-0550-1001.png,CXR1847_IM-0550-2001.png,CXR372_IM-1858-0001-0001.png,CXR372_IM-1858-0001-0002.png,...,CXR3954_IM-2021-1002.png,CXR1926_IM-0600-1001.png,CXR1926_IM-0600-2001.png,CXR670_IM-2244-85049001.png,CXR670_IM-2244-85049002.png,CXR3580_IM-1760-1001.png,CXR3580_IM-1760-2001.png,CXR1522_IM-0338-0001-0002.png,CXR2316_IM-0889-1001.png,CXR2316_IM-0889-2001.png
target,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0


saved /mnt/datasets/mimic-cxr/training_data/iuchest/iu_normal_rest.tsv
all done.
