In [None]:
import pandas as pd
from posixpath import join
import numpy as np
import random
from numpy import count_nonzero as nnz
from collections import defaultdict
from IPython.display import display
from IPython.display import Image
 
# note: ds_home is the base folder for the images
# (image filenames stored in the reports table are joined onto it before display)
ds_home = "/mnt/datasets/uc5/std-dataset/image"

# some preprocessed files
# NOTE(review): the segment "mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl" appears twice
# in this path, which looks like a copy-paste slip; the variable is not used in the cells below
# -- confirm the intended location before relying on it.
base_path = "/mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl/mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl/eddl_ext_CNN_20tags"
# raw reports processed
# tab-separated file; na_filter=False turns off missing-value detection, so empty fields are
# read as empty strings (the term helpers below test for len(...) == 0 instead of NaN)
ds = pd.read_csv( "/mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl/reports_raw.tsv", sep="\t", na_filter=False )


In [None]:
# reports without images

iii = ds.n_images == 0
print(f"*** number of reports without images {nnz(iii)}, removed")
# drop=True: discard the pre-filter row labels instead of storing them in an extra "index"
# column. Without it every re-run of this cell adds another column ("index", then "level_0")
# and a further run raises because "level_0" already exists.
ds = ds.loc[~iii].reset_index(drop=True)

num_reports = ds.shape[0]
num_images = ds.n_images.sum()
print(f"reports {num_reports}, images {num_images}")


# number of reports for each value of n_images / n_major_mesh / n_auto_term
img_g = ds.loc[:, ["id", "n_images"]].groupby(["n_images"]).agg(["count"])
display(img_g)

mm_g = ds[["id", "n_major_mesh"]].groupby(["n_major_mesh"]).agg(["count"])
display(mm_g)

at_g = ds[["id", "n_auto_term"]].groupby(["n_auto_term"]).agg(["count"])
display(at_g)

# reports whose MeSH annotation is exactly "normal"
normal_ids = ds.major_mesh == "normal"
print(f"normal reports (mesh): {nnz(normal_ids)}")

# reports without automatic terms, split by whether MeSH marks them as normal
empty_auto_ids = ds.n_auto_term == 0
normal_auto_ids = empty_auto_ids & normal_ids
print(f"empty auto terms that are normal according to mesh: {nnz(normal_auto_ids)}/{nnz(empty_auto_ids)}")

# from here on empty_auto_ids only flags the reports that MeSH does NOT mark as normal
empty_auto_ids = empty_auto_ids & ~normal_auto_ids
print(f"empty auto terms that are not normal according to mesh: {nnz(empty_auto_ids)}")

print("major mesh, sample:\n\t", random.sample(ds.major_mesh.tolist(), 5))
print("auto term, sample:\n\t", random.sample(ds.auto_term.tolist(), 5))

def get_unique_set(col):
    """Return the set of distinct terms found in an iterable of ';'-separated strings.

    Empty strings in `col` are skipped. Every other entry is split on ';' and
    each piece is stripped of surrounding whitespace and lower-cased before
    being collected.
    """
    return {
        piece.strip().lower()
        for entry in col
        if len(entry) != 0
        for piece in entry.split(";")
    }

u_mesh_combos = get_unique_set(ds.major_mesh)
u_auto_combos = get_unique_set(ds.auto_term)

print(f"unique mesh combos: {len(u_mesh_combos)}")
print(f"unique auto combos: {len(u_auto_combos)}")

# random.sample() needs a sequence: passing a set raises TypeError on Python >= 3.11.
# sorted() gives a list with a stable order, and min() keeps the request within the
# population size so a small vocabulary does not raise ValueError.
print("unique mesh, sample:", random.sample(sorted(u_mesh_combos), min(10, len(u_mesh_combos))))
print("unique auto, sample:", random.sample(sorted(u_auto_combos), min(10, len(u_auto_combos))))

# MeSH terms of the reports that have no automatic term and are not marked as normal
mesh_in_empty_auto = get_unique_set(ds.major_mesh.loc[empty_auto_ids])
print(f"auto tags associated to non-normal empty auto terms: {len(mesh_in_empty_auto)} terms")

<font color="red">MeSH terms only</font>

In [None]:
# there are no empty major mesh fields
# expected syntax: ';'-separated entries, each of the form heading/subheading/subheading/...
import re
print("sample of mesh terms")
print(random.sample(ds.major_mesh.tolist(), 3))

# NOTE
# some headings appear more than once within the same field, ex:
#      Calcified Granuloma/mediastinum/large;Calcified Granuloma/lung/hilum/right/large
# simplify_terms (below) passes the headings through a set, so each one is kept only once
def simplify_terms(terms):
    """Reduce a MeSH annotation string to its sorted, unique headings.

    Args:
        terms: string of ';'-separated entries, each written as
            heading/subheading/... .

    Returns:
        Alphabetically sorted list of the distinct headings, lower-cased,
        stripped, and with runs of whitespace collapsed to a single space.
        Sorting makes the result deterministic, so the same set of headings
        always yields the same list (and the same joined string).
    """
    # keep only the heading: the text before the first "/" of every entry
    headings = (entry.split("/")[0].strip().lower() for entry in terms.split(";"))

    # in some multi-word headings, words are separated by multiple spaces
    return sorted({re.sub(r"\s+", " ", heading) for heading in headings})

# one list of simplified headings per report
simplified_mesh = ds.major_mesh.apply(simplify_terms)

ds["labels"] = simplified_mesh
# labels_s: the same labels joined into a single ';'-separated string
ds["labels_s"] = simplified_mesh.apply(";".join)
ds["n_labels"] = ds.labels.apply(len)


# The two tables below can differ: n_major_mesh counts every heading/subheading entry,
# even when the same heading occurs with different subheadings, whereas n_labels
# counts each heading once.
mm_g = ds[["id", "n_major_mesh"]].groupby(["n_major_mesh"]).agg(["count"])
gnl = ds[["id", "n_labels"]].groupby(["n_labels"]).agg(["count"])
display(gnl)
display(mm_g)

# vocabulary of headings used across all reports
u_mesh = set()
for l in ds.labels:
    u_mesh.update(l)

print("unique mesh terms: ", len(u_mesh))
del mm_g, gnl


# how many reports share each label combination
unique_mesh_combos = ds.labels_s.value_counts()
print("unique mesh combinations:", len(unique_mesh_combos))

In [None]:
# display the images of some random studies that have exactly n_images images
n_images = 4
sds = ds.loc[ds.n_images == n_images, ["id", "image_filename"]].set_index("id")
print(f"subset with exactly {n_images} images: {sds.shape}")
display(sds)

# select (at most) n_samples random studies
n_samples = 10
# random.sample raises ValueError when asked for more items than the population holds,
# so cap the request at the number of studies actually available
idx = random.sample(list(sds.index.values), min(n_samples, len(sds)))

for row in sds.loc[idx].itertuples():
    # sds is indexed by report id, so row.Index is the report id
    print(f"*** {row.Index} ***")
    filenames = sorted(join(ds_home, fn) for fn in row.image_filename.split(";"))
    # 1-based progress counter ("1/4" ... "4/4")
    for i, fn in enumerate(filenames, start=1):
        print(f"{i}/{len(filenames)}")
        display(Image(fn, width=224, height=224))



In [None]:
# build a binary occurrence matrix: one column per MeSH heading, 1 when the heading
# is among the labels of the report (rep_ds) or of the report an image belongs to (img_ds)

terms = sorted(u_mesh)
print(len(terms))
assert "normal" in terms, "normal tag missing"

matrix = []  # rows correspond to images (the report encoding repeated for each image)
rep_matrix = []  # rows correspond to reports
index = []  # image filename of every row of `matrix`
image_report = []  # report id of every row of `matrix`
for t in ds.itertuples():
    enc = [term in t.labels for term in terms]

    rep_matrix.append(enc)
    for i in t.image_filename.split(";"):
        index.append(i)
        matrix.append(enc)
        image_report.append(t.id)

rep_ds = pd.DataFrame(data=np.array(rep_matrix).astype(int), columns=terms)
# rows follow the order of ds; the report id is stored as a regular column
rep_ds["id"] = ds.id
print(f"dataframe report, index is report id: {rep_ds.shape}")
display(rep_ds)


img_ds = pd.DataFrame(data=np.array(matrix).astype(int), columns = terms)
img_ds["image_filename"] = pd.Series(index)
img_ds["report"] = pd.Series(image_report)
img_ds = img_ds.set_index(["image_filename"])
print(f"dataframe images, index is image_filename: {img_ds.shape}")
display(img_ds)


# Sum over the term columns only: img_ds also holds the "report" column, and including
# it would add the report id to every count whenever the ids are numeric.
label_counts = img_ds[terms].sum(axis=0)
n_labels = img_ds[terms].sum(axis=1)

print("n labels per images:\n", n_labels.value_counts())

# images tagged with exactly 12 labels, as a boolean array (kept for manual inspection)
iii = n_labels == 12
rows = img_ds.loc[iii]
a = np.array(rows).astype(bool)


img_ds.to_csv("/mnt/datasets/uc5/std-dataset/img_ds_no_text.tsv", sep="\t")

inspect some random samples

In [None]:
# these are the columns used in this notebook
display(ds[["id", "major_mesh", "image_filename", "n_images"]])

- when there is a single image, it can be either FRONTAL or LATERAL;
- when a report has exactly two images, the first one corresponds to a FRONTAL view and the second one to a LATERAL view.
- when a report is associated with more than two images, we cannot say anything about the views of the images.


<font color="red">select images for preparing various ecvl datasets</font>

- lateral vs frontal
- only frontal: normal vs rest
- normal vs rest
- classify rest - labels

In [None]:
# prepare dataset: keep only the reports with exactly two images
sub = ds.loc[ds.n_images == 2]
print(sub.shape)
frontal = []
lateral = []

def separate_images(filenames):
    """Split a 'first;second' filename pair and record each part.

    The first filename is appended to the module-level `frontal` list and the
    second one to `lateral`. An AssertionError is raised when `filenames` does
    not contain exactly two ';'-separated entries.
    """
    pair = filenames.split(";")
    assert len(pair) == 2
    first_fn, second_fn = pair
    frontal.append(first_fn)
    lateral.append(second_fn)

for row in sub.itertuples():
    separate_images(row.image_filename)

print(f"frontal: {len(frontal)}")
print(f"lateral: {len(lateral)}")



In [None]:
# check: show one randomly chosen image from each of the two lists
for view_name, view_files in (("frontal", frontal), ("lateral", lateral)):
    print(view_name)
    fn = join(ds_home, random.sample(view_files, 1)[0])
    display(Image(fn, width=224, height=224))

In [None]:
def prepare_ecvl_dlds(train, valid, test, y_train, y_valid, y_test, description=""):
    """Build the skeleton of the dataset description for a train/validation/test partition.

    The "split" section assumes the items are listed in the order training,
    validation, test: training takes positions 0..n_train-1, validation the
    next n_valid positions, and test the remaining ones.

    Args:
        train, valid, test: sized collections holding the items of each partition.
        y_train, y_valid, y_test: labels of each partition (accepted for the
            planned "classes"/"images" sections; not used yet).
        description: text stored in the "description" field.

    Returns:
        dict with the keys "name", "description", "classes", "images", "split".
    """
    # sizes come from the arguments (previously undefined globals were referenced)
    n_train = len(train)
    n_valid = len(valid)
    n_total = n_train + n_valid + len(test)

    d = {
        "name"        : "ECVL dataset for UC5",
        "description" : description,
        # NOTE(review): "classes" and "images" are still left empty, as in the original
        # draft; they must be filled before the description is usable.
        "classes"     : [],
        "images"      : [],
        "split"       : dict(training = list(range(n_train)),
                             validation = list(range(n_train, n_train + n_valid)),
                             test = list(range(n_train + n_valid, n_total)))
    }
    return d


import random
from sklearn.model_selection import train_test_split, StratifiedKFold

# one-hot codes of the two views (defined for later use; the split below uses 1/0 targets)
frontal_lab = [1, 0]
lateral_lab = [0, 1]

shuffle_seed = 11
train_p = 0.7
valid_p = 0.1
test_p = 1 - train_p - valid_p

# image filenames and binary targets: 1 = frontal, 0 = lateral
X = np.array(frontal + lateral)
y = np.array([1] * len(frontal) + [0] * len(lateral))
print(f"X={len(X)}, Y={len(y)}")
print(f"expected train={len(X)*train_p}, val={len(X)*valid_p}, test={len(X)*test_p}")

# every two-image report contributes one frontal and one lateral image
assert X.shape[0] == 2 * sub.shape[0]
assert y.shape[0] == 2 * sub.shape[0]

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=shuffle_seed)
for i, (others, test) in enumerate(skf.split(X, y)):
    print(15 * "=" + f" {i+1}/{skf.n_splits}")
    print(f"train/test: label distribution, others -  {np.bincount(y[others])}   |   test -  {np.bincount(y[test])}")
    # each fold holds out 1/n_splits of the data for testing; the validation share is
    # rescaled so that it still amounts to valid_p of the whole dataset
    test_p = 1 / skf.n_splits
    train, valid = train_test_split(
        others,
        test_size=valid_p/(1-test_p),
        shuffle=True,
        stratify=y[others],
        random_state=shuffle_seed+i,
    )
    print(f"train/val:  label distribution, train -  {np.bincount(y[train])}   |   validation -  {np.bincount(y[valid])}")
    assert len(train) + len(valid) == len(others)
    #prepare_ecvl_dlds(X_train, X_valid, X[test], y_train, y_valid, y[test])
