In [None]:
#
# CHEST-XRAY8
# notebook for stats and building training sets
#

import numpy as np
from numpy import count_nonzero as nnz
import pandas as pd
from posixpath import join
from sklearn.model_selection import train_test_split

# root folder of the dataset: the metadata CSV and the split lists are read from here,
# and the generated TSV/CSV files are written here
base_fld = "/mnt/datasets/mimic-cxr/chestx-ray8"
shuffle_seed = 2  # for repeatability: passed as random_state to the sampling / splitting calls below

### <font color="#d07326">PART I </font>
<font color="#21b3d2">PREPROCESS RAW DATASET</font>

output: multi-label (1-hot) encoding

In [None]:
#
# dataset
# 

# > load the per-image metadata and labels (rows are indexed by "Image Index")
entries_csv = join(base_fld, "Data_Entry_2017_v2020.csv")
datae = pd.read_csv(entries_csv, index_col="Image Index")
print("Data_Entry_2017_v2020.csv, shape:", datae.shape)
print(datae.columns)
display(datae)
# <

# > number of images per view position (AP or PA)
print("View position:")
print(datae["View Position"].value_counts())
# <

# > labels (classes)
# each "Finding Labels" value is a combination of labels separated by |, e.g. Effusion|Infiltration
finding_labels = datae["Finding Labels"]
lab_col = finding_labels.value_counts()
print("unique label combos:", len(lab_col))

# collect the single labels out of the combinations
labels = set()
for combo in finding_labels:
    labels.update(combo.split("|"))
labels = sorted(labels)
print(f"unique labels ({len(labels)}):", labels)
# <

# > lookup tables used for the binary encoding of the labels:
#   l2i: label name -> column position, i2l: column position (as a string) -> label name
l2i = {name: pos for pos, name in enumerate(labels)}
i2l = {str(pos): name for pos, name in enumerate(labels)}
# <

In [None]:
#
# dataset
# 

# NOTE(review): this cell repeats the work of the previous cell (same file, same derived variables);
#   running it again is harmless but redundant -- consider keeping only one of the two.

# > load the per-image metadata and labels (rows are indexed by "Image Index")
entries_csv = join(base_fld, "Data_Entry_2017_v2020.csv")
datae = pd.read_csv(entries_csv, index_col="Image Index")
print("Data_Entry_2017_v2020.csv, shape:", datae.shape)
print(datae.columns)
display(datae)
# <

# > number of images per view position (AP or PA)
print("View position:")
print(datae["View Position"].value_counts())
# <

# > labels (classes)
# each "Finding Labels" value is a combination of labels separated by |, e.g. Effusion|Infiltration
finding_labels = datae["Finding Labels"]
lab_col = finding_labels.value_counts()
print("unique label combos:", len(lab_col))

# collect the single labels out of the combinations
labels = set()
for combo in finding_labels:
    labels.update(combo.split("|"))
labels = sorted(labels)
print(f"unique labels ({len(labels)}):", labels)
# <

# > lookup tables used for the binary encoding of the labels:
#   l2i: label name -> column position, i2l: column position (as a string) -> label name
l2i = {name: pos for pos, name in enumerate(labels)}
i2l = {str(pos): name for pos, name in enumerate(labels)}
# <

In [None]:
# multi-label encoding as an integer 0/1 matrix of shape n_images x n_labels:
#  - rows -> images (same order as datae)
#  - columns -> labels (same order as labels)
filenames = datae.index.tolist()
encoding = np.zeros((len(datae), len(labels)), dtype=int)
for row, combo in enumerate(datae["Finding Labels"]):
    hot_columns = [l2i[name] for name in combo.split("|")]
    encoding[row, hot_columns] = 1

df = pd.DataFrame(encoding, index=pd.Series(filenames), columns=labels)
# print(datae.loc['00030801_001.png', "Finding Labels"])  # check
display(df)

In [None]:
#
# in the original paper only 8 labels ('paper_labels') were used:
#  "No Finding" and the other labels were "collapsed" in a 0-vector.
#  Here "No Finding" is kept as a label of its own and the labels not used in the paper
#  are merged into a single binary column named "other".

# >
paper_labels = ["Atelectasis", "Cardiomegaly", "Effusion", "Infiltration", "Mass", "Nodule", "Pneumonia", "Pneumothorax"]
paper_labels = sorted(["No Finding"] + paper_labels)  # add a label for "No Finding"

# check
# the paper spells one label "Pneumathorax"; the list above uses the corrected spelling,
# so every paper label must be found among the dataset labels
for pl in paper_labels:
    assert pl in labels, f"label not found in the dataset: {pl}"
# <

# > columns not used in the original paper: substitute all with 'other' = logical_or(other_columns)
other_columns = [c for c in df.columns if (c not in paper_labels)]
other_df = df.loc[:, other_columns]
# FIX: the previous `other_col.where(other_col > 0, 1)` had no effect, because Series.where returns
#   a new Series (discarded here) and, moreover, it replaces the values where the condition is False.
#   "other" therefore stored the *number* of collapsed labels instead of a 0/1 flag.
other_col = (other_df.sum(axis=1) > 0).astype(int)  # 1 where at least one of those labels occurs
df["other"] = other_col
paper_labels = paper_labels + ["other"]
# <

# > drop the columns that have been merged into "other"
old_shape = df.shape
df = df.drop(columns=other_columns)
print(f"new shape of dataset {old_shape} ->", df.shape)
del old_shape
print(df.columns)
# <

# > counts
frequencies = df.sum(axis=0)  # number of images per label
n_labels = df.sum(axis=1)     # number of labels per image
assert nnz(n_labels==0) == 0  # no row without at least one label
# <

display(df)

In [None]:
#
# dataset comes with a standard split train - test
#

# check train_val_list and test_list
# 1. image exists in folder?
# 2. image exists in dataset?
def read_list(fn):
    """Return the lines of the text file `fn` as a list of strings (line terminators are kept)."""
    with open(fn, "r") as handle:
        return list(handle)

# >
# this list would be filled with index values corresponding to image without labels
#       when the "other" column is not added. Now all the images have labels.
removed_filenames = []

# one file name per line; blank lines (e.g. a trailing empty line) are skipped
train_val = [s.strip() for s in read_list( join(base_fld, "train_val_list.txt") ) if s.strip()]
train_val = [s for s in train_val if (s not in removed_filenames)]

# FIX: removed a redundant nested join(join(...)) call
test = [s.strip() for s in read_list( join(base_fld, "test_list.txt") ) if s.strip()]
test = [s for s in test if (s not in removed_filenames)]

print(f"train&validation: {len(train_val)}")
print(f"test: {len(test)}")
print(f"total: {len(train_val)+len(test)}")
assert len(train_val) + len(test) == df.shape[0] # + nnz(with_zero_labels)  # add the rows removed
# <

# > check
# all images have been downloaded? PASSED
#import os
#for r in ds.iterrows():
#    fn = r[0]
#    assert os.path.exists( join(base_fld, "images", fn) )
# DONE: all images exists
# <

# > split column (df index is filename)
df["split"] = None
df.loc[train_val, "split"] = "train"
df.loc[test, "split"] = "test"
df.index.name = "Image Index"
# together with the length check above this guarantees that the two lists partition the dataset
# (duplicated or overlapping names would leave some rows without a split)
assert df["split"].notna().all(), "some images belong to neither the train nor the test list"
# <

df.to_csv( join(base_fld, "chest-xray8_uc5.tsv"), sep="\t")
# NOTE(review): the file written here is the raw metadata table `datae`, not the 0/1 encoding
#   -- confirm that this is what "encoding" is meant to contain
datae.to_csv( join(base_fld, "chest-xray8_uc5_encoding.tsv"), sep="\t")
print("dataset saved, location:", join(base_fld, "chest-xray8_uc5.tsv"))
print("encoding saved at:", join(base_fld, "chest-xray8_uc5_encoding.tsv"))

 # <<< FIRST PART ENDS HERE

### <font color="#d07326">PART II </font>
<font color="#21b3d2">INDEPENDENT FROM HERE (except for the imports, base_fld, shuffle_seed and paper_labels): USING THE RESULTS OF THE CELLS ABOVE</font>

<font color="red">PREPARE CROSS_VALIDATION</font>

In [None]:
# build the training and test samples for Positive and Negative,
# balanced over the (No Finding, View Position) groups

#>
ds = pd.read_csv(join(base_fld, "chest-xray8_uc5.tsv"), sep="\t", index_col="Image Index")
datae = pd.read_csv(join(base_fld, "Data_Entry_2017_v2020.csv"), index_col="Image Index")

ds["View Position"] = datae["View Position"]  # both frames are indexed by "Image Index"
display(ds)

print("files read:")
print(f"dataset: {ds.shape}")
print(f"\t - with columns: {ds.columns}")

# > the same number of examples is drawn from every group
#   (the division by 4 presumably stands for 2 labels x 2 view positions -- TODO confirm)
group_cols = ["No Finding", "View Position"]
n_train_per_group = int((4000 + 1000) / 4)
n_test_per_group = int(2000/4)

train = ds[ds.split == "train"]
training_set = train.groupby(group_cols).apply(lambda grp: grp.sample(n_train_per_group, random_state=shuffle_seed))
test = ds[ds.split == "test"]
test_set = test.groupby(group_cols).apply(lambda grp: grp.sample(n_test_per_group, random_state=shuffle_seed))
# <

# >
# after groupby().apply(), "View Position" and "No Finding" are both index levels and columns
def filter_columns(df):
    """Return `df` indexed by "Image Index" only, keeping just the columns named in `paper_labels`.

    NOTE(review): `paper_labels` is a global defined by the PART I cells, which must have been run.
    """
    # the column copies go first: otherwise reset_index() would clash with the same-named index levels
    without_duplicates = df.drop(columns=["View Position", "No Finding"])
    by_image = without_duplicates.reset_index().set_index("Image Index")
    unwanted = [name for name in by_image.columns if name not in paper_labels]
    return by_image.drop(columns=unwanted)

training_set = filter_columns(training_set)
test_set = filter_columns(test_set)
# <

# > check no intersection
def common_index(d1, d2):
    """Return a boolean array flagging the index entries of `d1` that also occur in the index of `d2`."""
    return d1.index.isin(d2.index)

assert nnz(common_index(training_set, test_set)) ==0
# <

# > check: PASSED
# print(test_set.iloc[0])
# key = "00028856_000.png"
# print (key in train.index.values)
# print (key in test.index.values)

# print(ds.loc[key])
# <

# > stratified k-fold cross-validation over the sampled training set
from sklearn.model_selection import StratifiedKFold
target = "No Finding"
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=shuffle_seed)
X = training_set.index.to_numpy()
y = training_set[target].to_numpy()

# FIX: the previous loop could not run:
#   - StratifiedKFold.split() has no `stratify` argument: it stratifies on the `y` it receives;
#   - training_set["No Finding", "View Position"] is a lookup with a tuple key (KeyError), and
#     "View Position" had already been removed from training_set by filter_columns();
#   - `i` was unpacked without enumerate().
# To stratify on both variables they are combined into a single label per image; the view position
# is looked up in `ds`, which shares the "Image Index" index with training_set.
view_position = ds.loc[training_set.index, "View Position"].astype(str)
cv_strata = (training_set[target].astype(str) + "|" + view_position).to_numpy()

for i, (train_idx, valid_idx) in enumerate(skf.split(training_set, cv_strata)):
    # train_idx / valid_idx are positions into training_set (and into X, y)
    pass  # TODO: build and save fold i
# <




<font color="red">N_ITER BOOTSTRAPS</font>

In [None]:
# build train, valid and test sets for Positive and Negative,
# keeping them balanced

# > read the encoded dataset and attach the view position of each image
dataset_tsv = join(base_fld, "chest-xray8_uc5.tsv")
entries_csv = join(base_fld, "Data_Entry_2017_v2020.csv")
ds = pd.read_csv(dataset_tsv, sep="\t", index_col="Image Index")
datae = pd.read_csv(entries_csv, index_col="Image Index")
ds["View Position"] = datae["View Position"]  # both frames are indexed by "Image Index"
# display(ds)

print("files read:")
print(f"dataset: {ds.shape}")
print(f"\t - with columns: {ds.columns}")
# <

# > ViewPosition and NoFinding are both in the index and in the columns
def filter_columns(df):
    """Return a new frame with only the columns of `df` whose name is in the global `paper_labels`.

    The index is left untouched: callers remove the two group levels themselves with
    reset_index(level=[0, 1], drop=True) before calling this function.
    """
    unwanted = [name for name in df.columns if name not in paper_labels]
    return df.drop(columns=unwanted)
# <


# >
def train_validation_split(dataset, n_train, n_valid, n_test, seed=1, n_iter=1):
    """Yield `n_iter` balanced (training_set, validation_set, test_set) triples drawn from `dataset`.

    `dataset` must have the columns "split" ("train" / "test"), "No Finding" and "View Position".
    Every set is sampled with the same number of rows from each (No Finding, View Position) group;
    the `// 4` below presumably stands for 2 labels x 2 view positions -- TODO confirm.

    Parameters:
        dataset: DataFrame indexed by "Image Index".
        n_train: requested size of each training set.
        n_valid: requested size of each validation set.
        n_test:  requested size of the test set.
        seed:    iteration i samples the training + validation pool with random_state = seed + i.
        n_iter:  number of triples to yield.

    Yields:
        (training_set, validation_set, test_set), each indexed by "Image Index" and reduced to the
        label columns by filter_columns(). The test set is sampled once and is the same object in
        every triple.

    Side effects: prints the label / view-position distributions of the data and of every split.

    NOTE(review): the test sampling and the train/validation split use the global `shuffle_seed`,
    not `seed`, so the test set does not change with `seed`.
    """
    group_cols = ["No Finding", "View Position"]

    #> single test set for all the train/valid splits
    test = dataset[dataset.split == "test"]
    test_set = test.groupby(group_cols).apply(lambda x: x.sample( n_test//4, random_state=shuffle_seed) )
    # groupby().apply() puts the two group keys in front of "Image Index" in the index: remove them
    test_set = test_set.reset_index(level=[0,1], drop=True)
    test_set = filter_columns(test_set)

    train_data = dataset[dataset.split == "train"]
    print(f"* all training examples: {train_data.shape}")
    print(train_data["No Finding"].value_counts() / len(train_data))
    print(train_data["View Position"].value_counts() / len(train_data))
    print(train_data[group_cols].value_counts() / len(train_data))

    for i in range(n_iter):
        # pool of n_train + n_valid examples, then split into training and validation
        pool = train_data.groupby(group_cols).apply(lambda x: x.sample( (n_train + n_valid) // 4, random_state=seed+i) )
        # FIX: test_size was hard-coded to 1000, so `n_valid` was ignored by the split
        training_set, validation_set = train_test_split(pool,
                                                        test_size=n_valid,
                                                        shuffle=True,
                                                        random_state=shuffle_seed,
                                                        stratify=pool[group_cols])
        # index for training_set and validation_set is: "No Finding", "View Position", "Image Index"
        #   remove the first two levels
        training_set = training_set.reset_index(level=[0,1], drop=True)
        print("*TRAIN*")
        print(training_set.loc[:, group_cols].value_counts() / len(training_set))
        training_set = filter_columns(training_set)

        validation_set = validation_set.reset_index(level=[0,1], drop=True)
        print("*VALID*")
        print(validation_set.loc[:, group_cols].value_counts() / len(validation_set))
        validation_set = filter_columns(validation_set)

        yield training_set, validation_set, test_set
    # <

# > produce ecvl yaml
def ecvl_yaml(filenames, labels, train_ids, valid_ids, test_ids):
    """Build the dictionary that is dumped as the ECVL dataset description.

    `filenames` and `labels` are parallel sequences (one entry per image); `train_ids`,
    `valid_ids` and `test_ids` are stored as given under "split". "classes" holds the
    sorted distinct values of `labels`.
    """
    images = [{"location": location, "label": label} for location, label in zip(filenames, labels)]
    return {
        "name"        : "chest-xrays8, normal-vs-rest",
        "description" : "normal-vs-rest",
        "classes"     : sorted(set(labels)),
        "images"      : images,
        "split"       : {"training": train_ids, "validation": valid_ids, "test": test_ids},
    }
# <

def prepare_full_dataset(train, valid, test, label_col):
    """Stack train, valid and test (in this order) into one frame with columns "Image Index" and `label_col`.

    Each input must be indexed by "Image Index". The row labels of the result restart from 0
    for every part, because the parts are concatenated without renumbering.
    """
    wanted = ["Image Index", label_col]
    parts = [part.reset_index()[wanted] for part in (train, valid, test)]
    return pd.concat(parts, axis=0)
# <


# > 
import os
import yaml
output_fld = "/mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl/chestxray_normal"


def common_index(d1, d2):
    """Return a boolean array flagging the index entries of `d1` that also occur in the index of `d2`."""
    # defined once here instead of being re-defined at every iteration of the loop below
    return d1.index.isin(d2.index)


# one ECVL dataset (dataset.yml) per generated split, each in its own fold_<i> sub-folder
for i, (training_set, validation_set, test_set) in \
    enumerate(train_validation_split(ds, 4000, 1000, 2000, seed=shuffle_seed, n_iter=3)):
    # > check intersection: the three sets must be pairwise disjoint
    assert nnz(common_index(training_set, validation_set))==0
    assert nnz(common_index(training_set, test_set)) ==0
    assert nnz(common_index(validation_set, test_set)) == 0
    # <

    # prepare single dataframe with all the examples (stacked as: train, validation, test)
    examples = prepare_full_dataset(training_set, validation_set, test_set, "No Finding")
    filenames = examples["Image Index"].tolist()
    labels = examples["No Finding"].tolist()

    # idxs for ecvl dataset: positions of the examples of each split inside `filenames`
    # FIX: the validation and test ids used to start from the *last* id of the previous split
    #   (train_ids[-1] + v), so the last training image was also the first validation image, the
    #   last validation image was also the first test image and the last test image was never used.
    n_tr, n_va, n_te = len(training_set), len(validation_set), len(test_set)
    train_ids = list(range(n_tr))
    validation_ids = list(range(n_tr, n_tr + n_va))
    test_ids = list(range(n_tr + n_va, n_tr + n_va + n_te))
    assert len(filenames) == n_tr + n_va + n_te

    ecvl_ds = ecvl_yaml(filenames, labels, train_ids, validation_ids, test_ids)
    folder = join( output_fld, f"fold_{i}")
    os.makedirs(folder, exist_ok = True)
    with open(join(folder, "dataset.yml"), "w") as fout:
        yaml.safe_dump(ecvl_ds, fout, default_flow_style=True)

    print(f"dataset {examples.shape}, saved in: {folder}")

print("done.")

In [None]:
# scratch check: draw the same number of rows from every (No Finding, View Position) group
rows_per_group = (5000) // 4
grouped = ds.groupby(["No Finding", "View Position"], as_index=False)
g = grouped.apply(lambda grp: grp.sample(rows_per_group, random_state=1))
display(g)

Convert the last generated split (`training_set`, `validation_set`, `test_set`) to an ECVL dataset

In [None]:
# select data for the ecvl dataset

def prepare_full_dataset(train, valid, test, label_col):
    """Concatenate train, valid and test (in this order), keeping only "Image Index" and `label_col`.

    The inputs must be indexed by "Image Index"; the row labels of the result are not renumbered,
    so they restart from 0 at the beginning of each part.
    """
    selected = []
    for part in (train, valid, test):
        flat = part.reset_index()
        selected.append(flat[["Image Index", label_col]])
    return pd.concat(selected, axis=0)
# <

# NOTE(review): training_set / validation_set / test_set are the variables left in the kernel by
#   the split executed last -- this cell does not create them
stacked = prepare_full_dataset(training_set, validation_set, test_set, "No Finding")
examples = stacked.reset_index()  # the old row labels are kept as an additional "index" column
print(ds.split.value_counts())
display(examples)

# > t e s t: PASSED
# ex = examples.iloc[len(training_set)-1]
# idx = ex["Image Index"]
# split = ds.loc[idx, "split"]
# assert split == "train"

# ex = examples.iloc[len(training_set)+1]
# idx = ex["Image Index"]
# print(idx)
# print(validation_set)
# assert idx in validation_set.index

# ex = examples.iloc[len(training_set) + len(validation_set) +1]
# idx = ex["Image Index"]
# split = ds.loc[idx, "split"]
# assert split == "test"
# <




In [None]:
# > produce ecvl yaml
def ecvl_yaml(filenames, labels, train_ids, valid_ids, test_ids):
    """Return the ECVL dataset description as a plain dictionary, ready to be dumped to YAML.

    One {"location", "label"} entry is created per (file name, label) pair; the id lists are
    stored unchanged under "split" and "classes" is the sorted list of the distinct labels.
    """
    images = []
    for location, label in zip(filenames, labels):
        images.append(dict(location=location, label=label))
    classes = sorted(set(labels))
    return {
        "name"        : "chest-xrays8, normal-vs-rest",
        "description" : "normal-vs-rest",
        "classes"     : classes,
        "images"      : images,
        "split"       : {"training": train_ids, "validation": valid_ids, "test": test_ids},
    }
# <


# > 
import os  # FIX: `os` is used below but was only imported by an earlier cell
import yaml

filenames = examples["Image Index"].tolist()
labels = examples["No Finding"].tolist()

# positions of the examples of each split inside `filenames` (stacked as: train, validation, test)
# FIX: the validation and test ids used to start from the *last* id of the previous split
#   (train_ids[-1] + v), so the last training image was also the first validation image, the
#   last validation image was also the first test image and the last test image was never used.
n_tr, n_va, n_te = len(training_set), len(validation_set), len(test_set)
train_ids = list(range(n_tr))
validation_ids = list(range(n_tr, n_tr + n_va))
test_ids = list(range(n_tr + n_va, n_tr + n_va + n_te))
assert len(filenames) == n_tr + n_va + n_te

ecvl_ds = ecvl_yaml(filenames, labels, train_ids, validation_ids, test_ids)

output_fld = "/mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl/chestxray_normal"
os.makedirs(output_fld, exist_ok=True)
with open(join(output_fld, "dataset.yml"), "w") as fout:
    yaml.safe_dump(ecvl_ds, fout, default_flow_style=True)

print(f"dataset saved in: {output_fld}")  

<font color="red">Following cells are tests or outdated</font>

In [None]:
# study frequencies

# label columns only
# FIX: `ds` also carries the "View Position" column (added when the dataset is loaded), which holds
#   strings like "split" does: summing it does not give a frequency and breaks the bar plots below
cols = [c for c in ds.columns if c not in ("split", "View Position")]
print(cols)
train_ds = ds.loc[ds.split == "train", cols]
test_ds  = ds.loc[ds.split == "test", cols]

# number of images per label in each split
train_freqs = train_ds.sum(axis=0).to_numpy()
test_freqs = test_ds.sum(axis=0)

import matplotlib as mpl
import matplotlib.pyplot as plt

fig = plt.figure()
ax = fig.add_subplot(121)

_ = ax.bar(cols, train_freqs)
ax.set_title("train")
ax.tick_params(axis="x", labelrotation=90)  # the label names do not fit horizontally

ax = fig.add_subplot(122)
_ = ax.bar(cols, test_freqs, alpha=0.5)
ax.set_title("test")
ax.tick_params(axis="x", labelrotation=90)



In [None]:
# write the two splits as CSV files whose index column is called "filename"
for frame, csv_name in ((train_ds, "train_set.csv"), (test_ds, "test_set.csv")):
    frame.index.name = "filename"
    frame.to_csv(join(base_fld, csv_name))

In [None]:
# reload the two splits written as CSV (indexed by "filename")
train_ds = pd.read_csv(join(base_fld, "train_set.csv"), index_col="filename")
test_ds = pd.read_csv(join(base_fld, "test_set.csv"), index_col="filename")
cols = train_ds.columns.to_numpy()

def labels_to_str(row):
    """Return the names in the global `cols` whose value in `row` is truthy, joined by ";"."""
    mask = row.to_numpy().astype(bool)
    active = cols[mask]
    return ";".join(active)

# one string per image with its label combination, then the count of each combination
train_ds["labels"] = train_ds.apply(labels_to_str, axis=1)
label_counts = train_ds["labels"].value_counts()
print(label_counts)
