In [None]:
import os
from posixpath import join

import numpy as np
import pandas as pd
from numpy import count_nonzero as nnz

# Root folder holding the dataset files (CSV metadata, split lists) that the
# cells below read and write. Set the CHESTXRAY8_DIR environment variable to
# point somewhere else without editing the notebook; when it is unset the
# original location is used.
base_fld = os.environ.get("CHESTXRAY8_DIR", "/mnt/datasets/mimic-cxr/chestx-ray8")

In [None]:
# Load the metadata table and report its size and column names.
entries_path = join(base_fld, "Data_Entry_2017_v2020.csv")
datae = pd.read_csv(entries_path)
print("Data_Entry_2017_v2020.csv, shape:", datae.shape)
print(datae.columns)

# Number of distinct "Finding Labels" strings, i.e. distinct label combinations.
lab_col = datae["Finding Labels"].value_counts()
print("unique combos:", len(lab_col))

In [None]:
# Collect every individual label; a "Finding Labels" entry holds one or more
# labels separated by "|".
labels = {lab for entry in datae["Finding Labels"] for lab in entry.split("|")}
# "No Finding" goes first, the remaining labels follow in alphabetical order.
labels.remove("No Finding")
labels = ["No Finding"] + sorted(labels)
print(f"unique labels ({len(labels)}):", labels)

# Lookup tables: label -> position, and position (as a string key) -> label.
l2i = {name: pos for pos, name in enumerate(labels)}
i2l = {str(pos): name for pos, name in enumerate(labels)}

In [None]:
# Multi-hot encode the findings: one row per entry of `datae`, one 0/1 column
# per label, indexed by the image filename.
filenames = datae["Image Index"].tolist()
df = (
    datae["Finding Labels"]
    .str.get_dummies(sep="|")               # vectorised split + indicator columns
    .reindex(columns=labels, fill_value=0)  # column order must follow `labels`
    .astype(int)
)
df.index = pd.Index(filenames)

# Plain-array copy of the encoding, kept under its original name.
encoding = df.to_numpy()
print("encoding: ", encoding.shape)

print(len(filenames))
print(encoding.shape)
display(df)

In [None]:
# Per-label counts over the full label set.
frequencies = df.sum(axis=0)
print("label frequencies:", frequencies)

# Labels used in the paper: Atelectasis, Cardiomegaly, Effusion, Infiltration,
# Mass, Nodule, Pneumonia and Pneumothorax, plus "No Finding".
# The paper spells the last one "Pneumathorax"; the dataset spelling is used here.
paper_labels = ["No Finding"] + ["Atelectasis", "Cardiomegaly", "Effusion", "Infiltration", "Mass", "Nodule", "Pneumonia", "Pneumothorax"]

# Every selected label must exist in the dataset; report the offenders if not.
missing_labels = [pl for pl in paper_labels if pl not in labels]
assert not missing_labels, f"labels not found in dataset: {missing_labels}"

rem_cols = [c for c in df.columns if c not in paper_labels]
print("will be removed:", rem_cols)

# `drop` returns a new frame, so `df` itself is left untouched.
df2 = df.drop(columns=rem_cols)
print("new shape of dataset:", df2.shape)

# Images whose findings were all among the removed labels now have 0 labels.
n_labels = df2.loc[:, paper_labels].sum(axis=1)
with_zero_labels = n_labels == 0
print(f"{nnz(with_zero_labels)} images without labels: kept and assigned to No Finding")

# ! important, see note in next code cell: these images are NOT dropped,
# they are marked as "No Finding" so that every image keeps at least one label.
df2.loc[with_zero_labels, "No Finding"] = 1

assert df2.shape[1] == len(paper_labels)

# Recompute the statistics on the reduced label set and check that no image
# is left without a label.
frequencies = df2.sum(axis=0)
n_labels = df2.sum(axis=1)
assert nnz(n_labels == 0) == 0
assert n_labels.sum() == df2.sum().sum()

ds = df2
# Only the temporary alias is removed; `df` is kept so that this cell can be
# re-run without re-executing the encoding cell.
del df2

print(f"dataset ds, final shape: {ds.shape}")


In [None]:
# check train_val_list and test_list
# 1. image exists in folder?
# 2. image exists in dataset?
def read_list(fn):
    """Return the lines of the text file `fn` as a list of strings.

    Each line keeps its trailing newline; callers strip it as needed.
    """
    with open(fn, "r") as handle:
        return list(handle)

train_val = [s.strip() for s in read_list(join(base_fld, "train_val_list.txt"))]
test = [s.strip() for s in read_list(join(base_fld, "test_list.txt"))]

print(f"train&validation: {len(train_val)}")
print(f"test: {len(test)}")
print(f"total: {len(train_val)+len(test)}")
assert len(train_val) + len(test) == ds.shape[0]
# NOTE: it seems that the authors also kept the images left with 0 labels
# after selecting the labels in paper_labels. From the paper, it looks like
# they were encoded as all zeros, i.e., in the "No Finding" class.

# Check 1 (image exists in folder) was run once with the code below:
#import os
#for r in ds.iterrows():
#    fn = r[0]
#    assert os.path.exists( join(base_fld, "images", fn) )
# DONE: all images exist
#
# Check 2 (image exists in dataset): selecting with a list of filenames via
# .loc raises KeyError (in current pandas) if any of them is not in the index.
ds["split"] = None
ds.loc[train_val, "split"] = "train"
ds.loc[test, "split"] = "test"
# The length check above does not detect a filename listed twice or in both
# lists; in that case some row would be left without a split.
assert ds["split"].notna().all(), "some images are in neither split list"

display(ds)

In [None]:
# study frequencies: per-label counts in the train and test splits

cols = [c for c in ds.columns if c != "split"]
print(cols)
# Bracket access avoids any clash between the "split" column and a DataFrame
# attribute of the same name.
train_ds = ds.loc[ds["split"] == "train", cols]
test_ds  = ds.loc[ds["split"] == "test", cols]

train_freqs = train_ds.sum(axis=0).to_numpy()
test_freqs = test_ds.sum(axis=0)

import matplotlib as mpl
import matplotlib.pyplot as plt

# One panel per split, side by side.
fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(12, 4))

ax = ax_train
_ = ax.bar(cols, train_freqs)
ax.set_title("train: images per label")
ax.set_ylabel("number of images")
ax.tick_params(axis="x", labelrotation=90)  # long label names overlap otherwise

ax = ax_test
_ = ax.bar(cols, test_freqs, alpha=0.5)
ax.set_title("test: images per label")
ax.tick_params(axis="x", labelrotation=90)

fig.tight_layout()



In [None]:
# Write each split to its own CSV file; the index is named "filename" so that
# it becomes the header of the first column.
for split_frame, csv_name in ((train_ds, "train_set.csv"), (test_ds, "test_set.csv")):
    split_frame.index.name = "filename"
    split_frame.to_csv(join(base_fld, csv_name))

In [None]:
# Reload both splits from disk, using the "filename" column as the index.
train_ds, test_ds = (
    pd.read_csv(join(base_fld, csv_name), index_col="filename")
    for csv_name in ("train_set.csv", "test_set.csv")
)
# Column names as an array, so that they can be selected with a boolean mask.
cols = train_ds.columns.to_numpy()

def labels_to_str(row):
    """Join the names of the active labels of one row with ";".

    Parameters
    ----------
    row : pandas.Series
        One row of a label table: the index holds the label names and each
        value is truthy (e.g. 1) where the label applies, falsy otherwise.

    Returns
    -------
    str
        The active label names in row order, separated by ";"; an empty
        string when no label is active.
    """
    # The names come from the row itself rather than from the module-level
    # `cols`, so the result stays correct even if `cols` is rebound elsewhere.
    active = row.to_numpy().astype(bool)
    return ";".join(row.index[active])

# Build the ";"-joined label string of every training row, then count how
# many rows share each label combination.
train_ds["labels"] = train_ds.apply(labels_to_str, axis=1)
label_counts = train_ds["labels"].value_counts()
print(label_counts)


In [None]:
# Show the metadata table, then the number of rows per "View Position" value
# (the last expression is rendered as the cell output).
display(datae)
datae["View Position"].value_counts()