### MIMIC-CXR (dcm & jpg)
<font color="yellow">0. IMPORT AND DATA</font>

In [None]:
import pandas as pd

import numpy as np
from numpy import count_nonzero as nnz
import os
from posixpath import join
from IPython.display import display
from IPython.display import Image

# ! random seed

# Root folders of the JPG and DICOM releases of MIMIC-CXR v2.0.0.
jpg_fld = "/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physionet.org/files/mimic-cxr-jpg/2.0.0"
dcm_fld = "/mnt/datasets/mimic-cxr/mimic-cxr-dcm/physionet.org/files/mimic-cxr/2.0.0"


def _dcm_path(*parts):
    """Return the path obtained by joining `parts` under the DICOM release folder."""
    return join(dcm_fld, *parts)


def _jpg_path(*parts):
    """Return the path obtained by joining `parts` under the JPG release folder."""
    return join(jpg_fld, *parts)


# The files used below are split between the DCM and JPG releases.

# -- DCM release
filename_records = _dcm_path("cxr-record-list.csv")  # paths to the dcm images (jpg paths basically mirror them)
filename_reports = _dcm_path("cxr-study-list.csv")  # text reports
filename_sections = _dcm_path("mimic-cxr-sections", "mimic_cxr_sectioned.csv")

# -- JPG release
filename_metadata = _jpg_path("mimic-cxr-2.0.0-metadata.csv")
filename_chexpert = _jpg_path("mimic-cxr-2.0.0-chexpert.csv")  # labels
filename_negbio = _jpg_path("mimic-cxr-2.0.0-negbio.csv")  # labels
filename_split = _jpg_path("mimic-cxr-2.0.0-split.csv")  # suggested training-test split
# preprocessing code from: https://github.com/MIT-LCP/mimic-cxr/tree/master/notebooks

# Record list: one row per DICOM, with its study and subject identifiers.
df = pd.read_csv(filename_records, header=0, sep=',')

n = len(df)
print(f'{n} DICOMs in MIMIC-CXR v2.0.0.')

# number of distinct studies, then of distinct subjects
for id_column, unit in (('study_id', 'studies'), ('subject_id', 'subjects')):
    n = df[id_column].nunique()
    print(f'  {n} {unit}.')

# identifiers of every DICOM in the record list
dicoms = set(df['dicom_id'].to_list())

# suggested split and per-image metadata (both shipped with the JPG release)
df_split = pd.read_csv(filename_split)
df_metadata = pd.read_csv(filename_metadata)

display(df_metadata.T)

# Load the labels produced by the two labelers and show both tables transposed.
cx = pd.read_csv(filename_chexpert)
nb = pd.read_csv(filename_negbio)

for title, label_table in (("chexpert", cx), ("negbio", nb)):
    print(title)
    display(label_table.T)

# Columns present in only one of the two files; both are expected to expose the same columns.
cols = list(cx.columns.symmetric_difference(nb.columns))

assert len(cols) == 0


<font color="yellow">1. VIEW</font> correct "view" field

In [None]:
# ViewPosition values, grouped by the coarse view category they map to
_FRONTAL_POSITIONS = ('AP', 'PA')
_LATERAL_POSITIONS = ('LATERAL', 'LL')
_OTHER_POSITIONS = ('LPO', 'RAO', 'RPO', 'LAO')
# the below are overwritten in some instances by manual review
_OTHER_POSITIONS_REVIEWED = (
    'AP AXIAL',
    'XTABLE LATERAL',
    'AP LLD',
    'PA LLD',
    'L5 S1',
    'SWIMMERS',
    'AP RLD',
    'PA RLD',
)

# initialize view with a mapping from ViewPosition
VIEW_MAP = {
    **dict.fromkeys(_FRONTAL_POSITIONS, 'frontal'),
    **dict.fromkeys(_LATERAL_POSITIONS, 'lateral'),
    **dict.fromkeys(_OTHER_POSITIONS, 'other'),
    **dict.fromkeys(_OTHER_POSITIONS_REVIEWED, 'other'),
}

# Derive the coarse view category from ViewPosition; values that are not keys
# of VIEW_MAP (including missing ones) end up as NaN in the new 'view' column.
df_metadata['view'] = df_metadata['ViewPosition'].map(VIEW_MAP)

# for 'other' category, currently many of these are simply unknown
# so try to update them with acq device map
# NOTE: ADPD_MAP is only referenced by the commented-out lines at the end of
# this block, so it currently has no effect on df_metadata.
ADPD_MAP = {
    'CHEST, LATERAL': 'lateral',
    'CHEST, PA': 'frontal',
    # manually checked 100 records, below is always frontal
    'CHEST, PORTABLE': 'frontal',
    'CHEST, PA X-WISE': 'frontal',
    'CHEST, AP (GRID)': 'frontal',
    'CHEST LAT': 'lateral',
    'CHEST PA': 'frontal',
    'CHEST, AP NON-GRID': 'frontal',
    'CHEST AP NON GRID': 'frontal',
    'CHEST PA X-WISE': 'frontal',
    'CHEST AP GRID': 'frontal',
    'CHEST, PORTABLE X-WISE': 'other',
    # below have < 25 samples each
    'CHEST PORT': 'frontal',
    'CHEST PORT X-WISE': 'frontal',
    # manually classified below
    'SHOULDER': 'other',
    'CHEST, PEDI (4-10 YRS)': 'other',
    'LOWER RIBS': 'other',
    'CHEST, DECUB.': 'other',
    'ABDOMEN, PORTABLE': 'other',
    'UPPER RIBS': 'frontal',
    'STERNUM, LATERAL': 'lateral',
    'KNEE, AP/OBL': 'other',
    'STERNUM, PA/OBL.': 'other',
    'CLAVICLE/ AC JOINTS': 'other',
    'ABDOMEN,GENERAL': 'other',
    'LOWER RIB': 'other',
    'SCOLIOSIS AP': 'frontal'
}

# Only these two categories are kept; idxUpdate flags every row whose view is
# anything else (NaN or 'other').
good_view = ['frontal', 'lateral']
idxUpdate = ~df_metadata['view'].isin(good_view)

# ! this field is not present in current file
# The column name is kept for the disabled refinement below, which would remap
# the flagged rows through ADPD_MAP if the column were available.
c = 'AcquisitionDeviceProcessingDescription'
# idx = (df_metadata[c].notnull()) & idxUpdate
# df_metadata.loc[idx, 'view'] = df_metadata.loc[idx, c].map(ADPD_MAP)

# Manual overrides, keyed by dicom_id; each value is [ViewPosition, view].
DICOM_TO_VIEW = {
    '2164992c-f4abb30a-7aaaf4f4-383cab47-4e3eb1c8': ['PA', 'frontal'],
    '5e6881e2-ff4254e0-b99f0c2f-8964482a-031364db': ['LL', 'lateral'],
    'fcdf7a30-3236b74e-65b97587-cdd4cfde-63cd1de0': ['PA', 'frontal'],
    'fb074ec1-6715839c-84fa75e6-adc3f026-448b1481': ['PA', 'frontal'],
    'dfb8080a-8506e43e-840d9d58-0f738f41-82c120b0': ['PA', 'frontal'],
    '4b32608b-c2ead7c4-1fe5565f-42f7ab80-9dad30de': ['LL', 'lateral'],
    '53663e89-8f9ca9bb-df1bf434-8d6b1283-2b612609': ['LL', 'lateral'],
    # below are AP, but incorrectly in View Position
    '8672a4e7-366801a0-26cf2395-9344335c-aac8d728': ['AP', 'frontal'],
    '9800b28e-3ff3b417-18473be2-1a66131d-aca88488': ['AP', 'frontal'],
    '598cfe48-33a8643e-843e27e2-5dd584e7-3cd5f1c0': ['AP', 'frontal']
}

# we manually reviewed a few DICOMs to keep them in:
# only the view category (second element) is written back to the metadata
for dcm, (view_position, view) in DICOM_TO_VIEW.items():
    idx = df_metadata['dicom_id'].eq(dcm)
    if not idx.any():
        continue
    df_metadata.loc[idx, 'view'] = view

# remove rows that do not have a good view
idxUpdate = ~df_metadata['view'].isin(good_view)
print(f"removing {nnz(idxUpdate)} rows")
n_rows = df_metadata.shape[0]
# idxUpdate is a mask over df_metadata, so the labels to drop must be taken
# from df_metadata's own index. Indexing the record-list frame `df` with this
# mask is only correct when both frames happen to share the same index.
df_metadata.drop(df_metadata.index[idxUpdate], inplace=True)
# sanity check: exactly the flagged rows were removed
assert nnz(idxUpdate) == n_rows - df_metadata.shape[0]

print(f"using {df_metadata.shape[0]} examples")



<font color="yellow">2. SPLIT & FINDINGS</font>

In [None]:
# merge dataframes
# METADATA + SPLIT
# (study_id / subject_id are dropped from the metadata because the split file already carries them)
metadata_no_ids = df_metadata.drop(columns=['study_id', 'subject_id'])
df = df_split.merge(metadata_no_ids, on='dicom_id', how='inner')

# labels are joined per study, so subject_id is not needed in the label tables
nb = pd.read_csv(filename_negbio).drop(columns='subject_id')
cx = pd.read_csv(filename_chexpert).drop(columns='subject_id')

nb_findings = [col for col in nb.columns if col != 'study_id']
cx_findings = [col for col in cx.columns if col != 'study_id']

# METADATA + SPLIT + LABELS
df = df.merge(cx, how='left', on='study_id')

# indicator flags for the study having a CheXpert finding,
# computed over every CheXpert label except 'No Finding'
abnormal_cols = [col for col in cx_findings if col != 'No Finding']
abnormal_labels = df[abnormal_cols]
df['has_chexpert_finding'] = abnormal_labels.notnull().any(axis=1)
df['has_chexpert_pos_finding'] = abnormal_labels.gt(0).any(axis=1)
df['is_chexpert_normal'] = df[['No Finding']].gt(0).any(axis=1)
# lt(0) selects the -1 code, which the agreement cell of this notebook labels 'Uncertain'
df['has_chexpert_neg_finding'] = abnormal_labels.lt(0).any(axis=1)
# df['has_chexpert_pos_neg'] = df[[x for x in cx_findings if x != 'No Finding']].lt(0).sum(axis=1) > 0 & df[[x for x in cx_findings if x != 'No Finding']].gt(0).sum(axis=1) > 0
# df['has_negbio_finding'] = df[[x for x in nb_findings if x != 'No Finding']].notnull().sum(axis=1) > 0
# df['has_negbio_pos_finding'] = df[[x for x in nb_findings if x != 'No Finding']].notnull().sum(axis=1) > 0

# keep only frontal and lateral images
df.drop(index=df.index[~df['view'].isin(["frontal", "lateral"])], inplace=True)
print("view")
print(df["view"].value_counts())

# df[['dicom_id', 'split', 'view'] + findings].head().T

print("metadata + split + chexpert:", df.shape)
display(df.T)

In [None]:
# 1 use the entire training set

target = "No Finding"

# binary target: the "No Finding" value where present (No Finding is 1), 0 where it is missing
df["target"] = df[target].fillna(0)

# share of images per view and per target value
for col in ("view", "target"):
    display(df[["dicom_id", col]].groupby([col]).count() / len(df))

sub_columns = ["study_id", "dicom_id", "view", "target", "split"]
training_set = df.loc[df["split"] == "train", sub_columns]  # .groupby(["study_id", "target"]).sample(1)
validation_set = df.loc[df["split"] == "validate", sub_columns]
test_set = df.loc[df["split"] == "test", sub_columns]

# size and target distribution of each partition
for label, subset in (
    ("training", training_set),
    ("validation", validation_set),
    ("test", test_set),
):
    print(f"* {label} set {len(subset)}")
    print(subset[["dicom_id", "target"]].groupby(["target"]).count() / len(subset))

# stack the three partitions, keeping only what the exported file needs
sub_columns = ["dicom_id", "target", "split"]
dataset = pd.concat(
    [part[sub_columns] for part in (training_set, validation_set, test_set)],
    axis=0,
)
print(dataset.shape)

# attach the image path of every DICOM from the record list
paths = pd.read_csv(filename_records)
display(paths)

dataset = dataset.merge(paths[["dicom_id", "path"]], on="dicom_id", how="left")
display(dataset.T)

out_fld = "/mnt/datasets/mimic-cxr/training_data/mimic"
dataset.to_csv(join(out_fld, "normal_bin_unbal.tsv"), sep="\t", index=False)

<font color="yellow">unbalanced training sets</font>
50%-50% normal vs rest

In [None]:
# No-finding vs rest
# encoding no_finding = 1, with_findings = 0
# 1 image per dicom study: either training or validation study
# test set untouched (so not balanced)

# we now have one image per available view for each study



In [None]:
import numpy as np
import pandas as pd

# Toy example: replace the NaN values of one column with 0.
# The frame gets its own name so that the dataset table `df`, still read by
# the following cells, is not overwritten by this scratch check.
values = [[np.nan, 1], [np.nan, 1], [1, 1], [np.nan, 1], [3, 2]]
nan_demo = pd.DataFrame(data=values, columns=["A", "B"])
print(nan_demo)
# Assign the result back to the column: an in-place `where` on `frame["A"]`
# is a chained in-place update and does not reliably modify the parent frame.
nan_demo["A"] = nan_demo["A"].fillna(0)

print(nan_demo)

In [None]:
# validation and test sets, each joined with its CheXpert labels
splits = ["validate", "test"]
d = df.loc[df.split.isin(splits), ["split", "dicom_id", "study_id", "view"]]
d = d.merge(cx, on="study_id", how="left")

# one frame per split, keyed by split name (must exist before it is filled below)
partitions = {}
for p in splits:
    # NOTE(review): with a row mask, `where` replaces every column of the rows
    # whose "No Finding" is missing with 0 (identifiers included), not only the
    # label columns -- confirm this is the intended behaviour.
    partitions[p] = d[d.split==p].where(~d["No Finding"].isna(), 0)
for p in partitions:
    print(p)
    display(partitions[p].T)

In [None]:
# add labels for chexpert
# show the merged table, the CheXpert label table, then the merged table again
display(d, cx, d)

In [None]:
# transposed view: one column per image, one row per field
display(d.transpose())

<font color="yellow">STATS ABOUT HAS_FINDING</font>

In [None]:
splits = ['train', 'validate', 'test']
split_views = df.groupby(['split', 'view'])[['dicom_id']].count()
row_idx = ['frontal', 'lateral']  #, 'other'] other removed


def _count_with_pct(count, total):
    """Format `count` as 'N (P%)', where P is the percentage of `total` it represents."""
    return f'{count} ({100.0*count/total:3.1f}%)'


# number of images in each set
n_images = {}
for c in splits:
    n_images[c] = split_views.loc[c].loc[row_idx, 'dicom_id'].sum()

tbl = pd.DataFrame.from_dict(n_images, orient='index')
tbl.columns = ['Number of images']
tbl = tbl.T
display(tbl)

# number of images in each set for each view
n_images = {}
for c in splits:
    n_images[c] = {}
    for view in row_idx:
        n_images[c][view] = split_views.loc[c].loc[view, 'dicom_id']
n_images = pd.DataFrame.from_dict(n_images, orient='index')
display(n_images)
n_images = n_images.T
display(n_images)

# convert frontal/lateral/other into "N (%)"
# The cells are cast to object first: writing strings into integer columns
# relies on silent upcasting, which recent pandas versions deprecate.
n_images = n_images.astype(object)
for i in n_images.index:
    for c in splits:
        val = n_images.loc[i, c]
        n_images.loc[i, c] = _count_with_pct(val, tbl.loc["Number of images", c])

tbl = pd.concat([tbl, n_images], axis=0, sort=False)

# add in the number of studies
n_studies = df.groupby('split')[['study_id']].nunique().T
n_studies.index = ['Number of studies']
tbl = pd.concat([tbl, n_studies], axis=0, sort=False)

# studies with a finding (object dtype so the counts can become "N (%)" strings)
n_studies = df.loc[df['has_chexpert_finding']].groupby('split')[['study_id']].nunique().T.astype(object)
n_studies.index = ['  with a finding']
for c in splits:
    val = n_studies.loc['  with a finding', c]
    n_studies.loc['  with a finding', c] = _count_with_pct(val, tbl.loc["Number of studies", c])
tbl = pd.concat([tbl, n_studies], axis=0, sort=False)

# patients
n_pt = df.groupby('split')[['subject_id']].nunique().T
n_pt.index = ['Number of patients']
tbl = pd.concat([tbl, n_pt], axis=0, sort=False)

# patients with a finding (object dtype so the counts can become "N (%)" strings)
n_studies = df.loc[df['has_chexpert_finding']].groupby('split')[['subject_id']].nunique().T.astype(object)
n_studies.index = ['  with a finding']
for c in splits:
    val = n_studies.loc['  with a finding', c]
    n_studies.loc['  with a finding', c] = _count_with_pct(val, tbl.loc["Number of patients", c])
tbl = pd.concat([tbl, n_studies], axis=0, sort=False)

# tbl.to_latex('table2.tex')

display(tbl)

<font color="yellow">AGREEMENT</font>

In [None]:
nb = pd.read_csv(filename_negbio)
cx = pd.read_csv(filename_chexpert)

# merge these findings to create a table
# both agree -> output label
# disagree -> output -9

# drop subject_id from nb - we already have it in cx
# (NegBio columns get the '_nb' suffix, CheXpert columns keep their name)
df = cx.merge(
    nb.drop('subject_id', axis=1),
    how='left',
    left_on='study_id', right_on='study_id',
    suffixes=('', '_nb')
)

# subselect to training set
# (.copy() so the label columns can be overwritten below without writing into a slice of the merge)
study_ids = set(df_split.loc[df_split['split']=='train', 'study_id'])
df = df.loc[df['study_id'].isin(study_ids)].copy()

# replace numeric labels with meaningful labels
# also annotate disagreements between the two labelers
labels = {0: 'Negative', 1: 'Positive', -1: 'Uncertain', -9: 'Disagreement'}
for c in df.columns:
    if c in ('subject_id', 'study_id') or c.endswith('_nb'):
        continue

    # c is a chexpert column, c_nb the matching negbio column
    c_nb = f'{c}_nb'

    # disagreement, case 1: exactly one of the two labelers produced a label
    only_one_labelled = df[c].isnull() != df[c_nb].isnull()
    # disagreement, case 2: both produced a label, but not the same one
    both_differ = df[c].notnull() & df[c_nb].notnull() & (df[c] != df[c_nb])
    df.loc[only_one_labelled | both_differ, c] = -9

    # labels missing in both files stay NaN after the mapping
    df[c] = df[c].map(labels)

# drop negbio columns
cols_drop = [c for c in df.columns if c.endswith('_nb')]
df = df.drop(columns=cols_drop)

# display a few example cases
display(df.head(n=10))

# create a summary table of the findings
grp_cols = [c for c in df.columns if c not in ('subject_id', 'study_id')]
tbl = {}
for c in grp_cols:
    tbl[c] = df[c].value_counts().to_dict()
# a label value that never occurs for a finding is a count of zero; filling it
# keeps the counts integer so they are not rendered as floats or 'nan'
tbl = pd.DataFrame.from_dict(tbl, orient='index').fillna(0).astype(int)


# pretty format the labels
N = df.shape[0]
for c in tbl.columns:
    tbl[c] = tbl[c].apply(lambda x: f'{x:,} ({100.0*x/N:3.1f}%)')

# sort columns
print(f'Frequency of labels in MIMIC-CXR-JPG on the training subset of {df.shape[0]:,} unique radiologic studies.')
tbl = tbl[['Positive', 'Negative', 'Uncertain', 'Disagreement']]
# tbl.to_latex('findings_frequency.tex')
tbl

In [None]:
# Record list (its distinct study ids give the number of reports) and the sectioned reports.
df = pd.read_csv(filename_records, header=0, sep=',')
sections = pd.read_csv(filename_sections)

print(sections.columns)
N = df['study_id'].nunique()
print(f'Of the total {N:,} reports.. ')

# Walk the sections in order of preference: each one is counted only on the
# reports for which all the previous sections were empty.
idx = sections['study'].notnull()
for c in ('impression', 'findings', 'last_paragraph'):
    n = sections.loc[idx, c].notna().sum()
    print(f'  {n:,} ({100.0*n/N:3.1f}%) had a {c} section')
    # limit next check to only rows where this section is null
    idx = idx & sections[c].isna()



# ! code from github ends here

<font color="red">my code</font>

In [None]:
# Both tables are indexed by (subject_id, study_id).
study_key = ["subject_id", "study_id"]

# from dcm: read all files
records = pd.read_csv(filename_records)
ds = records.set_index(study_key)
print("record list - from dcm, shape:", ds.shape)
display(ds)

# from jpg
metadata_path = join(jpg_fld, "mimic-cxr-2.0.0-metadata.csv")
metadata = pd.read_csv(metadata_path).set_index(study_key)
print("metadata - from jpg, shape:", metadata.shape)
display(metadata)


# jpg


display some images

In [None]:
vp = metadata.ViewPosition
print(vp.value_counts())

# Attach the view position through dicom_id. Both frames are indexed by
# (subject_id, study_id), which is shared by every image of a study, so
# assigning `vp` directly would pair rows by their order in the two files
# rather than by image.
# NOTE(review): dicom_id is assumed to identify one metadata row; duplicates,
# if any, are resolved by keeping the first occurrence.
view_by_dicom = metadata.drop_duplicates(subset="dicom_id").set_index("dicom_id")["ViewPosition"]
ds["ViewPosition"] = ds["dicom_id"].map(view_by_dicom)

display(ds)


def show_view_examples(view_position, positions, show_count=False):
    """Display a few JPG images acquired with the given ViewPosition.

    Args:
        view_position: value of the ViewPosition column to select.
        positions: row positions, within the selected images, to display;
            positions outside the selection are skipped.
        show_count: when True, also print how many images were selected.
    """
    print(view_position)
    selected = ds.loc[ds.ViewPosition == view_position, ["path"]]
    if show_count:
        print(len(selected))
    in_range = [pos for pos in positions if 0 <= pos < len(selected)]
    for fn in selected.iloc[in_range].path:
        # the record list stores .dcm paths; the JPG release uses the same paths with a .jpg extension
        fn = fn.replace(".dcm", ".jpg")
        display(Image(join(jpg_fld, fn), width=224, height=224))


# other views that can be inspected the same way
# show_view_examples("PA", [0, 1, 2])
# show_view_examples("AP", [0, 1, 2])
# show_view_examples("LATERAL", [0, 1, 2])
# show_view_examples("XTABLE LATERAL", [0, 1], show_count=True)

show_view_examples("LL", [10, 20], show_count=True)



- add column with labels
- add column with text

Read labels from chexpert/negbio in jpg folder
for each label
- +1 means positive mention
- 0 means negative mention
- -1 means uncertainty in the mention
- a missing value means the label is not mentioned


In [None]:
# jpg
# NOTE(review): na_filter=False turns off missing-value detection, so empty
# cells are presumably kept as empty strings instead of NaN -- confirm this is intended.
chexpert_path = join(jpg_fld, "mimic-cxr-2.0.0-chexpert.csv")
chexpert = pd.read_csv(chexpert_path, na_filter=False)
chexpert = chexpert.set_index(["subject_id", "study_id"])
print("chexpert - from jpg, shape:", chexpert.shape)
display(chexpert)

# every remaining column is a label
columns = chexpert.columns
print(f"n labels: {columns.size}")

There are several strategies for managing the uncertain label (-1), as suggested by the authors themselves:
- U-Ignore: We ignore the uncertain labels during training.
- U-Zeroes: We map all instances of the uncertain label to 0.
- U-Ones: We map all instances of the uncertain label to 1.
- U-SelfTrained: We first train a model using the U-Ignore approach to convergence, and then use the model to make predictions that re-label each of the uncertainty labels with the probability prediction outputted by the model.
- U-MultiClass: We treat the uncertainty label as its own class.

U-Ignore selected