In [None]:
import glob
from collections import OrderedDict

from sklearn.preprocessing import MultiLabelBinarizer
from itertools import chain
import pandas as pd
from pathlib import Path
import numpy as np
from typing import Dict

# --- Configuration ----------------------------------------------------------
# Root directory of the dataset; all other paths are derived from it.
dataset_path = Path('/home/szymswiat/datasets/nih_dataset')
# When True, the 'No Finding' marker is blanked out of the label strings.
drop_no_findings_class = True
# Explicit list of class names; None means the list is derived from the CSV.
classes = None

# --- Metadata tables --------------------------------------------------------
data_entry_path = dataset_path.joinpath('Data_Entry_2017.csv')
bbox_list_path = dataset_path.joinpath('BBox_List_2017.csv')

data_entry_df = pd.read_csv(data_entry_path)
bbox_list_df = pd.read_csv(bbox_list_path)

# --- Image lookup -----------------------------------------------------------
# Maps a PNG file name to its path relative to `dataset_path`. Files are
# visited in sorted order of their full path strings.
all_image_paths = {
    Path(png_file).name: Path(png_file).relative_to(dataset_path)
    for png_file in sorted(glob.glob(f'{dataset_path}/images_*/images/*.png', recursive=True))
}

# Normalise the raw 'Finding Labels' strings in place: optionally blank out the
# 'No Finding' marker, then switch the label separator from '|' to ','.
if drop_no_findings_class:
    data_entry_df['Finding Labels'] = data_entry_df['Finding Labels'].map(lambda x: x.replace('No Finding', ''))

data_entry_df['Finding Labels'] = data_entry_df['Finding Labels'].map(lambda x: x.replace('|', ','))

# One list of label names per image. Empty tokens (what is left of a row once
# 'No Finding' has been blanked) are dropped here, so the binarizer is never
# handed '' as a label and does not warn about an unknown class on every run.
label_lists = [
    [label for label in labels.split(',') if label]
    for labels in data_entry_df['Finding Labels']
]

if classes is None:
    # Alphabetically sorted set of every label that occurs in the data.
    classes = sorted(set(chain.from_iterable(label_lists)))

    # When 'No Finding' is kept as a class it is moved to index 0. The
    # membership check avoids a ValueError if the label never occurs.
    if not drop_no_findings_class and 'No Finding' in classes:
        classes.remove('No Finding')
        classes.insert(0, 'No Finding')

# Multi-hot encoding: one row per image, one column per entry of `classes`.
encoder = MultiLabelBinarizer(classes=classes)
labels_all = encoder.fit_transform(label_lists)

# One row per entry of the metadata CSV: file name, path relative to the
# dataset root (None when the file name is not in `all_image_paths`),
# multi-hot label vector stored as a plain list, and patient id.
all_df = pd.DataFrame({'Image_Index': data_entry_df['Image Index']})
all_df = all_df.assign(
    Image_Path=lambda frame: frame['Image_Index'].map(all_image_paths.get),
    Label=labels_all.tolist(),
    Patient_ID=data_entry_df['Patient ID'],
)

# The split files are header-less, single-column lists of image file names.
train_val_list = pd.read_csv(dataset_path / 'train_val_list.txt', header=None)
test_list = pd.read_csv(dataset_path / 'test_list.txt', header=None)
for split_list in (train_val_list, test_list):
    split_list.columns = ['Image_Index']

# Inner joins keep only the rows whose image name appears in each list.
train_val_df = all_df.merge(train_val_list, on='Image_Index')
test_df = all_df.merge(test_list, on='Image_Index')

In [None]:
# Fraction of the train/val patients that is held out for validation.
val_patients_perc = 0.045

# The split is made over patients, not images, so that all images of one
# patient end up on the same side.
train_val_unique_patients = pd.Series(train_val_df['Patient_ID'].unique())

# Fixed random_state so the same patients are selected on every run.
val_patients = train_val_unique_patients.sample(frac=val_patients_perc, random_state=0)
train_patients = train_val_unique_patients.drop(index=val_patients.index)

train_patients = train_patients.to_frame(name='Patient_ID')
val_patients = val_patients.to_frame(name='Patient_ID')

# Rows are taken from `train_val_df` (not `all_df`): if a sampled patient also
# had images in the test list, merging against `all_df` would pull those test
# images into train/val.
train_df = train_val_df.merge(train_patients, on='Patient_ID')
val_df = train_val_df.merge(val_patients, on='Patient_ID')

# Every train/val row must land in exactly one of the two subsets.
assert len(train_df) + len(val_df) == len(train_val_df), 'train/val do not partition train_val_df'

In [None]:
# Registry of every split, keyed by split name. Each entry starts with its
# DataFrame under 'df' and gets the derived 'labels' and 'dist' entries below.
all_data = OrderedDict()
for split_name, split_df in [
    ('all', all_df),
    ('train_val', train_val_df),
    ('train', train_df),
    ('val', val_df),
    ('test', test_df),
]:
    all_data[split_name] = {'df': split_df}

for split in all_data.values():
    # Stack the per-row label lists into a single array.
    label_matrix = np.array(split['df']['Label'].tolist())
    split['labels'] = label_matrix
    # Per-class column sum divided by the number of rows.
    split['dist'] = label_matrix.sum(axis=0) / label_matrix.shape[0]

# One row per split ('<split>_dist'), one column per class.
classes_dist_df = pd.DataFrame(
    np.stack([split['dist'] for split in all_data.values()]),
    index=[f'{split_name}_dist' for split_name in all_data],
    columns=classes,
)

In [None]:
# Total number of distinct patients in the metadata CSV; denominator for
# the per-split patient share.
all_patients = data_entry_df['Patient ID'].nunique()

for name, data in all_data.items():
    # Distinct patients in this split, and its share of patients and of rows
    # relative to the full metadata table.
    data['patient_count'] = data['df']['Patient_ID'].nunique()
    data['patient_perc'] = data['patient_count'] / all_patients
    data['samples_perc'] = len(data['df']) / len(data_entry_df)

columns = ['patient_count', 'patient_perc', 'samples_perc']
# Built from one record per split rather than a stacked NumPy array, so each
# column keeps its own dtype: 'patient_count' stays an integer instead of
# being coerced to float alongside the two ratio columns.
patients_samples_dist = pd.DataFrame(
    [{column: stats[column] for column in columns} for stats in all_data.values()],
    index=[f'{n}' for n in all_data],
    columns=columns,
)