## COVID dataset preparation
Adapted from the COVID-Net notebook `create_COVIDx_v3.ipynb`: https://github.com/lindawangg/COVID-Net/blob/master/create_COVIDx_v3.ipynb

In [None]:
import numpy as np
import pandas as pd
import os
import random 
from shutil import copyfile
import pydicom as dicom
import cv2

In [None]:
# NOTE(review): presumably the IPython `%cd` automagic; with no argument it
# should switch to the home directory, which the relative './data/...' paths
# used by the following cells then resolve against -- confirm.
cd

In [None]:
# ---------------------------------------------------------------------------
# Global configuration for building the COVIDx dataset
# ---------------------------------------------------------------------------

# output root; images are written to <savepath>/{train,val,test}
savepath = './data/data_v3'

# fix both RNGs so that every run produces the same split
seed = 0
random.seed(seed)
np.random.seed(seed)

MAXVAL = 255  # Range [0 255]

# covid-19 dataset from https://github.com/ieee8023/covid-chestxray-dataset
cohen_imgpath = './data/data_source_v3/covid-chestxray-dataset-master/images' 
cohen_csvpath = './data/data_source_v3/covid-chestxray-dataset-master/metadata.csv'

# covid-19 dataset from https://github.com/agchung/Figure1-COVID-chestxray-dataset
fig1_imgpath = './data/data_source_v3/Figure1-COVID-chestxray-dataset-master/images'
fig1_csvpath = './data/data_source_v3/Figure1-COVID-chestxray-dataset-master/metadata.csv'

# RSNA challenge data: https://www.kaggle.com/c/rsna-pneumonia-detection-challenge
rsna_datapath = './data/data_source_v3/rsna-pneumonia-detection-challenge'
rsna_imgpath = 'stage_2_train_images'
# the normal cases are taken from this file
rsna_csvname = 'stage_2_detailed_class_info.csv' 
# the pneumonia cases are the rows labelled 1 in this file;
# images that are neither pneumonia nor normal were found to be labelled 0
rsna_csvname2 = 'stage_2_train_labels.csv' 

# accumulators for the COVIDx dataset: one sample list and one per-class
# counter for each subset
train, val, test = [], [], []
train_count = dict.fromkeys(('normal', 'pneumonia', 'COVID-19'), 0)
val_count = dict.fromkeys(('normal', 'pneumonia', 'COVID-19'), 0)
test_count = dict.fromkeys(('normal', 'pneumonia', 'COVID-19'), 0)

# source finding / label -> COVIDx class
mapping = {
    'COVID-19': 'COVID-19',
    'SARS': 'pneumonia',
    'MERS': 'pneumonia',
    'Streptococcus': 'pneumonia',
    'Klebsiella': 'pneumonia',
    'Chlamydophila': 'pneumonia',
    'Legionella': 'pneumonia',
    'Normal': 'normal',
    'Lung Opacity': 'pneumonia',
    '1': 'pneumonia',
}

# train/val/test split : 80:10:10
split = 0.1

# patient id -> image names already written, used to avoid duplicates
patient_imgpath = {}

In [None]:
# Load the metadata tables of the two COVID chest X-ray repositories.
# adapted from https://github.com/mlmed/torchxrayvision/blob/master/torchxrayvision/datasets.py#L814

# frontal projections that are kept (a PA-only filter was used previously)
views = ["PA", "AP", "AP Supine", "AP semi erect", "AP erect"]

cohen_csv = pd.read_csv(cohen_csvpath)
# boolean mask selecting the rows whose 'view' is one of the kept projections
cohen_idx_keep = cohen_csv['view'].isin(views)
cohen_csv = cohen_csv.loc[cohen_idx_keep]

# the Figure1 metadata is read as ISO-8859-1 and is not filtered by view
fig1_csv = pd.read_csv(fig1_csvpath, encoding='ISO-8859-1')
#fig1_idx_keep = fig1_csv.view.isin(views)
#fig1_csv = fig1_csv[fig1_idx_keep]

In [None]:
# Collect non-COVID19 viral, bacterial, and COVID-19 infections from the
# covid-chestxray-dataset and Figure1 metadata.
#
# filename_label maps each COVIDx class to a list of entries:
#   Cohen rows   -> [patient id (str), image filename, class, view]
#   Figure1 rows -> [patient id, image filename, class]
# NOTE: the two sources produce entries of different length, so consumers
# should only rely on positions 0-2.
# count holds the number of entries collected per class.
filename_label = {'normal': [], 'pneumonia': [], 'COVID-19': []}
count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}

for index, row in cohen_csv.iterrows():
    finding = row['finding']
    if not isinstance(finding, str):
        continue  # missing finding (NaN): nothing to map
    f = finding.split(',')[0]  # take the first finding, for the case of COVID-19, ARDS
    if f not in mapping:
        continue
    label = mapping[f]
    count[label] += 1
    entry = [str(row['patientid']), row['filename'], label, row['view']]
    filename_label[label].append(entry)

for index, row in fig1_csv.iterrows():
    finding = row['finding']
    if not isinstance(finding, str):
        continue  # missing finding (NaN): nothing to map
    f = finding.split(',')[0]  # take the first finding
    if f not in mapping:
        continue
    label = mapping[f]
    patientid = row['patientid']
    # Figure1 images are named after the patient id, stored as .jpg or .png
    for ext in ('.jpg', '.png'):
        if os.path.exists(os.path.join(fig1_imgpath, patientid + ext)):
            entry = [patientid, patientid + ext, label]
            count[label] += 1
            filename_label[label].append(entry)
            break
    else:
        # No image on disk: skip the row instead of re-appending the entry
        # left over from a previous iteration.
        print('Skipping', patientid, '- no .jpg/.png image found in', fig1_imgpath)

print('Data distribution from covid-chestxray-dataset:')
print(count)

In [None]:
# Add the covid-chestxray-dataset / Figure1 images to the COVIDx dataset.
# These sources have no predefined val/test subsets, so the split is done by
# patient id using fixed id lists; every other patient goes to train.
# Each image is copied to <savepath>/<subset>/ and recorded in the matching
# sample list (train/val/test) and per-class counter.

# copyfile does not create missing folders, so make sure they exist first
for subset in ('train', 'val', 'test'):
    os.makedirs(os.path.join(savepath, subset), exist_ok=True)

for key in filename_label:
    # Iterate the plain list of entries: rows can differ in length (4 fields
    # for Cohen rows, 3 for Figure1 rows) and building an ndarray from such
    # ragged rows raises ValueError on NumPy >= 1.24.
    arr = filename_label[key]
    if len(arr) == 0:
        continue
    # split by patients
    # num_diff_patients = len(np.unique(arr[:,0]))
    # num_test = max(1, round(split*num_diff_patients))
    # select num_test number of random patients
    if key == 'pneumonia':
        val_patients = ['31']
        test_patients = ['3']
    elif key == 'COVID-19':
        val_patients = ['2','19', '86', '94', '132', '144', '163', '169']
        test_patients = ['20', '36', '42', '51', '97', '117', '138', '150', '183', '184'] # random.sample(list(arr[:,0]), num_test)
    else: 
        val_patients = []
        test_patients = []
    print('Key: ', key)
    print('Val patients: ', val_patients)
    print('Test patients: ', test_patients)
    # go through all the patients
    for patient in arr:
        # patient = [patient id, image filename, class, ...]
        written = patient_imgpath.setdefault(patient[0], [])
        if patient[1] in written:
            continue  # skip since image has already been written
        written.append(patient[1])

        # Figure1 patient ids contain 'COVID'; everything else comes from
        # the Cohen repository. Resolved once so it applies to every subset.
        if 'COVID' in patient[0]:
            src_dir = fig1_imgpath
        else:
            src_dir = cohen_imgpath

        if patient[0] in val_patients:
            subset, samples, counter = 'val', val, val_count
        elif patient[0] in test_patients:
            subset, samples, counter = 'test', test, test_count
        else:
            subset, samples, counter = 'train', train, train_count

        copyfile(os.path.join(src_dir, patient[1]), os.path.join(savepath, subset, patient[1]))
        samples.append(patient)
        counter[patient[2]] += 1

print('train count: ', train_count)
print('val count: ', val_count)
print('test count: ', test_count)

In [None]:
# Gather the normal and the remaining pneumonia patient ids from
# https://www.kaggle.com/c/rsna-pneumonia-detection-challenge
csv_normal = pd.read_csv(os.path.join(rsna_datapath, rsna_csvname))
csv_pneu = pd.read_csv(os.path.join(rsna_datapath, rsna_csvname2))

# one id per matching CSV row, in file order (ids may repeat)
patients = {
    # rows whose class is 'Normal'
    'normal': [
        pid
        for pid, cls in zip(csv_normal['patientId'], csv_normal['class'])
        if cls == 'Normal'
    ],
    # rows whose Target is 1
    'pneumonia': [
        pid
        for pid, target in zip(csv_pneu['patientId'], csv_pneu['Target'])
        if int(target) == 1
    ],
}

In [None]:
# Add the RSNA pneumonia images to the COVIDx dataset.
# The unique patient ids are shuffled and split by patient id: the first
# `split` fraction goes to test, the next `split` fraction to val and the
# rest to train. Each DICOM file is read and written as
# <savepath>/<subset>/<patient id>.png.

arr_p = list(np.unique(patients['pneumonia']))
np.random.shuffle(arr_p)
num_test_p = round(split*len(arr_p))
test_p = arr_p[:num_test_p]
val_p = arr_p[num_test_p : 2*num_test_p]
# the val/test ids left over from the covid-chestxray split are kept as well
test_patients_pneumonia = test_patients + test_p
val_patients_pneumonia = val_patients + val_p
# sets give constant-time membership tests inside the loop
test_ids_pneumonia = set(test_patients_pneumonia)
val_ids_pneumonia = set(val_patients_pneumonia)

# cv2.imwrite cannot create missing folders, so make sure they exist
for subset in ('train', 'val', 'test'):
    os.makedirs(os.path.join(savepath, subset), exist_ok=True)

arr = np.array(patients['pneumonia'])

for patient in arr:
    if patient in patient_imgpath:
        continue  # skip since image has already been written

    ds = dicom.dcmread(os.path.join(rsna_datapath, rsna_imgpath, patient + '.dcm'))
    pixel_array_numpy = ds.pixel_array
    imgname = patient + '.png'

    if patient in test_ids_pneumonia:
        subset, samples, counter = 'test', test, test_count
    elif patient in val_ids_pneumonia:
        subset, samples, counter = 'val', val, val_count
    else:
        subset, samples, counter = 'train', train, train_count

    outpath = os.path.join(savepath, subset, imgname)
    # cv2.imwrite reports failure through its return value; without this
    # check an unwritten image would still be counted and exported.
    if not cv2.imwrite(outpath, pixel_array_numpy):
        raise IOError('Failed to write image: ' + outpath)

    # register the patient only once the image is on disk
    patient_imgpath[patient] = [patient]
    samples.append([patient, imgname, 'pneumonia'])
    counter['pneumonia'] += 1

print('train count: ', train_count)
print('val count: ', val_count)
print('test count: ', test_count)

In [None]:
# Add the RSNA normal/healthy images to the COVIDx dataset.
# The unique patient ids are shuffled and split by patient id: the first
# `split` fraction goes to test, the next `split` fraction to val and the
# rest to train. Each DICOM file is read and written as
# <savepath>/<subset>/<patient id>.png.

arr_n = list(np.unique(patients['normal']))
np.random.shuffle(arr_n)
num_test_n = round(split*len(arr_n))
test_n = arr_n[:num_test_n]
val_n = arr_n[num_test_n : 2*num_test_n]
# the val/test ids left over from the covid-chestxray split are kept as well
test_patients_normal = test_patients + test_n
val_patients_normal = val_patients + val_n
# sets give constant-time membership tests inside the loop
test_ids_normal = set(test_patients_normal)
val_ids_normal = set(val_patients_normal)

# cv2.imwrite cannot create missing folders, so make sure they exist
for subset in ('train', 'val', 'test'):
    os.makedirs(os.path.join(savepath, subset), exist_ok=True)

arr = np.array(patients['normal'])

for patient in arr:
    if patient in patient_imgpath:
        continue  # skip since image has already been written

    ds = dicom.dcmread(os.path.join(rsna_datapath, rsna_imgpath, patient + '.dcm'))
    pixel_array_numpy = ds.pixel_array
    imgname = patient + '.png'

    if patient in test_ids_normal:
        subset, samples, counter = 'test', test, test_count
    elif patient in val_ids_normal:
        subset, samples, counter = 'val', val, val_count
    else:
        subset, samples, counter = 'train', train, train_count

    outpath = os.path.join(savepath, subset, imgname)
    # cv2.imwrite reports failure through its return value; without this
    # check an unwritten image would still be counted and exported.
    if not cv2.imwrite(outpath, pixel_array_numpy):
        raise IOError('Failed to write image: ' + outpath)

    # register the patient only once the image is on disk
    patient_imgpath[patient] = [patient]
    samples.append([patient, imgname, 'normal'])
    counter['normal'] += 1

print('train count: ', train_count)
print('val count: ', val_count)
print('test count: ', test_count)

In [None]:
# Summary of the assembled dataset: per-class counters first, then the
# number of samples recorded for each subset.
print('Final stats')
for caption, class_counts in (
    ('Train count: ', train_count),
    ('Val count: ', val_count),
    ('Test count: ', test_count),
):
    print(caption, class_counts)
for caption, samples in (
    ('Total length of train: ', train),
    ('Total length of val: ', val),
    ('Total length of test: ', test),
):
    print(caption, len(samples))

In [None]:
# Export the train, val and test sample lists to CSV files in savepath.
# Each row is formatted as patientid, imagename, class; only the first three
# fields of a sample are used.
#
# Every frame is built in a single call: DataFrame.append was removed in
# pandas 2.0 and copied the whole frame on each appended row.
export_columns = ['patientId', 'imagename', 'class']

train_df = pd.DataFrame(
    [[str(sample[0]), sample[1], sample[2]] for sample in train],
    columns=export_columns)

val_df = pd.DataFrame(
    [[str(sample[0]), sample[1], sample[2]] for sample in val],
    columns=export_columns)

test_df = pd.DataFrame(
    [[str(sample[0]), sample[1], sample[2]] for sample in test],
    columns=export_columns)

# to_csv does not create missing folders
os.makedirs(savepath, exist_ok=True)
train_df.to_csv(os.path.join(savepath,'train.csv'),index=False)
val_df.to_csv(os.path.join(savepath,'val.csv'),index=False)
test_df.to_csv(os.path.join(savepath,'test.csv'),index=False)