### Download COVID Chest X-Ray and Kaggle Pneumonia challenge datasets.

Using the Kaggle API to fetch the dataset; make sure you have an API key, you're enrolled in the challenge at https://www.kaggle.com/c/rsna-pneumonia-detection-challenge/data, and have already run the instructions [here](https://github.com/Kaggle/kaggle-api).

Based on https://github.com/IliasPap/COVIDNet.

In [5]:
!mkdir ../datasets

In [6]:
!git clone https://github.com/ieee8023/covid-chestxray-dataset.git ../datasets/covid-chestxray-dataset

Cloning into '../datasets/covid-chestxray-dataset'...
remote: Enumerating objects: 8, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 1976 (delta 1), reused 0 (delta 0), pack-reused 1968[K
Receiving objects: 100% (1976/1976), 265.01 MiB | 77.78 MiB/s, done.
Resolving deltas: 100% (953/953), done.


In [10]:
!kaggle competitions download -p ../datasets/ rsna-pneumonia-detection-challenge
!mkdir ../datasets/rsna_dataset
!unzip -q ../datasets/rsna-pneumonia-detection-challenge.zip -d ../datasets/rsna_dataset

Downloading rsna-pneumonia-detection-challenge.zip to ../datasets
100%|██████████████████████████████████████▊| 3.65G/3.66G [00:21<00:00, 247MB/s]
100%|███████████████████████████████████████| 3.66G/3.66G [00:22<00:00, 178MB/s]


In [11]:
! pip install pydicom
import numpy as np
import pandas as pd
import os
import random 
from shutil import copyfile
import pydicom as dicom
import cv2

Collecting pydicom
  Downloading pydicom-1.4.2-py2.py3-none-any.whl (35.3 MB)
[K     |████████████████████████████████| 35.3 MB 37.1 MB/s eta 0:00:01
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-1.4.2


In [13]:
seed = 0
np.random.seed(seed) # Reset the seed so all runs are the same.
random.seed(seed)
MAXVAL = 255  # Range [0 255]

# Set up COVID chest x-ray dataset.
root = '../datasets/covid-chestxray-dataset'

savepath = root + '/data'
if(not os.path.exists(savepath)):
    os.makedirs(savepath)
savepath = root + '/data/train'
if(not os.path.exists(savepath)):
    os.makedirs(savepath)
savepath = root + '/data/test'
if(not os.path.exists(savepath)):
    os.makedirs(savepath)

savepath = root + '/data'
imgpath = root + '/images' 
csvpath = root + '/metadata.csv'

# Set up RSNA Kaggle Pneumonia challenge dataset.
kaggle_datapath = '../datasets/rsna_dataset'
kaggle_csvname = 'stage_2_detailed_class_info.csv'
kaggle_csvname2 = 'stage_2_train_labels.csv'
kaggle_imgpath = 'stage_2_train_images'

# Parameters for COVIDx dataset.
train = []
test = []
test_count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}
train_count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}

mapping = dict()
mapping['COVID-19'] = 'COVID-19'
mapping['SARS'] = 'pneumonia'
mapping['MERS'] = 'pneumonia'
mapping['Streptococcus'] = 'pneumonia'
mapping['Normal'] = 'normal'
mapping['Lung Opacity'] = 'pneumonia'
mapping['1'] = 'pneumonia'

# Train/test split
split = 0.1

### Preprocess COVIDx dataset

In [14]:
csv = pd.read_csv(csvpath, nrows=None)
idx_pa = csv["view"] == "PA"  # Keep only the PA view
csv = csv[idx_pa]

pneumonias = ["COVID-19", "SARS", "MERS", "ARDS", "Streptococcus"]
pathologies = ["Pneumonia", "Viral Pneumonia", "Bacterial Pneumonia", "No Finding"] + pneumonias
pathologies = sorted(pathologies)

In [15]:
# Get non-COVID19 viral, bacteria, and COVID-19 infections from covid-chestxray-dataset.
# CSV stored as patient id, image filename and label.
filename_label = {'normal': [], 'pneumonia': [], 'COVID-19': []}
count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}
print(csv.keys())
for index, row in csv.iterrows():
    f = row['finding']
    if f in mapping:
        count[mapping[f]] += 1
        entry = [int(row['patientid']), row['filename'], mapping[f]]
        filename_label[mapping[f]].append(entry)

print('Data distribution from covid-chestxray-dataset:')
print(count)

Index(['patientid', 'offset', 'sex', 'age', 'finding', 'survival', 'intubated',
       'intubation_present', 'went_icu', 'in_icu', 'needed_supplemental_O2',
       'extubated', 'temperature', 'pO2_saturation', 'leukocyte_count',
       'neutrophil_count', 'lymphocyte_count', 'view', 'modality', 'date',
       'location', 'folder', 'filename', 'doi', 'url', 'license',
       'clinical_notes', 'other_notes', 'Unnamed: 28'],
      dtype='object')
Data distribution from covid-chestxray-dataset:
{'normal': 0, 'pneumonia': 21, 'COVID-19': 142}


In [16]:
# Add covid-chestxray-dataset into COVIDx dataset. Since covid-chestxray-dataset doesn't have a
# test dataset, split into train/test by patientid.
#
# For COVIDx:
# Patient 8 is used as non-COVID19 viral test
# Patient 31 is used as bacterial test
# Patients 19, 20, 36, 42, 86 are used as COVID-19 viral test

for key in filename_label.keys():
    arr = np.array(filename_label[key])
    if arr.size == 0:
        continue
    # split by patients
    # num_diff_patients = len(np.unique(arr[:,0]))
    # num_test = max(1, round(split*num_diff_patients))
    # select num_test number of random patients
    if key == 'pneumonia':
        test_patients = ['8', '31']
    elif key == 'COVID-19':
        test_patients = ['19', '20', '36', '42', '86'] # random.sample(list(arr[:,0]), num_test)
    else: 
        test_patients = []
    print('Key: ', key)
    print('Test patients: ', test_patients)
    # go through all the patients
    for patient in arr:
        if patient[0] in test_patients:
            copyfile(os.path.join(imgpath, patient[1]), os.path.join(savepath, 'test', patient[1]))
            test.append(patient)
            test_count[patient[2]] += 1
        else:
            copyfile(os.path.join(imgpath, patient[1]), os.path.join(savepath, 'train', patient[1]))
            train.append(patient)
            train_count[patient[2]] += 1

print('test count: ', test_count)
print('train count: ', train_count)

Key:  pneumonia
Test patients:  ['8', '31']
Key:  COVID-19
Test patients:  ['19', '20', '36', '42', '86']
test count:  {'normal': 0, 'pneumonia': 5, 'COVID-19': 6}
train count:  {'normal': 0, 'pneumonia': 16, 'COVID-19': 136}


### Preprocess Kaggle dataset

In [17]:
# Add normal and rest of pneumonia cases from RSNA dataset.

kaggle_datapath = '../datasets/rsna_dataset'

print(kaggle_datapath)
csv_normal = pd.read_csv(os.path.join(kaggle_datapath, kaggle_csvname), nrows=None)
csv_pneu = pd.read_csv(os.path.join(kaggle_datapath, kaggle_csvname2), nrows=None)
patients = {'normal': [], 'pneumonia': []}

for index, row in csv_normal.iterrows():
    if row['class'] == 'Normal':
        patients['normal'].append(row['patientId'])

for index, row in csv_pneu.iterrows():
    if int(row['Target']) == 1:
        patients['pneumonia'].append(row['patientId'])

for key in patients.keys():
    arr = np.array(patients[key])
    if arr.size == 0:
        continue
    # split by patients 
    # num_diff_patients = len(np.unique(arr))
    # num_test = max(1, round(split*num_diff_patients))
    test_patients = np.load('rsna_test_patients_{}.npy'.format(key))
    for patient in arr:
        ds = dicom.dcmread(os.path.join(kaggle_datapath, kaggle_imgpath, patient + '.dcm'))
        pixel_array_numpy = ds.pixel_array
        imgname = patient + '.png'
        if patient in test_patients:
            cv2.imwrite(os.path.join(savepath, 'test', imgname), pixel_array_numpy)
            test.append([patient, imgname, key])
            test_count[key] += 1
        else:
            cv2.imwrite(os.path.join(savepath, 'train', imgname), pixel_array_numpy)
            train.append([patient, imgname, key])
            train_count[key] += 1

print('test count: ', test_count)
print('train count: ', train_count)

../datasets/rsna_dataset
test count:  {'normal': 885, 'pneumonia': 1058, 'COVID-19': 6}
train count:  {'normal': 7966, 'pneumonia': 8518, 'COVID-19': 136}


### Data split

In [18]:
print('Train count: ', train_count)
print('Test count: ', test_count)
print('Total length of train: ', len(train))
print('Total length of test: ', len(test))

Train count:  {'normal': 7966, 'pneumonia': 8518, 'COVID-19': 136}
Test count:  {'normal': 885, 'pneumonia': 1058, 'COVID-19': 6}
Total length of train:  16620
Total length of test:  1949


In [20]:
# Export to train and test csv.
# Columns: [patientid, filename, label]
train_file = open("train_split.txt","w") 
for sample in train:
    info = str(sample[0]) + ' ' + sample[1] + ' ' + sample[2] + '\n'
    train_file.write(info)

train_file.close()

test_file = open("test_split.txt", "w")
for sample in test:
    info = str(sample[0]) + ' ' + sample[1] + ' ' + sample[2] + '\n'
    test_file.write(info)

test_file.close()