# Data Augmentation Sets

Load the COVIDx dataset following the protocol proposed in [1] (code available at this [link](https://github.com/lindawangg/COVID-Net)), including the augmented data. Files are saved in HDF5 (`.h5`) format to speed up future loading.

The metadata can be downloaded from the [COVID-Net repository](https://github.com/lindawangg/COVID-Net).

[1] Wang, L., & Wong, A. (2020). COVID-Net: A tailored deep convolutional neural network design for detection of COVID-19 cases from chest radiography images. arXiv preprint arXiv:2003.09871.

In [0]:
from sklearn.metrics import confusion_matrix
import numpy as np
import tensorflow as tf
import os, argparse
import cv2
import numpy as np


In [0]:
# Class name -> integer label used for y_train / y_test.
mapping = {
    'normal': 0,
    'pneumonia': 1,
    'COVID-19': 2,
}

# Maximum number of training images loaded per class.
NUM_IMGS_N = 1_000    # 'normal'
NUM_IMGS_P = 1_000    # 'pneumonia'
NUM_IMGS_COV = 1_000  # 'COVID-19' - the only class that is augmented

In [0]:
# Start the training set with the COVID-19 (augmented) images.
# This cell (re)creates x_train / y_train, so re-running it restarts the
# training set from scratch.
y_train = []
x_train = []

count = 0
imgs_path = '/<path to COVID-19 augmented images>/'
# os.listdir() returns entries in arbitrary order; sorting makes the selected
# subset of NUM_IMGS_COV images the same on every run.
for f in sorted(os.listdir(imgs_path)):
    if count >= NUM_IMGS_COV:
        break  # quota reached - no need to walk the rest of the directory
    x = cv2.imread(os.path.join(imgs_path, f))
    if x is None:
        # cv2.imread() returns None (it does not raise) for sub-directories,
        # non-image files and unreadable images; skip them instead of
        # crashing inside cv2.resize().
        print('Skipping unreadable file:', f)
        continue
    x = cv2.resize(x, (224, 224))
    x = x.astype('float32') / 255.0  # scale 8-bit pixel values to [0, 1]
    y_train.append(mapping['COVID-19'])
    x_train.append(x)
    count = count + 1
    

In [0]:
# Record the shapes of the image / label collections after the COVID-19 pass.
dim0x, dim0y = np.shape(x_train), np.shape(y_train)

In [0]:
# Sanity check: shape of the labels collected so far.
np.shape(y_train)

(2000,)

In [0]:
# Sanity check: shape of the images collected so far.
np.shape(x_train)

(2000, 224, 224, 3)

In [0]:
# Placeholders - replace with real locations before running.
# train_folder: directory holding the training images referenced by the metadata file.
train_folder = '/<path to train data>/train'
# train_file_paths: metadata text file; the loader below reads the image file
# name from the 2nd whitespace-separated field and the class name from the 3rd.
train_file_paths = '/<path to COVIDx metadata files>/train_COVIDx.txt'

In [0]:
# Append up to NUM_IMGS_N 'normal' (label 0) and NUM_IMGS_P 'pneumonia'
# (label 1) images listed in the train metadata file to x_train / y_train.
# Entries with any other label are ignored by this cell.
# NOTE: this cell appends to the existing lists, so running it twice without
# re-running the cell that creates x_train / y_train duplicates the images.
count_P = 0
count_N = 0

# The context manager closes the metadata file even if reading fails.
with open(train_file_paths, 'r') as file:
    testfile = file.readlines()  # variable name kept from the original cell; holds the *train* metadata lines

skipped_unreadable = 0
for entry in testfile:
    if count_N >= NUM_IMGS_N and count_P >= NUM_IMGS_P:
        break  # both quotas are full - stop scanning the metadata file

    line = entry.split()
    if len(line) < 3:
        continue  # blank or malformed line: no file name / label fields

    label = mapping.get(line[2])  # None for class names missing from `mapping`
    if label == 0:
        if count_N >= NUM_IMGS_N:
            continue
    elif label == 1:
        if count_P >= NUM_IMGS_P:
            continue
    else:
        continue  # not a 'normal' / 'pneumonia' entry

    x = cv2.imread(train_folder + '/' + line[1])
    if x is None:
        # cv2.imread() signals a missing / unreadable image by returning None.
        skipped_unreadable += 1
        continue
    x = cv2.resize(x, (224, 224))
    x = x.astype('float32') / 255.0  # scale 8-bit pixel values to [0, 1]
    y_train.append(label)
    x_train.append(x)
    if label == 0:
        count_N = count_N + 1
    else:
        count_P = count_P + 1

if skipped_unreadable:
    print('Skipped', skipped_unreadable, 'unreadable train images')
    
    

In [0]:
# Record the shapes of the full training images / labels collections.
dim1, dim2 = np.shape(x_train), np.shape(y_train)

In [0]:
# Placeholders - replace with real locations before running.
# test_folder: directory holding the test images referenced by the metadata file.
test_folder = '/<path to test data>/test'
# test_file_paths: metadata text file for the test split.
# NOTE(review): the train metadata path ends in `.txt` and spells the
# placeholder "COVIDx"; this one uses `.dat` and "COVIdx" - confirm the
# extension matches the actual metadata file.
test_file_paths = '/<path to COVIdx metadata files>/test_COVIDx.dat'

In [0]:
# Load every test image listed in the test metadata file whose class name is
# present in `mapping`. This cell (re)creates x_test / y_test.
y_test = []
x_test = []
count = 0

# The context manager closes the metadata file even if reading fails.
with open(test_file_paths, 'r') as file:
    testfile = file.readlines()

skipped_unreadable = 0
for entry in testfile:
    line = entry.split()
    if len(line) < 3 or line[2] not in mapping:
        continue  # blank / malformed line or unknown class name

    x = cv2.imread(test_folder + '/' + line[1])
    if x is None:
        # cv2.imread() signals a missing / unreadable image by returning None.
        skipped_unreadable += 1
        continue
    x = cv2.resize(x, (224, 224))
    x = x.astype('float32') / 255.0  # scale 8-bit pixel values to [0, 1]
    y_test.append(mapping[line[2]])
    x_test.append(x)
    count = count + 1

if skipped_unreadable:
    print('Skipped', skipped_unreadable, 'unreadable test images')

In [0]:
# Record the shapes of the test images / labels collections.
dim3, dim4 = np.shape(x_test), np.shape(y_test)

In [0]:
# Turn the collected training images into a single ndarray and show its shape.
x_train = np.asarray(x_train)
np.shape(x_train)

(6000, 224, 224, 3)

In [0]:
# Turn the collected training labels into an ndarray and show its shape.
y_train = np.asarray(y_train)
np.shape(y_train)

(6000,)

In [0]:
# Turn the collected test images into a single ndarray and show its shape.
x_test = np.asarray(x_test)
np.shape(x_test)

(210, 224, 224, 3)

In [0]:
# Turn the collected test labels into an ndarray and show its shape.
y_test = np.asarray(y_test)
np.shape(y_test)

(210,)

In [0]:
# Class balance of the training labels: distinct label values and how often
# each one occurs.
uniqueValues, occurCount = np.unique(y_train, return_counts=True)
for caption, values in (
    ("Unique Values for Train Set: ", uniqueValues),
    ("Occurrence Count For Train Set: ", occurCount),
):
    print(caption, values)

Unique Values Train:  [0 1 2]
Occurrence Count Train:  [2000 2000 2000]


In [0]:
# Class balance of the test labels: distinct label values and how often each
# one occurs. (The messages previously said "Train Set" although the counts
# are computed from y_test.)
uniqueValues, occurCount = np.unique(y_test, return_counts=True)
print("Unique Values for Test Set: " , uniqueValues)
print("Occurrence Count For Test Set: ", occurCount)

Unique Values Test:  [0 1 2]
Occurrence Count Test:  [100 100  10]


In [0]:
import h5py

# Persist the train / test arrays to a single HDF5 file.
# - The context manager closes the file even if one of the writes fails, so
#   no half-open handle is left behind.
# - Each dataset takes its shape from the array being written rather than
#   from shape variables captured in earlier cells, which can be stale if
#   the loading cells were re-run.
# - np.asarray() avoids copying data that is already an ndarray.
with h5py.File('/<path to save files>/h5/dataset_COVIx_BALANCED_augmented.h5', 'w') as h5f:
    for dataset_name, dataset_values in (
        ('train_X', x_train),
        ('train_y', y_train),
        ('test_X', x_test),
        ('test_y', y_test),
    ):
        h5f.create_dataset(dataset_name, data=np.asarray(dataset_values), chunks=True)