# Data Augmentation Sets for Hierarchical Classification

Load the COVIDx dataset according to the protocol proposed in [1] (code available at this [link](https://github.com/lindawangg/COVID-Net)), and include the augmented data. Files are saved in HDF5 (`.h5`) format to speed up future loading.

The metadata can be downloaded from the [COVID-Net repository](https://github.com/lindawangg/COVID-Net).

[1] Wang, L., & Wong, A. (2020). COVID-Net: A tailored deep convolutional neural network design for detection of COVID-19 cases from chest radiography images. arXiv preprint arXiv:2003.09871.

In [0]:
from sklearn.metrics import confusion_matrix
import numpy as np
import tensorflow as tf
import os, argparse
import cv2

In [0]:
# Class name -> integer id. The keys are the label strings looked up in the
# COVIDx split files by the loading cells.
mapping = {
    'normal': 0,
    'pneumonia': 1,
    'COVID-19': 2,
}

# Caps on how many images are loaded: normal (N) and pneumonia (P) entries of
# the training split, and files taken from the augmented-data folder (COV).
NUM_IMGS_N = NUM_IMGS_P = NUM_IMGS_COV = 2000

First Level of Hierarchical data

In [0]:
# Placeholders to edit before running: the COVIDx training split listing and
# the folder that holds the training images it refers to.
train_file_paths = '/<path to COVIDx training files>/train_COVIDx.txt'
train_folder = '/<path to train data>/train'

In [0]:
# Build the first-level (normal vs. pneumonia) training set from the COVIDx
# training split. Each line is split on whitespace; field 1 is used as the
# image file name and field 2 as the class name. COVID-19 entries are
# relabelled as pneumonia at this level.
with open(train_file_paths, 'r') as split_file:  # closed even if reading fails
    train_lines = split_file.readlines()

y_train_1 = []
x_train_1 = []
count_P = 0
count_N = 0
skipped_train_1 = 0  # malformed lines, unknown labels and unreadable images
for entry in train_lines:
    fields = entry.split()
    if len(fields) < 3 or fields[2] not in mapping:
        # Skip explicitly instead of hiding the problem behind a bare `except`.
        skipped_train_1 += 1
        continue

    class_id = mapping[fields[2]]
    if class_id == mapping['COVID-19']:
        # COVID-19 case => pneumonia; these entries are never capped.
        label = mapping['pneumonia']
    elif class_id == mapping['normal'] and count_N < NUM_IMGS_N:
        label = class_id
    elif class_id == mapping['pneumonia'] and count_P < NUM_IMGS_P:
        label = class_id
    else:
        continue  # per-class cap already reached

    img = cv2.imread(train_folder + '/' + fields[1])
    if img is None:
        # cv2.imread returns None (it does not raise) for a missing or unreadable file.
        skipped_train_1 += 1
        continue

    # Resize to the network input size and scale pixel values to [0, 1].
    x_train_1.append(cv2.resize(img, (224, 224)).astype('float32') / 255.0)
    y_train_1.append(label)

    # Only images that were actually loaded count towards the caps.
    if class_id == mapping['normal']:
        count_N += 1
    elif class_id == mapping['pneumonia']:
        count_P += 1

print("Skipped training entries (first level):", skipped_train_1)
    
    

In [0]:
# includes COVID-19 images as PNEUMONIA class
count = 0
imgs_path = '/<path to augmented data>/'
for f in os.listdir(imgs_path): 
    if count < NUM_IMGS_COV:
      x = cv2.imread(os.path.join(imgs_path,f))
      x = cv2.resize(x, (224, 224))
      x = x.astype('float32') / 255.0
      y_train_1.append(mapping['pneumonia'])
      x_train_1.append(x)
      count = count + 1  

In [0]:
# Shapes of the first-level training data, computed without materialising a
# temporary copy of the whole image list (np.array(x_train_1).shape would build
# the full array just to read its shape). All entries are expected to share one
# shape because every image is resized to 224x224 before being appended.
dim1 = (len(x_train_1),) + (tuple(np.shape(x_train_1[0])) if len(x_train_1) else ())
dim2 = (len(y_train_1),)

In [0]:
# Placeholders to edit before running: the folder that holds the test images
# and the COVIDx test split listing.
test_folder = '/<path to test data>/test'
# Same listing (.txt, under the testing-files directory) that the second-level
# test cell of this notebook reads.
test_file_paths = '/<path to COVIDx testing files>/test_COVIDx.txt'

In [0]:
# Build the first-level (normal vs. pneumonia) test set from the COVIDx test
# split. Each line is split on whitespace; field 1 is used as the image file
# name and field 2 as the class name. No per-class cap is applied here.
with open(test_file_paths, 'r') as split_file:  # closed even if reading fails
    testfile = split_file.readlines()

y_test_1 = []
x_test_1 = []
count = 0
skipped_test_1 = 0  # malformed lines, unknown labels and unreadable images
for entry in testfile:
    fields = entry.split()
    if len(fields) < 3 or fields[2] not in mapping:
        # Skip explicitly instead of hiding the problem behind a bare `except`.
        skipped_test_1 += 1
        continue

    img = cv2.imread(test_folder + '/' + fields[1])
    if img is None:
        # cv2.imread returns None (it does not raise) for a missing or unreadable file.
        skipped_test_1 += 1
        continue

    class_id = mapping[fields[2]]
    # COVID-19 case => pneumonia; every other class keeps its own id.
    label = mapping['pneumonia'] if class_id == mapping['COVID-19'] else class_id

    # Resize to the network input size and scale pixel values to [0, 1].
    x_test_1.append(cv2.resize(img, (224, 224)).astype('float32') / 255.0)
    y_test_1.append(label)
    count += 1

print("Skipped test entries (first level):", skipped_test_1)

In [0]:
# Shapes of the first-level test data, computed without materialising a
# temporary copy of the whole image list. All entries are expected to share one
# shape because every image is resized to 224x224 before being appended.
dim3 = (len(x_test_1),) + (tuple(np.shape(x_test_1[0])) if len(x_test_1) else ())
dim4 = (len(y_test_1),)

In [0]:
# Convert the list of images to a single array. np.asarray gives the same result
# as np.array for a list, but does not copy again if this cell is re-run on an
# array that has already been converted.
x_train_1 = np.asarray(x_train_1)
x_train_1.shape

(4066, 224, 224, 3)

In [0]:
# Convert the training labels to an array; np.asarray does not copy again when
# the cell is re-run on an already-converted array.
y_train_1 = np.asarray(y_train_1)
y_train_1.shape

(4066,)

In [0]:
# Convert the list of test images to a single array; np.asarray does not copy
# again when the cell is re-run on an already-converted array.
x_test_1 = np.asarray(x_test_1)
x_test_1.shape

(210, 224, 224, 3)

In [0]:
# Convert the test labels to an array; np.asarray does not copy again when the
# cell is re-run on an already-converted array.
y_test_1 = np.asarray(y_test_1)
y_test_1.shape

(210,)

In [0]:
# Class balance check: distinct first-level training labels and how often each occurs.
uniqueValues, occurCount = np.unique(y_train_1, return_counts=True)
print("Unique Values Train: ", uniqueValues)
print("Occurrence Count Train: ", occurCount)

Unique Values Train:  [0 1]
Occurrence Count Train:  [2000 2066]


In [0]:
# Class balance check: distinct first-level test labels and how often each occurs.
uniqueValues, occurCount = np.unique(y_test_1, return_counts=True)
print("Unique Values Test: ", uniqueValues)
print("Occurrence Count Test: ", occurCount)

Unique Values Test:  [0 1]
Occurrence Count Test:  [100 110]


In [0]:
import h5py

# Persist the first-level dataset. The context manager closes the file even if
# a write fails. Dataset shapes are taken from the data itself rather than from
# dim1..dim4, which later cells overwrite with the second-level shapes, and
# np.asarray avoids copying arrays that are already converted.
with h5py.File('/<path to save first level dataset>/h5/dataset_Hierarch_part_1.h5', 'w') as h5f:
    h5f.create_dataset('train_X', data=np.asarray(x_train_1), chunks=True)
    h5f.create_dataset('train_y', data=np.asarray(y_train_1), chunks=True)
    h5f.create_dataset('test_X', data=np.asarray(x_test_1), chunks=True)
    h5f.create_dataset('test_y', data=np.asarray(y_test_1), chunks=True)

Second Level of Hierarchical data

In [0]:
# Start the second-level (pneumonia vs. COVID-19) training set with the
# augmented COVID-19 images. Second-level labels are the first-level ids shifted
# down by one, so COVID-19 becomes mapping['COVID-19'] - 1.
y_train_2 = []
x_train_2 = []

count = 0
imgs_path = '/<path to augmented data>/'
# os.listdir returns names in arbitrary order; sorting makes the capped subset reproducible.
for f in sorted(os.listdir(imgs_path)):
    if count >= NUM_IMGS_COV:
        break  # cap reached: no need to walk the rest of the directory
    x = cv2.imread(os.path.join(imgs_path, f))
    if x is None:
        # Not a readable image (e.g. a sub-directory or a stray non-image file).
        continue
    x = cv2.resize(x, (224, 224))
    x = x.astype('float32') / 255.0
    y_train_2.append(mapping['COVID-19'] - 1)
    x_train_2.append(x)
    count = count + 1
    
  

In [0]:
# Placeholders to edit before running: the COVIDx training split listing and
# the folder that holds the training images it refers to.
train_file_paths = '/<path to COVIDx training files>/train_COVIDx.txt'
train_folder = '/<path to train data>/train'

In [0]:
# Add pneumonia images from the COVIDx training split to the second-level
# training set. Each line is split on whitespace; field 1 is used as the image
# file name and field 2 as the class name.
# NOTE: this appends to x_train_2 / y_train_2 created by the augmented-data
# cell, so re-running it on its own adds the same images a second time.
with open(train_file_paths, 'r') as split_file:  # closed even if reading fails
    train_lines = split_file.readlines()

# NOTE(review): nothing in this cell reads these two lists; resetting them
# discards the first-level training data. This looks like a copy-paste leftover
# but is kept so the notebook's global state is unchanged — confirm and remove.
y_train_1 = []
x_train_1 = []
count_P = 0
skipped_train_2 = 0  # malformed lines, unknown labels and unreadable images
for entry in train_lines:
    fields = entry.split()
    if len(fields) < 3 or fields[2] not in mapping:
        # Skip explicitly instead of hiding the problem behind a bare `except`.
        skipped_train_2 += 1
        continue

    class_id = mapping[fields[2]]
    if class_id != mapping['pneumonia'] or count_P >= NUM_IMGS_P:
        continue  # only pneumonia entries are taken here, up to the cap

    img = cv2.imread(train_folder + '/' + fields[1])
    if img is None:
        # cv2.imread returns None (it does not raise) for a missing or unreadable file.
        skipped_train_2 += 1
        continue

    # Resize to the network input size and scale pixel values to [0, 1].
    x_train_2.append(cv2.resize(img, (224, 224)).astype('float32') / 255.0)
    # Second-level labels are the first-level ids shifted down by one.
    y_train_2.append(class_id - 1)
    count_P += 1

print("Skipped training entries (second level):", skipped_train_2)

In [0]:
# Placeholders to edit before running: the COVIDx test split listing and the
# folder that holds the test images it refers to.
test_file_paths = '/<path to COVIDx testing files>/test_COVIDx.txt'
test_folder = '/<path to test data>/test'

In [0]:
# Build the second-level (pneumonia vs. COVID-19) test set from the COVIDx test
# split. Each line is split on whitespace; field 1 is used as the image file
# name and field 2 as the class name. Normal images are left out.
with open(test_file_paths, 'r') as split_file:  # closed even if reading fails
    testfile = split_file.readlines()

y_test_2 = []
x_test_2 = []
count = 0
skipped_test_2 = 0  # malformed lines, unknown labels and unreadable images
for entry in testfile:
    fields = entry.split()
    if len(fields) < 3 or fields[2] not in mapping:
        # Skip explicitly instead of hiding the problem behind a bare `except`.
        skipped_test_2 += 1
        continue

    class_id = mapping[fields[2]]
    if class_id not in (mapping['pneumonia'], mapping['COVID-19']):
        continue  # not part of the second level

    img = cv2.imread(test_folder + '/' + fields[1])
    if img is None:
        # cv2.imread returns None (it does not raise) for a missing or unreadable file.
        skipped_test_2 += 1
        continue

    # Resize to the network input size and scale pixel values to [0, 1].
    x_test_2.append(cv2.resize(img, (224, 224)).astype('float32') / 255.0)
    # Second-level labels are the first-level ids shifted down by one.
    y_test_2.append(class_id - 1)
    count += 1

print("Skipped test entries (second level):", skipped_test_2)

In [0]:
# Shapes of the second-level data, computed without materialising temporary
# copies of the image lists. All entries are expected to share one shape
# because every image is resized to 224x224 before being appended.
dim1 = (len(x_train_2),) + (tuple(np.shape(x_train_2[0])) if len(x_train_2) else ())
dim2 = (len(y_train_2),)
dim3 = (len(x_test_2),) + (tuple(np.shape(x_test_2[0])) if len(x_test_2) else ())
dim4 = (len(y_test_2),)

In [0]:
import h5py

# Persist the second-level dataset. The context manager closes the file even if
# a write fails. Dataset shapes are taken from the data itself, so the file
# cannot be written with stale dim1..dim4 values left over from another cell.
with h5py.File('/<path to save data>/h5/dataset_covidNet_Hierarch_BALANCED_2000_part_2.h5', 'w') as h5f:
    h5f.create_dataset('train_X', data=np.asarray(x_train_2), chunks=True)
    h5f.create_dataset('train_y', data=np.asarray(y_train_2), chunks=True)
    h5f.create_dataset('test_X', data=np.asarray(x_test_2), chunks=True)
    h5f.create_dataset('test_y', data=np.asarray(y_test_2), chunks=True)