# Data augmentation

In this notebook, we try out some data augmentation on the training set. Hopefully, augmenting the training data will give us better accuracy when training the model.

In [1]:
from PIL import Image, ImageOps
import os

## 1.0 Mirror images for normal x-rays

In [13]:
# Mirror every normal training x-ray horizontally and write both the original
# and its mirrored copy into the augmented dataset folder.
train_normal_path = './dataset/train/normal'
train_infected_path = './dataset/train/infected/non-covid'
train_covid_path = './dataset/train/infected/covid'

directory_normal = train_normal_path
new_path = './dataset_augmented/train/normal/'

# The destination tree is not created anywhere else in the notebook, so make
# sure it exists; otherwise a fresh run fails on the first save.
os.makedirs(new_path, exist_ok=True)

# Snapshot the listing once, sorted, so the mapping from source image to
# mirrored file number is the same on every run.
source_files = sorted(os.listdir(directory_normal))
source_names = set(source_files)

# Mirrored copies are numbered starting from the count of source images.
number_of_files = len(source_files)

for file in source_files:
    full_path = train_normal_path + '/' + file

    # Skip numbers whose '<n>.jpg' name is already used by a source image, so a
    # mirrored copy and a copied original never overwrite each other.
    while str(number_of_files) + '.jpg' in source_names:
        number_of_files += 1

    new_filename = new_path + str(number_of_files) + '.jpg'
    old_filename = new_path + file

    # The context manager closes the source file handle after each image.
    with Image.open(full_path) as im:
        im.save(old_filename)
        im_mirror = ImageOps.mirror(im)
        im_mirror.save(new_filename)
    number_of_files += 1

print("Done creating mirrored images for normal x-rays")

Done creating mirrored images for normal x-rays


## 1.1 Mirror images for covid x-rays

In [14]:
# Mirror every covid training x-ray horizontally and write both the original
# and its mirrored copy into the augmented dataset folder.
train_normal_path = './dataset/train/normal'
train_infected_path = './dataset/train/infected/non-covid'
train_covid_path = './dataset/train/infected/covid'

directory_covid = train_covid_path
new_path = './dataset_augmented/train/infected/covid/'

# The destination tree is not created anywhere else in the notebook, so make
# sure it exists; otherwise a fresh run fails on the first save.
os.makedirs(new_path, exist_ok=True)

# Snapshot the listing once, sorted, so the mapping from source image to
# mirrored file number is the same on every run.
source_files = sorted(os.listdir(directory_covid))
source_names = set(source_files)

# Mirrored copies are numbered starting from the count of source images.
number_of_files = len(source_files)

for file in source_files:
    full_path = train_covid_path + '/' + file

    # Skip numbers whose '<n>.jpg' name is already used by a source image, so a
    # mirrored copy and a copied original never overwrite each other.
    while str(number_of_files) + '.jpg' in source_names:
        number_of_files += 1

    new_filename = new_path + str(number_of_files) + '.jpg'
    old_filename = new_path + file

    # The context manager closes the source file handle after each image.
    with Image.open(full_path) as im:
        im.save(old_filename)
        im_mirror = ImageOps.mirror(im)
        im_mirror.save(new_filename)
    number_of_files += 1

print("Done creating mirrored images for covid")

Done creating mirrored images for covid


## 1.2 Mirror images for non-covid but infected x-rays

In [15]:
# Mirror every non-covid infected training x-ray horizontally and write both
# the original and its mirrored copy into the augmented dataset folder.
train_normal_path = './dataset/train/normal'
train_infected_path = './dataset/train/infected/non-covid'
train_covid_path = './dataset/train/infected/covid'

directory_infected = train_infected_path
new_path = './dataset_augmented/train/infected/non-covid/'

# The destination tree is not created anywhere else in the notebook, so make
# sure it exists; otherwise a fresh run fails on the first save.
os.makedirs(new_path, exist_ok=True)

# Snapshot the listing once, sorted, so the mapping from source image to
# mirrored file number is the same on every run.
source_files = sorted(os.listdir(directory_infected))
source_names = set(source_files)

# Mirrored copies are numbered starting from the count of source images.
number_of_files = len(source_files)

for file in source_files:
    full_path = train_infected_path + '/' + file

    # Skip numbers whose '<n>.jpg' name is already used by a source image, so a
    # mirrored copy and a copied original never overwrite each other.
    while str(number_of_files) + '.jpg' in source_names:
        number_of_files += 1

    new_filename = new_path + str(number_of_files) + '.jpg'
    old_filename = new_path + file

    # The context manager closes the source file handle after each image.
    with Image.open(full_path) as im:
        im.save(old_filename)
        im_mirror = ImageOps.mirror(im)
        im_mirror.save(new_filename)
    number_of_files += 1

print("Done creating mirrored images for infected")

Done creating mirrored images for infected


## 2.0 Data augmentation by doing histogram equalization

In [33]:
# Source folders of the original dataset, one per split and class.
train_normal_path = './dataset/train/normal'
train_infected_path = './dataset/train/infected/non-covid'
train_covid_path = './dataset/train/infected/covid'

test_normal_path = './dataset/test/normal'
test_infected_path = './dataset/test/infected/non-covid'
test_covid_path = './dataset/test/infected/covid'

val_normal_path = './dataset/val/normal'
val_infected_path = './dataset/val/infected/non-covid'
val_covid_path = './dataset/val/infected/covid'

# Create the output tree for every split/class combination. makedirs with
# exist_ok=True builds missing parents and is a no-op when the folder already
# exists, so no folder depends on another one having been created first.
for split in ('train', 'test', 'val'):
    for class_subdir in ('normal', 'infected/non-covid', 'infected/covid'):
        os.makedirs('./dataset_hist_equalization/' + split + '/' + class_subdir, exist_ok=True)


def histogram_equalization(src_folder, dst_folder):
    '''
    Apply histogram equalization to every image of one dataset split.

    The class sub-folders 'normal', 'infected/non-covid' and 'infected/covid'
    under src_folder are processed in that order. Each equalized image is saved
    under the same sub-folder and file name inside dst_folder. Destination
    sub-folders are created when they do not exist yet. Prints 'Done' once all
    three sub-folders have been processed.

    src_folder: folder for the original images, relative to the current working directory (string)
    dst_folder: folder to store images that have been applied histogram equalization,
                relative to the current working directory (string)
    '''
    class_subfolders = ('/normal', '/infected/non-covid', '/infected/covid')

    for subfolder in class_subfolders:
        # PATH TO ORIGINAL IMAGES
        src_path = './' + src_folder + subfolder
        # PATH TO NEW AUGMENTED IMAGES
        dst_path = './' + dst_folder + subfolder

        # List the source first so a missing source folder fails before any
        # output folder is created for it.
        file_names = os.listdir(src_path)
        os.makedirs(dst_path, exist_ok=True)

        for file in file_names:
            # The context manager closes the source file handle after each image.
            with Image.open(src_path + '/' + file) as im:
                im_equalized = ImageOps.equalize(im)
                im_equalized.save(dst_path + '/' + file)

    print('Done')

# Equalize each split in turn: validation first, then test, then training.
split_folders = [
    ('./dataset/val', './dataset_hist_equalization/val'),
    ('./dataset/test', './dataset_hist_equalization/test'),
    ('./dataset/train', './dataset_hist_equalization/train'),
]

for src_folder, dst_folder in split_folders:
    histogram_equalization(src_folder, dst_folder)

Done
Done
Done
