# Medical Image Data Generator

In [1]:
import os
import shutil

### Change this name to create a new data folder from the original data folder

In [3]:
# Root folder that receives the generated train/validation/test folders.
base_dir = 'data_v2'

In [11]:
# Split folders inside base_dir.
train_dir = os.path.join(base_dir, 'train')
validation_dir = os.path.join(base_dir, 'validation')
test_dir = os.path.join(base_dir, 'test')


# Class names; each one is used as a sub-folder name in the original data
# folder and in every split folder.
main_classification_types = ['dyed-lifted-polyps',
                             'dyed-resection-margins',
                             'esophagitis',
                             'normal-cecum',
                             'normal-pylorus',
                             'normal-z-line',
                             'polyps',
                             'ulcerative-colitis']

# original data location path
# Set the KVASIR_DATA_DIR environment variable to point at a local copy of the
# data; the original author's location is kept as the fallback so existing
# setups behave exactly as before.
original_data_folder_name = os.environ.get(
    'KVASIR_DATA_DIR',
    '/home/vajira/simula/Datasets/kvasir_v2_preprocessed_borders_navbox_removed')

## Change these values (number of images per class) to generate new train, validation and test data sets

In [6]:
# Number of images taken per class for the train, validation and test splits.
train_size, validation_size, test_size = 600, 200, 200

## Function for making new directory structure

In [7]:
def make_folder_structure(base_dir, main_classification_types_dirs):
    """Create ``base_dir/<split>/<class>`` folders and return their paths.

    The splits are ``train``, ``validation`` and ``test``. Folders that already
    exist are left untouched, so the function can be re-run safely.

    Parameters
    ----------
    base_dir : str
        Root folder of the generated data set. It is created if missing,
        including any missing parent folders.
    main_classification_types_dirs : iterable of str
        Class names; one sub-folder per class is created inside each split.

    Returns
    -------
    tuple of (list, list, list)
        Class folder paths of the train, validation and test splits, each in
        the order of ``main_classification_types_dirs``.
    """
    # Materialise once so a one-shot iterable (e.g. a generator) still serves
    # all three splits.
    class_names = list(main_classification_types_dirs)

    split_dir_lists = {'train': [], 'validation': [], 'test': []}

    for split_name, type_dirs in split_dir_lists.items():
        split_dir = os.path.join(base_dir, split_name)
        # Create the split folder even when there are no classes; exist_ok
        # replaces the check-then-mkdir pattern and also builds missing parents.
        os.makedirs(split_dir, exist_ok=True)

        for class_name in class_names:
            type_dir = os.path.join(split_dir, class_name)
            os.makedirs(type_dir, exist_ok=True)
            type_dirs.append(type_dir)

    return (split_dir_lists['train'],
            split_dir_lists['validation'],
            split_dir_lists['test'])


## Function for copying data into the new directory structure

In [8]:
def load_data_to_folder(original_data_folder, # original data folder
                        base_dir,  # folder to create training, validation and testing data
                        main_classification_types,  # main class types of the problem
                        size_of_training,  # number of images to training set
                        size_of_validation,  # number of images to validation set
                        size_of_testing):  # number of images to testing set
    """Copy a fixed number of files per class into train/validation/test folders.

    For every class, the regular files in ``original_data_folder/<class>`` are
    sorted by name and sliced into three consecutive, non-overlapping ranges:
    the first ``size_of_training`` files go to ``base_dir/train/<class>``, the
    next ``size_of_validation`` to ``base_dir/validation/<class>`` and the next
    ``size_of_testing`` to ``base_dir/test/<class>``. If a class holds fewer
    files than requested, the later ranges are simply shorter (or empty).

    Destination folders are created when missing, and files that already exist
    at the destination are not overwritten. The number of files found for each
    class is printed.

    Parameters
    ----------
    original_data_folder : str
        Folder containing one sub-folder per class.
    base_dir : str
        Root folder of the generated data set.
    main_classification_types : iterable of str
        Class names (sub-folder names).
    size_of_training, size_of_validation, size_of_testing : int
        Number of files per class for each split.
    """
    train_end = size_of_training
    validation_end = train_end + size_of_validation
    test_end = validation_end + size_of_testing

    for class_type in main_classification_types:
        src_dir = os.path.join(original_data_folder, class_type)

        # os.listdir returns entries in arbitrary order, so sort to make the
        # split reproducible across runs and machines. Keep regular files only:
        # shutil.copyfile cannot copy directories (e.g. '.ipynb_checkpoints').
        files = sorted(name for name in os.listdir(src_dir)
                       if os.path.isfile(os.path.join(src_dir, name)))

        print(len(files))

        splits = (('train', files[:train_end]),
                  ('validation', files[train_end:validation_end]),
                  ('test', files[validation_end:test_end]))

        for split_name, split_files in splits:
            dst_dir = os.path.join(base_dir, split_name, class_type)
            # Do not rely on the folder structure having been created beforehand.
            os.makedirs(dst_dir, exist_ok=True)

            for file_name in split_files:
                dst_file = os.path.join(dst_dir, file_name)
                # Never overwrite a file that is already in place.
                if not os.path.exists(dst_file):
                    shutil.copyfile(os.path.join(src_dir, file_name), dst_file)



### Generating new folder structure for new data set version 2 with more training data and less validation and testing data

In [9]:
# Build the split/class folders; the returned path lists are the cell output.
make_folder_structure(base_dir=base_dir,
                      main_classification_types_dirs=main_classification_types)

(['data_v2/train/dyed-lifted-polyps',
  'data_v2/train/dyed-resection-margins',
  'data_v2/train/esophagitis',
  'data_v2/train/normal-cecum',
  'data_v2/train/normal-pylorus',
  'data_v2/train/normal-z-line',
  'data_v2/train/polyps',
  'data_v2/train/ulcerative-colitis'],
 ['data_v2/validation/dyed-lifted-polyps',
  'data_v2/validation/dyed-resection-margins',
  'data_v2/validation/esophagitis',
  'data_v2/validation/normal-cecum',
  'data_v2/validation/normal-pylorus',
  'data_v2/validation/normal-z-line',
  'data_v2/validation/polyps',
  'data_v2/validation/ulcerative-colitis'],
 ['data_v2/test/dyed-lifted-polyps',
  'data_v2/test/dyed-resection-margins',
  'data_v2/test/esophagitis',
  'data_v2/test/normal-cecum',
  'data_v2/test/normal-pylorus',
  'data_v2/test/normal-z-line',
  'data_v2/test/polyps',
  'data_v2/test/ulcerative-colitis'])

In [12]:
# Copy the images of every class into the train/validation/test folders.
load_data_to_folder(original_data_folder=original_data_folder_name,
                    base_dir=base_dir,
                    main_classification_types=main_classification_types,
                    size_of_training=train_size,
                    size_of_validation=validation_size,
                    size_of_testing=test_size)

1000
1000
1000
1000
1000
1000
1000
1000
