# This is Medical Image Data Generator (15 classes)

In [1]:
import os
import shutil
import math

In [2]:
original_data_folder = '../../data/data_Medico_2018_development_set_v3_15_classes'

In [3]:
os.listdir(original_data_folder)

['stool-plenty',
 'colon-clear',
 'retroflex-rectum',
 'dyed-lifted-polyps',
 'instruments',
 'normal-pylorus',
 'stool-inclusions',
 'blurry-nothing',
 'out-of-patient',
 'esophagitis_normal_z_line',
 'retroflex-stomach',
 'ulcerative-colitis',
 'polyps',
 'dyed-resection-margins',
 'normal-cecum']

### Change this name to new for creating new data folder from the original data folder

In [4]:
base_dir = '../../data/data_generated_medicotask_15classes_70_30_v3'

In [5]:
train_dir = os.path.join(base_dir, 'train')
validation_dir = os.path.join(base_dir, 'validation')
#test_dir = os.path.join(base_dir, 'test')


main_classification_types = os.listdir(original_data_folder)

# original data location path
original_data_folder_name = original_data_folder

In [6]:
# os.listdir(os.path.join(train_dir, main_classification_types[0]))

In [7]:
 # os.listdir(os.path.join(train_dir, main_classification_types[0]))

## Need to take 80% for training and 20% for validation 

In [8]:
os.listdir(original_data_folder_name)

['stool-plenty',
 'colon-clear',
 'retroflex-rectum',
 'dyed-lifted-polyps',
 'instruments',
 'normal-pylorus',
 'stool-inclusions',
 'blurry-nothing',
 'out-of-patient',
 'esophagitis_normal_z_line',
 'retroflex-stomach',
 'ulcerative-colitis',
 'polyps',
 'dyed-resection-margins',
 'normal-cecum']

In [9]:
x = len(os.listdir(original_data_folder_name)) * (80/100)

In [10]:
x

12.0

In [11]:
math.floor(x)

12

## Change these values for generating new train, validation and test data sets

In [12]:
train_size = 70 # 70% 
validation_size = 30 # 30 %
test_size = 0 # 0 %

## Function for making new directory structure

In [13]:
def make_folder_structure(base_dir, main_classification_types_dirs):

    train_dir_list = []
    validation_dir_list = []
    test_dir_list = []


    if not os.path.exists(base_dir):
        os.mkdir(base_dir)  # new directory to make directory structure and load data

    main_directory_names = ['train', 'validation', 'test']

    for main_dir_name in main_directory_names:
        dir = os.path.join(base_dir, main_dir_name)
        if not os.path.exists(dir):
            os.mkdir(dir)

        for main_classification_types_dir in main_classification_types_dirs:
            type_dir = os.path.join(dir, main_classification_types_dir)

            if main_dir_name == 'train':
                train_dir_list.append(type_dir)

            elif main_dir_name == 'validation':
                validation_dir_list.append(type_dir)

            elif main_dir_name == 'test':
                test_dir_list.append(type_dir)

            if not os.path.exists(type_dir):
                os.mkdir(type_dir)

    return train_dir_list, validation_dir_list, test_dir_list


## Function for loading data to new directory structure 

In [14]:
def load_data_to_folder(original_data_folder, # original data folder
                        base_dir,  # folder to create training, validation and testing data
                        main_classification_types,  # main class types of the problem
                        size_of_training_percentage,  # percentage %
                        size_of_validation_percentage,  # percentage %
                        size_of_testing_percentage):  # percentage %

    for class_type in main_classification_types:
        src_dir = os.path.join(original_data_folder, class_type)
        files = os.listdir(src_dir)
        
        size_of_training = math.floor(len(files) * (size_of_training_percentage/100))
        size_of_validation = math.ceil(len(files) * (size_of_validation_percentage/100))
        size_of_testing = math.ceil(len(files) * (size_of_testing_percentage/100))
        

        print(len(files))
        print(size_of_training)
        print(size_of_validation)
        print(size_of_testing)
        
        training_files = files[0:size_of_training]
        validation_files = files[size_of_training:size_of_training + size_of_validation]
        testing_files = files[size_of_training + size_of_validation:size_of_training + size_of_validation + size_of_testing]

        

        # copying training images
        for file in training_files:
            src_file = os.path.join(src_dir, file)
            dst_file = os.path.join(base_dir + '/train/' + class_type, file)
            if not os.path.exists(dst_file):
                shutil.copyfile(src_file, dst_file)

        # copying validaton images
        for file in validation_files:
            src_file = os.path.join(src_dir, file)
            dst_file = os.path.join(base_dir + '/validation/' + class_type, file)
            if not os.path.exists(dst_file):
                shutil.copyfile(src_file, dst_file)

        # copying testing images
        for file in testing_files:
            src_file = os.path.join(src_dir, file)
            dst_file = os.path.join(base_dir + '/test/' + class_type, file)
            if not os.path.exists(dst_file):
                shutil.copyfile(src_file, dst_file)

        #training_dir = os.path.join(base_dir, 'train')
        
        size_of_training = 0
        size_of_validation = 0
        size_of_testing = 0



### Generating new folder structure for new data set version 2 with more training data and less validation and testing data

In [15]:
make_folder_structure(base_dir, main_classification_types)

(['../../data/data_generated_medicotask_15classes_70_30_v3/train/stool-plenty',
  '../../data/data_generated_medicotask_15classes_70_30_v3/train/colon-clear',
  '../../data/data_generated_medicotask_15classes_70_30_v3/train/retroflex-rectum',
  '../../data/data_generated_medicotask_15classes_70_30_v3/train/dyed-lifted-polyps',
  '../../data/data_generated_medicotask_15classes_70_30_v3/train/instruments',
  '../../data/data_generated_medicotask_15classes_70_30_v3/train/normal-pylorus',
  '../../data/data_generated_medicotask_15classes_70_30_v3/train/stool-inclusions',
  '../../data/data_generated_medicotask_15classes_70_30_v3/train/blurry-nothing',
  '../../data/data_generated_medicotask_15classes_70_30_v3/train/out-of-patient',
  '../../data/data_generated_medicotask_15classes_70_30_v3/train/esophagitis_normal_z_line',
  '../../data/data_generated_medicotask_15classes_70_30_v3/train/retroflex-stomach',
  '../../data/data_generated_medicotask_15classes_70_30_v3/train/ulcerative-colitis'

In [16]:
load_data_to_folder(original_data_folder_name, base_dir, main_classification_types, train_size, validation_size, test_size)

366
256
110
0
267
186
81
0
237
165
72
0
457
319
138
0
36
25
11
0
439
307
132
0
130
91
39
0
176
123
53
0
43
30
13
0
882
617
265
0
398
278
120
0
457
319
138
0
613
429
184
0
416
291
125
0
416
291
125
0
