# Split images into training, validation, and test sets

Code adapted from [this Stack Overflow question](https://stackoverflow.com/questions/53074712/how-to-split-folder-of-images-into-test-training-validation-sets-with-stratified)

In [5]:
import os
import numpy as np
import shutil
import random
from tqdm import tqdm

In [7]:
# Creating Train / Val / Test folders.
#
# For every class folder under `root_dir`, the images are shuffled with a fixed
# seed and copied into `new_dir`/{train,val,test}/<class>. Because the listing
# is sorted and the shuffle is seeded, re-running the cell (e.g. to resume an
# interrupted copy) reproduces exactly the same split instead of mixing images
# between the train / val / test folders.
root_dir = 'Concrete Crack Images for Classification'
new_dir = 'ozgenel_dataset_split'
classes_dir = ['/Positive', '/Negative']

val_ratio = 0.15
test_ratio = 0.15
seed = 42  # Fixes the shuffle so the split is reproducible

if val_ratio < 0 or test_ratio < 0 or val_ratio + test_ratio > 1:
    raise ValueError(
        'val_ratio and test_ratio must be non-negative and sum to at most 1, '
        'got val_ratio={} and test_ratio={}'.format(val_ratio, test_ratio)
    )

rng = np.random.default_rng(seed)

for class_number, cls in enumerate(classes_dir, start=1):
    src = root_dir + cls  # Folder to copy images from

    # Creating partitions of the data after shuffling.
    # os.listdir/os.scandir return entries in arbitrary order, so sort first:
    # otherwise the same seed could still give a different permutation.
    # Sub-directories are skipped because shutil.copy cannot copy them.
    with os.scandir(src) as entries:
        allFileNames = sorted(entry.name for entry in entries if entry.is_file())
    rng.shuffle(allFileNames)

    train_end = int(len(allFileNames) * (1 - val_ratio - test_ratio))
    val_end = int(len(allFileNames) * (1 - test_ratio))

    train_FileNames = [src + '/' + name for name in allFileNames[:train_end]]
    val_FileNames = [src + '/' + name for name in allFileNames[train_end:val_end]]
    test_FileNames = [src + '/' + name for name in allFileNames[val_end:]]

    destinations = {
        new_dir + '/train' + cls: train_FileNames,
        new_dir + '/val' + cls: val_FileNames,
        new_dir + '/test' + cls: test_FileNames,
    }

    # Create every target folder and validate it BEFORE copying anything:
    # files left over from a run with a different seed or different ratios
    # would silently leak images from one split into another.
    for dest, paths in destinations.items():
        os.makedirs(dest, exist_ok=True)
        expected = {os.path.basename(path) for path in paths}
        stale = set(os.listdir(dest)) - expected
        if stale:
            raise RuntimeError(
                '{} already contains {} file(s) that do not belong to this split; '
                'delete {} before re-running'.format(dest, len(stale), new_dir)
            )

    print('Handling class {}/{}'.format(class_number, len(classes_dir)))
    print('Total images: ', len(allFileNames))
    print('Training: ', len(train_FileNames))
    print('Validation: ', len(val_FileNames))
    print('Testing: ', len(test_FileNames))

    # Copy-pasting images
    for dest, paths in destinations.items():
        for name in tqdm(paths):
            shutil.copy(name, dest)

Handling class 1/2
Total images:  20000
Training:  14000
Validation:  3000
Testing:  3000


100%|████████████████████████████████████████████████████████████████████████████| 14000/14000 [09:36<00:00, 24.29it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 3000/3000 [02:23<00:00, 20.93it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 3000/3000 [02:36<00:00, 19.22it/s]


Handling class 2/2
Total images:  20000
Training:  14000
Validation:  3000
Testing:  3000


100%|████████████████████████████████████████████████████████████████████████████| 14000/14000 [10:18<00:00, 22.63it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 3000/3000 [01:47<00:00, 27.85it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 3000/3000 [01:58<00:00, 25.27it/s]
