## Train-Test Split Dataset

### Before using this notebook

- Install dependencies
```bash
pip install opencv-contrib-python
```

- Choose a destination path that does **not exist yet**; the notebook creates it and saves the split dataset there
- Edit `src_path` and `des_path` so they point to the correct paths

### Running

Follow the directions above to set up, then run the cells below in order

In [1]:
import glob
import os
import random
import shutil

import cv2

In [2]:
# Path to the dataset folder. Its entries must be exactly the class sub-folders
# of the selected cell group (for "wbc": basophil, eosinophil, lymphocyte,
# monocyte, neutrophil), otherwise the split is rejected.
src_path = './labels'
# Path to the new folder that receives the split dataset; the split is refused
# if this path already exists.
des_path = './dest'
# Fraction of each class assigned to the training set.
train_size = 0.8
# Fraction of each class assigned to the testing set. When train_size + test_size
# is below 1, the remaining files go to a validation set.
test_size = 0.1
# Which group of classes to split: "wbc", or "rbc" for red blood cells.
cell_group = "wbc"

In [3]:
def move_files_randomly(arr_files, length, path):
    """Copy up to ``length`` randomly chosen files into a class sub-folder of ``path``.

    Despite the name, the source files are left in place: each selected file
    is copied to ``<path>/<name of the file's parent folder>/<file name>``.

    Args:
        arr_files: List of source file paths. It is mutated in place: every
            selected path is removed, so a subsequent call on the same list
            can only pick from the files that are still unassigned.
        length: Number of files to select. Values larger than
            ``len(arr_files)`` are clamped to the number of available files;
            zero or negative values select nothing.
        path: Root folder of the split (e.g. the train folder). The class
            sub-folder is created on demand.

    Returns:
        None.
    """
    # Clamp so that rounding in the caller can never exhaust the list and
    # make the random pick fail on an empty sequence.
    count = min(max(length, 0), len(arr_files))
    for _ in range(count):
        # Pop by random index: picks and removes in one step.
        file = arr_files.pop(random.randrange(len(arr_files)))
        dest = os.path.join(path, os.path.basename(os.path.dirname(file)))
        os.makedirs(dest, exist_ok=True)
        # Byte-exact copy: no decode/re-encode round trip, so image quality,
        # channels and metadata are preserved and non-image files cannot
        # break the split.
        shutil.copy2(file, os.path.join(dest, os.path.basename(file)))

In [4]:
# Class sub-folder names for each supported cell group.
# The list selected by `cell_group` is compared against the entries of the
# source folder and iterated when splitting.
class_name = {
    # white blood cell classes
    'wbc': ['lymphocyte', 'neutrophil', 'basophil', 'monocyte', 'eosinophil'],
    # red blood cell classes
    'rbc': ['circular', 'elongated', 'other']
}
    
def train_test_split(src_path, des_path, train_size, test_size):
    """Split a folder of per-class images into train / test (/ valid) folders.

    The classes are taken from the module-level ``class_name[cell_group]``.
    For every class, the files in ``<src_path>/<class>/`` are distributed at
    random via ``move_files_randomly`` into ``<des_path>/train/<class>/`` and
    ``<des_path>/test/<class>/``. When ``train_size + test_size`` is below 1,
    the remaining files go to ``<des_path>/valid/<class>/``.

    Args:
        src_path: Dataset folder whose entries must be exactly the class
            sub-folders of the selected cell group.
        des_path: Folder to create for the split; must not exist yet.
        train_size: Fraction of each class assigned to the training set.
        test_size: Fraction of each class assigned to the testing set.

    Returns:
        None. Problems with the arguments are reported with a printed message
        ('Invalid sizes!', 'Invalid source path!', 'Destination path exists!')
        and nothing is created in that case.
    """
    # Tolerance for float sums such as 0.7 + 0.3 == 0.9999999999999999, which
    # must count as "exactly 1" rather than as a tiny validation share.
    eps = 1e-9
    total = train_size + test_size

    #Check ratios
    if train_size < 0 or test_size < 0 or total > 1 + eps:
        print('Invalid sizes!')
        return

    classes = class_name[cell_group]
    # A missing source folder is reported the same way as a wrong layout
    # instead of raising from os.listdir.
    if not os.path.isdir(src_path) or set(os.listdir(src_path)) != set(classes):
        print('Invalid source path!')
        return

    #Check dest path exists
    if os.path.exists(des_path):
        print('Destination path exists!')
        return

    #Create dest folder with train & test sub-folder
    train_path = os.path.join(des_path, 'train')
    os.makedirs(train_path)
    test_path = os.path.join(des_path, 'test')
    os.makedirs(test_path)

    #Create valid folder only when the ratios leave a remainder.
    # valid_path is always bound so the per-class loop can test it safely.
    valid_path = None
    if total < 1 - eps:
        valid_path = os.path.join(des_path, 'valid')
        os.makedirs(valid_path)

    for classname in classes:
        #Get all regular files (sub-directories are not dataset items)
        files = [
            f for f in glob.glob(os.path.join(src_path, classname, '*'))
            if os.path.isfile(f)
        ]
        total_files = len(files)

        # Rounding each share independently can over- or under-shoot the
        # number of files, so the test share is bounded by what is left after
        # the train share is taken.
        len_train = min(round(total_files * train_size), total_files)
        remaining = total_files - len_train
        if valid_path is None:
            # No validation set: test receives every leftover file so that
            # none is silently dropped.
            len_test = remaining
        else:
            len_test = min(round(total_files * test_size), remaining)

        #Add files to train & test folder; each call removes what it took
        move_files_randomly(files, len_train, train_path)
        move_files_randomly(files, len_test, test_path)
        if valid_path is not None:
            # Whatever is still in the list belongs to the validation set.
            move_files_randomly(files, len(files), valid_path)

In [5]:
train_test_split(src_path, des_path, train_size, test_size)