<a href="https://colab.research.google.com/github/zachmurphy1/facemask-faster-rcnn/blob/main/Train_Val_Test_Split.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Train-val-test split

This notebook performs a train-val-test split on the raw images and annotations folders.

## Input
Images and annotations folders, each holding its files at the top level with sequentially numbered filenames. Originally obtained from https://www.kaggle.com/andrewmvd/face-mask-detection.
```
facemask_data/images
facemask_data/annotations
```


## Output
Images and annotations folders for each set. Each set gets its own folder, and files are renumbered from 0 within each set.
```
Train:
facemask_data/train
facemask_data/train/images
facemask_data/train/annotations

Val:
facemask_data/val
facemask_data/val/images
facemask_data/val/annotations

Test:
facemask_data/test
facemask_data/test/images
facemask_data/test/annotations
```



In [None]:
# Imports
import numpy as np
import shutil, os

In [None]:
# Mount data directory
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
%cd /content/gdrive/My\ Drive/facemask-faster-rcnn/

DATADIR = 'facemask_data'
ANNDIR = DATADIR + '/annotations'
IMGDIR = DATADIR + '/images'
PREFIX = 'maksssksksss'

In [None]:
# Set total n, train/val/test split, and seed
n = 853
split_frac = {
    'train':0.6,
    'val':0.2,
    'test':0.2
    }
seed=0
# Compare with a tolerance: binary floats rarely sum to exactly 1
# (e.g. 0.7 + 0.2 + 0.1 == 0.9999999999999999), so `== 1` would reject valid splits.
assert np.isclose(sum(split_frac.values()), 1.0), 'split_frac components need to sum to 1'

# Randomly order indices with a private, seeded generator. This produces the same
# permutation as np.random.seed(seed) followed by np.random.shuffle, but leaves
# NumPy's global random state untouched.
rng = np.random.RandomState(seed)
to_split = np.arange(n)
rng.shuffle(to_split)

# Cut points between train|val and val|test; the test set takes the remainder
train_end = int(n*split_frac['train'])
val_end = int(n*(split_frac['train']+split_frac['val']))

# Split indices according to split fractions
SPLIT_IDX = {
    'train':list(to_split[:train_end]),
    'val':list(to_split[train_end:val_end]),
    'test':list(to_split[val_end:])
    }

# Every index must land in exactly one split
assert sum(len(part) for part in SPLIT_IDX.values()) == n, 'splits do not cover all n indices'

In [None]:
# Write each split into its own folder tree under DATADIR
for split_name in ('train', 'val', 'test'):
  print(split_name)

  # Raw-file indices assigned to this split
  members = SPLIT_IDX[split_name]

  # Target folders: <DATADIR>/<split>, plus its images/ and annotations/ subfolders
  split_root = f'{DATADIR}/{split_name}'
  img_out = f'{split_root}/images'
  ann_out = f'{split_root}/annotations'

  # Create the folders; already-existing ones are left in place
  for folder in (split_root, img_out, ann_out):
    os.makedirs(folder, exist_ok=True)

  # Copy each image/annotation pair, renaming it to its position within the split
  for new_id, raw_id in enumerate(members):
    stem = f'{PREFIX}{raw_id}'
    shutil.copy(f'{IMGDIR}/{stem}.png', f'{img_out}/{new_id}.png')
    shutil.copy(f'{ANNDIR}/{stem}.xml', f'{ann_out}/{new_id}.xml')