## Notebook Context
## Goal - Split the available annotated data into train, validation and test sets. The split is done per scan volume, so all slices of a volume end up in the same set.
## Data - Scan volumes from <a href="http://medicaldecathlon.com/">http://medicaldecathlon.com/</a>, with the slices of each volume extracted into individual directories: one directory per scan volume, containing all the slices of that volume.

In [1]:
import os
import shutil
import glob
import random

## Compute number of scan volumes for each dataset split

In [2]:
# Glob patterns for the scan entries and for their annotation counterparts.
# Per the notebook's data description, each match is expected to be one
# directory per scan volume.
scan_volume_pattern = "../data/train/images/liver*"
annotated_volume_pattern = "../data/train/labels/liver*"

# Resolve both patterns, images first and labels second
total_scan_volumes, total_annotated_volumes = (
    glob.glob(pattern)
    for pattern in (scan_volume_pattern, annotated_volume_pattern)
)

In [4]:
# Number of scan volumes to distribute over the three splits
available_volumes_count = len(total_scan_volumes)

# Target share of the volumes for each split
train_volumes_percentage = 0.70
validation_volumes_percentage = 0.15
test_volumes_percentage = 0.15

# The shares have to cover the whole dataset (tolerance absorbs float rounding)
assert abs(
    train_volumes_percentage
    + validation_volumes_percentage
    + test_volumes_percentage
    - 1.0
) < 1e-9, "split percentages must sum to 1"

# Train is truncated and validation is rounded, as before
train_volumes_count = int(available_volumes_count * train_volumes_percentage)

validation_volumes_count = int(round(available_volumes_count * validation_volumes_percentage))
# Rounding up must never claim more volumes than are left after the train split
validation_volumes_count = min(
    validation_volumes_count, available_volumes_count - train_volumes_count
)

# The test split takes whatever remains, so the three counts always add up to
# the available total. Rounding each split independently could over- or
# under-shoot (e.g. 130 volumes -> 91 + 20 + 20 = 131). As a consequence the
# effective test share may differ slightly from test_volumes_percentage.
test_volumes_count = available_volumes_count - train_volumes_count - validation_volumes_count

assert train_volumes_count + validation_volumes_count + test_volumes_count == available_volumes_count

print(f"Available volumes count - {available_volumes_count}")
print(f"Train volumes count - {train_volumes_count}")
print(f"Validation volumes count - {validation_volumes_count}")
print(f"Test volumes count - {test_volumes_count}")
print(train_volumes_count + validation_volumes_count + test_volumes_count)

Available volumes count - 131
Train volumes count - 91
Validation volumes count - 20
Test volumes count - 20
131


In [7]:
# Peek at the first matched path to confirm the layout of the volume paths
first_scan_volume = total_scan_volumes[0]
print(first_scan_volume)

../data/train/images/liver_78


## Randomly assign the available scan volumes to each split, using the number of volumes computed for that split

In [8]:
# Fixed seed so the same volumes land in the same split on every run
SPLIT_SEED = 42


def _annotation_path(volume_path):
    """Return the labels path that mirrors an images volume path.

    Only the parent directory is swapped for a sibling named "labels", e.g.
    ../data/train/images/liver_78 -> ../data/train/labels/liver_78.
    A plain str.replace("images", "labels") would also rewrite any other
    occurrence of "images" in the path.
    """
    images_directory, volume_name = os.path.split(volume_path)
    return os.path.join(os.path.dirname(images_directory), "labels", volume_name)


# Slicing would silently return short lists, so fail loudly like random.sample did
if train_volumes_count + validation_volumes_count > len(total_scan_volumes):
    raise ValueError("Requested split sizes exceed the number of available scan volumes")

# glob order is arbitrary and set ordering of strings changes between kernel
# sessions, so sort first and shuffle with a dedicated seeded generator. This
# keeps the split reproducible without touching the global random state.
shuffled_volumes = sorted(total_scan_volumes)
random.Random(SPLIT_SEED).shuffle(shuffled_volumes)

validation_start = train_volumes_count
test_start = train_volumes_count + validation_volumes_count

# Training scan volumes
train_samples = shuffled_volumes[:validation_start]
train_annotations = [_annotation_path(volume) for volume in train_samples]

# Validation scan volumes
available_validation_test_samples = shuffled_volumes[validation_start:]
validation_samples = shuffled_volumes[validation_start:test_start]
validation_annotations = [_annotation_path(volume) for volume in validation_samples]

# Testing scan volumes (everything that is neither train nor validation)
test_samples = shuffled_volumes[test_start:]
test_annotations = [_annotation_path(volume) for volume in test_samples]

# Every volume must belong to exactly one split
assert len(train_samples) + len(validation_samples) + len(test_samples) == len(total_scan_volumes)

## Copy the selected scan volumes to the corresponding directory of the split

In [34]:
def move_split_volumes(volume_directories, annotation_directories, split_name, data_root="../data"):
    """Move scan volume directories and their annotation directories into a split.

    Args:
        volume_directories: paths of the scan volume directories to move.
        annotation_directories: paths of the matching annotation directories,
            in the same order as ``volume_directories``.
        split_name: name of the split directory below ``data_root`` (e.g. "test").
        data_root: directory that contains the split directories.

    Raises:
        FileExistsError: if a destination already exists. shutil.move would
            otherwise nest the volume inside the existing directory.
    """
    images_destination_root = os.path.join(data_root, split_name, "images")
    labels_destination_root = os.path.join(data_root, split_name, "labels")

    # With the parents in place shutil.move can try a plain rename first,
    # instead of falling back to copy + delete because the parent is missing
    os.makedirs(images_destination_root, exist_ok=True)
    os.makedirs(labels_destination_root, exist_ok=True)

    for volume_directory, annotation_directory in zip(volume_directories, annotation_directories):
        # basename/normpath instead of split("/")[-1]: independent of the path
        # separator and of a trailing slash
        images_destination = os.path.join(
            images_destination_root, os.path.basename(os.path.normpath(volume_directory))
        )
        labels_destination = os.path.join(
            labels_destination_root, os.path.basename(os.path.normpath(annotation_directory))
        )

        # Check both destinations before moving anything so that the images and
        # labels of one volume never end up in different places
        for destination in (images_destination, labels_destination):
            if os.path.exists(destination):
                raise FileExistsError(f"Destination already exists: {destination}")

        shutil.move(volume_directory, images_destination)
        shutil.move(annotation_directory, labels_destination)


# NOTE(review): only the test split is moved here, as in the original cell.
# validation_samples stay under ../data/train; confirm the intended validation
# directory name and call move_split_volumes for that split as well.
move_split_volumes(test_samples, test_annotations, "test")
    