# Preprocessing set for training and testing

This notebook enables you to preprocess the training and testing data in order to train a Deep-Learning model.

This notebook requires a cube file in HDF5 (`.h5`) format.

The resulting output makes it possible to define train, validation and test sets while keeping the same class distribution in each of them, which is important for the less frequent classes.

In [1]:
import numpy as np
import h5py
from goodforest_lib.config.constants import RANDOM_SEED

# Seed NumPy's global random generator so that the np.random.shuffle calls in this notebook are reproducible.
np.random.seed(RANDOM_SEED)

In [4]:
# Directory containing the input file; the train/validation/test files are written to the same directory.
path = "../../data/raw/BC/diverse/"
# Name of the HDF5 file to read; its "cubes" dataset is loaded in the next cell.
filename = "cubes_4-sick-classes_diverse_IB_filtered.h5"

In [5]:
# Read the whole "cubes" dataset into memory, then report its shape.
input_file = path+filename
with h5py.File(input_file, 'r') as cube_file:
    data = cube_file['cubes'][:]
print(data.shape)

(6254, 25, 256, 256)


In [7]:
# Build one dictionary per class label: classes[label][cube index] = number of
# elements of the cube's last channel that are equal to that label.
NB_CLASSES = 6
classes = {class_id: {} for class_id in range(NB_CLASSES)}

for cube_idx, cube in enumerate(data):
    # The values found in the last channel are used as class labels.
    label_values, label_counts = np.unique(cube[-1], return_counts=True)
    for label, count in zip(label_values, label_counts):
        classes[label][cube_idx] = count

In [8]:
# Cubes containing class 0 and/or class 1 ...
candidate_cubes = set(classes[0]) | set(classes[1])
# ... from which every cube that also contains one of the classes 2..NB_CLASSES-1 is removed.
for other_class in range(2, NB_CLASSES):
    candidate_cubes = candidate_cubes - set(classes[other_class])
only_1_and_0_cubes = list(candidate_cubes)
print(f"Percentage of cubes with only 0 and 1 classes: {len(only_1_and_0_cubes)/data.shape[0]*100:.2f}%")

Percentage of cubes with only 0 and 1 classes: 63.16%


In [9]:
# Lists of cube indices for each split, filled by the cells below.
train_set = []
validation_set = []
test_set = []

In [10]:
# Stratified split of the cubes that contain at least one class in [2, NB_CLASSES).
# Classes are visited in a shuffled order; a cube is assigned only once, under the
# first visited class it contains, and the cubes of each class are split according
# to `proportion_per_set`.

# Start from empty lists so that re-running this cell does not append the same
# indices a second time (the lists used to be created only in the previous cell).
train_set, validation_set, test_set = [], [], []

chosen_cubes = set()
custom_range = list(range(2, NB_CLASSES))

# Re-seed so the class visiting order does not depend on leftover RNG state when
# this cell is re-run. On a fresh top-to-bottom run no random draw happens between
# the seed of the first cell and this shuffle, so the order is unchanged.
np.random.seed(RANDOM_SEED)
np.random.shuffle(custom_range)

proportion_per_set = [0.8, 0.1, 0.1]  # train, validation, test

for class_label in custom_range:
    # Cubes containing this class that were not already assigned via another class.
    cubes_indices = list(set(classes[class_label].keys()) - chosen_cubes)
    chosen_cubes.update(cubes_indices)
    if len(cubes_indices) < 3:
        # NOTE(review): `(class_label-4)//2` looks like a leftover from another
        # labelling scheme; confirm what this message is supposed to report.
        print(f"Class {class_label} corresponding to {(class_label-4)//2} has {len(cubes_indices)} cubes")
    # The global RNG is re-seeded before every per-class shuffle.
    np.random.seed(RANDOM_SEED)
    np.random.shuffle(cubes_indices)

    # Slice boundaries of the train / validation / test parts.
    train_end = int(len(cubes_indices)*proportion_per_set[0])
    validation_end = int(len(cubes_indices)*(proportion_per_set[0]+proportion_per_set[1]))

    # The train part goes through a set (as before), so it is appended in set
    # iteration order rather than in shuffled order; train_set is shuffled later.
    temp_train_set = set(cubes_indices[:train_end])
    temp_validation_set = cubes_indices[train_end:validation_end]
    temp_test_set = cubes_indices[validation_end:]
    if len(temp_train_set) == 0 or len(temp_validation_set) == 0 or len(temp_test_set) == 0:
        # Too few cubes for the proportional split: the last cube goes to test, the
        # one before to validation and the rest to train.
        # NOTE(review): with one or two cubes the train part stays empty - confirm
        # this is intended.
        temp_train_set = cubes_indices[:-2]
        temp_validation_set = cubes_indices[-2:-1]
        temp_test_set = cubes_indices[-1:]
    train_set.extend(temp_train_set)
    validation_set.extend(temp_validation_set)
    test_set.extend(temp_test_set)

In [11]:
# Suffle the sets and save them
np.random.shuffle(train_set)
np.random.shuffle(validation_set)
np.random.shuffle(test_set)

# Get all the cubes infos
train_cubes_set = data[train_set]
validation_cubes_set = data[validation_set]
test_cubes_set = data[test_set]

In [12]:
# Sanity check: sizes of the splits, sizes without duplicates, and size of their union.
index_splits = (train_set, validation_set, test_set)
print(*(len(split) for split in index_splits))
print(*(len(set(split)) for split in index_splits))
print(len(set().union(*index_splits)))

1842 229 233
1842 229 233
2304


In [13]:
# Distribute the cubes that were not picked by the stratified split, using the
# same proportions, and build the final array of cubes for each split.
remaining_cubes = set(range(data.shape[0])) - set(train_set) - set(validation_set) - set(test_set)
remaining_cubes = list(remaining_cubes)
np.random.seed(RANDOM_SEED)
np.random.shuffle(remaining_cubes)

# Slice boundaries of the train / validation / test parts of the remaining cubes.
remaining_train_end = int(len(remaining_cubes)*proportion_per_set[0])
remaining_validation_end = int(len(remaining_cubes)*(proportion_per_set[0]+proportion_per_set[1]))

# Build each split from its index lists instead of concatenating onto the existing
# *_cubes_set arrays: the result is the same on a first run, but re-running this
# cell no longer appends the remaining cubes a second time, and no temporary copy
# of the remaining cubes is needed.
train_cubes_set = data[list(train_set) + remaining_cubes[:remaining_train_end]]
validation_cubes_set = data[list(validation_set) + remaining_cubes[remaining_train_end:remaining_validation_end]]
test_cubes_set = data[list(test_set) + remaining_cubes[remaining_validation_end:]]

# Every cube must end up in exactly one split.
assert train_cubes_set.shape[0] + validation_cubes_set.shape[0] + test_cubes_set.shape[0] == data.shape[0], \
    "the three splits do not cover each cube exactly once"

In [14]:
# Number of cubes in the train, validation and test splits.
print(*(cubes.shape[0] for cubes in (train_cubes_set, validation_cubes_set, test_cubes_set)))

5002 624 628


In [15]:
# Full shapes of the train, validation and test arrays.
tuple(cubes.shape for cubes in (train_cubes_set, validation_cubes_set, test_cubes_set))

((5002, 25, 256, 256), (624, 25, 256, 256), (628, 25, 256, 256))

In [16]:
# Write each split to its own HDF5 file in `path`, under the dataset name "cubes".
# NOTE(review): the file names are built as split name + suffix with no separator
# (e.g. "train4-sick-classes_IB_set.h5") - confirm this is what the consumers expect.
suffix = "4-sick-classes_IB_set.h5"

cubes_per_split = {
    "train": train_cubes_set,
    "validation": validation_cubes_set,
    "test": test_cubes_set,
}
for split_name, split_cubes in cubes_per_split.items():
    with h5py.File(path+split_name+suffix, 'w') as hf:
        hf.create_dataset("cubes", data=split_cubes)