# Preparation of dataset

## Import Modules

In [51]:
import os
import tensorflow as tf
import glob
from tqdm import tqdm
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import random
from scipy.ndimage import gaussian_filter, map_coordinates
import numpy as np
import math
from scipy.signal import convolve2d
import scipy.misc

## Format label image size

In [38]:
# Normalise label image size: every 700x700 PNG in `original_dir` is cropped
# to the box (35, 35, 665, 665); images of any other size are copied as-is.
# Results are written to `save_dir` under the same base name.
original_dir = '../datasets/label_images_original'
save_dir = '../datasets/label_images_formatted'
if not os.path.exists(save_dir):
    os.mkdir(save_dir)

crop_box = (35, 35, 665, 665)
for label_image_path in glob.glob(original_dir + '/*.png'):
    # Derive the output path from the input file's base name.
    image_name, _ = os.path.splitext(os.path.basename(label_image_path))
    save_path = os.path.join(save_dir, '{}.png'.format(image_name))

    label_image = Image.open(label_image_path)
    needs_crop = label_image.size == (700, 700)
    if needs_crop:
        label_image = label_image.crop(crop_box)

    # NOTE(review): `quality` is normally a JPEG option; it probably has no
    # effect on PNG output - confirm before relying on it.
    label_image.save(save_path, quality=95)

## Convert label images into ndarray data
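- Outside the area of interest -> 0
- Normal region (green pixels) -> 1
- Tumor region (blue pixels) -> 2
- Region of xxx (pixels with both blue and green) -> 3
- Opaque pixels with zero green and blue channels (black) -> -1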

In [56]:
# Convert every label image in `label_dir` into a 2-D array of class indices
# and save it as `<image_name>.npy` in `save_dir`.
#
# Pixel colour -> class index (as implemented below):
#   green == 0, blue == 0, alpha == 255    -> -1
#   red == 0, green != 0, blue == 0        ->  1
#   red == 0, green == 0, blue != 0        ->  2
#   red == 0, green != 0, blue != 0        ->  3
#   anything else                          ->  0
num_split = 5  # NOTE(review): not used in this cell; kept in case another cell reads it
image_dir = '../datasets/pathologocal_images_JPEG'  # not used in this cell
# NOTE(review): the size-formatting cell writes to
# '../datasets/label_images_formatted'; confirm this is the intended input.
label_dir = '../datasets/label_images_size_formatted'
save_dir = '../datasets/ndarray_labels'
# Creates missing parent directories too and tolerates an existing directory.
os.makedirs(save_dir, exist_ok=True)
label_image_path_list = glob.glob(label_dir+'/*')
for label_image_path in tqdm(label_image_path_list):
    label_image = Image.open(label_image_path)
    label_image_array = np.array(label_image)
    # Channel 3 is indexed below, so the image must have four channels
    # (presumably RGBA - confirm against the dataset).
    red_channel = label_image_array[:, :, 0]
    green_channel = label_image_array[:, :, 1]
    blue_channel = label_image_array[:, :, 2]
    alpha_channel = label_image_array[:, :, 3]

    no_red = red_channel == 0
    # The four masks are mutually exclusive, so assignment order is irrelevant.
    black = (green_channel == 0) & (blue_channel == 0) & (alpha_channel == 255)
    blue = no_red & (green_channel == 0) & (blue_channel != 0)
    green = no_red & (green_channel != 0) & (blue_channel == 0)
    blue_green = no_red & (green_channel != 0) & (blue_channel != 0)

    # Allocate from the array's (rows, cols) shape. PIL's `Image.size` is
    # (width, height), which does not match the masks for non-square images.
    new_label_array = np.zeros(label_image_array.shape[:2])
    new_label_array[black] = -1
    new_label_array[green] = 1
    new_label_array[blue] = 2
    new_label_array[blue_green] = 3

    image_name = os.path.splitext(os.path.basename(label_image_path))[0]
    save_path = os.path.join(save_dir, image_name+'.npy')
    # `save_dir` must stay constant across iterations: reassigning it inside
    # the loop sent every array after the first to the wrong directory.
    np.save(save_path, new_label_array)

100%|██████████| 2028/2028 [00:56<00:00, 35.95it/s]


## Divide samples into training/validation and test sets and write their names to text files

In [55]:
# Split the samples that have both an image and a label array into a
# train/validation set and a test set, then write one sample name per line
# to '../datasets/train_val.txt' and '../datasets/test.txt'.
seed = 0
image_dir = '../datasets/pathologocal_images_JPEG'
label_dir = '../datasets/ndarray_labels'
image_path_list = glob.glob(image_dir+'/*')
label_path_list = glob.glob(label_dir+'/*')
image_names = [os.path.splitext(os.path.basename(path))[0] for path in image_path_list]
label_names = [os.path.splitext(os.path.basename(path))[0] for path in label_path_list]
# Keep only names present on both sides. Sort before shuffling: the iteration
# order of a set of strings can differ between interpreter runs (hash
# randomization), so without a fixed starting order the seed below would not
# make the split reproducible.
label_names = sorted(set(label_names) & set(image_names))
random.seed(seed)
random.shuffle(label_names)
test_ratio = 0.1
num_samples = len(label_names)
print("num_samples -> {}".format(num_samples))
# Round the training share down; the remainder goes to the test set.
num_train_samples = math.floor(num_samples * (1 - test_ratio))
train_val_sample_names = label_names[:num_train_samples]
test_sample_names = label_names[num_train_samples:]
print("train_and_validation_sample_names -> {}".format(len(train_val_sample_names)))
print("test_sample_names -> {}".format(len(test_sample_names)))
# Despite the variable names, these are newline-separated lists of names.
train_val_sample_names_with_indentation = "\n".join(train_val_sample_names)
test_sample_names_with_indentation = "\n".join(test_sample_names)
train_val_sample_save_path = '../datasets/train_val.txt'
with open(train_val_sample_save_path, 'w') as f:
    f.write(train_val_sample_names_with_indentation)
test_sample_save_path = '../datasets/test.txt'
with open(test_sample_save_path, 'w') as f:
    f.write(test_sample_names_with_indentation)

num_samples -> 2024
train_and_validation_sample_names -> 1821
test_sample_names -> 203
