In [5]:
'''
### Credits
This preprocessing and normalization outline is adapted from the [TensorFlow image loading tutorial](https://www.tensorflow.org/tutorials/load_data/images).
'''

# First we begin with setting up the environment for image loading and preprocessing.
!pip install tensorflow
!pip install tensorflow_datasets

import numpy as np
import os
import PIL
import PIL.Image
import tensorflow as tf
import tensorflow_datasets as tfds

# Check version to confirm importing succeeded.
print(tf.__version__)

2.18.0


In [None]:
# For simplicity this cell assumes the image data is already available:
# `data_dir` must point at the image directory before the cell is run.
# Until the data is loaded properly, the dataset calls below will not work.

# Preprocessing parameters. A batch size of 32 and an image size of 256x256
# match the current dataset for the melanoma project.
batch_size = 32
img_height = 256  # target height used when resizing images
img_width = 256  # target width used when resizing images

# Arguments shared by the training and validation subsets. They are kept in a
# single place because both calls must use the same `validation_split` and
# `seed`; otherwise the two subsets would not form a consistent, disjoint split.
# The 80% training / 20% validation split is standard practice and may change
# in the future depending on the project and its goals.
_split_kwargs = dict(
    validation_split=0.2,
    seed=123,
    image_size=(img_height, img_width),  # images are resized to this size
    batch_size=batch_size,
)

# Training subset (80% of the images).
train_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    subset="training",
    **_split_kwargs,
)

# Validation subset (the remaining 20% of the images).
val_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    subset="validation",
    **_split_kwargs,
)

# Show the class names that were picked up for the dataset.
class_names = train_ds.class_names
print(class_names)

In [None]:
# Standardizing the data is another important preprocessing step.
# RGB channel values lie in the range [0, 255], which is not ideal for neural
# networks, so they are rescaled to the range [0, 1].
normalization_layer = tf.keras.layers.Rescaling(scale=1. / 255)

# According to the TensorFlow website there are two ways to apply this layer.
# The next cell covers the two options.

In [None]:
# Option 1: apply the layer to the dataset through `Dataset.map`.
def _rescale_images(images, labels):
    """Apply `normalization_layer` to the images and pass the labels through unchanged."""
    return normalization_layer(images), labels


normalized_ds = train_ds.map(_rescale_images)

# Pull a single batch and inspect its first image.
image_batch, labels_batch = next(iter(normalized_ds))
first_image = image_batch[0]

# The pixel values are now in `[0,1]`.
print(np.min(first_image), np.max(first_image))

# Option 2: include the layer inside the model definition, which simplifies deployment.
# This option might be the one our project keeps, for deployment purposes.