# Transfer Learning Exercises

In [None]:
# Import useful libraries        (note: don't forget to turn on GPU)

# tensorflow for network building/training
import tensorflow as tf
from tensorflow.python.keras import Model, Sequential
from tensorflow.keras.applications import VGG16

# Basic operating system (os), numerical, and plotting functionality
import os
import time
import urllib.request
import numpy as np
from matplotlib import pylab as plt

# scikit-learn data utilities
from sklearn.model_selection import train_test_split
from skimage import transform

# scikit-learn performance metric utilities
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Color transformations
from skimage.color import rgb2lab

#Skimage resizing 
from skimage.transform import resize

# Garbage collection (for saving RAM during training)
import gc

## VGG16 Model

For this exercise you'll now use the VGG16 model as the feature extractor. https://www.tensorflow.org/api_docs/python/tf/keras/applications/VGG16

Specifications:
- Default input size: 224x224, no smaller than 32x32 pixels
- Default output classes: 1000

Our images are 150x150 pixels in size and come from only **eight categories**. In order to use this model for our classification task, we again can/need to do the following:
* Resize images : Our input images can be resized to the appropriate dimensions. Alternatively, we can pad our images to the expected dimensions. Padding leads to additional choices - Do we pad with zeros, duplicate edge pixels or mirror the image across edges?
* Change the prediction layer : Remove the existing prediction layer and add a new layer that can predict **8 classes**.
* Train : Finally, we need to train the network on our data

## Load Data

Getting path and changing directories

In [None]:
# Define the current directory and the remote location where the files to
# download can be found
current_dir = os.getcwd()
remote_path = 'https://github.com/BeaverWorksMedlytics2020/Data_Public/raw/master/NotebookExampleData/Week3/data_nuclei/crc/'

# Define and build a directory to save this data in
# (exist_ok=True makes this cell safe to re-run)
data_dir = os.path.join(current_dir, 'crc_data')
os.makedirs(data_dir, exist_ok=True)

# Download each archive straight into the data directory. The working
# directory is never changed, so a failed download cannot leave the kernel
# stranded inside `data_dir`.
for ii in range(1, 6):
    basename = f'rgb0{ii}.npz'

    # URLs always use '/' as separator, so build them by string concatenation
    # rather than with os.path.join (which is platform dependent)
    filename = remote_path.rstrip('/') + '/' + basename
    local_path = os.path.join(data_dir, basename)

    # Check if the file has already been downloaded
    if not os.path.isfile(local_path):
        print(f'downloading {filename}')

        # Download to a temporary name and rename once complete, so that an
        # interrupted download is never mistaken for a finished file.
        # Any network error is raised here instead of being silently ignored.
        partial_path = local_path + '.part'
        urllib.request.urlretrieve(filename, partial_path)
        os.replace(partial_path, local_path)

Function for loading images

In [None]:
# Define a function to load the data from the assumed download path
def load_images(colorspace='rgb'):
    """
    Loads the example data and applies transformation into requested colorspace

    The five archives `rgb01.npz` ... `rgb05.npz` are read from the module-level
    `data_dir` directory and concatenated along the first (image) axis.

    Arguments
    ---------
    colorspace : str, optional, default: `rgb`
        The colorspace into which the images should be transformed
        (case-insensitive). Accepted values include

        'rgb' : Standard red-green-blue color-space for digital images

        'gray' or 'grey': An arithmetic average of the (r, g, b) values

        'lab': The CIE L*a*b* colorspace

    Returns
    -------
    images : numpy.ndarray, shape (Nimg, Ny, Nx, Ncolor)
        The complete set of transformed images, as 64-bit floats. For the
        'gray'/'grey' colorspace the color axis is averaged away, giving
        shape (Nimg, Ny, Nx).

    labels : numpy.ndarray, shape (Nimg)
        The classification labels associated with each entry in `images`

    label_to_str : dict
        A dictionary which converts the numerical classification value in
        `labels` into its string equivalent representation. It is taken from
        the first archive only.

    Raises
    ------
    ValueError
        If `colorspace` is not one of the accepted values.
    """
    # Check that the colorspace argument is recognized
    colorspace_lower = colorspace.lower()
    if colorspace_lower not in ['rgb', 'gray', 'grey', 'lab']:
        raise ValueError(f'`colorspace` value of {colorspace} not recognized')

    # Collect the per-archive arrays and join them once at the end; this avoids
    # re-copying the whole growing array for every archive.
    image_chunks = []
    label_chunks = []
    label_to_str = None

    for ii in range(1, 6):
        # Build the full path to the archive and load it into memory
        filename = os.path.join(data_dir, f'rgb0{ii}.npz')
        print(f'loading {filename}')

        # NOTE(security): allow_pickle=True can execute arbitrary code when
        # loading a malicious file -- only use it on archives you trust. It is
        # kept because `label_str` is converted to a dict below, which
        # presumably means it is stored as a pickled Python object.
        # The `with` block guarantees that the archive file is closed.
        with np.load(filename, allow_pickle=True) as tmp:
            these_images = tmp['rgb_data']
            these_labels = tmp['labels']

            # Grab the dictionary to convert numerical labels to their string
            # equivalent (from the first archive only)
            if label_to_str is None:
                label_to_str = tmp['label_str'].tolist()

        # Transform into the requested colorspace ('rgb' needs no work)
        if colorspace_lower in ['gray', 'grey']:
            these_images = np.mean(these_images, axis=-1)  # Average into grayscale
        elif colorspace_lower == 'lab':
            these_images = rgb2lab(these_images)           # Convert to CIE L*a*b*

        image_chunks.append(these_images)
        label_chunks.append(these_labels)

        # Update the user on the number and size of images loaded
        print('Loaded images with shape {}'.format(these_images.shape))

    # Join all archives along the image axis, then release the pieces before
    # making the floating point copy (keeps peak memory down)
    images = np.concatenate(image_chunks, axis=0)
    labels = np.concatenate(label_chunks, axis=0)
    del image_chunks, label_chunks, these_images

    # Force the image data to be floating point and print the data shape
    # (the `np.float` alias was removed in NumPy 1.24; use np.float64)
    images = images.astype(np.float64)
    print('Final image data shape: {}'.format(images.shape))
    print('Number of image labels: {}'.format(*labels.shape))

    return images, labels, label_to_str

Load images and labels into memory

In [None]:
# Read every archive into memory using the (default) RGB colorspace
images_full_res, labels, label_to_str = load_images(colorspace='rgb')

# Count how many distinct class labels appear in the data
num_classes = len(np.unique(labels))

## Pre-process the Images

***Note: you'll have to edit a line of code in the cell for resizing***

Resizing

In [None]:
# Resize every image in `images_full_res` to a new (rows, columns) size and store
# the result in `images`.
#
# NOTE: this cell deletes `images_full_res` at the end, so to run it a second
# time you must first re-run the cell that loads the images.

# This boolean can be switched to false if you do not want to resize the images
resize_images_bool = True

# Specify a new shape to use for the resized images
# NOTE: For the VGG16 model, we must use a size of at least (32, 32).
# `new_shape` starts out as a copy of the full-resolution array shape; only the
# entries at indices 1 and 2 (the two image axes) are replaced below, so the
# placeholder must be filled in with a two-element sequence.
original_shape = images_full_res.shape
new_shape = list(original_shape)
new_shape[1:3] = ## YOUR CODE HERE

# Compute if we are downsampling (in which case we need anti-aliasing)
# A ratio below 1 along either image axis means that axis is being shrunk.
scaling_ratio = np.array(new_shape[1:3])/np.array(original_shape[1:3])
anti_alias = np.any(scaling_ratio < 1)

# If resizing is requested, then run the resizing transformation
if resize_images_bool:
    # Grab the number of images (not used again in this cell)
    num_images = images_full_res.shape[0]

    # Initialize an array for storing the resized images
    images = np.zeros(new_shape, dtype=np.float16)

    # Loop over each image in the data and perform a resizing operation
    for img_num, img_data in enumerate(images_full_res):
        # Update the user on progress (once every 1000 images)
        if np.mod(img_num, 1000) == 0:
            print(f'Processing image number {img_num}')

        # Process the image and force it to be a 16-bit float
        # `new_shape[1:]` is the per-image output shape (everything after the
        # leading image-count axis).
        processed_img = transform.resize(img_data, new_shape[1:],
                                         anti_aliasing=anti_alias)
        images[img_num] = processed_img.astype(np.float16)

# If no resizing requested, then just rename that data
else:
    images = images_full_res

# Remove the full-resolution versions from memory (just clogging things up)
del images_full_res

Normalize the images (if it hasn't been done already)

In [None]:
# Store the image data as 16-bit floats (to save RAM) ...
images = images.astype(np.float16)

# ... and divide every value by 255
images /= 255.0

Include an axis for color channels

In [None]:
# Make sure `images` carries an explicit trailing axis for the color channels,
# and record the number of channels in `n_channels`.
if images.ndim != 3:
    # The last dimension of the array already corresponds to the channel
    n_channels = images.shape[-1]
else:
    # Only three dimensions (as is the case with grayscale images): append a
    # last axis of length 1 to act as the channel axis
    n_channels = 1
    images = np.expand_dims(images, axis=-1)
    print('\nlast dimension added to images ndarray to account for channel')
    print(f'new images.shape: {images.shape}')

Split data into train and test sets

In [None]:
# Seed for the random train/test partition. Fixing it means that every
# "Restart & Run All" produces the same split, so accuracies can be compared
# between runs. Change the value to draw a different split.
SPLIT_SEED = 42

# Split data into train (80%) and test (20%) sets
train_images, test_images, train_labels, test_labels = train_test_split(
    images, labels, test_size=.2, random_state=SPLIT_SEED)

# Convert 'labels' (1D array of integers) to one-hot encoding
train_labels = tf.keras.utils.to_categorical(train_labels)
test_labels = tf.keras.utils.to_categorical(test_labels)

# Print sizes of train/test sets
print(f'train_images.shape: {train_images.shape}')
print(f'train_labels.shape: {train_labels.shape}')
print(f'test_images.shape: {test_images.shape}')
print(f'test_labels.shape: {test_labels.shape}')

# Print the one-hot encoded labels as a sanity check
print('one-hot encoded labels:')
print(train_labels)

# Get rid of the duplicate copies of the data
# (after this, re-running the cell requires re-running the preprocessing cells)
del images, labels

## Load Pre-trained VGG16 Model

Here's the link to the documentation again (https://www.tensorflow.org/api_docs/python/tf/keras/applications/VGG16); also refer to the tutorial notebook.

In [None]:
# Create the base pre-trained model
# Exercise: replace the placeholder with a call that builds the VGG16 network
# (`VGG16` is imported from tensorflow.keras.applications at the top of the
# notebook). The result must be bound to `base_model`, which later cells use.
print('loading VGG16')
base_model = ## YOUR CODE HERE
print('done')

Summarize model structure

In [None]:
# Print the layer-by-layer summary of the pre-trained base model
base_model.summary()

Freezing layers

In [None]:
# Play around with freezing layers, take a look at the tutorial notebook for reference 

# By default we'll just freeze the entire base model again
# (setting `trainable` to False on the whole model marks it as non-trainable;
# this must happen before the model is compiled in a later cell)
base_model.trainable = False

Modify the pre-trained network by adding a few new layers at the output, including a classification layer (remember we want to predict 8 different classes)

In [None]:
# Exercise: stack new layers on top of `base_model` and bind the finished
# network to `model`, which the compile/training cells below rely on.

# Add a global spatial average pooling layer
## YOUR CODE HERE

# Add a fully-connected layer
## YOUR CODE HERE

# Add the final classification layer
# (it has to predict one of the 8 classes; `num_classes` was computed from the
# labels when the data was loaded)
## YOUR CODE HERE

# Build the model you will train
model = ## YOUR CODE HERE

# Print summary of model layers
model.summary()

Compiling model

In [None]:
# Compile the model (should be done *after* setting layers to non-trainable)
    # optimizer: rmsprop
    # loss: categorical crossentropy
    # metrics: accuracy
  
## YOUR CODE HERE

## Train model

Train the model on the new, histological, data

In [None]:
# Hand the training and validation ('test') arrays over to TensorFlow as
# constant tensors before training starts. The intent is to keep the training
# routine from making its own *copy* of the numpy arrays (which would use a lot
# of RAM) and to skip a data-conversion step during training.
#
# Each numpy array is deleted as soon as its tensor counterpart exists.

# Training data: explicitly stored as 16-bit floats
train_images_tf = tf.constant(train_images, dtype=tf.float16)
train_labels_tf = tf.constant(train_labels, dtype=tf.float16)
del train_images, train_labels

# Validation data: the dtype is inferred from the numpy arrays
test_images_tf = tf.constant(test_images)
test_labels_tf = tf.constant(test_labels)
del test_images, test_labels

Train model

In [None]:
# Keras invokes `on_epoch_end` once every epoch has finished; here it is used
# to run the garbage collector so that the training process does not consume
# all available RAM.
class garbage_collect_callback(tf.keras.callbacks.Callback):
  """Callback that runs Python's garbage collector at the end of each epoch."""
  def on_epoch_end(self, epoch, logs=None):
    gc.collect()

# Record the wall-clock time just before training starts
start_time = time.time()

# Settings for the training run
fit_options = dict(
    batch_size=64,
    epochs=50,
    verbose=1,
    validation_data=(test_images_tf, test_labels_tf),
    callbacks=[garbage_collect_callback()],
)

# Train the model; `history` holds the per-epoch metrics used for plotting
history = model.fit(train_images_tf, train_labels_tf, **fit_options)

# Report how long the training took
stop_time = time.time()
elapsed = stop_time - start_time
print("--- %s seconds ---" % elapsed)

Plot model train/validation accuracy and model train/validation loss

In [None]:
# Show which metrics were recorded during training
print(history.history.keys())

# Draw one figure per metric, each with a training and a validation curve.
# Entries are (training key, validation key, figure title); the training key
# doubles as the y-axis label.
curves = [
    ('accuracy', 'val_accuracy', 'model accuracy'),
    ('loss', 'val_loss', 'model loss'),
]
for train_key, val_key, title in curves:
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(title)
    plt.ylabel(train_key)
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()

## Make Predictions for Test Images

In [None]:
# Run the trained model on every test image
predictions = model.predict(test_images_tf, verbose=True)

# Reduce the one-hot true labels and the model outputs to category numbers by
# taking the index of the largest entry in each row
test_true_labels = np.argmax(test_labels_tf.numpy(), axis=1)
test_pred_labels = np.argmax(predictions, axis=1)

In [None]:
# Plot a set of test images, along with predicted labels and true labels
first_shown = 100   # index of the first test image to display
num_shown = 16      # number of images to display (fills the 4x4 grid)

plt.figure(figsize=(16,20))
for ii in range(num_shown):
    img_idx = first_shown + ii

    # Activate subplot and display image
    # The tensor is cast to a 64-bit float for display. (`np.float` was removed
    # in NumPy 1.24, so the explicit `np.float64` is used instead.)
    plt.subplot(4,4,ii+1)
    plt.imshow(test_images_tf[img_idx,:,:,:].numpy().astype(np.float64))

    # Turn off axes
    plt.axis('off')

    # Add annotation
    plt.title('expected : ' + label_to_str[test_true_labels[img_idx]]
              + '\npredicted : ' + label_to_str[test_pred_labels[img_idx]])
plt.show()

## Accuracy

In [None]:
# Score the predicted category numbers against the true ones
acc = accuracy_score(test_true_labels, test_pred_labels)
# Display the score as a percentage with two decimal places
print(f'Model Accuracy: {acc:.2%}')

Confusion matrix

In [None]:
# Entry [i, j] of the confusion matrix counts the test images whose true class
# is i and whose predicted class is j
conf_mat = confusion_matrix(test_true_labels, test_pred_labels)

# Generate a new figure
plt.figure(figsize=(10,10))

# Display the confusion matrix
plt.imshow(conf_mat, cmap='hot', interpolation='nearest')

# Add some annotation for the plot
# imshow draws matrix rows along the y-axis and columns along the x-axis, so
# the true label is on the y-axis and the predicted label on the x-axis
plt.colorbar()
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion matrix')
plt.show()

### To-do:

Continue playing around with preprocessing (image size) and the model (added layers, freezing layers, optimizer, # epochs) and see their effects on the accuracy. Doing this may help you for the challenge problem :O