# SSC300: A fast model to count multiple classes

This notebook compares several variants of the Keras implementation of the SSC model. It can be run on Google Colab, which provides a free Tesla K80 GPU with 12 GB of memory.

## Mount Google Drive

In [0]:
# Mount the user's Google Drive under /content/gdrive so that files stored
# there (e.g. the weight files loaded further down) are reachable from Colab.
from google.colab import drive
drive.mount('/content/gdrive')

## Download of datasets and Packages decompression
This series of cells downloads and uncompresses the PASCAL VOC 2007 and 2012 datasets.

In [0]:
# Remove package directories left over from a previous extraction.
# `-f` makes the command succeed silently when they do not exist yet,
# e.g. on a fresh runtime.
!rm -rf object_counting object_classification utils

In [0]:
# Unpack the code archive into the working directory (presumably it provides
# the `object_counting` package imported below — verify).
# NOTE(review): no cell visible in this notebook downloads keras.tar.xz, so it
# has to be present in the working directory beforehand.
!tar -xf keras.tar.xz

In [0]:
# Start from an empty checkpoint directory.
# WARNING: this deletes every checkpoint saved by a previous run.
# `-f` avoids an error when the directory does not exist yet.
!rm -rf weights

In [0]:
# Directory where the training callbacks write checkpoints and the CSV log.
# `-p` makes the command a no-op when the directory already exists.
!mkdir -p weights

In [0]:
# Download the PASCAL VOC 2007 train/val archive.
# `-nc` (no-clobber) skips the download when the file is already present
# instead of fetching it again and saving a duplicate `.1` copy.
!wget -nc http://pjreddie.com/media/files/VOCtrainval_06-Nov-2007.tar

In [0]:
# Download the PASCAL VOC 2007 test archive.
# `-nc` (no-clobber) skips the download when the file is already present
# instead of fetching it again and saving a duplicate `.1` copy.
!wget -nc http://pjreddie.com/media/files/VOCtest_06-Nov-2007.tar

In [0]:
# Download the PASCAL VOC 2012 train/val archive.
# `-nc` (no-clobber) skips the download when the file is already present
# instead of fetching it again and saving a duplicate `.1` copy.
!wget -nc http://pjreddie.com/media/files/VOCtrainval_11-May-2012.tar

In [0]:
# Unpack the VOC 2007 test archive into the working directory. The dataset
# paths configured further down expect the data under /content/VOCdevkit/VOC2007.
!tar -xf VOCtest_06-Nov-2007.tar

In [0]:
# Unpack the VOC 2007 train/val archive into the working directory. The dataset
# paths configured further down expect the data under /content/VOCdevkit/VOC2007.
!tar -xf VOCtrainval_06-Nov-2007.tar

In [0]:
# Unpack the VOC 2012 train/val archive into the working directory. The dataset
# paths configured further down expect the data under /content/VOCdevkit/VOC2012.
!tar -xf VOCtrainval_11-May-2012.tar

## Models applied to PASCAL VOC dataset

From now on, I will show you how to use the code and instantiate the different models and test them on the PASCAL VOC dataset.

In each block, most of the operations are the same thanks to the flexibility of the classes, which follow the object-oriented paradigm and exploit inheritance and interface-style design.

### Preliminary imports
All the necessary packages are imported

In [0]:
from keras.optimizers import Adam, SGD
from keras.losses import poisson
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, TerminateOnNaN, CSVLogger
from keras.callbacks import TensorBoard
from keras import backend as K
from keras.models import load_model
from math import ceil
import h5py
import numpy as np
import shutil
import time
from object_counting.keras_ssc.misc_utils.tensor_sampling_utils import sample_tensors
from matplotlib import pyplot as plt

from object_counting.keras_ssc.models.keras_ssc300_4_merged import ssc_300
from object_counting.keras_ssc.keras_layers.keras_layer_AnchorBoxes import AnchorBoxes
from object_counting.keras_ssc.keras_layers.keras_layer_L2Normalization import L2Normalization

from object_counting.keras_ssc.ssc_encoder_decoder.ssc_input_encoder import SSCInputEncoder
from object_counting.keras_ssc.ssc_encoder_decoder.ssc_input_encoder_1pred import SSCInputEncoder1Pred


from object_counting.keras_ssc.data_generator.object_detection_2d_data_generator import DataGenerator
from object_counting.keras_ssc.data_generator.object_detection_2d_geometric_ops import Resize
from object_counting.keras_ssc.data_generator.object_detection_2d_photometric_ops import ConvertTo3Channels
from object_counting.keras_ssc.data_generator.data_augmentation_chain_original_ssd import SSDDataAugmentation
from object_counting.keras_ssc.data_generator.object_detection_2d_misc_utils import apply_inverse_transforms

%matplotlib inline

### Setting variables

For more information about the variables, check the documentation of the package. If it has not been built, you can use Sphinx to generate the HTML docs from the docstrings in the code.

In [0]:
# --- Input images -----------------------------------------------------------
img_height = 300 # Height of the model input images
img_width = 300 # Width of the model input images
img_channels = 3 # Number of color channels of the model input images
mean_color = [123, 117, 104] # The per-channel mean of the images in the dataset. Do not change this value if you're using any of the pre-trained weights.
swap_channels = [2, 1, 0] # The color channel order in the original SSD is BGR, so we'll have the model reverse the color channel order of the input images.
n_classes = 20 # Number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO

# --- Predictor layers and anchor boxes --------------------------------------
# The original SSD300 configuration has six predictor layers. This notebook
# only uses the four layers listed in `predictors`, so each per-layer setting
# below is the original SSD300 value with the entries of the first two layers
# removed. For reference, the original six-layer values are:
#   scales (Pascal VOC): [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05]
#   aspect ratios:       3, 5, 5, 5, 3, 3 ratios per layer
#   steps:               [8, 16, 32, 64, 100, 300]
#   offsets:             [0.5] * 6
predictors = ['conv6_2', 'conv7_2', 'conv8_2', 'conv9_2']

# Anchor box scaling factors: one per predictor layer plus one trailing value.
scales_pascal = [0.37, 0.54, 0.71, 0.88, 1.05]
# Scaling factors of the original six-layer SSD300 for MS COCO. They are not
# reduced to the four predictor layers used here and are not selected below.
scales_coco = [0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05]
scales = scales_pascal

# Anchor box aspect ratios, one list per predictor layer; the order matters.
aspect_ratios = [[1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                 [1.0, 2.0, 0.5],
                 [1.0, 2.0, 0.5]]
two_boxes_for_ar1 = True
steps = [32, 64, 100, 300] # The space between two adjacent anchor box center points for each predictor layer.
offsets = [0.5, 0.5, 0.5, 0.5] # The offsets of the first anchor box center points from the top and left borders of the image as a fraction of the step size for each predictor layer.
clip_boxes = False # Whether or not to clip the anchor boxes to lie entirely within the image boundaries
normalize_coords = True

# NOTE(review): `hidden_sizes` is not referenced by any cell visible in this
# notebook — confirm whether it is still needed.
hidden_sizes = [250, 250, 100]

## MODEL

### Implement the Model
After having defined all the parameters, we instantiate the model, instantiate an optimizer, maybe load weights and compile the model.

In [0]:
K.clear_session() # Clear previous models from memory.

# 1: Instantiate an optimizer and build the model.
#    Adam is the optimizer used in this notebook. The commented-out SGD line is
#    the alternative if you want to follow the original Caffe implementation.

adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0, clipvalue=1.)
#sgd = SGD(lr=0.001, momentum=0.9, decay=0.0, nesterov=False)

model = ssc_300(image_size=(img_height, img_width, img_channels),
                n_classes=n_classes,
                mode='training',
                l2_regularization=0.0005,
                scales=scales,
                aspect_ratios_per_layer=aspect_ratios,
                two_boxes_for_ar1=two_boxes_for_ar1,
                steps=steps,
                offsets=offsets,
                subtract_mean=mean_color,
                swap_channels=swap_channels,
                output_activation=True,
                lstm=True,
                condense_predictors=True)

# 2: Load some weights into the model.
# TODO: Set the path to the weights you want to load.
weights_path = 'gdrive/My Drive/ColabFiles/ssc300_4_m_lstm_c_ftall.h5'

# With by_name=False the architecture must be identical to the one that
# produced the weights file. With by_name=True weights are only loaded into
# layers whose names match layers stored in the file.
print("LOADING WEIGHTS...\n")
model.load_weights(weights_path, by_name=True)

# 3: Compile the model with a mean squared error loss and print its summary.
print("COMPILE THE MODEL\n")
model.compile(optimizer=adam, loss='mse')
model.summary()



In [0]:
# Print index, name and trainable flag of every layer of the model.
for layer_idx, layer in enumerate(model.layers):
    print(layer_idx, layer.name, layer.trainable)

### Load datasets and convert them in the correct format for the model
Instantiate one DataGenerator object for each set you want to load. If the HDF5 path is set, it will load the dataset from the HDF5 file. Loading into memory is faster, but it is only possible if there is enough free memory.

In [0]:
# 1: Create one `DataGenerator` per split (training, validation, test).
# Neither in-memory loading nor an HDF5 dataset is enabled here. If you have
# enough memory, consider `load_images_into_memory=True`.

train_dataset, val_dataset, test_dataset = (
    DataGenerator(load_images_into_memory=False, hdf5_dataset_path=None)
    for _split in range(3)
)

# 2: Parse the image and label lists of every split. This can take a while.
# TODO: Set the paths to the datasets here.

# Image directories (VOC 2007 and VOC 2012).
images_dir      = '/content/VOCdevkit/VOC2007/JPEGImages/'
images_dir_12   = '/content/VOCdevkit/VOC2012/JPEGImages/'

# Annotation directories (VOC 2007 and VOC 2012).
annotations_dir      = '/content/VOCdevkit/VOC2007/Annotations/'
annotations_dir_12   = '/content/VOCdevkit/VOC2012/Annotations/'

# Image set files that list the samples belonging to each split.
train_image_set_filename    = '/content/VOCdevkit/VOC2007/ImageSets/Main/train.txt'
train_image_set_filename_12    = '/content/VOCdevkit/VOC2012/ImageSets/Main/trainval.txt'
val_image_set_filename      = '/content/VOCdevkit/VOC2007/ImageSets/Main/val.txt'
test_image_set_filename     = '/content/VOCdevkit/VOC2007/ImageSets/Main/test.txt'

# The XML parser needs to know which object class names to look for and in
# which order to map them to integers.
classes = ['aeroplane', 'bicycle', 'bird', 'boat',
           'bottle', 'bus', 'car', 'cat',
           'chair', 'cow', 'diningtable', 'dog',
           'horse', 'motorbike', 'person', 'pottedplant',
           'sheep', 'sofa', 'train', 'tvmonitor']

# Parser options that are identical for all three splits.
shared_parse_options = dict(classes=classes,
                            include_classes='all',
                            exclude_truncated=False,
                            ret=False)

# Training split: VOC 2007 train + VOC 2012 trainval, difficult objects kept.
train_dataset.parse_xml(images_dirs=[images_dir, images_dir_12],
                        image_set_filenames=[train_image_set_filename,
                                             train_image_set_filename_12],
                        annotations_dirs=[annotations_dir, annotations_dir_12],
                        exclude_difficult=False,
                        **shared_parse_options)

# Validation and test splits: VOC 2007 only, difficult objects excluded.
for held_out_dataset, held_out_image_set in ((val_dataset, val_image_set_filename),
                                             (test_dataset, test_image_set_filename)):
    held_out_dataset.parse_xml(images_dirs=[images_dir],
                               image_set_filenames=[held_out_image_set],
                               annotations_dirs=[annotations_dir],
                               exclude_difficult=True,
                               **shared_parse_options)

# Optional: Convert the datasets into HDF5 datasets. This requires more disk
# space, but speeds up the training if the images are not loaded into memory.
# It is not relevant when `load_images_into_memory` is activated in the
# constructor, because the images are in memory already in that case. To
# create the HDF5 datasets, uncomment the three function calls below.

# train_dataset.create_hdf5_dataset(file_path='/content/VOCdevkit/VOC2007/voc_train.h5',
#                                   resize=False,
#                                   variable_image_size=True,
#                                   verbose=True)

# val_dataset.create_hdf5_dataset(file_path='/content/VOCdevkit/VOC2007/voc_val.h5',
#                                 resize=False,
#                                 variable_image_size=True,
#                                 verbose=True)

# test_dataset.create_hdf5_dataset(file_path='/content/VOCdevkit/VOC2007/voc_test.h5',
#                                 resize=False,
#                                 variable_image_size=True,
#                                 verbose=True)

In [0]:
# 3: Set the batch size.
batch_size = 25 # Change the batch size if you like, or if you run into GPU memory issues.

# 4: Set the image transformations for pre-processing and data augmentation.
# SSD-style augmentation chain. It is instantiated here, but it is not passed
# to any of the generators created in this cell.
ssd_data_augmentation = SSDDataAugmentation(img_height=img_height,
                                            img_width=img_width,
                                            background=mean_color)

# Pre-processing applied by all three generators below.
convert_to_3_channels = ConvertTo3Channels()
resize = Resize(height=img_height, width=img_width)

# 5: Instantiate the label encoders.

# Spatial dimensions of the predictor layers of the model, needed by the
# anchor-box based encoder. 'conv4_3_norm' and 'fc7' are not used.
predictor_sizes = [model.get_layer(layer_name).output_shape[1:3]
                   for layer_name in ('conv6_2', 'conv7_2', 'conv8_2', 'conv9_2')]

# Anchor-box based encoder. It is instantiated here, but the generators below
# use `ssc_input_encoder` instead.
ssd_input_encoder = SSCInputEncoder(img_height=img_height,
                                    img_width=img_width,
                                    n_classes=n_classes,
                                    predictor_sizes=predictor_sizes,
                                    scales=scales,
                                    aspect_ratios_per_layer=aspect_ratios,
                                    two_boxes_for_ar1=two_boxes_for_ar1,
                                    steps=steps,
                                    offsets=offsets,
                                    clip_boxes=clip_boxes,
                                    matching_type='multi',
                                    pos_iou_threshold=0.5,
                                    neg_iou_limit=0.5,
                                    normalize_coords=normalize_coords)

# Encoder used as `label_encoder` by every generator below.
ssc_input_encoder = SSCInputEncoder1Pred(len(classes))

# 6: Create the generator handles that will be passed to Keras' `fit_generator()` function.

def _make_generator(dataset, shuffle):
    """Return a batch generator over `dataset` with the settings shared by all splits.

    Only `shuffle` differs between the splits; transformations, label encoder,
    returned items and batch size are the same for all of them.
    """
    return dataset.generate(batch_size=batch_size,
                            shuffle=shuffle,
                            transformations=[convert_to_3_channels,
                                             resize],
                            label_encoder=ssc_input_encoder,
                            returns={'processed_images',
                                     'encoded_labels'},
                            keep_images_without_gt=False)

train_generator = _make_generator(train_dataset, shuffle=True)
val_generator = _make_generator(val_dataset, shuffle=False)
test_generator = _make_generator(test_dataset, shuffle=False)

# Get the number of samples in the training, validation and test datasets.
train_dataset_size = train_dataset.get_dataset_size()
val_dataset_size   = val_dataset.get_dataset_size()
test_dataset_size   = test_dataset.get_dataset_size()

print("Number of images in the training dataset:\t{:>6}".format(train_dataset_size))
print("Number of images in the validation dataset:\t{:>6}".format(val_dataset_size))
print("Number of images in the test dataset:\t\t{:>6}".format(test_dataset_size))

### Train the model
If desired, define a learning rate schedule, then create the callbacks necessary to save the weights and set the learning rate schedule.

In [0]:
# Define a learning rate schedule.

def lr_schedule(epoch):
    """Return the learning rate to use for the given epoch index.

    Step schedule: 0.001 while `epoch` is below 20, 0.0001 while it is below
    80, and 0.00001 from epoch 80 onwards.
    """
    # (exclusive upper epoch bound, learning rate) pairs, checked in order.
    for upper_bound, rate in ((20, 0.001), (80, 0.0001)):
        if epoch < upper_bound:
            return rate
    return 0.00001

In [0]:
# Define model callbacks.

# Save the weights (not the full model) whenever the monitored `val_loss`
# improves. The file name embeds the epoch number, training loss and
# validation loss.
# TODO: Set the filepath under which you want to save the model.
model_checkpoint = ModelCheckpoint(filepath='weights/ssc300_pascal_07+12_epoch-{epoch:02d}_loss-{loss:.4f}_val_loss-{val_loss:.4f}.h5',
                                   monitor='val_loss',
                                   verbose=1,
                                   save_best_only=True,
                                   save_weights_only=True,
                                   mode='auto',
                                   period=1)
# NOTE(review): placeholder left unset — presumably meant for restoring the
# best `val_loss` reached so far when resuming a training; confirm.
#model_checkpoint.best = 

# Append the per-epoch metrics to a CSV file.
# NOTE(review): this file name says 'ssd300' while the checkpoints above say
# 'ssc300' — confirm whether the difference is intended.
csv_logger = CSVLogger(filename='weights/ssd300_pascal_07+12_training_log.csv',
                       separator=',',
                       append=True)

# Set the learning rate of every epoch from `lr_schedule`.
learning_rate_scheduler = LearningRateScheduler(schedule=lr_schedule,
                                                verbose=1)

# Stop the training as soon as the loss becomes NaN.
terminate_on_nan = TerminateOnNaN()

callbacks = [model_checkpoint,
             csv_logger,
             learning_rate_scheduler,
             terminate_on_nan]

If you want to change which layers are trained, you must recompile the model afterwards. To do so, set the `trainable` attribute of each layer to either True or False. In transfer learning, the layers added on top of the backbone network are usually trained first with the backbone layers frozen; in a second step, the topmost layers of the original model are unfrozen and trained along with the new layers.

In [0]:
# Select the layers to train: layers with an index up to and including
# `last_frozen_layer_idx` are frozen, all layers above it are trainable.
# First train the topmost layers, then fine-tune also the convolutional ones.
last_frozen_layer_idx = 22
for layer_idx, layer in enumerate(model.layers):
    layer.trainable = layer_idx > last_frozen_layer_idx

# The model is compiled again after changing the `trainable` flags.
print("COMPILE THE MODEL\n")
model.compile(optimizer=adam, loss='mse')

In [0]:
# Replace the weights currently in the model with another checkpoint from
# Google Drive (loaded without `by_name`).
# NOTE(review): the model is built with lstm=True, while this file name carries
# no "lstm" tag, unlike the checkpoint loaded when the model was created —
# confirm that the checkpoint matches the architecture.
model.load_weights('gdrive/My Drive/ColabFiles/ssc300_4_m_c_ftall_boh.h5')

In [0]:
# Epoch range of this run. If you're resuming a previous training, set
# `initial_epoch` and `final_epoch` accordingly.
initial_epoch   = 70
final_epoch     = 100

# Number of batches needed to go through each dataset once.
steps_per_epoch = ceil(train_dataset_size/batch_size)
validation_steps = ceil(val_dataset_size/batch_size)

fit_options = dict(generator=train_generator,
                   steps_per_epoch=steps_per_epoch,
                   epochs=final_epoch,
                   callbacks=callbacks,
                   validation_data=val_generator,
                   validation_steps=validation_steps,
                   initial_epoch=initial_epoch)

history = model.fit_generator(**fit_options)

### Evaluate the model

In [0]:
from tqdm import tqdm
model.compile(optimizer=adam, loss='mse')

# Accumulators: input images, model outputs and ground truths of the test set.
# `labels` stays empty in this cell.
imgs, predictions, gts, labels = [], [], [], []

# Pull one pass worth of batches from the test generator and predict on each.
n_test_batches = ceil(test_dataset_size/batch_size)
for batch_idx in tqdm(range(n_test_batches)):
    batch_x, batch_y = next(test_generator)
    batch_pred = model.predict(batch_x, batch_size)

    imgs.extend(batch_x)
    predictions.extend(batch_pred)
    gts.extend(batch_y)



In [0]:
# Turn the per-sample lists into arrays. `pred_res` and `gts_res` start out as
# additional names for the very same arrays as `predictions` and `gts`.
imgs = np.array(imgs)
predictions = np.array(predictions)
gts = np.array(gts)
pred_res = predictions
gts_res = gts

If the output is split into predictors, it is necessary to reshape and sum it to get the final count for each class.

In [0]:
# Only for models whose output is split by predictor: uncomment to reshape the
# outputs to one value per class and predictor before summing them.
# NOTE(review): presumably not needed for the model built above with
# condense_predictors=True — confirm.
#pred_res = np.reshape(predictions, (-1,20,len(predictor_sizes)), order='F')
#gts_res = np.reshape(gts, (-1,20,len(predictor_sizes)), order='F')

# Sanity check: predictions and ground truths should have the same shape.
print(pred_res.shape, gts_res.shape)

In [0]:
from matplotlib.pyplot import imshow
import random

# Pick a random test sample. The index is drawn from the number of samples
# actually collected in `imgs`: `random.randint` includes its upper bound, so
# a hard-coded dataset size can produce an index past the end of the arrays.
#i = 4565 #chairs
i = random.randrange(len(imgs))

imshow(imgs[i])

# Rounded prediction, ground truth and their difference for the sample.
rounded_pred = np.round(pred_res[i])
print('Preds:\n', rounded_pred)
print('Truth:\n', gts_res[i])
print((gts_res[i]-rounded_pred))

In [0]:
# Counts cannot be negative: replace any negative prediction with 0.
pred_res = np.clip(pred_res, 0, None)

In [0]:
# Compute the evaluation metrics from the element-wise squared error.
# Assumes both arrays are (n_samples, n_classes), so that axis 0 averages over
# the samples of each class — TODO confirm.
squared_error = (pred_res - gts_res) ** 2

# Root mean squared error over all elements.
RMSE = np.sqrt(squared_error.mean())
print("RMSE: ", RMSE)

# RMSE computed along axis 0, then averaged.
mRMSE = np.sqrt(squared_error.mean(axis=0)).mean()
print("mRMSE: ", mRMSE)

# Same as mRMSE, with each squared error divided by (ground truth + 1).
m_relRMSE = np.sqrt((squared_error / (gts_res + 1)).mean(axis=0)).mean()
print("m_relRMSE: ", m_relRMSE)

In [0]:
# Obtain the prediction by predictor: reshape the raw predictions to
# (samples, 20 classes, number of predictor layers) in Fortran order.
# NOTE(review): this rebinds `predictors`, which the configuration cell defines
# as the list of predictor layer names — re-running earlier cells afterwards
# would see the array instead; consider a different name.
# NOTE(review): presumably only meaningful when the model output is split by
# predictor; the model above is built with condense_predictors=True — confirm.
predictors = np.reshape(predictions, (-1,20,len(predictor_sizes)), order='F')

In [0]:
def pretty_prediction(classes, y_pred, y_true=None, pred_idx=0):
    """Print the non-zero per-class counts of a batch of predictions.

    Args:
        classes: Sequence of class names; entry `c` names column `c`.
        y_pred: 2D array of predicted counts, indexed as `y_pred[sample, class]`.
        y_true: Optional 2D array of ground-truth counts with the same layout.
            When given, a class is printed if its rounded prediction or its
            ground truth is non-zero, as "<class>: <rounded prediction>
            <ground truth>". When omitted, a class is printed if its raw
            prediction is non-zero, as "<class>: <prediction>".
        pred_idx: Unused; kept so that existing calls keep working.
    """
    for i in range(y_pred.shape[0]):
        print("Prediction {}".format(i))
        for c, class_name in enumerate(classes):
            if y_true is not None:
                rounded = np.round(y_pred[i, c])
                if rounded != 0 or y_true[i, c] != 0:
                    print("\t{}: {} {}".format(class_name, rounded, y_true[i, c]))
            elif y_pred[i, c] != 0:
                # This branch previously referenced an undefined name and
                # raised NameError whenever `y_true` was omitted.
                print("\t{}: {}".format(class_name, y_pred[i, c]))

print("The first number is the prediction, the second number is the ground truth.\n")

# Draw the index from the number of samples actually collected in `imgs`:
# `random.randint` includes its upper bound, so a hard-coded dataset size can
# produce an index past the end of the arrays.
i = random.randrange(len(imgs))
# 4469 plane and reflected people
# 2106 bottles

imshow(imgs[i])
print(i)
pretty_prediction(classes, pred_res[i:i+1], gts_res[i:i+1], i)


#### Visualize the pixel-wise activation

In [0]:
from lime import lime_image

In [0]:
# Explain the model output for the sample `i` selected in the previous cell.
# NOTE(review): LIME documents `classifier_fn` as returning per-class
# prediction probabilities; here `model.predict` is passed directly — confirm
# that its outputs give a meaningful explanation.
explainer = lime_image.LimeImageExplainer()
exp = explainer.explain_instance(imgs[i], model.predict)

In [0]:
# Show the explanation image for the first of LIME's top labels, without
# restricting it to positive contributions (positive_only=False) and with up
# to 200 features.
# NOTE(review): `mask` is returned but not used in the plot.
temp, mask = exp.get_image_and_mask(exp.top_labels[0], positive_only=False, num_features=200)
plt.imshow(temp)
plt.show()

In [0]:
# Print the class name of each of the top labels of the explanation.
for c in exp.top_labels:
    print(classes[c])