# 1. Knowledge Distillation and Hint Layers on CIFAR-10
This notebook contains the code for two experiments on CIFAR-10. The text accompanying the code is brief, as most of the explanation is in the main pdf-file.

First, the packages are imported.

In [1]:
from __future__ import print_function
%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)

from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D, Input, Lambda, concatenate
from tensorflow.keras.losses import categorical_crossentropy as logloss
import numpy as np

2.2.0


In the code below the CIFAR-10 data is imported and normalised.

In [3]:
nb_classes = 10
input_shape = (32, 32, 3)  # Input shape of each image

# Load the CIFAR-10 train/test split (downloaded on first use).
(X_train, y_train), (X_test, y_test) = cifar10.load_data()

# One-hot encode the integer class labels.
Y_train = tf.keras.utils.to_categorical(y_train, num_classes=nb_classes)
Y_test = tf.keras.utils.to_categorical(y_test, num_classes=nb_classes)

# Fix the image layout, cast to float32 and divide the pixel values by 255.
X_train = X_train.reshape(50000, 32, 32, 3).astype('float32') / 255
X_test = X_test.reshape(10000, 32, 32, 3).astype('float32') / 255

print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
50000 train samples
10000 test samples


In the code block below a CNN with an architecture similar to VGG is created; its structure is shown in the summary printed below the cell.

In [4]:
# Teacher network: a VGG-style stack of 3x3 'same'-padded convolutions with
# 2x2 max pooling, followed by a dense classifier head. The sub-model `w_hint`
# exposes the output of the seventh convolution as the hint layer.
inputs = Input(shape=input_shape)

# Block 1: two 64-filter convolutions, pooled to 16x16 (see summary output).
# Only this first convolution sets kernel_initializer explicitly.
t_conv1 = Conv2D(64, kernel_size=(3, 3),
                 activation='relu', 
                 padding = 'same', 
                 kernel_initializer='he_normal')(inputs)
t_conv2 = Conv2D(64, (3, 3), activation='relu', padding = 'same')(t_conv1)
t_maxpool1 = MaxPooling2D(pool_size=(2, 2))(t_conv2)

# No dropout is added before the hint layer (t_conv7): the hint sub-model is
# built from these same layers, and dropout in between was deliberately avoided.

# Block 2: two 128-filter convolutions, pooled to 8x8 (see summary output).
t_conv3 = Conv2D(128, kernel_size=(3, 3),
                 activation='relu',
                 padding = 'same')(t_maxpool1)
t_conv4 = Conv2D(128, (3, 3), activation='relu',padding='same')(t_conv3)
t_maxpool2 = MaxPooling2D(pool_size=(2, 2))(t_conv4)


# Block 3: three 256-filter convolutions; t_conv7 is the hint layer.
t_conv5 = Conv2D(256, kernel_size=(3, 3),
                 activation='relu',
                 padding = 'same')(t_maxpool2)
t_conv6 = Conv2D(256, (3, 3), activation='relu', padding = 'same')(t_conv5)
t_conv7 = Conv2D(256, (3, 3), activation='relu', padding = 'same')(t_conv6)

# Create the hint 'model': shares the layers above with the teacher and
# outputs the t_conv7 feature map.
w_hint = Model(inputs=inputs, outputs = t_conv7, name = 'w_hint')

t_maxpool3 = MaxPooling2D(pool_size=(2, 2))(t_conv7)

t_do1 = Dropout(0.4)(t_maxpool3)

# Block 4: four 512-filter convolutions, then pooling and dropout.
t_conv8 = Conv2D(512, kernel_size=(3, 3),
                 activation='relu',
                 padding = 'same')(t_do1)
t_conv9 = Conv2D(512, (3, 3), activation='relu', padding = 'same')(t_conv8)
t_conv10 = Conv2D(512, (3, 3), activation='relu', padding = 'same')(t_conv9)
t_conv11 = Conv2D(512, (3, 3), activation='relu', padding = 'same')(t_conv10)
t_maxpool4 = MaxPooling2D(pool_size=(2, 2))(t_conv11)

t_do2 = Dropout(0.4)(t_maxpool4)

# Classifier head: 1024 -> 256 -> 64 dense units, dropout, then class logits.
t_flat = Flatten()(t_do2)
t_dense1 = Dense(1024, activation='relu')(t_flat)
t_dense2 = Dense(256, activation='relu')(t_dense1)
t_dense3 = Dense(64, activation='relu')(t_dense2)

t_do3 = Dropout(0.3)(t_dense3)

# The logits layer is named so that it can be looked up later with get_layer().
t_dense_final = Dense(nb_classes, name = 'wo_softmax_teach')(t_do3)
t_softmax = Activation('softmax')(t_dense_final)
# The softmax is a separate Activation layer (not fused into the Dense layer),
# so the model can later be truncated at the logits.
teacher = Model(inputs=inputs, outputs=t_softmax, name = 'teacher')


# SGD with Nesterov momentum for the teacher.
t_optimizer = tf.keras.optimizers.SGD(
    learning_rate=0.01, momentum=0.9, nesterov=True, name='SGD'
)

teacher.compile(loss='categorical_crossentropy',
                optimizer=t_optimizer,
                metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
teacher.summary()

Model: "teacher"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 32, 3)]       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 32, 32, 64)        1792      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 32, 32, 64)        36928     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 16, 16, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 16, 16, 128)       73856     
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 16, 16, 128)       147584    
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 8, 8, 128)         0   

In [None]:
# Training the teacher
epochs = 40
batch_size = 128
# NOTE(review): the test split is also used as the validation data here.
teacher.fit(
    x=X_train,
    y=Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, Y_test),
    verbose=1,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x7fdf6f660dd8>

In the code below the student network is created.

In [5]:
# Student network: two small conv + max-pool stages followed by one hidden
# dense layer. The sub-model `w_guided` exposes the output of the second
# pooling stage as the guided layer.
# NOTE: this rebinds the global `inputs` that the teacher cell also used.
inputs = Input(shape=input_shape)

# Stage 1: 16 filters, pooled to 16x16 (see summary output).
s_conv1 = Conv2D(16, kernel_size=(3, 3),
                 activation='relu', 
                 padding = 'same', 
                 kernel_initializer='he_normal')(inputs)
s_maxpool1 = MaxPooling2D(pool_size=(2, 2))(s_conv1)

# Stage 2: 32 filters, pooled to 8x8 (see summary output).
s_conv2 = Conv2D(32, (3, 3), activation='relu',padding='same')(s_maxpool1)
s_maxpool2 = MaxPooling2D(pool_size=(2, 2))(s_conv2)

# Guided 'model': shares the layers above with the student and outputs the
# (8, 8, 32) feature map of s_maxpool2.
w_guided =  Model(inputs=inputs, outputs=s_maxpool2, name = 'w_guided')

s_flat1 = Flatten()(s_maxpool2)
s_dense1 = Dense(128, activation='relu')(s_flat1)

s_do1 = Dropout(0.3)(s_dense1)

# The logits layer is named so that it can be looked up later with get_layer().
s_dense_final = Dense(nb_classes, name = 'wo_softmax_stud')(s_do1)
s_softmax = Activation('softmax')(s_dense_final)

student = Model(inputs=inputs, outputs=s_softmax, name = 'student')

# Baseline student: plain cross-entropy on the one-hot labels, Adam optimiser.
student.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

student.summary()

Model: "student"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 32, 32, 3)]       0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 32, 32, 16)        448       
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 16, 16, 16)        0         
_________________________________________________________________
conv2d_12 (Conv2D)           (None, 16, 16, 32)        4640      
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 8, 8, 32)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 2048)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               2622

In [None]:
# Train the student to achieve base accuracy
epochs = 30
batch_size = 64
student.fit(
    x=X_train,
    y=Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, Y_test),
    verbose=1,
)
# Recorded values (presumably final validation accuracies of separate runs
# - TODO confirm): 0.6932, 0.7047, 0.7026, 0.6953

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f30342e8f98>

In [None]:
# Drop the trained student so that it can be retrained from scratch
# (again, to achieve base accuracy).
# This only unbinds the name: re-run the student definition cell to obtain a
# freshly initialised network. The former reset_states() call was dropped
# because it only clears the state of stateful (recurrent) layers and never
# re-initialises weights; assigning None before `del` was redundant.
del student

### Hint and guided layer code

In the code below the adaptation layer is created, and Wr is created as described in the paper. The printed tensors are used to check that the output of the hint layer and the output of the guided layer followed by the adaptation layer have the same shape.

In [None]:
# conv_regressor is the adaptation layer: a 1x1 convolution with 256 filters
# applied to the guided layer's output, so that its channel count matches the
# hint layer's.
# NOTE(review): input_shape appears redundant here because the layer is
# called directly on a tensor - confirm before removing.
conv_regressor = Conv2D(256, kernel_size=(1, 1),
                 input_shape = (8,8,32),
                 kernel_initializer='glorot_normal')(w_guided.output)

# Build from w_guided.input rather than the global `inputs`: that name is
# rebound by every model-definition cell, so relying on it only works when
# the student cell was the last one executed.
w_r = Model(inputs=w_guided.input, outputs=conv_regressor, name = 'w_r')

# The first two shapes must agree for the hint loss to be computable.
print(w_hint.output)
print(w_r.output)
print(teacher.output)

Tensor("conv2d_6/Identity:0", shape=(None, 8, 8, 256), dtype=float32)
Tensor("conv2d_13/Identity:0", shape=(None, 8, 8, 256), dtype=float32)
Tensor("activation/Identity:0", shape=(None, 10), dtype=float32)


Creating the function below took me a long time to figure out. In essence it computes the mean squared difference (a scaled squared L2 distance) between the hint layer output and the output of the guided layer followed by the adaptation layer.

In [None]:
def fitnet_loss(target_feat, source_feat):
    """Mean squared difference between two feature tensors.

    Args:
        target_feat: tensor of reference features.
        source_feat: tensor of features compared against ``target_feat``.

    Returns:
        A scalar tensor: the mean over all elements of the squared
        element-wise difference.
    """
    difference = target_feat - source_feat
    squared_error = tf.square(difference)
    return tf.reduce_mean(squared_error)

In [None]:
# Keras calls the loss as loss(y_true, y_pred): y_true is the hint output
# passed to fit(), y_pred is the guided + adaptation output of w_r.
w_r.compile(optimizer='adam', loss=fitnet_loss)

# Hint-layer activations of the teacher for the whole training set.
w_hint_outputs = w_hint.predict(X_train)
print(w_hint_outputs.shape)


(50000, 8, 8, 256)


The hint outputs are then created, and w_r is trained: this updates the student layers up to and including the guided layer, together with the adaptation layer, by minimising the L2 distance between the hint output and the guided + adaptation layer output. Minimum loss is achieved within 9 epochs.

In [None]:
# Regress the guided + adaptation output onto the teacher's hint activations.
w_r.fit(
    x=X_train,
    y=w_hint_outputs,
    epochs=10,
    batch_size=128,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fdf6f5e6a58>

### Knowledge distillation code

In the code below a manual softmax function is defined, and a new teacher model without its softmax output is created. 

In [None]:
# Define a manual softmax function
def softmax(x):
    """Return the softmax of ``x`` computed along its last axis.

    For a 2-D array of logits with one row per sample, every row is
    normalised independently so that it sums to 1. (Normalising by the sum
    over the whole array would instead make the entire batch sum to 1.)
    A 1-D input is treated as a single vector of logits.

    Args:
        x: array-like of logits.

    Returns:
        numpy.ndarray with the same shape as ``x``.
    """
    x = np.asarray(x)
    if x.ndim == 0 or x.size == 0:
        # Degenerate inputs have no axis to reduce over; keep the plain formula.
        return np.exp(x) / np.exp(x).sum()
    # Subtracting the per-row maximum leaves the result unchanged but keeps
    # np.exp from overflowing on large logits.
    shifted = x - x.max(axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=-1, keepdims=True)

# Teacher truncated at its named final Dense layer, i.e. before the softmax
# activation, so that predict() returns logits.
teacher_WO_Softmax = Model(
    inputs=teacher.input,
    outputs=teacher.get_layer('wo_softmax_teach').output,
)


In [None]:
# Teacher logits (pre-softmax outputs) for the training and the test split.
teacher_train_logits, teacher_test_logits = (
    teacher_WO_Softmax.predict(split) for split in (X_train, X_test)
)

This output is then saved because otherwise I would have to retrain the teacher model each time I would want to train the student model with knowledge distillation.

In [None]:
# Cache the teacher logits on disk so that the teacher does not have to be
# retrained for every distillation run. If both files exist they are loaded
# (and replace any in-memory logits); otherwise the logits already held in
# teacher_train_logits / teacher_test_logits are written out.
TEACHER_TRAIN_LOGITS_PATH = "teacher_train_logits.npy"
TEACHER_TEST_LOGITS_PATH = "teacher_test_logits.npy"

try:
    cached_train_logits = np.load(TEACHER_TRAIN_LOGITS_PATH)
    cached_test_logits = np.load(TEACHER_TEST_LOGITS_PATH)
except FileNotFoundError:
    # No complete cache yet: persist the logits computed from the teacher.
    np.save(TEACHER_TRAIN_LOGITS_PATH, teacher_train_logits)
    np.save(TEACHER_TEST_LOGITS_PATH, teacher_test_logits)
else:
    # Only swap in the cached arrays once both files were read successfully.
    teacher_train_logits = cached_train_logits
    teacher_test_logits = cached_test_logits

The code below is rerun for all temperature values (2, 4, 6, 8, 10, 12 and 14).

In [None]:
# Set a temperature value
temp = 12

# Divide the teacher logits by the temperature, then apply the manual
# softmax defined earlier in the notebook.
train_logits_T = teacher_train_logits / temp
test_logits_T = teacher_test_logits / temp
Y_train_soft = softmax(train_logits_T)
Y_test_soft = softmax(test_logits_T)

# Targets are laid out as [one-hot labels | soft teacher targets], i.e. a
# 10 + 10 dimensional vector per sample.
Y_train_new = np.concatenate((Y_train, Y_train_soft), axis=1)
Y_test_new = np.concatenate((Y_test, Y_test_soft), axis=1)

In [None]:
# Sanity check: shapes of the distillation targets and of the inputs.
print(Y_train_new.shape, Y_test_new.shape, X_train.shape, sep='\n')
Y_train_new[1]  # value is discarded: only a cell's last expression is displayed
print(type(teacher_train_logits))

(50000, 20)
(10000, 20)
(50000, 32, 32, 3)
<class 'numpy.ndarray'>


Below the student model with a concatenation of two outputs is created, as described in the paper and Hinton's original knowledge distillation paper.

In [7]:
# Collect the student's logits. The layer is looked up by name (as is done
# for the teacher) instead of by position, so the lookup keeps working if
# layers are added to or removed from the student.
logits = student.get_layer('wo_softmax_stud').output # This is a tensor, so it still has to pass through an Activation layer
probs = Activation('softmax')(logits)

# softened probabilities at raised temperature
# The temperature is handed to the layer through `arguments`, which fixes the
# value when the layer is created. A lambda reading the global `temp` would
# look it up each time the function is invoked and could pick up a value
# assigned by a later cell.
logits_T = tf.keras.layers.Lambda(lambda x, temperature: x / temperature,
                                  arguments={'temperature': temp})(logits)
probs_T = Activation('softmax')(logits_T)

# Output layout: [regular probabilities | temperature-softened probabilities].
output = tf.keras.layers.concatenate([probs, probs_T])

# This is the new student model
student_kd = Model(student.input, output)

student_kd.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 32, 32, 3)]  0                                            
__________________________________________________________________________________________________
conv2d_11 (Conv2D)              (None, 32, 32, 16)   448         input_2[0][0]                    
__________________________________________________________________________________________________
max_pooling2d_4 (MaxPooling2D)  (None, 16, 16, 16)   0           conv2d_11[0][0]                  
__________________________________________________________________________________________________
conv2d_12 (Conv2D)              (None, 16, 16, 32)   4640        max_pooling2d_4[0][0]            
______________________________________________________________________________________________

Below the loss functions for knowledge distillation are defined.

In [None]:
# Declare knowledge distillation loss
def knowledge_distillation_loss(y_true, y_pred, alpha):
    """Weighted sum of the hard-label and the soft-target cross-entropy.

    Both ``y_true`` and ``y_pred`` are split along their second axis at
    ``nb_classes``: the first part is treated as the hard component, the
    remainder as the soft component.

    Args:
        y_true: targets, hard part followed by soft part.
        y_pred: predictions in the same layout as ``y_true``.
        alpha: weight of the soft term; the hard term is weighted ``1 - alpha``.

    Returns:
        ``(1 - alpha) * logloss(hard) + alpha * logloss(soft)``.
    """
    hard_targets = y_true[:, :nb_classes]
    soft_targets = y_true[:, nb_classes:]
    hard_preds = y_pred[:, :nb_classes]
    soft_preds = y_pred[:, nb_classes:]

    hard_term = (1 - alpha) * logloss(hard_targets, hard_preds)
    soft_term = alpha * logloss(soft_targets, soft_preds)
    return hard_term + soft_term

# For testing use regular output probabilities - without temperature
def acc(y_true, y_pred):
    """Categorical accuracy computed on the first ``nb_classes`` columns only.

    Args:
        y_true: targets; only columns ``[:nb_classes]`` are used.
        y_pred: predictions; only columns ``[:nb_classes]`` are used.

    Returns:
        The result of ``tf.keras.metrics.categorical_accuracy`` on the slices.
    """
    hard_columns = slice(None, nb_classes)
    return tf.keras.metrics.categorical_accuracy(
        y_true[:, hard_columns],
        y_pred[:, hard_columns],
    )

The codeblock below is run for all different values of alpha (0.3, 0.5, 0.7 and 0.9).

In [None]:
# The third argument of knowledge_distillation_loss is alpha (fixed to 0.5
# here); edit it to run the other alpha values.
student_kd.compile(
    loss=lambda y_true, y_pred: knowledge_distillation_loss(y_true, y_pred, 0.5),
    optimizer='adam',
    metrics=[acc],
)


In [None]:
# Train the distillation student on the concatenated hard + soft targets.
student_kd.fit(
    x=X_train,
    y=Y_train_new,
    epochs=30,
    batch_size=64,
    validation_data=(X_test, Y_test_new),
    verbose=1,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fdeb840a278>

The models had to be trained tens of times, thus the code blocks below were used to reset the weights. Both had to be run in order to recreate an initialised student_kd model.

In [None]:
# Discard the distillation model and reset Keras' global state.
# The former reset_states() call was dropped: it only clears the state of
# stateful (recurrent) layers and never re-initialises weights. Assigning
# None before `del` was redundant.
del student_kd
tf.keras.backend.clear_session()

In [None]:
# Discard the student; re-run the student definition cell to obtain a freshly
# initialised network. reset_states() was dropped because it only affects
# stateful (recurrent) layers, and assigning None before `del` was redundant.
del student

### Teacher-assistant
The code block below defines the student model for the multi-step knowledge distillation. The student here is called the pupil. As the results were unsatisfactory, I removed the hint layer code. The process of creating this code is exactly equivalent to what is done in the code above.

### Dense layers
I also removed the dense layer code, as these also led to unsatisfactory results. As described in the paper, the final test accuracy would always oscillate between two different values, whether they were trained on soft outputs or not.

In [None]:
# Pupil network: a single conv + max-pool stage followed by one small hidden
# dense layer.
# NOTE: this rebinds the global `inputs` used by the earlier model cells.
inputs = Input(shape=input_shape)


p_conv1 = Conv2D(16, kernel_size=(3, 3),
                 activation='relu', 
                 padding = 'same', 
                 kernel_initializer='he_normal')(inputs)
p_maxpool1 = MaxPooling2D(pool_size=(2, 2))(p_conv1)

# Flattening the pooled 16x16x16 map gives 4096 features (see summary output).
p_flat1 = Flatten()(p_maxpool1)
p_dense1 = Dense(32, activation='relu')(p_flat1)
p_do1 = Dropout(0.2)(p_dense1)
# The logits layer is named so that it can be looked up later with get_layer().
p_dense_final = Dense(nb_classes, name = 'wo_softmax_pupil')(p_do1)
p_softmax = Activation('softmax')(p_dense_final)

pupil = Model(inputs=inputs, outputs=p_softmax, name = 'pupil')

# Baseline pupil: plain cross-entropy on the one-hot labels, Adam optimiser.
pupil.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

pupil.summary()

Model: "pupil"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         [(None, 32, 32, 3)]       0         
_________________________________________________________________
conv2d_33 (Conv2D)           (None, 32, 32, 16)        448       
_________________________________________________________________
max_pooling2d_19 (MaxPooling (None, 16, 16, 16)        0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 4096)              0         
_________________________________________________________________
dense_12 (Dense)             (None, 32)                131104    
_________________________________________________________________
dropout_12 (Dropout)         (None, 32)                0         
_________________________________________________________________
wo_softmax_pupil (Dense)     (None, 10)                330   

In [None]:
# Baseline training of the pupil on the one-hot labels.
pupil.fit(
    x=X_train,
    y=Y_train,
    epochs=20,
    batch_size=64,
    validation_data=(X_test, Y_test),
    verbose=1,
)

In [None]:
# The student is truncated at its logits layer, as was done for the teacher.
# The layer is registered under the name 'wo_softmax_stud' in the student
# definition; the previous lookup of 'wo_softmax_' matched no layer and
# made get_layer() fail.
student_WO_Softmax = Model(student.input, student.get_layer('wo_softmax_stud').output)

# Student logits (pre-softmax outputs) for the training and the test split.
student_train_logits = student_WO_Softmax.predict(X_train)
student_test_logits = student_WO_Softmax.predict(X_test)

In [None]:
# Set a temperature value
temp = 8

# Divide the student logits by the temperature, then apply the manual
# softmax defined earlier in the notebook.
student_train_logits_T = student_train_logits / temp
student_test_logits_T = student_test_logits / temp
Y_train_soft_student = softmax(student_train_logits_T)
Y_test_soft_student = softmax(student_test_logits_T)

# Targets are laid out as [one-hot labels | soft student targets], i.e. a
# 10 + 10 dimensional vector per sample.
Y_train_new_student = np.concatenate((Y_train, Y_train_soft_student), axis=1)
Y_test_new_student = np.concatenate((Y_test, Y_test_soft_student), axis=1)

# Collect the pupil's logits. The layer is looked up by name instead of by
# position, so the lookup keeps working if layers are added to or removed
# from the pupil.
logits_pupil = pupil.get_layer('wo_softmax_pupil').output # This is a tensor, so it still has to pass through an Activation layer
probs_pupil = Activation('softmax')(logits_pupil)

# softened probabilities at raised temperature
# The temperature is handed to the layer through `arguments`, which fixes the
# value when the layer is created. A lambda reading the global `temp` would
# look it up each time the function is invoked and could pick up a value
# assigned by a later cell.
logits_T_pupil = tf.keras.layers.Lambda(lambda x, temperature: x / temperature,
                                        arguments={'temperature': temp})(logits_pupil)
probs_T_pupil = Activation('softmax')(logits_T_pupil)

# Output layout: [regular probabilities | temperature-softened probabilities].
output_pupil = tf.keras.layers.concatenate([probs_pupil, probs_T_pupil])

# This is our new student model
pupil_kd = Model(pupil.input, output_pupil)


# NOTE(review): p_optimizer is created here, but the compile call below uses
# the 'adam' optimiser string instead.
p_optimizer = tf.keras.optimizers.SGD(
    name='SGD', learning_rate=0.01, momentum=0.9, nesterov=True
)

# The third argument of knowledge_distillation_loss is alpha (fixed to 0.5).
pupil_kd.compile(
    loss=lambda y_true, y_pred: knowledge_distillation_loss(y_true, y_pred, 0.5),
    optimizer='adam',
    metrics=[acc],
)

# Train the pupil on the concatenated hard labels + soft student targets.
pupil_kd.fit(
    x=X_train,
    y=Y_train_new_student,
    epochs=20,
    batch_size=64,
    validation_data=(X_test, Y_test_new_student),
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f65ae4a4588>

In [None]:
# Discard the pupil; re-run the pupil definition cell to obtain a freshly
# initialised network. reset_states() was dropped because it only affects
# stateful (recurrent) layers, and assigning None before `del` was redundant.
del pupil

In [None]:
# Discard the distillation pupil. reset_states() was dropped because it only
# affects stateful (recurrent) layers and never re-initialises weights;
# assigning None before `del` was redundant.
del pupil_kd