# 2. Knowledge Distillation and Hint Layers on CIFAR-100
This notebook contains the code for two experiments on CIFAR-100. The explanatory text that accompanies the code has been omitted, because the code is equivalent to that of KD10cifar.ipynb, adapted for CIFAR-100.

In [0]:
from __future__ import print_function
%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)

#from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.datasets import cifar100
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D, Input, Lambda, concatenate
from tensorflow.keras.losses import categorical_crossentropy as logloss
import numpy as np

In [0]:
nb_classes = 100

(X_train, y_train), (X_test, y_test) = cifar100.load_data()

# One-hot encode the integer class labels of both splits.
Y_train, Y_test = (
    tf.keras.utils.to_categorical(labels, nb_classes)
    for labels in (y_train, y_test)
)

# Fix the image layout, cast to float32 and divide the pixel values by 255.
X_train = X_train.reshape(50000, 32, 32, 3).astype('float32') / 255
X_test = X_test.reshape(10000, 32, 32, 3).astype('float32') / 255

print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

input_shape = (32, 32, 3)  # Input shape of each image

# Hyperparameters
nb_filters = 64  # number of convolutional filters to use
pool_size = (2, 2)  # size of pooling area for max pooling
kernel_size = (3, 3)  # convolution kernel size

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz
50000 train samples
10000 test samples


In [0]:
# Teacher network: a deep stack of 3x3 'same'-padded convolutions with 2x2
# max-pooling, followed by dense layers and a softmax over nb_classes.
inputs = Input(shape=input_shape)

# Block 1: two 64-filter convolutions, then pooling.
t_conv1 = Conv2D(64, kernel_size=(3, 3),
                 activation='relu', 
                 padding = 'same', 
                 kernel_initializer='he_normal')(inputs)
t_conv2 = Conv2D(64, (3, 3), activation='relu', padding = 'same')(t_conv1)
t_maxpool1 = MaxPooling2D(pool_size=(2, 2))(t_conv2)

# Could add dropout here for regularization

# Block 2: two 128-filter convolutions, then pooling.
t_conv3 = Conv2D(128, kernel_size=(3, 3),
                 activation='relu',
                 padding = 'same')(t_maxpool1)
t_conv4 = Conv2D(128, (3, 3), activation='relu',padding = 'same')(t_conv3)
t_maxpool2 = MaxPooling2D(pool_size=(2, 2))(t_conv4)


# Block 3: three 256-filter convolutions.
t_conv5 = Conv2D(256, kernel_size=(3, 3),
                 activation='relu',
                 padding = 'same')(t_maxpool2)
t_conv6 = Conv2D(256, (3, 3), activation='relu', padding = 'same')(t_conv5)
t_conv7 = Conv2D(256, (3, 3), activation='relu', padding = 'same')(t_conv6)

# Hint model: maps the teacher input to the output of t_conv7. It is built from
# the same tensors as the teacher, so it uses the teacher's layers.
w_hint = Model(inputs=inputs, outputs = t_conv7, name = 'w_hint')

t_maxpool3 = MaxPooling2D(pool_size=(2, 2))(t_conv7)

t_do1 = Dropout(0.4)(t_maxpool3)

# Block 4: four 512-filter convolutions, then pooling.
t_conv8 = Conv2D(512, kernel_size=(3, 3),
                 activation='relu',
                 padding = 'same')(t_do1)
t_conv9 = Conv2D(512, (3, 3), activation='relu', padding = 'same')(t_conv8)
t_conv10 = Conv2D(512, (3, 3), activation='relu', padding = 'same')(t_conv9)
t_conv11 = Conv2D(512, (3, 3), activation='relu', padding = 'same')(t_conv10)
t_maxpool4 = MaxPooling2D(pool_size=(2,2))(t_conv11)

t_do2 = Dropout(0.4)(t_maxpool4)

# Classifier head: three dense layers, dropout, then the logit layer.
t_flat = Flatten()(t_do2)
t_dense1 = Dense(1024, activation='relu')(t_flat)
t_dense2 = Dense(256, activation='relu')(t_dense1)
t_dense3 = Dense(128, activation='relu')(t_dense2)

t_do3 = Dropout(0.3)(t_dense3)

# The final Dense layer has no activation, so its output is the logits; it is
# named so that it can be looked up by name later.
t_dense_final = Dense(nb_classes, name = 'wo_softmax_teach')(t_do3)
t_softmax = Activation('softmax')(t_dense_final)
# Note that we add a normal softmax layer to begin with
teacher = Model(inputs=inputs, outputs=t_softmax, name = 'teacher')

# Optimizer for the teacher: SGD with Nesterov momentum.
t_optimizer = tf.keras.optimizers.SGD(
    learning_rate=0.01, momentum=0.9, nesterov=True, name='SGD'
)

teacher.compile(loss='categorical_crossentropy',
                optimizer=t_optimizer,
                metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be wrapped
# in print(): that would emit a stray "None" line after the table.
teacher.summary()

Model: "teacher"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 32, 3)]       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 32, 32, 64)        1792      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 32, 32, 64)        36928     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 16, 16, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 16, 16, 128)       73856     
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 16, 16, 128)       147584    
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 8, 8, 128)         0   

In [0]:
# Train the teacher on the one-hot labels.
# The test split is passed as validation_data, so the val_* values that Keras
# reports for this run are computed on the test set.
epochs = 40
batch_size = 128
teacher.fit(X_train, Y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(X_test, Y_test))
# 0.5015  (presumably the validation accuracy of an earlier run -- TODO confirm)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x7fd93ce2d0f0>

In [0]:
# Student network: two small convolution/pooling stages and one hidden dense layer.
# NOTE: this rebinds the global `inputs`; from here on it refers to the
# student's input tensor, not the teacher's.
inputs = Input(shape=input_shape)

s_conv1 = Conv2D(16, kernel_size=(3, 3),
                 activation='relu', 
                 padding = 'same', 
                 kernel_initializer='he_normal')(inputs)
s_maxpool1 = MaxPooling2D(pool_size=(2, 2))(s_conv1)

s_conv2 = Conv2D(32, (3, 3), activation='relu',padding='same')(s_maxpool1)
s_maxpool2 = MaxPooling2D(pool_size=(2, 2))(s_conv2)

# Guided model: maps the student input to the output of the second pooling
# layer. It is built from the same tensors as the student, so it uses the
# student's layers.
w_guided =  Model(inputs=inputs, outputs=s_maxpool2, name = 'w_guided')

s_flat1 = Flatten()(s_maxpool2)
s_dense1 = Dense(128, activation='relu')(s_flat1)

s_do1 = Dropout(0.3)(s_dense1)

# The final Dense layer has no activation, so its output is the logits; the
# softmax is applied by a separate Activation layer.
s_dense_final = Dense(nb_classes, name = 'wo_softmax_stud')(s_do1)
s_softmax = Activation('softmax')(s_dense_final)

student = Model(inputs=inputs, outputs=s_softmax, name = 'student')

student.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

student.summary()

Model: "student"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 32, 3)]       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 32, 32, 16)        448       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 16, 16, 16)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 16, 16, 32)        4640      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 8, 8, 32)          0         
_________________________________________________________________
flatten (Flatten)            (None, 2048)              0         
_________________________________________________________________
dense (Dense)                (None, 128)               2622

In [0]:
# Train the student directly on the one-hot labels.
# The test split is passed as validation_data, so the reported val_* values
# are computed on the test set.
student.fit(X_train, Y_train,
          batch_size=64,
          epochs=20,
          verbose=1,
          validation_data=(X_test, Y_test))
# 0.3459, 0.3559, 0.3617, 0.3551 30 epochs
# 0.3728 with hint and guided layer.
# NOTE(review): the figures above look like accuracies recorded from earlier
# runs; the first line mentions 30 epochs while this cell trains for 20 -- confirm.

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fbba4275da0>

In [0]:
# Regressor for hint training: a 1x1 convolution with 256 filters applied to the
# student's guided output (w_guided.output), so that the result can be compared
# element-wise with the teacher's hint output (w_hint.output).
# NOTE(review): input_shape is presumably ignored because the layer is called
# directly on an existing tensor -- confirm before relying on it.
conv_regressor = Conv2D(256, kernel_size=(1, 1),
                 input_shape = (8,8,32),
                 kernel_initializer='glorot_normal')(w_guided.output)

# `inputs` must still be the Input tensor that w_guided was built from.
w_r = Model(inputs=inputs, outputs=conv_regressor, name = 'w_r')
# Print the symbolic outputs so that their shapes can be compared.
print(w_hint.output)
print(w_r.output)
print(teacher.output)

Tensor("conv2d_6/Identity:0", shape=(None, 8, 8, 256), dtype=float32)
Tensor("conv2d_15/Identity:0", shape=(None, 8, 8, 256), dtype=float32)
Tensor("activation/Identity:0", shape=(None, 100), dtype=float32)


In [0]:
def fitnet_loss(target_feat, source_feat):
    """Return the mean squared difference between two feature tensors.

    Args:
        target_feat: tensor holding the target features.
        source_feat: tensor holding the features to compare with the target;
            it must be subtractable from `target_feat`.

    Returns:
        A scalar tensor: the mean over all elements of
        (target_feat - source_feat) ** 2.
    """
    squared_error = tf.square(target_feat - source_feat)
    return tf.reduce_mean(squared_error)

In [0]:
# Compile the regressor model for hint training. fitnet_loss already has the
# (y_true, y_pred) signature Keras expects, so it is passed directly instead of
# being wrapped in an anonymous lambda; the loss then keeps a proper name.
w_r.compile(
    optimizer='adam',
    loss=fitnet_loss)

In [0]:
# Run the hint model on the whole training set; the resulting activations are
# used as regression targets when w_r is fitted.
w_hint_outputs = w_hint.predict(X_train)
print(w_hint_outputs.shape)
# Optional: cache the activations on disk.
#np.save("w_hint_outputs.npy",w_hint_outputs)


(50000, 8, 8, 256)


In [0]:
# Hint training: fit w_r so that its output matches the precomputed hint activations.
w_r.fit(X_train, w_hint_outputs, batch_size=128, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fd8c8692208>

In [0]:
# Define a manual softmax function
def softmax(x, axis=-1):
    """Numerically stable softmax along one axis of an array.

    The previous implementation divided by the sum over the WHOLE array, so for
    a (n_samples, n_classes) batch of logits the rows did not sum to 1. This
    version normalizes each slice along `axis` separately.

    Args:
        x: array-like of logits.
        axis: axis along which to normalize. Defaults to the last axis, i.e.
            one distribution per row for a 2-D batch. For 1-D input the result
            is the same as normalizing over the whole array.

    Returns:
        A NumPy array with the same shape as `x` whose entries along `axis`
        are non-negative and sum to 1.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalize; return an empty float array of the same shape.
        return np.exp(x)
    if x.ndim == 0:
        # A scalar has no axes; normalize over the single value.
        axis = None
    # Subtracting the maximum leaves the result unchanged mathematically but
    # prevents overflow in exp() for large logits.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)
    
# Logit-level view of the teacher: same input, but the output is taken from the
# 'wo_softmax_teach' Dense layer, i.e. before the softmax activation. This must
# be defined (not commented out) because later cells call its predict().
teacher_WO_Softmax = Model(teacher.input, teacher.get_layer('wo_softmax_teach').output)


In [0]:
# Compute the teacher's outputs for the training and test images.
teacher_train_logits = teacher_WO_Softmax.predict(X_train)
teacher_test_logits = teacher_WO_Softmax.predict(X_test) 
# teacher_WO_Softmax is expected to output the teacher's logits directly (the
# values before the softmax); it must be defined before this cell runs.

In [0]:
# Cache the teacher logits on disk (written to the current working directory).
np.save("teacher_train_logits_100_cifar.npy",teacher_train_logits)
np.save("teacher_test_logits_100_cifar.npy",teacher_test_logits)

In [0]:
# Reload the cached teacher logits from the .npy files in the working directory.
teacher_train_logits = np.load("teacher_train_logits_100_cifar.npy")
teacher_test_logits = np.load("teacher_test_logits_100_cifar.npy")

In [0]:
# Convert both logit arrays to float64 (presumably for extra precision in the
# NumPy softmax that follows -- TODO confirm).
teacher_train_logits, teacher_test_logits = (
    logits.astype('float64')
    for logits in (teacher_train_logits, teacher_test_logits)
)

In [0]:
# Set a temperature value
temp = 10


# Perform a manual softmax at raised temperature:
# divide the teacher logits by the temperature, then apply softmax.
train_logits_T = teacher_train_logits / temp
test_logits_T = teacher_test_logits / temp 

Y_train_soft = softmax(train_logits_T)
Y_test_soft = softmax(test_logits_T)

# Concatenate the one-hot labels and the soft targets along axis 1, so that each
# row becomes an nb_classes + nb_classes (100 + 100) dimensional vector:
# hard labels first, soft targets second.
Y_train_new = np.concatenate([Y_train, Y_train_soft], axis=1)
Y_test_new =  np.concatenate([Y_test, Y_test_soft], axis =1)

In [0]:
# Build the distillation model on top of the existing student.
# student.layers[-2] is the layer just before the final softmax Activation,
# i.e. the 'wo_softmax_stud' Dense layer that produces the logits.
logits = student.layers[-2].output # This is going to be a tensor. And hence it needs to pass through a Activation layer
probs = Activation('softmax')(logits)

# softened probabilities at raised temperature
# The lambda reads the global `temp` when it is called, so `temp` must be
# defined (and keep its value) for as long as this model is used.
logits_T = tf.keras.layers.Lambda(lambda x: x / temp)(logits)
probs_T = Activation('softmax')(logits_T)

# Output layout: regular probabilities first, tempered probabilities second.
output = tf.keras.layers.concatenate([probs, probs_T])

# This is our new student model
student_kd = Model(student.input, output)

#student_kd.summary()

In [0]:
# Declare knowledge distillation loss
def knowledge_distillation_loss(y_true, y_pred, alpha):
    """Weighted sum of a hard-label and a soft-label cross-entropy.

    Both `y_true` and `y_pred` are split along axis 1 at column `nb_classes`:
    the first `nb_classes` columns are treated as the hard part, the remaining
    columns as the soft part.

    Args:
        y_true: targets; columns [:nb_classes] and [nb_classes:] are compared
            with the matching column ranges of `y_pred`.
        y_pred: predictions with the same column layout as `y_true`.
        alpha: weight of the soft term; the hard term is weighted by 1 - alpha.

    Returns:
        (1 - alpha) * logloss(hard targets, hard predictions)
        + alpha * logloss(soft targets, soft predictions).
    """
    # Split targets and predictions into their hard and soft halves.
    hard_targets = y_true[:, :nb_classes]
    soft_targets = y_true[:, nb_classes:]
    hard_preds = y_pred[:, :nb_classes]
    soft_preds = y_pred[:, nb_classes:]

    hard_term = logloss(hard_targets, hard_preds)
    soft_term = logloss(soft_targets, soft_preds)

    return (1 - alpha) * hard_term + alpha * soft_term

# Accuracy is measured on the regular output probabilities only - without temperature
def acc(y_true, y_pred):
    """Categorical accuracy computed on the first `nb_classes` columns.

    Args:
        y_true: targets; only columns [:nb_classes] are used.
        y_pred: predictions; only columns [:nb_classes] are used.

    Returns:
        The result of tf.keras.metrics.categorical_accuracy on the sliced
        targets and predictions.
    """
    hard_targets = y_true[:, :nb_classes]
    hard_probs = y_pred[:, :nb_classes]
    return tf.keras.metrics.categorical_accuracy(hard_targets, hard_probs)

In [0]:
# Compile the distillation model. The lambda adapts the three-argument
# knowledge_distillation_loss to the (y_true, y_pred) signature, fixing
# alpha to 0.5 (equal weight for the hard and the soft term).
student_kd.compile(
    optimizer='adam',
    loss=(lambda y_true, y_pred: knowledge_distillation_loss(y_true, y_pred, 0.5)),
    metrics=[acc] )


In [0]:
# Train the distillation model on the concatenated (hard + soft) targets.
# The test split is passed as validation_data, so the reported val_* values
# are computed on the test set.
# NOTE(review): only a single epoch is run here -- confirm this is intended.
student_kd.fit(X_train, Y_train_new,
          batch_size=64,
          epochs=1,
          verbose=1,
          validation_data=(X_test, Y_test_new))

In [0]:
# Tear down the student models: reset their states, drop the global names
# (student_kd first, then student) and clear the Keras backend session.
student_kd.reset_states()
student.reset_states()
del student_kd, student
tf.keras.backend.clear_session()