In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
from scipy import ndimage
import math
import ast #to easily read out class text file that contains some unknwn syntax.
import scipy   #to upscale the image
import matplotlib.pyplot as plt
import cv2     
from keras.applications.resnet import ResNet50, preprocess_input
from keras.models import Model   
from PIL import Image



# Load and preprocess CIFAR10 Dataset

In [3]:
(training_images, training_labels) , (test_images, test_labels) = tf.keras.datasets.cifar10.load_data()

def preprocess_image_input(input_images):
  input_images = input_images.astype('float32')
  output_ims = tf.keras.applications.resnet50.preprocess_input(input_images)
  return output_ims

train_X = preprocess_image_input(training_images)
test_X = preprocess_image_input(test_images)


Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


# Teacher Section

## Teacher Model - Resnet50 with custom top

In [4]:
def feature_extractor(inputs):

  feature_extractor = tf.keras.applications.resnet.ResNet50(input_shape=(224, 224, 3),
                                               include_top=False,
                                               weights='imagenet')(inputs)
 
  return feature_extractor



def classifier(inputs):
    x= tf.keras.layers.Conv2D(2048, (1, 1), strides=(1, 1), padding="same")(inputs)
    x= tf.keras.layers.ReLU()(x)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)  
    x = tf.keras.layers.Dense(10,name="classification")(x)
    return x



def final_model(inputs):

    resize = tf.keras.layers.UpSampling2D(size=(7,7))(inputs)

    resnet_feature_extractor = feature_extractor(resize)
    
    
    classification_output = classifier(resnet_feature_extractor)

    return classification_output




def define_compile_model():
  inputs = tf.keras.layers.Input(shape=(32,32,3))
  
  classification_output = final_model(inputs) 
  model = tf.keras.Model(inputs=inputs, outputs = classification_output)
 
  model.compile(optimizer='SGD', 
                loss='sparse_categorical_crossentropy',
                metrics = ['accuracy'])
  
  return model


resnet50 = define_compile_model()

resnet50.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 32, 32, 3)]       0         
                                                                 
 up_sampling2d (UpSampling2  (None, 224, 224, 3)       0         
 D)                                                              
                                                                 
 resnet50 (Functional)       (None, 7, 7, 2048)        23587712  
                                                                 
 conv2d (Conv2D)             (None, 7, 7, 2048)        4196352   
                                                                 
 re_lu (ReLU)                (None, 7, 7, 2048)        0         
                                                  

In [5]:
resnet50.compile(
    optimizer=keras.optimizers.SGD(),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)


## Teacher Training

In [6]:
resnet50.fit(train_X, training_labels, epochs=3, validation_data = (test_X, test_labels), batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7eadec557cd0>

# Baseline Distillation 

In [None]:
#proportionality maintained student
student_bl = keras.Sequential(
    [
        keras.Input(shape=(32, 32, 3)),
        layers.Conv2D(128, (3, 3), strides=(2, 2), padding="same"),
        layers.LeakyReLU(alpha=0.2),
        layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"),
        layers.Conv2D(64, (3, 3), strides=(2, 2), padding="same"),
        layers.LeakyReLU(alpha=0.2),
        layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"),
        layers.Conv2D(32, (3, 3), strides=(2, 2), padding="same"),
        layers.GlobalAveragePooling2D(),
        layers.Dense(10),
    ],
    name="student_bl",
)

student_bl.summary()

In [None]:
class Distiller_bl(keras.Model):
    def __init__(self, student, teacher):
        super().__init__()
        self.teacher = teacher
        self.student = student

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
        
        super().compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def train_step(self, data):
       
        x, y = data

  
        teacher_predictions = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
           
            student_predictions = self.student(x, training=True)

            
            student_loss = self.student_loss_fn(y, student_predictions)

            
            distillation_loss = (
                self.distillation_loss_fn(
                    tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
                    tf.nn.softmax(student_predictions / self.temperature, axis=1),
                )
                * self.temperature**2
            )

            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss

       
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        
        self.compiled_metrics.update_state(y, student_predictions)

       
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss}
        )
        return results

    def test_step(self, data):
       
        x, y = data

      
        y_prediction = self.student(x, training=False)

       
        student_loss = self.student_loss_fn(y, y_prediction)

      
        self.compiled_metrics.update_state(y, y_prediction)

       
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results

In [None]:
distiller = Distiller_bl(student=student_bl, teacher=resnet50)
distiller.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=['accuracy'],
    student_loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=keras.losses.KLDivergence(),
    alpha=0.1,
    temperature=3,
)


distiller.fit(train_X, training_labels, epochs=10)


distiller.evaluate(test_X, test_labels)

# Distillation with only CAM loss

In [7]:
batch_size = 32
train_dataset = tf.data.Dataset.from_tensor_slices((train_X, training_labels))
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)

In [8]:
#proportionality maintained student
student_c1 = keras.Sequential(
    [
        keras.Input(shape=(32, 32, 3)),
        layers.Conv2D(128, (3, 3), strides=(2, 2), padding="same"),
        layers.LeakyReLU(alpha=0.2),
        layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"),
        layers.Conv2D(64, (3, 3), strides=(2, 2), padding="same"),
        layers.LeakyReLU(alpha=0.2),
        layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"),
        layers.Conv2D(32, (3, 3), strides=(2, 2), padding="same"),
        layers.GlobalAveragePooling2D(),
        layers.Dense(10),
    ],
    name="student_c1",
)

student_c1.summary()

Model: "student_c1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_1 (Conv2D)           (None, 16, 16, 128)       3584      
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 16, 16, 128)       0         
                                                                 
 max_pooling2d (MaxPooling2  (None, 16, 16, 128)       0         
 D)                                                              
                                                                 
 conv2d_2 (Conv2D)           (None, 8, 8, 64)          73792     
                                                                 
 leaky_re_lu_1 (LeakyReLU)   (None, 8, 8, 64)          0         
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 8, 8, 64)          0         
 g2D)                                                   

In [9]:
def plt_heatmap(img,heatmap,title):
    
    fig,ax= plt.subplots()
    ax.imshow(img)
    #ax.imshow(heatmap, cmap='jet', alpha=0.5)
    ax.set_title(title) 
    plt.show()



def cam_loss(model_one,model_two,layer_one,layer_two,cam_img): 
    cifar10_classes = [
        
        'Airplane',
        'Automobile',
        'Bird',
        'Cat',
        'Deer',
        'Dog',
        'Frog',
        'Horse',
        'Ship',
        'Truck'
    ]
    
    img_tensor = np.expand_dims(cam_img, axis=0)
    preprocessed_img = preprocess_input(img_tensor)
    last_layer_weights = model_one.layers[-1].get_weights()[0]
    vis_model = Model(inputs=model_one.input, outputs=(model_one.layers[layer_one].output,model_one.layers[-1].output))
    with tf.GradientTape() as tape:
        last_conv_output, pred_vec = vis_model(preprocessed_img,training=False)
      
        last_conv_output = np.squeeze(last_conv_output) 
      
        pred = np.argmax(pred_vec)
   
    upsampled_last_conv_output = ndimage.zoom(last_conv_output, (4.57, 4.57, 1), order=1)
    last_layer_weights_for_pred = last_layer_weights[:, pred]
    heat_map = np.dot(upsampled_last_conv_output, last_layer_weights_for_pred) 
    flat=heat_map.flatten()
    
    #plt_heatmap(cam_img,heat_map,cifar10_classes[pred])
       
    
    last_layer_weights_2 = model_two.layers[-1].get_weights()[0]
    vis_model_2 = Model(inputs=model_two.input, outputs=(model_two.layers[layer_two].output,model_two.layers[-1].output))
    
    with tf.GradientTape() as tape:
        last_conv_output_2, pred_vec_2 = vis_model_2(preprocessed_img,training=False)
        last_conv_output_2 = np.squeeze(last_conv_output_2)         
        pred_2 = np.argmax(pred_vec_2)
        
    h = int(cam_img.shape[0]/last_conv_output_2.shape[0])
    w = int(cam_img.shape[1]/last_conv_output_2.shape[1])
    upsampled_last_conv_output_2 = ndimage.zoom(last_conv_output_2, (h, w, 1), order=1) 
    last_layer_weights_for_pred_2 = last_layer_weights_2[:, pred]
    heat_map_2 = np.dot(upsampled_last_conv_output_2, last_layer_weights_for_pred_2) 
    flat_2=heat_map_2.flatten()
    
        
    #plt_heatmap(cam_img,heat_map_2,cifar10_classes[pred_2])
    
    
    
    absolute_differences = np.abs(flat - flat_2)

    mae = np.mean(absolute_differences)
    
        
    
    return mae





class Distiller(keras.Model):
    def __init__(self, student, teacher):
        super().__init__()
        self.teacher = teacher
        self.student = student
        self.indexing=0

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
       
        super().compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature
        
    
    def train_step(self, x,y,conts):
       
       
        x_numpy=x.numpy()
       
             
        # retrieve the cam loss
        caml=(cam_loss(self.teacher,self.student,-4,-3,x_numpy[0]))
    
      
                                  
         
    
        teacher_predictions = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
   
            student_predictions = self.student(x, training=True)

           
            student_loss = self.student_loss_fn(y, student_predictions)
            distillation_loss = (
                self.distillation_loss_fn(
                    tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
                    tf.nn.softmax(student_predictions / self.temperature, axis=1),
                )
                * self.temperature**2
            )
     
     
            loss = self.alpha * student_loss + (1 - self.alpha) * caml
            
           
                            
        
        
        self.indexing=self.indexing+10
        if self.indexing>49500:
            self.indexing=0
       
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

       
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        
        self.compiled_metrics.update_state(y, student_predictions)

        
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss}
        )
       
        return results,distillation_loss,caml

    def test_step(self, data):
     
        x, y = data

       
        y_prediction = self.student(x, training=False)

       
        student_loss = self.student_loss_fn(y, y_prediction)

      
        self.compiled_metrics.update_state(y, y_prediction)

       
        print('hi')
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results

In [10]:
distiller_c1 = Distiller(student=student_c1, teacher=resnet50)
distiller_c1.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=['accuracy'],
    student_loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=keras.losses.KLDivergence(),
    alpha=0.1,
    temperature=3,
)

In [11]:
conts=1
epochs = 10
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))

    

    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
       
       

        history,l1,l2=distiller_c1.train_step(x_batch_train,y_batch_train,conts)
        conts+=1
    print(conts)
    tf.print(l1)
    print(l2)
    for key, value in history.items():
        print(key, tf.print(value))




Start of epoch 0
1564
10.433486
12.830579
0.39264
accuracy None
1.55721092
student_loss None
10.433486
distillation_loss None

Start of epoch 1
3127
9.27807426
11.692741
0.4465
accuracy None
1.18062067
student_loss None
9.27807426
distillation_loss None

Start of epoch 2
4690
9.54609585
13.128896
0.48118
accuracy None
2.00735426
student_loss None
9.54609585
distillation_loss None

Start of epoch 3
6253
8.57855606
12.52506
0.506185
accuracy None
1.55245376
student_loss None
8.57855606
distillation_loss None

Start of epoch 4
7816
7.32777834
12.49141
0.52642
accuracy None
1.06652486
student_loss None
7.32777834
distillation_loss None

Start of epoch 5
9379
8.28400707
13.057808
0.543326676
accuracy None
0.993043423
student_loss None
8.28400707
distillation_loss None

Start of epoch 6
10942
5.82007504
19.01378
0.557391405
accuracy None
0.747252
student_loss None
5.82007504
distillation_loss None

Start of epoch 7
12505
6.44685888
9.628952
0.569197476
accuracy None
0.989153922
student_loss

In [12]:
distiller_c1.evaluate(test_X,test_labels)

hi


[0.6452000141143799, 1.2904012203216553]

# Distillation with cam loss (softmax,mae)

In [13]:
#proportionality maintained student
student_c2 = keras.Sequential(
    [
        keras.Input(shape=(32, 32, 3)),
        layers.Conv2D(128, (3, 3), strides=(2, 2), padding="same"),
        layers.LeakyReLU(alpha=0.2),
        layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"),
        layers.Conv2D(64, (3, 3), strides=(2, 2), padding="same"),
        layers.LeakyReLU(alpha=0.2),
        layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"),
        layers.Conv2D(32, (3, 3), strides=(2, 2), padding="same"),
        layers.GlobalAveragePooling2D(),
        layers.Dense(10),
    ],
    name="student_c2",
)

student_c2.summary()

Model: "student_c2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_4 (Conv2D)           (None, 16, 16, 128)       3584      
                                                                 
 leaky_re_lu_2 (LeakyReLU)   (None, 16, 16, 128)       0         
                                                                 
 max_pooling2d_2 (MaxPoolin  (None, 16, 16, 128)       0         
 g2D)                                                            
                                                                 
 conv2d_5 (Conv2D)           (None, 8, 8, 64)          73792     
                                                                 
 leaky_re_lu_3 (LeakyReLU)   (None, 8, 8, 64)          0         
                                                                 
 max_pooling2d_3 (MaxPoolin  (None, 8, 8, 64)          0         
 g2D)                                                   

In [14]:
def softmax(x):
    exp_x = np.exp(x - np.max(x))  
    return exp_x / exp_x.sum(axis=0, keepdims=True)

def plt_heatmap(img,heatmap,title):
    
    fig,ax= plt.subplots()
    ax.imshow(img)
    #ax.imshow(heatmap, cmap='jet', alpha=0.5)
    ax.set_title(title) 
    plt.show()



def cam_loss(model_one,model_two,layer_one,layer_two,cam_img): 
    cifar10_classes = [
        
        'Airplane',
        'Automobile',
        'Bird',
        'Cat',
        'Deer',
        'Dog',
        'Frog',
        'Horse',
        'Ship',
        'Truck'
    ]
    
    img_tensor = np.expand_dims(cam_img, axis=0)
    preprocessed_img = preprocess_input(img_tensor)
    last_layer_weights = model_one.layers[-1].get_weights()[0]
    vis_model = Model(inputs=model_one.input, outputs=(model_one.layers[layer_one].output,model_one.layers[-1].output))
    with tf.GradientTape() as tape:
        last_conv_output, pred_vec = vis_model(preprocessed_img,training=False)
      
        last_conv_output = np.squeeze(last_conv_output) 
      
        pred = np.argmax(pred_vec)
   
    upsampled_last_conv_output = ndimage.zoom(last_conv_output, (4.57, 4.57, 1), order=1)
    last_layer_weights_for_pred = last_layer_weights[:, pred]
    heat_map = np.dot(upsampled_last_conv_output, last_layer_weights_for_pred) 
    flat=heat_map.flatten()
    flat=softmax(flat)
    
    #plt_heatmap(cam_img,heat_map,cifar10_classes[pred])
       
    
    last_layer_weights_2 = model_two.layers[-1].get_weights()[0]
    vis_model_2 = Model(inputs=model_two.input, outputs=(model_two.layers[layer_two].output,model_two.layers[-1].output))
    
    with tf.GradientTape() as tape:
        last_conv_output_2, pred_vec_2 = vis_model_2(preprocessed_img,training=False)
        last_conv_output_2 = np.squeeze(last_conv_output_2)         
        pred_2 = np.argmax(pred_vec_2)
        
    h = int(cam_img.shape[0]/last_conv_output_2.shape[0])
    w = int(cam_img.shape[1]/last_conv_output_2.shape[1])
    upsampled_last_conv_output_2 = ndimage.zoom(last_conv_output_2, (h, w, 1), order=1) 
    last_layer_weights_for_pred_2 = last_layer_weights_2[:, pred]
    heat_map_2 = np.dot(upsampled_last_conv_output_2, last_layer_weights_for_pred_2) 
    flat_2=heat_map_2.flatten()
    flat_2=softmax(flat_2)
    
  
    
    
        
    #plt_heatmap(cam_img,heat_map_2,cifar10_classes[pred_2])
    
    
    
    absolute_differences = np.abs(flat - flat_2)

    mae = np.mean(absolute_differences)
    
        
    
    return mae





class Distiller_3(keras.Model):
    def __init__(self, student, teacher):
        super().__init__()
        self.teacher = teacher
        self.student = student
        self.indexing=0

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
       
        super().compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature
        
    
    def train_step(self, x,y,conts):
       
       
        x_numpy=x.numpy()
       
             
        # retrieve the cam loss
        caml=(cam_loss(self.teacher,self.student,-4,-3,x_numpy[0]))
    
      
                                  
         
    
        teacher_predictions = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
   
            student_predictions = self.student(x, training=True)

           
            student_loss = self.student_loss_fn(y, student_predictions)
            distillation_loss = (
                self.distillation_loss_fn(
                    tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
                    tf.nn.softmax(student_predictions / self.temperature, axis=1),
                )
                * self.temperature**2
            )
     
     
            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss + caml*10
           
            
           
                            
        
        
        self.indexing=self.indexing+10
        if self.indexing>49500:
            self.indexing=0
       
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

       
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        
        self.compiled_metrics.update_state(y, student_predictions)

        
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss}
        )
       
        return results,distillation_loss,caml

    def test_step(self, data):
     
        x, y = data

       
        y_prediction = self.student(x, training=False)

       
        student_loss = self.student_loss_fn(y, y_prediction)

      
        self.compiled_metrics.update_state(y, y_prediction)

       
        print('hi')
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results

In [15]:
distiller_c2 = Distiller_3(student=student_c2, teacher=resnet50)
distiller_c2.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=['accuracy'],
    student_loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=keras.losses.KLDivergence(),
    alpha=0.1,
    temperature=3,
)

In [16]:
conts=1
epochs = 10
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))

    

    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
       
       

        history,l1,l2=distiller_c2.train_step(x_batch_train,y_batch_train,conts)
        conts+=1
    print(conts)
    tf.print(l1)
    print(l2)
    for key, value in history.items():
        print(key, tf.print(value))




Start of epoch 0
1564
5.21923399
0.0019018062
0.41108
accuracy None
0.977855384
student_loss None
5.21923399
distillation_loss None

Start of epoch 1
3127
5.76990271
0.0018867933
0.46929
accuracy None
1.28827631
student_loss None
5.76990271
distillation_loss None

Start of epoch 2
4690
6.55854082
0.0009849127
0.50498
accuracy None
1.15404582
student_loss None
6.55854082
distillation_loss None

Start of epoch 3
6253
7.41599274
0.001953125
0.530995
accuracy None
2.23063707
student_loss None
7.41599274
distillation_loss None

Start of epoch 4
7816
6.38461065
0.0019531248
0.551016
accuracy None
1.61445296
student_loss None
6.38461065
distillation_loss None

Start of epoch 5
9379
4.75229597
0.0019531242
0.566683352
accuracy None
0.568609059
student_loss None
4.75229597
distillation_loss None

Start of epoch 6
10942
5.71874142
0.0019531203
0.579668581
accuracy None
1.49964952
student_loss None
5.71874142
distillation_loss None

Start of epoch 7
12505
4.3732934
0.0019503656
0.591137528
accur

In [17]:
distiller_c2.evaluate(test_X,test_labels)

hi


[0.6414999961853027, 0.9571666717529297]

# Distillation with cam loss (sigmoid,mae)

In [22]:
#proportionality maintained student
student_c3= keras.Sequential(
    [
        keras.Input(shape=(32, 32, 3)),
        layers.Conv2D(128, (3, 3), strides=(2, 2), padding="same"),
        layers.LeakyReLU(alpha=0.2),
        layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"),
        layers.Conv2D(64, (3, 3), strides=(2, 2), padding="same"),
        layers.LeakyReLU(alpha=0.2),
        layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"),
        layers.Conv2D(32, (3, 3), strides=(2, 2), padding="same"),
        layers.GlobalAveragePooling2D(),
        layers.Dense(10),
    ],
    name="student_c3",
)

student_c3.summary()

Model: "student_c3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_10 (Conv2D)          (None, 16, 16, 128)       3584      
                                                                 
 leaky_re_lu_6 (LeakyReLU)   (None, 16, 16, 128)       0         
                                                                 
 max_pooling2d_6 (MaxPoolin  (None, 16, 16, 128)       0         
 g2D)                                                            
                                                                 
 conv2d_11 (Conv2D)          (None, 8, 8, 64)          73792     
                                                                 
 leaky_re_lu_7 (LeakyReLU)   (None, 8, 8, 64)          0         
                                                                 
 max_pooling2d_7 (MaxPoolin  (None, 8, 8, 64)          0         
 g2D)                                                   

In [23]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def plt_heatmap(img,heatmap,title):
    
    fig,ax= plt.subplots()
    ax.imshow(img)
    #ax.imshow(heatmap, cmap='jet', alpha=0.5)
    ax.set_title(title) 
    plt.show()



def cam_loss(model_one,model_two,layer_one,layer_two,cam_img): 
    cifar10_classes = [
        
        'Airplane',
        'Automobile',
        'Bird',
        'Cat',
        'Deer',
        'Dog',
        'Frog',
        'Horse',
        'Ship',
        'Truck'
    ]
    
    img_tensor = np.expand_dims(cam_img, axis=0)
    preprocessed_img = preprocess_input(img_tensor)
    last_layer_weights = model_one.layers[-1].get_weights()[0]
    vis_model = Model(inputs=model_one.input, outputs=(model_one.layers[layer_one].output,model_one.layers[-1].output))
    with tf.GradientTape() as tape:
        last_conv_output, pred_vec = vis_model(preprocessed_img,training=False)
      
        last_conv_output = np.squeeze(last_conv_output) 
      
        pred = np.argmax(pred_vec)
   
    upsampled_last_conv_output = ndimage.zoom(last_conv_output, (4.57, 4.57, 1), order=1)
    last_layer_weights_for_pred = last_layer_weights[:, pred]
    heat_map = np.dot(upsampled_last_conv_output, last_layer_weights_for_pred) 
    flat=heat_map.flatten()
    flat=sigmoid(flat)
    
    #plt_heatmap(cam_img,heat_map,cifar10_classes[pred])
       
    
    last_layer_weights_2 = model_two.layers[-1].get_weights()[0]
    vis_model_2 = Model(inputs=model_two.input, outputs=(model_two.layers[layer_two].output,model_two.layers[-1].output))
    
    with tf.GradientTape() as tape:
        last_conv_output_2, pred_vec_2 = vis_model_2(preprocessed_img,training=False)
        last_conv_output_2 = np.squeeze(last_conv_output_2)         
        pred_2 = np.argmax(pred_vec_2)
        
    h = int(cam_img.shape[0]/last_conv_output_2.shape[0])
    w = int(cam_img.shape[1]/last_conv_output_2.shape[1])
    upsampled_last_conv_output_2 = ndimage.zoom(last_conv_output_2, (h, w, 1), order=1) 
    last_layer_weights_for_pred_2 = last_layer_weights_2[:, pred]
    heat_map_2 = np.dot(upsampled_last_conv_output_2, last_layer_weights_for_pred_2) 
    flat_2=heat_map_2.flatten()
    flat_2=sigmoid(flat_2)
    
  
    
    
        
    #plt_heatmap(cam_img,heat_map_2,cifar10_classes[pred_2])
    
    
    
    absolute_differences = np.abs(flat - flat_2)

    mae = np.mean(absolute_differences)
    
        
    
    return mae





class Distiller_4(keras.Model):
    def __init__(self, student, teacher):
        super().__init__()
        self.teacher = teacher
        self.student = student
        self.indexing=0

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
       
        super().compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature
        
    
    def train_step(self, x,y,conts):
       
       
        x_numpy=x.numpy()
       
             
        # retrieve the cam loss
        caml=(cam_loss(self.teacher,self.student,-4,-3,x_numpy[0]))
    
      
                                  
         
    
        teacher_predictions = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
   
            student_predictions = self.student(x, training=True)

           
            student_loss = self.student_loss_fn(y, student_predictions)
            distillation_loss = (
                self.distillation_loss_fn(
                    tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
                    tf.nn.softmax(student_predictions / self.temperature, axis=1),
                )
                * self.temperature**2
            )
     
     
            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss + caml
           
           
           
                            
        
        
        self.indexing=self.indexing+10
        if self.indexing>49500:
            self.indexing=0
       
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

       
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        
        self.compiled_metrics.update_state(y, student_predictions)

        
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss}
        )
       
        return results,distillation_loss,caml

    def test_step(self, data):
     
        x, y = data

       
        y_prediction = self.student(x, training=False)

       
        student_loss = self.student_loss_fn(y, y_prediction)

      
        self.compiled_metrics.update_state(y, y_prediction)

       
        print('hi')
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results

In [24]:
distiller_c3 = Distiller_4(student=student_c3, teacher=resnet50)
distiller_c3.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=['accuracy'],
    student_loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=keras.losses.KLDivergence(),
    alpha=0.1,
    temperature=3,
)

In [25]:
conts=1
epochs = 10
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))

    

    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
       
       

        history,l1,l2=distiller_c3.train_step(x_batch_train,y_batch_train,conts)
        conts+=1
    print(conts)
    tf.print(l1)
    print(l2)
    for key, value in history.items():
        print(key, tf.print(value))




Start of epoch 0


  return 1 / (1 + np.exp(-x))


1564
7.32344341
0.31949922
0.40366
accuracy None
1.23258853
student_loss None
7.32344341
distillation_loss None

Start of epoch 1
3127
6.26005077
0.72482526
0.46428
accuracy None
1.19808292
student_loss None
6.26005077
distillation_loss None

Start of epoch 2
4690
6.33762503
0.42104483
0.500826657
accuracy None
1.90962744
student_loss None
6.33762503
distillation_loss None

Start of epoch 3
6253
5.38203
0.33154404
0.526075
accuracy None
1.34064531
student_loss None
5.38203
distillation_loss None

Start of epoch 4
7816
2.61925936
0.25167355
0.545916
accuracy None
0.68687582
student_loss None
2.61925936
distillation_loss None

Start of epoch 5
9379
4.81336117
0.32036978
0.562433362
accuracy None
1.46040189
student_loss None
4.81336117
distillation_loss None

Start of epoch 6
10942
5.61103725
0.09207921
0.57557714
accuracy None
1.12619174
student_loss None
5.61103725
distillation_loss None

Start of epoch 7
12505
3.83559346
0.12265824
0.58706
accuracy None
0.568683147
student_loss None
3.

In [26]:
distiller_c3.evaluate(test_X,test_labels)

hi


[0.6571999788284302, 1.1914244890213013]

# Distillation with cam loss (mae,random 5 samples)

In [27]:
#proportionality maintained student
student_c4= keras.Sequential(
    [
        keras.Input(shape=(32, 32, 3)),
        layers.Conv2D(128, (3, 3), strides=(2, 2), padding="same"),
        layers.LeakyReLU(alpha=0.2),
        layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"),
        layers.Conv2D(64, (3, 3), strides=(2, 2), padding="same"),
        layers.LeakyReLU(alpha=0.2),
        layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"),
        layers.Conv2D(32, (3, 3), strides=(2, 2), padding="same"),
        layers.GlobalAveragePooling2D(),
        layers.Dense(10),
    ],
    name="student_c4",
)

student_c4.summary()

Model: "student_c4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_13 (Conv2D)          (None, 16, 16, 128)       3584      
                                                                 
 leaky_re_lu_8 (LeakyReLU)   (None, 16, 16, 128)       0         
                                                                 
 max_pooling2d_8 (MaxPoolin  (None, 16, 16, 128)       0         
 g2D)                                                            
                                                                 
 conv2d_14 (Conv2D)          (None, 8, 8, 64)          73792     
                                                                 
 leaky_re_lu_9 (LeakyReLU)   (None, 8, 8, 64)          0         
                                                                 
 max_pooling2d_9 (MaxPoolin  (None, 8, 8, 64)          0         
 g2D)                                                   

In [28]:
def plt_heatmap(img,heatmap,title):
    
    fig,ax= plt.subplots()
    ax.imshow(img)
    #ax.imshow(heatmap, cmap='jet', alpha=0.5)
    ax.set_title(title) 
    plt.show()



def cam_loss(model_one,model_two,layer_one,layer_two,cam_img): 
    cifar10_classes = [
        
        'Airplane',
        'Automobile',
        'Bird',
        'Cat',
        'Deer',
        'Dog',
        'Frog',
        'Horse',
        'Ship',
        'Truck'
    ]
    
    img_tensor = np.expand_dims(cam_img, axis=0)
    preprocessed_img = preprocess_input(img_tensor)
    last_layer_weights = model_one.layers[-1].get_weights()[0]
    vis_model = Model(inputs=model_one.input, outputs=(model_one.layers[layer_one].output,model_one.layers[-1].output))
    with tf.GradientTape() as tape:
        last_conv_output, pred_vec = vis_model(preprocessed_img,training=False)
      
        last_conv_output = np.squeeze(last_conv_output) 
      
        pred = np.argmax(pred_vec)
   
    upsampled_last_conv_output = ndimage.zoom(last_conv_output, (4.57, 4.57, 1), order=1)
    last_layer_weights_for_pred = last_layer_weights[:, pred]
    heat_map = np.dot(upsampled_last_conv_output, last_layer_weights_for_pred) 
    flat=heat_map.flatten()
    
    #plt_heatmap(cam_img,heat_map,cifar10_classes[pred])
    

    
    
    last_layer_weights_2 = model_two.layers[-1].get_weights()[0]
    vis_model_2 = Model(inputs=model_two.input, outputs=(model_two.layers[layer_two].output,model_two.layers[-1].output))
    
    with tf.GradientTape() as tape:
        last_conv_output_2, pred_vec_2 = vis_model_2(preprocessed_img,training=False)
        last_conv_output_2 = np.squeeze(last_conv_output_2)         
        pred_2 = np.argmax(pred_vec_2)
        
    h = int(cam_img.shape[0]/last_conv_output_2.shape[0])
    w = int(cam_img.shape[1]/last_conv_output_2.shape[1])
    upsampled_last_conv_output_2 = ndimage.zoom(last_conv_output_2, (h, w, 1), order=1) 
    last_layer_weights_for_pred_2 = last_layer_weights_2[:, pred]
    heat_map_2 = np.dot(upsampled_last_conv_output_2, last_layer_weights_for_pred_2) 
    flat_2=heat_map_2.flatten()
    
    #print(heat_map_2[0])
    
    #plt_heatmap(cam_img,heat_map_2,cifar10_classes[pred_2])
    
    
    
    absolute_differences = np.abs(flat - flat_2)


    mae = np.mean(absolute_differences)
    
    
    
    
    return mae





class Distiller_5(keras.Model):
    def __init__(self, student, teacher):
        super().__init__()
        self.teacher = teacher
        self.student = student
        self.indexing=0

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
       
        super().compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature
        
    
    def train_step(self, x,y,conts):
       
       
        x_numpy=x.numpy()
       
        
        sum=0
        
        leng=int(x_numpy.size/x_numpy[0].size)
        ind_one=0
        for i in range(leng): 
            if ind_one >= leng:
                break
            sum=sum+(cam_loss(self.teacher,self.student,-4,-3,x_numpy[ind_one]))
            ind_one+=7
            
        
        caml=(sum/5)
        
        
        
                          
         
    
        teacher_predictions = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
            # Forward pass of student
            student_predictions = self.student(x, training=True)

            # Compute losses
            student_loss = self.student_loss_fn(y, student_predictions)
            distillation_loss = (
                self.distillation_loss_fn(
                    tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
                    tf.nn.softmax(student_predictions / self.temperature, axis=1),
                )
                * self.temperature**2
            )
     
     
            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss + caml
                            
        
        
        self.indexing=self.indexing+10
        if self.indexing>49500:
            self.indexing=0
       
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

       
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        
        self.compiled_metrics.update_state(y, student_predictions)

        
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss}
        )
       
        return results

    def test_step(self, data):
     
        x, y = data

       
        y_prediction = self.student(x, training=False)

       
        student_loss = self.student_loss_fn(y, y_prediction)

      
        self.compiled_metrics.update_state(y, y_prediction)

       
        print('hi')
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results

In [29]:
distiller_c4 = Distiller_5(student=student_c4, teacher=resnet50)
distiller_c4.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=['accuracy'],
    student_loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=keras.losses.KLDivergence(),
    alpha=0.1,
    temperature=3,
)

In [30]:
conts=1
epochs = 10
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))

    

    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
       
       

        history=distiller_c4.train_step(x_batch_train,y_batch_train,conts)
        conts+=1
    print(conts)
    for key, value in history.items():
        print(key, tf.print(value))




Start of epoch 0
1564
0.41358
accuracy None
1.09150195
student_loss None
6.02514935
distillation_loss None

Start of epoch 1
3127
0.46973
accuracy None
2.42356873
student_loss None
9.37817383
distillation_loss None

Start of epoch 2
4690
0.505046666
accuracy None
2.07264328
student_loss None
7.65290403
distillation_loss None

Start of epoch 3
6253
0.530355
accuracy None
0.656530619
student_loss None
4.7881918
distillation_loss None

Start of epoch 4
7816
0.550592
accuracy None
0.608008862
student_loss None
4.50802898
distillation_loss None

Start of epoch 5
9379
0.566626668
accuracy None
1.24867606
student_loss None
5.79140139
distillation_loss None

Start of epoch 6
10942
0.579317153
accuracy None
1.10779262
student_loss None
4.02469587
distillation_loss None

Start of epoch 7
12505
0.59037751
accuracy None
1.43136072
student_loss None
5.95758629
distillation_loss None

Start of epoch 8
14068
0.600113332
accuracy None
0.781636
student_loss None
4.29827
distillation_loss None

Start o

In [31]:
distiller_c4.evaluate(test_X,test_labels)

hi


[0.6520000100135803, 0.7082465291023254]