In [None]:
# Imports

import numpy as np
import sys
# https://github.com/Ujjwal-9/Knowledge-Distillation
sys.path.append('Knowledge/utils/')
import sklearn
import os
import keras
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.models import Model
import os
import keras
from keras import optimizers

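# Use the non-standard ImageDataGenerator from the repository linked above.
# This import replaces the keras.preprocessing.image.ImageDataGenerator imported earlier.
# Its flow_from_directory outputs a y_batch that contains one-hot targets and logits.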
from image_preprocessing_ver2 import ImageDataGenerator

In [None]:
# Load logits saved from teacher

# The teacher's logits are stored under <cwd>/data.
data_dir = os.path.join(os.getcwd(), 'data')

# [()] unwraps the pickled object stored in each .npy file
# (presumably a mapping from image to teacher logits -- TODO confirm against the teacher notebook).
# allow_pickle=True runs pickle on load, so only point this at trusted files.
train_logits = np.load(os.path.join(data_dir, 'cifar10_train_logits.npy'), allow_pickle=True)[()]
val_logits = np.load(os.path.join(data_dir, 'cifar10_val_logits.npy'), allow_pickle=True)[()]

# Pixel values are rescaled by 1/255 for every batch this generator yields.
data_generator = ImageDataGenerator(data_format='channels_last', rescale=1/255)

batch_size = 128
# NOTE(review): `epochs` is not read by distill() or train() in this notebook;
# both pass epochs=25 directly.
epochs=75

# Image folders are resolved relative to the working directory, so data_dir is
# re-pointed to '' here; the later validation-generator cell reuses this value.
data_dir = r''

# Path components are passed separately so the separator is chosen by the OS
# (the previous 'cifar10\\train' form only resolved on Windows).
train_generator = data_generator.flow_from_directory(
    os.path.join(data_dir, 'cifar10', 'train'), train_logits,
    target_size=(32, 32), color_mode='rgb', batch_size=batch_size)
val_generator = data_generator.flow_from_directory(
    os.path.join(data_dir, 'cifar10', 'test'), val_logits,
    target_size=(32, 32), color_mode='rgb', batch_size=batch_size)

In [None]:
# Function to build and return student given dilation parameter

from keras import models, layers

num_classes = 10

def build_student(dilation):
    """Build the (uncompiled) student CNN with layer widths scaled by `dilation`.

    `dilation` multiplies the base layer sizes 32 and 64 (conv filters) and
    256 (dense units). Values above 1 are capped at 1; a value so small that
    the first conv layer would round to zero filters is replaced by 1/32.

    The returned Sequential model takes 32x32x3 inputs and ends with a
    separate softmax Activation over `num_classes` outputs.
    """
    scale = 1 if dilation > 1 else dilation
    if round(32 * scale) == 0:
        scale = 1/32

    def width(base):
        # Layer sizes must be whole numbers.
        return int(round(base * scale))

    stack = (
        # Convolutional block 1
        Conv2D(width(32), (3, 3), input_shape=(32, 32, 3)),
        Activation('relu'),
        MaxPooling2D(pool_size=(2, 2)),
        # Convolutional block 2
        Conv2D(width(64), (3, 3)),
        Activation('relu'),
        MaxPooling2D(pool_size=(2, 2)),
        # Classifier head; softmax is kept as its own layer
        Flatten(),
        Dense(width(256)),
        Activation('relu'),
        Dense(num_classes),
        Activation('softmax'),
    )

    student = Sequential()
    for layer in stack:
        student.add(layer)
    return student

In [None]:
# Distillation loss (soft targets and hard targets)

from keras.losses import categorical_crossentropy as logloss
from keras.metrics import categorical_accuracy, top_k_categorical_accuracy
from keras import backend as K

def distillation_loss(y_true, y_pred, hard_loss_weight, temp):
    """Weighted hard-label cross-entropy plus soft-target cross-entropy.

    Both tensors are split at column 10:
      * y_true: the first 10 columns are used as the hard targets, the
        remaining columns as the teacher logits.
      * y_pred: the first 10 columns are used as the student's probabilities,
        the remaining columns as its temperature-softened probabilities.

    The teacher logits are divided by `temp` and passed through softmax to
    form the soft targets. Only the hard term is scaled (by `hard_loss_weight`).
    """
    hard_targets = y_true[:, :10]
    teacher_logits = y_true[:, 10:]
    student_probs = y_pred[:, :10]
    student_probs_soft = y_pred[:, 10:]

    soft_targets = K.softmax(teacher_logits / temp)

    hard_term = hard_loss_weight * logloss(hard_targets, student_probs)
    soft_term = logloss(soft_targets, student_probs_soft)
    return hard_term + soft_term
    

In [None]:
# Define val data generator

# Unshuffled generator over the test images (shuffle=False), so batches come
# in a fixed order. Path components are passed separately so the separator is
# chosen by the OS (the previous 'cifar10\\test' form only resolved on Windows).
val_generator_no_shuffle = data_generator.flow_from_directory(
    os.path.join(data_dir, 'cifar10', 'test'), val_logits,
    target_size=(32, 32),
    batch_size=batch_size, color_mode='rgb', shuffle=False
)

In [None]:
def distill(dilation, temp, weight):
    """
    Build a student of the given width, train it with the distillation loss
    and evaluate it on the unshuffled validation generator.

    The student's final softmax layer is removed so its logits can feed two
    heads: plain softmax probabilities and softmax of logits / temp. The two
    heads are concatenated into a single output, matching the layout
    distillation_loss expects.

    Metrics are redefined here because soft_logloss depends on a non-standard
    param (temp). model.compile wouldn't take lambdas as metrics, so this was
    the workaround.

    Args:
        dilation: width multiplier forwarded to build_student.
        temp: softmax temperature applied to teacher and student logits.
        weight: weight of the hard-label term in distillation_loss.

    Returns:
        (results, model): `results` is the list returned by
        evaluate_generator -- the loss followed by accuracy, top_5_accuracy,
        categorical_crossentropy and soft_logloss -- and `model` is the
        trained two-headed Keras model.
    """
    def accuracy(y_true, y_pred):
        # Compare only the hard-label halves of targets and predictions.
        y_true = y_true[:, :10]
        y_pred = y_pred[:, :10]
        return categorical_accuracy(y_true, y_pred)

    def top_5_accuracy(y_true, y_pred):
        # top_k_categorical_accuracy is called with its default k.
        y_true = y_true[:, :10]
        y_pred = y_pred[:, :10]
        return top_k_categorical_accuracy(y_true, y_pred)

    def categorical_crossentropy(y_true, y_pred):
        # Hard-label cross-entropy, reported without the loss weighting.
        y_true = y_true[:, :10]
        y_pred = y_pred[:, :10]
        return logloss(y_true, y_pred)

    def soft_logloss(y_true, y_pred):
        # Cross-entropy between softened teacher and softened student outputs.
        logits = y_true[:, 10:]
        y_soft = K.softmax(logits/temp)
        y_pred_soft = y_pred[:, 10:]
        return logloss(y_soft, y_pred_soft)

    student = build_student(dilation)

    # Remove softmax
    student.pop()

    # Get student logits and class probabilities
    logits = student.layers[-1].output
    probabilities = layers.Activation('softmax')(logits)

    # Apply temperature to get softened probabilities
    # Temps of 2.5-4 "worked significantly better" than other temps on networks with 30 units per layer
    logits_T = layers.Lambda(lambda x: x / temp)(logits)
    probabilities_T = layers.Activation('softmax')(logits_T)

    # Define student that outputs probabilities and softened probabilities
    output = layers.concatenate([probabilities, probabilities_T])
    model = Model(student.input, output)

    model.compile(
        optimizer='adam',
        loss=lambda y_true, y_pred: distillation_loss(y_true, y_pred, weight, temp),
        metrics=[accuracy, top_5_accuracy, categorical_crossentropy, soft_logloss])

    # Keras expects integer step counts. Derive them from each generator's own
    # sample count instead of a hard-coded 50000, which was also being applied
    # to the generator built on the test directory and made the evaluation
    # loop over it more than once.
    # NOTE(review): `.n` is the sample count stored by the Keras Iterator base
    # class; assumed to be kept by image_preprocessing_ver2 -- confirm.
    train_steps = int(np.ceil(train_generator.n / batch_size))
    eval_steps = int(np.ceil(val_generator_no_shuffle.n / batch_size))

    model.fit_generator(
        train_generator,
        epochs=25,
        steps_per_epoch=train_steps,
        verbose=0,
        validation_data=val_generator,
        validation_steps=25,
        callbacks=[
                EarlyStopping(monitor='val_loss', patience=5, min_delta=0.005)
            ])

    # One pass over the unshuffled validation generator.
    results = model.evaluate_generator(val_generator_no_shuffle, eval_steps)
    return results, model

In [None]:
# Load data for non-distilled training
from keras.datasets import cifar10
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Scale pixels to [0, 1]. The generator feeding the distilled student is
# configured with rescale=1/255, so without this the benchmark student would
# train on a different input range and the comparison would be confounded.
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255

# Convert class vectors to binary class matrices.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

In [None]:
# Function for training student normally

def train(dilation):
    """Train a student of the given width on hard labels only (no distillation).

    This is the benchmark for distill(): same architecture (build_student),
    optimizer, epoch budget and early-stopping settings, trained with plain
    categorical cross-entropy on (x_train, y_train).

    Args:
        dilation: width multiplier forwarded to build_student.

    Returns:
        The list returned by student.evaluate on (x_test, y_test):
        [loss, accuracy].
    """
    student = build_student(dilation)
    student.compile(
            optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy'])
    student.fit(
        x_train,
        y_train,
        # Match the batch size used by the distillation generators; without
        # this, fit() falls back to the Keras default of 32.
        batch_size=batch_size,
        epochs=25,
        verbose=0,
        validation_data=(x_test, y_test),
        callbacks=[
                EarlyStopping(monitor='val_loss', patience=5, min_delta=0.005)
            ])
    results = student.evaluate(x_test, y_test)
    return results

In [None]:
# Iterate over dilation values, saving results

# Each entry is (dilation, metrics_list): the distilled and the normally
# trained student are evaluated at every width.
dilation_accuracies = []
reg_accuracies = []
for dilation in np.linspace(0.125, 1, 8):
    d_accuracy, model = distill(dilation, temp=10, weight=0.2)
    dilation_accuracies.append((dilation, d_accuracy))
    acc = train(dilation)
    reg_accuracies.append((dilation, acc))

# The rows pair a scalar with a list, i.e. they are ragged. Recent NumPy
# releases refuse to build an array from ragged input unless dtype=object is
# requested explicitly; the result is an (n, 2) object array, which is what
# the plotting cell indexes into.
dilation_accuracies = np.asarray(dilation_accuracies, dtype=object)
reg_accuracies = np.asarray(reg_accuracies, dtype=object)
np.save('data/dilation_accuracies.npy', dilation_accuracies)
np.save('data/reg_accuracies.npy', reg_accuracies)

In [None]:
# Plot acc vs dilation

import matplotlib.pyplot as plt
import numpy as np

# Reload the saved sweep results; each row is (dilation, metrics_list).
d_accuracies = np.load('data/dilation_accuracies.npy', allow_pickle=True)
reg_accuracies = np.load('data/reg_accuracies.npy', allow_pickle=True)

# x: dilation values; y: element 1 of each metrics list.
dilation_x = d_accuracies[:, 0]
dilation_y = [row[1][1] for row in d_accuracies]

reg_x = reg_accuracies[:, 0]
reg_y = [row[1][1] for row in reg_accuracies]

fig, ax = plt.subplots()
ax.plot(dilation_x, dilation_y, label='Distilled')
ax.plot(reg_x, reg_y, label='Benchmark')
ax.set_title('Dilation vs Accuracy')
ax.set_xlabel('Dilation')
ax.set_ylabel('Accuracy')
ax.legend()
ax.set_xticks(np.linspace(0, 1, 9))
fig.savefig('dilation_v_accuracy.png')