## Maciej Manna
### Project 1:
# Model for classification of CIFAR100 dataset 

# Approach:

Classification using **DenseNet** model (see: G. Huang et al., [*Densely Connected Convolutional Networks*](https://arxiv.org/abs/1608.06993), 2018). Implementation in ```tensorflow 2``` inspired by [```densenet-tf2```](https://github.com/justincosentino/densenet-tf2). 

In [192]:
# IMPORTS

import os
import time

import tensorflow as tf
import tensorflow_datasets as tfds

## Dataset Import and Preprocessing

In [193]:
# DATA IMPORT AND SPLIT PARAMETERS

# Dataset identifier and the per-channel statistics used for normalization.
ds_name = 'cifar100:3.*.*'
ds_mean = (129.3, 124.1, 112.4)
ds_stddev = (68.2, 65.4, 70.4)

# Batch size and the sizes of the training / validation parts taken from 'train'.
ds_batch_size = 64
ds_trn_size = 45000
ds_val_size = 5000

# Split specs: 'train' sliced into [0, ds_trn_size) for training and
# [ds_trn_size, ds_trn_size + ds_val_size) for validation, plus the whole 'test'.
ds_split = [
    "train[0:{}]".format(ds_trn_size),
    "train[{}:{}]".format(ds_trn_size, ds_trn_size + ds_val_size),
    "test",
]

In [194]:
# LOAD CIFAR100 DATA USING 'tensorflow_datasets'

# Load the three splits as (image, label) pairs together with the dataset metadata.
(trn, val, tst), info = tfds.load(
    name=ds_name,
    split=ds_split,
    as_supervised=True,
    with_info=True,
)

In [195]:
# PREPROCESS DATASET

def get_norm_fn(mean, stddev):
    """Build a dataset-map function that standardizes images channel-wise.

    Args:
        mean: Three per-channel means (reshaped to [1, 1, 3] for broadcasting).
        stddev: Three per-channel standard deviations (same layout as ``mean``).

    Returns:
        A function ``(img, label) -> (normalized_img, label)`` where the image is
        cast to float64, shifted by ``mean`` and scaled by ``stddev``; the label
        is passed through untouched.
    """
    channel_shape = [1, 1, 3]
    mean_t = tf.reshape(tf.constant(mean, dtype=tf.float64), channel_shape)
    stddev_t = tf.reshape(tf.constant(stddev, dtype=tf.float64), channel_shape)

    def normalize_image(img, label):
        centered = tf.math.subtract(tf.cast(img, tf.float64), mean_t)
        scaled = tf.math.divide(centered, stddev_t)
        return scaled, label

    return normalize_image

# Training pipeline: normalize -> cache -> shuffle (buffer of 1000) -> batch -> repeat.
trn = trn.map(get_norm_fn(ds_mean, ds_stddev), num_parallel_calls=tf.data.experimental.AUTOTUNE)
trn = trn.cache()
trn = trn.shuffle(1000)
trn = trn.batch(ds_batch_size)
trn = trn.repeat()

# Validation pipeline: normalize -> batch -> cache (no shuffling, single pass).
val = val.map(get_norm_fn(ds_mean, ds_stddev), num_parallel_calls=tf.data.experimental.AUTOTUNE)
val = val.batch(ds_batch_size)
val = val.cache()

# Test pipeline: same treatment as the validation data.
tst = tst.map(get_norm_fn(ds_mean, ds_stddev), num_parallel_calls=tf.data.experimental.AUTOTUNE)
tst = tst.batch(ds_batch_size)
tst = tst.cache()

In [196]:
# BASIC DATASET PARAMETERS

# Sizes: training / validation come from the split parameters, test from the dataset info.
batch_size = ds_batch_size
trn_size, val_size = ds_trn_size, ds_val_size
tst_size = info.splits['test'].num_examples
ds_size = trn_size + val_size + tst_size

# Feature metadata reported by tensorflow_datasets.
ds_keys = info.supervised_keys
lbl_type = info.features['label'].dtype
n_classes = info.features['label'].num_classes
img_type = info.features['image'].dtype
img_shape = info.features['image'].shape

# Human-readable summary of the values gathered above.
print("\n".join([
    "'CIFAR100' DATASET INFO:",
    f"  - examples:                     {ds_size} (training: {trn_size}, validation: {val_size}, test: {tst_size})",
    f"  - batch size used:              {batch_size}",
    f"  - number of separate classes:   {n_classes}",
    f"  - features:                     {', '.join(ds_keys)}",
    f"        - image - data type: {img_type}, shape: {img_shape}",
    f"        - label - data type: {lbl_type}",
]))

'CIFAR100' DATASET INFO:
  - examples:                     60000 (training: 45000, validation: 5000, test: 10000)
  - batch size used:              64
  - number of separate classes:   100
  - features:                     image, label
        - image - data type: <dtype: 'uint8'>, shape: (32, 32, 3)
        - label - data type: <dtype: 'int64'>


## Model Construction

In [197]:
# CONSTANTS AND SETTINGS
# Epsilon passed to every BatchNormalization layer created by the block builders below.
eps = 1e-5

In [198]:
# BLOCK DEFINITIONS

# === DEFINITION OF TRANSITION BLOCKS === #

def transition_block(x, reduction, name, dropout):
    """Build a transition block: BN -> ReLU -> 1x1 conv -> (dropout) -> 2x2 avg-pool.

    Args:
        x: Input Keras tensor; axis 3 is treated as the channel axis.
        reduction: Factor applied to the number of input channels to obtain the
            number of filters of the 1x1 convolution.
        name: Prefix for the names of all layers created here.
        dropout: Dropout rate applied after the convolution; no dropout layer
            is added when it is not positive.

    Returns:
        The output tensor of the average-pooling layer.
    """
    x = tf.keras.layers.BatchNormalization(axis=3, epsilon=eps, name=name + '.batchnorm')(x)
    x = tf.keras.layers.ReLU(name=name + '.relu')(x)
    x = tf.keras.layers.Conv2D(filters=int(x.shape[-1] * reduction),
                               kernel_size=(1, 1),
                               strides=(1, 1),
                               padding="valid",
                               use_bias=False,
                               kernel_regularizer=tf.keras.regularizers.l2(l=1e-4),
                               kernel_initializer=tf.keras.initializers.he_normal(),
                               name=name + '.conv2d')(x)

    if dropout > 0:
        # The layer must be *called* on the tensor; assigning the bare layer
        # object to `x` would break the pooling layer below.
        x = tf.keras.layers.Dropout(rate=dropout, name=name + '.dropout')(x)

    x = tf.keras.layers.AvgPool2D(pool_size=(2, 2),
                                  strides=(2, 2),
                                  padding="valid",
                                  name=name + '.avgpool')(x)

    return x


def final_transition_block(x, name):
    """Build the closing block: BN -> ReLU -> global average pooling.

    Args:
        x: Input Keras tensor; axis 3 is treated as the channel axis.
        name: Prefix for the names of the three layers created here.

    Returns:
        The output tensor of the global average-pooling layer.
    """
    stages = (
        tf.keras.layers.BatchNormalization(axis=3, epsilon=eps, name=name + '.batchnorm'),
        tf.keras.layers.ReLU(name=name + '.relu'),
        tf.keras.layers.GlobalAvgPool2D(name=name + '.avgpool'),
    )
    for layer in stages:
        x = layer(x)
    return x


# === DEFINITION OF CONVOLUTION BLOCK === #

def convolution_block(x, growth_rate, name, dropout):
    """Build one convolution block and concatenate its output to the input.

    The new features are computed as
    BN -> ReLU -> 1x1 conv (4 * growth_rate filters) -> (dropout) ->
    BN -> ReLU -> zero-pad -> 3x3 conv (growth_rate filters) -> (dropout),
    and are then concatenated with ``x`` along axis 3.

    Args:
        x: Input Keras tensor; axis 3 is treated as the channel axis.
        growth_rate: Number of filters of the 3x3 convolution, i.e. the number
            of channels this block appends to ``x``.
        name: Prefix for the names of all layers created here.
        dropout: Dropout rate applied after each convolution; no dropout layers
            are added when it is not positive.

    Returns:
        The concatenation of ``x`` and the newly computed features.
    """

    # BOTTLENECK
    y = tf.keras.layers.BatchNormalization(axis=3, epsilon=eps, name=name + '.batchnorm_0')(x)
    y = tf.keras.layers.ReLU(name=name + '.relu_0')(y)
    y = tf.keras.layers.Conv2D(filters=4 * growth_rate,
                               kernel_size=(1, 1),
                               strides=(1, 1),
                               padding="valid",
                               use_bias=False,
                               kernel_regularizer=tf.keras.regularizers.l2(l=1e-4),
                               kernel_initializer=tf.keras.initializers.he_normal(),
                               name=name + '.conv2d_1')(y)
    if dropout > 0:
        # Apply dropout to the new features `y`; `x` must stay the untouched
        # block input because it is concatenated at the end.
        y = tf.keras.layers.Dropout(rate=dropout, name=name + '.dropout_1')(y)

    y = tf.keras.layers.BatchNormalization(axis=3, epsilon=eps, name=name + '.batchnorm_1')(y)
    y = tf.keras.layers.ReLU(name=name + '.relu_1')(y)

    # Explicit 1-pixel zero padding followed by a "valid" 3x3 convolution.
    y = tf.keras.layers.ZeroPadding2D(padding=((1, 1), (1, 1)), name=name + '.padding')(y)
    y = tf.keras.layers.Conv2D(filters=growth_rate,
                               kernel_size=(3, 3),
                               strides=(1, 1),
                               padding="valid",
                               use_bias=False,
                               kernel_regularizer=tf.keras.regularizers.l2(l=1e-4),
                               kernel_initializer=tf.keras.initializers.he_normal(),
                               name=name + '.conv2d_2')(y)

    if dropout > 0:
        y = tf.keras.layers.Dropout(rate=dropout, name=name + '.dropout_2')(y)

    # NOTE: the layer name is `name + 'cat'` (no '.' separator); it is kept
    # unchanged so that existing layer names stay the same.
    x = tf.keras.layers.Concatenate(axis=3, name=name + 'cat')([x, y])

    return x


# === DEFINITION OF DENSE BLOCK === #

def dense_block(x, growth_rate, num_per_block, name, dropout):
    """Chain ``num_per_block`` convolution blocks, feeding each one's output to the next.

    Args:
        x: Input Keras tensor.
        growth_rate: Forwarded to every ``convolution_block``.
        num_per_block: Number of convolution blocks to stack.
        name: Name prefix; the i-th block (1-based) is named ``{name}.block_{i}``.
        dropout: Dropout rate forwarded to every ``convolution_block``.

    Returns:
        The output tensor of the last convolution block (``x`` itself when
        ``num_per_block`` is 0).
    """
    for block_no in range(1, num_per_block + 1):
        block_name = f"{name}.block_{block_no}"
        x = convolution_block(x, growth_rate, name=block_name, dropout=dropout)
    return x

In [199]:
# DENSENET DEFINITION (for 32x32 images of CIFAR)

def densenet(depth=100, growth_rate=12, reduction=0.5, dropout=0.0, summary=True):
    """Assemble the three-stage DenseNet for 32x32x3 inputs.

    Layout: initial padded 3x3 convolution (2 * growth_rate filters), then
    dense block + transition block twice, then a third dense block followed by
    the final transition block and a dense output layer with ``n_classes``
    units (no activation is set, so it outputs raw scores).

    Args:
        depth: Nominal depth; every dense block gets ``((depth - 4) // 3) // 2``
            convolution blocks.
        growth_rate: Growth rate forwarded to the dense blocks.
        reduction: Channel reduction factor forwarded to the transition blocks.
        dropout: Dropout rate forwarded to dense and transition blocks.
        summary: When true, print ``model.summary()`` before returning.

    Returns:
        The (uncompiled) ``tf.keras.Model`` named "densenet".
    """
    blocks_per_stage = ((depth - 4) // 3) // 2

    # INPUT LAYER
    inputs = tf.keras.Input(shape=(32, 32, 3), name="input")

    # INITIAL CONVOLUTION LAYER
    net = tf.keras.layers.ZeroPadding2D(padding=((1, 1), (1, 1)), name="init.padding")(inputs)
    net = tf.keras.layers.Conv2D(
        filters=growth_rate * 2,
        kernel_size=(3, 3),
        strides=(1, 1),
        padding="valid",
        use_bias=False,
        kernel_regularizer=tf.keras.regularizers.l2(l=1e-4),
        kernel_initializer=tf.keras.initializers.he_normal(),
        name="init.conv2d",
    )(net)

    # FIRST AND SECOND STAGE: dense block followed by a down-sampling transition
    for stage in (1, 2):
        net = dense_block(net, growth_rate, blocks_per_stage, f"dense_{stage}", dropout=dropout)
        net = transition_block(net, reduction, f"trans_{stage}", dropout=dropout)

    # THIRD (FINAL) STAGE: dense block followed by global pooling
    net = dense_block(net, growth_rate, blocks_per_stage, "dense_3", dropout=dropout)
    net = final_transition_block(net, "trans_3")

    # OUTPUT LAYER
    outputs = tf.keras.layers.Dense(
        units=n_classes,
        kernel_regularizer=tf.keras.regularizers.l2(l=1e-4),
        name="output",
    )(net)

    model = tf.keras.Model(inputs=inputs, outputs=outputs, name="densenet")

    if summary:
        model.summary()

    return model

## Basic survey of hyperparameters and number of model parameters

In [200]:
# PARAMETER SETUPS

# Named hyperparameter setups for `densenet`. All of them share reduction 0.5
# and dropout 0.0; only depth and growth rate vary.
PARAMS = {
    setup: {
        'depth': depth,
        'growth_rate': growth_rate,
        'reduction': 0.5,
        'dropout': 0.0,
    }
    for setup, depth, growth_rate in (
        ('default', 100, 12),
        ('shallow_50', 50, 12),
        ('shallow_40', 40, 12),
        ('shallow_35', 35, 12),
        ('grow_8', 100, 8),
        ('grow_5', 100, 5),
        ('grow_5_adj', 97, 5),
        ('mixed_d64_g8', 64, 8),
        ('mixed_d56_g8', 56, 8),
        ('mixed_d128_g4', 128, 4),
        ('mixed_d122_g4', 122, 4),
    )
}

### Number of parameters in different setups:

Note: max number of parameters allowed - 150,000.

 - ```default``` - 824,020

 - ```shallow_50``` - 238,714
 - ```shallow_40``` - 193,180
 - ```shallow_35``` - 151,546 <<<
 
 - ```grow_8``` - 384,692
 - ```grow_5``` - 162,966
 - ```grow_5_adj``` - 146,866 <<<
 
 - ```mixed_d64_g8``` - 187,124
 - ```mixed_d56_g8``` - 135,604 <<<
 - ```mixed_d128_g4``` - 157,092
 - ```mixed_d122_g4``` - 144,594 <<<
 
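 Choice of four setups at or near the parameter limit (marked `<<<` above; note that ```shallow_35```, at 151,546 parameters, is slightly over the 150,000 limit), with the following depth / growth-rate trade-offs: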
  - #1 - depth: 122, gr:  4
  - #2 - depth:  97, gr:  5
  - #3 - depth:  56, gr:  8
  - #4 - depth:  35, gr: 12

In [201]:
# model = densenet(**PARAMS['mixed_d120_g4'])

# Selected setups as (short key used for training runs, key into PARAMS) pairs.
choices = [
    ('d122_gr4', 'mixed_d122_g4'),
    ('d97_gr5', 'grow_5_adj'),
    ('d56_gr8', 'mixed_d56_g8'),
    ('d35_gr12', 'shallow_35'),
]

## Model Training and Evaluation - Setup

In [207]:
# TRAINING SETTINGS

# Optimizer hyperparameters (initial learning rate and SGD momentum).
learning_rate = 0.1
momentum = 0.9

#base_dir='~/.dlearn/'
#base_dir='.dlearn'

# Root directory for per-model checkpoints and logs. '~' is expanded *before*
# the directory is created, so a home-relative path does not end up as a
# literal '~' directory inside the current working directory.
base_dir = os.path.expanduser('.')

# exist_ok avoids the check-then-create race of `if not exists: makedirs`.
os.makedirs(base_dir, exist_ok=True)

In [208]:
# MODEL DATA DIRS

# Resolve a leading '~' and fail with an explicit error if the base directory
# is missing (an `assert` would be stripped under `python -O` and carries no
# message).
base_dir = os.path.expanduser(base_dir)
if not os.path.isdir(base_dir):
    raise NotADirectoryError(f"base_dir is not an existing directory: {base_dir!r}")

In [209]:
# TRAINING FOR MODEL (per key)

def train_model(key, base_dir, epochs=2):
    """Train the DenseNet setup selected by ``key`` and evaluate it on the test set.

    Creates ``<base_dir>/<key>/checkpoints`` and ``<base_dir>/<key>/logs``,
    builds and compiles the model on ``/gpu:0``, trains it on ``trn`` with
    validation on ``val`` (SGD with Nesterov momentum and a step-wise learning
    rate schedule), then reloads the latest checkpoint and prints the loss and
    accuracy on ``tst``.

    Args:
        key: Short setup key, i.e. the first element of a pair in ``choices``.
        base_dir: Existing directory under which the per-model directory is created.
        epochs: Number of training epochs; also drives the learning-rate schedule.

    Returns:
        Tuple ``(model, chk_dir)``: the trained model and its checkpoint directory.

    Raises:
        IndexError: If ``key`` is not present in ``choices``.
    """

    # DIRS SETUP
    base_dir = os.path.join(base_dir, key)
    chk_dir = os.path.join(base_dir, 'checkpoints')
    log_dir = os.path.join(base_dir, 'logs')
    for path in (chk_dir, log_dir):
        # Also creates the per-model directory itself; safe to re-run.
        os.makedirs(path, exist_ok=True)

    strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
    #strategy = tf.distribute.MirroredStrategy()

    with strategy.scope():
        par_key = [par_key for ckey, par_key in choices if ckey == key][0]
        model = densenet(summary=False, **PARAMS[par_key])
        optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=momentum, nesterov=True)
        model.compile(optimizer=optimizer,
                      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                      metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

    def scheduler(epoch):
        # Step decay: base rate for the first half of training, a tenth of it
        # up to 75 % of the epochs, and a hundredth afterwards.
        if epoch < epochs * 0.5:
            return learning_rate
        if epoch < epochs * 0.75:
            return learning_rate / 10.0
        return learning_rate / 100.0

    # CALLBACKS
    lr_sched_callback = tf.keras.callbacks.LearningRateScheduler(scheduler)
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, profile_batch=0)
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        #filepath=os.path.join( chk_dir, "checkpoint-weights-{epoch:02d}-{val_loss:.2f}.ckpt"),
        filepath=os.path.join(chk_dir, "checkpoint-weights-{epoch:02d}.ckpt"),
        save_weights_only=True, verbose=1)
        #save_weights_only=True, verbose=1, save_best_only=True)

    # ACTUAL TRAINING
    # `trn` repeats indefinitely, so the epoch length is fixed via steps_per_epoch.
    _ = model.fit(x=trn,
                  steps_per_epoch=trn_size // batch_size,
                  epochs=epochs,
                  callbacks=[lr_sched_callback, tensorboard_callback, checkpoint_callback],
                  validation_data=val,
                  verbose=2)

    # MODEL EVALUATION
    model.load_weights(tf.train.latest_checkpoint(chk_dir))
    results = model.evaluate(tst)
    print("\ntest loss {}, test acc: {}".format(results[0], results[1]))

    return model, chk_dir

## Estimating number of allowed training epochs

Each model is first trained for two epochs (the first one always takes slightly longer), and the training time of the second epoch is recorded. Based on that time, we estimate how many epochs fit into the 30-minute training limit (minus one, to accommodate the longer first epoch).

In [None]:
# ESTIMATING No OF EPOCHS FOR 'd122_gr4' SETUP
_ = train_model('d122_gr4', base_dir)
tt1 = 95  # measured training time of the second epoch, in seconds
# Epochs fitting into the 30-minute budget, minus one for the slower first epoch.
epochs1 = (30 * 60) // tt1 - 1  # 17

In [None]:
# ESTIMATING No OF EPOCHS FOR 'd97_gr5' SETUP
_ = train_model('d97_gr5', base_dir)
tt2 = 79  # measured training time of the second epoch, in seconds
# Epochs fitting into the 30-minute budget, minus one for the slower first epoch.
epochs2 = (30 * 60) // tt2 - 1  # 21

In [None]:
# ESTIMATING No OF EPOCHS FOR 'd56_gr8' SETUP
_ = train_model('d56_gr8', base_dir)
tt3 = 50  # measured training time of the second epoch, in seconds
# Epochs fitting into the 30-minute budget, minus one for the slower first epoch.
epochs3 = (30 * 60) // tt3 - 1  # 35

In [None]:
# ESTIMATING No OF EPOCHS FOR 'd35_gr12' SETUP
_ = train_model('d35_gr12', base_dir)
tt4 = 42  # measured training time of the second epoch, in seconds
# Epochs fitting into the 30-minute budget, minus one for the slower first epoch.
epochs4 = (30 * 60) // tt4 - 1  # 41

## Proper Model Training and Evaluation

In [None]:
# TRAINING AND EVALUATION OF MODEL FOR 'd122_gr4' SETUP
# (m1: trained model, cd1: its checkpoint directory)
m1, cd1 = train_model(key='d122_gr4', base_dir=base_dir, epochs=epochs1)

In [None]:
# TRAINING AND EVALUATION OF MODEL FOR 'd97_gr5' SETUP
# (m2: trained model, cd2: its checkpoint directory)
m2, cd2 = train_model(key='d97_gr5', base_dir=base_dir, epochs=epochs2)

In [None]:
# TRAINING AND EVALUATION OF MODEL FOR 'd56_gr8' SETUP
# (m3: trained model, cd3: its checkpoint directory)
m3, cd3 = train_model(key='d56_gr8', base_dir=base_dir, epochs=epochs3)

In [None]:
# TRAINING AND EVALUATION OF MODEL FOR 'd35_gr12' SETUP
# (m4: trained model, cd4: its checkpoint directory)
m4, cd4 = train_model(key='d35_gr12', base_dir=base_dir, epochs=epochs4)

### Results Summary

|       -        | Training Set |     -    | Validation Set |     -    | Test Set |    -     |
|---------------:|:------------:|:--------:|:--------------:|:--------:|:--------:|:--------:|
|     Model      |     Loss     |    Acc   |      Loss      |    Acc   |   Loss   |    Acc   |
| ```d122_gr4``` |     1.91     |   0.60   |      2.56      |   0.47   |  2.5226  |  0.4746  |
|  ```d97_gr5``` |     1.88     |   0.61   |      2.54      |   0.48   |  2.5653  |  0.4789  |
|  ```d56_gr8``` |     1.81     |   0.63   |      2.74      |   0.46   |  2.7928  |  0.4583  |
| ```d35_gr12``` |     1.81     |   0.65   |      2.59      |   0.50   |  2.5738  |  0.5048  |