# Mạth - Assignment 7

Using Tensorflow and Keras to implement  the following architectures:


1. Lenet as shown in the below figure.
Do the ablation study with the following hyper-paramters:
- Activation Function: Sigmoid vs tanh vs ReLU. 
- Optimizer: SGD vs Adam
- Number of filters (or depth of tensors): (6 x 6 x 16 x 16) vs (3 x 3 x 8 x 8) vs (12 x 12 x 32 x 32)
Note that, at each configuration, keep the other parameters unchanged. Moreover, please plot loss functions for train and validation sets. What can we conclude from the above experiments?


2. Mini-ResNet as shown in the below figure.
Compare the performance of LeNet and Mini-ResNet.


Conclusion:
- Original Letnet is good with SGD
- The accuracy will drop when we decrease the filters (3,8) and increase significant when we increase the filter (12,32)
- MiniResnet is not good than Lenet

In [39]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv2D, Dense, Flatten, MaxPooling2D
from tensorflow.keras.layers import GlobalAveragePooling2D, Add, Dense, Activation, BatchNormalization
from sklearn.utils import shuffle
import wandb

tf.random.set_seed(2021)
from tensorflow.python.client import device_lib 
import os

print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 8947948572853959721
]


In [40]:
class Struct:
    def __init__(self, **entries):
        self.__dict__.update(entries)

args = {
    "activation": "relu", #tanh, sigmoid, relu
    "optimizer": "adam", #sgd, adam
    "filters": [12,32], # (6 x 6 x 16 x 16) vs (3 x 3 x 8 x 8) vs (12 x 12 x 32 x 32)
    "model": "miniresnet" #miniresnet, lenet
}
    
params = Struct(**args)

### Tensorflow implementation


In [41]:
## Lenet
class LeNet(Model):
    def __init__(self):
        super().__init__()
        self.conv1 = Conv2D(params.filters[0], kernel_size=(5, 5),
                            padding='valid', activation=params.activation)
        self.pooling1 = MaxPooling2D(padding='same')
        self.conv2 = Conv2D(params.filters[1], kernel_size=(5, 5),
                            padding='valid', activation=params.activation)
        self.pooling2 = MaxPooling2D(padding='same')
        self.flat = Flatten()
        self.fc1 = Dense(120, activation=params.activation)
        self.fc2 = Dense(84, activation=params.activation)
        self.out = Dense(10, activation='softmax')

    def call(self, x):
        x = self.conv1(x)
        x = self.pooling1(x)
        x = self.conv2(x)
        x = self.pooling2(x)
        x = self.flat(x)
        x = self.fc1(x)
        x = self.fc2(x)
        y = self.out(x)

        return y

## Mini Resnet
class MiniResnet(Model):
    def __init__(self):
        super().__init__()
        self.conv1 = Conv2D(8, kernel_size=(7, 7),strides=(2, 2),padding='same')
        self.bn1 = BatchNormalization()
        self.relu1 = Activation('relu')
        self.pool1 = MaxPooling2D(pool_size=(3, 3),strides=(2, 2), padding='same')
        self.block1 = [
            self._building_block(8) for _ in range(3)
        ]
        self.conv2 = Conv2D(8,kernel_size=(1, 1), strides=(2, 2))
        self.block2 = [
            self._building_block(8) for _ in range(4)
        ]
        self.avg_pool = GlobalAveragePooling2D()
        self.fc = Dense(128, activation='relu')
        self.out = Dense(10, activation='softmax')

    def call(self, x):
        h = self.conv1(x)
        h = self.bn1(h)
        h = self.relu1(h)
        h = self.pool1(h)
        for block in self.block1:
            h = block(h)
        h = self.conv2(h)
        for block in self.block2:
            h = block(h)
        h = self.avg_pool(h)
        h = self.fc(h)
        y = self.out(h)
        return y

    def _building_block(self, channel_out=64):
        return Block(channel_out=channel_out)

class Block(Model):
    def __init__(self, channel_out=64):
        super().__init__()
        self.conv1 = Conv2D(channel_out,kernel_size=(3, 3),padding='same')
        self.bn1 = BatchNormalization()
        self.relu1 = Activation('relu')
        self.conv2 = Conv2D(channel_out,kernel_size=(3, 3),padding='same')
        self.bn2 = BatchNormalization()
        self.add = Add()
        self.relu2 = Activation('relu')

    def call(self, x):
        h = self.conv1(x)
        h = self.bn1(h)
        h = self.relu1(h)
        h = self.conv2(h)
        h = self.bn2(h)
        h = self.add([x, h])
        y = self.relu2(h)
        return y


In [42]:
def normalize(X_train, X_test):
    X_train = X_train / 255.
    X_test = X_test / 255.

    mean = np.mean(X_train, axis=(0, 1, 2, 3))
    std = np.std(X_train, axis=(0, 1, 2, 3))
    print('mean:', mean, 'std:', std)
    X_train = (X_train - mean) / (std + 1e-7)
    X_test = (X_test - mean) / (std + 1e-7)
    return X_train, X_test

def prepare_cifar(x, y):
    x = tf.cast(x, tf.float32)
    y = tf.cast(y, tf.int32)
    return x, y


In [43]:
#Load data
mnist = tf.keras.datasets.cifar10
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = normalize(x_train, x_test)

train_loader = tf.data.Dataset.from_tensor_slices((x_train,y_train))
train_loader = train_loader.map(prepare_cifar).shuffle(50000).batch(256)

test_loader = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_loader = test_loader.map(prepare_cifar).shuffle(10000).batch(256)

#Build model
if params.model == "miniresnet":
    model = MiniResnet()
else:
    model = LeNet()
criterion = tf.losses.CategoricalCrossentropy()
# criterion = keras.losses.CategoricalCrossentropy(from_logits=True)

if params.optimizer == "adam":
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
else:
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.0, nesterov=False, name='SGD')
metric = tf.keras.metrics.CategoricalAccuracy()

#Train model
epochs = 50
batch_size = 512
n_batches = x_train.shape[0] // batch_size

train_loss = tf.keras.metrics.Mean()
train_acc = tf.keras.metrics.CategoricalAccuracy()
test_loss = tf.keras.metrics.Mean()
test_acc = tf.keras.metrics.CategoricalAccuracy()
wandb.init()

for epoch in range(epochs):
    for step, (x, y) in enumerate(train_loader):
        y = tf.squeeze(y, axis=1)
        y = tf.one_hot(y, depth=10) # [b, 10]

        with tf.GradientTape() as tape:
            logits = model(x)
            loss = criterion(y, logits)
            metric.update_state(y, logits)

        grads = tape.gradient(loss, model.trainable_variables)
        grads = [ tf.clip_by_norm(g, 15) for g in grads]
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if step % 40 == 0:
            print(epoch, step, 'loss:', float(loss), 'acc:', metric.result().numpy())
            acc = metric.result().numpy()
            loss = float(loss)
            wandb.log({'acc': acc, 'loss': loss})
            metric.reset_states()
            
    if epoch % 1 == 0:
        metric = tf.keras.metrics.CategoricalAccuracy()
        for x, y in test_loader:
            y = tf.squeeze(y, axis=1)
            y = tf.one_hot(y, depth=10)

            logits = model.predict(x)
            # be careful, these functions can accept y as [b] without warnning.
            metric.update_state(y, logits)
        print('test acc:', metric.result().numpy())
        val_acc = metric.result().numpy()
        val_loss = float(loss)  
        wandb.log({'val_acc': val_acc, 'val_loss': val_loss})

        metric.reset_states()
# Mark the run as finished
wandb.finish()

mean: 0.4733630004850874 std: 0.25156892506322026


0 0 loss: 2.3706233501434326 acc: 0.1015625
0 40 loss: 2.3008153438568115 acc: 0.09443359
0 80 loss: 2.2899322509765625 acc: 0.11621094
0 120 loss: 2.259572982788086 acc: 0.13896485
0 160 loss: 2.068004608154297 acc: 0.17197266
test acc: 0.2571
1 0 loss: 1.9624017477035522 acc: 0.2109375
1 40 loss: 1.9093332290649414 acc: 0.26279297
1 80 loss: 1.8682427406311035 acc: 0.27871093
1 120 loss: 1.8567272424697876 acc: 0.27246094
1 160 loss: 1.8426616191864014 acc: 0.27626953
test acc: 0.2976
2 0 loss: 1.8378270864486694 acc: 0.26953125
2 40 loss: 1.869619369506836 acc: 0.29560548
2 80 loss: 1.8038009405136108 acc: 0.30605468
2 120 loss: 1.819007158279419 acc: 0.31513673
2 160 loss: 1.827231764793396 acc: 0.31572264
test acc: 0.3274
3 0 loss: 1.814780592918396 acc: 0.265625
3 40 loss: 1.74114191532135 acc: 0.3227539
3 80 loss: 1.8146960735321045 acc: 0.33398438
3 120 loss: 1.7386471033096313 acc: 0.32685548
3 160 loss: 1.8090534210205078 acc: 0.33320314
test acc: 0.3381
4 0 loss: 1.716461420

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
acc,0.50215
loss,1.28525
_runtime,1169.0
_timestamp,1627319974.0
_step,299.0
val_acc,0.4976
val_loss,1.35028


0,1
acc,▁▄▅▅▆▅▆▆▆▆▆▆▇▇▇▇▇▇█▇▇▇▇▇▇▇▇▇██▇█████████
loss,█▅▅▅▄▄▄▃▃▃▃▃▂▃▂▂▂▃▁▂▂▃▃▃▂▁▁▂▂▂▂▁▁▂▂▂▁▂▂▁
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
val_acc,▁▂▃▃▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇█▇███████
val_loss,█▆▆▅▆▅▅▅▄▂▄▄▅▂▁▄▁▃▆▃▃▃▂▁▁▄▃▂▃▃▃▁▂▂▄▃▁▃▁▂
