## Imports

In [1]:
import wandb
from wandb.keras import WandbCallback

import numpy as np
from typing import Union
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model, Input, Sequential
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import (
    layers,
    losses,
    metrics,
    datasets,
    mixed_precision,
    optimizers,
    callbacks
)

## Experiment Configs

In [2]:
# Single source of truth for the experiment; the whole dict is also passed to
# wandb.init(config=...) further down so every run records these values.
CONFIGS = {
    # --- data ---
    "dataset_name": "CIFAR-10",
    "image_size": 32,    # side length of the raw input images (model Input shape)
    "target_size": 72,   # side length after the Resizing preprocessing step
    # --- architecture ---
    "patch_size": 9,     # Conv2D kernel size and stride used for patch embedding
    "num_mixer_layers": 5,
    "embedding_dim": 128,
    "channels_mlp_dim": 128,
    "num_classes": 10,
    "dropout": 0.5,
    # --- optimisation ---
    "batch_size": 256,
    "learning_rate": 0.001,
    "epochs": 50,
    "label_smoothing": 0.01,
    "mixed_precision": True,
    # --- human-readable labels, handed to WandbCallback(labels=...) ---
    "class_names": [
        "airplane",
        "automobile",
        "bird",
        "cat",
        "deer",
        "dog",
        "frog",
        "horse",
        "ship",
        "truck",
    ],
}

In [3]:
# Switch Keras to the global "mixed_float16" dtype policy when the config asks
# for it. This cell runs before any model is built in this notebook.
if CONFIGS["mixed_precision"]:
    mixed_precision.set_global_policy("mixed_float16")

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA RTX A6000, compute capability 8.6


2022-01-29 08:31:21.195145: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-29 08:31:21.239387: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-29 08:31:21.241010: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-29 08:31:21.242997: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


## Fetch CIFAR-10 Dataset

In [4]:
def get_cifar10(num_classes: int):
    """Load CIFAR-10 and one-hot encode its labels.

    Args:
        num_classes: Number of classes passed to
            ``keras.utils.to_categorical`` for both label arrays.

    Returns:
        ``(x_train, y_train), (x_test, y_test)`` where the images are exactly
        what ``datasets.cifar10.load_data()`` returned and the labels have
        been converted with ``to_categorical``.
    """
    train_split, test_split = datasets.cifar10.load_data()
    train_images, train_labels = train_split
    test_images, test_labels = test_split
    train_targets = keras.utils.to_categorical(train_labels, num_classes)
    test_targets = keras.utils.to_categorical(test_labels, num_classes)
    return (train_images, train_targets), (test_images, test_targets)


# Load the dataset once at notebook scope; later cells use these arrays directly.
# The class count comes from CONFIGS so the one-hot width cannot drift away from
# the width of the model's output layer.
(x_train, y_train), (x_test, y_test) = get_cifar10(
    num_classes=CONFIGS["num_classes"]
)
print("x_train.shape:", x_train.shape)
print("y_train.shape:", y_train.shape)
print("x_test.shape:", x_test.shape)
print("y_test.shape:", y_test.shape)

x_train.shape: (50000, 32, 32, 3)
y_train.shape: (50000, 10)
x_test.shape: (10000, 32, 32, 3)
y_test.shape: (10000, 10)


## MLP-Mixer Model

In [5]:
def get_preprocessing_layer(
    data_batch: Union[np.ndarray, tf.Tensor], target_size: int
) -> Sequential:
    """Build the in-model preprocessing stack: normalize, then resize.

    Args:
        data_batch: Data the ``Normalization`` layer is adapted on.
        target_size: Height and width handed to the ``Resizing`` layer.

    Returns:
        A ``Sequential`` named ``"preprocessing"`` containing the adapted
        normalization layer followed by the resizing layer.
    """
    normalizer = preprocessing.Normalization()
    # adapt() fits the layer's statistics to the supplied data before use.
    normalizer.adapt(data_batch)

    pipeline = Sequential(name="preprocessing")
    pipeline.add(normalizer)
    pipeline.add(preprocessing.Resizing(target_size, target_size))
    return pipeline


def get_augmentation_layer() -> Sequential:
    """Build the in-model augmentation stack.

    Returns:
        A ``Sequential`` named ``"augmentation"`` that applies, in order, a
        random horizontal flip, a random rotation (factor 0.02) and a random
        zoom (height and width factor 0.2).
    """
    random_transforms = [
        preprocessing.RandomFlip("horizontal"),
        preprocessing.RandomRotation(factor=0.02),
        preprocessing.RandomZoom(height_factor=0.2, width_factor=0.2),
    ]
    return keras.Sequential(random_transforms, name="augmentation")

In [6]:
def patch_embedding(
    inputs: tf.Tensor, embedding_dim: int, patch_size: int
) -> tf.Tensor:
    """Turn an image tensor into a sequence of patch embeddings.

    A ``Conv2D`` whose kernel size and stride both equal ``patch_size``
    projects each patch to ``embedding_dim`` channels; the two spatial axes
    of the result are then merged into a single patch axis.

    Args:
        inputs: Image tensor fed to the convolution.
        embedding_dim: Number of convolution filters.
        patch_size: Kernel size and stride of the convolution.

    Returns:
        Tensor reshaped to ``(height * width, channels)`` per example, where
        the sizes are taken from the convolution output.
    """
    projector = layers.Conv2D(
        filters=embedding_dim,
        kernel_size=patch_size,
        strides=patch_size,
        name="patch_embedding_conv2d",
    )
    patch_grid = projector(inputs)

    # Static shape of the convolution output: (batch, rows, cols, channels).
    grid_shape = patch_grid.shape
    num_patches = grid_shape[1] * grid_shape[2]
    flatten_patches = layers.Reshape(
        (num_patches, grid_shape[3]), name="patch_embedding_reshape"
    )
    return flatten_patches(patch_grid)


def mlp_block(inputs: tf.Tensor, mlp_dim: int, name: str) -> tf.Tensor:
    """Two-layer MLP applied along the last axis: Dense -> GELU -> Dense.

    The second Dense projects back to the width of ``inputs`` so the result
    can be added to the residual stream by the caller. Sizing it from the
    hidden activation instead would make the output ``mlp_dim`` wide, which
    only matches the residual when ``mlp_dim`` happens to equal the input
    width.

    Args:
        inputs: Tensor whose last axis is mixed.
        mlp_dim: Width of the hidden Dense layer.
        name: Prefix for the names of the three layers created here.

    Returns:
        Tensor with the same last-axis width as ``inputs``.
    """
    # Capture the input width before `inputs` is transformed.
    output_dim = inputs.shape[-1]
    hidden = layers.Dense(mlp_dim, name=name + "_dense_1")(inputs)
    hidden = layers.Activation("gelu", name=name + "_gelu_activation")(hidden)
    return layers.Dense(output_dim, name=name + "_dense_2")(hidden)


def mixer_block(
    inputs: tf.Tensor, tokens_mlp_dim: int, channels_mlp_dim: int, name: str
) -> tf.Tensor:
    """One Mixer block: token mixing then channel mixing, each with a skip.

    Args:
        inputs: Sequence tensor; its two non-batch axes are swapped around
            the token-mixing MLP.
        tokens_mlp_dim: Hidden width of the token-mixing MLP.
        channels_mlp_dim: Hidden width of the channel-mixing MLP.
        name: Prefix for every layer name created in this block.

    Returns:
        Output of the second residual addition.
    """
    # ---- Token mixing: normalize, swap axes, MLP, swap back, add residual ----
    normed_tokens = layers.LayerNormalization(name=name + "_layer_norm_1")(inputs)
    transposed = layers.Permute((2, 1), name=name + "_swap_axes_1")(normed_tokens)
    mixed_tokens = mlp_block(transposed, tokens_mlp_dim, name=name + "_mlp_block_1")
    restored = layers.Permute((2, 1), name=name + "_swap_axes_2")(mixed_tokens)
    token_residual = layers.Add(name=name + "_skip_connection_token")(
        [inputs, restored]
    )

    # ---- Channel mixing: normalize, MLP, add residual ----
    normed_channels = layers.LayerNormalization(name=name + "_layer_norm_2")(
        token_residual
    )
    mixed_channels = mlp_block(
        normed_channels, channels_mlp_dim, name=name + "_mlp_block_2"
    )
    return layers.Add(name=name + "_skip_connection_channel")(
        [token_residual, mixed_channels]
    )


def get_mlp_mixer_model(
    num_mixer_blocks: int,
    patch_size: int,
    embedding_dim: int,
    channels_mlp_dim: int,
    num_classes: int,
    preprocessing_layer: Union[Sequential, None],
    augmentation_layer: Union[Sequential, None],
    image_size: Union[int, None] = None,
    dropout: Union[float, None] = None,
) -> Model:
    """Assemble the MLP-Mixer classifier as a Keras functional model.

    Pipeline: input -> optional preprocessing -> optional augmentation ->
    patch embedding -> ``num_mixer_blocks`` mixer blocks -> layer norm ->
    dropout -> global average pooling -> softmax classifier.

    Args:
        num_mixer_blocks: Number of stacked mixer blocks.
        patch_size: Patch side length used by the patch-embedding convolution.
        embedding_dim: Channel width of the patch embeddings.
        channels_mlp_dim: Hidden width of each channel-mixing MLP.
        num_classes: Number of units in the softmax output layer.
        preprocessing_layer: Applied directly to the input, or skipped if None.
        augmentation_layer: Applied after preprocessing, or skipped if None.
        image_size: Side length of the square RGB input. Defaults to
            ``CONFIGS["image_size"]`` when None.
        dropout: Rate of the dropout layer placed before pooling. Defaults to
            ``CONFIGS["dropout"]`` when None.

    Returns:
        An uncompiled ``Model`` named ``"mlp_mixer"``.
    """
    # Fall back to the notebook-level CONFIGS so existing callers behave
    # exactly as before; explicit arguments let callers override them.
    if image_size is None:
        image_size = CONFIGS["image_size"]
    if dropout is None:
        dropout = CONFIGS["dropout"]

    inputs = Input(shape=(image_size, image_size, 3), name="Input")
    preprocessed_inputs = (
        preprocessing_layer(inputs) if preprocessing_layer is not None else inputs
    )
    augmented_inputs = (
        augmentation_layer(preprocessed_inputs)
        if augmentation_layer is not None
        else preprocessed_inputs
    )

    x = patch_embedding(augmented_inputs, embedding_dim, patch_size)
    # The token-mixing MLP's hidden width is set to the number of patches.
    tokens_mlp_dim = x.shape[-2]
    for idx in range(num_mixer_blocks):
        x = mixer_block(
            x, tokens_mlp_dim, channels_mlp_dim, name=f"mixer_block_{idx}"
        )

    x = layers.LayerNormalization(name="layer_norm_post_mixer")(x)
    x = layers.Dropout(dropout, name="dropout")(x)
    x = layers.GlobalAveragePooling1D(name="global_average_pooling")(x)
    # The classifier is pinned to float32 regardless of the global dtype policy.
    outputs = layers.Dense(
        num_classes, activation="softmax", dtype="float32", name="output"
    )(x)
    return Model(inputs, outputs, name="mlp_mixer")

## Callbacks

Custom Callback to log the Linear Projections of the Patch Embedding Layer.

In [7]:
class LogProjectionCallback(callbacks.Callback):
    """Log the patch-embedding convolution filters to W&B after every epoch.

    The first weight array of the layer named ``"patch_embedding_conv2d"`` is
    min-max scaled to [0, 1] and each of the first ``embedding_dim`` filters
    is logged as one ``wandb.Image`` under the key ``"Projection of Patches"``.

    Args:
        embedding_dim: Number of filters (slices along the last weight axis)
            to log.
    """

    def __init__(self, embedding_dim: int):
        super().__init__()
        self.embedding_dim = embedding_dim

    def on_epoch_end(self, epoch, logs=None):
        """Scale the current filters to [0, 1] and log them at step ``epoch``."""
        # Use the model Keras attached to this callback rather than a notebook
        # global, so the callback works with whichever model is being fitted.
        projections = self.model.get_layer("patch_embedding_conv2d").get_weights()[0]

        projections_min, projections_max = projections.min(), projections.max()
        value_range = projections_max - projections_min
        projections = projections - projections_min
        # A constant kernel has zero range; skip the division to avoid NaNs.
        if value_range > 0:
            projections = projections / value_range

        log_data = [
            wandb.Image(projections[:, :, :, idx])
            for idx in range(self.embedding_dim)
        ]
        wandb.log({"Projection of Patches": log_data}, step=epoch)

## Experiment: Train and Eval

In [None]:
wandb.login()

# The run is opened as a context manager so it is closed when the block exits.
with wandb.init(project='mlp-mixer', name="cifar-10-best-hyperparams", config=CONFIGS):

    # Define MLP-Mixer Model as per the Configs
    model = get_mlp_mixer_model(
        num_mixer_blocks=CONFIGS["num_mixer_layers"],
        patch_size=CONFIGS["patch_size"],
        embedding_dim=CONFIGS["embedding_dim"],
        channels_mlp_dim=CONFIGS["channels_mlp_dim"],
        num_classes=CONFIGS["num_classes"],
        preprocessing_layer=get_preprocessing_layer(
            data_batch=x_train, target_size=CONFIGS["target_size"]
        ),
        augmentation_layer=get_augmentation_layer(),
    )

    # Compile the Model.
    # The configured label-smoothing factor is passed to the loss here; without
    # it the value logged in the run config would not match the actual training.
    model.compile(
        optimizer=optimizers.Adam(learning_rate=CONFIGS["learning_rate"]),
        loss=losses.CategoricalCrossentropy(
            label_smoothing=CONFIGS["label_smoothing"]
        ),
        metrics=[
            metrics.CategoricalAccuracy(name="accuracy"),
            metrics.TopKCategoricalAccuracy(3, name="top-3-accuracy"),
            metrics.TopKCategoricalAccuracy(5, name="top-5-accuracy"),
        ],
    )

    # Define Log Projection Callback
    log_projection_callback = LogProjectionCallback(
        embedding_dim=CONFIGS["embedding_dim"]
    )

    # Define Wandb Callback
    wandb_callback = WandbCallback(
        data_type='image',
        save_model=True,
        training_data=(x_train, y_train),
        validation_data=(x_test, y_test),
        labels=CONFIGS["class_names"]
    )

    # Run Training (the last 10% of the training arrays is held out for validation)
    history = model.fit(
        x=x_train,
        y=y_train,
        batch_size=CONFIGS["batch_size"],
        epochs=CONFIGS["epochs"],
        validation_split=0.1,
        callbacks=[
            log_projection_callback, wandb_callback
        ]
    )

    # Log Test Error Rate (percentage, rounded to two decimals).
    # evaluate() returns the loss followed by the compiled metrics in order.
    loss, accuracy, top_3_accuracy, top_5_accuracy = model.evaluate(x_test, y_test)
    wandb.log({'Test Error Rate': round((1 - accuracy) * 100, 2)})

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgeekyrakshit[0m (use `wandb login --relogin` to force relogin)


2022-01-29 08:31:31.285158: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-29 08:31:31.285695: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-29 08:31:31.287270: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-29 08:31:31.288678: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so retur

Epoch 1/50


2022-01-29 08:31:42.438880: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8201
2022-01-29 08:31:44.149284: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-01-29 08:31:44.149686: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-01-29 08:31:44.149711: W tensorflow/stream_executor/gpu/asm_compiler.cc:77] Couldn't get ptxas version string: Internal: Couldn't invoke ptxas --version
2022-01-29 08:31:44.150099: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-01-29 08:31:44.150157: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] Internal: Failed to launch ptxas
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.
2022-01-29 08:31:45.739538: I tensorflow/stream_executor/cuda/c





Epoch 2/50




Epoch 3/50




Epoch 4/50




Epoch 5/50




Epoch 6/50




Epoch 7/50




Epoch 8/50




Epoch 9/50




Epoch 10/50




Epoch 11/50




Epoch 12/50




Epoch 13/50




Epoch 14/50




Epoch 15/50




Epoch 16/50




Epoch 17/50




Epoch 18/50




Epoch 19/50




Epoch 20/50




Epoch 21/50




Epoch 22/50




Epoch 23/50




Epoch 24/50




Epoch 25/50




Epoch 26/50




Epoch 27/50




Epoch 28/50




Epoch 29/50




Epoch 30/50




Epoch 31/50




Epoch 32/50




Epoch 33/50




Epoch 34/50




Epoch 35/50




Epoch 36/50




Epoch 37/50




Epoch 38/50




Epoch 39/50




Epoch 40/50




Epoch 41/50




Epoch 42/50




Epoch 43/50




Epoch 44/50




Epoch 45/50




Epoch 46/50




Epoch 47/50




Epoch 48/50




Epoch 49/50




Epoch 50/50






## Hyperparameter Tuning

In [None]:
def _one_of(*candidates):
    """Wrap a discrete set of candidate values in the sweep parameter format."""
    return {"values": list(candidates)}


# Bayesian search minimizing `val_loss`, with hyperband early termination
# (min_iter=5), over four discrete hyperparameters.
sweep_config = {
    "method": "bayes",
    "metric": {"name": "val_loss", "goal": "minimize"},
    "early_terminate": {"type": "hyperband", "min_iter": 5},
    "parameters": {
        "batch_size": _one_of(256, 512, 1024),
        "label_smoothing": _one_of(0.0, 0.01, 0.001),
        "dropout": _one_of(0.0, 0.25, 0.5),
        "num_mixer_layers": _one_of(2, 3, 4, 5, 6),
    },
}

In [None]:
def sweep_train_fn():
    """Train and evaluate one MLP-Mixer configuration for a W&B sweep run.

    Starts a run in the ``mlp-mixer`` project seeded with ``CONFIGS`` (with
    ``epochs`` set to 15), builds and compiles the model from ``wandb.config``,
    trains with a 10% validation split, then logs the test error rate as a
    percentage under ``'Test Error Rate'``.

    Every value is read from ``wandb.config`` so that the hyperparameters the
    sweep varies (including ``label_smoothing`` and ``dropout``) actually
    reach the model and the loss.
    """
    # Sweep runs use a shorter 15-epoch schedule. The value is merged into the
    # initial config instead of overwriting an existing config key after init.
    run_defaults = {**CONFIGS, "epochs": 15}

    # Context manager mirrors the main experiment cell and closes the run even
    # if training raises.
    with wandb.init(project='mlp-mixer', config=run_defaults):
        config = wandb.config

        # In this call get_mlp_mixer_model takes its dropout rate from the
        # module-level CONFIGS, so the run's value is swapped in while the
        # model is built and the original is restored afterwards.
        baseline_dropout = CONFIGS["dropout"]
        CONFIGS["dropout"] = config.dropout
        try:
            model = get_mlp_mixer_model(
                num_mixer_blocks=config.num_mixer_layers,
                patch_size=config.patch_size,
                embedding_dim=config.embedding_dim,
                channels_mlp_dim=config.channels_mlp_dim,
                num_classes=config.num_classes,
                preprocessing_layer=get_preprocessing_layer(
                    data_batch=x_train, target_size=config.target_size
                ),
                augmentation_layer=get_augmentation_layer(),
            )
        finally:
            CONFIGS["dropout"] = baseline_dropout

        # Compile the Model; label smoothing comes from the run config.
        model.compile(
            optimizer=optimizers.Adam(learning_rate=config.learning_rate),
            loss=losses.CategoricalCrossentropy(
                label_smoothing=config.label_smoothing
            ),
            metrics=[
                metrics.CategoricalAccuracy(name="accuracy"),
                metrics.TopKCategoricalAccuracy(3, name="top-3-accuracy"),
                metrics.TopKCategoricalAccuracy(5, name="top-5-accuracy"),
            ],
        )

        wandb_callback = WandbCallback(save_model=True)
        model.fit(
            x=x_train,
            y=y_train,
            batch_size=config.batch_size,
            epochs=config.epochs,
            validation_split=0.1,
            callbacks=[wandb_callback]
        )

        # evaluate() returns the loss followed by the compiled metrics in order.
        loss, accuracy, top_3_accuracy, top_5_accuracy = model.evaluate(x_test, y_test)
        wandb.log({'Test Error Rate': round((1 - accuracy) * 100, 2)})

In [None]:
# Authenticate with W&B, register `sweep_config` as a sweep in the "mlp-mixer"
# project, then start an agent that uses `sweep_train_fn` as its run function.
wandb.login()
sweep_id = wandb.sweep(sweep_config, project="mlp-mixer")
wandb.agent(sweep_id, function=sweep_train_fn)