<a href="https://colab.research.google.com/github/theboredman/CSE468/blob/main/Quiz_1/CNN/Using_CNN_CIFAR100.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [10]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"  # @param ["tensorflow", "jax", "torch"]

import keras
from keras import layers
from keras import ops

import numpy as np
import matplotlib.pyplot as plt

## Prepare the data

In [11]:
# Dataset constants: number of target classes and the per-image input shape
# (the shapes printed below confirm 32x32 RGB images).
num_classes = 100
input_shape = (32, 32, 3)

# load_data() returns ((x_train, y_train), (x_test, y_test)).
train_split, test_split = keras.datasets.cifar100.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

# Sanity-check the array shapes of both splits.
print(f"x_train shape: {x_train.shape} - y_train shape: {y_train.shape}")
print(f"x_test shape: {x_test.shape} - y_test shape: {y_test.shape}")


x_train shape: (50000, 32, 32, 3) - y_train shape: (50000, 1)
x_test shape: (10000, 32, 32, 3) - y_test shape: (10000, 1)


## Configure the hyperparameters

In [19]:
# --- Optimisation -----------------------------------------------------------
learning_rate = 0.001
weight_decay = 0.0001
batch_size = 128
num_epochs = 10

# --- Model input / head -----------------------------------------------------
# Side length (in pixels) that input images are resized to.
image_size = 72
# Widths of the dense layers of the final classifier.
mlp_head_units = [2048, 1024]

# --- Schedule / regularisation ----------------------------------------------
# Starting learning rate for the optimizer and the LR schedule; kept equal to
# `learning_rate` so the two names cannot drift apart.
initial_learning_rate = learning_rate
label_smoothing = 0.1
dropout_rate = 0.3

## Use data augmentation

In [13]:
# Preprocessing + augmentation pipeline that is embedded in the model.
#
# Layer order matters: RandomContrast and RandomBrightness assume pixel values
# in their value range (default [0, 255]) and clip their output to it. They
# must therefore see the raw pixel range, i.e. run *before* Normalization
# standardizes the data. With Normalization first, every negative standardized
# value is clipped to 0 while training, whereas at inference (where the random
# layers are inactive) it is not.
normalization = layers.Normalization()
# Compute the mean and the variance of the training data for normalization.
normalization.adapt(x_train)

data_augmentation = keras.Sequential(
    [
        layers.Resizing(image_size, image_size),
        # Geometric augmentations.
        layers.RandomFlip("horizontal"),
        layers.RandomRotation(factor=0.1),
        layers.RandomZoom(height_factor=0.2, width_factor=0.2),
        layers.RandomTranslation(height_factor=0.1, width_factor=0.1),
        # Photometric augmentations (operate on, and clip to, [0, 255]).
        layers.RandomContrast(factor=0.2),
        layers.RandomBrightness(factor=0.2),
        # Standardize last so the network always receives normalized inputs.
        normalization,
    ],
    name="data_augmentation",
)

## Implement multilayer perceptron (MLP)

In [14]:

def mlp(x, hidden_units, dropout_rate):
    """Stack Dense(GELU) + Dropout pairs on top of ``x``.

    Args:
        x: Input tensor.
        hidden_units: Iterable of layer widths; one Dense layer is added per
            entry, in order.
        dropout_rate: Dropout rate applied after every Dense layer.

    Returns:
        The output tensor of the last Dropout layer (``x`` itself when
        ``hidden_units`` is empty).
    """
    out = x
    for width in hidden_units:
        dense = layers.Dense(width, activation=keras.activations.gelu)
        out = layers.Dropout(dropout_rate)(dense(out))
    return out


## CNN Architecture

We'll build a CNN with convolutional layers for feature extraction followed by dense layers for classification.

In [17]:
def squeeze_excitation_block(x, filters, ratio=16):
    """Squeeze-and-Excitation channel attention.

    Args:
        x: Feature map whose last axis has ``filters`` channels.
        filters: Number of channels of ``x``; the produced gate has this many
            entries.
        ratio: Reduction ratio of the bottleneck Dense layer.

    Returns:
        ``x`` multiplied channel-wise by a learned sigmoid gate.
    """
    # Never build a zero-unit layer when filters < ratio.
    bottleneck_units = max(1, filters // ratio)

    # Squeeze: collapse the spatial dimensions into one descriptor per channel.
    se = layers.GlobalAveragePooling2D()(x)

    # Excitation: bottleneck MLP producing one sigmoid gate per channel.
    se = layers.Dense(bottleneck_units, activation='swish', kernel_initializer='he_normal')(se)
    se = layers.Dense(filters, activation='sigmoid', kernel_initializer='he_normal')(se)
    se = layers.Reshape((1, 1, filters))(se)

    # Scale: broadcast the gates over the spatial dimensions.
    return layers.Multiply()([x, se])

def improved_residual_block(x, filters, kernel_size=3, stride=1, use_se=True):
    """Residual unit: conv-GN-swish, conv-GN, optional SE gate, skip add, swish.

    Args:
        x: Input feature map.
        filters: Number of output channels of both convolutions.
        kernel_size: Kernel size of the two main-path convolutions.
        stride: Stride of the first convolution (and of the shortcut
            projection, when one is needed).
        use_se: Whether to apply ``squeeze_excitation_block`` to the main path
            before the addition.

    Returns:
        The activated sum of the main path and the (possibly projected)
        shortcut.
    """
    conv_options = dict(padding='same', kernel_initializer='he_normal', use_bias=False)
    skip = x

    # Main path, first half: (possibly strided) conv -> GroupNorm -> swish.
    main = layers.Conv2D(filters, kernel_size, strides=stride, **conv_options)(x)
    main = layers.GroupNormalization(groups=8)(main)
    main = layers.Activation('swish')(main)

    # Main path, second half: conv -> GroupNorm; activation waits for the add.
    main = layers.Conv2D(filters, kernel_size, strides=1, **conv_options)(main)
    main = layers.GroupNormalization(groups=8)(main)

    # Optional channel attention on the main path.
    if use_se:
        main = squeeze_excitation_block(main, filters)

    # Project the shortcut with a 1x1 conv whenever its spatial size or channel
    # count would not match the main path.
    shapes_match = stride == 1 and skip.shape[-1] == filters
    if not shapes_match:
        skip = layers.Conv2D(filters, 1, strides=stride, **conv_options)(skip)
        skip = layers.GroupNormalization(groups=8)(skip)

    merged = layers.Add()([main, skip])
    return layers.Activation('swish')(merged)

def efficient_conv_block(x, filters, kernel_size=3, stride=1, expansion_factor=4):
    """Expand -> depthwise -> squeeze-excite -> project convolution block.

    The layout follows the EfficientNet-style inverted bottleneck, but note
    that no skip connection is added here: the returned tensor is the
    projected output only.

    Args:
        x: Input feature map.
        filters: Number of output channels after the projection.
        kernel_size: Kernel size of the depthwise convolution.
        stride: Stride of the depthwise convolution.
        expansion_factor: Multiplier for the intermediate channel count; the
            1x1 expansion convolution is skipped when this is 1.

    Returns:
        Feature map with ``filters`` channels (no activation after the final
        GroupNormalization).
    """
    # Expansion phase
    expanded_filters = filters * expansion_factor
    if expansion_factor != 1:
        x = layers.Conv2D(expanded_filters, 1, padding='same', use_bias=False,
                         kernel_initializer='he_normal')(x)
        x = layers.GroupNormalization(groups=8)(x)
        x = layers.Activation('swish')(x)

    # Depthwise convolution
    x = layers.DepthwiseConv2D(kernel_size, strides=stride, padding='same', use_bias=False,
                              depthwise_initializer='he_normal')(x)
    x = layers.GroupNormalization(groups=8)(x)
    x = layers.Activation('swish')(x)

    # Squeeze-and-Excitation, sized from the tensor itself. When the expansion
    # conv is skipped (expansion_factor == 1) the channel count here is that of
    # the block input, which need not equal `expanded_filters`.
    x = squeeze_excitation_block(x, x.shape[-1])

    # Projection phase (linear: no activation after the normalization)
    x = layers.Conv2D(filters, 1, padding='same', use_bias=False,
                     kernel_initializer='he_normal')(x)
    x = layers.GroupNormalization(groups=8)(x)

    return x

def cbam_attention_block(x, ratio=16):
    """Convolutional Block Attention Module (CBAM): channel, then spatial gating.

    Channel attention: the average- and max-pooled channel descriptors are
    passed through the *same* two-layer MLP, the two outputs are summed, and a
    single sigmoid turns the sum into per-channel gates in (0, 1).

    Spatial attention: the channel-wise mean and max maps are concatenated and
    a 7x7 convolution with a sigmoid produces one gate per spatial position.

    Args:
        x: Feature map with a statically known channel count on the last axis.
        ratio: Reduction ratio of the channel-attention MLP bottleneck.

    Returns:
        ``x`` rescaled by the channel gates and then by the spatial gates.
    """
    channels = x.shape[-1]
    # Never build a zero-unit layer when channels < ratio.
    bottleneck_units = max(1, channels // ratio)

    # Channel Attention
    avg_pool = layers.GlobalAveragePooling2D()(x)
    max_pool = layers.GlobalMaxPooling2D()(x)

    # Shared MLP: one pair of Dense layers, called on both descriptors so the
    # weights are genuinely shared.
    shared_hidden = layers.Dense(bottleneck_units, activation='swish')
    shared_output = layers.Dense(channels)
    mlp_avg = shared_output(shared_hidden(avg_pool))
    mlp_max = shared_output(shared_hidden(max_pool))

    # Sum first, squash once: keeps the gate in (0, 1) rather than (0, 2).
    channel_attention = layers.Add()([mlp_avg, mlp_max])
    channel_attention = layers.Activation('sigmoid')(channel_attention)
    channel_attention = layers.Reshape((1, 1, channels))(channel_attention)

    x = layers.Multiply()([x, channel_attention])

    # Spatial Attention
    avg_pool_spatial = keras.ops.mean(x, axis=-1, keepdims=True)
    max_pool_spatial = keras.ops.max(x, axis=-1, keepdims=True)
    spatial_concat = layers.Concatenate(axis=-1)([avg_pool_spatial, max_pool_spatial])

    spatial_attention = layers.Conv2D(1, 7, padding='same', activation='sigmoid',
                                    kernel_initializer='he_normal')(spatial_concat)

    return layers.Multiply()([x, spatial_attention])

def create_cnn_classifier():
    """Build the CIFAR-100 CNN as a Keras functional model.

    Reads the module-level ``input_shape``, ``data_augmentation``,
    ``mlp_head_units`` and ``num_classes``. The network is: augmentation ->
    conv stem -> five stages mixing residual, expand/depthwise/project and
    CBAM blocks -> concatenated global average + max pooling -> dense head.

    Returns:
        An uncompiled ``keras.Model`` mapping images of shape ``input_shape``
        to ``num_classes`` raw logits (the final Dense layer has no
        activation).
    """
    inputs = keras.Input(shape=input_shape)
    # Apply the shared preprocessing/augmentation pipeline inside the model.
    x = data_augmentation(inputs)

    # Stem: Initial feature extraction
    x = layers.Conv2D(48, (3, 3), strides=1, padding='same', use_bias=False,
                     kernel_initializer='he_normal')(x)
    x = layers.GroupNormalization(groups=8)(x)
    x = layers.Activation('swish')(x)

    # Add initial attention
    x = cbam_attention_block(x)

    # Stage 1: residual blocks with SE attention
    x = improved_residual_block(x, 64, stride=1, use_se=True)
    x = improved_residual_block(x, 64, stride=1, use_se=True)
    x = improved_residual_block(x, 64, stride=2, use_se=True)  # Stride 2 for downsampling
    x = layers.Dropout(0.15)(x)

    # Stage 2: EfficientNet-style blocks
    x = efficient_conv_block(x, 96, stride=1, expansion_factor=4)
    x = efficient_conv_block(x, 96, stride=1, expansion_factor=4)
    x = cbam_attention_block(x)  # Add CBAM attention
    x = efficient_conv_block(x, 96, stride=2, expansion_factor=4)  # Downsample
    x = layers.Dropout(0.2)(x)

    # Stage 3: Mixed convolution types
    x = improved_residual_block(x, 144, stride=1, use_se=True)
    x = efficient_conv_block(x, 144, stride=1, expansion_factor=6)
    x = improved_residual_block(x, 144, stride=1, use_se=True)
    x = cbam_attention_block(x)

    # Downsample with a fixed 2x2 average pooling (not adaptive pooling).
    x = layers.AveragePooling2D((2, 2))(x)
    x = layers.Dropout(0.25)(x)

    # Stage 4: High-level feature extraction
    x = efficient_conv_block(x, 192, stride=1, expansion_factor=6)
    x = improved_residual_block(x, 192, stride=1, use_se=True)
    x = efficient_conv_block(x, 192, stride=1, expansion_factor=6)
    x = cbam_attention_block(x)
    x = layers.Dropout(0.3)(x)

    # Stage 5: Final feature maps
    x = improved_residual_block(x, 256, stride=1, use_se=True)
    x = efficient_conv_block(x, 256, stride=1, expansion_factor=8)

    # Global descriptor: concatenate two global poolings of the same map.
    # Global Average Pooling
    gap = layers.GlobalAveragePooling2D()(x)
    # Global Max Pooling
    gmp = layers.GlobalMaxPooling2D()(x)
    # Combine both pooling methods
    x = layers.Concatenate()([gap, gmp])
    x = layers.Dropout(0.4)(x)

    # Classifier head, block 1: Dense -> GroupNorm -> swish -> Dropout.
    # (Plain dense layers; no Ghost modules are used in this function.)
    x = layers.Dense(768, use_bias=False, kernel_initializer='he_normal')(x)
    x = layers.GroupNormalization(groups=8)(x)
    x = layers.Activation('swish')(x)
    x = layers.Dropout(0.5)(x)

    # Classifier head, block 2: same pattern, narrowing to 384 units.
    x = layers.Dense(384, use_bias=False, kernel_initializer='he_normal')(x)
    x = layers.GroupNormalization(groups=8)(x)
    x = layers.Activation('swish')(x)
    x = layers.Dropout(0.4)(x)

    # MLP head: Dense(GELU) + Dropout(0.5) per entry of mlp_head_units.
    features = mlp(x, hidden_units=mlp_head_units, dropout_rate=0.5)

    # Final classification layer: L2-regularized Dense producing raw logits
    # (no activation), so the loss must be used with from_logits=True.
    logits = layers.Dense(num_classes, kernel_initializer='he_normal',
                         kernel_regularizer=keras.regularizers.l2(0.001))(features)

    # Create the Keras model
    model = keras.Model(inputs=inputs, outputs=logits)
    return model

## Compile, train, and evaluate the model

In [None]:
def create_callbacks(checkpoint_path="/tmp/best_model.weights.h5"):
    """Build the Keras callbacks used during training.

    The learning rate is driven by a single mechanism, a per-epoch cosine
    schedule. A ReduceLROnPlateau callback is deliberately not combined with
    it: LearningRateScheduler overwrites the optimizer's learning rate at the
    start of every epoch, so any plateau-based reduction would be discarded
    before it affected a single training step.

    Args:
        checkpoint_path: File the best weights (by validation accuracy) are
            written to.

    Returns:
        List of callbacks: ModelCheckpoint, EarlyStopping and
        LearningRateScheduler.
    """

    def cosine_annealing(epoch, current_lr=None):
        """Half-cosine decay from ``initial_learning_rate`` over ``num_epochs``.

        ``current_lr`` is accepted (and ignored) so the function matches the
        two-argument schedule signature.
        """
        return float(initial_learning_rate * 0.5 * (1 + np.cos(np.pi * epoch / num_epochs)))

    return [
        # Keep only the weights of the epoch with the best validation accuracy.
        keras.callbacks.ModelCheckpoint(
            checkpoint_path,
            monitor="val_accuracy",
            mode="max",
            save_best_only=True,
            save_weights_only=True,
            verbose=1,
        ),
        # Stop when validation accuracy has not improved for 10 epochs.
        keras.callbacks.EarlyStopping(
            monitor="val_accuracy",
            mode="max",
            patience=10,
            restore_best_weights=True,
            verbose=1,
        ),
        # Cosine annealing learning rate schedule.
        keras.callbacks.LearningRateScheduler(cosine_annealing, verbose=0),
    ]

def run_experiment(model):
    """Compile, train and evaluate ``model`` on the CIFAR-100 arrays.

    Uses the module-level data (``x_train``, ``y_train``, ``x_test``,
    ``y_test``) and hyperparameters. Training targets are one-hot vectors with
    label smoothing applied by hand; test targets are plain one-hot vectors.
    After training, the best checkpointed weights are reloaded before the
    model is scored on the test split.

    Args:
        model: Uncompiled Keras model that outputs ``num_classes`` logits.

    Returns:
        The ``History`` object returned by ``model.fit``.
    """
    optimizer = keras.optimizers.AdamW(
        learning_rate=initial_learning_rate,
        weight_decay=weight_decay,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-7,
    )

    # One-hot encode the integer labels. The loss and metrics below are the
    # categorical variants, so train and test targets must both be one-hot.
    y_train_smoothed = keras.utils.to_categorical(y_train, num_classes=num_classes)
    y_train_smoothed = y_train_smoothed * (1 - label_smoothing) + label_smoothing / num_classes
    y_test_one_hot = keras.utils.to_categorical(y_test, num_classes=num_classes)

    model.compile(
        optimizer=optimizer,
        loss=keras.losses.CategoricalCrossentropy(
            from_logits=True,
            label_smoothing=0.0  # Already applied to the training targets above
        ),
        metrics=[
            keras.metrics.CategoricalAccuracy(name="accuracy"),
            keras.metrics.TopKCategoricalAccuracy(5, name="top-5-accuracy"),
            keras.metrics.TopKCategoricalAccuracy(10, name="top-10-accuracy"),
        ],
    )

    callbacks = create_callbacks()

    # Train, holding out 20% of the training data for validation. Keep the
    # returned History so it can be handed back to the caller.
    history = model.fit(
        x=x_train,
        y=y_train_smoothed,
        batch_size=batch_size,
        epochs=num_epochs,
        validation_split=0.2,
        callbacks=callbacks,
        verbose=1
    )

    # Load best weights (written by the ModelCheckpoint callback)
    model.load_weights("/tmp/best_model.weights.h5")

    # Evaluate on test set; values come back in compile order: loss, then
    # the three metrics.
    test_loss, accuracy, top_5_accuracy, top_10_accuracy = model.evaluate(
        x_test, y_test_one_hot, verbose=0
    )
    print(f"\nFinal Test Results:")
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")
    print(f"Test top 5 accuracy: {round(top_5_accuracy * 100, 2)}%")
    print(f"Test top 10 accuracy: {round(top_10_accuracy * 100, 2)}%")
    print(f"Test loss: {round(test_loss, 4)}")

    return history


# Build the network, print its layer summary, then train and evaluate it.
cnn_classifier = create_cnn_classifier()
print("Model Architecture:")
cnn_classifier.summary()
history = run_experiment(cnn_classifier)


def plot_training_history(history):
    """Plot loss, accuracy, top-5 accuracy and learning rate per epoch.

    Args:
        history: Object whose ``history`` attribute is a dict mapping metric
            names to per-epoch value lists (as returned by ``model.fit``). It
            must contain ``loss``, ``accuracy`` and ``top-5-accuracy`` plus
            their ``val_`` counterparts. The learning rate is read from
            ``learning_rate`` or, failing that, ``lr``; if neither is present
            a placeholder message is shown instead.
    """
    records = history.history
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # (axis, history key, human-readable label) for the train/validation panels.
    metric_panels = [
        (axes[0, 0], 'loss', 'Loss'),
        (axes[0, 1], 'accuracy', 'Accuracy'),
        (axes[1, 0], 'top-5-accuracy', 'Top-5 Accuracy'),
    ]
    for ax, key, label in metric_panels:
        ax.plot(records[key], label=f'Training {label}')
        ax.plot(records[f'val_{key}'], label=f'Validation {label}')
        ax.set_title(f'Model {label}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(label)
        ax.legend()
        ax.grid(True)

    # Learning-rate panel. The log key is 'learning_rate' in Keras 3 and 'lr'
    # in older releases, so accept either.
    lr_ax = axes[1, 1]
    lr_key = next((k for k in ('learning_rate', 'lr') if k in records), None)
    if lr_key is not None:
        lr_ax.plot(records[lr_key])
        lr_ax.set_title('Learning Rate Schedule')
        lr_ax.set_xlabel('Epoch')
        lr_ax.set_ylabel('Learning Rate')
        lr_ax.set_yscale('log')
        lr_ax.grid(True)
    else:
        lr_ax.text(0.5, 0.5, 'Learning rate data not available',
                   ha='center', va='center', transform=lr_ax.transAxes)
        lr_ax.set_title('Learning Rate Schedule')

    plt.tight_layout()
    plt.show()

# Render the 2x2 training summary figure from the History returned by
# run_experiment().
plot_training_history(history)

Model Architecture:


Epoch 1/10
[1m100/313[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m3:48[0m 1s/step - accuracy: 0.0092 - loss: nan - top-10-accuracy: 0.0322 - top-5-accuracy: 0.0305