<a href="https://colab.research.google.com/github/yeabwang/Human-Emotions-Detection/blob/main/Note_on_state_of_art_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Introduction to Sequential API

The Sequential API is the simplest way to create a model in TensorFlow using Keras. It allows you to stack layers sequentially, meaning each layer has exactly one input tensor and one output tensor. This is suitable for most feedforward neural networks where the model is a straight line of layers.

In [None]:
# Hyperparameters read by the model definition in this cell.
CONFIGURATION = dict(
    BATCH_SIZE=32,
    IM_SIZE=256,
    LEARNING_RATE=1e-3,
    N_EPOCHS=3,
    DROPOUT_RATE=0.0,
    REGULARIZATION_RATE=0.0,
    N_FILTERS=6,
    KERNEL_SIZE=3,
    N_STRIDES=1,
    POOL_SIZE=2,
    N_DENSE_1=1024,
    N_DENSE_2=128,
    NUM_CLASSES=3,
    PATCH_SIZE=16,
    PROJ_DIM=768,
    CLASS_NAMES=["angry", "happy", "sad"],
)

# Imports required by this cell, so it can run before any later import cell.
import tensorflow as tf
from tensorflow.keras.layers import (
    BatchNormalization,
    Conv2D,
    Dense,
    Dropout,
    Flatten,
    InputLayer,
    MaxPool2D,
    Rescaling,
    Resizing,
)
from tensorflow.keras.regularizers import L2

# Preprocessing stage used as the first step of the model:
# - Resizing: brings every image to (IM_SIZE, IM_SIZE).
# - Rescaling: multiplies pixel values by 1/255 (maps the [0, 255] range to [0, 1]).
resize_rescale_layers = tf.keras.Sequential([
    Resizing(CONFIGURATION["IM_SIZE"], CONFIGURATION["IM_SIZE"]),
    Rescaling(1./255),
])

lenet_model = tf.keras.Sequential(
    [
        # Input: images of any height and width (None, None) with 3 channels.
        # The fixed spatial size is imposed by the preprocessing stage below.
        InputLayer(shape=(None, None, 3)),

        # Preprocessing: resize to IM_SIZE x IM_SIZE and scale pixel values by 1/255.
        resize_rescale_layers,

        # ---- Convolutional block 1 ----
        # Conv2D:
        #   filters: number of filters used for feature extraction.
        #   kernel_size: size of the convolution window (here 3x3).
        #   strides: step size for sliding the kernel over the input.
        #   padding='valid': no padding, so the output is smaller than the input.
        #   activation='relu': introduces non-linearity.
        #   kernel_regularizer=L2(...): applies an L2 penalty to the kernel weights.
        Conv2D(filters=CONFIGURATION["N_FILTERS"],
               kernel_size=CONFIGURATION["KERNEL_SIZE"],
               strides=CONFIGURATION["N_STRIDES"],
               padding='valid',
               activation='relu',
               kernel_regularizer=L2(CONFIGURATION["REGULARIZATION_RATE"])),
        # BatchNormalization: normalizes the output of the Conv2D layer.
        BatchNormalization(),
        # MaxPool2D:
        #   pool_size: size of the pooling window.
        #   strides: how far the pooling window moves each step (N_STRIDES * 2).
        #   Reduces the spatial dimensions.
        MaxPool2D(pool_size=CONFIGURATION["POOL_SIZE"],
                  strides=CONFIGURATION["N_STRIDES"]*2),
        # Dropout:
        #   rate: fraction of units to drop (e.g., 0.5 drops 50%);
        #   with the configured DROPOUT_RATE of 0.0 nothing is dropped.
        Dropout(rate=CONFIGURATION["DROPOUT_RATE"]),

        # ---- Convolutional block 2 ----
        # Same structure as block 1 with N_FILTERS * 2 + 4 filters and no Dropout.
        Conv2D(filters=CONFIGURATION["N_FILTERS"]*2 + 4,
               kernel_size=CONFIGURATION["KERNEL_SIZE"],
               strides=CONFIGURATION["N_STRIDES"],
               padding='valid',
               activation='relu',
               kernel_regularizer=L2(CONFIGURATION["REGULARIZATION_RATE"])),
        BatchNormalization(),
        MaxPool2D(pool_size=CONFIGURATION["POOL_SIZE"],
                  strides=CONFIGURATION["N_STRIDES"]*2),

        # Flatten: converts the 2D feature maps into a 1D vector for the dense layers.
        Flatten(),

        # ---- Dense layers ----
        # N_DENSE_1 and N_DENSE_2: number of units in each fully connected layer.
        # activation='relu': non-linearity.
        # kernel_regularizer=L2(...): L2 penalty on the kernel weights.
        # Each Dense layer is followed by BatchNormalization; the first one is
        # additionally followed by Dropout.
        Dense(CONFIGURATION["N_DENSE_1"], activation="relu",
              kernel_regularizer=L2(CONFIGURATION["REGULARIZATION_RATE"])),
        BatchNormalization(),
        Dropout(rate=CONFIGURATION["DROPOUT_RATE"]),

        Dense(CONFIGURATION['N_DENSE_2'], activation="relu",
              kernel_regularizer=L2(CONFIGURATION["REGULARIZATION_RATE"])),
        BatchNormalization(),

        # ---- Output layer ----
        # NUM_CLASSES: number of classes to predict (3 in this configuration).
        # activation='softmax': converts outputs to probabilities for each class.
        Dense(CONFIGURATION["NUM_CLASSES"], activation="softmax"),
    ]
)

lenet_model.summary()

## Introduction to Functional API

Unlike the Sequential API, where layers are stacked one after another, the Functional API lets you define the computation graph as a directed acyclic graph (DAG).
You explicitly define the flow of data between layers, making it suitable for complex neural network architectures.

### When to Use the Functional API:
- When you need multiple inputs or outputs.
- When you want to use shared layers.
- When building architectures with skip connections or residual blocks.
- When you need more flexibility and control over the model design.

In [None]:
import tensorflow as tf
from tensorflow.keras import Model, layers
from tensorflow.keras.layers import (
    Activation,
    Add,
    BatchNormalization,
    Conv2D,
    Dense,
    Dropout,
    Flatten,
    GlobalAveragePooling2D,
    Input,
    Layer,
    MaxPooling2D,
)
from tensorflow.keras.regularizers import L2

# Configuration dictionary: the values read by the functional-API cell below.
CONFIGURATION = {
    "BATCH_SIZE": 32, "IM_SIZE": 256,
    "LEARNING_RATE": 0.001, "N_EPOCHS": 3,
    "DROPOUT_RATE": 0.0, "REGULARIZATION_RATE": 0.0,
    "N_FILTERS": 6, "KERNEL_SIZE": 3,
    "N_STRIDES": 1, "POOL_SIZE": 2,
    "N_DENSE_1": 1024, "N_DENSE_2": 128,
    "NUM_CLASSES": 3,
    "PATCH_SIZE": 16, "PROJ_DIM": 768,
    "CLASS_NAMES": ["angry", "happy", "sad"],
}

# Resize and Rescale Layer
# - Step 1: resize every image to IM_SIZE x IM_SIZE
# - Step 2: multiply pixel values by 1/255 (maps [0, 255] to [0, 1])
resize_rescale_layer = tf.keras.Sequential(
    layers=[
        layers.Resizing(
            height=CONFIGURATION["IM_SIZE"],
            width=CONFIGURATION["IM_SIZE"],
        ),
        layers.Rescaling(scale=1./255),
    ]
)

# ========================
#    Feature Extractor
# ========================

# Symbolic input: images of any height and width with 3 channels.
future_input = Input(shape=(None, None, 3))

# Stages of the extractor, listed in the order they are applied.
_feature_stack = [
    # Resize + rescale so every image has the same size and value scale.
    resize_rescale_layer,

    # Convolutional block 1: Conv2D (ReLU, L2-regularized kernel),
    # batch normalization, then max pooling to shrink the feature maps.
    Conv2D(
        filters=CONFIGURATION["N_FILTERS"],
        kernel_size=CONFIGURATION["KERNEL_SIZE"],
        strides=CONFIGURATION["N_STRIDES"],
        padding='valid',
        activation='relu',
        kernel_regularizer=L2(CONFIGURATION["REGULARIZATION_RATE"]),
    ),
    BatchNormalization(),
    MaxPooling2D(
        pool_size=CONFIGURATION["POOL_SIZE"],
        strides=CONFIGURATION["N_STRIDES"]*2,
    ),

    # Convolutional block 2: same structure with N_FILTERS * 2 + 4 filters.
    Conv2D(
        filters=CONFIGURATION["N_FILTERS"]*2 + 4,
        kernel_size=CONFIGURATION["KERNEL_SIZE"],
        strides=CONFIGURATION["N_STRIDES"],
        padding='valid',
        activation='relu',
        kernel_regularizer=L2(CONFIGURATION["REGULARIZATION_RATE"]),
    ),
    BatchNormalization(),
    MaxPooling2D(
        pool_size=CONFIGURATION["POOL_SIZE"],
        strides=CONFIGURATION["N_STRIDES"]*2,
    ),
]

# Thread the input tensor through every stage.
x = future_input
for _stage in _feature_stack:
    x = _stage(x)

# Flatten the 2D feature maps into a 1D feature vector.
future_output = Flatten()(x)

# Wrap the graph from input to flattened features as a reusable model.
future_model = Model(inputs=future_input, outputs=future_output, name="Feature_Extractor")

# ========================
#   Classification Model
# ========================
# Classification head on top of the feature extractor. The head mirrors the
# Sequential and subclassed versions of this network in the same notebook:
# Dense(N_DENSE_1) -> BatchNorm -> Dropout -> Dense(N_DENSE_2) -> BatchNorm -> softmax.

# Input Layer for Classification Model
# - Accepts images of any height and width with 3 channels
class_input = Input(shape=(None, None, 3))

# Use Feature Extractor
# - Reuses the feature extraction part of the model
x = future_model(class_input)

# Fully Connected Layer 1
# - N_DENSE_1 units, ReLU activation, L2-regularized kernel
x = Dense(CONFIGURATION["N_DENSE_1"],
          activation='relu',
          kernel_regularizer=L2(CONFIGURATION["REGULARIZATION_RATE"]))(x)

# Batch Normalization
# - Normalizes the output of the first dense layer
x = BatchNormalization()(x)

# Dropout Layer
# - Randomly sets a fraction (DROPOUT_RATE) of inputs to 0 during training
x = Dropout(rate=CONFIGURATION["DROPOUT_RATE"])(x)

# Fully Connected Layer 2
# - N_DENSE_2 units; previously missing here although the Sequential and
#   subclassed variants of the same network both include it
x = Dense(CONFIGURATION["N_DENSE_2"],
          activation='relu',
          kernel_regularizer=L2(CONFIGURATION["REGULARIZATION_RATE"]))(x)

# Batch Normalization
# - Normalizes the output of the second dense layer
x = BatchNormalization()(x)

# Output Layer
# - Dense layer with softmax activation for multi-class classification
class_output = Dense(CONFIGURATION["NUM_CLASSES"], activation='softmax')(x)

# Create Classification Model
class_model = Model(inputs=class_input, outputs=class_output, name="Classification_Model")

# Model Summary
class_model.summary()

## Introduction to Model Subclassing in TensorFlow

Model subclassing in TensorFlow is a flexible way to build neural networks by directly inheriting from tf.keras.Model. This approach allows you to fully customize the forward pass (call() method) and define complex architectures that aren't easily implemented using the Sequential or Functional APIs.



In [None]:
# Hyperparameters consumed by the subclassed model defined in this cell.
CONFIGURATION = dict(
    BATCH_SIZE=32,
    IM_SIZE=256,
    LEARNING_RATE=1e-3,
    N_EPOCHS=3,
    DROPOUT_RATE=0.2,
    REGULARIZATION_RATE=1e-4,
    N_FILTERS=6,
    KERNEL_SIZE=3,
    N_STRIDES=1,
    POOL_SIZE=2,
    N_DENSE_1=1024,
    N_DENSE_2=128,
    NUM_CLASSES=3,
    CLASS_NAMES=["angry", "happy", "sad"],
)

# Preprocessing Layer: resize images to IM_SIZE x IM_SIZE, then scale pixel values by 1/255
resize_rescale_layer = tf.keras.Sequential()
resize_rescale_layer.add(
    layers.Resizing(CONFIGURATION["IM_SIZE"], CONFIGURATION["IM_SIZE"])
)
resize_rescale_layer.add(layers.Rescaling(1./255))

# Feature Extractor Class
class LeNetModelFutureExtractor(layers.Layer):
    """Convolutional feature extractor used by ``LeNetClassification``.

    Pipeline: resize/rescale -> (Conv2D -> BatchNormalization -> MaxPooling2D) x 2
    -> Flatten.

    Args:
        config: Mapping providing "N_FILTERS", "KERNEL_SIZE", "N_STRIDES",
            "POOL_SIZE" and "REGULARIZATION_RATE".
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, config, **kwargs):
        # The base class is referenced through the `layers` module, like every
        # other layer in this class, so the class does not depend on a separate
        # bare `Layer` import.
        super(LeNetModelFutureExtractor, self).__init__(**kwargs)

        # Store configuration for reuse
        self.config = config

        # Preprocessing: the module-level resize/rescale model (shared, not copied)
        self.resize_rescale = resize_rescale_layer

        # First Convolutional Block
        self.conv1 = layers.Conv2D(
            filters=config["N_FILTERS"],           # Number of filters
            kernel_size=config["KERNEL_SIZE"],     # Size of the convolution kernel
            strides=config["N_STRIDES"],           # Stride length
            padding='valid',                       # No padding, reduces output size
            activation='relu',                     # Activation function for non-linearity
            kernel_regularizer=L2(config["REGULARIZATION_RATE"])  # L2 regularization
        )
        self.bn1 = layers.BatchNormalization()     # Normalizes the conv1 output
        self.pool1 = layers.MaxPooling2D(
            pool_size=config["POOL_SIZE"],         # Pooling window size
            strides=config["N_STRIDES"]*2          # Stride for pooling
        )

        # Second Convolutional Block (N_FILTERS * 2 + 4 filters)
        self.conv2 = layers.Conv2D(
            filters=config["N_FILTERS"]*2 + 4,
            kernel_size=config["KERNEL_SIZE"],
            strides=config["N_STRIDES"],
            padding='valid',
            activation='relu',
            kernel_regularizer=L2(config["REGULARIZATION_RATE"])
        )
        self.bn2 = layers.BatchNormalization()
        self.pool2 = layers.MaxPooling2D(
            pool_size=config["POOL_SIZE"],
            strides=config["N_STRIDES"]*2
        )

        # Flatten Layer: converts 2D feature maps to a 1D vector
        self.flatten = layers.Flatten()

    def call(self, x):
        """Run the forward pass and return the flattened feature vector."""
        x = self.resize_rescale(x)                 # Resize and Rescale Input
        x = self.conv1(x)                          # First Convolutional Layer
        x = self.bn1(x)                            # Batch Normalization
        x = self.pool1(x)                          # Max Pooling
        x = self.conv2(x)                          # Second Convolutional Layer
        x = self.bn2(x)                            # Batch Normalization
        x = self.pool2(x)                          # Max Pooling
        x = self.flatten(x)                        # Flatten for Dense Layers
        return x

# Classification Model Class
class LeNetClassification(Model):
    """Classifier: feature extractor followed by a dense head.

    Head layout: Dense(N_DENSE_1) -> BatchNorm -> Dropout -> Dense(N_DENSE_2)
    -> BatchNorm -> Dense(NUM_CLASSES, softmax).

    Args:
        config: Mapping with the keys read by ``LeNetModelFutureExtractor`` plus
            "N_DENSE_1", "N_DENSE_2", "DROPOUT_RATE" and "NUM_CLASSES".
    """

    def __init__(self, config):
        super().__init__()

        # Reusable convolutional front end.
        self.feature_extractor = LeNetModelFutureExtractor(config)

        # First fully connected block: dense -> batch norm -> dropout.
        self.fc1 = layers.Dense(
            units=config["N_DENSE_1"],
            activation='relu',
            kernel_regularizer=L2(config["REGULARIZATION_RATE"]),
        )
        self.bn3 = layers.BatchNormalization()
        self.dropout1 = layers.Dropout(rate=config["DROPOUT_RATE"])

        # Second fully connected block: dense -> batch norm.
        self.fc2 = layers.Dense(
            units=config["N_DENSE_2"],
            activation='relu',
            kernel_regularizer=L2(config["REGULARIZATION_RATE"]),
        )
        self.bn4 = layers.BatchNormalization()

        # Softmax output with one unit per class.
        self.output_layer = layers.Dense(units=config["NUM_CLASSES"], activation='softmax')

    def call(self, x):
        """Return class probabilities for the batch ``x``."""
        activations = self.feature_extractor(x)
        head = (
            self.fc1,
            self.bn3,
            self.dropout1,
            self.fc2,
            self.bn4,
            self.output_layer,
        )
        for stage in head:
            activations = stage(activations)
        return activations

# Create the classifier from the shared configuration
leNet_class_model = LeNetClassification(config=CONFIGURATION)

# Build for IM_SIZE x IM_SIZE inputs with 3 channels (batch size left
# unspecified) so that the weights get created
leNet_class_model.build(
    input_shape=(None, CONFIGURATION["IM_SIZE"], CONFIGURATION["IM_SIZE"], 3)
)

# Print the layer / parameter overview
leNet_class_model.summary()

In [None]:
#AlexNet was a breakthrough in deep learning, winning the ImageNet Large Scale Visual Recognition Challenge (ILSVRC) 2012 and pioneering modern CNN architectures.

# Architecture: 8 layers (5 convolutional + 3 fully connected).

# Input Size: 227 × 227 × 3 (RGB images).

# Convolutional Layers:
#### Conv1: 96 filters, 11×11 kernel, stride 4, ReLU.
#### Conv2: 256 filters, 5×5 kernel, stride 1, ReLU.
#### Conv3: 384 filters, 3×3 kernel, stride 1, ReLU.
#### Conv4: 384 filters, 3×3 kernel, stride 1, ReLU.
#### Conv5: 256 filters, 3×3 kernel, stride 1, ReLU.

# Max Pooling: After Conv1, Conv2, and Conv5 (3×3 kernel, stride 2).

# Fully Connected Layers:
#### FC6: 4096 neurons, ReLU.
#### FC7: 4096 neurons, ReLU.
#### FC8 (Output): 1000 neurons (ImageNet classes), Softmax.

# Activation Function: ReLU (introduced to speed up training).
# Normalization: Local Response Normalization (LRN) after Conv1 and Conv2.
# Regularization: Dropout (0.5) in FC6 and FC7.
# Optimization: Stochastic Gradient Descent (SGD) with momentum (0.9).
# Batch Size: 128.
# Weight Initialization: Gaussian distribution.
# Data Augmentation: Cropping, flipping, and color jittering.
# Training Dataset: ImageNet (1.2 million images, 1000 classes).
# Parallel Training: Two GPUs used to split model layers for efficiency

In [None]:
## VGG Model
# Key Features:
# Deep Network: 16 (VGG16) or 19 (VGG19) layers.
# Uniform Kernel Size: Only 3×3 convolution layers to maintain consistency.
# Increased Depth: More layers compared to AlexNet for hierarchical feature learning.
# Regularization: Dropout (0.5) in fully connected layers.
# Optimization: SGD with momentum (0.9), batch size = 256.
# Weight Initialization: Pretrained on ImageNet, useful for transfer learning.
# Data Augmentation: Cropping, flipping, and color jittering
# VGG16 and VGG19 are the most common variants. The main difference is the number of convolutional layers: VGG16 uses 13 convolutional layers and VGG19 uses 16 convolutional layers.
# Stacked small convolutional filters (3×3 kernel, stride 1, padding 1) for deeper representations.
# Uses 2×2 max pooling (stride 2) after every block for downsampling.


# Layers - Vgg16
# Input Size: 224 × 224 × 3 (RGB images).

# Conv Layers:
#### Block 1: 2 × (64 filters, 3×3, ReLU) → Max Pooling
#### Block 2: 2 × (128 filters, 3×3, ReLU) → Max Pooling
#### Block 3: 3 × (256 filters, 3×3, ReLU) → Max Pooling
#### Block 4: 3 × (512 filters, 3×3, ReLU) → Max Pooling
#### Block 5: 3 × (512 filters, 3×3, ReLU) → Max Pooling

# Fully Connected Layers:
#### FC6: 4096 neurons, ReLU
#### FC7: 4096 neurons, ReLU
#### FC8 (Output): 1000 neurons (Softmax for classification)



In [None]:
#RESNET MODEL

# ResNet introduced residual learning to address the vanishing gradient problem, allowing for extremely deep networks.

# Key Features:
#### Deep Architecture: Can scale up to ResNet-18, ResNet-34, ResNet-50, ResNet-101, ResNet-152.
#### Residual Connections (Skip Connections):
####### Instead of directly learning H(x), it learns F(x) = H(x) - x, making optimization easier.
#### Helps gradients flow smoothly during backpropagation.
#### Batch Normalization: Used after every convolution to stabilize training.
#### ReLU Activation: Applied after each convolutional layer.

# ResNet-18 Layer-by-Layer Breakdown
# Here is the detailed layer-wise breakdown for ResNet-18:

# Conv1 (Initial Convolutional Layer):
# Operation: 7×7 Convolution, 64 filters, stride 2
# Output Size: 112 × 112 × 64

# MaxPool:
# Operation: 3×3 Max Pooling, stride 2
# Output Size: 56 × 56 × 64

# Conv2_x (Residual Block 1 and 2):
# Operation: 2 × Basic Residual Blocks (each with 2x 3×3 convolutions, 64 filters)
# Output Size: 56 × 56 × 64

# Conv3_x (Residual Block 3 and 4):
# Operation: 2 × Basic Residual Blocks (each with 2x 3×3 convolutions, 128 filters), stride 2
# Output Size: 28 × 28 × 128

# Conv4_x (Residual Block 5 and 6):
# Operation: 2 × Basic Residual Blocks (each with 2x 3×3 convolutions, 256 filters), stride 2
# Output Size: 14 × 14 × 256

# Conv5_x (Residual Block 7 and 8):
# Operation: 2 × Basic Residual Blocks (each with 2x 3×3 convolutions, 512 filters), stride 2
# Output Size: 7 × 7 × 512

# AvgPool (Global Average Pooling):
# Operation: Global Average Pooling
# Output Size: 1 × 1 × 512

# Fully Connected (FC):
# Operation: Fully Connected layer (512 → 1000 classes)
# Output Size: 1 × 1 × 1000 (classification result)


## So we can view ResNet as a collection of shallower networks: a block whose learned residual F(x) is (close to) zero simply passes its input through, so that block is effectively skipped.
## First, this helps the model avoid vanishing gradients.
## Second, it performs well because it acts like a collection of shallower networks, with the effective path through the model depending on which blocks are skipped.



In [None]:
class CustomConv2D(Layer):
  """Conv2D with ReLU activation followed by BatchNormalization.

  Args:
    n_filters: Number of convolution filters.
    kernel_size: Size of the convolution kernel.
    n_strides: Stride of the convolution.
    padding: Padding mode passed to Conv2D ('valid' by default).
  """

  def __init__(self, n_filters, kernel_size, n_strides, padding='valid'):
    # No hard-coded `name`: Keras derives a name from the class name and makes
    # it unique per instance, so the many instances used inside one model no
    # longer all share the identical name 'custom_conv2d'.
    super(CustomConv2D, self).__init__()

    # Convolution layer with ReLU activation
    self.conv = Conv2D(
        filters=n_filters,
        kernel_size=kernel_size,
        activation='relu',  # ReLU is applied inside the convolution layer
        strides=n_strides,
        padding=padding)

    # Batch normalization layer (applied after the activated convolution)
    self.batch_norm = BatchNormalization()

  def call(self, x, training=False):
    """Apply convolution (+ReLU) and batch normalization.

    Args:
      x: Input tensor.
      training: Whether batch normalization runs in training mode. Defaults
        to inference mode so a direct call does not normalize with batch
        statistics or update the moving averages.
    """
    # Forward pass through Conv2D layer
    x = self.conv(x)

    # Forward pass through Batch Normalization; `training` is passed by
    # keyword so it cannot be mistaken for another positional argument.
    return self.batch_norm(x, training=training)

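# Purpose: Encapsulates a Conv2D layer (with built-in ReLU activation) followed by batch normalization.
# Why? ResNet repeats a convolution + normalization + activation pattern many times, so this class keeps the code DRY (Don't Repeat Yourself).
# Note: the original ResNet orders the operations Conv2D → BatchNorm → ReLU, whereas this class applies Conv2D → ReLU → BatchNorm.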

# Line-by-line:
# self.conv: Defines a convolution layer with ReLU activation.
# self.batch_norm: Adds batch normalization to stabilize learning.
# call() method: Implements the forward pass.

class ResidualBlock(Layer):
  """Residual block: two 3x3 CustomConv2D layers plus a shortcut connection.

  Args:
    n_channels: Number of filters of every convolution in the block.
    n_strides: Stride of the first convolution. Any value other than 1 makes
      the block use a 1x1 CustomConv2D projection on the shortcut.
  """

  def __init__(self, n_channels, n_strides=1):
    # No hard-coded `name`: let Keras generate a unique name per block instead
    # of naming every block 'res_block'.
    super(ResidualBlock, self).__init__()

    # A projection shortcut is needed whenever the block downsamples
    self.dotted = (n_strides != 1)

    # First Conv layer with strides (for downsampling)
    self.custom_conv_1 = CustomConv2D(n_channels, 3, n_strides, padding="same")
    # Second Conv layer with stride of 1
    self.custom_conv_2 = CustomConv2D(n_channels, 3, 1, padding="same")

    # ReLU Activation for the output
    self.activation = Activation('relu')

    # Merge layer for the skip connection, created once here instead of
    # instantiating a new Add layer on every forward pass
    self.shortcut_add = Add()

    # If downsampling is required, add a 1x1 convolution for the shortcut
    if self.dotted:
      self.custom_conv_3 = CustomConv2D(n_channels, 1, n_strides)

  def call(self, input, training=False):
    """Return ``relu(convs(input) + shortcut(input))``.

    Args:
      input: Input tensor of the block.
      training: Forwarded to the CustomConv2D layers (batch-norm mode).
    """
    # Forward pass through two convolutional layers
    x = self.custom_conv_1(input, training=training)
    x = self.custom_conv_2(x, training=training)

    # Projection shortcut (1x1 conv) when downsampling, identity otherwise
    if self.dotted:
      shortcut = self.custom_conv_3(input, training=training)
    else:
      shortcut = input

    # Skip connection followed by ReLU activation
    return self.activation(self.shortcut_add([x, shortcut]))

# Purpose: This block learns residual mappings. It adds the input to the output of the convolutional layers (skip connection).
# Why? Skip connections allow the network to learn identity mappings easily, which helps with vanishing gradients and deeper networks.

# Line-by-line:
# self.dotted: Checks if downsampling is needed (i.e., stride > 1).
# custom_conv_1 and custom_conv_2: Standard convolution layers.
# custom_conv_3: Used for 1x1 projection when downsampling.
# Add(): Adds the shortcut connection to the output.
# self.activation: ReLU applied after the addition.

class ResNet34(Model):
  """ResNet-34 style classifier built from CustomConv2D and ResidualBlock.

  Layout: 7x7 stride-2 convolution -> 3x3 stride-2 max pooling -> residual
  stages with 3, 4, 6 and 3 blocks (64, 128, 256, 512 channels) -> global
  average pooling -> softmax Dense layer.

  Args:
    num_classes: Number of output classes. When None (default), the value of
      the module-level ``CONFIGURATION["NUM_CLASSES"]`` is used, as before.
  """

  def __init__(self, num_classes=None):
    super(ResNet34, self).__init__(name='resnet_34')

    # Keep the original behaviour by default, but allow the class count to be
    # given explicitly so the model does not depend on the global dictionary.
    if num_classes is None:
      num_classes = CONFIGURATION["NUM_CLASSES"]

    # Initial Conv Layer and Max Pooling
    self.conv_1 = CustomConv2D(64, 7, 2, padding='same')
    self.max_pool = MaxPooling2D(3, 2)

    # Conv2_x: 3 Residual Blocks
    self.conv_2_1 = ResidualBlock(64)
    self.conv_2_2 = ResidualBlock(64)
    self.conv_2_3 = ResidualBlock(64)

    # Conv3_x: 4 Residual Blocks (First Block with Stride 2 for Downsampling)
    self.conv_3_1 = ResidualBlock(128, 2)
    self.conv_3_2 = ResidualBlock(128)
    self.conv_3_3 = ResidualBlock(128)
    self.conv_3_4 = ResidualBlock(128)

    # Conv4_x: 6 Residual Blocks (First Block with Stride 2 for Downsampling)
    self.conv_4_1 = ResidualBlock(256, 2)
    self.conv_4_2 = ResidualBlock(256)
    self.conv_4_3 = ResidualBlock(256)
    self.conv_4_4 = ResidualBlock(256)
    self.conv_4_5 = ResidualBlock(256)
    self.conv_4_6 = ResidualBlock(256)

    # Conv5_x: 3 Residual Blocks (First Block with Stride 2 for Downsampling)
    self.conv_5_1 = ResidualBlock(512, 2)
    self.conv_5_2 = ResidualBlock(512)
    self.conv_5_3 = ResidualBlock(512)

    # Global Average Pooling and Fully Connected Layer
    self.global_pool = GlobalAveragePooling2D()
    self.fc_3 = Dense(num_classes, activation='softmax')

  def call(self, x, training=False):
    """Return class probabilities for the image batch ``x``.

    Args:
      x: Input image tensor.
      training: Forwarded to every convolutional sub-layer (batch-norm mode).
        Defaults to inference mode; Keras passes the proper value itself
        during fit/evaluate/predict.
    """
    # Initial Conv and Pooling (the stem now receives `training` as well)
    x = self.conv_1(x, training=training)
    x = self.max_pool(x)

    # Residual stages Conv2_x .. Conv5_x, applied in order
    residual_blocks = (
        self.conv_2_1, self.conv_2_2, self.conv_2_3,
        self.conv_3_1, self.conv_3_2, self.conv_3_3, self.conv_3_4,
        self.conv_4_1, self.conv_4_2, self.conv_4_3,
        self.conv_4_4, self.conv_4_5, self.conv_4_6,
        self.conv_5_1, self.conv_5_2, self.conv_5_3,
    )
    for block in residual_blocks:
      x = block(x, training=training)

    # Global Average Pooling and Output Layer
    x = self.global_pool(x)
    return self.fc_3(x)

# Purpose: This is the complete ResNet-34 architecture with 34 layers using custom Residual Blocks.
# Why? It closely follows the original ResNet-34 design with grouped residual blocks.

# Line-by-line:
# Conv2_x to Conv5_x: Groups of residual blocks with downsampling at the start of each stage.
# GlobalAveragePooling2D: Reduces each feature map to a single value, preventing overfitting.
# Dense Layer: Output layer with softmax activation for classification.

In [None]:
# # Covariate Shift and Batch Normalization

# # Covariate Shift
# # Covariate Shift refers to a situation where the distribution of the input data changes between training and testing phases, but the conditional distribution of the output given the input remains the same. In simpler terms, it happens when the model is trained on data from one distribution, but when deployed, it encounters data from a different distribution, which can hurt model performance.

# # Batch Normalization (BatchNorm)
# # Batch Normalization is a technique introduced to address internal covariate shift during the training of deep neural networks. It normalizes the activations of each layer by scaling and shifting them, ensuring that the distribution of inputs to each layer remains stable throughout training.

# In 2D Global Average Pooling, the pooling operation averages over all spatial dimensions (height and width) for each feature map (channel) of the input.
# Instead of using traditional pooling methods like max pooling (which extracts the maximum value), global average pooling computes the average value of each feature map over its entire spatial area.

# In tasks where the position of features within a data sample doesn't matter, global average pooling is a good choice; if the position matters, it is not recommended, because the spatial information is averaged away.

In [None]:
# # MobileNet

# MobileNetV2 is a lightweight deep learning model designed specifically for mobile and embedded devices. It uses depthwise separable convolutions to reduce computation while maintaining accuracy. The key feature of MobileNetV2 is its inverted residual structure, which helps in capturing features with fewer parameters.

class DepthwiseSeparableConv(layers.Layer):
    """Depthwise convolution, 1x1 pointwise convolution, batch norm, ReLU.

    Args:
        filters: Number of output channels of the pointwise convolution.
        kernel_size: Kernel size of the depthwise convolution.
        strides: Stride of the depthwise convolution.
        padding: Padding mode of the depthwise convolution.
        expansion: Accepted for interface compatibility; not used by this layer.
    """

    def __init__(self, filters, kernel_size, strides=1, padding='same', expansion=1):
        super().__init__()

        # Spatial filtering: one filter per input channel.
        self.depthwise = layers.DepthwiseConv2D(
            kernel_size=kernel_size,
            strides=strides,
            padding=padding,
        )
        # Channel mixing: 1x1 convolution producing `filters` channels.
        self.pointwise = layers.Conv2D(filters, kernel_size=1, padding='same')
        # Normalization and non-linearity applied after the pointwise step.
        self.batch_norm = layers.BatchNormalization()
        self.activation = layers.ReLU()

    def call(self, inputs, training=False):
        """Apply depthwise -> pointwise -> batch norm -> ReLU to ``inputs``."""
        spatial = self.depthwise(inputs)
        mixed = self.pointwise(spatial)
        normalized = self.batch_norm(mixed, training=training)
        return self.activation(normalized)

# Inverted Residual Block (with Linear Bottleneck)
class InvertedResidualBlock(layers.Layer):
    """Expand (1x1) -> depthwise separable conv -> project (1x1), optional skip.

    Args:
        input_channels: Channel count used to size the expansion.
        output_channels: Channel count produced by the projection.
        strides: Stride of the depthwise separable convolution.
        expansion: Multiplier applied to ``input_channels`` for the expanded width.
    """

    def __init__(self, input_channels, output_channels, strides=1, expansion=6):
        super().__init__()
        expanded_width = input_channels * expansion

        # Expansion: 1x1 convolution widening the tensor, then batch norm + ReLU.
        self.expand = layers.Conv2D(expanded_width, kernel_size=1, padding='same')
        self.expand_bn = layers.BatchNormalization()
        self.expand_relu = layers.ReLU()

        # Depthwise separable convolution at the expanded width.
        self.depthwise = DepthwiseSeparableConv(
            expanded_width, kernel_size=3, strides=strides, padding='same'
        )

        # Projection: 1x1 convolution down to `output_channels`, then batch norm
        # (no activation afterwards).
        self.project = layers.Conv2D(output_channels, kernel_size=1, padding='same')
        self.project_bn = layers.BatchNormalization()

        # A residual connection is only used when the block keeps both the
        # spatial size (stride 1) and the channel count.
        keeps_shape = strides == 1 and input_channels == output_channels
        self.shortcut = layers.Add() if keeps_shape else None

    def call(self, inputs, training=False):
        """Run the block; adds ``inputs`` to the result when a shortcut exists."""
        # Expansion phase
        x = self.expand_relu(self.expand_bn(self.expand(inputs), training=training))

        # Depthwise separable convolution phase
        x = self.depthwise(x, training=training)

        # Projection phase
        x = self.project_bn(self.project(x), training=training)

        # Without a shortcut the projected tensor is the output.
        if self.shortcut is None:
            return x
        return self.shortcut([x, inputs])

# MobileNetV2 Model
class MobileNetV2(Model):
    """MobileNetV2-style classifier assembled from InvertedResidualBlock layers.

    Layout: 3x3 stride-2 convolution (ReLU) + batch norm -> seven inverted
    residual blocks -> 1x1 convolution to 1280 channels -> global average
    pooling -> softmax Dense layer.

    Args:
        num_classes: Number of units of the softmax output layer.
    """

    def __init__(self, num_classes=3):
        super().__init__()

        # Stem: strided 3x3 convolution followed by batch normalization.
        self.conv_1 = layers.Conv2D(32, kernel_size=3, strides=2, padding='same', activation='relu')
        self.bn_1 = layers.BatchNormalization()

        # Inverted residual blocks; each uses the block's default expansion
        # factor and the (input, output) channel sizes listed here.
        self.block_1 = InvertedResidualBlock(32, 16, strides=1)
        self.block_2 = InvertedResidualBlock(16, 24, strides=2)
        self.block_3 = InvertedResidualBlock(24, 32, strides=1)
        self.block_4 = InvertedResidualBlock(32, 64, strides=2)
        self.block_5 = InvertedResidualBlock(64, 96, strides=1)
        self.block_6 = InvertedResidualBlock(96, 160, strides=2)
        self.block_7 = InvertedResidualBlock(160, 320, strides=1)

        # Head: 1x1 convolution to 1280 channels, spatial averaging, classifier.
        self.conv_2 = layers.Conv2D(1280, kernel_size=1, strides=1, padding='same', activation='relu')
        self.global_pool = layers.GlobalAveragePooling2D()
        self.fc = layers.Dense(num_classes, activation='softmax')

    def call(self, inputs, training=False):
        """Return class probabilities for the image batch ``inputs``."""
        # Stem
        x = self.bn_1(self.conv_1(inputs), training=training)

        # Inverted residual blocks, applied in order
        blocks = (
            self.block_1,
            self.block_2,
            self.block_3,
            self.block_4,
            self.block_5,
            self.block_6,
            self.block_7,
        )
        for block in blocks:
            x = block(x, training=training)

        # Head: expand features, pool to one vector per image, classify
        return self.fc(self.global_pool(self.conv_2(x)))


## MobileNet V3

The Squeeze-and-Excitation (SE) Block is a powerful mechanism introduced to enhance the representational power of convolutional neural networks by modeling the interdependencies between channels. It performs channel-wise attention, allowing the network to focus on the most informative features.

The reason this was introduced is that traditional convolutional networks use filters that extract features independently in each channel, so the network is not aware of the relationships between channels.

So this block models the relationships between channels, which helps the network to:

Emphasize informative channels and suppress less useful ones.
Enhance feature maps by learning channel-wise importance.
Improve model performance with minimal additional computational cost.

The SE block has three main stages:

1. Squeeze: Global spatial information is aggregated into a single channel descriptor by using Global Average Pooling. This helps the network understand the global context of the feature maps.

-- The global avg pooling converts each channel into a single scalar value by averaging the spatial dimensions.

2. Excitation: Fully connected layers capture channel-wise dependencies, producing weights for each channel. These weights are then multiplied with the original feature maps to emphasize important channels.

-- It contains two fully connected layers:

--- First Layer: Reduces the number of channels by a factor of reduction (typically 4), capturing cross-channel dependencies.

--- Second Layer: Restores the original number of channels.

The first layer uses ReLU to introduce non-linearity, and the second layer uses Sigmoid to normalize the channel weights to values between 0 and 1.

3. Recalibration (Scale and Excite):

The weights are multiplied by the original feature maps, recalibrating each channel's importance.





## Efficient Net

EfficientNet is a family of convolutional neural networks that scales up models efficiently in three dimensions:

Depth (more layers),
Width (wider layers),
Resolution (higher input image size)

It uses Compound Scaling to uniformly scale all three dimensions, balancing accuracy and computational efficiency.

Key Concepts:

*   Compound Scaling: A balanced approach to scaling network depth, width, and resolution.
* MBConv Blocks: Inverted residual blocks with squeeze-and-excitation layers, optimized for mobile and edge devices.
* Swish Activation: A smoother activation function (x * sigmoid(x)) leading to better performance.
* Depthwise Separable Convolutions: Used to reduce computation and parameter count.



In [None]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import (
    Conv2D, BatchNormalization, ReLU, GlobalAveragePooling2D, Dense,
    DepthwiseConv2D, Add, Layer, Dropout, Activation, Multiply
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy, TopKCategoricalAccuracy
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# Configuration for Learning Rate and Epochs.
# Merge into any CONFIGURATION defined earlier in the notebook instead of
# replacing it, so keys that later cells read (IM_SIZE, N_DENSE_1, N_DENSE_2,
# NUM_CLASSES, CLASS_NAMES, ...) survive this cell. When no earlier
# CONFIGURATION exists, this yields just the two keys below.
CONFIGURATION = {
    **globals().get("CONFIGURATION", {}),
    "LEARNING_RATE": 0.001,
    "N_EPOCHS": 30,
}

# Swish Activation Function
class Swish(Layer):
    """Swish activation: each element is multiplied by its own sigmoid."""

    def call(self, inputs):
        gate = tf.nn.sigmoid(inputs)
        return inputs * gate

# Squeeze and Excitation Block for Channel Attention
class SqueezeAndExcite(Layer):
    """Re-weight the channels of a feature map with learned per-channel gates.

    The input is globally average-pooled (squeeze), passed through a
    two-layer bottleneck ending in a sigmoid (excitation), and the resulting
    gates are multiplied back onto the input.

    Args:
        input_channels: Number of channels of the incoming feature map; the
            second dense layer emits one gate per channel.
        reduction: Divisor applied to ``input_channels`` to size the
            bottleneck dense layer.
        **kwargs: Forwarded to ``Layer`` (e.g. ``name``).
    """

    def __init__(self, input_channels, reduction=4, **kwargs):
        super().__init__(**kwargs)
        # keepdims=True keeps the pooled tensor 4-D so it can broadcast
        # against the input when the gates are applied.
        self.global_pool = GlobalAveragePooling2D(keepdims=True)
        # Floor division can reach 0 for narrow inputs; keep at least 1 unit.
        self.fc1 = Dense(max(1, input_channels // reduction), activation='relu')
        self.fc2 = Dense(input_channels, activation='sigmoid')
        # Built once here instead of instantiating a new layer on every call.
        self.multiply = Multiply()

    def call(self, inputs):
        gates = self.global_pool(inputs)
        gates = self.fc1(gates)
        gates = self.fc2(gates)
        return self.multiply([inputs, gates])

# MBConv Block (Mobile Inverted Bottleneck with Squeeze and Excite)
class MBConv(Layer):
    """Inverted-bottleneck block: expand -> depthwise conv -> SE -> project.

    Args:
        input_channels: Channels of the incoming feature map.
        output_channels: Channels produced by the final 1x1 projection.
        strides: Stride of the depthwise convolution.
        expansion: Multiplier applied to ``input_channels`` for the 1x1
            expansion convolution. The expansion convolution is applied even
            when this is 1.
        kernel_size: Kernel size of the depthwise convolution.
        se_ratio: Its reciprocal (truncated to int) is passed to
            ``SqueezeAndExcite`` as the reduction factor.
        **kwargs: Forwarded to ``Layer`` (e.g. ``name``).
    """

    def __init__(self, input_channels, output_channels, strides=1, expansion=6, kernel_size=3, se_ratio=0.25,
                 **kwargs):
        super().__init__(**kwargs)
        self.strides = strides
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.expanded_channels = input_channels * expansion

        # Expansion Phase
        self.expand_conv = Conv2D(self.expanded_channels, kernel_size=1, padding='same', use_bias=False)
        self.expand_bn = BatchNormalization()
        self.expand_swish = Swish()

        # Depthwise Convolution
        self.depthwise_conv = DepthwiseConv2D(kernel_size=kernel_size, strides=strides, padding='same', use_bias=False)
        self.depthwise_bn = BatchNormalization()
        self.depthwise_swish = Swish()

        # Squeeze and Excite (operates on the expanded channel count)
        self.se = SqueezeAndExcite(self.expanded_channels, reduction=int(1/se_ratio))

        # Output Projection (no activation after the projection batch norm)
        self.project_conv = Conv2D(output_channels, kernel_size=1, padding='same', use_bias=False)
        self.project_bn = BatchNormalization()

        # Shortcut Connection: only valid when the block preserves both the
        # spatial size (stride 1) and the channel count.
        self.use_shortcut = (strides == 1 and input_channels == output_channels)
        # Built once here instead of instantiating a new Add layer per call.
        self.shortcut_add = Add() if self.use_shortcut else None

    def call(self, inputs, training=False):
        x = self.expand_conv(inputs)
        x = self.expand_bn(x, training=training)
        x = self.expand_swish(x)

        x = self.depthwise_conv(x)
        x = self.depthwise_bn(x, training=training)
        x = self.depthwise_swish(x)

        x = self.se(x)

        x = self.project_conv(x)
        x = self.project_bn(x, training=training)

        # Add shortcut connection if applicable
        if self.use_shortcut:
            x = self.shortcut_add([x, inputs])

        return x

# EfficientNet Model Implementation
class EfficientNet(Model):
    """Compact EfficientNet-style image classifier.

    Pipeline: strided stem convolution, seven MBConv stages (one block per
    stage), a 1x1 head convolution, global average pooling, dropout and a
    softmax classification layer.

    Args:
        num_classes: Number of units in the softmax output layer.
    """

    def __init__(self, num_classes=3):
        super().__init__()

        # Stem: 3x3 stride-2 convolution -> batch norm -> Swish
        self.stem_conv = Conv2D(32, kernel_size=3, strides=2, padding='same', use_bias=False)
        self.stem_bn = BatchNormalization()
        self.stem_swish = Swish()

        # MBConv stages: (input channels, output channels, stride, expansion)
        self.block_1 = MBConv(32, 16, strides=1, expansion=1)
        self.block_2 = MBConv(16, 24, strides=2, expansion=6)
        self.block_3 = MBConv(24, 40, strides=2, expansion=6)
        self.block_4 = MBConv(40, 80, strides=2, expansion=6)
        self.block_5 = MBConv(80, 112, strides=1, expansion=6)
        self.block_6 = MBConv(112, 192, strides=2, expansion=6)
        self.block_7 = MBConv(192, 320, strides=1, expansion=6)

        # Head: 1x1 convolution to 1280 channels -> batch norm -> Swish -> pooling
        self.conv_head = Conv2D(1280, kernel_size=1, use_bias=False)
        self.bn_head = BatchNormalization()
        self.head_swish = Swish()
        self.global_pool = GlobalAveragePooling2D()

        # Regularization before the classifier
        self.dropout = Dropout(0.3)

        # Softmax classifier
        self.fc = Dense(num_classes, activation='softmax')

    def call(self, inputs, training=False):
        # Stem
        features = self.stem_conv(inputs)
        features = self.stem_bn(features, training=training)
        features = self.stem_swish(features)

        # MBConv stages, applied in order
        stages = (
            self.block_1, self.block_2, self.block_3, self.block_4,
            self.block_5, self.block_6, self.block_7,
        )
        for stage in stages:
            features = stage(features, training=training)

        # Head
        features = self.conv_head(features)
        features = self.bn_head(features, training=training)
        features = self.head_swish(features)

        # Pool, regularize, classify
        pooled = self.global_pool(features)
        pooled = self.dropout(pooled, training=training)
        return self.fc(pooled)


# Instantiate the network, push a single all-zeros 256x256 RGB image through
# it so that every layer creates its weights, then print the layer summary.
efficient_net_model = EfficientNet(num_classes=3)
efficient_net_model(
    tf.zeros(shape=(1, 256, 256, 3)),
    training=False,
)
efficient_net_model.summary()

## Transfer Learning

Transfer learning is a deep learning technique where a pre-trained model is used as a starting point for a different but related task. Instead of training a model from scratch, you "transfer" the learned features from a model trained on a large dataset (like ImageNet) to your specific problem.

There are two main approaches:

* Feature Extraction: Freeze the pre-trained layers and only train the new classifier on top.
* Fine-Tuning: Unfreeze some top layers of the base model and train them alongside the new classifier.




In [None]:
# Transfer learning in two phases on top of an ImageNet MobileNetV2 backbone:
#   1. feature extraction - backbone frozen, only the new head is trained;
#   2. fine-tuning        - the last 20 backbone layers are unfrozen and the
#                           whole model is trained with a small learning rate.

# Load MobileNetV2 without the top classification layer
backbone_model = tf.keras.applications.MobileNetV2(
    include_top=False,
    weights='imagenet',
    input_shape=(CONFIGURATION["IM_SIZE"], CONFIGURATION["IM_SIZE"], 3)
)

# Freeze the base model layers initially
backbone_model.trainable = False

# Define the input
# NOTE(review): MobileNetV2 ImageNet weights expect inputs scaled to [-1, 1]
# (tf.keras.applications.mobilenet_v2.preprocess_input); confirm the dataset
# pipeline provides that scaling.
pretrained_input = Input(shape=(CONFIGURATION["IM_SIZE"], CONFIGURATION["IM_SIZE"], 3))

# Build the classification head.
# training=False keeps the backbone's BatchNormalization layers in inference
# mode, including later when part of the backbone is unfrozen.
x = backbone_model(pretrained_input, training=False)
x = GlobalAveragePooling2D()(x)
x = Dense(CONFIGURATION["N_DENSE_1"], activation="relu")(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
x = Dense(CONFIGURATION["N_DENSE_2"], activation="relu")(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)

# Output layer
pretrained_output = Dense(CONFIGURATION["NUM_CLASSES"], activation="softmax")(x)

# Assemble the model (named after the backbone actually used)
pretrained_model = Model(inputs=pretrained_input, outputs=pretrained_output, name="MobileNetV2_pretrained")

# Display the model architecture
pretrained_model.summary()

# A model must be compiled before fit(); the metric named "accuracy" is what
# makes 'val_accuracy' available to the checkpoint callback below.
pretrained_model.compile(
    optimizer=Adam(learning_rate=CONFIGURATION["LEARNING_RATE"]),
    loss=CategoricalCrossentropy(),
    metrics=[CategoricalAccuracy(name="accuracy"), TopKCategoricalAccuracy(k=2, name="top_k_accuracy")]
)

# Callbacks
# NOTE(review): early stopping (patience=2) ends training before
# ReduceLROnPlateau (patience=3) can ever trigger - confirm this is intended.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, min_lr=1e-6, verbose=1)
model_checkpoint = ModelCheckpoint('pretrained.keras', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)
callbacks = [early_stopping, reduce_lr, model_checkpoint]

# Build the CutMix training pipeline once and reuse it for both phases
train_dataset = mixed_dataset.map(cutmix)

# Train only the top layers
history = pretrained_model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=CONFIGURATION["N_EPOCHS"],
    verbose=1,
    callbacks=callbacks
)

# Unfreeze the last 20 layers: make the backbone trainable as a whole, then
# re-freeze everything except its final 20 layers.
backbone_model.trainable = True
for layer in backbone_model.layers[:-20]:
    layer.trainable = False

# Recompile with a smaller learning rate for fine-tuning
# (changes to `trainable` only take effect after compile()).
pretrained_model.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss=CategoricalCrossentropy(),
    metrics=[CategoricalAccuracy(name="accuracy"), TopKCategoricalAccuracy(k=2, name="top_k_accuracy")]
)

# Continue training with fine-tuning
history_fine = pretrained_model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=CONFIGURATION["N_EPOCHS"],
    verbose=1,
    callbacks=callbacks
)

## Feature map model

Definition: A feature map is the output of a convolutional layer after applying filters (kernels) to the input image or previous layer's output.

Purpose: It captures different aspects of the input, such as edges, textures, patterns, and more complex features at deeper layers.

Why Are They Important?

* Visualizing Learning: Feature maps help us understand what the model is learning at each layer.
* Debugging and Interpretability: By visualizing feature maps, we can see if the model is focusing on the right parts of the input.
* Transfer Learning: Pre-trained models use feature maps learned from large datasets (like ImageNet) as a starting point for new tasks.

In [None]:
# VGG16 convolutional base with ImageNet weights and no dense classifier on
# top, built for square RGB inputs of side IM_SIZE.
vgg_base_model = tf.keras.applications.vgg16.VGG16(
    weights='imagenet',
    include_top=False,
    input_shape=(CONFIGURATION["IM_SIZE"],) * 2 + (3,),
)

vgg_base_model.summary()

In [None]:
# Gather the output tensor of every layer in the VGG16 base and expose them
# all as outputs of one model, so a single forward pass returns the feature
# map produced at each layer.
feature_maps = [vgg_layer.output for vgg_layer in vgg_base_model.layers]
feature_map_model = Model(inputs=vgg_base_model.input, outputs=feature_maps)

feature_map_model.summary()

## Class Activation Mapping (CAM)

Class Activation Mapping (CAM) is a technique used in Convolutional Neural Networks (CNNs) to highlight the important regions in an image that contribute to a specific class prediction. CAM provides interpretability by generating a heatmap that overlays on the original image, showing which areas influenced the model’s decision.

## Grad-CAM

Grad-CAM (Gradient-weighted Class Activation Mapping) is a technique used in Convolutional Neural Networks (CNNs) to visualize which regions of an image contribute the most to a model's decision. It is an extension of CAM (Class Activation Mapping) that uses gradients, which makes it applicable to a wider range of architectures.

How Grad-CAM Works
* Feature Map Extraction – The convolutional layers of a CNN extract feature maps from the input image.
* Gradient Computation – The gradients of the target class score (final prediction) are computed with respect to the feature maps. Only the target class score is back-propagated; the gradient signal for all other classes is set to 0.
* Weight Calculation – The gradients are averaged spatially to obtain importance weights for each feature map.
* Heatmap Generation – A weighted sum of the feature maps is taken, followed by a ReLU activation to remove negative values.
* Overlay on Original Image – The heatmap is resized and overlaid on the original image to highlight important regions.

In [None]:
# Pretrain a model
# Frozen EfficientNetB5 backbone with a small dense head. The head is attached
# directly to backbone_model.output, so the backbone's layers are part of
# pretrained_model itself.

# Initialize the EfficientNetB5 model as the backbone, without the top layers and using ImageNet weights.
backbone_model = tf.keras.applications.efficientnet.EfficientNetB5(
    include_top=False,  # Exclude the top classification layer
    weights="imagenet",  # Use pretrained ImageNet weights
    input_shape=(CONFIGURATION["IM_SIZE"], CONFIGURATION["IM_SIZE"], 3),  # Define input image shape
)

# Set the backbone model to non-trainable (freeze its layers).
backbone_model.trainable = False

# Get the output of the backbone model.
x = backbone_model.output

# Global average pooling reduces each feature map to one value per channel.
x = GlobalAveragePooling2D()(x)

# First dense block: Dense (N_DENSE_1 units, ReLU) -> BatchNorm -> Dropout.
x = Dense(CONFIGURATION["N_DENSE_1"], activation="relu")(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)

# Second dense block: Dense (N_DENSE_2 units, ReLU) -> BatchNorm -> Dropout.
x = Dense(CONFIGURATION["N_DENSE_2"], activation="relu")(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)

# Final output layer with softmax activation for multi-class classification.
pretrained_output = Dense(CONFIGURATION["NUM_CLASSES"], activation="softmax")(x)

# Create the complete model by defining inputs and outputs, using the EfficientNetB5 backbone.
pretrained_model = Model(inputs=backbone_model.inputs, outputs=pretrained_output, name="EfficientNetB5_pretrained")

# Display a summary of the model architecture.
pretrained_model.summary()

# This is a brand-new Model object, so it must be compiled before fit().
# The metric named "accuracy" is what makes 'val_accuracy' available to the
# checkpoint callback below.
pretrained_model.compile(
    optimizer=Adam(learning_rate=CONFIGURATION["LEARNING_RATE"]),
    loss=CategoricalCrossentropy(),
    metrics=[CategoricalAccuracy(name="accuracy"), TopKCategoricalAccuracy(k=2, name="top_k_accuracy")]
)

# Callbacks

# Early stopping callback: stop training if the validation loss doesn't improve for 3 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)

# Reduce learning rate on plateau callback: reduce learning rate by a factor of 0.1 if validation loss doesn't improve.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, min_lr=1e-6, verbose=1)

# Model checkpoint callback: save the best model based on validation accuracy.
model_checkpoint = ModelCheckpoint('pretrained_eff_b5.keras', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)

# List of callbacks to be used during training.
callbacks = [early_stopping, reduce_lr, model_checkpoint]

# Fit the model using the training data with the CutMix data augmentation technique.
history = pretrained_model.fit(
    mixed_dataset.map(cutmix),  # Apply CutMix augmentation to the training dataset
    validation_data=validation_dataset,  # Use a validation dataset for evaluation
    epochs=10,  # Train the model for 10 epochs
    verbose=1,  # Show training progress
    callbacks=callbacks  # Use the defined callbacks
)


In [None]:
# Grad-CAM for one test image: split the trained model at its last
# convolutional activation, take the gradient of the top class score with
# respect to that activation, and overlay the resulting heatmap on the image.

# Load pre-trained model weights from a saved file
pretrained_model.load_weights('/content/pretrained_eff_b5.keras')

# Read and preprocess a test image
test_image_path = "/content/dataset/Emotions Dataset/Emotions Dataset/test/angry/101071.jpg_rotation_1.jpg"
test_image = cv2.imread(test_image_path)
if test_image is None:
    # cv2.imread reports failure by returning None instead of raising.
    raise FileNotFoundError(f"Could not read test image: {test_image_path}")
test_image = cv2.resize(test_image, (CONFIGURATION["IM_SIZE"], CONFIGURATION["IM_SIZE"]))  # Match the model input size

# Convert the image to a float tensor and add a batch dimension
# NOTE(review): cv2.imread returns BGR channel order; if the training pipeline
# fed RGB images, convert with cv2.COLOR_BGR2RGB before predicting - confirm
# against the dataset loader.
im = tf.constant(test_image, dtype=tf.float32)
img_array = tf.expand_dims(im, axis=0)  # shape: (1, height, width, channels)

# Print the shape of the image array
print(img_array.shape)

# Make a prediction using the model and map the predicted class index to its name
prediction = CONFIGURATION["CLASS_NAMES"][tf.argmax(pretrained_model(img_array), axis=-1).numpy()[0]]
print(prediction)

# Get the last convolutional layer's output
last_conv_layer_name = "top_activation"
last_conv_layer = pretrained_model.get_layer(last_conv_layer_name)

# Model mapping the input image to the activations of the last convolutional layer
last_conv_layer_model = tf.keras.Model(inputs=pretrained_model.inputs, outputs=last_conv_layer.output, name="final_conv_layer")
last_conv_layer_model.summary()

# The classifier head is every layer that follows the last convolutional
# layer. Deriving the names from the model (instead of hard-coding
# auto-generated names such as "dense_1") keeps this working regardless of
# how many layers were created earlier in the session, and includes any
# normalization/dropout layers so the replay matches the trained head.
# Assumes the layers after `last_conv_layer_name` form a simple chain.
head_start = pretrained_model.layers.index(last_conv_layer) + 1
classifier_layer_names = [head_layer.name for head_layer in pretrained_model.layers[head_start:]]

# Rebuild the head as a stand-alone model fed by the last conv layer's output
classifier_input = Input(shape=last_conv_layer.output.shape[1:])
x = classifier_input
for layer_name in classifier_layer_names:
    x = pretrained_model.get_layer(layer_name)(x)

# Create a classifier model from the input and output
classifier_model = Model(classifier_input, x)

# Use GradientTape to record the operations needed for the gradient
with tf.GradientTape() as tape:
    # Get the output of the last conv layer
    last_conv_layer_output = last_conv_layer_model(img_array)
    # Watch the activation explicitly: it is a plain tensor, and with frozen
    # (non-trainable) upstream weights the tape would not track it otherwise.
    tape.watch(last_conv_layer_output)

    # Get the classifier model's prediction
    prediction = classifier_model(last_conv_layer_output)

    # Score of the class with the highest predicted probability
    top_pred_index = tf.argmax(prediction[0])
    top_class_channel = prediction[:, top_pred_index]

# Gradients of the top predicted class with respect to the last conv layer's output
grads = tape.gradient(top_class_channel, last_conv_layer_output)

# Pool the gradients by averaging over batch, height and width -> one weight per channel
pool_grads = tf.reduce_mean(grads, axis=(0, 1, 2)).numpy()

# Drop the batch dimension and weight every channel by its pooled gradient
# (broadcast over the last axis instead of a per-channel Python loop)
last_conv_layer_output = last_conv_layer_output.numpy()[0]
last_conv_layer_output = last_conv_layer_output * pool_grads

# Create a heatmap by summing across the channel axis of the weighted feature map
heatmap = np.sum(last_conv_layer_output, axis=-1)
heatmap = tf.nn.relu(heatmap)  # Keep only positive contributions

# Convert the heatmap to a numpy array
heatmap_np = heatmap.numpy()

# Normalize the heatmap to [0, 1]; skip when it is all zeros to avoid NaNs
heatmap_max = np.max(heatmap_np)
if heatmap_max > 0:
    heatmap_np = heatmap_np / heatmap_max

# Resize the heatmap to match the original image size (cv2 expects (width, height))
resized_heatmap = cv2.resize(heatmap_np, (img_array.shape[2], img_array.shape[1]))

# Apply a color map (jet) to the heatmap to make it visually interpretable
colored_heatmap = cv2.applyColorMap(np.uint8(255 * resized_heatmap), cv2.COLORMAP_JET)

# img_array already holds 0-255 pixel values, so cast directly to uint8
# (rescaling by 255 here would overflow the uint8 range).
original_img = img_array[0].numpy().astype(np.uint8)

# If the original image is grayscale, convert it to BGR format for display
if original_img.shape[-1] == 1:
    original_img = cv2.cvtColor(original_img, cv2.COLOR_GRAY2BGR)

# Blend the original image with the colored heatmap (alpha blending)
alpha = 0.5  # Transparency of the overlay
overlay = cv2.addWeighted(original_img, 1 - alpha, colored_heatmap, alpha, 0)

# Display the final overlay image with the heatmap on top
plt.imshow(cv2.cvtColor(overlay, cv2.COLOR_BGR2RGB))  # Convert from BGR to RGB for proper display
plt.axis("off")  # Remove axis from the plot
plt.show()  # Show the image
