<a href="https://colab.research.google.com/github/yeabwang/Human-Emotions-Detection/blob/main/Note_on_state_of_art_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import cv2
from sklearn.metrics import confusion_matrix, roc_curve
import seaborn as sns
import datetime
import pathlib
import io
import os
import time
import random
from google.colab import files
from PIL import Image
import albumentations as A
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
import matplotlib.cm as cm
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import (GlobalAveragePooling2D, Activation, MaxPooling2D, Add, Conv2D, MaxPool2D, Dense,
                                     Flatten, InputLayer, BatchNormalization, Input, Embedding, Permute,
                                     Dropout, RandomFlip, RandomRotation, LayerNormalization, MultiHeadAttention,
                                     RandomContrast, Rescaling, Resizing, Reshape)
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import (Callback, CSVLogger, EarlyStopping, LearningRateScheduler,
                                        ModelCheckpoint, ReduceLROnPlateau)
from tensorflow.keras.regularizers  import L2, L1
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Example, Features, Feature
from google.colab import drive

## Introduction to Sequential API

The Sequential API is the simplest way to create a model in TensorFlow using Keras. It allows you to stack layers sequentially, meaning each layer has exactly one input tensor and one output tensor. This is suitable for most feedforward neural networks where the model is a straight line of layers.

In [None]:
# Hyperparameters and dataset metadata read by the model definition below.
CONFIGURATION = {
    # --- Input pipeline ---
    "BATCH_SIZE": 32,
    "IM_SIZE": 256,                 # side length images are resized to
    # --- Training settings (not consumed by the model definition itself) ---
    "LEARNING_RATE": 0.001,
    "N_EPOCHS": 3,
    # --- Regularisation ---
    "DROPOUT_RATE": 0.0,            # 0.0 means the Dropout layers drop nothing
    "REGULARIZATION_RATE": 0.0,     # L2 factor for conv/dense kernels
    # --- Convolution / pooling geometry ---
    "N_FILTERS": 6,
    "KERNEL_SIZE": 3,
    "N_STRIDES": 1,
    "POOL_SIZE": 2,
    # --- Dense head ---
    "N_DENSE_1": 1024,
    "N_DENSE_2": 128,
    "NUM_CLASSES": 3,
    # --- Not read by the visible models; presumably for a later
    #     patch-based model (TODO confirm) ---
    "PATCH_SIZE": 16,
    "PROJ_DIM": 768,
    # --- Label names, one per class ---
    "CLASS_NAMES": ["angry", "happy", "sad"],
}

# Preprocessing stage used as the first layer of the model below.
# The model referenced `resize_rescale_layers` without it being defined in
# this cell (or anywhere earlier in the notebook as visible here), which
# raises a NameError. It is therefore created here from CONFIGURATION.
resize_rescale_layers = tf.keras.Sequential(
    [
        # Resize every image to IM_SIZE x IM_SIZE so that Flatten (further
        # down) receives feature maps of a fixed, known size.
        Resizing(CONFIGURATION["IM_SIZE"], CONFIGURATION["IM_SIZE"]),
        # Multiply pixel values by 1/255 (maps [0, 255] inputs to [0, 1]).
        Rescaling(1.0 / 255),
    ]
)

# LeNet-style classifier built with the Sequential API: two
# Conv -> BatchNorm -> MaxPool blocks followed by a dense head.
lenet_model = tf.keras.Sequential(
    [
        # Accepts images of any height and width (None, None) with 3 channels.
        # The fixed spatial size is imposed by the resize layer that follows.
        InputLayer(shape=(None, None, 3)),

        # Resize to (IM_SIZE, IM_SIZE) and rescale pixel values by 1/255.
        resize_rescale_layers,

        # ---- Convolutional block 1 ----
        # filters: number of feature maps produced.
        # kernel_size: size of the convolution window (KERNEL_SIZE x KERNEL_SIZE).
        # strides: step size of the window.
        # padding='valid': no padding, so the output is slightly smaller.
        # kernel_regularizer: L2 penalty on the kernel weights.
        Conv2D(
            filters=CONFIGURATION["N_FILTERS"],
            kernel_size=CONFIGURATION["KERNEL_SIZE"],
            strides=CONFIGURATION["N_STRIDES"],
            padding='valid',
            activation='relu',
            kernel_regularizer=L2(CONFIGURATION["REGULARIZATION_RATE"]),
        ),
        # Normalizes the convolution output to stabilise and speed up training.
        BatchNormalization(),
        # Downsamples the feature maps; the stride is twice the conv stride.
        MaxPool2D(
            pool_size=CONFIGURATION["POOL_SIZE"],
            strides=CONFIGURATION["N_STRIDES"] * 2,
        ),
        # Drops a fraction DROPOUT_RATE of activations during training
        # (a rate of 0.0 leaves the activations untouched).
        Dropout(rate=CONFIGURATION["DROPOUT_RATE"]),

        # ---- Convolutional block 2 ----
        # Same geometry as block 1 with more filters (N_FILTERS * 2 + 4).
        Conv2D(
            filters=CONFIGURATION["N_FILTERS"] * 2 + 4,
            kernel_size=CONFIGURATION["KERNEL_SIZE"],
            strides=CONFIGURATION["N_STRIDES"],
            padding='valid',
            activation='relu',
            kernel_regularizer=L2(CONFIGURATION["REGULARIZATION_RATE"]),
        ),
        BatchNormalization(),
        MaxPool2D(
            pool_size=CONFIGURATION["POOL_SIZE"],
            strides=CONFIGURATION["N_STRIDES"] * 2,
        ),

        # Converts the 2D feature maps into a 1D vector for the dense layers.
        Flatten(),

        # ---- Dense head ----
        # Two fully connected ReLU layers (N_DENSE_1, then N_DENSE_2 units),
        # each followed by batch normalization; dropout after the first one.
        Dense(
            CONFIGURATION["N_DENSE_1"],
            activation="relu",
            kernel_regularizer=L2(CONFIGURATION["REGULARIZATION_RATE"]),
        ),
        BatchNormalization(),
        Dropout(rate=CONFIGURATION["DROPOUT_RATE"]),

        Dense(
            CONFIGURATION['N_DENSE_2'],
            activation="relu",
            kernel_regularizer=L2(CONFIGURATION["REGULARIZATION_RATE"]),
        ),
        BatchNormalization(),

        # ---- Output layer ----
        # One unit per class (NUM_CLASSES, i.e. one per entry of CLASS_NAMES);
        # softmax turns the outputs into class probabilities.
        Dense(CONFIGURATION["NUM_CLASSES"], activation="softmax"),
    ]
)

lenet_model.summary()

## Introduction to Functional API

Unlike the Sequential API, where layers are stacked one after another, the Functional API lets you define the computation graph as a directed acyclic graph (DAG).
You explicitly define the flow of data between layers, making it suitable for complex neural network architectures.

### When to Use the Functional API:
- When you need multiple inputs or outputs.
- When you want to use shared layers.
- When building architectures with skip connections or residual blocks.
- When you need more flexibility and control over the model design.

In [None]:
from tensorflow.keras import layers, Model
from tensorflow.keras.layers import Input, Conv2D, BatchNormalization, MaxPooling2D, Dropout, Flatten, Dense
from tensorflow.keras.regularizers import L2

# Hyperparameters and dataset metadata read by the functional model below.
CONFIGURATION = {
    # --- Input pipeline ---
    "BATCH_SIZE": 32,
    "IM_SIZE": 256,                 # target side length after resizing
    # --- Training settings (not consumed by the model definition itself) ---
    "LEARNING_RATE": 0.001,
    "N_EPOCHS": 3,
    # --- Regularisation ---
    "DROPOUT_RATE": 0.0,            # 0.0 means the Dropout layer drops nothing
    "REGULARIZATION_RATE": 0.0,     # L2 factor for conv/dense kernels
    # --- Convolution / pooling geometry ---
    "N_FILTERS": 6,
    "KERNEL_SIZE": 3,
    "N_STRIDES": 1,
    "POOL_SIZE": 2,
    # --- Dense head ---
    "N_DENSE_1": 1024,
    "N_DENSE_2": 128,
    "NUM_CLASSES": 3,
    # --- Not read by the visible models; presumably for a later
    #     patch-based model (TODO confirm) ---
    "PATCH_SIZE": 16,
    "PROJ_DIM": 768,
    # --- Label names, one per class ---
    "CLASS_NAMES": ["angry", "happy", "sad"],
}

# Preprocessing pipeline applied to every image before feature extraction:
#   1. resize to (IM_SIZE, IM_SIZE)
#   2. multiply pixel values by 1/255 (maps [0, 255] inputs to [0, 1])
resize_rescale_layer = tf.keras.Sequential()
resize_rescale_layer.add(
    layers.Resizing(
        height=CONFIGURATION["IM_SIZE"],
        width=CONFIGURATION["IM_SIZE"],
    )
)
resize_rescale_layer.add(layers.Rescaling(scale=1. / 255))

# ========================
#    Feature Extractor
# ========================
# Functional-API sub-model that maps an RGB image of any size to a flat
# feature vector: preprocessing, then two Conv -> BatchNorm -> MaxPool blocks,
# then Flatten.

# Symbolic input: any height and width, 3 colour channels.
future_input = Input(shape=(None, None, 3))

# The stages are listed in the order they are applied and then threaded
# through `x` one after another.
x = future_input
for _stage in (
    # Resize to a common size and rescale the pixel values.
    resize_rescale_layer,
    # Block 1: convolution (L2-regularised kernel), normalisation, downsampling.
    Conv2D(
        filters=CONFIGURATION["N_FILTERS"],
        kernel_size=CONFIGURATION["KERNEL_SIZE"],
        strides=CONFIGURATION["N_STRIDES"],
        padding='valid',
        activation='relu',
        kernel_regularizer=L2(CONFIGURATION["REGULARIZATION_RATE"]),
    ),
    BatchNormalization(),
    MaxPooling2D(
        pool_size=CONFIGURATION["POOL_SIZE"],
        strides=CONFIGURATION["N_STRIDES"] * 2,
    ),
    # Block 2: same geometry with more filters (N_FILTERS * 2 + 4).
    Conv2D(
        filters=CONFIGURATION["N_FILTERS"] * 2 + 4,
        kernel_size=CONFIGURATION["KERNEL_SIZE"],
        strides=CONFIGURATION["N_STRIDES"],
        padding='valid',
        activation='relu',
        kernel_regularizer=L2(CONFIGURATION["REGULARIZATION_RATE"]),
    ),
    BatchNormalization(),
    MaxPooling2D(
        pool_size=CONFIGURATION["POOL_SIZE"],
        strides=CONFIGURATION["N_STRIDES"] * 2,
    ),
):
    x = _stage(x)

# Collapse the 2D feature maps into a 1D feature vector.
future_output = Flatten()(x)

# Wrap input -> features as a reusable model.
future_model = Model(
    inputs=future_input,
    outputs=future_output,
    name="Feature_Extractor",
)

# ========================
#   Classification Model
# ========================
# Dense classification head stacked on top of the feature extractor.
#
# The head previously went straight from the first dense block to the output
# layer, leaving CONFIGURATION["N_DENSE_2"] unused and making this model differ
# from the Sequential and subclassed versions of the same network in this
# notebook. The second dense block is now included so that all three
# definitions describe the same architecture.

# Input Layer for Classification Model
# - Accepts images of any height and width with 3 color channels
class_input = Input(shape=(None, None, 3))

# Use Feature Extractor
# - Reuses the feature extraction part of the model
x = future_model(class_input)

# Fully Connected Layer 1
# - N_DENSE_1 units, ReLU, L2-regularised kernel
x = Dense(CONFIGURATION["N_DENSE_1"],
          activation='relu',
          kernel_regularizer=L2(CONFIGURATION["REGULARIZATION_RATE"]))(x)

# Batch Normalization
# - Speeds up training and stabilizes learning
x = BatchNormalization()(x)

# Dropout Layer
# - Randomly sets a fraction DROPOUT_RATE of inputs to 0 during training
x = Dropout(rate=CONFIGURATION["DROPOUT_RATE"])(x)

# Fully Connected Layer 2
# - N_DENSE_2 units, ReLU, L2-regularised kernel
x = Dense(CONFIGURATION["N_DENSE_2"],
          activation='relu',
          kernel_regularizer=L2(CONFIGURATION["REGULARIZATION_RATE"]))(x)

# Batch Normalization
# - Normalizes the output of the second dense layer
x = BatchNormalization()(x)

# Output Layer
# - One unit per class with softmax activation for multi-class classification
class_output = Dense(CONFIGURATION["NUM_CLASSES"], activation='softmax')(x)

# Create Classification Model
class_model = Model(inputs=class_input, outputs=class_output, name="Classification_Model")

# Model Summary
class_model.summary()

## Introduction to Model Subclassing in TensorFlow

Model subclassing in TensorFlow is a flexible way to build neural networks by directly inheriting from tf.keras.Model. This approach allows you to fully customize the forward pass (call() method) and define complex architectures that aren't easily implemented using the Sequential or Functional APIs.



In [None]:
# Hyperparameters and dataset metadata read by the subclassed model below.
CONFIGURATION = {
    # --- Input pipeline ---
    "BATCH_SIZE": 32,
    "IM_SIZE": 256,                 # target side length after resizing
    # --- Training settings (not consumed by the model definition itself) ---
    "LEARNING_RATE": 0.001,
    "N_EPOCHS": 3,
    # --- Regularisation ---
    "DROPOUT_RATE": 0.2,            # fraction of activations dropped in training
    "REGULARIZATION_RATE": 0.0001,  # L2 factor for conv/dense kernels
    # --- Convolution / pooling geometry ---
    "N_FILTERS": 6,
    "KERNEL_SIZE": 3,
    "N_STRIDES": 1,
    "POOL_SIZE": 2,
    # --- Dense head ---
    "N_DENSE_1": 1024,
    "N_DENSE_2": 128,
    "NUM_CLASSES": 3,
    # --- Label names, one per class ---
    "CLASS_NAMES": ["angry", "happy", "sad"],
}

# Preprocessing stage: resize images to (IM_SIZE, IM_SIZE), then multiply the
# pixel values by 1/255 (maps [0, 255] inputs to [0, 1]).
resize_rescale_layer = tf.keras.Sequential(
    [
        layers.Resizing(
            height=CONFIGURATION["IM_SIZE"],
            width=CONFIGURATION["IM_SIZE"],
        ),
        layers.Rescaling(scale=1. / 255),
    ]
)

# Feature Extractor Class
class LeNetModelFutureExtractor(Layer):
    """LeNet-style convolutional feature extractor.

    Applies resize/rescale preprocessing followed by two
    Conv2D -> BatchNormalization -> MaxPooling2D blocks and flattens the
    result into a 1D feature vector per image.

    ("Future" in the class name stands for "Feature"; the name is kept so
    existing references keep working.)

    Args:
        config: Mapping providing "IM_SIZE", "N_FILTERS", "KERNEL_SIZE",
            "N_STRIDES", "POOL_SIZE" and "REGULARIZATION_RATE".
        **kwargs: Standard Keras ``Layer`` keyword arguments (e.g. ``name``),
            forwarded to the base class.
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        # Store configuration for reuse
        self.config = config

        # Preprocessing Layer
        # Built per instance from `config` rather than taken from a
        # module-level layer, so the extractor honours the IM_SIZE it was
        # given and does not depend on a global being defined first.
        self.resize_rescale = tf.keras.Sequential([
            layers.Resizing(config["IM_SIZE"], config["IM_SIZE"]),
            layers.Rescaling(1./255)               # multiply pixel values by 1/255
        ])

        # First Convolutional Block
        self.conv1 = layers.Conv2D(
            filters=config["N_FILTERS"],           # Number of filters
            kernel_size=config["KERNEL_SIZE"],     # Size of the convolution kernel
            strides=config["N_STRIDES"],           # Stride length
            padding='valid',                       # No padding, reduces output size
            activation='relu',                     # Activation function for non-linearity
            kernel_regularizer=L2(config["REGULARIZATION_RATE"]) # L2 regularization
        )
        self.bn1 = layers.BatchNormalization()    # Normalizes activations to stabilize learning
        self.pool1 = layers.MaxPooling2D(
            pool_size=config["POOL_SIZE"],          # Downsamples feature map size
            strides=config["N_STRIDES"]*2           # Stride for pooling
        )

        # Second Convolutional Block
        self.conv2 = layers.Conv2D(
            filters=config["N_FILTERS"]*2 + 4,     # Increasing number of filters
            kernel_size=config["KERNEL_SIZE"],
            strides=config["N_STRIDES"],
            padding='valid',
            activation='relu',
            kernel_regularizer=L2(config["REGULARIZATION_RATE"])
        )
        self.bn2 = layers.BatchNormalization()
        self.pool2 = layers.MaxPooling2D(
            pool_size=config["POOL_SIZE"],
            strides=config["N_STRIDES"]*2
        )

        # Flatten Layer
        self.flatten = layers.Flatten()            # Converts 2D feature maps to 1D vector

    def call(self, x, training=None):
        """Run the forward pass.

        Args:
            x: Batch of images with 3 channels (resized internally).
            training: Optional flag forwarded to the batch-normalization
                layers so they use batch statistics in training and moving
                averages in inference. ``None`` lets Keras decide from the
                calling context.

        Returns:
            The flattened feature tensor.
        """
        x = self.resize_rescale(x)                 # Resize and Rescale Input
        x = self.conv1(x)                          # First Convolutional Layer
        x = self.bn1(x, training=training)         # Batch Normalization
        x = self.pool1(x)                          # Max Pooling
        x = self.conv2(x)                          # Second Convolutional Layer
        x = self.bn2(x, training=training)         # Batch Normalization
        x = self.pool2(x)                          # Max Pooling
        x = self.flatten(x)                        # Flatten for Dense Layers
        return x

# Classification Model Class
class LeNetClassification(Model):
    """LeNet-style classifier: feature extractor followed by a dense head.

    The head is Dense(N_DENSE_1) -> BatchNorm -> Dropout -> Dense(N_DENSE_2)
    -> BatchNorm -> Dense(NUM_CLASSES, softmax).

    Args:
        config: Mapping with the keys read by ``LeNetModelFutureExtractor``
            plus "N_DENSE_1", "N_DENSE_2", "NUM_CLASSES", "DROPOUT_RATE" and
            "REGULARIZATION_RATE".
    """

    def __init__(self, config):
        super().__init__()

        # Convolutional front end that turns images into flat feature vectors.
        self.feature_extractor = LeNetModelFutureExtractor(config)

        # First dense block: ReLU layer, normalisation, dropout.
        self.fc1 = layers.Dense(
            units=config["N_DENSE_1"],
            activation='relu',
            kernel_regularizer=L2(config["REGULARIZATION_RATE"]),
        )
        self.bn3 = layers.BatchNormalization()
        self.dropout1 = layers.Dropout(rate=config["DROPOUT_RATE"])

        # Second dense block: ReLU layer and normalisation (no dropout).
        self.fc2 = layers.Dense(
            units=config["N_DENSE_2"],
            activation='relu',
            kernel_regularizer=L2(config["REGULARIZATION_RATE"]),
        )
        self.bn4 = layers.BatchNormalization()

        # Softmax output with one unit per class.
        self.output_layer = layers.Dense(
            units=config["NUM_CLASSES"],
            activation='softmax',
        )

    def call(self, x):
        """Return class probabilities for a batch of images ``x``."""
        # Stages in the order they are applied to the input.
        pipeline = (
            self.feature_extractor,
            self.fc1,
            self.bn3,
            self.dropout1,
            self.fc2,
            self.bn4,
            self.output_layer,
        )
        for stage in pipeline:
            x = stage(x)
        return x

# Create the classifier from the shared configuration.
leNet_class_model = LeNetClassification(config=CONFIGURATION)

# Build with a concrete input shape (batch, IM_SIZE, IM_SIZE, 3) so the
# weights exist before the summary is printed.
leNet_class_model.build(
    input_shape=(
        None,
        CONFIGURATION["IM_SIZE"],
        CONFIGURATION["IM_SIZE"],
        3,
    )
)

# Print the layer/parameter overview.
leNet_class_model.summary()

In [None]:
#AlexNet was a breakthrough in deep learning, winning the ImageNet Large Scale Visual Recognition Challenge (ILSVRC) 2012 and pioneering modern CNN architectures.

# Architecture: 8 layers (5 convolutional + 3 fully connected).

# Input Size: 227 × 227 × 3 (RGB images).

# Convolutional Layers:
#### Conv1: 96 filters, 11×11 kernel, stride 4, ReLU.
#### Conv2: 256 filters, 5×5 kernel, stride 1, ReLU.
#### Conv3: 384 filters, 3×3 kernel, stride 1, ReLU.
#### Conv4: 384 filters, 3×3 kernel, stride 1, ReLU.
#### Conv5: 256 filters, 3×3 kernel, stride 1, ReLU.

# Max Pooling: After Conv1, Conv2, and Conv5 (3×3 kernel, stride 2).

# Fully Connected Layers:
#### FC6: 4096 neurons, ReLU.
#### FC7: 4096 neurons, ReLU.
#### FC8 (Output): 1000 neurons (ImageNet classes), Softmax.

# Activation Function: ReLU (introduced to speed up training).
# Normalization: Local Response Normalization (LRN) after Conv1 and Conv2.
# Regularization: Dropout (0.5) in FC6 and FC7.
# Optimization: Stochastic Gradient Descent (SGD) with momentum (0.9).
# Batch Size: 128.
# Weight Initialization: Gaussian distribution.
# Data Augmentation: Cropping, flipping, and color jittering.
# Training Dataset: ImageNet (1.2 million images, 1000 classes).
# Parallel Training: Two GPUs used to split model layers for efficiency

In [None]:
## VGG Model
# Key Features:
# Deep Network: 16 (VGG16) or 19 (VGG19) layers.
# Uniform Kernel Size: Only 3×3 convolution layers to maintain consistency.
# Increased Depth: More layers compared to AlexNet for hierarchical feature learning.
# Regularization: Dropout (0.5) in fully connected layers.
# Optimization: SGD with momentum (0.9), batch size = 256.
# Weight Initialization: Pretrained on ImageNet, useful for transfer learning.
# Data Augmentation: Cropping, flipping, and color jittering
# VGG16 and VGG19 are the most common variants. The main difference is the number of convolutional layers: VGG16 uses 13 convolutional layers and VGG19 uses 16 (both add 3 fully connected layers).
# Stacked small convolutional filters (3×3 kernel, stride 1, padding 1) for deeper representations.
# Uses 2×2 max pooling (stride 2) after every block for downsampling.


# Layers - Vgg16
# Input Size: 224 × 224 × 3 (RGB images).

# Conv Layers:
#### Block 1: 2 × (64 filters, 3×3, ReLU) → Max Pooling
#### Block 2: 2 × (128 filters, 3×3, ReLU) → Max Pooling
#### Block 3: 3 × (256 filters, 3×3, ReLU) → Max Pooling
#### Block 4: 3 × (512 filters, 3×3, ReLU) → Max Pooling
#### Block 5: 3 × (512 filters, 3×3, ReLU) → Max Pooling

# Fully Connected Layers:
#### FC6: 4096 neurons, ReLU
#### FC7: 4096 neurons, ReLU
#### FC8 (Output): 1000 neurons (Softmax for classification)



In [None]:
#RESNET MODEL

# ResNet introduced residual learning to address the vanishing gradient problem, allowing for extremely deep networks.

# Key Features:
#### Deep Architecture: Can scale up to ResNet-18, ResNet-34, ResNet-50, ResNet-101, ResNet-152.
#### Residual Connections (Skip Connections):
####### Instead of directly learning H(x), it learns F(x) = H(x) - x, making optimization easier.
#### Helps gradients flow smoothly during backpropagation.
#### Batch Normalization: Used after every convolution to stabilize training.
#### ReLU Activation: Applied after each convolutional layer.

# ResNet-18 Layer-by-Layer Breakdown
# Here is the detailed layer-wise breakdown for ResNet-18:

# Conv1 (Initial Convolutional Layer):

# Operation: 7×7 Convolution, 64 filters, stride 2
# Output Size: 112 × 112 × 64
# MaxPool:

# Operation: 3×3 Max Pooling, stride 2
# Output Size: 56 × 56 × 64
# Conv2_x (Residual Block 1 and 2):

# Operation: 2 × Basic Residual Blocks (each with 2x 3×3 convolutions, 64 filters)
# Output Size: 56 × 56 × 64
# Conv3_x (Residual Block 3 and 4):

# Operation: 2 × Basic Residual Blocks (each with 2x 3×3 convolutions, 128 filters), stride 2
# Output Size: 28 × 28 × 128
# Conv4_x (Residual Block 5 and 6):

# Operation: 2 × Basic Residual Blocks (each with 2x 3×3 convolutions, 256 filters), stride 2
# Output Size: 14 × 14 × 256
# Conv5_x (Residual Block 7 and 8):

# Operation: 2 × Basic Residual Blocks (each with 2x 3×3 convolutions, 512 filters), stride 2
# Output Size: 7 × 7 × 512
# AvgPool (Global Average Pooling):

# Operation: Global Average Pooling
# Output Size: 1 × 1 × 512
# Fully Connected (FC):

# Operation: Fully Connected layer (512 → 1000 classes)
# Output Size: 1 × 1 × 1000 (classification result)


## So we can see ResNet as a collection of shallower networks: the skip connections let the signal bypass any block whose residual contribution F(x) is (close to) zero.
## Firstly, this helps the model avoid vanishing gradients.
## Secondly, it performs well because it acts like a collection of various shallow paths, and the model chooses its path based on the conditions.



In [None]:
# # Covariate Shift and Batch Normalization

# # Covariate Shift
# # Covariate Shift refers to a situation where the distribution of the input data changes between training and testing phases, but the conditional distribution of the output given the input remains the same. In simpler terms, it happens when the model is trained on data from one distribution, but when deployed, it encounters data from a different distribution, which can hurt model performance.

# # Batch Normalization (BatchNorm)
# # Batch Normalization is a technique introduced to address internal covariate shift during the training of deep neural networks. It normalizes the activations of each layer by scaling and shifting them, ensuring that the distribution of inputs to each layer remains stable throughout training.

# In 2D Global Average Pooling, the pooling operation averages over all spatial dimensions (height and width) for each feature map (channel) of the input.
# Instead of using traditional pooling methods like max pooling (which extracts the maximum value), global average pooling computes the average value of each feature map over its entire spatial area.

# In tasks where the position of the pixels in a data sample doesn't matter, global average pooling is a good choice; if the position matters, it's not recommended.