In [29]:
## Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

## Local Response Normalization

In [30]:
# Local Response Normalization (LRN) is a normalization layer that
# rescales each activation using the activations at the same spatial
# position in a small window of neighbouring channels.  It's less
# common now, often superseded by batch normalization, but it can still
# be useful in specific situations.

# When to use LRN:
# 1.  Historically, LRN was used to enhance the generalization ability
#     of convolutional neural networks (CNNs), particularly in the
#     AlexNet architecture.  It was thought to encourage competition
#     among neurons at the same spatial position in adjacent feature
#     maps (channels), promoting more robust feature representations.

# 2.  LRN can be beneficial when dealing with image data where local
#     contrast normalization is desired.  It normalizes responses
#     across adjacent channels, which can help to suppress background
#     noise and highlight salient features.


# When NOT to use LRN:
# 1. Batch Normalization (BatchNorm): BatchNorm has largely replaced LRN
#    because it's generally more effective in improving the training
#    process and model performance.  BatchNorm normalizes across the
#    batch dimension, providing a more stable and efficient way to
#    handle internal covariate shift.

# 2.  Modern CNN Architectures:  Most state-of-the-art architectures
#     no longer include LRN layers.  The benefits of LRN are typically
#     outweighed by BatchNorm or other normalization techniques.

# 3.  Small Datasets: LRN might be less effective on smaller datasets
#     because the normalization statistics calculated within local
#     regions might not be representative.

class LocalResponseNorm2D(nn.Module):
    """Module wrapper around ``F.local_response_norm``.

    The four hyperparameters are stored on the instance at construction
    time and handed to ``F.local_response_norm`` unchanged on every call.

    Args:
        size: Number of neighbouring channels used for normalization.
        alpha: Multiplicative factor of the normalization term.
        beta: Exponent of the normalization term.
        k: Additive constant of the normalization term.
    """

    def __init__(self, size=5, alpha=1e-4, beta=0.75, k=2):
        super().__init__()
        self.size, self.alpha, self.beta, self.k = size, alpha, beta, k

    def forward(self, x):
        """Apply local response normalization to ``x`` and return the result."""
        lrn_kwargs = dict(size=self.size, alpha=self.alpha, beta=self.beta, k=self.k)
        return F.local_response_norm(x, **lrn_kwargs)

## Example usage: run a random image-shaped batch through the LRN layer
batch_size, channels = 32, 3
height = width = 224

## Random input laid out as (batch, channels, height, width)
input_shape = (batch_size, channels, height, width)
input_tensor = torch.randn(input_shape)

## Default hyperparameters: size=5, alpha=1e-4, beta=0.75, k=2
lrn_example = LocalResponseNorm2D()
lrn_output = lrn_example(input_tensor)

print("Local Response Normalization output shape:", lrn_output.shape)


Local Response Normalization output shape: torch.Size([32, 3, 224, 224])


## Batch Normalization

In [31]:
# When to use:  Generally effective for larger batch sizes and when the distribution of activations
#               across the batch is representative of the overall data distribution.  Suitable for
#               most convolutional neural networks (CNNs) when training data is plentiful.
# How to use: Apply after a convolutional or fully connected layer.  The normalization is performed across
#              the mini-batch.
# When not to use:

# 1. Small Batch Sizes: Batch Normalization calculates statistics (mean and variance) over a mini-batch.
#           With very small batch sizes, these statistics become unreliable and noisy, potentially harming the model's performance.
#           In such cases, consider Layer Normalization or Group Normalization which are less sensitive to batch size.
# 2. Recurrent Neural Networks (RNNs):  Applying Batch Normalization directly to RNNs can be problematic
#           due to the sequential nature of the data. The normalization statistics are calculated across different
#           sequences in the batch, which might not be meaningful.  Instead, consider using Layer Normalization
#           which normalizes across the features of a single time step.
# 3. Online Learning:  Batch Normalization relies on batch statistics. In online learning scenarios
#           where you receive and process data instances individually, it's not directly applicable.
# 4. Limited Computational Resources:  Batch Normalization adds computational overhead.
#           If you're working with limited resources, it might be a performance bottleneck.
# 5. When other normalization methods are more suitable:  There are other normalization
#           techniques like Layer Normalization, Instance Normalization, and Group Normalization
#           that may be more appropriate depending on the architecture and task.
#           Consider alternatives if Batch Normalization doesn't improve performance or adds undesirable effects.

class BatchNormExample(nn.Module):
    """A 3x3 convolution to 64 channels followed by batch normalization.

    Args:
        in_channels: Number of channels in the input feature map.
    """

    def __init__(self, in_channels):
        super().__init__()
        out_channels = 64
        # padding=1 with a 3x3 kernel keeps the spatial size unchanged.
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        # One set of batch-norm statistics per convolution output channel.
        self.bn = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        """Convolve ``x`` and batch-normalize the resulting 64-channel map."""
        return self.bn(self.conv(x))

## Example usage: run a random image-shaped batch through conv + batch norm
batch_size, channels = 32, 3
height = width = 224

## Random input laid out as (batch, channels, height, width)
input_shape = (batch_size, channels, height, width)
input_tensor = torch.randn(input_shape)

bn_example = BatchNormExample(channels)
bn_output = bn_example(input_tensor)

print("Batch Norm output shape:", bn_output.shape)

Batch Norm output shape: torch.Size([32, 64, 224, 224])


## Group Normalization

In [32]:
# When to use:  Useful when batch sizes are small, where batch normalization might not be as reliable due to
#               limited sample diversity.  Helps to stabilize training in situations with limited data.
# How to use:  Similar to Batch Normalization but divides the channels into groups and performs normalization
#              within each group.  The number of groups is a hyperparameter.
# When not to use:
#           Group Normalization (GN) is less effective than other normalization methods when the number of
#           channels in your data is small.  Since GN normalizes within groups of channels, if each group
#           ends up with very few channels (e.g., only one or two), the normalization statistics become less
#           reliable and may not properly capture the distribution of activations.  Note also that the number
#           of channels must be divisible by the number of groups.  In such scenarios,
#           other techniques like Layer Normalization might be preferable.

class GroupNormExample(nn.Module):
    """A 3x3 convolution to 64 channels followed by group normalization.

    The 64 output channels are normalized in 8 groups (8 channels each).

    Args:
        in_channels: Number of channels in the input feature map.
    """

    def __init__(self, in_channels):
        super().__init__()
        out_channels = 64
        # padding=1 with a 3x3 kernel keeps the spatial size unchanged.
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        self.gn = nn.GroupNorm(num_groups=8, num_channels=out_channels)

    def forward(self, x):
        """Convolve ``x`` and group-normalize the resulting 64-channel map."""
        return self.gn(self.conv(x))

## Example usage: run a random image-shaped batch through conv + group norm
batch_size, channels = 32, 3
height = width = 224

## Random input laid out as (batch, channels, height, width)
input_shape = (batch_size, channels, height, width)
input_tensor = torch.randn(input_shape)

gn_example = GroupNormExample(channels)
gn_output = gn_example(input_tensor)

print("Group Norm output shape:", gn_output.shape)


Group Norm output shape: torch.Size([32, 64, 224, 224])


## Instance Normalization

In [33]:
## Instance Normalization
# When to use: Effective for style transfer and image generation tasks, particularly when the statistics
#              of each image should be normalized independently.
# How to use: Normalizes the activations of each channel separately for each image in the batch.
#              Useful when the style of each instance needs to be preserved, as it does not rely on
#              the statistics of other images.
# When not to use:
# 1. When batch statistics are important: Instance Normalization normalizes each channel of each individual image in the batch independently.
#           It ignores the information across the batch dimension.  If you need to use batch statistics for normalization (like in Batch Normalization),
#           then Instance Normalization is not a suitable substitute.
# 2. Tasks that benefit from inter-sample relationships:  Tasks like image classification often benefit from
#           learning features that are shared across images within a batch. Instance Normalization removes this information,
#           potentially hindering performance.  Since it normalizes per image, it removes the information
#           about how a specific feature varies *across* images in a batch.  For classification, this could be detrimental.
# 3. Small datasets or images: As with other normalization methods,
#           Instance Normalization's effectiveness can be affected by small datasets or image sizes.
#           Its statistics are computed over the spatial positions of a single channel of a single image,
#           so they might be unreliable when the feature maps are very small.
# 4. Style transfer applications (sometimes): While Instance Normalization is often used in style transfer,
#           there are cases where other normalization methods might perform better. The choice depends on the
#           specifics of the model and the desired effect.  In style transfer, the goal is often to transfer
#           the style *across* images.  If you're not careful about how to apply Instance Norm,
#           it might end up isolating each image more than you want.

class InstanceNormExample(nn.Module):
    """A 3x3 convolution to 64 channels followed by instance normalization.

    Args:
        in_channels: Number of channels in the input feature map.
    """

    def __init__(self, in_channels):
        super().__init__()
        out_channels = 64
        # padding=1 with a 3x3 kernel keeps the spatial size unchanged.
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        # affine=True gives the layer a learnable per-channel scale and bias.
        self.inorm = nn.InstanceNorm2d(out_channels, affine=True)

    def forward(self, x):
        """Convolve ``x`` and instance-normalize the resulting 64-channel map."""
        return self.inorm(self.conv(x))

## Example usage: run a random image-shaped batch through conv + instance norm
batch_size, channels = 32, 3
height = width = 224

## Random input laid out as (batch, channels, height, width)
input_shape = (batch_size, channels, height, width)
input_tensor = torch.randn(input_shape)

inorm_example = InstanceNormExample(channels)
inorm_output = inorm_example(input_tensor)

print("Instance Norm output shape:", inorm_output.shape)

Instance Norm output shape: torch.Size([32, 64, 224, 224])


## Layer Normalization

In [34]:
# When to use:  Effective when batch sizes are very small or when you want to normalize activations
#               across features within a single sample, irrespective of the batch.  Often used in
#               recurrent neural networks (RNNs) and transformers.
# How to use:  Normalizes the activations across the features (channels) for each element in the sequence.
#              It does not depend on the batch statistics, making it suitable for situations with variable-length
#              sequences or small batch sizes.
# When NOT to use:
# 1. Convolutional Neural Networks (CNNs): Layer Normalization is generally less effective than
#    Batch Normalization in CNNs.  BatchNorm's normalization across the batch dimension often leads to
#    better performance in image-related tasks.  Layer Normalization normalizes across channels, which might
#    not capture the relevant statistical properties for image data as effectively.

# 2. Recurrent Neural Networks (RNNs) with large sequence lengths: While Layer Normalization is often
#    preferred over Batch Normalization in RNNs due to the varying sequence lengths, it can still be less
#    effective than other methods like RMSNorm when dealing with very long sequences. The normalization
#    across the feature dimension might not capture the temporal dependencies as well in this case.

# 3. When batch statistics are crucial:  If the statistical properties across the batch are important
#    for the model's learning (e.g., in some generative models), Layer Normalization, which ignores the
#    batch dimension, may not be the best choice.  In this case, Batch Normalization is more appropriate.

# 4. When small batch sizes are not a concern: The primary advantage of Layer Normalization over
#     Batch Normalization is its robustness to small batch sizes.  If you have reasonably large batch sizes,
#     then Batch Normalization often performs better.

# 5. When other normalization methods are more suitable:  Always consider other alternatives like
#     Group Normalization, Instance Normalization, or even techniques like Weight Standardization.
#     Experimentation is key to determine the best normalization technique for a given architecture and dataset.

class LayerNormExample(nn.Module):
    """A linear projection to 64 features followed by layer normalization.

    Args:
        in_features: Size of the last dimension of the input.
    """

    def __init__(self, in_features):
        super().__init__()
        out_features = 64
        self.linear = nn.Linear(in_features, out_features)
        # Normalizes over the last dimension (the 64 projected features).
        self.ln = nn.LayerNorm(out_features)

    def forward(self, x):
        """Project ``x`` to 64 features and layer-normalize the result."""
        return self.ln(self.linear(x))

## Example usage (assuming input is a sequence of vectors)
## batch_size is set here so this cell does not depend on a value left
## behind by an earlier cell; 32 matches the other examples.
batch_size = 32
sequence_length = 10
in_features = 512

## Random input laid out as (batch, sequence, features)
input_sequence = torch.randn(batch_size, sequence_length, in_features)

ln_example = LayerNormExample(in_features)
ln_output = ln_example(input_sequence)

print("Layer Norm output shape:", ln_output.shape)

Layer Norm output shape: torch.Size([32, 10, 64])


## Weight Standardization

In [35]:
# When to use: Weight standardization can be beneficial when you want to stabilize the training process
#              of deep neural networks, especially when dealing with very deep or complex architectures.
#              It can sometimes provide a performance boost, especially in conjunction with other
#              regularization techniques like weight decay.
# How to use:  Apply weight standardization to the weights of each layer.  It's typically done within the
#              layer's forward pass before applying the weights to the input activations.
# When not to use:
# 1. Shallow Networks:  For relatively shallow networks, the benefits of weight standardization might be
#                      minimal or even negligible. Other normalization techniques like BatchNorm might be
#                      more effective.
# 2. Already Stable Training: If your network's training process is already stable (e.g., using BatchNorm),
#                            introducing weight standardization might not provide additional benefits and
#                            could even slightly hurt performance.
# 3. Computational Constraints: Weight standardization adds a small amount of computational overhead. If
#                              you have limited computational resources, this might be a concern.
# 4. Hyperparameter Tuning:  As with any normalization technique, tuning the learning rate and weight decay
#                           may be necessary when using Weight Standardization.

class WeightStandardization(nn.Module):
    """2D convolution whose kernel is standardized on every forward pass.

    The layer owns a raw ``weight`` and ``bias``. In ``forward`` each output
    channel's kernel is shifted to zero mean and divided by its standard
    deviation (computed over the in-channel and both kernel dimensions)
    before being used in ``F.conv2d``. The stored ``weight`` itself is not
    modified.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        kernel_size: Side length of the square kernel.
        padding: Padding passed to ``F.conv2d``. The default of 1 keeps the
            spatial size unchanged for the default 3x3 kernel.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, padding=1):
        super().__init__()
        # Keep the requested padding so forward() can honour it.
        self.padding = padding
        # torch.empty allocates uninitialized storage; reset_parameters() fills it.
        self.weight = nn.Parameter(torch.empty(out_channels, in_channels, kernel_size, kernel_size))
        self.bias = nn.Parameter(torch.empty(out_channels))
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialize ``weight`` (Kaiming uniform) and ``bias`` (uniform in +/- 1/sqrt(fan_in))."""
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))  ## You can use other initializations
        # fan_in = in_channels * kernel_height * kernel_width
        fan_in = self.weight.shape[1] * self.weight.shape[2] * self.weight.shape[3]
        bound = 1 / math.sqrt(fan_in)
        nn.init.uniform_(self.bias, -bound, bound)

    def forward(self, x):
        """Convolve ``x`` with the standardized kernel and add the bias."""
        ## Weight standardization: statistics are per output channel (dim 0 is kept)
        # NOTE(review): with a single weight per output channel (1 input
        # channel, 1x1 kernel) the sample std is undefined -- confirm before
        # using the layer in that configuration.
        mean = self.weight.mean(dim=[1, 2, 3], keepdim=True)
        std = self.weight.std(dim=[1, 2, 3], keepdim=True) + 1e-5  ## Add small epsilon for stability
        normalized_weight = (self.weight - mean) / std

        ## Convolution, using the padding given to the constructor
        return F.conv2d(x, normalized_weight, self.bias, padding=self.padding)


## Example Usage: run a random image-shaped batch through the
## weight-standardized convolution (3 input channels -> 64 output channels)
batch_size, channels = 32, 3
height = width = 224

## Random input laid out as (batch, channels, height, width)
input_shape = (batch_size, channels, height, width)
input_tensor = torch.randn(input_shape)

ws_example = WeightStandardization(channels, 64)
ws_output = ws_example(input_tensor)

print("Weight Standardization output shape:", ws_output.shape)

Weight Standardization output shape: torch.Size([32, 64, 224, 224])


## Drop-out Layer

In [36]:
## When to use Dropout:
# 1.  Prevent Overfitting: Dropout is primarily used to reduce overfitting in neural networks.
#     It randomly deactivates neurons during training, forcing the network to learn more robust
#     and generalized features.  It's particularly effective when you have a limited amount of training data.
# 2.  Complex Architectures:  Dropout is often beneficial in deep or complex networks with many layers
#     and parameters, as these architectures are more prone to overfitting.

## How to use Dropout:
# 1.  Insert Dropout Layers: Add dropout layers strategically within your network architecture, typically
#     between fully connected layers or convolutional layers.
# 2.  Dropout Rate:  The dropout rate (p) is a hyperparameter that controls the probability of a neuron being deactivated.
#     Typical values are between 0.2 and 0.5. A higher dropout rate means more neurons are deactivated.
#     Experiment to find the best value for your specific task.
# 3.  Training vs. Inference:  It is crucial to remember that dropout is only active during the training phase.
#     During inference (when you use the model for prediction), dropout should be turned off.  Most deep learning
#     frameworks handle this automatically when you set the model to "eval" mode.


## When NOT to use Dropout:
# 1.  Small Networks: For very small networks with few parameters, dropout might not be necessary and could even
#     harm performance.  Overfitting is less of a concern in these cases.
# 2.  Limited Training Data: While dropout helps with limited data, if your dataset is extremely small,
#     dropout might exacerbate the problem of insufficient data. Other regularization techniques might be better
#     suited in this case.
# 3.  When other regularization techniques are more effective: Data augmentation and weight decay are often
#     very effective regularization methods. In some cases, they might be sufficient, and dropout could be redundant.


class Net(nn.Module):
    """Fully connected classifier (784 -> 256 -> 128 -> 10) with dropout.

    Dropout follows each of the two hidden layers (p=0.2, then p=0.3); the
    output layer returns raw scores with no activation.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 256)
        self.dropout1 = nn.Dropout(p=0.2)  ## 20% of activations zeroed in training
        self.fc2 = nn.Linear(256, 128)
        self.dropout2 = nn.Dropout(p=0.3)  ## 30% of activations zeroed in training
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        """Flatten ``x`` to rows of 784 values and return a (rows, 10) score tensor."""
        flat = x.view(-1, 784)  ## Flatten the input image
        hidden = self.dropout1(torch.relu(self.fc1(flat)))
        hidden = self.dropout2(torch.relu(self.fc2(hidden)))
        return self.fc3(hidden)

## Example usage:
## Build the network and a single random sample with 784 features
net = Net()
input_shape = (1, 784)  # batch size 1, 784 features
input_tensor = torch.randn(input_shape)

## Training mode: dropout layers are active
net.train()
output = net(input_tensor)
print("Output shape (Training mode):", output.shape)

## Evaluation mode: dropout layers are inactive
net.eval()

## Gradients are not needed for evaluation, so skip tracking them
with torch.no_grad():
    output_eval = net(input_tensor)

print("Output shape (Evaluation mode):", output_eval.shape)


Output shape (Training mode): torch.Size([1, 10])
Output shape (Evaluation mode): torch.Size([1, 10])
