In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import requests
from io import BytesIO

# Report the installed PyTorch build, then pick CUDA when it is available
# and fall back to the CPU otherwise.
print(f"Pytorch Version {torch.__version__}")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device {device}")

Pytorch Version 2.9.0+cpu
Device cpu


In [None]:
class LayerNorm(nn.Module):
    """Layer normalization over the channel axis for two tensor layouts.

    ``channels_last`` delegates to ``F.layer_norm`` and normalizes the trailing
    dimension. ``channels_first`` normalizes dimension 1 by hand and broadcasts
    the affine parameters over two trailing dimensions.

    Args:
        normalized_shape (int): Size of the normalized (channel) dimension.
        eps (float): Constant added to the variance for numerical stability.
        data_format (str): ``"channels_last"`` or ``"channels_first"``.

    Raises:
        NotImplementedError: If ``data_format`` is not one of the two layouts.
    """

    def __init__(self,
                 normalized_shape,
                 eps=1e-5,
                 data_format="channels_last"
                 ):
        super().__init__()
        # Reject unknown layouts before allocating any parameters.
        if data_format not in ("channels_last", "channels_first"):
            raise NotImplementedError(f"Unsupported data_format: {data_format!r}")
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        # The shift must start at zero so a freshly built layer is a pure
        # normalization (scale 1, shift 0).
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        self.normalized_shape = (normalized_shape,)

    def forward(self, x):
        """Normalize ``x`` along its channel axis and apply the affine transform."""
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        # channels_first: biased mean/variance over dim 1, matching F.layer_norm.
        u = x.mean(dim=1, keepdim=True)
        s = (x - u).pow(2).mean(dim=1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.eps)
        # Reshape (C,) -> (C, 1, 1) so the parameters broadcast over the two
        # trailing dimensions.
        return self.weight[:, None, None] * x + self.bias[:, None, None]


In [None]:
class DropPath(nn.Module):
    """Stochastic depth: randomly zero whole samples of a residual branch.

    In training mode each sample along dimension 0 is kept with probability
    ``1 - drop_prob`` and the kept samples are rescaled by ``1 / (1 - drop_prob)``.
    In eval mode, or when ``drop_prob`` is 0, the input is returned unchanged.

    Args:
        drop_prob (float): Probability of dropping a sample; must be in [0, 1].

    Raises:
        ValueError: If ``drop_prob`` is outside [0, 1].
    """

    def __init__(self, drop_prob=0.):
        super().__init__()
        if not 0.0 <= drop_prob <= 1.0:
            raise ValueError(f"drop_prob must be in [0, 1], got {drop_prob}")
        self.drop_prob = drop_prob

    def forward(self, x):
        """Apply per-sample path dropping to ``x``."""
        if self.drop_prob == 0 or not self.training:
            return x
        keep_prob = 1 - self.drop_prob
        # One Bernoulli draw per sample, broadcast over all remaining dims.
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        mask = torch.rand(shape, dtype=x.dtype, device=x.device).add_(keep_prob).floor_()
        # With keep_prob == 0 every sample is dropped; skip the rescale so the
        # result is exactly zero instead of NaN from a division by zero.
        scaled = x.div(keep_prob) if keep_prob > 0 else x
        return scaled * mask

In [None]:
class GlobalResponeNorm(nn.Module):
    """Global Response Normalization (GRN) layer.

    Computes a per-channel L2 norm over dimensions 1 and 2, divides it by its
    mean across the last dimension, and uses the result to rescale the input:

        out = gamma * (x * N(x)) + beta + x

    The trailing ``+ x`` is the residual connection. Because ``gamma`` and
    ``beta`` start at zero, the layer is an identity mapping at initialization;
    without the residual it would output all zeros.

    Args:
        dim (int): Size of the last (channel) dimension of the input. The
            parameters have shape ``(1, 1, 1, dim)``, i.e. they broadcast
            against a 4-D channels-last tensor.
    """

    def __init__(self, dim):
        super().__init__()
        self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim))
        self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim))

    def forward(self, x):
        """Apply GRN to ``x`` and return a tensor of the same shape."""
        # Per-channel global response: L2 norm over dims 1 and 2.
        global_x = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
        # Divisive normalization by the mean response across channels.
        norm_x = global_x / (global_x.mean(dim=-1, keepdim=True) + 1e-6)
        # Calibrated features plus the residual input.
        return self.gamma * (x * norm_x) + self.beta + x
print("GRN (Global Response Normalization) defined successfully!")
print("This is the key innovation in ConvNeXt V2!")

GRN (Global Response Normalization) defined successfully!
This is the key innovation in ConvNeXt V2!


In [None]:
class Block(nn.Module):
    """Single ConvNeXt V2 residual block.

    Compared with the V1 block, Layer Scale is gone and a Global Response
    Normalization layer sits right after the GELU.

    Pipeline:
        7x7 depthwise conv -> permute to channels-last -> LayerNorm ->
        Linear (dim -> 4*dim) -> GELU -> GRN -> Linear (4*dim -> dim) ->
        permute back to channels-first -> DropPath -> add the block input

    Args:
        dim (int): Number of input channels.
        drop_path (float): Stochastic depth rate. Default: 0.0
    """

    def __init__(self, dim, drop_path=0.):
        super().__init__()
        hidden_dim = 4 * dim
        # Depthwise convolution: one 7x7 filter per channel (groups == dim).
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)
        self.norm = LayerNorm(dim, eps=1e-6)
        # The pointwise (1x1) convolutions are expressed as Linear layers
        # acting on the trailing channel dimension.
        self.pwconv1 = nn.Linear(dim, hidden_dim)
        self.act = nn.GELU()
        self.grn = GlobalResponeNorm(dim=hidden_dim)
        self.pwconv2 = nn.Linear(hidden_dim, dim)
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

    def forward(self, x):
        """Run the block on ``x`` and add the result to the untouched input."""
        shortcut = x
        # Spatial mixing, then move channels to the last dimension.
        feats = self.dwconv(x).permute(0, 2, 3, 1)
        feats = self.pwconv1(self.norm(feats))
        feats = self.grn(self.act(feats))
        # Project back to ``dim`` channels and restore channels-first order.
        feats = self.pwconv2(feats).permute(0, 3, 1, 2)
        return self.drop_path(feats) + shortcut

print("ConvNeXt V2 Block defined successfully!")
print("Note: This uses GRN instead of Layer Scale from V1")

ConvNeXt V2 Block defined successfully!
Note: This uses GRN instead of Layer Scale from V1


In [None]:
class ConvNeXtV2(nn.Module):
    """ ConvNeXt V2
        A PyTorch impl of : `ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders`
        http://arxiv.org/abs/2301.00808

    Layout: a patchify stem (4x4 conv, stride 4, followed by a channels-first
    LayerNorm), then one stage of ``Block`` modules per entry in ``depths``.
    Between consecutive stages a downsampling layer (LayerNorm followed by a
    2x2 conv with stride 2) changes the width from ``dims[i]`` to ``dims[i+1]``.
    The final feature map is averaged over its last two dimensions, normalized
    and fed to a linear classification head.

    Args:
        in_channels (int): Number of input image channels. Default: 3
        num_classes (int): Number of classes for classification head. Default: 1000
        depths (sequence of int): Number of blocks at each stage. Default: [3, 3, 9, 3]
        dims (sequence of int): Feature dimension at each stage. Default: [80, 160, 320, 640]
        drop_path_rate (float): Maximum stochastic depth rate; per-block rates
            increase linearly from 0 to this value. Default: 0.
        head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.

    Raises:
        ValueError: If ``depths`` is empty or ``depths`` and ``dims`` differ in length.

    NOTE(review): the default ``dims`` are the Nano widths while the default
    ``depths`` look like the Tiny configuration; the published Nano variant is
    believed to use depths [2, 2, 8, 2] -- confirm against the checkpoint that
    is meant to be loaded.
    """
    def __init__(self,
                 in_channels=3,
                 num_classes=1000,
                 depths=[3, 3, 9, 3],
                 dims=[80, 160, 320, 640],
                 drop_path_rate=0.,
                 head_init_scale=1.,
                 ):
        super().__init__()
        # Copy the sequences so the (mutable) default lists are never aliased
        # by instance state.
        depths = list(depths)
        dims = list(dims)
        if not depths or len(depths) != len(dims):
            raise ValueError(
                f"depths and dims must be non-empty and of equal length, "
                f"got {len(depths)} and {len(dims)}"
            )
        self.depths = depths

        # Two ModuleLists: inter-stage downsampling ops and the block stages.
        self.downsample_layers = nn.ModuleList()
        self.stages = nn.ModuleList()

        # One drop-path rate per block, increasing linearly with depth.
        dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
        cur = 0

        # The stem is kept as a separate module (not part of downsample_layers).
        self.stem = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=dims[0], kernel_size=4, stride=4),
            LayerNorm(dims[0], eps=1e-6, data_format="channels_first")
        )

        for i in range(len(depths)):
            # Blocks of stage i, each with its own drop-path rate.
            blocks = [Block(dim=dims[i], drop_path=dp_rates[cur + j]) for j in range(depths[i])]
            self.stages.append(nn.Sequential(*blocks))
            cur += depths[i]

            # Every stage except the last is followed by a downsampling layer
            # that maps dims[i] -> dims[i+1]; downsample_layers[i] follows stages[i].
            if i < len(depths) - 1:
                downsample_layer = nn.Sequential(
                    LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
                    nn.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2)
                )
                self.downsample_layers.append(downsample_layer)

        self.norm = nn.LayerNorm(dims[-1], eps=1e-6)
        self.head = nn.Linear(dims[-1], num_classes)

        self.apply(self._init_weights)
        self.head.weight.data.mul_(head_init_scale)
        self.head.bias.data.mul_(head_init_scale)

    def _init_weights(self, m):
        """Truncated-normal init for conv/linear weights, zeros for their biases."""
        if isinstance(m, (nn.Conv2d, nn.Linear)):
            nn.init.trunc_normal_(m.weight, std=.02)
            # Layers created with bias=False have no bias tensor to reset.
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)

    def forward_features(self, x):
        """Return the pooled, normalized feature vector for ``x``."""
        x = self.stem(x)

        last_stage = len(self.depths) - 1
        for i in range(len(self.depths)):
            x = self.stages[i](x)
            # Downsample after every stage except the final one.
            if i < last_stage:
                x = self.downsample_layers[i](x)

        # Global average pooling over the last two dims: (N, C, H, W) -> (N, C).
        return self.norm(x.mean([-2, -1]))

    def forward(self, x):
        """Return classification logits for ``x``."""
        x = self.forward_features(x)
        x = self.head(x)
        return x

print("ConvNeXt V2 Model defined successfully!")

ConvNeXt V2 Model defined successfully!
