# ResNet-50 architecture

<img src="https://cdn-5f733ed3c1ac190fbc56ef88.closte.com/wp-content/uploads/2019/07/ResNet50_architecture-1.png"  width="500" height="3000">

In [None]:
"""
From scratch implementation of the famous ResNet models.
"""

import torch
import torch.nn as nn

class block(nn.Module):
  def __init__(
      self, in_channels,  intermediate_channels, identity_downsample=None, stride=1
      ):
    super().__init__()
    self.expansion = 4
    self.conv1 = nn.Conv2d(
                            in_channels,
                            intermediate_channels,
                            kernel_size=1,
                            stride=1,
                            padding=0,
                            bias=False 
                            )                 # for example if 
    self.bn1 = nn.BatchNorm2d(intermediate_channels)

    self.conv2 = nn.Conv2d(
                            intermediate_channels,
                            intermediate_channels,
                            kernel_size=3,
                            stride=stride,
                            padding=1,
                            bias=False 
                            )
    self.bn2 = nn.BatchNorm2d(intermediate_channels)

    self.conv3 = nn.Conv2d(
                            intermediate_channels,
                            intermediate_channels * self.expansion,
                            kernel_size=1,
                            stride=1,
                            padding=0,
                            bias=False 
                            )
    self.bn3 = nn.BatchNorm2d(intermediate_channels * self.expansion)

    self.relu = nn.ReLU()
    self.identity_downsample = identity_downsample
    self.stride = stride

  def forward(self, x):
    identity = x.clone()  # deep copy

    x = self.conv1(x)
    x = self.bn1(x)
    x = self.relu(x)
    x = self.conv2(x)
    x = self.bn2(x)
    x = self.relu(x)
    x = self.conv3(x)
    x = self.bn3(x)

    if self.identity_downsample is not None:
      identity = self.identity_downsample(identity)

    x += identity
    x = self.relu(x)
    return x





class ResNet(nn.Module):
  def __init__(self, block, num_block, image_channels, num_classes):   # lyers ->for example [3, 4, 6, 3]
    super(ResNet, self).__init__()
    self.in_channels = 64

    self.conv1 = nn.Conv2d(
        image_channels, 64, kernel_size=7, stride=2, padding=3, bias=False
    )
    self.bn1 = nn.BatchNorm2d(64)
    self.relu = nn.ReLU()
    self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

    # Essentially the entire ResNet architecture are in these 4 lines below
    self.layer1 = self._make_layer(
                            block, num_block[0], intermediate_channels=64, stride=1)
    self.layer2 = self._make_layer(
                            block, num_block[1], intermediate_channels=128, stride=2)
    self.layer3 = self._make_layer(
                            block, num_block[2], intermediate_channels=256, stride=2)
    self.layer4 = self._make_layer(
                            block, num_block[3], intermediate_channels=512, stride=2)
    
    # In AdaptiveAvgPool2d you specify output size. in below example we get a 1*1 size for each channel
    self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
    self.fc = nn.Linear(512 * 4, num_classes)


  def forward(self, x):
      x = self.conv1(x)
      x = self.bn1(x)
      x = self.relu(x)
      x = self.maxpool(x)
      x = self.layer1(x)
      x = self.layer2(x)
      x = self.layer3(x)
      x = self.layer4(x)

      x = self.avgpool(x)
      x = x.reshape(x.shape[0], -1)
      x = self.fc(x)

      return x


  def _make_layer(self, block, num_residual_blocks, intermediate_channels, stride):
      identity_downsample = None
      layers = []


      # Either if we half the input space for ex, 56x56 -> 28x28 (stride=2), or channels changes
      # we need to adapt the Identity (skip connection) so it will be able to be added
      # to the layer that's ahead
      if stride != 1 or self.in_channels != intermediate_channels * 4:
          identity_downsample = nn.Sequential(
              nn.Conv2d(
                  self.in_channels,
                  intermediate_channels * 4,
                  kernel_size=1,
                  stride=stride,
                  bias=False,
              ),
              nn.BatchNorm2d(intermediate_channels * 4)
          )

      # The first operation of each layer (layer here means a collection of blocks) is reducing the dimension,
      # so we also need to resize the volume that goes through the skip connection
      # so in below line we downsample for the first block
      layers.append(
          block(self.in_channels, intermediate_channels, identity_downsample, stride)
      )
      
      # input channel of second block of each layer should be changed as below
      # The expansion size is always 4 for ResNet 50,101,152
      self.in_channels = intermediate_channels * 4

      # For example for first resnet layer: 256 will be mapped to 64 as intermediate layer,
      # then finally back to 256. Hence no identity downsample is needed, since stride = 1,
      # and also same amount of channels.
      for i in range(num_residual_blocks - 1):
          layers.append(block(self.in_channels, intermediate_channels))

      return nn.Sequential(*layers)


def ResNet50(img_channel=3, num_classes=1000):
    return ResNet(block, [3, 4, 6, 3], img_channel, num_classes)



def ResNet101(img_channel=3, num_classes=1000):
    return ResNet(block, [3, 4, 23, 3], img_channel, num_classes)


def ResNet152(img_channel=3, num_classes=1000):
    return ResNet(block, [3, 8, 36, 3], img_channel, num_classes)     


# finding output size of convolutional layer

<img src="https://miro.medium.com/v2/resize:fit:358/format:webp/1*SaaR10uSdDFBP3EVNrZwnA.png"  width="300" height="100">

The result of the quotient of above equation is always rounded off. This is represented mathematically by the floor function. 

**Hint**: These ‘floor-parenthesis’ can easily be perceived incorrectly as square brackets. Do not make this mistake!

— *If the kernel does not ‘fit’ into the input array* —

By rounding off the result, the ‘dropping’ of superfluous rows and columns on the border is described mathematically. That means, if the kernel with a specific stride does not fit into the input array, a float value is generated as a result. Because the remaining pixels of the input array do not impact the output array, the result is rounded off.


<img src="https://miro.medium.com/v2/resize:fit:828/format:webp/1*9rOrbW8-MHMzt2yM9A1lEQ.png"  width="500" height="250">

<img src="https://miro.medium.com/v2/resize:fit:828/format:webp/1*bH3vy608kUUn3wBvEXUBJw.png"  width="500" height="250">

<img src="https://miro.medium.com/v2/resize:fit:828/format:webp/1*0Fu9UbpTmbIkfPLV-34cuw.png"  width="500" height="250">

<img src="https://miro.medium.com/v2/resize:fit:828/format:webp/1*PbXsNYcVpk9vCG45Z0lHdQ.png"  width="500" height="250">



For the example of above :

<img src="https://miro.medium.com/v2/resize:fit:608/format:webp/1*lbGuvjqQrJYdVkFPAzwK3A.png"  width="400" height="100">

The behavior of dropping the remaining pixels can be omitted by the application of padding. If padding of one pixel on the top, the bottom, the left, and the right border is applied, the kernel covers all values of the input array

<img src="https://miro.medium.com/v2/resize:fit:786/format:webp/1*UKLpbHzeDDuDBcO7oYjt4g.png"  width="600" height="300">

But there is still one row and one column of pixels dropped in this process. just zero-padded values are dropped out, which leads to no information loss.

for more information on this topic got to https://towardsdatascience.com/a-comprehensible-explanation-of-the-dimensions-in-cnns-841dba49df5e

**Output size calculation after applying convolution**


0. Input Layer shape = 3 * 224 * 224  -> (color channels, height, width)

1. After applying conv2d with 64 filters of (7*7) stride = 2 and padding = 3:

* Output shape = ((224 + 2*3 - 7) / 2) + 1 = 112.5   ---after floor round---> 112

2. After applying Max Pooling (3*3) stride = 2 and padding = 1:

* Output shape = ((112 + 2*1 - 3) / 2) + 1 = 56

___

3. After applying conv2d with 64 filters of (1*1) stride = 1 and padding = 0:

* Output shape = ((56 + 2*0 - 1) / 1) + 1 = 56

4. After applying conv2d with 64 filters of (3*3) stride = 1 and padding = 1:

* Output shape = ((56 + 2*1 - 3) / 1) + 1 = 56

5. After applying conv2d with 256 (64 * 4) filters of (1*1) stride = 1 and padding = 0:

* Output shape = ((56 + 2*0 - 1) / 1) + 1 = 56


---

6. After applying conv2d with 128 filters of (1*1) stride = 2 and padding = 0:

* Output shape = ((58 + 2*0 - 1) / 2) + 1 = 28

7. After applying conv2d with 128 filters of (3*3) stride = 2 and padding = 1:

* Output shape = ((56 + 2*1 - 3) / 2) + 1 = 28

8. After applying conv2d with 512 (128 * 4) filters of (1*1) stride = 2 and padding = 0:

* Output shape = ((56 + 2*0 - 1) / 2) + 1 = 28

___

9. After applying conv2d with 256 filters of (1*1) stride = 2 and padding = 0:

* Output shape = ((28 + 2*0 - 1) / 2) + 1 = 14

10. After applying conv2d with 256 filters of (3*3) stride = 2 and padding = 1:

* Output shape = ((28 + 2*1 - 3) / 2) + 1 = 14

11. After applying conv2d with 1024 (256 * 4) filters of (1*1) stride = 2 and padding = 0:

* Output shape = ((28 + 2*0 - 1) / 2) + 1 = 14

---


12. After applying conv2d with 512 filters of (1*1) stride = 2 and padding = 0:

* Output shape = ((14 + 2*0 - 1) / 2) + 1 = 7

13. After applying conv2d with 512 filters of (3*3) stride = 2 and padding = 1:

* Output shape = ((14 + 2*1 - 3) / 2) + 1 = 7

14. After applying conv2d with 2048 (512 * 4) filters of (1*1) stride = 2 and padding = 0:

* Output shape = ((14 + 2*0 - 1) / 2) + 1 = 7




# Identity Shortcut and Projection Shortcut

This difference on the skip connections are the so called in the paper as Identity Shortcut and Projection Shortcut. The identity shortcut is the one we have already discussed, simply bypassing the input volume to the addition operator. The projection shortcut performs a convolution operation to ensure the volumes at this addition operation are the same size. From the paper we can see that there are 2 options for matching the output size. Either padding the input volume or perform 1x1 convolutions. Here, this second option is shown.



# understanding and visualizing ResNet

go to below urls to learn more about ResNet visually

https://towardsdatascience.com/understanding-and-visualizing-resnets-442284831be8

https://cv-tricks.com/keras/understand-implement-resnets/

In [None]:
try:
  from torchinfo import summary
except:
  ! pip install torchinfo
  from torchinfo import summary

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchinfo
  Downloading torchinfo-1.7.2-py3-none-any.whl (22 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.7.2


# ResNet-50

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = ResNet50(img_channel=3, num_classes=1000).to(device)

summary(
    net,
    (32,3,223,223),
    col_width=25,
    col_names=["kernel_size","input_size", "output_size", "num_params"],
    row_settings=["var_names"],
)

Layer (type (var_name))                            Kernel Shape              Input Shape               Output Shape              Param #
ResNet (ResNet)                                    --                        [32, 3, 223, 223]         [32, 1000]                --
├─Conv2d (conv1)                                   [7, 7]                    [32, 3, 223, 223]         [32, 64, 112, 112]        9,408
├─BatchNorm2d (bn1)                                --                        [32, 64, 112, 112]        [32, 64, 112, 112]        128
├─ReLU (relu)                                      --                        [32, 64, 112, 112]        [32, 64, 112, 112]        --
├─MaxPool2d (maxpool)                              3                         [32, 64, 112, 112]        [32, 64, 56, 56]          --
├─Sequential (layer1)                              --                        [32, 64, 56, 56]          [32, 256, 56, 56]         --
│    └─block (0)                                   --              

# ResNet-34

In [49]:
import torch
import torch.nn as nn

class block(nn.Module):
  def __init__(self, in_channels, block_channels, identity_downsample=None, stride=1):
    super().__init__()
    self.conv1 = nn.Conv2d(in_channels,block_channels,kernel_size=3,
                           stride=stride, padding=1)
    self.bn1 = nn.BatchNorm2d(block_channels)
 

    self.conv2 = nn.Conv2d(block_channels, block_channels, kernel_size=3,
                           stride=1, padding=1)
    self.bn2 = nn.BatchNorm2d(block_channels)
    self.relu = nn.ReLU()
    self.identity_downsample = identity_downsample


  def forward(self,x):
    identity = x.clone()

    x = self.conv1(x)
    x = self.bn1(x)
    x = self.relu(x)
    x = self.conv2(x)
    x = self.bn2(x)

    if self.identity_downsample is not None:
      identity = self.identity_downsample(identity)
    
    x += identity
    x = self.relu(x)

    return x




class ResNet34(nn.Module):
  def __init__(self, block, num_block, image_channels, num_classes):
    super().__init__()

    self.in_channels = 64
    self.conv1 = nn.Conv2d(in_channels=image_channels, out_channels=64, kernel_size=7,
                           stride=2, padding=3)
    self.bn1 = nn.BatchNorm2d(64)
    self.relu = nn.ReLU()
    self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

    self.layer1 = self._make_layer(block, num_block[0], block_channels=64, stride=1)
    self.layer2 = self._make_layer(block, num_block[1], block_channels=128, stride=2)
    self.layer3 = self._make_layer(block, num_block[2], block_channels=256, stride=2)
    self.layer4 = self._make_layer(block, num_block[3], block_channels=512, stride=2)

    self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
    self.fc = nn.Linear(512, num_classes)




  def forward(self,x):
    x = self.conv1(x)
    x = self.bn1(x)
    x = self.relu(x)
    x = self.maxpool(x)
    x = self.layer1(x)
    x = self.layer2(x)
    x = self.layer3(x)
    x = self.layer4(x)
    x = self.avgpool(x)
    x = x.view(x.shape[0], -1)
    x = self.fc(x)

    return x



  def _make_layer(self,block,num_residual_block, block_channels, stride):
    layers = []
    identity_downsample = None

    if stride != 1 or self.in_channels != 2*block_channels:
      identity_downsample = nn.Sequential(
                                nn.Conv2d(self.in_channels, block_channels, kernel_size=1,
                                          stride=stride),
                                nn.BatchNorm2d(block_channels))
    
    layers.append(block(self.in_channels, block_channels, identity_downsample, stride))
    self.in_channels = block_channels

    for layer in range(num_residual_block-1):
      layers.append(block(self.in_channels, block_channels))

    return nn.Sequential(*layers)

def ResNet(img_channel=3, num_classes=1000):
    return ResNet34(block, [3, 4, 6, 3], img_channel, num_classes)  

In [51]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = ResNet(img_channel=3, num_classes=1000).to(device)

summary(
    net,
    (32,3,224,224),
    col_width=25,
    col_names=["kernel_size","input_size", "output_size", "num_params"],
    row_settings=["var_names"])

Layer (type (var_name))                            Kernel Shape              Input Shape               Output Shape              Param #
ResNet34 (ResNet34)                                --                        [32, 3, 224, 224]         [32, 1000]                --
├─Conv2d (conv1)                                   [7, 7]                    [32, 3, 224, 224]         [32, 64, 112, 112]        9,472
├─BatchNorm2d (bn1)                                --                        [32, 64, 112, 112]        [32, 64, 112, 112]        128
├─ReLU (relu)                                      --                        [32, 64, 112, 112]        [32, 64, 112, 112]        --
├─MaxPool2d (maxpool)                              3                         [32, 64, 112, 112]        [32, 64, 56, 56]          --
├─Sequential (layer1)                              --                        [32, 64, 56, 56]          [32, 64, 56, 56]          --
│    └─block (0)                                   --              