In [1]:
import warnings

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision

In [2]:
def conv3x3(in_planes, out_planes, stride=1):
    """Build a bias-free 3x3 convolution with one pixel of padding.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        stride: convolution stride (default 1).

    Returns:
        The configured ``nn.Conv2d`` layer.
    """
    layer = nn.Conv2d(
        in_channels=in_planes,
        out_channels=out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=False,
    )
    return layer

In [3]:
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 conv + batch-norm stages and a shortcut.

    Args:
        inplanes: number of input channels.
        planes: number of channels produced by both convolutions.
        stride: stride of the first convolution (the second always uses 1).
        downsample: optional module applied to the input to produce the
            shortcut branch; when ``None`` the input itself is the shortcut.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        # First stage carries the stride; the second stage keeps resolution.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut)``."""
        y = self.relu(self.bn1(self.conv1(x)))
        y = self.bn2(self.conv2(y))

        # The shortcut is the raw input unless a downsample module was given.
        shortcut = x if self.downsample is None else self.downsample(x)

        y += shortcut
        return self.relu(y)

### Deeper Bottleneck Architecture

Because of concerns on the training time that we can afford, we modify the building block as a bottleneck design. For each residual function F, we use a stack of 3 layers instead of 2. The three layers are 1x1, 3x3, and 1x1 convolutions, where the 1x1 layers are responsible for reducing and then increasing (restoring) dimensions, leaving the 3x3 layer a bottleneck with smaller input/output dimensions.

The parameter-free identity shortcuts are particularly important for the bottleneck architectures. If the identity shortcut is replaced with projection, one can show that the time complexity and model size are doubled, as the shortcut is connected to the two high-dimensional ends. So identity shortcuts lead to more efficient models for the bottleneck designs.

In [4]:
class Bottleneck(nn.Module):
    """Three-layer bottleneck residual block (1x1 -> 3x3 -> 1x1 convolutions).

    The first 1x1 convolution maps ``inplanes`` to ``planes`` channels, the
    3x3 convolution works on ``planes`` channels (and carries the stride), and
    the last 1x1 convolution expands to ``planes * expansion`` channels.

    Args:
        inplanes: number of input channels.
        planes: channel width of the inner 3x3 convolution.
        stride: stride of the 3x3 convolution.
        downsample: optional module applied to the input to produce the
            shortcut branch; when ``None`` the input itself is the shortcut,
            so it must already have ``planes * expansion`` channels.
    """
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        out_planes = planes * self.expansion
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        # bn3 normalises the output of conv3, so it must have the expanded
        # channel count (it previously used `planes`, which made forward fail).
        self.bn3 = nn.BatchNorm2d(out_planes)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Apply the three conv/bn stages, add the shortcut, then ReLU."""
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

## Network Architecture

**Plain Network.** The convolutional layers mostly have 3x3 filters and follow two simple design rules.
1) For the same output feature map size, the layers have the same number of filters.
2) If the feature map size is halved, the number of filters is doubled so as to preserve the time complexity per layer.

Perform downsampling directly by convolutional layers that have a stride of 2.

**Residual Network.** Based on the above plain network, we insert shortcut connections which turn the network into its counterpart residual version. 

The identity shortcuts can be directly used when the input and output are of the same dimensions. When the dimensions increase, we consider two options:

(A) The shortcut still performs identity mapping, with extra zero entries padded for increasing dimensions. **This option introduces no extra parameter.**
(B) The projection shortcut is used to match dimensions (done by 1x1 convolutions). For both options, when the shortcuts go across feature maps of two sizes, they are performed with a stride of 2.


### to-do

- Modify 4th layer for no-downsampling. Insert last_stride=1 as an argument
- Implement Wide ResNet for ImageNet using this [link](https://github.com/szagoruyko/functional-zoo/blob/master/wide-resnet-50-2-export.ipynb)


In [22]:
class ResNet(nn.Module):
    """ResNet feature extractor with a linear classification head.

    The stem is a 7x7 stride-2 convolution on 3-channel input followed by
    batch norm, ReLU and a 3x3 stride-2 max pool. Four stages of residual
    blocks (64, 128, 256 and 512 base channels) follow, then global average
    pooling and a fully connected layer.

    Args:
        block: residual block class. It must expose an ``expansion`` class
            attribute and be constructible as
            ``block(inplanes, planes, stride, downsample)``.
        layers: sequence giving the number of blocks in each of the four stages.
        num_classes: number of output features of the final linear layer.
        strides: sequence of four ints, the stride of the first block of each
            stage. The default ``(1, 1, 1, 1)`` keeps the resolution fixed
            after the stem; pass ``(1, 2, 2, 2)`` to halve the feature map in
            stages 2-4.

    Raises:
        ValueError: if ``strides`` does not contain exactly four entries.
    """

    def __init__(self, block, layers, num_classes=1000, strides=(1, 1, 1, 1)):
        super(ResNet, self).__init__()
        strides = tuple(strides)
        if len(strides) != 4:
            raise ValueError(
                'strides must have 4 entries, got {}'.format(len(strides)))

        # Running channel count fed to the next block; updated by _make_layer.
        self.inplanes = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0], stride=strides[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=strides[1])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=strides[2])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=strides[3])
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks as an ``nn.Sequential``.

        Only the first block receives ``stride`` and, when the stride or the
        channel count changes, a 1x1 conv + batch-norm projection shortcut.
        ``self.inplanes`` is updated to the stage's output channel count.
        """
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Run stem, four stages, global pooling and the classifier on ``x``."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        # Flatten (N, C, 1, 1) to (N, C) for the linear layer.
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x

In [23]:
def resnet18(pretrained=False, **kwargs):
    """Constructs a ResNet-18 model.

    Args:
        pretrained (bool): Accepted for API compatibility only. No pre-trained
            weights are loaded here; if True, a warning is issued and the
            freshly constructed model is returned as-is.
        **kwargs: Forwarded unchanged to the ``ResNet`` constructor
            (e.g. ``num_classes``).

    Returns:
        A ``ResNet`` built from ``BasicBlock`` with [2, 2, 2, 2] blocks per stage.
    """
    if pretrained:
        # Fail loudly rather than letting callers assume they got trained weights.
        warnings.warn(
            "resnet18: pretrained=True was requested but no pre-trained "
            "weights are loaded; returning a freshly initialised model.",
            UserWarning, stacklevel=2)
    return ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)

def resnet34(pretrained=False, **kwargs):
    """Constructs a ResNet-34 model.

    Args:
        pretrained (bool): Accepted for API compatibility only. No pre-trained
            weights are loaded here; if True, a warning is issued and the
            freshly constructed model is returned as-is.
        **kwargs: Forwarded unchanged to the ``ResNet`` constructor
            (e.g. ``num_classes``).

    Returns:
        A ``ResNet`` built from ``BasicBlock`` with [3, 4, 6, 3] blocks per stage.
    """
    if pretrained:
        # Fail loudly rather than letting callers assume they got trained weights.
        warnings.warn(
            "resnet34: pretrained=True was requested but no pre-trained "
            "weights are loaded; returning a freshly initialised model.",
            UserWarning, stacklevel=2)
    return ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)


In [24]:
def resnet50(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model.

    Args:
        pretrained (bool): Accepted for API compatibility only. No pre-trained
            weights are loaded here; if True, a warning is issued and the
            freshly constructed model is returned as-is.
        **kwargs: Forwarded unchanged to the ``ResNet`` constructor
            (e.g. ``num_classes``).

    Returns:
        A ``ResNet`` built from ``Bottleneck`` with [3, 4, 6, 3] blocks per stage.
    """
    if pretrained:
        # Fail loudly rather than letting callers assume they got trained weights.
        warnings.warn(
            "resnet50: pretrained=True was requested but no pre-trained "
            "weights are loaded; returning a freshly initialised model.",
            UserWarning, stacklevel=2)
    return ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)

def resnet101(pretrained=False, **kwargs):
    """Constructs a ResNet-101 model.

    Args:
        pretrained (bool): Accepted for API compatibility only. No pre-trained
            weights are loaded here; if True, a warning is issued and the
            freshly constructed model is returned as-is.
        **kwargs: Forwarded unchanged to the ``ResNet`` constructor
            (e.g. ``num_classes``).

    Returns:
        A ``ResNet`` built from ``Bottleneck`` with [3, 4, 23, 3] blocks per stage.
    """
    if pretrained:
        # Fail loudly rather than letting callers assume they got trained weights.
        warnings.warn(
            "resnet101: pretrained=True was requested but no pre-trained "
            "weights are loaded; returning a freshly initialised model.",
            UserWarning, stacklevel=2)
    return ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)

def resnet152(pretrained=False, **kwargs):
    """Constructs a ResNet-152 model.

    Args:
        pretrained (bool): Accepted for API compatibility only. No pre-trained
            weights are loaded here; if True, a warning is issued and the
            freshly constructed model is returned as-is.
        **kwargs: Forwarded unchanged to the ``ResNet`` constructor
            (e.g. ``num_classes``).

    Returns:
        A ``ResNet`` built from ``Bottleneck`` with [3, 8, 36, 3] blocks per stage.
    """
    if pretrained:
        # Fail loudly rather than letting callers assume they got trained weights.
        warnings.warn(
            "resnet152: pretrained=True was requested but no pre-trained "
            "weights are loaded; returning a freshly initialised model.",
            UserWarning, stacklevel=2)
    return ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)

In [39]:
# Build the ResNet-18 variant with a 10-way classifier head (fc outputs 10 features).
model = resnet18(num_classes=10)

In [40]:
# Random dummy batch of shape (1, 3, 32, 32): one 3-channel 32x32 input for a forward-pass smoke test.
test_inputs = torch.randn((1,3,32,32))

In [41]:
# Run the model on the dummy batch; the printed shape should be (batch, num_classes) = (1, 10).
x = model(test_inputs)
print(x.shape)


torch.Size([1, 10])


In [42]:
# List every state_dict entry (weights plus BatchNorm running statistics) with its tensor shape.
for name, params in model.state_dict().items():
    print('name:{}, shape:{}'.format(name,params.data.size()))

name:conv1.weight, shape:torch.Size([64, 3, 7, 7])
name:bn1.weight, shape:torch.Size([64])
name:bn1.bias, shape:torch.Size([64])
name:bn1.running_mean, shape:torch.Size([64])
name:bn1.running_var, shape:torch.Size([64])
name:bn1.num_batches_tracked, shape:torch.Size([])
name:layer1.0.conv1.weight, shape:torch.Size([64, 64, 3, 3])
name:layer1.0.bn1.weight, shape:torch.Size([64])
name:layer1.0.bn1.bias, shape:torch.Size([64])
name:layer1.0.bn1.running_mean, shape:torch.Size([64])
name:layer1.0.bn1.running_var, shape:torch.Size([64])
name:layer1.0.bn1.num_batches_tracked, shape:torch.Size([])
name:layer1.0.conv2.weight, shape:torch.Size([64, 64, 3, 3])
name:layer1.0.bn2.weight, shape:torch.Size([64])
name:layer1.0.bn2.bias, shape:torch.Size([64])
name:layer1.0.bn2.running_mean, shape:torch.Size([64])
name:layer1.0.bn2.running_var, shape:torch.Size([64])
name:layer1.0.bn2.num_batches_tracked, shape:torch.Size([])
name:layer1.1.conv1.weight, shape:torch.Size([64, 64, 3, 3])
name:layer1.1.bn

In [30]:
# Print the name and full tensor values of every parameter that requires gradients.
# NOTE(review): this dumps whole weight tensors, so the cell output is very long;
# consider printing only shapes or summary statistics before sharing the notebook.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

conv1.weight tensor([[[[-0.0497, -0.0817, -0.0603,  ..., -0.0362, -0.0054, -0.0549],
          [-0.0591, -0.0106,  0.0173,  ..., -0.0327,  0.0594, -0.0422],
          [ 0.0393,  0.0549, -0.0377,  ..., -0.0699, -0.0089, -0.0287],
          ...,
          [ 0.0385,  0.0424,  0.0320,  ..., -0.0080, -0.0634, -0.0778],
          [ 0.0315, -0.0515, -0.0653,  ..., -0.0606,  0.0223, -0.0783],
          [-0.0237,  0.0015,  0.0174,  ...,  0.0591,  0.0454, -0.0031]],

         [[ 0.0542, -0.0580, -0.0248,  ..., -0.0531,  0.0125, -0.0086],
          [ 0.0617, -0.0446, -0.0692,  ..., -0.0054,  0.0580,  0.0564],
          [ 0.0047, -0.0246, -0.0233,  ..., -0.0693,  0.0115,  0.0443],
          ...,
          [ 0.0438,  0.0629, -0.0001,  ...,  0.0362, -0.0699, -0.0424],
          [-0.0765,  0.0230, -0.0533,  ..., -0.0634, -0.0233,  0.0175],
          [ 0.0251,  0.0469,  0.0058,  ...,  0.0630, -0.0739,  0.0651]],

         [[ 0.0078, -0.0796,  0.0719,  ..., -0.0479,  0.0713, -0.0395],
          [-0.082