In [10]:
# credit : https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/02-intermediate
# credit : https://blog.waya.ai/deep-residual-learning-9610bb62c355

# Residual network
Network depth is of crucial importance in neural network architectures, but deeper networks are more difficult to train. The residual learning framework eases the training of these networks, and enables them to be substantially deeper — leading to improved performance in both visual and non-visual tasks. These residual networks are much deeper than their ‘plain’ counterparts, yet they require a similar number of parameters (weights).

<img src="iden.png" width="350">

<img src="iden2.png" width="550">

## Residual Networks Behave Like Ensembles of Relatively Shallow Networks
https://arxiv.org/pdf/1605.06431.pdf

Residual networks can be seen as a collection of many paths of differing length. Moreover,
residual networks seem to enable very deep networks by leveraging only the short
paths during training. They avoid the vanishing gradient problem by introducing short paths
which can carry the gradient throughout the extent of very deep networks.
### residual networks can be viewed as a collection of many paths, instead of a single ultra-deep network
<img src="path.png" width="650">
### Residual networks exhibit ensemble-like behavior in the sense that their performance smoothly correlates with the number of valid paths
<img src="path2.png" width="650">

# MAIN code

In [8]:
# ---------------------------------------------------------------------------- #
# An implementation of https://arxiv.org/pdf/1512.03385.pdf                    #
# See section 4.2 for the model architecture on CIFAR-10                       #
# Some part of the code was referenced from below                              #
# https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py   #
# ---------------------------------------------------------------------------- #

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms


# Device configuration: use CUDA when it is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Hyper-parameters
num_epochs = 80
learning_rate = 0.001

# Image preprocessing modules (applied to the training split only):
# pad by 4, random horizontal flip, random 32x32 crop, then convert to a tensor
transform = transforms.Compose([
    transforms.Pad(4),
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32),
    transforms.ToTensor(),
])

# CIFAR-10 dataset
# The training split requests a download and uses the augmentation pipeline above.
train_dataset = torchvision.datasets.CIFAR10(
    root='../../data/', train=True, transform=transform, download=True)

# The test split reads from the same root and is only converted to tensors.
test_dataset = torchvision.datasets.CIFAR10(
    root='../../data/', train=False, transform=transforms.ToTensor())

# Data loader: batches of 100; only the training batches are shuffled
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=100, shuffle=True)

test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=100, shuffle=False)

# 3x3 convolution
def conv3x3(in_channels, out_channels, stride=1):
    """Build a bias-free 3x3 ``nn.Conv2d`` with padding 1.

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels.
        stride: convolution stride (defaults to 1).

    Returns:
        The newly constructed ``nn.Conv2d`` layer.
    """
    conv_options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_channels, out_channels, **conv_options)

# Residual block
class ResidualBlock(nn.Module):
    """Two 3x3 conv + batch-norm layers wrapped by a shortcut connection.

    Args:
        in_channels: channels of the incoming feature map.
        out_channels: channels produced by both convolutions.
        stride: stride of the first convolution; the second always uses 1.
        downsample: optional module applied to the block input to produce the
            shortcut. When it is None the input itself is the shortcut, so the
            input must already have the same shape as the main-branch output
            for the addition in ``forward`` to work.
    """

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(ResidualBlock, self).__init__()
        self.conv1 = conv3x3(in_channels, out_channels, stride)
        self.bn1 = nn.BatchNorm2d(out_channels)
        # A single in-place ReLU module is reused for both activations.
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(out_channels, out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut(x))``."""
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        # Compare against None explicitly: truth-testing a module falls back to
        # __len__, so a shortcut module reporting length 0 would be skipped.
        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out

# ResNet
class ResNet(nn.Module):
    """Small ResNet: a 3x3 conv stem, three residual stages, pooling and a linear head.

    Args:
        block: callable used to build each residual unit; it is invoked as
            ``block(in_channels, out_channels, stride, downsample)`` for the
            first unit of a stage and ``block(out_channels, out_channels)``
            for the remaining ones.
        layers: indexable whose entries 0, 1 and 2 give the number of units in
            the 16-, 32- and 64-channel stages respectively.
        num_classes: output size of the final linear layer.
    """

    def __init__(self, block, layers, num_classes=10):
        super(ResNet, self).__init__()
        # Channel count entering the next stage; make_layer updates it.
        self.in_channels = 16

        # Stem: RGB input -> 16 feature maps
        self.conv = conv3x3(3, 16)
        self.bn = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)

        # Residual stages; the second and third are built with stride 2
        self.layer1 = self.make_layer(block, 16, layers[0])
        self.layer2 = self.make_layer(block, 32, layers[1], 2)
        self.layer3 = self.make_layer(block, 64, layers[2], 2)

        # Head
        self.avg_pool = nn.AvgPool2d(8)
        self.fc = nn.Linear(64, num_classes)

    def make_layer(self, block, out_channels, blocks, stride=1):
        """Assemble one stage of ``blocks`` residual units as an ``nn.Sequential``.

        Only the first unit receives ``stride`` and the shortcut module; the
        shortcut (3x3 conv + batch norm) is created when the stride is not 1
        or the channel count changes. ``self.in_channels`` is set to
        ``out_channels`` afterwards.
        """
        shape_preserved = (stride == 1) and (self.in_channels == out_channels)
        if shape_preserved:
            downsample = None
        else:
            downsample = nn.Sequential(
                conv3x3(self.in_channels, out_channels, stride=stride),
                nn.BatchNorm2d(out_channels))

        first_unit = block(self.in_channels, out_channels, stride, downsample)
        self.in_channels = out_channels

        remaining_units = [block(out_channels, out_channels)
                           for _ in range(1, blocks)]
        return nn.Sequential(first_unit, *remaining_units)

    def forward(self, x):
        """Run the network on a batch; shape notes assume 3x32x32 inputs."""
        feature_stages = (
            self.conv,      # 3x32x32  -> 16x32x32
            self.bn,
            self.relu,
            self.layer1,    # 16x32x32
            self.layer2,    # 32x16x16
            self.layer3,    # 64x8x8
            self.avg_pool,  # 64x1x1
        )
        out = x
        for stage in feature_stages:
            out = stage(out)

        # Flatten everything but the batch dimension: 64x1x1 -> 64
        out = out.view(out.size(0), -1)
        # Linear head: 64 -> num_classes
        return self.fc(out)
    
# Build the network with two residual blocks per stage, then move it to the selected device
model = ResNet(ResidualBlock, [2, 2, 2])
model = model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

# For updating learning rate
def update_lr(optimizer, lr):
    """Overwrite the ``'lr'`` entry of every parameter group of ``optimizer`` with ``lr``."""
    for group in optimizer.param_groups:
        group.update(lr=lr)

# Train the model
total_step = len(train_loader)
curr_lr = learning_rate
for epoch in range(num_epochs):
    # Set training mode explicitly so batch-norm layers update their statistics
    # even if the model was previously switched to eval mode.
    model.train()
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report the current mini-batch loss every 100 steps
        if (i+1) % 100 == 0:
            print("Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}"
                  .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

    # Decay learning rate: divide by 3 after every 20th epoch
    if (epoch+1) % 20 == 0:
        curr_lr /= 3
        update_lr(optimizer, curr_lr)

# Test the model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        # Gradients are already disabled here, so the legacy `.data` access is unnecessary.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Accuracy of the model on the test images: {} %'.format(100 * correct / total))

# Save the model checkpoint
# torch.save(model.state_dict(), 'resnet.ckpt')

Files already downloaded and verified
Epoch [1/80], Step [100/500] Loss: 1.5507
Epoch [1/80], Step [200/500] Loss: 1.5122
Epoch [1/80], Step [300/500] Loss: 1.3580
Epoch [1/80], Step [400/500] Loss: 1.2635
Epoch [1/80], Step [500/500] Loss: 1.1700
Epoch [2/80], Step [100/500] Loss: 0.9670
Epoch [2/80], Step [200/500] Loss: 1.0773
Epoch [2/80], Step [300/500] Loss: 1.0572
Epoch [2/80], Step [400/500] Loss: 0.8995
Epoch [2/80], Step [500/500] Loss: 1.0779
Epoch [3/80], Step [100/500] Loss: 0.9353
Epoch [3/80], Step [200/500] Loss: 0.7813
Epoch [3/80], Step [300/500] Loss: 1.0179
Epoch [3/80], Step [400/500] Loss: 0.8203
Epoch [3/80], Step [500/500] Loss: 0.7667
Epoch [4/80], Step [100/500] Loss: 0.8019
Epoch [4/80], Step [200/500] Loss: 0.8040
Epoch [4/80], Step [300/500] Loss: 0.6801
Epoch [4/80], Step [400/500] Loss: 0.8596
Epoch [4/80], Step [500/500] Loss: 0.7524
Epoch [5/80], Step [100/500] Loss: 0.7482
Epoch [5/80], Step [200/500] Loss: 0.7133
Epoch [5/80], Step [300/500] Loss: 0.5

Epoch [39/80], Step [200/500] Loss: 0.2681
Epoch [39/80], Step [300/500] Loss: 0.1521
Epoch [39/80], Step [400/500] Loss: 0.2268
Epoch [39/80], Step [500/500] Loss: 0.2148
Epoch [40/80], Step [100/500] Loss: 0.1699
Epoch [40/80], Step [200/500] Loss: 0.2497
Epoch [40/80], Step [300/500] Loss: 0.1564
Epoch [40/80], Step [400/500] Loss: 0.3062
Epoch [40/80], Step [500/500] Loss: 0.1750
Epoch [41/80], Step [100/500] Loss: 0.1412
Epoch [41/80], Step [200/500] Loss: 0.2013
Epoch [41/80], Step [300/500] Loss: 0.1395
Epoch [41/80], Step [400/500] Loss: 0.2018
Epoch [41/80], Step [500/500] Loss: 0.2250
Epoch [42/80], Step [100/500] Loss: 0.2846
Epoch [42/80], Step [200/500] Loss: 0.1468
Epoch [42/80], Step [300/500] Loss: 0.1383
Epoch [42/80], Step [400/500] Loss: 0.2277
Epoch [42/80], Step [500/500] Loss: 0.2074
Epoch [43/80], Step [100/500] Loss: 0.2900
Epoch [43/80], Step [200/500] Loss: 0.1559
Epoch [43/80], Step [300/500] Loss: 0.3226
Epoch [43/80], Step [400/500] Loss: 0.2450
Epoch [43/8

Epoch [77/80], Step [300/500] Loss: 0.1348
Epoch [77/80], Step [400/500] Loss: 0.1056
Epoch [77/80], Step [500/500] Loss: 0.1022
Epoch [78/80], Step [100/500] Loss: 0.0894
Epoch [78/80], Step [200/500] Loss: 0.1592
Epoch [78/80], Step [300/500] Loss: 0.0816
Epoch [78/80], Step [400/500] Loss: 0.0841
Epoch [78/80], Step [500/500] Loss: 0.1212
Epoch [79/80], Step [100/500] Loss: 0.3247
Epoch [79/80], Step [200/500] Loss: 0.1257
Epoch [79/80], Step [300/500] Loss: 0.2238
Epoch [79/80], Step [400/500] Loss: 0.1151
Epoch [79/80], Step [500/500] Loss: 0.0976
Epoch [80/80], Step [100/500] Loss: 0.1134
Epoch [80/80], Step [200/500] Loss: 0.1143
Epoch [80/80], Step [300/500] Loss: 0.0861
Epoch [80/80], Step [400/500] Loss: 0.1207
Epoch [80/80], Step [500/500] Loss: 0.1817
Accuracy of the model on the test images: 88.71 %
