### Deep Convolutional Neural Network

In this example, we are going to construct a network that looks like:

![Convolutional neural network](../images/mnist.png)

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

### The Network class

In [2]:
class Network(nn.Module):
    """Small CNN: two 1x1 convolutions followed by three fully connected layers.

    Each convolution is followed by a 2x2 max-pool and a ReLU. ``fc1`` is sized
    for ``16 * 8 * 8`` features, i.e. the layer sizes assume a 32 x 32 input
    image (two poolings halve 32 -> 16 -> 8; the 1x1 kernels keep the size).

    Keyword Args:
        lr (float): learning rate stored on the module. Default ``1e-2``.
        max_iter (int): number of training iterations stored on the module.
            Default ``10000``.
        img_channels (int): number of input image channels. Default ``1``.
        num_classes (int): number of output units. Default ``10``.
    """

    def __init__(self, **kwargs):
        super(Network, self).__init__()

        # Hyper-parameters
        self.lr = kwargs.get('lr', 1e-2)
        self.max_iter = kwargs.get('max_iter', 10000)
        self.img_channels = kwargs.get('img_channels', 1)
        self.num_classes = kwargs.get('num_classes', 10)

        # 2 convolutional & 3 fully connected layers
        self.conv1 = nn.Conv2d(self.img_channels, 6, 1)
        self.conv2 = nn.Conv2d(6, 16, 1)
        # 8 x 8 is the spatial size left after the two 2x2 poolings in forward().
        flatten_size = self.conv2.out_channels * 8 * 8
        self.fc1 = nn.Linear(flatten_size, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, self.num_classes)

    def forward(self, x):
        """Run a batch of images through the network.

        Args:
            x: input batch laid out as (batch, img_channels, height, width).

        Returns:
            Tensor of shape (batch, num_classes) with sigmoid activations,
            so every value lies in [0, 1].
        """
        # Convolutional layers
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        # Flatten layer: keep the batch dimension explicit instead of using -1.
        x = x.view(x.size(0), self._flatten(x))
        # Fully connected layers
        x = F.relu(self.fc1(x))     # relu + linear
        # F.dropout defaults to training=True; tie it to the module's mode so
        # that dropout is switched off after net.eval().
        x = F.dropout(x, p=0.5, training=self.training)  # 50% dropout
        x = F.relu(self.fc2(x))     # relu + linear
        x = torch.sigmoid(self.fc3(x))  # sigmoid + linear
        return x

    def _flatten(self, x):
        """Return the number of features per sample (product of non-batch dims)."""
        count = 1
        # Multiply the sizes directly; no need to allocate a tensor to count them.
        for dim in x.size()[1:]:
            count *= dim
        return count

### Creating the network and inspecting its parameters

In [3]:
# Build the network with its default hyper-parameters (no keyword overrides).
net = Network()

# Printing a module lists the sub-layers registered on it.
print(net)

Network(
  (conv1): Conv2d (1, 6, kernel_size=(1, 1), stride=(1, 1))
  (conv2): Conv2d (6, 16, kernel_size=(1, 1), stride=(1, 1))
  (fc1): Linear(in_features=1024, out_features=120)
  (fc2): Linear(in_features=120, out_features=84)
  (fc3): Linear(in_features=84, out_features=10)
)


In [4]:
# Materialise the parameter iterator so it can be indexed and counted.
params = [param for param in net.parameters()]
print(f'Length of the Network\'s parameters: {len(params)}')

Length of the Network's parameters: 10


In [5]:
# First entry is conv1's weight; second-to-last is the weight of the final
# linear layer (the last entry is that layer's bias).
conv1_params, ouput_params = params[0], params[-2]

print(f'1st Convolution layer: {conv1_params.size()}')
print(f'Output params: {ouput_params.size()}')

1st Convolution layer: torch.Size([6, 1, 1, 1])
Output params: torch.Size([10, 84])


In [6]:
# Layers expose their parameters by name as well as through net.parameters().
conv1_weights, conv1_bias = net.conv1.weight, net.conv1.bias

print(f'conv1_weights = {conv1_weights.size()}')
print(f'conv1_bias = {conv1_bias.size()}')

conv1_weights = torch.Size([6, 1, 1, 1])
conv1_bias = torch.Size([6])


In [7]:
# fc3 is the final linear layer of the network.
output_weights, output_bias = net.fc3.weight, net.fc3.bias

print(f'Output weights = {output_weights.size()}')
print(f'Output bias = {output_bias.size()}')

Output weights = torch.Size([10, 84])
Output bias = torch.Size([10])


In [8]:
# Create fake input & output
# One random single-channel 32 x 32 image (batch of 1).
X_input = Variable(torch.rand(1, 1, 32, 32))
# Float endpoints keep y_true a floating-point tensor on every PyTorch version;
# with integer endpoints newer releases infer an integer dtype instead.
y_true = Variable(torch.arange(1., 11.))

# Make a prediction
y_pred = net(X_input)
print(y_pred)

Variable containing:
 0.5192  0.5030  0.4779  0.4736  0.5014  0.5227  0.5017  0.4992  0.5225  0.5162
[torch.FloatTensor of size 1x10]



In [9]:
# Loss function (Kullback-Leibler divergence)
# NOTE(review): nn.KLDivLoss is documented to take log-probabilities as input
# and probabilities as target. Here y_pred holds sigmoid outputs and y_true
# holds the values 1..10 -- confirm this loss is really what is intended.
loss_func = nn.KLDivLoss()

# Compare the prediction against the fake target.
loss = loss_func(y_pred, y_true)
print(f'Loss = {loss}')

Loss = Variable containing:
 7.4218
[torch.FloatTensor of size 1]



In [10]:
# Backpropagation
net.zero_grad()   # clears all existing gradients to prevent accumulation

# Before computing gradients
# NOTE: this rebinds conv1_bias to the bias *gradient*, not the bias itself.
conv1_bias = net.conv1.bias.grad
print(f'Before: {conv1_bias}')

# The whole graph is differentiated w.r.t. the loss, and every parameter in
# the graph gets its .grad populated (accumulated) with the gradients.
loss.backward()

# After computing gradients
conv1_bias = net.conv1.bias.grad
print(f'After: {conv1_bias}')

Before: None
After: Variable containing:
1.00000e-03 *
  0.0000
  0.0000
  2.0978
  0.0000
  3.9257
  0.0000
[torch.FloatTensor of size 6]



In [11]:
# Update weights
# Read the learning rate and iteration count from the hyper-parameters stored
# on the network instead of repeating their values here, so the two cannot
# drift apart (the defaults are still lr=1e-2 and max_iter=10000).
optimizer = optim.Adam(net.parameters(), lr=net.lr)

for i in range(net.max_iter):
    optimizer.zero_grad()  # zero out the gradient buffer

    y_pred = net(X_input)

    loss = loss_func(y_pred, y_true)
    loss.backward()

    optimizer.step()  # apply the gradient-descent update
    # '\r' rewrites the same output line instead of printing one line per step.
    print(f'\rIter: {i+1:,}', end='')

Iter: 10,000

In [15]:
# Predict again after the training loop and show the result next to the target.
# NOTE(review): net.eval() is never called in the visible cells, so the network
# is still in training mode here -- confirm that is intended.
y_pred = net(X_input)
print(y_pred)
print(y_true)

Variable containing:
    1     1     1     1     1     1     1     1     1     1
[torch.FloatTensor of size 1x10]

Variable containing:
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
[torch.FloatTensor of size 10]

