In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import torch

import matplotlib.pyplot as plt

In [2]:
from torchvision import datasets, transforms

# Define a transform to normalize the data
transform = transforms.Compose([transforms.ToTensor(),
                              transforms.Normalize((0.5,), (0.5,)),
                              ])
# Download and load the training data
trainset = datasets.MNIST('~/.pytorch/MNIST_data/', download=True, train=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)

In [3]:
dataiter = iter(trainloader)
images, labels = dataiter.next()
print(type(images))
print(images.shape)
print(labels.shape)

<class 'torch.Tensor'>
torch.Size([64, 1, 28, 28])
torch.Size([64])


In [4]:
def activation(x):
    return 1/(1+torch.exp(-x))

# Flatten the input images
#images.shape[0] is the batch size that is equal to 64
#-1 is equivalent of 784
inputs = images.view(images.shape[0], -1)

# Create parameters
w1 = torch.randn(784, 256)
b1 = torch.randn(256)

w2 = torch.randn(256, 10)
b2 = torch.randn(10)

#from input to hidden layer
h = activation(torch.mm(inputs, w1) + b1)

#from hidden layer to output
out = torch.mm(h, w2) + b2

We want to pass in an image to our network and get out a probability distribution over the classes that tells us the likely class(es) the image belongs to.

In [5]:
# Softmax function
def softmax(x):
    return torch.exp(x)/torch.sum(torch.exp(x), dim=1).view(-1,1)

probabilities = softmax(out)

## Building networks with PyTorch

In [4]:
from torch import nn

In [29]:
class Network(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.hidden = nn.Linear(784, 256)
        self.output = nn.Linear(256, 10)
        
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, x):
        x = self.hidden(x)
        x = self.sigmoid(x)
        x = self.output(x)
        x = self.softmax(x)
         
        return x

model = Network()
model

Network(
  (hidden): Linear(in_features=784, out_features=256, bias=True)
  (output): Linear(in_features=256, out_features=10, bias=True)
  (sigmoid): Sigmoid()
  (softmax): Softmax(dim=1)
)

We can do the same architetture in a cleaner way using the torch.nn.functional module.

In [5]:
import torch.nn.functional as F

class Network(nn.Module):
    def __init__(self):
        super().__init__()
        # Inputs to hidden layer linear transformation
        self.hidden = nn.Linear(784, 256)
        # Output layer, 10 units - one for each digit
        self.output = nn.Linear(256, 10)
        
    def forward(self, x):
        # Hidden layer with sigmoid activation
        x = F.sigmoid(self.hidden(x))
        # Output layer with softmax activation
        x = F.softmax(self.output(x), dim=1)
        
        return x
    
model = Network()
model

Network(
  (hidden): Linear(in_features=784, out_features=256, bias=True)
  (output): Linear(in_features=256, out_features=10, bias=True)
)

In [6]:
import torch.nn.functional as F

class Network(nn.Module):
    def __init__(self):
        super().__init__()
        # Defining the layers, 128, 64, 10 units each
        self.fc1 = nn.Linear(784, 128)
        self.fc2 = nn.Linear(128, 64)
        # Output layer, 10 units - one for each digit
        self.fc3 = nn.Linear(64, 10)
        
    def forward(self, x):
        ''' Forward pass through the network, returns the output logits '''
        
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.softmax(self.fc3(x), dim=1)
        
        return x

model = Network()
model

Network(
  (fc1): Linear(in_features=784, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=10, bias=True)
)

### Initializing weights and biases

The weights and such are automatically initialized for you, but it's possible to customize them

In [7]:
print(model.fc1.weight)
print(model.fc1.bias)

Parameter containing:
tensor([[ 0.0155,  0.0174, -0.0056,  ..., -0.0340, -0.0355, -0.0222],
        [-0.0040, -0.0357,  0.0069,  ..., -0.0307,  0.0221, -0.0314],
        [ 0.0080, -0.0236,  0.0210,  ..., -0.0354,  0.0245, -0.0065],
        ...,
        [-0.0293,  0.0064,  0.0294,  ..., -0.0275, -0.0154, -0.0019],
        [-0.0190,  0.0011, -0.0316,  ..., -0.0030, -0.0355,  0.0354],
        [ 0.0175,  0.0088, -0.0012,  ..., -0.0202,  0.0020, -0.0237]],
       requires_grad=True)
Parameter containing:
tensor([ 0.0120,  0.0247, -0.0237, -0.0140, -0.0316,  0.0273,  0.0235, -0.0230,
         0.0151, -0.0214,  0.0305, -0.0271,  0.0096,  0.0239, -0.0084, -0.0199,
         0.0214, -0.0285,  0.0344, -0.0066,  0.0305, -0.0092,  0.0169,  0.0063,
        -0.0224,  0.0217, -0.0110,  0.0345, -0.0130, -0.0084, -0.0317,  0.0052,
         0.0311,  0.0111,  0.0225,  0.0125,  0.0137, -0.0238, -0.0350,  0.0171,
        -0.0117, -0.0230, -0.0346, -0.0111, -0.0113,  0.0306,  0.0152, -0.0068,
        -0.0041

In [8]:
# Set biases to all zeros
model.fc1.bias.data.fill_(0)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.])

In [9]:
# sample from random normal with standard dev = 0.01
model.fc1.weight.data.normal_(std=0.01)

tensor([[-0.0292,  0.0140,  0.0068,  ..., -0.0009,  0.0109, -0.0028],
        [ 0.0189,  0.0049, -0.0018,  ...,  0.0048,  0.0025,  0.0035],
        [ 0.0152,  0.0033,  0.0029,  ...,  0.0134, -0.0114,  0.0097],
        ...,
        [-0.0006,  0.0046, -0.0013,  ...,  0.0018,  0.0051,  0.0213],
        [-0.0162, -0.0092,  0.0026,  ...,  0.0080, -0.0035, -0.0030],
        [-0.0028, -0.0014,  0.0068,  ..., -0.0007,  0.0004,  0.0009]])

In [10]:
# sample from random normal with standard dev = 0.01
model.fc1.weight.data.normal_(std=0.01)

tensor([[ 3.8300e-03,  2.1343e-03,  4.5210e-03,  ...,  1.2556e-02,
          8.4191e-03, -7.7170e-03],
        [-1.3803e-02, -1.6898e-02, -1.6969e-02,  ...,  5.7594e-03,
          1.9165e-02,  1.9431e-02],
        [ 3.1676e-03,  1.5446e-03, -5.9974e-03,  ..., -1.5712e-02,
          1.0880e-02,  1.0024e-02],
        ...,
        [ 1.1801e-02, -7.1896e-03, -1.7923e-03,  ..., -1.0867e-02,
         -1.7662e-03, -3.2468e-03],
        [-1.4617e-02,  2.2918e-03, -7.0998e-03,  ...,  8.0740e-03,
         -1.9728e-02, -3.1923e-04],
        [ 2.0913e-03, -2.3715e-03,  7.7373e-05,  ...,  2.2230e-02,
          3.0970e-03,  2.0175e-02]])

In [20]:
# Grab some data 
dataiter = iter(trainloader)
images, labels = dataiter.next()

## Using nn.Sequential

In [21]:
#Architetture of the network
input_size = 784
hidden_size = [128, 64]
output_size = 10

#Build a feed forward network
model = nn.Sequential(nn.Linear(input_size, hidden_size[0]),
                     nn.ReLU(),
                     nn.Linear(hidden_size[0], hidden_size[1]),
                     nn.ReLU(),
                     nn.Linear(hidden_size[1], output_size),
                     nn.Softmax(dim=1))
print(model)

Sequential(
  (0): Linear(in_features=784, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=10, bias=True)
  (5): Softmax(dim=1)
)


In [22]:
# Forward pass through the network and display output
images, labels = next(iter(trainloader))
images.resize_(images.shape[0], 1, 784)
ps = model.forward(images[0,:])

In [None]:
ps

## Calculate the Gradient

In [8]:
from torch import nn
import torch.nn.functional as F

In [13]:
#Build a feed-froward network
model = nn.Sequential(nn.Linear(784, 128),
                      nn.ReLU(),
                      nn.Linear(128, 64),
                      nn.ReLU(),
                      nn.Linear(64, 10),
                      nn.Softmax(dim=1))

#Loss function: Negative Log Likelihood Loss
criterion = nn.NLLLoss()

#Get the data
images, labels = next(iter(trainloader))
images = images.view(images.shape[0], 784)

logps = model(images)
loss = criterion(logps, labels)

In [14]:
#model[0] gives us the parameters for the first Linear layer
print('Before Backpropagation:' + str(model[0].weight.grad))

loss.backward()

print('After Backpropagation: '+ str(model[0].weight.grad))

Before Backpropagation:None
After Backpropagation: tensor([[-3.1699e-04, -3.1699e-04, -3.1699e-04,  ..., -3.1699e-04,
         -3.1699e-04, -3.1699e-04],
        [ 2.4233e-04,  2.4233e-04,  2.4233e-04,  ...,  2.4233e-04,
          2.4233e-04,  2.4233e-04],
        [ 9.0017e-05,  9.0017e-05,  9.0017e-05,  ...,  9.0017e-05,
          9.0017e-05,  9.0017e-05],
        ...,
        [ 1.5411e-04,  1.5411e-04,  1.5411e-04,  ...,  1.5411e-04,
          1.5411e-04,  1.5411e-04],
        [-5.1036e-04, -5.1036e-04, -5.1036e-04,  ..., -5.1036e-04,
         -5.1036e-04, -5.1036e-04],
        [ 1.3148e-04,  1.3148e-04,  1.3148e-04,  ...,  1.3148e-04,
          1.3148e-04,  1.3148e-04]])


How to use the gradient to update the weights

In [15]:
from torch import optim

In [25]:
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [36]:
#Build a feed-froward network
model = nn.Sequential(nn.Linear(784, 128),
                      nn.ReLU(),
                      nn.Linear(128, 64),
                      nn.ReLU(),
                      nn.Linear(64,10),
                      nn.Softmax(dim=1))

#Loss function: Negative Log Likelihood Loss
criterion = nn.NLLLoss()

#Get the data
images, labels = next(iter(trainloader))
images = images.view(64, 784)

#Clear the gradient, do this because gradients are accumulated
optimizer.zero_grad()

#Forward pass, then backward pass, then update weights
output = model.forward(images)
print('Initial weights - ', model[0].weight)

loss = criterion(output, labels)
loss.backward()

print('Gradient - ', model[0].weight.grad)

Initial weights -  Parameter containing:
tensor([[-0.0324, -0.0063, -0.0316,  ..., -0.0265, -0.0020,  0.0051],
        [-0.0072, -0.0100,  0.0199,  ...,  0.0355,  0.0174,  0.0256],
        [ 0.0054, -0.0211,  0.0330,  ..., -0.0315,  0.0265,  0.0215],
        ...,
        [-0.0195, -0.0081,  0.0065,  ...,  0.0164,  0.0235, -0.0157],
        [ 0.0226, -0.0145,  0.0066,  ...,  0.0064,  0.0251,  0.0202],
        [ 0.0260,  0.0083,  0.0239,  ..., -0.0185,  0.0235, -0.0209]],
       requires_grad=True)
Gradient -  tensor([[ 1.6906e-04,  1.6906e-04,  1.6906e-04,  ...,  1.6906e-04,
          1.6906e-04,  1.6906e-04],
        [-1.9482e-05, -1.9482e-05, -1.9482e-05,  ..., -1.9482e-05,
         -1.9482e-05, -1.9482e-05],
        [ 6.0669e-05,  6.0669e-05,  6.0669e-05,  ...,  6.0669e-05,
          6.0669e-05,  6.0669e-05],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-1.7794e-05, -1.7794e-05, -1.7794e-05,  ..., -1.7794

In [37]:
#take an update step and few the new weights
optimizer.step()
print('Updated weights - ', model[0].weight)

Updated weights -  Parameter containing:
tensor([[-0.0324, -0.0063, -0.0316,  ..., -0.0265, -0.0020,  0.0051],
        [-0.0072, -0.0100,  0.0199,  ...,  0.0355,  0.0174,  0.0256],
        [ 0.0054, -0.0211,  0.0330,  ..., -0.0315,  0.0265,  0.0215],
        ...,
        [-0.0195, -0.0081,  0.0065,  ...,  0.0164,  0.0235, -0.0157],
        [ 0.0226, -0.0145,  0.0066,  ...,  0.0064,  0.0251,  0.0202],
        [ 0.0260,  0.0083,  0.0239,  ..., -0.0185,  0.0235, -0.0209]],
       requires_grad=True)


## Summary - Training the Network

In [4]:
#Import modules
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch import optim

In [11]:
x,y = next(iter(trainloader))

model = nn.Sequential(nn.Linear(784, 128),
                      nn.ReLU(),
                      nn.Linear(128, 64),
                      nn.ReLU(),
                      nn.Linear(64,10),
                      nn.Softmax(dim=1))

loss_func = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.03)

epochs = 10
for e in range(epochs):
    running_loss = 0
    for x,y in trainloader:
        #Flattening
        x = x.view(x.shape[0], -1)
        
        #Clear the gradient
        optimizer.zero_grad()
        
        #Training
        y_hat = model.forward(x)
        loss = loss_func(y_hat, y)
        loss.backward()
        optimizer.step()
        
        running_loss = running_loss + loss.item()
    
    else:
        print(f'Training loss: {running_loss/len(trainloader)}')

Training loss: -0.2729356850165802
Training loss: -0.6426626615432788
Training loss: -0.7630584062670848
Training loss: -0.8001316902098625
Training loss: -0.8121029971632113
Training loss: -0.8190476884847002
Training loss: -0.8238644276473568
Training loss: -0.8274958256338196
Training loss: -0.8301071693012709
Training loss: -0.8328505146350942
