In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from torchvision import datasets, transforms

# Preprocessing pipeline: turn each image into a tensor, then normalise it
# with mean 0.5 and std 0.5, i.e. (pixel - 0.5) / 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
])

# MNIST training split stored under MNIST_data/ (download requested), served
# in shuffled mini-batches of 64 samples.
trainset = datasets.MNIST(root='MNIST_data/', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(dataset=trainset, batch_size=64, shuffle=True)

### Calculating the loss

In [2]:
# Feed-forward classifier: 784 inputs -> 128 -> 64 -> 10 raw class scores.
# The network ends on a plain Linear layer (no softmax); the scores go
# straight into nn.CrossEntropyLoss below.
model = nn.Sequential(
    nn.Linear(in_features=784, out_features=128),
    nn.ReLU(),
    nn.Linear(in_features=128, out_features=64),
    nn.ReLU(),
    nn.Linear(in_features=64, out_features=10),
)

# Cross-entropy criterion applied to the raw scores
criterion = nn.CrossEntropyLoss()

# Take one mini-batch and flatten every image into a single row so the
# batch has shape (batch_size, features).
images, labels = next(iter(trainloader))
images = images.view(images.size(0), -1)

# Forward pass produces the logits; the criterion scores them against the labels
logits = model(images)
loss = criterion(logits, labels)

print(loss)

tensor(2.3060, grad_fn=<NllLossBackward>)


### Using NLLLoss with log softmax output

In [3]:
# Same 784 -> 128 -> 64 -> 10 network, now finished with LogSoftmax over the
# class dimension (dim=1), so the model emits log-probabilities.
model = nn.Sequential(
    nn.Linear(in_features=784, out_features=128),
    nn.ReLU(),
    nn.Linear(in_features=128, out_features=64),
    nn.ReLU(),
    nn.Linear(in_features=64, out_features=10),
    nn.LogSoftmax(dim=1),
)

# Negative log-likelihood loss, paired with the LogSoftmax output above
criterion = nn.NLLLoss()

# One mini-batch, each image flattened into a single row
images, labels = next(iter(trainloader))
images = images.view(len(images), -1)

# NOTE: despite the variable name, these are log-probabilities (the model
# ends in LogSoftmax), not raw logits.
logits = model(images)

loss = criterion(logits, labels)
print(loss)

tensor(2.2991, grad_fn=<NllLossBackward>)


### Autograd - Working with Gradients

Now that we know how to calculate a loss, how do we use it to perform backpropagation? Torch provides a module, autograd, for automatically calculating the gradients of tensors. We can use it to calculate the gradients of the loss with respect to all of our parameters. Autograd works by keeping track of the operations performed on tensors, then going backwards through those operations, calculating gradients along the way. To make sure PyTorch keeps track of operations on a tensor and calculates the gradients, you need to set `requires_grad=True` on that tensor. You can do this at creation with the `requires_grad` keyword, or at any time with `x.requires_grad_(True)`.

---

You can turn off gradients for a block of code with the `torch.no_grad()` context manager.

You can also turn gradients on or off altogether with `torch.set_grad_enabled(True|False)`.

The gradients of some variable `z` are computed with `z.backward()`. This does a backward pass through the operations that created `z` and stores the result in the `.grad` attribute of the tensors that require gradients.

In [4]:
# Random 2x2 tensor, flagged so that autograd records the operations applied to it
x = torch.randn(2, 2).requires_grad_(True)
print(x)

tensor([[-1.1798, -0.6411],
        [ 0.2666, -1.4253]], requires_grad=True)


In [5]:
# Element-wise square of x; the result carries a grad_fn linking it back to x
y = x.pow(2)
print(y)

tensor([[1.3919, 0.4109],
        [0.0710, 2.0314]], grad_fn=<PowBackward0>)


In [6]:
# grad_fn is the autograd node that records the operation which produced y
print(y.grad_fn)

<PowBackward0 object at 0x00000257C5494E80>


In [8]:
# Reduce y to a single scalar by averaging all of its elements
z = torch.mean(y)
print(z)

tensor(0.9763, grad_fn=<MeanBackward0>)


In [10]:
# backward() has not been called yet, so no gradient has been computed
# for x and x.grad is still None
print(x.grad)

None


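Worked out by hand, with $z = \frac{1}{n}\sum_{i=1}^{n} x_i^2$ and $n = 4$ elements, the gradient of *z* with respect to *x* is $\frac{\partial z}{\partial x} = \frac{2x}{n} = \frac{x}{2}$: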

![image.png](attachment:image.png)

In [11]:
# Backward pass from z: fills x.grad with dz/dx
z.backward()

# z is the mean of x**2 over the 4 elements of x, so dz/dx = 2x/4 = x/2;
# the two printed tensors should therefore hold the same values.
print(x.grad)
print(torch.div(x, 2))

tensor([[-0.5899, -0.3205],
        [ 0.1333, -0.7126]])
tensor([[-0.5899, -0.3205],
        [ 0.1333, -0.7126]], grad_fn=<DivBackward0>)


### Using Loss and Autograd Together

The gradients of the loss are used to update the weights.

In [12]:
# Fresh 784 -> 128 -> 64 -> 10 network ending in LogSoftmax over dim=1
model = nn.Sequential(
    nn.Linear(in_features=784, out_features=128),
    nn.ReLU(),
    nn.Linear(in_features=128, out_features=64),
    nn.ReLU(),
    nn.Linear(in_features=64, out_features=10),
    nn.LogSoftmax(dim=1),
)

# Negative log-likelihood loss to go with the log-probability output
criterion = nn.NLLLoss()

# Single mini-batch with every image flattened to one row
images, labels = next(iter(trainloader))
images = images.view(images.size(0), -1)

# Forward pass; the output holds log-probabilities although it is named `logits`
logits = model(images)

# Scalar loss whose backward pass is run in the next cell
loss = criterion(logits, labels)
print(loss)

tensor(2.2900, grad_fn=<NllLossBackward>)


In [13]:
# No backward pass has run on this model yet, so the first layer's
# weight gradient is still unset
print('Before backward pass: \n', model[0].weight.grad)

# Backpropagate the loss through the network to fill in the .grad attributes
loss.backward()

# The first layer's weight gradient is now populated
print('After backward pass: \n', model[0].weight.grad)

Before backward pass: 
 None
After backward pass: 
 tensor([[ 1.6325e-03,  1.6325e-03,  1.6325e-03,  ...,  1.6325e-03,
          1.6325e-03,  1.6325e-03],
        [-6.5324e-05, -6.5324e-05, -6.5324e-05,  ..., -6.5324e-05,
         -6.5324e-05, -6.5324e-05],
        [-2.2056e-04, -2.2056e-04, -2.2056e-04,  ..., -2.2056e-04,
         -2.2056e-04, -2.2056e-04],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 2.4443e-04,  2.4443e-04,  2.4443e-04,  ...,  2.4443e-04,
          2.4443e-04,  2.4443e-04],
        [ 1.2610e-03,  1.2610e-03,  1.2610e-03,  ...,  1.2610e-03,
          1.2610e-03,  1.2610e-03]])


### Optimizers

In [14]:
from torch import optim

# SGD optimizer over all of the model's parameters, with a learning rate of 0.01
optimizer = optim.SGD(model.parameters(), lr=0.01)

The general steps for training in PyTorch:

- Make a forward pass through the network
- Use the network output to calculate the loss
- Perform a backward pass through the network with `loss.backward()` to calculate the gradients
- Take a step with the optimizer to update the weights

In [17]:
print("initial weights: \n", model[0].weight)

# PyTorch sums gradients across backward passes by default, so wipe whatever
# is left over from earlier passes before computing new ones.
optimizer.zero_grad()

# Next mini-batch, each image flattened into a single row
images, labels = next(iter(trainloader))
images = images.view(len(images), -1)

# Forward pass and loss
op = model(images)
loss = criterion(input=op, target=labels)

# Backward pass: compute the gradients and show those of the first layer
loss.backward()
print('\nGradient: \n', model[0].weight.grad)

initial weights: 
 Parameter containing:
tensor([[-0.0290, -0.0023,  0.0269,  ...,  0.0183,  0.0299, -0.0282],
        [-0.0045, -0.0276, -0.0174,  ..., -0.0256,  0.0131,  0.0210],
        [-0.0194,  0.0220, -0.0201,  ..., -0.0066, -0.0354,  0.0093],
        ...,
        [ 0.0089, -0.0083,  0.0199,  ..., -0.0333,  0.0296, -0.0076],
        [-0.0023, -0.0268,  0.0169,  ..., -0.0058,  0.0144,  0.0014],
        [-0.0285, -0.0091, -0.0101,  ..., -0.0348,  0.0015, -0.0008]],
       requires_grad=True)

Gradient: 
 tensor([[ 1.4998e-04,  1.4998e-04,  1.4998e-04,  ...,  1.4998e-04,
          1.4998e-04,  1.4998e-04],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 1.9934e-03,  1.9934e-03,  1.9934e-03,  ...,  1.9934e-03,
          1.9934e-03,  1.9934e-03],
        ...,
        [-4.7124e-05, -4.7124e-05, -4.7124e-05,  ..., -4.7124e-05,
         -4.7124e-05, -4.7124e-05],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.000

In [18]:
# Take one optimizer step, updating the weights from the gradients
# computed by the preceding loss.backward() call
optimizer.step()
print('Updated weights: \n', model[0].weight)

Updated weights: 
 Parameter containing:
tensor([[-0.0290, -0.0023,  0.0269,  ...,  0.0183,  0.0299, -0.0282],
        [-0.0045, -0.0276, -0.0174,  ..., -0.0256,  0.0131,  0.0210],
        [-0.0194,  0.0220, -0.0201,  ..., -0.0066, -0.0354,  0.0093],
        ...,
        [ 0.0089, -0.0083,  0.0199,  ..., -0.0333,  0.0296, -0.0076],
        [-0.0023, -0.0268,  0.0169,  ..., -0.0058,  0.0144,  0.0014],
        [-0.0286, -0.0091, -0.0101,  ..., -0.0349,  0.0015, -0.0009]],
       requires_grad=True)
