# A GENTLE INTRODUCTION TO TORCH.AUTOGRAD 

https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html#sphx-glr-beginner-blitz-autograd-tutorial-py

- Author: Israel Oliveira [\[e-mail\]](mailto:prof.israel@gmail.com)

In [1]:
%load_ext watermark
%config Completer.use_jedi = False

In [2]:
import pandas as pd

In [3]:
#import matplotlib.pyplot as plt
#%matplotlib inline
#from IPython.core.pylabtools import figsize
#figsize(12, 8)

#import seaborn as sns
#sns.set_theme()

#pd.set_option("max_columns", None)
#pd.set_option("max_rows", None)

#from IPython.display import Markdown, display
#def md(arg):
#    display(Markdown(arg))

#from pandas_profiling import ProfileReport
# report = ProfileReport(#DataFrame here#, minimal=True)
# report.to

#import pyarrow.parquet as pq
# df = pq.ParquetDataset(path_to_folder_with_parquets, filesystem=None).read_pandas().to_pandas()

In [4]:
# Run this cell before close.
%watermark -d --iversion -b -r -g -m -v
!cat /proc/cpuinfo |grep 'model name'|head -n 1 |sed -e 's/model\ name/CPU/'
!free -h |cut -d'i' -f1  |grep -v total

Python implementation: CPython
Python version       : 3.7.9
IPython version      : 7.19.0

Compiler    : GCC 8.3.0
OS          : Linux
Release     : 5.8.0-7630-generic
Machine     : x86_64
Processor   : 
CPU cores   : 8
Architecture: 64bit

Git hash: 2510f7738f3ed9d66809d590b483f6ee509698d4

Git repo: https://github.com/ysraell/examples.git

Git branch: master

pandas    : 1.2.1
json      : 2.0.9
numpy     : 1.19.5
ipywidgets: 7.6.3
sys       : 3.7.9 (default, Jan 12 2021, 17:26:22) 
[GCC 8.3.0]

CPU	: Intel(R) Xeon(R) CPU E3-1241 v3 @ 3.50GHz
Mem:           31G
Swap:         4.0G


In [5]:
import torch
import torchvision

# Seed the global RNG so that the random `data` / `labels` below, and every
# later torch.rand / torch.randn call and layer initialisation, produce the
# same values on each Restart Kernel -> Run All.
torch.manual_seed(0)

# ResNet-18 with pretrained weights (downloaded into the torch hub cache on
# first use).
model = torchvision.models.resnet18(pretrained=True)
# One random 3-channel 64x64 input and a random 1000-element label vector.
data = torch.rand(1, 3, 64, 64)
labels = torch.rand(1, 1000)

Downloading: "https://download.pytorch.org/models/resnet18-5c106cde.pth" to /root/.cache/torch/hub/checkpoints/resnet18-5c106cde.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

In [6]:
# Forward pass: run the input batch through the model to get its prediction.
prediction = model(data)

In [11]:
prediction.shape

torch.Size([1, 1000])

In [12]:
# Reduce the element-wise difference between prediction and labels to a
# single scalar so that backward() can be called without arguments.
loss = (prediction - labels).sum()
# Backpropagate from the scalar loss through the model.
loss.backward() # backward pass

In [13]:
# SGD over every parameter of the model (learning rate 0.01, momentum 0.9).
# NOTE(review): this binds the name `optim` to an optimizer *instance*; a later
# cell rebinds `optim` to the `torch.optim` module via `from torch import nn, optim`.
optim = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)

In [14]:
# Apply one optimizer update to the parameters registered with `optim`.
optim.step() #gradient descent

In [15]:
import torch

# Two 2-element tensors created with requires_grad=True so that autograd
# tracks the operations performed on them.
a = torch.tensor([2., 3.], requires_grad=True)
b = torch.tensor([6., 4.], requires_grad=True)

In [16]:
# Q = 3a^3 - b^2, evaluated element-wise, so Q has one entry per entry of a/b.
Q = 3*a**3 - b**2

In [17]:
Q

tensor([-12.,  65.], grad_fn=<SubBackward0>)

In [18]:
# Q is a vector rather than a scalar, so backward() is given an explicit
# upstream gradient with one entry (here 1.0) per element of Q.
external_grad = torch.tensor([1.0, 1.0])
Q.backward(external_grad)

In [19]:
# Compare autograd's result with the analytic derivatives of Q = 3a^3 - b^2:
# dQ/da = 9a^2 and dQ/db = -2b.
print(9*a**2 == a.grad)
print(-2*b == b.grad)

tensor([True, True])
tensor([True, True])


In [22]:
# x and y are created without requires_grad; only z asks for gradient tracking.
x = torch.rand(5, 5)
y = torch.rand(5, 5)
z = torch.rand((5, 5), requires_grad=True)

# NOTE(review): `a` and `b` are rebound here; from this point on they no longer
# refer to the tensors of the same names created in the earlier cell.
# A sum of two untracked tensors ...
a = x + y
print(f"Does `a` require gradients? : {a.requires_grad}")
# ... versus a sum in which one operand (z) has requires_grad=True.
b = x + z
print(f"Does `b` require gradients?: {b.requires_grad}")

Does `a` require gradients? : False
Does `b` require gradients?: True


In [23]:
from torch import nn, optim

# Load a fresh pretrained ResNet-18 and freeze it in one step:
# Module.requires_grad_(False) switches off requires_grad on every parameter
# of the network and returns the module itself.
model = torchvision.models.resnet18(pretrained=True).requires_grad_(False)

In [25]:
model.fc

Linear(in_features=512, out_features=1000, bias=True)

In [26]:
# Swap the classifier head for a new 10-output linear layer, taking the input
# width from the layer being replaced instead of spelling it out.
model.fc = nn.Linear(model.fc.in_features, 10)
model.fc

Linear(in_features=512, out_features=10, bias=True)

In [27]:
# Only the parameters of the new `fc` head are handed to the optimizer
# (SGD, learning rate 0.01, momentum 0.9).
optimizer = optim.SGD(model.fc.parameters(), lr=1e-2, momentum=0.9)

In [30]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Small convolutional network: two conv + max-pool stages, then three linear layers."""

    def __init__(self):
        super().__init__()
        # Convolutions: 1 input channel -> 6 feature maps -> 16 feature maps,
        # both with 3x3 square kernels.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=3)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=3)
        # Affine layers (y = Wx + b).  fc1 takes the 16 feature maps of size
        # 6x6 flattened into a single vector.
        self.fc1 = nn.Linear(in_features=16 * 6 * 6, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Run a batch through the network and return its 10 outputs per sample."""
        # Stage 1: conv -> ReLU -> max pooling over a (2, 2) window.
        activated = F.relu(self.conv1(x))
        pooled = F.max_pool2d(activated, (2, 2))
        # Stage 2: same pattern; a single number is enough for a square window.
        activated = F.relu(self.conv2(pooled))
        pooled = F.max_pool2d(activated, 2)
        # Collapse every dimension except the batch dimension.
        flat = pooled.view(-1, self.num_flat_features(pooled))
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first (batch) one."""
        count = 1
        for dim in x.size()[1:]:
            count *= dim
        return count


# Instantiate the network and print its layer summary.
net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)



$$
    \text{out}(N_i, C_{\text{out}_j}) = \text{bias}(C_{\text{out}_j}) +
    \sum_{k = 0}^{C_{\text{in}} - 1} \text{weight}(C_{\text{out}_j}, k) \star \text{input}(N_i, k)
$$


In [31]:
# Collect the learnable parameters of `net` and show how many tensors there
# are, plus the shape of the first one.
params = list(net.parameters())
print(len(params))
print(params[0].size())  # conv1's .weight

10
torch.Size([6, 1, 3, 3])


In [32]:
# A random batch holding one single-channel 32x32 input, passed through the net.
# NOTE(review): `input` shadows Python's built-in input(); later cells reuse
# this name, so it is left unchanged here.
input = torch.randn(1, 1, 32, 32)
out = net(input)
print(out)

tensor([[ 0.1267, -0.0987, -0.0999, -0.0923,  0.0331,  0.0562,  0.1144, -0.1260,
          0.1484, -0.1374]], grad_fn=<AddmmBackward>)


In [33]:
# Reset the gradients of all parameters, then backpropagate a random upstream
# gradient of shape (1, 10), matching the shape of `out`.
net.zero_grad()
out.backward(torch.randn(1, 10))

In [34]:
output = net(input)
# Dummy target for the example: 10 random values reshaped to a single row so
# that it has the same shape as `output`.
target = torch.randn(10).view(1, -1)
criterion = nn.MSELoss()

# Mean-squared error between the network output and the dummy target.
loss = criterion(output, target)
print(loss)

tensor(0.6516, grad_fn=<MseLossBackward>)


In [39]:
loss.grad_fn

<MseLossBackward at 0x7f0ab38458d0>

In [40]:
# Walk a few steps backwards through the autograd graph, starting at the loss.
print(loss.grad_fn)  # MSELoss
print(loss.grad_fn.next_functions[0][0])  # Linear (shows up as Addmm)
# Following index [0] again lands on an AccumulateGrad node, i.e. a leaf
# tensor (presumably the last linear layer's bias), not on the ReLU.
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])  # AccumulateGrad, not ReLU

<MseLossBackward object at 0x7f0ab38458d0>
<AddmmBackward object at 0x7f0ab3845310>
<AccumulateGrad object at 0x7f0ab3845690>


In [41]:
# Show conv1's bias gradient before and after backpropagating the loss.
net.zero_grad()     # zeroes the gradient buffers of all parameters

print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)

# Backpropagate the loss through the network.
loss.backward()

print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([ 0.0053, -0.0048, -0.0003,  0.0042,  0.0146, -0.0158])


In [42]:
learning_rate = 0.01
# Manual SGD step: weight <- weight - learning_rate * gradient.
# The in-place update runs under torch.no_grad() instead of going through the
# legacy `.data` attribute: the update itself is still not recorded in the
# graph, but autograd keeps track of the in-place modification.
with torch.no_grad():
    for f in net.parameters():
        if f.grad is None:
            # No gradient has been computed for this parameter; leave it as is.
            continue
        f.sub_(f.grad * learning_rate)

In [43]:
import torch.optim as optim

# create your optimizer
# NOTE(review): this rebinds `optimizer`, which an earlier cell created for
# `model.fc`; from here on it refers to plain SGD over the parameters of `net`.
optimizer = optim.SGD(net.parameters(), lr=0.01)

# in your training loop:
optimizer.zero_grad()   # zero the gradient buffers
output = net(input)     # forward pass
loss = criterion(output, target)
loss.backward()         # compute gradients of the loss w.r.t. the parameters
optimizer.step()    # Does the update