<a href="https://colab.research.google.com/github/z-arabi/pytorchTutorial/blob/master/03_autograd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch

In [43]:
# torch.autograd performs automatic differentiation for every
# operation applied to a Tensor.
#
# Setting requires_grad=True makes autograd record the operations on the
# tensor so the local gradients can be computed for us later.
#
# Note: only floating point and complex tensors may require gradients.
x = torch.randn(3, requires_grad=True)
y = torch.add(x, 2)

# x is a leaf created directly by the user, so it has no grad_fn.
# y is the result of an operation, so it carries a grad_fn that references
# the Function which produced it.
print(x)
print(y)
print(y.grad_fn)

tensor([ 0.7066, -1.7620,  2.1604], requires_grad=True)
tensor([2.7066, 0.2380, 4.1604], grad_fn=<AddBackward0>)
<AddBackward0 object at 0x7fa003cd0310>


In [44]:
# z is the mean of y = x + 2, i.e. z = (y1 + y2 + y3) / 3.
# dy/dx = 1 and dz/dy_i = 1/3, so dz/dx_i = 1/3 for every element:
# the gradient does not depend on the values stored in x, and every
# index gets the same weight.
z = torch.mean(y)
z.backward()
print(x.grad)

tensor([0.3333, 0.3333, 0.3333])


In [45]:
# Stack further operations on top of y
z = torch.mul(y * y, 3)  # element-wise: z_i = 3 * y_i^2
print(z)
z = torch.mean(z)  # (3*y1^2 + 3*y2^2 + 3*y3^2) / 3 = y1^2 + y2^2 + y3^2 -> a single scalar
print(z)

tensor([21.9774,  0.1700, 51.9269], grad_fn=<MulBackward0>)
tensor(24.6914, grad_fn=<MeanBackward0>)


In [46]:
# Let's compute the gradients with backpropagation
# When we finish our computation we can call .backward() and have all the gradients computed automatically.
# The gradient for this tensor will be accumulated into the .grad attribute.
# It is the partial derivative of the function w.r.t. the tensor

# z = mean(3*y^2) = y1^2 + y2^2 + y3^2 and y = x + 2, so dz/dx_i = 2*y_i.
# x.grad already holds 1/3 per element from the earlier backward() on y.mean(),
# and backward() adds to it, so the value printed below is 2*y_i + 1/3
# rather than 2*y_i alone.
z.backward() # adds dz/dx = 2y into x.grad
print(x.grad) # accumulated gradient: 2y + 1/3

# Generally speaking, torch.autograd is an engine for computing vector-Jacobian products
# It computes partial derivatives while applying the chain rule

tensor([5.7466, 0.8094, 8.6541])


In [47]:
# The same computation again, this time with gradient tracking disabled.
x = torch.randn(3, requires_grad=False)
y = torch.add(x, 2)

print(x)
print(y)
print(y.grad_fn)  # None: no operation history is recorded

z = torch.mul(y * y, 3)
print(z)
z = torch.mean(z)
print(z)

# backward() can only be called on a tensor that has a grad_fn.
# z has none here, so uncommenting the lines below raises an error.
# z.backward()
# print(x.grad) # dz/dx

tensor([ 1.1011,  0.7401, -1.4518])
tensor([3.1011, 2.7401, 0.5482])
None
tensor([28.8509, 22.5252,  0.9016])
tensor(17.4259)


In [15]:
# Non-scalar outputs:
# backward() without arguments only works for a scalar. When the tensor has
# more than one element we must pass a `gradient` tensor of the same shape;
# autograd then computes the vector-Jacobian product with it.

x = torch.randn(3, requires_grad=True)

# Eleven doublings in total, so y = 2048 * x
y = torch.mul(x, 2)
for _ in range(10):
    y = torch.mul(y, 2)

print(y)
print(y.shape)

# Weights applied to each output element when computing the gradient
v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float32)
y.backward(gradient=v)
print(x.grad)

tensor([-696.2772, -833.7890, 1378.6473], grad_fn=<MulBackward0>)
torch.Size([3])
tensor([2.0480e+02, 2.0480e+03, 2.0480e-01])


In [16]:
# Stop a tensor from tracking history:
# For example during our training loop when we want to update our weights
# then this update operation should not be part of the gradient computation
# - x.requires_grad_(False)
# - x.detach()
# - wrap in 'with torch.no_grad():'

In [48]:
# .requires_grad_(...) flips the requires_grad flag of an existing tensor in-place.
a = torch.randn((2, 2))
print(a.requires_grad)  # tensors do not track gradients by default
b = torch.div(a * 3, a - 1)
print(b.grad_fn)  # nothing was tracked, so there is no grad_fn
a.requires_grad_(requires_grad=True)
print(a.requires_grad)
b = torch.sum(a * a)
print(b.grad_fn)  # operations on `a` are now recorded

False
None
True
<SumBackward0 object at 0x7fa003ce2990>


In [49]:
# .detach() returns a new Tensor holding the same content,
# but excluded from gradient computation.
a = torch.randn((2, 2), requires_grad=True)
print(a.requires_grad)
b = a.detach()
print(b.requires_grad)

True
False


In [50]:
# torch.no_grad(): results of operations executed inside the context manager
# do not require gradients, even when the input tensor does.
a = torch.randn(2, 2, requires_grad=True)
print(a.requires_grad)
with torch.no_grad():
    # Operate on the tensor created in this cell so the cell is
    # self-contained and does not depend on variables from other cells.
    print((a ** 2).requires_grad)

True
False


In [52]:
# backward() adds the new gradient onto whatever is already stored in .grad.
# !!! This matters during optimization !!!
# Reset the gradients with .zero_() before the next optimization step.
weights = torch.ones(4, requires_grad=True)

for epoch in range(3):
    # dummy forward pass standing in for a real model
    model_output = torch.sum(weights * 3)
    model_output.backward()

    print(weights.grad)

    # Gradient-descent update; done under no_grad so the update itself
    # is not recorded by autograd.
    with torch.no_grad():
        weights.sub_(0.1 * weights.grad)

    # Important: without this reset the gradients of successive epochs
    # would pile up and change the final weights and output.
    weights.grad.zero_()

print(weights)
print(model_output)

tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([0.1000, 0.1000, 0.1000, 0.1000], requires_grad=True)
tensor(4.8000, grad_fn=<SumBackward0>)


In [53]:
# Instead of zeroing weights.grad by hand, an optimizer can do it:
# torch.optim optimizers provide a zero_grad() method.
optimizer = torch.optim.SGD([weights], lr=0.1)
# During training, after backward() has populated the gradients:
# step() updates the parameters from their current .grad, and
# zero_grad() clears the gradients before the next iteration.
optimizer.step()
optimizer.zero_grad()