In [None]:

import torch
# torch.autograd records the operations applied to tensors so that
# derivatives can be obtained automatically.

# A tensor created with requires_grad=True has every operation on it tracked.
x = torch.randn(3, requires_grad=True)
y = x + 2

# x was created directly by the user, so its grad_fn is None.
# y is the result of an operation, so its grad_fn references the Function
# that produced it.
print(x)
print(y)
print(y.grad_fn)

# Build further on y: first an element-wise expression ...
z = y * y * 3
print(z)
# ... then a reduction to a single value.
z = torch.mean(z)
print(z)

# Backpropagation: once the computation is finished, backward() computes all
# gradients automatically and accumulates them into the .grad attribute of
# the tracked tensor, i.e. the partial derivative of z w.r.t. that tensor.
z.backward()
print(x.grad)  # dz/dx

# Generally speaking, torch.autograd is an engine for computing
# vector-Jacobian products: it applies the chain rule to partial derivatives.

# -------------
# Non-scalar outputs:
# backward() on a tensor with more than one element needs a `gradient`
# argument, a tensor of matching shape. It is the vector used in the
# vector-Jacobian product.

x = torch.randn(3, requires_grad=True)

# Double the input eleven times in total: once here, ten times in the loop.
y = x * 2
for _ in range(10):
    y = y * 2

print(y)
print(y.size())

v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float32)
y.backward(gradient=v)
print(x.grad)

# -------------
# Stop a tensor from tracking history.
# Typical case: the weight update inside a training loop must not itself
# become part of the gradient computation. Three ways to do it:
# - x.requires_grad_(False)
# - x.detach()
# - wrap the code in 'with torch.no_grad():'

# .requires_grad_(...) changes the flag of an existing tensor in place.
a = torch.randn(2, 2)
print(a.requires_grad)  # False: randn does not track unless asked to
b = (a * 3) / (a - 1)
print(b.grad_fn)  # None: nothing was recorded while the flag was off
a.requires_grad_(True)
print(a.requires_grad)
b = (a * a).sum()
print(b.grad_fn)  # set now, because a is tracked

# .detach(): returns a new tensor with the same content that takes no part
# in gradient computation.
a = torch.randn(2, 2, requires_grad=True)
print(a.requires_grad)
b = a.detach()
print(b.requires_grad)

# 'with torch.no_grad():' -- results computed inside the block are not tracked.
a = torch.randn(2, 2, requires_grad=True)
print(a.requires_grad)
with torch.no_grad():
    # Operate on `a`, the tensor whose flag was printed just above, so the
    # contrast (True outside, False inside) is shown on the same tensor.
    print((a ** 2).requires_grad)

# -------------
# Every backward() call adds to the .grad attribute instead of overwriting it.
# !!! This matters during optimization !!!
# Clear the gradients with .zero_() before the next optimization step.
weights = torch.ones(4, requires_grad=True)

for epoch in range(3):
    # toy forward pass standing in for a real model
    model_output = torch.sum(weights * 3)
    model_output.backward()

    print(weights.grad)

    # gradient-descent update; done under no_grad so it is not tracked
    with torch.no_grad():
        weights -= 0.1 * weights.grad

    # reset the accumulated gradient -- skipping this changes the final
    # weights and output
    weights.grad.zero_()

print(weights)
print(model_output)

# Optimizers provide zero_grad() for the same purpose
# optimizer = torch.optim.SGD([weights], lr=0.1)
# During training:
# optimizer.step()
# optimizer.zero_grad()

In [1]:
#The autograd package of PyTorch helps in calculating gradients
#Gradients are important for all model optimizations

In [2]:
import torch

In [15]:
#torch.rand generates random values from a uniform distribution in the range [0, 1)
#torch.randn generates random values from a standard normal distribution (Gaussian distribution with mean 0 and standard deviation 1)

#requires_grad=True is what makes backward() below possible; with False nothing is tracked and backward() raises an error
x = torch.randn(3, requires_grad=True)
print(x)

y = x + 2
print(y) #y has a grad_fn attribute, here AddBackward0, because y came from an addition; "Backward" because it is used during backpropagation

z = y.mean()
print(z)

#z is a scalar, so backward() needs no argument.
#retain_graph=True keeps the graph alive, because y.backward(v) below walks part of the same graph a second time.
z.backward(retain_graph=True)

print(x.grad) #dz/dx

#y is a vector, so backward() needs a vector of the same shape; the result is the vector-Jacobian product v^T * (dy/dx)
v = torch.tensor([1.0, 2.5, 6.0])
y.backward(v)
#.grad accumulates: this prints dz/dx + v^T * (dy/dx), not the second gradient on its own
print(x.grad)

tensor([-0.8984,  1.3736,  0.3578], requires_grad=True)
tensor([1.1016, 3.3736, 2.3578], grad_fn=<AddBackward0>)
tensor(2.2777, grad_fn=<MeanBackward0>)
tensor([0.3333, 0.3333, 0.3333])
tensor([1.3333, 2.8333, 6.3333])


In [20]:
#Three ways to stop autograd from tracking gradients

#Method 1: switch the flag off in place
#x.requires_grad_(False)
#print(x)

#Method 2: work on a detached tensor
#y = x.detach()
#print(y)

#Method 3: compute inside a no_grad block
with torch.no_grad():
    y = x + 2
print(y)
#the printed y shows neither grad_fn nor requires_grad

tensor([1.1016, 3.3736, 2.3578])


In [25]:
#Training example: what happens to .grad across iterations
weights = torch.ones(4, requires_grad=True)

for epoch in range(3):
    model_output = torch.mean(weights * 3)
    model_output.backward()

    print(weights.grad)


#The printed gradient grows every iteration because each backward() adds to weights.grad.
#weights and model_output are the same in every iteration, so the true gradient is the same too; the growth is pure accumulation.
#The remedy is to zero the gradient after each step:

print("\n")

weights = torch.ones(4, requires_grad=True)
for epoch in range(3):
    model_output = torch.mean(weights * 3)
    model_output.backward()

    print(weights.grad)

    weights.grad.zero_()

tensor([0.7500, 0.7500, 0.7500, 0.7500])
tensor([1.5000, 1.5000, 1.5000, 1.5000])
tensor([2.2500, 2.2500, 2.2500, 2.2500])


tensor([0.7500, 0.7500, 0.7500, 0.7500])
tensor([0.7500, 0.7500, 0.7500, 0.7500])
tensor([0.7500, 0.7500, 0.7500, 0.7500])


In [28]:
#Linear regression example: one sample, one weight
x = torch.tensor(1.0)
y = torch.tensor(2.0)

w = torch.tensor(1.0, requires_grad=True)

#forward pass: prediction, then squared error
y_cap = w * x
loss = (y_cap - y).pow(2)

print(loss)

#backward pass: d(loss)/dw is stored in w.grad
loss.backward()
print(w.grad)

tensor(1., grad_fn=<PowBackward0>)
tensor(-2.)
