In [1]:
import torch

# Draw three samples from a standard normal distribution. No requires_grad
# flag is passed, so the printed tensor shows plain values only.
x = torch.randn((3,))
print(x)

tensor([ 1.8111, -1.0581, -1.0703])


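Setting `requires_grad=True` tells PyTorch to track every operation on the tensor, building a computational graph that can later be used to compute gradients.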

In [2]:
# Same kind of random draw as before, but flagged for autograd: the repr
# of the tensor now includes requires_grad=True.
x = torch.randn(3, requires_grad=True)
x

tensor([ 0.8675, -1.1722, -0.0580], requires_grad=True)

In [3]:
# Any operation on a tracked tensor yields a tensor that is tracked too;
# y remembers that it came from an addition.
y = x + 2

In [4]:
# Printing a tensor derived from x shows its grad_fn, i.e. the autograd node
# (here an addition) that produced it.
print(y)

tensor([2.8675, 0.8278, 1.9420], grad_fn=<AddBackward0>)


In [5]:
# Element-wise product of y with itself; the resulting grad_fn is the
# multiplication node recorded by autograd.
z = y * y
z.grad_fn

<MulBackward0 at 0x781fe960dde0>

In [9]:
# Reduce z to a single scalar so that backward() can be called on it
# without supplying an explicit gradient argument.
loss = torch.mean(z)

In [10]:
# Backpropagate from the scalar loss. Because loss = mean((x + 2) ** 2),
# this fills x.grad with d(loss)/dx = 2 * (x + 2) / 3 for each element.
loss.backward()  # d(loss)/dx (loss is the mean of z, so this is not dz/dx)

In [12]:
# x.grad holds the gradient of loss with respect to x, populated by the
# loss.backward() call in the previous cell.
print(x.grad)

tensor([1.9117, 0.5518, 1.2946])


`loss.backward()` is a key method used in training machine learning models. It computes the gradients of the loss with respect to every leaf tensor in the computation graph that has `requires_grad=True`. These gradients are stored in (more precisely, accumulated into) the `.grad` attribute of the respective tensors.

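Backward Pass:

For a mean-squared-error loss over $n$ elements, $\text{loss} = \frac{1}{n}\sum_{i=1}^{n}\bigl(x_i - y_{\text{true},i}\bigr)^2$, calling `loss.backward()` makes PyTorch compute

$$\frac{\partial\,\text{loss}}{\partial x_i} = \frac{2}{n}\,\bigl(x_i - y_{\text{true},i}\bigr)$$

Gradient Output: The `.grad` attribute of `x` contains this derivative:

$$x.\text{grad} = \frac{2}{n}\,\bigl(x - y_{\text{true}}\bigr)$$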

In [19]:
# Mean-squared error between a prediction and its target. Only y_pred is
# tracked, so only y_pred receives a gradient: 2 * (y_pred - y_true) / n.
y_true = torch.tensor([1.0, 2.1, 3.0])
y_pred = torch.tensor([1.30, 2.79, 4.02], requires_grad=True)
MSE = (y_pred - y_true).pow(2).mean()
MSE.backward()
print(y_pred.grad)

tensor([0.2000, 0.4600, 0.6800])


In [21]:
# SR is a vector, so backward() needs an explicit upstream gradient: v is
# the weight applied to each element, contributing
# v[i] * 2 * (y_pred[i] - y_true[i]) to y_pred.grad[i].
# NOTE: y_pred.grad is not cleared first, so the printed values are these
# new gradients added on top of the ones left by the MSE cell above.
v = torch.tensor([1, 10, 100], dtype=torch.float32)
SR = (y_pred - y_true).pow(2)
SR.backward(gradient=v)

print(y_pred.grad)

tensor([  0.8000,  14.2600, 204.6800])


In [34]:
# y = x ** 2 is a vector, so backward() is given one upstream weight per
# element. With all weights equal to 1 the result is simply dy/dx = 2 * x.
x = torch.tensor([2.0, 3.0, 4.0], requires_grad=True)
y = x.pow(2)
equal_gradients = torch.ones(3, dtype=torch.float32)
y.backward(gradient=equal_gradients)
print("Equal gradients", x.grad)


Equal gradients tensor([4., 6., 8.])


In [35]:
# Same computation on a fresh x, but with unequal upstream weights: each
# element of x.grad becomes gradients[i] * 2 * x[i].
x = torch.tensor([2.0, 3.0, 4.0], requires_grad=True)
y = x.pow(2)
gradients = torch.tensor([1.0, 0.1, 0.01], dtype=torch.float32)

y.backward(gradient=gradients)
print("Weighted Gradients", x.grad)

Weighted Gradients tensor([4.0000, 0.6000, 0.0800])


In [37]:
# Turn gradient tracking off for x in place (the trailing underscore marks
# an in-place method); the printed tensor no longer shows requires_grad=True.
x.requires_grad_(False)
print(x)

tensor([2., 3., 4.])


In [39]:
# detach() returns a new tensor with the same values that is cut off from
# the autograd graph: t still requires grad, t0 does not.
t = torch.tensor([1, 2, 3], dtype=torch.float32, requires_grad=True)
print(t)
t0 = t.detach()
print(t0)

tensor([1., 2., 3.], requires_grad=True)
tensor([1., 2., 3.])


The `with torch.no_grad():` context manager in PyTorch temporarily disables gradient tracking. This is particularly useful for operations where you don't want to compute gradients, such as during inference, evaluation, or updating parameters manually without affecting the computation graph.

In [42]:
# Outside no_grad the addition is recorded, so the result carries a grad_fn.
y = t + 2
print(y)

# Inside no_grad the same addition is not recorded: the result is a plain
# tensor with no grad_fn.
with torch.no_grad():
    y = t + 2
    print(y)

tensor([3., 4., 5.], grad_fn=<AddBackward0>)
tensor([3., 4., 5.])


In [45]:
# Each backward() call adds d(model_output)/d(weights) = 3 to weights.grad.
# Nothing clears .grad between passes, so the printed values grow 3, 6, 9.
weights = torch.ones(4, requires_grad=True)

for epoch in range(3):
    model_output = torch.sum(weights * 3)
    model_output.backward()
    print(weights.grad)

tensor([3., 3., 3., 3.])
tensor([6., 6., 6., 6.])
tensor([9., 9., 9., 9.])


How Accumulation Happens

    During Backward Pass:
        When you call loss.backward(), PyTorch computes the gradients for each parameter with respect to the loss and adds these gradients to the existing values in .grad.

    Addition Instead of Replacement:
        PyTorch does not overwrite the .grad attribute during backpropagation. Instead, it adds the new gradients to the existing ones.

In [48]:
# Same loop as before, but the gradient buffer is cleared after every pass,
# so each iteration prints the gradient of that pass alone (always 3).
weights = torch.ones(4, requires_grad=True)

for epoch in range(3):
    model_output = torch.sum(weights * 3)
    model_output.backward()
    print(weights.grad)
    # Zero the accumulated gradient in place for the next iteration. This
    # resets weights.grad only; the weights themselves are left unchanged.
    weights.grad.zero_()

tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
