# Week 3
# day5: 26 Aug 2022


Loss Functions and Gradient Descent

1. Demonstrate your understanding of MSE and Cross Entropy Loss by implementing them in Numpy
2. Implement equivalent "backward" functions that would compute the derivative of the above loss functions in Numpy
3. Verify the gradients computed by your function with that of PyTorch's autograd

In [1]:
import math
import numpy as np
import torch

In [2]:
def linear_regression(x, m, b):
    yhat = x * m + b
    return yhat


def MSE(y, yhat):
    return np.square(np.subtract(y, yhat)).mean()

In [3]:
def der_MSE_wrt_m(x, y, yhat):
    sum = 0
    for i in range(len(yhat)):
        sum += -2 * x[i] * (y[i] - yhat[i])
    return sum / len(y)


def der_MSE_wrt_b(y, yhat):
    sum = 0
    for i in range(len(yhat)):
        sum += -2 * (y[i] - yhat[i])
    return sum / len(y)

In [4]:
x = np.array([1.0, 3.0, 7.0, 2.0, 5.0, 4.0])
y = np.array([4.0, 9.0, 20.0, 7.0, 15.0, 11.0])
m = 2.8
b = 0.6
yhat = linear_regression(x, m, b)
print("predicted y:", yhat)
print("MSE:", MSE(y, yhat))
print("gradient wrt m:", der_MSE_wrt_m(x, y, yhat))
print("gradient wrt b:", der_MSE_wrt_b(y, yhat))

predicted y: [ 3.4  9.  20.2  6.2 14.6 11.8]
MSE: 0.3066666666666666
gradient wrt m: 0.1333333333333274
gradient wrt b: -0.2666666666666682


In [5]:
x = torch.tensor([1.0, 3.0, 7.0, 2.0, 5.0, 4.0], requires_grad=True)
y = torch.tensor([4.0, 9.0, 20.0, 7.0, 15.0, 11.0])
m = torch.tensor([2.8], requires_grad=True)
b = torch.tensor([0.6], requires_grad=True)
yhat = linear_regression(x, m, b)
print("predicted y:", yhat)
mse = torch.nn.functional.mse_loss(linear_regression(x, m, b), y)
print("MSE:", mse)
mse.backward()
print("gradient wrt m:", m.grad)
print("gradient wrt b:", b.grad)

predicted y: tensor([ 3.4000,  9.0000, 20.2000,  6.2000, 14.6000, 11.8000],
       grad_fn=<AddBackward0>)
MSE: tensor(0.3067, grad_fn=<MseLossBackward0>)
gradient wrt m: tensor([0.1333])
gradient wrt b: tensor([-0.2667])


In [6]:
x = np.array([1.0])
y = np.array([4.0])
m = 3.0
b = 0.6
yhat = linear_regression(x, m, b)
print("predicted y:", yhat)
print("MSE:", MSE(y, yhat))
print("gradient wrt m:", der_MSE_wrt_m(x, y, yhat))
print("gradient wrt b:", der_MSE_wrt_b(y, yhat))

predicted y: [3.6]
MSE: 0.15999999999999992
gradient wrt m: -0.7999999999999998
gradient wrt b: -0.7999999999999998


In [7]:
x = torch.tensor([1.0], requires_grad=True)
y = torch.tensor([4.0])
m = torch.tensor([3.0], requires_grad=True)
b = torch.tensor([0.6], requires_grad=True)
yhat = linear_regression(x, m, b)
print("predicted y:", yhat)
mse = torch.nn.functional.mse_loss(linear_regression(x, m, b), y)
print("MSE:", mse)
mse.backward()
print("gradient wrt m:", m.grad)
print("gradient wrt b:", b.grad)

predicted y: tensor([3.6000], grad_fn=<AddBackward0>)
MSE: tensor(0.1600, grad_fn=<MseLossBackward0>)
gradient wrt m: tensor([-0.8000])
gradient wrt b: tensor([-0.8000])


In [44]:
# sigmoid function
def sigmoid_forward(x):
    return 1 / (1 + np.exp(-x))


def sigmoid_backward(x):
    sigmoid = 1 / (1 + np.exp(-x))
    return sigmoid * (1 - sigmoid)


# cross entropy loss
def BCE(yhat, y):
    return -(y * np.log(yhat) + (1 - y) * np.log(1 - yhat)).mean()


def BCE_wrt_z(y, yhat):
    der_BCE = (yhat - y) / (yhat * (1 - yhat))
    der_sigmoid = sigmoid_backward(yhat)
    return der_BCE * der_sigmoid

In [45]:
z = np.array([-2, 1])
y = np.array([0, 1])
sigmoid_forward_v = np.vectorize(sigmoid_forward)
yhat = sigmoid_forward_v(z)
print("predicted y:", sigmoid_forward_v(z))
loss = BCE(yhat, y)
print("Cross entropy loss:", loss)
print("Grad:", BCE_wrt_z(y, yhat))

predicted y: [0.11920292 0.73105858]
Cross entropy loss: 0.22009484928059767
Grad: [ 0.28282793 -0.30006058]


In [46]:
z = torch.tensor([-2.0, 1.0], requires_grad=True)
y = torch.tensor([0.0, 1.0], requires_grad=False)
yhat = torch.sigmoid(z)
print("predicted y:", yhat)
loss = torch.nn.BCELoss()(yhat, y)
print("Cross entropy loss:", loss)
loss.backward()
print("Grad:", z.grad)


def getBack(var_grad_fn):
    print(var_grad_fn)
    for fn in var_grad_fn.next_functions:
        if fn[0]:
            try:
                tensor = getattr(fn[0], "variable")
                print(fn[0])
                print("Tensor with grad found:", tensor)
                print(" - gradient:", tensor.grad)
                print()
            except AttributeError as e:
                getBack(fn[0])


getBack(loss.grad_fn)

predicted y: tensor([0.1192, 0.7311], grad_fn=<SigmoidBackward0>)
Cross entropy loss: tensor(0.2201, grad_fn=<BinaryCrossEntropyBackward0>)
Grad: tensor([ 0.0596, -0.1345])
<BinaryCrossEntropyBackward0 object at 0x7f131930e9e0>
<SigmoidBackward0 object at 0x7f12e2463250>
<AccumulateGrad object at 0x7f12e2463f40>
Tensor with grad found: tensor([-2.,  1.], requires_grad=True)
 - gradient: tensor([ 0.0596, -0.1345])



In [40]:
z = np.array([-12.1, -5.0, 18.0, 29.0, 17.0])
y = np.array([0, 0, 1, 1, 1])
yhat = sigmoid_forward(z)
print("Predicted y:", sigmoid_forward(z))
loss = BCE(yhat, y)
print("Cross entropy loss:", loss)
print("Grad:", BCE_wrt_z(y, yhat))

Predicted y: [5.55948233e-06 6.69285092e-03 9.99999985e-01 1.00000000e+00
 9.99999959e-01]
Cross entropy loss: 0.0013441929233032343
Grad: [ 0.25000139  0.25168167 -0.19661194 -0.19661193 -0.19661195]


In [41]:
z = torch.tensor([-12.1, -5.0, 18.0, 29.0, 17.0], requires_grad=True)
y = torch.tensor([0.0, 0.0, 1.0, 1.0, 1.0], requires_grad=False)
print("Predicted y:", torch.sigmoid(z))
loss = torch.nn.BCELoss()(torch.sigmoid(z), y)
print("Cross entropy loss:", loss)
loss.backward()
print("Grad", z.grad)

Predicted y: tensor([5.5595e-06, 6.6929e-03, 1.0000e+00, 1.0000e+00, 1.0000e+00],
       grad_fn=<SigmoidBackward0>)
Cross entropy loss: tensor(0.0013, grad_fn=<BinaryCrossEntropyBackward0>)
Grad tensor([1.1119e-06, 1.3386e-03, 0.0000e+00, 0.0000e+00, 0.0000e+00])


# Autograd backward traversal

In [13]:
import torch

x = torch.tensor([2.0, 5.0], requires_grad=True)
y = torch.tensor([4.0, 3.0], requires_grad=True)
z = x * y
print("Product:", z)
external_grad = torch.ones_like(x)
z.backward(gradient=external_grad)
print("Grad wrt x:", x.grad)
print("Grad wrt y:", y.grad)


def getBack(var_grad_fn):
    print(var_grad_fn)
    for fn in var_grad_fn.next_functions:
        if fn[0]:
            try:
                tensor = getattr(fn[0], "variable")
                print(fn[0])
                print("Tensor with grad found:", tensor)
                print(" - gradient:", tensor.grad)
                print()
            except AttributeError as e:
                getBack(fn[0])


getBack(z.grad_fn)

Product: tensor([ 8., 15.], grad_fn=<MulBackward0>)
Grad wrt x: tensor([4., 3.])
Grad wrt y: tensor([2., 5.])
<MulBackward0 object at 0x7f132af36440>
<AccumulateGrad object at 0x7f132aeffca0>
Tensor with grad found: tensor([2., 5.], requires_grad=True)
 - gradient: tensor([4., 3.])

<AccumulateGrad object at 0x7f13283f0460>
Tensor with grad found: tensor([4., 3.], requires_grad=True)
 - gradient: tensor([2., 5.])



why do we pass external gradients as no of ones as that of the size of tensor we backpropagate in backward()?

Since we have to trace back each element of the tensor and dz/dz = 1 .Hence, we pass external gradients as no of ones as that of the size of tensor

In [42]:
# Example of random gradient outout being passed
import torch

x = torch.tensor([2.0, 5.0], requires_grad=True)
y = torch.tensor([4.0, 3.0], requires_grad=True)
z = x * y
print("Product:", z)
external_grad = torch.rand_like(z)
z.backward(gradient=external_grad)
print("Grad wrt x:", x.grad)
print("Grad wrt y:", y.grad)

Product: tensor([ 8., 15.], grad_fn=<MulBackward0>)
Grad wrt x: tensor([1.5435, 0.3765])
Grad wrt y: tensor([0.7717, 0.6275])
