# Week 3
# day5: 26 Aug 2022


Loss Functions and Gradient Descent

1. Demonstrate your understanding of MSE and Cross Entropy Loss by implementing them in Numpy
2. Implement equivalent "backward" functions that would compute the derivative of the above loss functions in Numpy
3. Verify the gradients computed by your function with that of PyTorch's autograd

In [1]:
import math
import numpy as np
import torch

## Mean Square Error

In [2]:
#linear regression
def linear_regression(x, m, b):
    yhat = x * m + b
    return yhat

# Mean Square Error (MSE)
def MSE(y, yhat):
    return np.square(np.subtract(y, yhat)).mean()

In [3]:
# Derivative of MSE

def der_MSE_wrt_m(x, y, yhat):
    sum = 0
    for i in range(len(yhat)):
        sum += -2 * x[i] * (y[i] - yhat[i])
    return sum / len(y)


def der_MSE_wrt_b(y, yhat):
    sum = 0
    for i in range(len(yhat)):
        sum += -2 * (y[i] - yhat[i])
    return sum / len(y)

In [4]:
# implementation of Mean Square Error using numpy
x = np.array([1.0, 3.0, 7.0, 2.0, 5.0, 4.0])
y = np.array([4.0, 9.0, 20.0, 7.0, 15.0, 11.0])
m = 2.8
b = 0.6
yhat = linear_regression(x, m, b)
print("predicted y:", yhat)

predicted y: [ 3.4  9.  20.2  6.2 14.6 11.8]


In [5]:
# Computing loss and back propagation
print("MSE:", MSE(y, yhat))
print("gradient wrt m:", der_MSE_wrt_m(x, y, yhat))
print("gradient wrt b:", der_MSE_wrt_b(y, yhat))

MSE: 0.3066666666666666
gradient wrt m: 0.1333333333333274
gradient wrt b: -0.2666666666666682


In [6]:
# verification of MSE with torch and backpropagation using autograd
x = torch.tensor([1.0, 3.0, 7.0, 2.0, 5.0, 4.0], requires_grad=True)
y = torch.tensor([4.0, 9.0, 20.0, 7.0, 15.0, 11.0])
m = torch.tensor([2.8], requires_grad=True)
b = torch.tensor([0.6], requires_grad=True)
yhat = linear_regression(x, m, b)
print("predicted y:", yhat)

predicted y: tensor([ 3.4000,  9.0000, 20.2000,  6.2000, 14.6000, 11.8000],
       grad_fn=<AddBackward0>)


In [7]:
# Computing loss and back propagation using autograd
mse = torch.nn.functional.mse_loss(linear_regression(x, m, b), y)
print("MSE:", mse)
mse.backward()
print("gradient wrt m:", m.grad)
print("gradient wrt b:", b.grad)

MSE: tensor(0.3067, grad_fn=<MseLossBackward0>)
gradient wrt m: tensor([0.1333])
gradient wrt b: tensor([-0.2667])


## Cross Entropy Loss

In [29]:
# sigmoid function
def sigmoid_forward(x):
    return 1 / (1 + np.exp(-x))

# derivative of sigmoid
def sigmoid_backward(x):
    sigmoid = 1 / (1 + np.exp(-x))
    return sigmoid * (1 - sigmoid)


# Binary cross entropy loss (BCE)
def BCE(yhat, y):
    return -(y * np.log(yhat) + (1 - y) * np.log(1 - yhat)).mean()

# derivative of BCE #dow(BCE)/dow(z)
def der_BCE_wrt_z(y, yhat):
    der_BCE = -(y/yhat -(1-y)/(1-yhat)) #dow(BCE)/dow(yhat)
    der_sigmoid = sigmoid_backward(yhat) #dow(yhat)/dow(z)
    return  der_sigmoid *der_BCE

In [30]:
# implementation of Sigmoid fn
z = np.array([-12.1, -5.0, 18.0, 29.0, 17.0])
y = np.array([0, 0, 1, 1, 1])
yhat = sigmoid_forward(z)
print("Predicted y:", sigmoid_forward(z))

Predicted y: [5.55948233e-06 6.69285092e-03 9.99999985e-01 1.00000000e+00
 9.99999959e-01]


In [31]:
# computation of loss and backpropagation
loss = BCE(yhat, y)
print("Cross entropy loss:", loss)
print("Grad:", der_BCE_wrt_z(y, yhat))

Cross entropy loss: 0.0013441929233032343
Grad: [ 0.25000139  0.25168167 -0.19661194 -0.19661193 -0.19661195]


In [21]:
# verification of BCE with torch and backpropagation using autograd
z = torch.tensor([-12.1, -5.0, 18.0, 29.0, 17.0], requires_grad=True)
y = torch.tensor([0.0, 0.0, 1.0, 1.0, 1.0], requires_grad=False)
print("Predicted y:", torch.sigmoid(z))

Predicted y: tensor([5.5595e-06, 6.6929e-03, 1.0000e+00, 1.0000e+00, 1.0000e+00],
       grad_fn=<SigmoidBackward0>)


In [22]:
# computation of loss and backpropagation using autograd
loss = torch.nn.BCELoss()(torch.sigmoid(z), y)
print("Cross entropy loss:", loss)
loss.backward()
print("Grad", z.grad)

Cross entropy loss: tensor(0.0013, grad_fn=<BinaryCrossEntropyBackward0>)
Grad tensor([1.1119e-06, 1.3386e-03, 0.0000e+00, 0.0000e+00, 0.0000e+00])


# Autograd backward traversal

In [13]:
import torch

x = torch.tensor([2.0, 5.0], requires_grad=True)
y = torch.tensor([4.0, 3.0], requires_grad=True)
z = x * y
print("Product:", z)
external_grad = torch.ones_like(x)
z.backward(gradient=external_grad)
print("Grad wrt x:", x.grad)
print("Grad wrt y:", y.grad)


def getBack(var_grad_fn):
    print(var_grad_fn)
    for fn in var_grad_fn.next_functions:
        if fn[0]:
            try:
                tensor = getattr(fn[0], "variable")
                print(fn[0])
                print("Tensor with grad found:", tensor)
                print(" - gradient:", tensor.grad)
                print()
            except AttributeError as e:
                getBack(fn[0])


getBack(z.grad_fn)

Product: tensor([ 8., 15.], grad_fn=<MulBackward0>)
Grad wrt x: tensor([4., 3.])
Grad wrt y: tensor([2., 5.])
<MulBackward0 object at 0x7fdee1612da0>
<AccumulateGrad object at 0x7fdee1612b60>
Tensor with grad found: tensor([2., 5.], requires_grad=True)
 - gradient: tensor([4., 3.])

<AccumulateGrad object at 0x7fdee1613070>
Tensor with grad found: tensor([4., 3.], requires_grad=True)
 - gradient: tensor([2., 5.])



why do we pass external gradients as no of ones as that of the size of tensor we backpropagate in backward()?

Since we have to trace back each element of the tensor and dz/dz = 1 it can be replaced by weights.Hence, we pass external gradients as no of ones as that of the size of tensor

In [14]:
# Example of random gradient outout being passed
import torch

x = torch.tensor([2.0, 5.0], requires_grad=True)
y = torch.tensor([4.0, 3.0], requires_grad=True)
z = x * y
print("Product:", z)
external_grad = torch.rand_like(z)
z.backward(gradient=external_grad)
print("Grad wrt x:", x.grad)
print("Grad wrt y:", y.grad)

Product: tensor([ 8., 15.], grad_fn=<MulBackward0>)
Grad wrt x: tensor([1.9012, 0.0511])
Grad wrt y: tensor([0.9506, 0.0852])
