In [None]:
import torch 

In [16]:
torch.manual_seed(0)

<torch._C.Generator at 0x1c93dd43290>

In [17]:
#make helper functions
def sigmoid_activation(x):
    return 1/(1+torch.exp(-x))

def sigmoid_derivative(a): #used when weight update happens;
    return a * (1-a)

def softmax(z):
    e = torch.exp(z - z.max(dim=1, keepdim=True)[0])
    return e/ e.sum(dim=1, keepdim=True)

def cross_entropy(y_hat, y_onehot):
    return -(y_onehot * torch.log(y_hat + 1e-12)).sum(dim=1).mean()

In [18]:
#sample dataset
x = torch.randn(1, 2) 
y_idx = torch.tensor([1]) #class label(0,1)
y_onehot = torch.zeros(1, 2); #row vector made of zeroes
y_onehot[0, y_idx] = 1.0 #turns the above made vector into one-hot
print(y_onehot)

tensor([[0., 1.]])


In [19]:
#parameters
w1 = torch.randn(2, 2, requires_grad=True)  #2 inputs, 2 neurons in hidden layer
b1 = torch.zeros(1, 2, requires_grad=True) #bias for hidden layer
w2 = torch.randn(2, 2, requires_grad=True)  #2 neurons in hidden layer, 2 output classes
b2 = torch.zeros(1, 2, requires_grad=True) #bias for output lay

In [20]:
#manual forward propagation
z1 = x@w1 + b1 #input to hidden layer
a1 = sigmoid_activation(z1) #sigmoid activation for hidden layer
z2 = a1@w2 + b2 #input to output layer
a2 = softmax(z2) #softmax activation for output layer
cross_entropy_loss = cross_entropy(a2, y_onehot) #cross-entropy loss

In [21]:
#manual backpropagation
delta2 = (a2 - y_onehot) #we are getting delta of output layer 
dw2_manual = delta2.t() @ a1 #weight gradient for hidden to output layer
db2 = delta2.sum(dim=0) #bias gradient for b2

delta1 = (delta2 @ w2.t()) * sigmoid_derivative(a1) #we are getting delta of hidden layer
dw1_manual = x.t() @ delta1 #weight gradient for input to hidden layer
db1 = delta1.sum(dim=0) #bias gradient for input to hidden layer

In [22]:
#for comparison, let's use autograd
for p in [w1, b1, w2, b2]:
    if p.grad is not None:
        p.grad.zero_() #zero the gradients

In [23]:
#forward with autograd
z1_g = x@w1 + b1
a1_g = torch.sigmoid(z1_g)
z2_g = a1_g@w2 + b2
a2_g = torch.nn.functional.softmax(z2_g, dim=1)
loss_g = -(y_onehot * torch.log(a2_g + 1e-12)).sum(dim=1).mean()

In [24]:
#backward with autograd
loss_g.backward(retain_graph=True)

In [25]:
for p in [w1, b1, w2, b2]:
    print(p.shape, p.grad is None)


torch.Size([2, 2]) False
torch.Size([1, 2]) False
torch.Size([2, 2]) False
torch.Size([1, 2]) False


In [26]:
#read autograd gradients
dw2_autograd = w2.grad.clone()
db2_autograd = b2.grad.clone().squeeze()
dw1_autograd = w1.grad.clone()
db1_autograd = b1.grad.clone().squeeze()

In [27]:
#compare
print("Loss Manual:", cross_entropy_loss.item(), "Loss Autograd:", loss_g.item())
print("dw2 manual:\n", dw2_manual, "\ndw2 autograd:\n", dw2_autograd)
print("db2 manual:\n", db2, "\ndb2 autograd:\n", db2_autograd)
print("dw1 manual:\n", dw1_manual, "\ndw1 autograd:\n", dw1_autograd)
print("db1 manual:\n", db1, "\ndb1 autograd:\n", db1_autograd)

Loss Manual: 0.568365216255188 Loss Autograd: 0.568365216255188
dw2 manual:
 tensor([[ 0.0198,  0.3397],
        [-0.0198, -0.3397]], grad_fn=<MmBackward0>) 
dw2 autograd:
 tensor([[ 0.0198, -0.0198],
        [ 0.3397, -0.3397]])
db2 manual:
 tensor([ 0.4335, -0.4335], grad_fn=<SumBackward1>) 
db2 autograd:
 tensor([ 0.4335, -0.4335])
dw1 manual:
 tensor([[-0.0127, -0.0358],
        [ 0.0024,  0.0068]], grad_fn=<MmBackward0>) 
dw1 autograd:
 tensor([[-0.0127, -0.0358],
        [ 0.0024,  0.0068]])
db1 manual:
 tensor([-0.0082, -0.0232], grad_fn=<SumBackward1>) 
db1 autograd:
 tensor([-0.0082, -0.0232])
