In [1]:
import torch

### Hyper-Parameters

$N$: batch size  
$D$: feature dimension  
$H$: hidden dimension  
$O$: output dimension

### Input tensors:  

Features: $ X \in \mathbb{R}^{N \times D}$  
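Labels: $ Y \in \mathbb{R}^{N \times 1}$ (broadcast across the $O$ output columns wherever $Y$ appears in an element-wise product or quotient below)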

### Parameters to learn:

Hidden layer: weight $W_h \in \mathbb{R}^{D \times H}$, bias $B_h \in \mathbb{R}^{1 \times H}$  
Output layer: weight $W_o \in \mathbb{R}^{H \times O}$, bias $B_o \in \mathbb{R}^{1 \times O}$

### Forward

\begin{align} 
Z_h &= X W_h + \mathbf{1}_N \cdot B_h \\
A_h &= \frac{1}{1 + \exp{(-Z_h)}} \\
Z_o &= A_h W_o + \mathbf{1}_N \cdot B_o \\
A_o &= \frac{\exp(Z_o)}{\exp(Z_o) \cdot \mathbf{1}_O \cdot \mathbf{1}_O^T} \\
\ell &= \frac{ (-Y \odot \log(A_o) \cdot \mathbf{1}_O)^T \cdot \mathbf{1}_N }{N}
\end{align}

### Back propagation

For $\frac{\partial \ell}{\partial A_o}$:
$$
\begin{aligned}
d\ell &= \frac{1}{N} \operatorname{tr} \left( \left(-Y \odot d\left(\log \left( A_o \right) \right) \cdot \mathbf{1}_O \right)^T \cdot \mathbf{1}_N\right) \\
&= \frac{1}{N} \operatorname{tr} \left( \left(-Y \odot (\log'(A_o) \odot dA_o) \cdot \mathbf{1}_O \right)^T \cdot \mathbf{1}_N \right) \\
&= \frac{1}{N} \operatorname{tr} \left( \mathbf{1}_N^T \cdot  \left(-Y \odot (\log'(A_o) \odot dA_o) \cdot \mathbf{1}_O \right) \right) \\
&= \frac{1}{N} \operatorname{tr} \left( \mathbf{1}_O \cdot \mathbf{1}_N^T \cdot  \left(-Y \odot \log'(A_o) \odot dA_o \right) \right) \\
&= \frac{1}{N} \operatorname{tr} \left( (\mathbf{1}_N \cdot \mathbf{1}_O^T )^T \cdot  \left(-Y \odot \log'(A_o) \right)  \odot dA_o \right) \\
&= \frac{1}{N} \operatorname{tr} \left( \left( \left(\mathbf{1}_N \cdot \mathbf{1}_O^T \right)  \odot  \left(-Y \odot \log'(A_o) \right) \right)^T  \cdot dA_o \right) \\
&= \frac{1}{N} \operatorname{tr} \left( \left(  -Y \oslash A_o  \right)^T  \cdot dA_o \right) \\
&= \operatorname{tr} \left( \left( -\frac{1}{N} Y \oslash A_o \right)^T \cdot dA_o \right) = \operatorname{tr} \left( \left( \frac{\partial \ell}{\partial A_o} \right)^T dA_o \right)
\end{aligned}
$$
$$\Rightarrow \frac{\partial \ell}{\partial A_o} = -\frac{1}{N} Y \oslash A_o $$









In [63]:
# Numerical check of the analytic gradient dl/dA_o = -Y / (N * A_o)
# against the gradient computed by torch.autograd.
torch.manual_seed(0)  # fixed seed so Restart & Run All reproduces the same tensors

N = 10  # batch size
D = 2   # feature dimension
H = 5   # hidden dimension
O = 3   # output dimension

x = torch.rand([N, D])
# Labels are constants of the loss, so autograd does not need to track them.
y = torch.rand([N, 1])

# Random output-layer pre-activations. Tracking gradients here is what puts
# a_o and l into the autograd graph, so grad(l, a_o) can be evaluated below.
z_o = torch.rand([N, O])
z_o.requires_grad_()

# Column vectors of ones used to express row/column sums as matrix products.
ones_N = torch.ones([N, 1])
ones_O = torch.ones([O, 1])

# Softmax in matrix form: exp(Z_o) / (exp(Z_o) . 1_O . 1_O^T).
exp_z_o = torch.exp(z_o)
a_o = exp_z_o / torch.matmul(torch.matmul(exp_z_o, ones_O), ones_O.T)

# Loss: batch mean of the row sums of -Y * log(A_o). y has shape [N, 1] and is
# broadcast across the O columns; the result l has shape [1, 1].
l = torch.matmul(torch.matmul(-y * torch.log(a_o), ones_O).T, ones_N) / N

# Analytic gradient from the derivation vs. the autograd reference.
dl_da = -y / a_o / N
dl_da_autograd = torch.autograd.grad(l, a_o)[0]

# Fail loudly on a mismatch instead of relying on eyeballing the printed ratio.
assert torch.allclose(dl_da, dl_da_autograd), "analytic dl/dA_o disagrees with autograd"
print(dl_da / dl_da_autograd)




tensor([[1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000]], grad_fn=<DivBackward0>)
