![document.png](attachment:document.png)

# Definitions

$ W_{m \times n} =
\begin{bmatrix}
w_{11} & \cdots & w_{1n} \\
\vdots & \ddots & \vdots \\
w_{m1} & \cdots & w_{mn}
\end{bmatrix}$ weight matrix

$I_{1 \times m} = [i_1 \cdots i_m]$ input vector

$O_{1 \times n} = [o_1 \cdots o_n]$ output vector, where $o_\alpha = f\Big(\overbrace{w_{1\alpha}i_1+ \cdots + w_{m\alpha}i_m}^{a_\alpha}\Big)$

$T_{1 \times n} = [t_1 \cdots t_n]$ target vector

$$l = \frac{\epsilon(o_1, t_1) + \cdots + \epsilon(o_n, t_n)}{n} = \frac{1}{n} \Big(\epsilon\big(\overbrace{f(w_{11}i_1+ \cdots + w_{m1}i_m)}^{o_1}, t_1\big) + \cdots + \epsilon\big(\overbrace{f(w_{1n}i_1+ \cdots + w_{mn}i_m)}^{o_n}, t_n\big)\Big)$$


# Backward pass
$$ \frac{\partial l}{\partial w_{\beta \alpha}} = \frac{1}{n} \frac{\partial\epsilon(o_\alpha, t_\alpha)}{\partial o_\alpha} \frac{\partial f(a_\alpha)}{\partial a_\alpha} i_\beta $$

Here $\beta \in \{1, \dots, m\}$ indexes the input (row of $W$) and $\alpha \in \{1, \dots, n\}$ indexes the output (column of $W$). Only $o_\alpha$ depends on $w_{\beta \alpha}$, so a single term of the sum in $l$ survives, and $\frac{\partial a_\alpha}{\partial w_{\beta \alpha}} = i_\beta$.

# Example:
$$\epsilon(o_\alpha, t_\alpha) = (o_\alpha - t_\alpha)^2 \therefore \frac{\partial\epsilon(o_\alpha, t_\alpha)}{\partial o_\alpha} = 2(o_\alpha - t_\alpha)$$

$$f(a_\alpha) = \tanh(a_\alpha) \therefore \frac{\partial f(a_\alpha)}{\partial a_\alpha}=1 - \tanh^2(a_\alpha)$$

***

$$ \frac{\partial l}{\partial w_{\beta \alpha}} = \Big(\frac{2}{n}\Big)(o_\alpha - t_\alpha) \big(1 - \tanh^2(a_\alpha)\big) i_\beta $$

# Imports & definitions

In [1]:
import torch
import numpy as np

# Seed NumPy's global RNG so the random input, target and weight arrays drawn
# below are reproducible from run to run.
np.random.seed(0)
# Make float64 the default floating-point dtype for new torch tensors and
# module parameters, matching the float64 arrays NumPy produces.
# torch.set_default_tensor_type() is deprecated since PyTorch 2.1;
# torch.set_default_dtype() is the supported replacement.
torch.set_default_dtype(torch.float64)
m = 10    # input
n = 5     # output

# NumPy

In [2]:
# Random problem instance. The draw order (I, then T, then W) is kept so the
# seeded values are unchanged.
I = np.random.random_sample(m)         # input vector, shape (m,)
T = np.random.random_sample(n)         # target vector, shape (n,)
W = np.random.random_sample((m, n))    # weight matrix, shape (m, n)

# Forward pass: pre-activations, tanh outputs, mean squared error.
A = np.matmul(I, W)
O = np.tanh(A)
l = np.average(np.square(O - T))

# Backward pass, closed form: dl[b, a] = i_b * (2/n) * (o_a - t_a) * (1 - o_a^2).
# The (m, 1) @ (1, n) product is written as an outer product of the input
# vector with the per-output factor.
dl = np.outer(I, (2 / n) * (O - T) * (1 - np.square(O)))

# PyTorch

In [3]:
# Repeat the forward pass on torch tensors built from the NumPy arrays and let
# autograd compute the gradient of the loss with respect to the weights.
I_t = torch.tensor(I)
T_t = torch.tensor(T)
# Only the weights need a gradient; backward() below populates W_t.grad.
W_t = torch.tensor(W, requires_grad=True)
A_t = I_t @ W_t
O_t = torch.tanh(A_t)
l_t = (O_t-T_t).pow(2).mean()
l_t.backward()
# Use .detach() rather than the legacy .data attribute: both give a tensor that
# is cut off from the graph, but .detach() keeps autograd's in-place
# modification checks intact.
dl_t = W_t.grad.detach()

In [4]:
# Compare the closed-form NumPy gradient (dl) with the autograd gradient (dl_t)
# element-wise, within np.allclose's default tolerances.
print(np.allclose(dl, dl_t.numpy()))

True


# PyTorch `torch.nn.Linear`

In [5]:
# Same computation once more, this time through a bias-free torch.nn.Linear
# layer and torch.nn.MSELoss.
linear = torch.nn.Linear(m, n, bias=False)
with torch.no_grad():
    # Load the same weights as the previous cells. The transpose is needed
    # because nn.Linear keeps its weight as (out_features, in_features),
    # i.e. (n, m), whereas W is (m, n). .detach() replaces the legacy .data.
    linear.weight.copy_(W_t.detach().t())
O_l = torch.tanh(linear(I_t))
criterion = torch.nn.MSELoss()
l_l = criterion(O_l, T_t)
# Clear any existing gradient on the layer before back-propagating.
linear.zero_grad()
l_l.backward()
# Gradient w.r.t. the layer weight, laid out like the weight itself, so it is
# transposed again when compared against dl. .detach() replaces the legacy .data.
dl_l = linear.weight.grad.detach()

In [6]:
# Compare the closed-form NumPy gradient with the nn.Linear weight gradient;
# the latter is transposed to line up with dl.
print(np.allclose(dl, dl_l.numpy().T))

True


In [7]:
# Check that the nn.Linear forward output matches the NumPy forward output.
# detach() is required before .numpy() because O_l is part of the autograd graph.
print(np.allclose(O, O_l.detach().numpy()))

True


In [25]:
# Show the NumPy forward-pass output vector for visual comparison.
print(O)

[0.99248036 0.99244429 0.99889554 0.99232696 0.97565233]


In [26]:
# Show the nn.Linear forward-pass output as a NumPy array for visual comparison.
print(O_l.detach().numpy())

[0.99248036 0.99244429 0.99889554 0.99232696 0.97565233]
