### Inconsistencies between derivatives obtained by manual calculation and by automatic differentiation — please help me find the error.

In [54]:
import numpy as np
import torch
import torch.nn as nn

***
This example considers a NN with one hidden layer and a sigmoid activation. The NN has an $m \times d$ matrix $U$ of parameters for the hidden layer and a $1 \times m$ vector $\vec{W}$ of parameters for the output layer.
***

In [55]:
class SingleLayerNet(nn.Module):
    """One-hidden-layer network: bias-free linear -> sigmoid -> bias-free linear.

    The read-out is squeezed and divided by sqrt(m), where m is the hidden width.
    """

    def __init__(self, m=10, d=3):
        """Build both layers and draw their weights from a standard Gaussian.

        m: number of hidden nodes
        d: number of dimensions of x
        """
        super().__init__()
        self.m = m

        # Hidden layer (d -> m), no bias, standard-normal weights.
        self.linear1 = nn.Linear(d, m, bias=False)
        nn.init.normal_(self.linear1.weight)

        # Output layer (m -> 1), no bias, standard-normal weights.
        self.linear2 = nn.Linear(m, 1, bias=False)
        nn.init.normal_(self.linear2.weight)

    def forward(self, x):
        """Return the squeezed network output for x, scaled by 1/sqrt(m)."""
        hidden = torch.sigmoid(self.linear1(x))
        readout = self.linear2(hidden).squeeze()
        return readout / np.sqrt(self.m)


***
The NN and an array of input data are initialised below.
***

In [56]:
# NN initialisation
# Seed both RNGs so that Restart Kernel -> Run All reproduces the same weights
# and the same input sample.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

width = 10      # m: number of hidden nodes
input_dim = 3   # d: number of input dimensions, shared by the network and the input
testNet = SingleLayerNet(width, input_dim)

# NumPy copies of the parameters, detached from autograd, for the manual calculation.
weights1 = testNet.linear1.weight.detach().numpy()  # U, shape (width, input_dim)
weights2 = testNet.linear2.weight.detach().numpy()  # W, shape (1, width)

# Input data: a single sample stored as a column vector of shape (input_dim, 1).
inp = np.random.normal(size=(input_dim, 1)).astype(np.float32)

***
**The derivatives of the network output with respect to the coefficients $\vec{W}$ and $U$ are as follows:**<br>
***
&emsp;This NN is equivalent to the following expression:
$$f(\vec{W},U) = \frac{1}{\sqrt{m}} \sum\limits_{k=1}^{m} \frac{w_k}{1+\exp\left(-\sum\limits_{s=1}^{d} u_{k,s} \cdot x_s\right)}$$
&emsp;Derivative with respect to the coefficient $w_k$:
$$\frac{\partial f}{\partial w_k} = \frac{1}{\sqrt{m}} \cdot \frac{1}{1+\exp\left(-\sum\limits_{s=1}^{d} u_{k,s} \cdot x_s\right)}$$
&emsp;Derivative with respect to the coefficient $u_{k,N}$:
$$\frac{\partial f}{\partial u_{k,N}} = \frac{1}{\sqrt{m}} \cdot \frac{x_N \cdot w_k \cdot \exp\left(-\sum\limits_{s=1}^{d} u_{k,s} \cdot x_s\right)}{\left(1+\exp\left(-\sum\limits_{s=1}^{d} u_{k,s} \cdot x_s\right)\right)^2}$$

***
$\mathbf{\text{Calculating derivatives using symbolic formulas:}}$<br>
***

In [64]:
# Hidden-layer pre-activations z_k = sum_s u_{k,s} * x_s, one per hidden node.
pre_act = np.matmul(weights1, inp).squeeze()
exp_neg = np.exp(-pre_act)

# Scaling factor 1/sqrt(m). m is read from the weight matrix instead of being
# hard-coded as 10, so the formulas stay correct if the hidden width changes.
scale = 1 / np.sqrt(weights1.shape[0])

# df/dw_k = (1/sqrt(m)) * 1 / (1 + exp(-z_k))
dw_calc = scale / (1 + exp_neg)

# df/du_{k,N} = (1/sqrt(m)) * w_k * x_N * exp(-z_k) / (1 + exp(-z_k))^2
dsig_scaled = weights2 * scale * exp_neg / ((1 + exp_neg) * (1 + exp_neg))
# Outer product with x, transposed so the rows follow the hidden nodes like U.
du_calc = np.transpose(np.matmul(inp, dsig_scaled))


***
$\mathbf{\text{Calculating derivatives using automatic differentiation:}}$<br>
***

In [62]:
# The linear layer takes samples as rows, so feed the column vector transposed.
xx = torch.tensor(inp.transpose(), dtype=torch.float32)

# Call the module itself rather than .forward() so that registered hooks are honoured.
out = testNet(xx)

# Only first-order gradients are needed, so create_graph/retain_graph are dropped.
# Without allow_unused, a parameter that does not contribute to `out` raises an
# error here instead of silently yielding None.
df = torch.autograd.grad(out, (testNet.linear1.weight, testNet.linear2.weight))
du_ad = df[0].detach().numpy()            # gradient w.r.t. U
dw_ad = df[1].detach().numpy().squeeze()  # gradient w.r.t. W, flattened


***
$\mathbf{\text{Comparison of the results of manual calculation and automatic differentiation:}}$<br>
***

In [63]:
# Show the autograd result and the manual result for dU one after the other.
for label, grad_u in (
    ("Matrix of derivatives with respect to coefficients U by automatic differentiation:\n", du_ad),
    ("Matrix of derivatives with respect to coefficients U by manual calculation:\n", du_calc),
):
    print(label, grad_u)

Matrix of derivatives with respect to coefficients U by automatic differentiation:
 [[-0.06903538 -0.03336043  0.01319817]
 [ 0.0153444   0.00741498 -0.00293354]
 [ 0.04801453  0.02320239 -0.00917941]
 [ 0.00712414  0.00344265 -0.00136199]
 [-0.06112475 -0.02953772  0.01168582]
 [ 0.03676222  0.01776486 -0.0070282 ]
 [-0.08028518 -0.03879675  0.01534891]
 [-0.00929642 -0.00449237  0.00177729]
 [ 0.03989149  0.01927703 -0.00762645]
 [-0.05342617 -0.02581749  0.01021401]]
Matrix of derivatives with respect to coefficients U by manual calculation:
 [[-0.06903539 -0.03336044  0.01319818]
 [ 0.0153444   0.00741498 -0.00293354]
 [ 0.04801453  0.02320239 -0.00917941]
 [ 0.00712414  0.00344265 -0.00136199]
 [-0.06112474 -0.02953772  0.01168582]
 [ 0.03676222  0.01776486 -0.0070282 ]
 [-0.08028519 -0.03879676  0.01534891]
 [-0.00929642 -0.00449237  0.00177729]
 [ 0.03989148  0.01927703 -0.00762645]
 [-0.05342617 -0.02581749  0.01021401]]


In [60]:
# Show the autograd result and the manual result for dW one after the other.
for label, grad_w in (
    ("Matrix of derivatives with respect to coefficients W by automatic differentiation:\n", dw_ad),
    ("Matrix of derivatives with respect to coefficients W by manual calculation:\n", dw_calc),
):
    print(label, grad_w)

Matrix of derivatives with respect to coefficients W by automatic differentiation:
 [0.24249876 0.2841476  0.08382694 0.00687479 0.06238763 0.20009805
 0.14201619 0.00755101 0.141299   0.15766416]
Matrix of derivatives with respect to coefficients W by manual calculation:
 [0.24249874 0.2841476  0.08382694 0.00687479 0.06238762 0.20009805
 0.14201619 0.00755101 0.14129898 0.15766415]


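The derivatives with respect to the W coefficients are the same, but those with respect to the U coefficients are not. I think I got the symbolic expressions for the derivatives right; maybe the error is in the "Calculating derivatives using symbolic formulas" cell, although I have checked it several times. Or perhaps I am using torch.autograd.grad incorrectly. Anyway, any help would be appreciated.

**Note:** in the outputs recorded in this notebook, both gradients agree to within float32 rounding (the largest entry of `du_ad - du_calc` is about 7.5e-09), so the mismatch in U does not reproduce with the code as it currently stands.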

In [65]:
# Element-wise gap between the autograd and the manual gradient w.r.t. U.
np.subtract(du_ad, du_calc)

array([[ 7.4505806e-09,  3.7252903e-09, -1.8626451e-09],
       [-2.7939677e-09, -9.3132257e-10,  6.9849193e-10],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
       [-7.4505806e-09, -1.8626451e-09,  9.3132257e-10],
       [ 3.7252903e-09,  3.7252903e-09, -9.3132257e-10],
       [ 7.4505806e-09,  3.7252903e-09, -1.8626451e-09],
       [-9.3132257e-10, -4.6566129e-10,  1.1641532e-10],
       [ 3.7252903e-09,  3.7252903e-09, -9.3132257e-10],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00]], dtype=float32)