### Define an input vector with one large number

In [1]:
import numpy as np

X = np.array([1, 20, 3, 4])

### Softmax function

In [2]:
def single_param_sm(z_i, X):
    return np.exp(z_i) / np.sum(np.exp(X))

In [3]:
def sm(X):
    out = []
    for i, z_i in enumerate(X):
        out.append(single_param_sm(z_i, X))
    return np.array(out)

In [4]:
out = sm(X)
out

array([5.60279554e-09, 9.99999840e-01, 4.13993706e-08, 1.12535157e-07])

### Gradient of the Softmax function

In [5]:
def sm_grad_wrt_zj(X, j): # we take the gradient with respect to the parameter at index j
    grad = []
    for i, z_i in enumerate(X):
        if i == j:
            grad.append(single_param_sm(z_i, X) * (1 - single_param_sm(z_i, X)))
        else:
            grad.append(- single_param_sm(z_i, X) * single_param_sm(X[j], X))
    return grad

In [6]:
sm_grad_wrt_zj(X,0) # problem: the gradient is really small -> the network learns really slowly..

[5.602795512290805e-09,
 -5.602794649827122e-09,
 -2.3195220901427037e-16,
 -6.305114748344254e-16]

### Fix: scale the vector/matrix by $d_k$

Here $d_k = 4$, so we multiply X by $\frac{1}{\sqrt{4}} = \frac{1}{2}$.

In [7]:
def sm_grad_wrt_zj(X, j): # we take the gradient with respect to the parameter at index j
    
    scaled_X = X / np.sqrt(X.shape[0])
    
    grad = []
    for i, z_i in enumerate(X):
        if i == j:
            grad.append(single_param_sm(z_i, scaled_X) * (1 - single_param_sm(z_i, scaled_X)))
        else:
            grad.append(- single_param_sm(z_i, scaled_X) * single_param_sm(X[j], scaled_X))
    return grad

In [8]:
sm_grad_wrt_zj(X,0) # much better!

[0.0001233188924307067,
 -2.714948028707061,
 -1.1239715748585796e-07,
 -3.0552715076425723e-07]