## $softmax(z_i) = \frac {e^{z_i}} {\sum _c{e ^ {z_c}}}$


In [17]:
# coding=utf-8
import math
def softmax(zi, X):
    return math.exp(zi) / sum([math.exp(zc) for zc in X])

def Softmax(X, normalize=True):
    M = max(X) if normalize else 0
    total = sum([math.exp(zc - M) for zc in X])
    rst = []
    for z in X:
        rst.append(math.exp(z - M) / total)
    return rst
    
X = [1,3,5,2,4]
print(Softmax(X, False))

[0.011656230956039609, 0.08612854443626872, 0.6364086465588309, 0.031684920796124276, 0.23412165725273665]


## softmax 求导

#### $p_i = y_i = softmax(z_i) = \frac {e^{z_i}} {\sum _c{e ^ {z_c}}} $ 

----
#### 当 i = j
#### $\frac {\partial y_i} {\partial z_i} = \frac {\partial {\left ({\frac {e^{z_i}} {\sum _c{e ^ {z_c}}}}\right)}} {\partial z_i}$ = $\frac{e^{z_i}\left( {\sum _c{e ^ {z_c}} - e^{z_i}}\right)}{ \left({\sum _c{e ^ {z_c}}}\right)^2}$ = $\frac {e^{z_i}} {\sum _c{e ^ {z_c}}}  * \frac {\sum _c{e ^ {z_c}} - e^{z_i}} {\sum _c{e ^ {z_c}}} $ = $\frac {e^{z_i}} {\sum _c{e ^ {z_c}}}  * \left(1 - {\frac {e^{z_i}} {\sum _c{e ^ {z_c}}}} \right) $ = $p_i * (1 - p_i)$
-----
#### 当 $i \neq  j$
#### $\frac {\partial y_i} {\partial z_j} = \frac {\partial {\left ({\frac {e^{z_i}} {\sum _c{e ^ {z_c}}}}\right)}} {\partial z_j}$ = $-\frac{e^{z_i}*e^{z_j}}{\sum _c{e ^ {z_c}}}$ = $-{\frac {e^{z_i}} {\sum _c{e ^ {z_c}}}}* {\frac {e^{z_j}} {\sum _c{e ^ {z_c}}}}$ = $-p_i * p_j$
-----
### 
$$
\frac {\partial y_i} {\partial z_j} = \begin{cases}
p_i * (1 - p_i), & \text{i = j}   \\
-p_i * p_j, & \text{i $\neq$ j}
\end{cases}
$$

In [27]:
# dyi/dxj = yi(1-yi)  i==j
# dyi/dxj = -yiyj    i!=j
def softmax_dx(X, i=0):
    dx = 0
    pi = softmax(X[i], X)
    for j in range(len(X)):
        if i == j:
            dx += pi * (1 - pi)
        if i != j:
            pj = softmax(X[j], X)
            dx -=  pi * pj
    return dx
X = [1,3,5,2,4]
print(softmax_dx(X, 0))

-8.673617379884035e-19


### Softmax 损失函数(交叉熵）
### $ loss_i = 1-p_i => \log(1 - p_i) = - \log p_i = -\log(\frac{e^{z_i}}{\sum_c{e^{z_c}}}) = -(z_i - \log \sum_c{e^{z_c}})$
### $ L = -\sum_c{y_c}{\log p_c},  y_c = 0  或者  y_c = 1 $ 

### $ \frac {\partial L} {\partial z_i} = -\sum_c{y_c}\frac {\partial {\log p_c} }{\partial z_i} =  -\sum_c{y_c}\frac {\partial {\log p_c} }{\partial p_c}\frac {\partial  p_c}{\partial z_i} = -\sum_c{y_c} \frac {1}{p_c} \frac {\partial  p_c}{\partial z_i} $

### $\frac {\partial  p_c}{\partial z_i} 就是softmax求导的结果$
$$
\frac {\partial y_i} {\partial z_j} = \begin{cases}
p_i * (1 - p_i), & \text{i = j}   \\
-p_i * p_j, & \text{i $\neq$ j}
\end{cases}
$$

### $ \frac {\partial L} {\partial z_i} =  \left (-y_i* \frac {1}{p_i} \right ) * p_i (1-p_i) - \sum_{c \neq i} y_c* \frac {1}{p_c} * (-p_i*p_c) = -y_i*(1-p_i) + \sum_{c \neq i} y_c * p_i $
### $= p_i \left ( y_i + \sum_{c\neq i} y_c \right) - y_i = p_i \sum_c y_c - y_i = p_i - y_i$
$\sum_c y_c  = 1$


<img src="./../data/img/softmax_loss.webp">

In [36]:
# loss func  L = - ΣΣtki log(yki)
# dL/dxi = dL/dyj * dyj/dxi = yi - xi
def loss_dx(Y, P, i=0):
    """
     loss_i = -Y[i] * (1 - P[i])
     for c in range(len(Y)):
         if c != i:
            loss_i += Y[c] * P[i]
    """
    return P[i] - Y[i]

W = [4, -4, 3]
P = [0.7308, 0.0003, 0.2689]
Y = [0, 0, 1]
learningrate = 0.0001
for i in range(len(W)):
    loss = loss_dx(Y, P, i)
    W[i] += learningrate * loss
    print(loss, W[i])


0.7308 4.00007308
0.0003 -3.99999997
-0.7311000000000001 2.99992689
