### $p_i = softmax(z_i) = \frac {e^{z_i}} {\sum _c{e ^ {z_c}}}$
### $P = [p_0, p_1,...,p_C], Z = [z_0, z_1, ..., z_C], P = softmax(Z), Y=[y_0, y_1, ...., y_C] 真实label$
### $p = sigmoid(z) = \frac {e^z} {1 + e^z} $  sigmoid（二分类） 是softmax（多分类）的特殊形式


In [17]:
# coding=utf-8
import math
def softmax(zi, X):
    return math.exp(zi) / sum([math.exp(zc) for zc in X])

def Softmax(X, normalize=True):
    M = max(X) if normalize else 0
    total = sum([math.exp(zc - M) for zc in X])
    rst = []
    for z in X:
        rst.append(math.exp(z - M) / total)
    return rst
    
X = [1,3,5,2,4]
print(Softmax(X, False))

[0.011656230956039609, 0.08612854443626872, 0.6364086465588309, 0.031684920796124276, 0.23412165725273665]


## softmax 求偏导

----
#### 当 i = j
#### $\frac {\partial p_i} {\partial z_i} = \frac {\partial {\left ({\frac {e^{z_i}} {\sum _c{e ^ {z_c}}}}\right)}} {\partial z_i}$ = $\frac{e^{z_i}\left( {\sum _c{e ^ {z_c}} - e^{z_i}}\right)}{ \left({\sum _c{e ^ {z_c}}}\right)^2}$ = $\frac {e^{z_i}} {\sum _c{e ^ {z_c}}}  * \frac {\sum _c{e ^ {z_c}} - e^{z_i}} {\sum _c{e ^ {z_c}}} $ = $\frac {e^{z_i}} {\sum _c{e ^ {z_c}}}  * \left(1 - {\frac {e^{z_i}} {\sum _c{e ^ {z_c}}}} \right) $ = $p_i * (1 - p_i)$
-----
#### 当 $i \neq  j$
#### $\frac {\partial p_i} {\partial z_j} = \frac {\partial {\left ({\frac {e^{z_i}} {\sum _c{e ^ {z_c}}}}\right)}} {\partial z_j}$ = $-\frac{e^{z_i}*e^{z_j}}{\sum _c{e ^ {z_c}}}$ = $-{\frac {e^{z_i}} {\sum _c{e ^ {z_c}}}}* {\frac {e^{z_j}} {\sum _c{e ^ {z_c}}}}$ = $-p_i * p_j$
-----
$
\frac {\partial p_i} {\partial z_j} = \begin{cases}
p_i * (1 - p_i), & \text{i = j}   \\
-p_i * p_j, & \text{i $\neq$ j}
\end{cases}
$

In [27]:
# dyi/dxj = yi(1-yi)  i==j
# dyi/dxj = -yiyj    i!=j
def softmax_dx(X, i=0):
    dx = 0
    pi = softmax(X[i], X)
    for j in range(len(X)):
        if i == j:
            dx += pi * (1 - pi)
        if i != j:
            pj = softmax(X[j], X)
            dx -=  pi * pj
    return dx
X = [1,3,5,2,4]
print(softmax_dx(X, 0))

-8.673617379884035e-19


### Softmax 损失函数(交叉熵）
### $ p_i 是预测值[0,1] 概率，y_i 是真实label， y_i ∈ \{0,1\}$
### 极大释然估计(模型已定，参数未知）
### $\theta 表示模型参数 , D = \{x_1,x_2,x_3....x_n\} 表示样本集合$
### $l(\theta) = p(D|\theta) = \Pi_i^N p(x_i|\theta)  = \Pi_i^N\Pi_c^C p_{ic}^{y_{ic}}$
### 如果是二分类 $ p(x_i|\theta) = \Pi_c^C p_{ic}^{y_{ic}} = p_i^{y_i} * (1-p_i)^{1-y_i}$
### $ \hat \theta = \underset{\theta}{\operatorname{argmax}} H(\theta) = \underset{\theta}{\operatorname{argmax}}\, \log \,l(\theta) = \sum _i^N \sum_c^C \log p_{ic}^{y_{ic}} = \sum _i^N \sum_c^C y_{ic} \log p_{ic}  $
### $ L = -\sum_i^N \sum_c^C{y_{ic}}{\log p_{ic}} = -\sum_i^N \sum_c^C {y_{ic}}(z_{ic} - \log\sum_c e^{ic}), 交叉熵 = 极大释然求最小值 $ 
### 只考虑一个样本的情况， $Z = [z_1, z_2...z_C]$
### $ \frac {\partial L} {\partial z_i} = -\sum_c^C{y_c}\frac {\partial {\log p_c} }{\partial z_i} =  -\sum_c^C{y_c}\frac {\partial {\log p_c} }{\partial p_c}\frac {\partial  p_c}{\partial z_i} = -\sum_c^C{y_c} \frac {1}{p_c} \frac {\partial  p_c}{\partial z_i} $

### $\frac {\partial  p_c}{\partial z_i} 就是softmax求导的结果$
$
\frac {\partial p_i} {\partial z_j} = \begin{cases}
p_i * (1 - p_i), & \text{i = j}   \\
-p_i * p_j, & \text{i $\neq$ j}
\end{cases}
$

### $ \frac {\partial L} {\partial z_i} =  \left (-y_i* \frac {1}{p_i} \right ) * p_i (1-p_i) - \sum_{c \neq i} y_c* \frac {1}{p_c} * (-p_i*p_c) = -y_i*(1-p_i) + \sum_{c \neq i} y_c * p_i $
### $= p_i \left ( y_i + \sum_{c\neq i} y_c \right) - y_i = p_i \sum_c y_c - y_i = p_i - y_i$
$\sum_c y_c  = 1$

### $ \frac {\partial L} {\partial Z} = P - Y$

<img src="./data/img/softmax_loss.webp">

In [41]:
# loss func  L = - ΣΣtki log(yki)
# dL/dxi = dL/dyj * dyj/dxi = yi - xi
def loss_dx(Y, P, i=0):
    """
     loss_i = -Y[i] * (1 - P[i])
     for c in range(len(Y)):
         if c != i:
            loss_i += Y[c] * P[i]
    """
    return P[i] - Y[i]
X = [1, 3, 5]
W = [4, -4, 3]
b = [4, -4, 3]
P = [0.7308, 0.0003, 0.2689]
Y = [0, 0, 1]
learningrate = 0.0001

# softmax(z) = [p1,p2,33]
# label Y= [y1,y2,y3]
# Z = σ(WX + b)
# δL/δw = δL/δz * δz/δw = x * δL/δz
# δL/δb = δL/δz * δz/δb = δL/δz
# 新的梯度 δL/δX = δL/δz * δz/δX = W * δL/δz

Gradient = P - Y
W -= learningrate * Grad * X
b -= learningrate * Grad

for i in range(len(Y)):
    grad = loss_dx(Y, P, i)
    W[i] -= learningrate * grad * X[i]
    B[i] -= learningrate * grad



TypeError: unsupported operand type(s) for -: 'list' and 'list'