In [1]:
import numpy as np


def softmax_loss_naive(W, X, y, reg):
    """
    Compute the softmax (cross-entropy) loss and its gradient with explicit loops.

    The data has D features and C classes; a minibatch holds N examples.

    Args:
        W: numpy array of shape (D, C) holding the weights.
        X: numpy array of shape (N, D) holding the minibatch, one example per row.
        y: numpy array of shape (N,) holding the labels; y[i] = c means X[i]
           belongs to class c, with 0 <= c < C.
        reg: (float) L2 regularization strength.

    Returns:
        A tuple (loss, dW):
        - loss: the mean cross-entropy over the minibatch plus
          0.5 * reg * sum(W ** 2).
        - dW: gradient of the loss with respect to W, same shape as W.
    """
    num_train = X.shape[0]
    num_classes = W.shape[1]
    grad = np.zeros_like(W)
    total = 0.0

    for i, sample in enumerate(X):
        label = y[i]

        # Shift the class scores so the largest is zero; this leaves the
        # softmax unchanged but keeps the exponentials from overflowing.
        raw = sample.dot(W)
        shifted = raw - np.max(raw)
        normalizer = np.sum(np.exp(shifted))

        # Cross-entropy of this example: log(sum_k e^{s_k}) - s_{label}.
        total += np.log(normalizer) - shifted[label]

        # Column j of the gradient receives (softmax_j - 1{j == label}) * x_i.
        for j in range(num_classes):
            prob = np.exp(shifted[j]) / normalizer
            coeff = prob - 1 if j == label else prob
            grad[:, j] += coeff * sample

    # Average the data term over the minibatch.
    total /= num_train
    grad /= num_train

    # L2 penalty and its derivative.
    total += 0.5 * reg * np.sum(W * W)
    grad += reg * W

    return total, grad

某样本 $i$ 在类别 $j$ 上的得分：

$$
s_j = X_i W_{:,j}
$$

---

softmax函数

$$
\text{softmax}(s_j) = \frac{e^{s_j}}{\sum_{k=1}^C e^{s_k}}
$$

其中： $C$ 是类别数

---

损失函数

1. 交叉熵损失：

$$
L_i = -\log(\frac{e^{s_{y_i}}}{\sum_{k=1}^C e^{s_k}})
= -s_{y_i} + \log(\sum_{k=1}^C e^{s_k})
$$


其中： $s_{y_i}$ 是样本 $i$ 在正确类别 $y_i$ 上的分数


2. 总损失：

$$
L = \frac{1}{N} \sum_{i=1}^N L_i + \frac{\lambda}{2} \sum_{d=1}^D \sum_{k=1}^C W_{d,k}^2
$$

其中：

+ $N$ 是样本数， $D$ 是输入特征的维度
+ $\frac{\lambda}{2} \sum_d \sum_k W_{d,k}^2$ 是正则项， $\lambda$ 是正则化强度

---

对 $L_i$ 求导

1. 对 $-s_{y_i}$ 求导：

$$
\frac{\partial (-s_{y_i})}{\partial W_{:,j}} = \frac{\partial (-X_i W_{:,y_i})}{\partial W_{:,j}}
$$

+ 当 $j = y_i$ 时：

$$
\frac{\partial (-X_i W_{:,y_i})}{\partial W_{:,j}} = \frac{\partial (-X_i W_{:,y_i})}{\partial W_{:,y_i}} = -X_i
$$

+ 当 $j \neq y_i$ 时：

$$
\frac{\partial (-X_i W_{:,y_i})}{\partial W_{:,j}} = 0
$$


2. 对 $\log(\sum_{k=1}^C e^{s_k})$ 求导

$$
\begin{aligned}
\frac{\partial \log(\sum_{k=1}^C e^{s_k})}{\partial W_{:,j}} &= \frac{1}{\sum_{k=1}^C e^{s_k}} \cdot \frac{\partial (\sum_{k=1}^C e^{s_k})}{\partial W_{:,j}} \\ &= \frac{1}{\sum_{k=1}^C e^{s_k}} \cdot \frac{\partial e^{s_j}}{\partial W_{:,j}} \\ &= \frac{e^{s_j}}{\sum_{k=1}^C e^{s_k}} \cdot \frac{\partial s_j}{\partial W_{:,j}} \\ &= \frac{e^{s_j}}{\sum_{k=1}^C e^{s_k}} \cdot \frac{\partial (X_i W_{:,j})}{\partial W_{:,j}} \\ &= \text{softmax}(s_j) X_i
\end{aligned}
$$


3. 整理得：

$$
\frac{\partial L_i}{\partial W_{:,j}} = 
\begin{cases}
(\text{softmax}(s_j) - 1) X_i& \text{if } j = y_i \\
\text{softmax}(s_j) X_i & \text{otherwise}
\end{cases}
$$

---

最后：

$$
\frac{\partial L}{\partial W_{:,j}} = \frac{1}{N} \sum_{i=1}^N \frac{\partial L_i}{\partial W_{:,j}} + \lambda W_{:,j}
$$