In [2]:
import numpy as np


def svm_loss_naive(W, X, y, reg):
    """
    Multiclass (structured) SVM hinge loss and gradient, computed with
    explicit Python loops.

    The data has D features and C classes; a minibatch holds N examples.

    Inputs:
    - W: numpy array of shape (D, C) holding the weights.
    - X: numpy array of shape (N, D) holding one example per row.
    - y: numpy array of shape (N,) of integer labels, with 0 <= y[i] < C
      naming the correct class of X[i].
    - reg: (float) L2 regularization strength.

    Returns a tuple of:
    - the scalar loss: mean hinge loss over the minibatch plus
      0.5 * reg * sum(W ** 2)
    - the gradient of that loss with respect to W, same shape as W
    """
    n_examples = X.shape[0]
    n_classes = W.shape[1]

    grad = np.zeros(W.shape)  # running gradient, one column per class
    total = 0.0  # running sum of hinge terms over the whole minibatch

    for idx in range(n_examples):
        sample = X[idx]
        label = y[idx]
        class_scores = sample.dot(W)
        target_score = class_scores[label]

        for cls in range(n_classes):
            # The correct class contributes no hinge term of its own.
            if cls != label:
                # Hinge with a fixed margin (delta) of 1.
                violation = class_scores[cls] - target_score + 1
                if violation > 0:
                    total += violation
                    # Each violated margin pushes the offending class's
                    # column along +x and the correct class's along -x.
                    grad[:, cls] += sample
                    grad[:, label] -= sample

    # Turn the sums over examples into minibatch averages.
    total /= n_examples
    grad /= n_examples

    # L2 penalty; the 0.5 factor makes its gradient exactly reg * W.
    l2_penalty = 0.5 * reg * np.sum(W * W)
    total += l2_penalty
    grad += reg * W

    return total, grad

某样本 $i$ 在类别 $j$ 上的得分：

$$
s_j = X_i W_{:,j} \\
$$


---

损失函数：

$$
L_i = \sum_{j \neq y_i}^C \max(0, s_j - s_{y_i} + 1) \\
$$

其中：

+ $C$ 是类别数
+ $s_{y_i}$ 是样本 $i$ 在正确类别 $y_i$ 上的分数


总损失：

$$
L = \frac{1}{N} \sum_{i=1}^N L_i + \frac{\lambda}{2} \sum_j \sum_{k=1}^C W_{j,k}^2
$$

其中： 

+ $N$ 是样本数
+ $\frac{\lambda}{2} \sum_j \sum_{k=1}^C W_{j,k}^2$ 是正则项， $\lambda$ 是正则化强度

---

当 $j \neq y_i$ 时：

$$
\begin{aligned}
\frac{\partial (s_j - s_{y_i} + 1)}{\partial W_{:,k}} &= \frac{\partial s_j}{\partial W_{:,k}} - \frac{\partial s_{y_i}}{\partial W_{:,k}} \\ &= \frac{\partial (X_i W_{:,j})}{\partial W_{:,k}} - \frac{\partial (X_i W_{:,y_i})}{\partial W_{:,k}}
\end{aligned}
$$

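+ 当 $k = j$ 时（由于 $j \neq y_i$，此时必有 $k \neq y_i$）， $\frac {\partial (X_i W_{:,y_i})}{\partial W_{:,k}} = 0$ ，此时：

$$
\frac{\partial (s_j - s_{y_i} + 1)}{\partial W_{:,k}} = X_i
$$

+ 当 $k \neq j$ 且 $k \neq y_i$ 时， $s_j$ 与 $s_{y_i}$ 都不依赖 $W_{:,k}$ ，导数为 $0$ 。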


+ 当 $k = y_i$ 时：

$$
\frac{\partial (s_j - s_{y_i} + 1)}{\partial W_{:,k}} = -X_i
$$

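整理得（只有间隔 $s_j - s_{y_i} + 1 > 0$ 的项才对梯度有贡献，记 $\mathbf{1}(\cdot)$ 为指示函数）：

$$
\frac{\partial L_i}{\partial W_{:,k}} =
\begin{cases}
\mathbf{1}(s_k - s_{y_i} + 1 > 0) \, X_i, & k \neq y_i \\
-\left( \sum_{j \neq y_i} \mathbf{1}(s_j - s_{y_i} + 1 > 0) \right) X_i, & k = y_i
\end{cases}
$$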

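用字母 $j$ 代替字母 $k$ 得（求和下标改记为 $m$ 以避免重名， $\mathbf{1}(\cdot)$ 为指示函数）：

$$
\frac{\partial L_i}{\partial W_{:,j}} =
\begin{cases}
\mathbf{1}(s_j - s_{y_i} + 1 > 0) \, X_i, & j \neq y_i \\
-\left( \sum_{m \neq y_i} \mathbf{1}(s_m - s_{y_i} + 1 > 0) \right) X_i, & j = y_i
\end{cases}
$$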

---

最后：

$$
\frac{\partial L}{\partial W} = \frac{1}{N} \sum_{i=1}^N \frac{\partial L_i}{\partial W} + \lambda W
$$