# Backpropagation

**反向传播**


In [1]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data

nnfs.init()

单一梯度


In [2]:
dvalues = np.array([[1.0, 1.0, 1.0]])

# fmt: off
# 有 3 组权重，每个神经元一组；有 4 个输入，因此为 4 个权重
weights = np.array([[0.2, 0.8, -0.5, 1],
                    [0.5, -0.91, 0.26, -0.5],
                    [-0.26, -0.27, 0.17, 0.87]]).T
# fmt: on

dx0 = sum(weights[0] * dvalues[0])
dx1 = sum(weights[1] * dvalues[0])
dx2 = sum(weights[2] * dvalues[0])
dx3 = sum(weights[3] * dvalues[0])

dinputs = np.array([dx0, dx1, dx2, dx3])

# weights 与 dvalues 相乘时，dvalues 将自动按行广播来匹配到 weights(4, 3)  shape dvalues(3, 4)
dinputs2 = np.sum(weights * dvalues, axis=1)

dinputs3 = np.dot(dvalues[0], weights.T)


dinputs, dinputs2, dinputs3

(array([ 0.44, -0.38, -0.07,  1.37]),
 array([ 0.44, -0.38, -0.07,  1.37]),
 array([ 0.44, -0.38, -0.07,  1.37], dtype=float32))

批梯度


In [3]:
# fmt: off
dvalues = np.array([[1.0, 1.0, 1.0],
                    [2.0, 2.0, 2.0],
                    [3.0, 3.0, 3.0]])
# 有 3 组权重，每个神经元一组；有 4 个输入，因此为 4 个权重
weights = np.array([[0.2, 0.8, -0.5, 1],
                    [0.5, -0.91, 0.26, -0.5],
                    [-0.26, -0.27, 0.17, 0.87]])
# fmt: on

dinputs = np.dot(weights.T, dvalues)  # np.dot(dvalues, weights)
print("dinputs =\n", dinputs)

# fmt: off
inputs = np.array([[1, 2, 3, 2.5],
                   [2.0, 5.0, -1.0, 2.0],
                   [-1.5, 2.7, 3.3, -0.8]])
# fmt: on

dweights = np.dot(inputs.T, dvalues)
print("dweights =\n", dweights)

# 每个神经元一个偏置
biases = np.array([2, 3, 0.5])
# 对值求和，在样本（第 1 个轴）上进行，保持维度
dbiases = np.sum(dvalues, axis=0, keepdims=True)
print("dbiases =\n", dbiases)

dinputs =
 [[ 0.42  0.42  0.42]
 [-1.83 -1.83 -1.83]
 [ 0.53  0.53  0.53]
 [ 2.61  2.61  2.61]]
dweights =
 [[ 0.5  0.5  0.5]
 [20.1 20.1 20.1]
 [10.9 10.9 10.9]
 [ 4.1  4.1  4.1]]
dbiases =
 [[6. 6. 6.]]


In [4]:
# Dense layer
class Layer_Dense:
  # Layer initialization
  def __init__(self, n_inputs, n_neurons):
    # Initialize weights and biases
    self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
    self.biases = np.zeros((1, n_neurons))

  # Forward pass
  def forward(self, inputs):
    # Remember input values
    self.inputs = inputs
    # Calculate output values from inputs, weights and biases
    self.output = np.dot(inputs, self.weights) + self.biases

  # Backward pass
  def backward(self, dvalues):
    # Gradients on parameters
    self.dweights = np.dot(self.inputs.T, dvalues)
    self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
    # Gradient on values
    self.dinputs = np.dot(dvalues, self.weights.T)


In [5]:
# ReLU activation
class Activation_ReLU:
  # Forward pass
  def forward(self, inputs):
    # Remember input values
    self.inputs = inputs
    # Calculate output values from inputs
    self.output = np.maximum(0, inputs)

  # Backward pass
  def backward(self, dvalues):
    # Since we need to modify original variable,
    # let's make a copy of values first
    self.dinputs = dvalues.copy()

    # Zero gradient where input values were negative
    self.dinputs[self.inputs <= 0] = 0

In [6]:
# Softmax activation
class Activation_Softmax:
  # Forward pass
  def forward(self, inputs):
    # Remember input values
    self.inputs = inputs

    # Get unnormalized probabilities
    exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
    # Normalize them for each sample
    probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)

    self.output = probabilities

  # Backward pass
  def backward(self, dvalues):
    # Create uninitialized array
    self.dinputs = np.empty_like(dvalues)

    # Enumerate outputs and gradients
    for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):
      # Flatten output array
      single_output = single_output.reshape(-1, 1)
      # Calculate Jacobian matrix of the output
      # S_{i,j}KroneckerDelta_{i,j} - S_{i,j}S_{i,k}
      jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)

      # Calculate sample-wise gradient
      # and add it to the array of sample gradients
      self.dinputs[index] = np.dot(jacobian_matrix, single_dvalues)

In [7]:
# Common loss class
class Loss:
  # Calculates the data and regularization losses
  # given model output and ground truth values
  def calculate(self, output, y):
    # Calculate sample losses
    sample_losses = self.forward(output, y)

    # Calculate mean loss
    data_loss = np.mean(sample_losses)

    # Return loss
    return data_loss

In [29]:
# Cross-entropy loss
class Loss_CategoricalCrossentropy(Loss):
  # Forward pass
  def forward(self, y_pred, y_true):
    # Number of samples in a batch
    samples = len(y_pred)

    # Clip data to prevent division by 0
    # Clip both sides to not drag mean towards any value
    y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

    # Probabilities for target values -
    # only if categorical labels
    if len(y_true.shape) == 1:
      correct_confidences = y_pred_clipped[range(samples), y_true]

    # Mask values - only for one-hot encoded labels
    elif len(y_true.shape) == 2:
      correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)

    # Losses
    negative_log_likelihoods = -np.log(correct_confidences)
    return negative_log_likelihoods

  # Backward pass
  def backward(self, dvalues, y_true):
    # Number of samples
    samples = len(dvalues)
    # Number of labels in every sample
    # We'll use the first sample to count them
    labels = len(dvalues[0])

    # If labels are sparse, turn them into one-hot vector
    if len(y_true.shape) == 1:
      y_true = np.eye(labels)[y_true]

    # np.mean(-y_true / dvalues)
    # Calculate gradient
    self.dinputs = -y_true / dvalues
    # Normalize gradient
    self.dinputs = self.dinputs / samples  # np.mean(-y_true / dvalues)

In [30]:
# Softmax classifier - combined Softmax activation
# and cross-entropy loss for faster backward step
class Activation_Softmax_Loss_CategoricalCrossentropy:
  # Creates activation and loss function objects
  def __init__(self):
    self.activation = Activation_Softmax()
    self.loss = Loss_CategoricalCrossentropy()

  # Forward pass
  def forward(self, inputs, y_true):
    # Output layer's activation function
    self.activation.forward(inputs)
    # Set the output
    self.output = self.activation.output
    # Calculate and return loss value
    return self.loss.calculate(self.output, y_true)

  # Backward pass
  def backward(self, dvalues, y_true):
    # Number of samples
    samples = len(dvalues)

    # If labels are one-hot encoded,
    # turn them into discrete values
    if len(y_true.shape) == 2:
      y_true = np.argmax(y_true, axis=1)

    # Copy so we can safely modify
    self.dinputs = dvalues.copy()
    # Calculate gradient
    self.dinputs[range(samples), y_true] -= 1
    # Normalize gradient
    self.dinputs = self.dinputs / samples

In [33]:
np.random.seed(39)

# Create dataset
X, y = spiral_data(samples=100, classes=3)

# Create Dense layer with 2 input features and 3 output values
dense1 = Layer_Dense(2, 3)

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 3 input features (as we take output
# of previous layer here) and 3 output values (output values)
dense2 = Layer_Dense(3, 3)

# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# Perform a forward pass of our training data through this layer
dense1.forward(X)

# Perform a forward pass through activation function
# takes the output of first dense layer here
activation1.forward(dense1.output)

# Perform a forward pass through second Dense layer
# takes outputs of activation function of first layer as inputs
dense2.forward(activation1.output)

# Perform a forward pass through the activation/loss function
# takes the output of second dense layer here and returns loss
loss = loss_activation.forward(dense2.output, y)
# Let's see output of the first few samples:
print(loss_activation.output[:5])

# Print loss value
print("loss:", loss)

# Calculate accuracy from output of activation2 and targets
# calculate values along first axis
predictions = np.argmax(loss_activation.output, axis=1)
if len(y.shape) == 2:
  y = np.argmax(y, axis=1)
accuracy = np.mean(predictions == y)

# Print accuracy
print("acc:", accuracy)

# Backward pass
loss_activation.backward(loss_activation.output, y)
dense2.backward(loss_activation.dinputs)
activation1.backward(dense2.dinputs)
dense1.backward(activation1.dinputs)

# Print gradients
print("dense1.dweights:\n", dense1.dweights)
print("dense1.dbiases:\n", dense1.dbiases)
print("dense2.dweights:\n", dense2.dweights)
print("dense2.dbiases:\n", dense2.dbiases)


[[0.33333334 0.33333334 0.33333334]
 [0.3333335  0.3333335  0.33333305]
 [0.33333367 0.33333358 0.33333272]
 [0.33333355 0.33333385 0.33333263]
 [0.333333   0.33333418 0.33333278]]
loss: 1.0986123
acc: 0.32
dense1.dweights:
 [[ 3.2762854e-04  2.5372763e-04 -3.7113682e-04]
 [-5.4703956e-05 -3.7952783e-04  2.2306606e-04]]
dense1.dbiases:
 [[-0.00032174 -0.00043721  0.00053705]]
dense2.dweights:
 [[-9.95226146e-05  3.27997783e-04 -2.28475197e-04]
 [ 1.01648984e-04 -2.43361646e-04  1.41712662e-04]
 [ 2.35507177e-04 -3.19360319e-04  8.38531341e-05]]
dense2.dbiases:
 [[-7.707160e-06  5.553011e-07  6.938586e-06]]


比较两种算法的速度


In [35]:
from timeit import timeit


softmax_outputs = np.array([[0.7, 0.1, 0.2], [0.1, 0.5, 0.4], [0.02, 0.9, 0.08]])
class_targets = np.array([0, 1, 1])


def f1():
  softmax_loss = Activation_Softmax_Loss_CategoricalCrossentropy()
  softmax_loss.backward(softmax_outputs, class_targets)
  dvalues1 = softmax_loss.dinputs


def f2():
  activation = Activation_Softmax()
  activation.output = softmax_outputs
  loss = Loss_CategoricalCrossentropy()
  loss.backward(softmax_outputs, class_targets)
  activation.backward(loss.dinputs)
  dvalues2 = activation.dinputs


t1 = timeit(lambda: f1(), number=10000)
t2 = timeit(lambda: f2(), number=10000)
print(t1, t2)
print(t2 / t1)

0.07109954100451432 0.2592502910119947
3.646300487306017
