backpropagate.m
function [ weight_changes, deltas ] = backpropagate( dJ, network, feedforward_results, network_error, varargin )
%BACKPROPAGATE Performs the error backpropagation and returns the
%   per-layer weight changes and error deltas.
    p = inputParser;
    addRequired(p, 'dJ');
    addRequired(p, 'network');
    addRequired(p, 'results');
    addRequired(p, 'error');
    addParameter(p, 'fse', 0); % presumably a flat-spot elimination offset; it is added to the activation gradient below
    parse(p, dJ, network, feedforward_results, network_error, varargin{:});
    network = p.Results.network;
    feedforward_results = p.Results.results;
    network_error = p.Results.error;
    fse = p.Results.fse;
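    % Illustrative call (values assumed, not from this repository):
    %   backpropagate(dJ, network, results, err, 'fse', 0.1)
    % would apply the classic 0.1 flat-spot offset in the style of
    % Fahlman (1988) to keep saturated units learning.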
    % prepare the error deltas and weight gradients, one cell per layer
    deltas = cell(numel(network), 1);
    weight_changes = cell(numel(network), 1);
    % calculate the error delta on the network's output layer; dJ is the
    % gradient of the cost with respect to the network error and is
    % assumed to already account for the output activation (e.g. a linear
    % output with a squared-error cost, where dJ is simply the identity)
    network_results = feedforward_results{end};
    deltas{end} = dJ(network_error);
    % Calculate the weight change on the network output as the outer
    % product of the output delta and the layer's input. The bias weight
    % needs no special treatment, because the input field already contains
    % the required "1"-activation of the bias neuron.
    weight_changes{end} = deltas{end} * network_results.input';
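    % Dimension sketch (shapes inferred from this file, not stated in it):
    % with m output units fed by n upstream activations plus the bias, the
    % delta is m-by-1 and the input is (n+1)-by-1, so the outer product
    % yields the m-by-(n+1) matrix matching this layer's theta.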
    % perform the actual backpropagation, from the last hidden layer
    % backwards to the first
    for j = numel(network)-1 : -1 : 1
        % obtain the results of the forward propagation
        result = feedforward_results{j};
        layer = result.layer;
        net = result.net;     % pre-activation values of this layer
        input = result.input; % layer input, including the leading bias "1"
        % obtain the results of the following layer (i.e. downstream)
        next_result = feedforward_results{j+1};
        next_layer = next_result.layer;
        % We have two options here:
        % Either we calculate the delta including the bias neurons
        % (extending the activation gradient with a one or a zero) and then
        % simply don't backpropagate it - or - we remove the bias weights
        % from the weight matrix and backpropagate as usual.
        % This example follows the second approach, which is why we take
        % only the columns after the first one (the bias column).
        next_weights = next_layer.theta(:, 2:end);
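        % For example (shapes assumed): a downstream layer with 3 units and
        % 2 inputs has a 3-by-3 theta whose first column holds the bias
        % weights; dropping that column leaves the 3-by-2 block through
        % which the error flows back to the 2 upstream units.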
        next_delta = deltas{j+1}; % error delta already computed for the downstream layer
        % derivative of the activation function at the pre-activation
        % values; the fse offset keeps the gradient away from zero in
        % saturated regions
        activation_gradient = layer.dsigma( net ) + fse;
        % standard backpropagation rule: propagate the downstream deltas
        % back through the weights, then scale by the local activation
        % gradient (see e.g. http://stats.stackexchange.com/a/130605/26843)
        delta = (next_weights' * next_delta) .* activation_gradient;
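        % In formula form (notation assumed): delta_j =
        %   (theta_{j+1}(:, 2:end)' * delta_{j+1}) .* dsigma(net_j),
        % i.e. each unit receives its weighted share of the downstream
        % error, scaled by how sensitive its own activation currently is.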
        % collect the delta for backpropagation to the preceding layer
        deltas{j} = delta;
        % calculate the weight change, again as an outer product with the
        % layer's (bias-extended) input
        weight_changes{j} = delta * input';
        clear result layer net input;
        clear next_result next_layer next_weights next_delta;
        clear activation_gradient delta;
    end % for each layer
end
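
% --- Usage sketch (added for illustration; not part of the original code) ---
% A minimal local function showing how backpropagate might be wired up for a
% tiny 2-3-1 network: sigmoid hidden layer, linear output unit and a
% squared-error cost, so that dJ reduces to the identity. The struct fields
% (theta, dsigma, layer, net, input) follow the shapes the function above
% implies; the forward pass is written out inline because the repository's
% own feedforward routine is not shown here.
function example_backprop_step()
    sigma  = @(z) 1 ./ (1 + exp(-z));
    dsigma = @(z) sigma(z) .* (1 - sigma(z));
    % hidden layer: 3 units, 2 inputs + bias; output: 1 unit, 3 inputs + bias
    network = { struct('theta', randn(3, 3), 'dsigma', dsigma), ...
                struct('theta', randn(1, 4), 'dsigma', @(z) ones(size(z))) };
    % forward pass for a single example; column 1 of each theta is the bias
    x    = [0.5; -0.2];
    a0   = [1; x];              % input extended with the bias "1"
    net1 = network{1}.theta * a0;
    a1   = [1; sigma(net1)];    % hidden activations extended with the bias "1"
    net2 = network{2}.theta * a1;
    y    = net2;                % linear output unit
    feedforward_results = { ...
        struct('layer', network{1}, 'net', net1, 'input', a0), ...
        struct('layer', network{2}, 'net', net2, 'input', a1) };
    dJ = @(e) e;                % gradient of J = 0.5 * e' * e
    network_error = y - 0.3;    % error against an arbitrary target of 0.3
    [weight_changes, deltas] = backpropagate(dJ, network, ...
        feedforward_results, network_error); %#ok<ASGLU>
    disp(size(weight_changes{1})); % 3-by-3, matches network{1}.theta
    disp(size(weight_changes{2})); % 1-by-4, matches network{2}.theta
end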