
Commit

second_order_training
yechengxi committed Aug 22, 2017
1 parent 6e98601 commit cde5bf0
Showing 10 changed files with 244 additions and 121 deletions.
3 changes: 3 additions & 0 deletions CoreModules/activations/modu.m
@@ -1,4 +1,7 @@
function y = modu(x,dzdy)
%ModU activation function
%Ye, C., Yang, Y., Fermuller, C., & Aloimonos, Y. (2017).
%On the Importance of Consistency in Training Deep Neural Networks. arXiv preprint arXiv:1708.00631.

if nargin <= 1 || isempty(dzdy)
y = abs(x) ;
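Only the forward branch of modu.m is visible in the hunk above. As a reference, here is a minimal self-contained sketch of a ModU-style activation under this repository's two-argument forward/backward convention; the helper name modu_sketch and the sign(x)-based backward branch (the derivative of abs(x)) are illustrative assumptions, not copied from the collapsed part of the file.

function y = modu_sketch(x, dzdy)
% ModU-style activation sketch: forward returns |x|; with dzdy supplied it
% back-propagates through the derivative of |x|, i.e. sign(x). Illustrative only.
if nargin <= 1 || isempty(dzdy)
    y = abs(x);            % forward pass
else
    y = sign(x) .* dzdy;   % backward pass: d|x|/dx = sign(x) (0 at x = 0)
end
end

% Example: modu_sketch([-2 0 3]) returns [2 0 3];
% modu_sketch([-2 0 3], [1 1 1]) returns [-1 0 1].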
31 changes: 21 additions & 10 deletions CoreModules/optim/adagrad.m
@@ -1,7 +1,18 @@
function [ net,res,opts ] = adagrad( net,res,opts )
%NET_APPLY_GRAD_SGD Summary of this function goes here
% Detailed explanation goes here
% Modified Adagrad using second-order information:
% 1. Duchi, J., Hazan, E., & Singer, Y. (2011).
% Adaptive subgradient methods for online learning and stochastic optimization. Journal of Machine Learning Research, 12(Jul), 2121-2159.
% 2. Ye, C., Yang, Y., Fermuller, C., & Aloimonos, Y. (2017).
% On the Importance of Consistency in Training Deep Neural Networks. arXiv preprint arXiv:1708.00631.
%

if ~isfield(opts.parameters,'second_order')
opts.parameters.second_order=0;
end
if opts.parameters.second_order
[ net,res,opts ] = gradient_decorrelation( net,res,opts );
end

if ~isfield(opts.parameters,'weightDecay')
opts.parameters.weightDecay=1e-4;
end
@@ -11,18 +22,18 @@
end

for layer=1:numel(net.layers)
if isfield(net.layers{1,layer},'weights')
if isfield(net.layers{layer},'weights')

if ~isfield(net.layers{1,layer},'momentum')||(isfield(opts,'reset_mom')&&opts.reset_mom==1)
net.layers{1,layer}.momentum{1}=zeros(size(net.layers{1,layer}.weights{1}),'like',net.layers{1,layer}.weights{1});
net.layers{1,layer}.momentum{2}=zeros(size(net.layers{1,layer}.weights{2}),'like',net.layers{1,layer}.weights{2});
if ~isfield(net.layers{layer},'momentum')||(isfield(opts,'reset_mom')&&opts.reset_mom==1)
net.layers{layer}.momentum{1}=zeros(size(net.layers{layer}.weights{1}),'like',net.layers{layer}.weights{1});
net.layers{layer}.momentum{2}=zeros(size(net.layers{layer}.weights{2}),'like',net.layers{layer}.weights{2});
end

net.layers{1,layer}.momentum{1}=net.layers{1,layer}.momentum{1}+res(layer).dzdw.^2;
net.layers{1,layer}.weights{1}=net.layers{1,layer}.weights{1}-opts.parameters.lr*res(layer).dzdw./(net.layers{1,layer}.momentum{1}.^0.5+opts.parameters.eps)- opts.parameters.weightDecay * net.layers{1,layer}.weights{1};
net.layers{layer}.momentum{1}=net.layers{layer}.momentum{1}+res(layer).dzdw.^2;
net.layers{layer}.weights{1}=net.layers{layer}.weights{1}-opts.parameters.lr*res(layer).dzdw./(net.layers{layer}.momentum{1}.^0.5+opts.parameters.eps)- opts.parameters.weightDecay * net.layers{layer}.weights{1};

net.layers{1,layer}.momentum{2}=net.layers{1,layer}.momentum{2}+res(layer).dzdb.^2;
net.layers{1,layer}.weights{2}=net.layers{1,layer}.weights{2}-opts.parameters.lr*res(layer).dzdb./(net.layers{1,layer}.momentum{2}.^0.5+opts.parameters.eps);
net.layers{layer}.momentum{2}=net.layers{layer}.momentum{2}+res(layer).dzdb.^2;
net.layers{layer}.weights{2}=net.layers{layer}.weights{2}-opts.parameters.lr*res(layer).dzdb./(net.layers{layer}.momentum{2}.^0.5+opts.parameters.eps);

end
end
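The Adagrad accumulator and update rule above are unchanged; the patch only preconditions the gradients with gradient_decorrelation before the step when the new flag is set. A hedged usage sketch for turning this on (the surrounding training loop and the construction of net and res are assumed to exist as elsewhere in the repo; the numeric values are illustrative):

% Enable the decorrelated ("second-order") mode for the patched optimizers.
% Field names match the diff; the specific values are illustrative.
opts.parameters.second_order = 1;     % run gradient_decorrelation each step
opts.parameters.lambda_sgd2  = 1e0;   % damping used when inverting x*x'/N
opts.parameters.lr           = 1e-2;  % base learning rate (illustrative value)
opts.parameters.weightDecay  = 1e-4;  % matches the default set in adagrad.m
[net, res, opts] = adagrad(net, res, opts);   % decorrelate, then Adagrad step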
63 changes: 35 additions & 28 deletions CoreModules/optim/adam.m
@@ -1,57 +1,64 @@
function [ net,res,opts ] = adam( net,res,opts )
%NET_APPLY_GRAD_SGD Summary of this function goes here
% Detailed explanation goes here
% Modified Adam using second-order information.
% 1. Kingma, D., & Ba, J. (2014).
% Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980.
% 2. Ye, C., Yang, Y., Fermuller, C., & Aloimonos, Y. (2017).
% On the Importance of Consistency in Training Deep Neural Networks. arXiv preprint arXiv:1708.00631.

if ~isfield(opts.parameters,'second_order')
opts.parameters.second_order=0;
end
if opts.parameters.second_order
[ net,res,opts ] = gradient_decorrelation( net,res,opts );
end

if ~isfield(opts.parameters,'weightDecay')
opts.parameters.weightDecay=0;
end


if (~isfield(opts.parameters,'mom2'))
opts.parameters.mom2=0.999;
end

if ~isfield(net,'iterations')||(isfield(opts,'reset_mom')&&opts.reset_mom==1)
net.iterations=0;
end


if ~isfield(opts.parameters,'eps')
opts.parameters.eps=1e-8;
end

if ~isfield(net,'iterations')||(isfield(opts,'reset_mom')&&opts.reset_mom==1)
net.iterations=0;
end

net.iterations=net.iterations+1;

mom_factor=(1-opts.parameters.mom.^net.iterations);
mom_factor2=(1-opts.parameters.mom2.^net.iterations);

for layer=1:numel(net.layers)
if isfield(net.layers{1,layer},'weights')%strcmp(net.layers{layer}.type,'conv')||strcmp(net.layers{layer}.type,'mlp')
if ~isfield(net.layers{1,layer},'momentum')||(isfield(opts,'reset_mom')&&opts.reset_mom==1)||length(net.layers{1,layer}.momentum)<4
net.layers{1,layer}.momentum{1}=zeros(size(net.layers{1,layer}.weights{1}),'like',net.layers{1,layer}.weights{1});
net.layers{1,layer}.momentum{2}=zeros(size(net.layers{1,layer}.weights{2}),'like',net.layers{1,layer}.weights{2});
net.layers{1,layer}.momentum{3}=net.layers{1,layer}.momentum{1};%initialize
net.layers{1,layer}.momentum{4}=net.layers{1,layer}.momentum{2};%initialize
if isfield(net.layers{layer},'weights')
if ~isfield(net.layers{layer},'momentum')||(isfield(opts,'reset_mom')&&opts.reset_mom==1)||length(net.layers{layer}.momentum)<4
net.layers{layer}.momentum{1}=zeros(size(net.layers{layer}.weights{1}),'like',net.layers{layer}.weights{1});
net.layers{layer}.momentum{2}=zeros(size(net.layers{layer}.weights{2}),'like',net.layers{layer}.weights{2});
net.layers{layer}.momentum{3}=net.layers{layer}.momentum{1};%initialize
net.layers{layer}.momentum{4}=net.layers{layer}.momentum{2};%initialize

end
end
end


mom_factor=(1-opts.parameters.mom.^net.iterations);
mom_factor2=(1-opts.parameters.mom2.^net.iterations);


for layer=1:numel(net.layers)
if isfield(net.layers{1,layer},'weights')
if isfield(net.layers{layer},'weights')

net.layers{1,layer}.momentum{1}=opts.parameters.mom.*net.layers{1,layer}.momentum{1}+(1-opts.parameters.mom).*res(layer).dzdw;
net.layers{1,layer}.momentum{3}=opts.parameters.mom.*net.layers{1,layer}.momentum{3}+(1-opts.parameters.mom).*res(layer).dzdw.^2;
net.layers{1,layer}.weights{1}=net.layers{1,layer}.weights{1}-opts.parameters.lr*net.layers{1,layer}.momentum{1} ...
./(net.layers{1,layer}.momentum{3}.^0.5+opts.parameters.eps) .*mom_factor2^0.5./mom_factor ...
- opts.parameters.weightDecay * net.layers{1,layer}.weights{1};
net.layers{layer}.momentum{1}=opts.parameters.mom.*net.layers{layer}.momentum{1}+(1-opts.parameters.mom).*res(layer).dzdw;
net.layers{layer}.momentum{3}=opts.parameters.mom.*net.layers{layer}.momentum{3}+(1-opts.parameters.mom).*res(layer).dzdw.^2;
net.layers{layer}.weights{1}=net.layers{layer}.weights{1}-opts.parameters.lr*net.layers{layer}.momentum{1} ...
./(net.layers{layer}.momentum{3}.^0.5+opts.parameters.eps) .*mom_factor2^0.5./mom_factor ...
- opts.parameters.weightDecay * net.layers{layer}.weights{1};

net.layers{1,layer}.momentum{2}=opts.parameters.mom.*net.layers{1,layer}.momentum{2}+(1-opts.parameters.mom).*res(layer).dzdb;
net.layers{1,layer}.momentum{4}=opts.parameters.mom.*net.layers{1,layer}.momentum{4}+(1-opts.parameters.mom).*res(layer).dzdb.^2;
net.layers{1,layer}.weights{2}=net.layers{1,layer}.weights{2}-opts.parameters.lr*net.layers{1,layer}.momentum{2} ...
./(net.layers{1,layer}.momentum{4}.^0.5+opts.parameters.eps) .*mom_factor2^0.5./mom_factor;
net.layers{layer}.momentum{2}=opts.parameters.mom.*net.layers{layer}.momentum{2}+(1-opts.parameters.mom).*res(layer).dzdb;
net.layers{layer}.momentum{4}=opts.parameters.mom.*net.layers{layer}.momentum{4}+(1-opts.parameters.mom).*res(layer).dzdb.^2;
net.layers{layer}.weights{2}=net.layers{layer}.weights{2}-opts.parameters.lr*net.layers{layer}.momentum{2} ...
./(net.layers{layer}.momentum{4}.^0.5+opts.parameters.eps) .*mom_factor2^0.5./mom_factor;

end
end
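For comparison, the per-tensor step the loop above performs is the familiar bias-corrected Adam update, written here in textbook form with illustrative variable names. In the file both running moments are decayed with opts.parameters.mom, the two bias corrections (mom_factor and mom_factor2, built from mom and mom2) are folded into the single factor mom_factor2^0.5/mom_factor, and eps is added before the correction, so the code differs from this sketch only in those details.

% One Adam-style step for a single weight tensor w with gradient g.
% m, v are the running first/second moment estimates kept in momentum{...};
% t plays the role of net.iterations. Illustrative sketch, not a copy of the file.
m = beta1 .* m + (1 - beta1) .* g;        % first-moment (mean) estimate
v = beta2 .* v + (1 - beta2) .* g.^2;     % second-moment (uncentered variance) estimate
m_hat = m ./ (1 - beta1.^t);              % bias correction, cf. mom_factor
v_hat = v ./ (1 - beta2.^t);              % bias correction, cf. mom_factor2
w = w - lr .* m_hat ./ (sqrt(v_hat) + epsv) - wd .* w;   % update plus weight decay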
77 changes: 77 additions & 0 deletions CoreModules/optim/gradient_decorrelation.m
@@ -0,0 +1,77 @@
function [ net,res,opts ] = gradient_decorrelation( net,res,opts )
% Decorrelating gradient descents using second-order information.
% Ye, C., Yang, Y., Fermuller, C., & Aloimonos, Y. (2017).
% On the Importance of Consistency in Training Deep Neural Networks. arXiv preprint arXiv:1708.00631.

if ~isfield(opts.parameters,'lambda_sgd2')
opts.parameters.lambda_sgd2=1e0;
end
if ~isfield(opts.parameters,'large_matrix_inversion')
opts.parameters.large_matrix_inversion=0;
end
if ~isfield(opts.parameters,'max_inv_size')
opts.parameters.max_inv_size=500;
end
if ~isfield(opts.parameters,'decorr_bias')
opts.parameters.decorr_bias=1;
end

max_inv_size=opts.parameters.max_inv_size;
lambda=opts.parameters.lambda_sgd2;


for layer=1:numel(net.layers)
if isfield(net.layers{layer},'weights')&&~isempty(net.layers{layer}.weights)

dzdw=res(layer).dzdw;
dzdb=res(layer).dzdb;

if length(net.layers{layer}.weights)==2
x=res(layer).x;
batch_dim=length(size(x));%This assumes the batch size must be >1
if batch_dim==4%2d cnn
x=permute(x,[3,1,2,4]);x=reshape(x,size(x,1),[]);
dzdw=permute(dzdw,[1,2,4,3]);new_size=size(dzdw);dzdw=reshape(dzdw,prod(new_size(1:3)),new_size(4));
K=size(dzdw,1)/numel(dzdb);dzdb=repelem(dzdb(:),K,1);
end
if batch_dim==3%1d cnn
x=permute(x,[2,1,3]);x=reshape(x,size(x,1),[]);
dzdw=permute(dzdw,[1,3,2]);new_size=size(dzdw);dzdw=reshape(dzdw,prod(new_size(1:2)),new_size(3));
K=size(dzdw,1)/numel(dzdb);dzdb=repelem(dzdb(:),K,1);
end
subsample=1;batch_size=size(x,2);
if batch_size>1e4,subsample=ceil(min(50,batch_size/1e4));end
if subsample>1,x=x(:,1:subsample:end);end
if opts.parameters.decorr_bias==1
%insert bias
x=[ones(1,size(x,2),'like',x);x];
dzdw=[dzdb,dzdw];
end
if size(dzdw,2)<=max_inv_size %small scale inversion
dzdw=dzdw/(x*x'./size(x,2)+lambda*eye(size(x,1),'like',x));
elseif opts.parameters.large_matrix_inversion %divide large scale into smaller scale
order=randperm(size(dzdw,2));
for i=1:max_inv_size:length(order) %could have been parallelized
block_size=min(max_inv_size,length(order)-i+1);
idx=order(i:i+block_size-1);x_tmp=x(idx,:);
dzdw(:,idx)=dzdw(:,idx)/(x_tmp*x_tmp'./size(x_tmp,2)+lambda*eye(size(x_tmp,1),'like',x));
end
end
if opts.parameters.decorr_bias==1
dzdb=dzdw(:,1);dzdw(:,1)=[];
end
if batch_dim==4,dzdw=reshape(dzdw,new_size);dzdw=permute(dzdw,[1,2,4,3]);end
if batch_dim==3,dzdw=reshape(dzdw,new_size);dzdw=permute(dzdw,[1,3,2]);end
if batch_dim>2%for cnn:
%dzdb is decorrelated with dzdw, take average to smooth the results.
dzdb=reshape(mean(reshape(dzdb(:),K,[]),1),size(res(layer).dzdb));
end
res(layer).dzdw=dzdw;
res(layer).dzdb=dzdb;
end

end
end

end

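The heart of the new file is the right-division of the weight gradient by the damped, uncentered covariance of the layer input, which decorrelates the gradient across input dimensions before the usual optimizer step. A minimal self-contained sketch of this core step for a fully connected layer, with made-up sizes (the convolutional branches above only permute and reshape so that the same solve applies across input channels, and the bias is handled by prepending a row of ones when decorr_bias is set):

% Core preconditioning step of gradient_decorrelation.m, isolated for a
% fully connected layer. Sizes and data are illustrative.
d_in = 8; d_out = 4; N = 256;           % input dim, output dim, batch size
x    = randn(d_in, N);                  % layer input, one column per sample
dzdw = randn(d_out, d_in);              % loss gradient w.r.t. the weights
lambda = 1e0;                           % damping, cf. opts.parameters.lambda_sgd2

C = x * x' ./ N + lambda * eye(d_in);   % regularized uncentered input covariance
dzdw_decorr = dzdw / C;                 % the same right-division solve as in the file

% dzdw_decorr then replaces res(layer).dzdw and flows into the unchanged
% SGD/Adagrad/Adam/RMSProp update.

When the number of input dimensions exceeds max_inv_size, the file instead solves against random blocks of at most max_inv_size dimensions at a time, a block-diagonal approximation that trades exactness for tractability on large layers.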
49 changes: 27 additions & 22 deletions CoreModules/optim/rmsprop.m
@@ -1,24 +1,30 @@
function [ net,res,opts ] = rmsprop( net,res,opts )
%NET_APPLY_GRAD_SGD Summary of this function goes here
% Detailed explanation goes here
% Modified RMSProp using second-order information.
% 1.Tieleman, T. and Hinton, G. Lecture 6.5 - RMSProp, COURSERA: Neural Networks for Machine Learning.
% Technical report, 2012.
% 2.Ye, C., Yang, Y., Fermuller, C., & Aloimonos, Y. (2017).
% On the Importance of Consistency in Training Deep Neural Networks. arXiv preprint arXiv:1708.00631.

if ~isfield(opts.parameters,'second_order')
opts.parameters.second_order=0;
end
if opts.parameters.second_order
[ net,res,opts ] = gradient_decorrelation( net,res,opts );
end

if ~isfield(opts.parameters,'weightDecay')
opts.parameters.weightDecay=1e-4;
end


if ~isfield(opts,'results')||~isfield(opts.results,'lrs')
opts.results.lrs=[];%%not really necessary
if ~isfield(opts.parameters,'clip')
opts.parameters.clip=1e0;
end
opts.results.lrs=[opts.results.lrs;gather(opts.parameters.lr)];

if ~isfield(opts.parameters,'eps')
opts.parameters.eps=1e-6;
end

if ~isfield(opts.parameters,'clip')
opts.parameters.clip=1e0;
end

if ~isfield(net,'iterations')||(isfield(opts,'reset_mom')&&opts.reset_mom==1)
net.iterations=0;
end
@@ -28,29 +34,28 @@
mom_factor=(1-opts.parameters.mom.^net.iterations);

for layer=1:numel(net.layers)
if isfield(net.layers{1,layer},'weights')
if ~isfield(net.layers{1,layer},'momentum')||(isfield(opts,'reset_mom')&&opts.reset_mom==1)
net.layers{1,layer}.momentum{1}=zeros(size(net.layers{1,layer}.weights{1}),'like',net.layers{1,layer}.weights{1});
net.layers{1,layer}.momentum{2}=zeros(size(net.layers{1,layer}.weights{2}),'like',net.layers{1,layer}.weights{2});
if isfield(net.layers{layer},'weights')
if ~isfield(net.layers{layer},'momentum')||(isfield(opts,'reset_mom')&&opts.reset_mom==1)
net.layers{layer}.momentum{1}=zeros(size(net.layers{layer}.weights{1}),'like',net.layers{layer}.weights{1});
net.layers{layer}.momentum{2}=zeros(size(net.layers{layer}.weights{2}),'like',net.layers{layer}.weights{2});

end

net.layers{1,layer}.momentum{1}=opts.parameters.mom.*net.layers{1,layer}.momentum{1}+(1-opts.parameters.mom).*res(layer).dzdw.^2;

normalized_grad=res(layer).dzdw./(net.layers{1,layer}.momentum{1}.^0.5+opts.parameters.eps)./mom_factor;
if isfield(opts.parameters,'clip')
net.layers{layer}.momentum{1}=opts.parameters.mom.*net.layers{layer}.momentum{1}+(1-opts.parameters.mom).*(res(layer).dzdw.^2);
normalized_grad=res(layer).dzdw./(net.layers{layer}.momentum{1}.^0.5+opts.parameters.eps)./mom_factor;
if isfield(opts.parameters,'clip')&&opts.parameters.clip>0
mask=abs(normalized_grad)>opts.parameters.clip;
normalized_grad(mask)=sign(normalized_grad(mask)).*opts.parameters.clip;
end
net.layers{1,layer}.weights{1}=net.layers{1,layer}.weights{1}-opts.parameters.lr*normalized_grad- opts.parameters.weightDecay * net.layers{1,layer}.weights{1};
net.layers{layer}.weights{1}=net.layers{layer}.weights{1}-opts.parameters.lr*normalized_grad- opts.parameters.weightDecay * net.layers{layer}.weights{1};

net.layers{1,layer}.momentum{2}=opts.parameters.mom.*net.layers{1,layer}.momentum{2}+(1-opts.parameters.mom).*res(layer).dzdb.^2;
normalized_grad=res(layer).dzdb./(net.layers{1,layer}.momentum{2}.^0.5+opts.parameters.eps)./mom_factor;
if isfield(opts.parameters,'clip')
net.layers{layer}.momentum{2}=opts.parameters.mom.*net.layers{layer}.momentum{2}+(1-opts.parameters.mom).*(res(layer).dzdb.^2);
normalized_grad=res(layer).dzdb./(net.layers{layer}.momentum{2}.^0.5+opts.parameters.eps)./mom_factor;
if isfield(opts.parameters,'clip')&&opts.parameters.clip>0
mask=abs(normalized_grad)>opts.parameters.clip;
normalized_grad(mask)=sign(normalized_grad(mask)).*opts.parameters.clip;
end
net.layers{1,layer}.weights{2}=net.layers{1,layer}.weights{2}-opts.parameters.lr*normalized_grad;
net.layers{layer}.weights{2}=net.layers{layer}.weights{2}-opts.parameters.lr*normalized_grad;
end
end

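As with Adagrad and Adam, the RMSProp rule itself is untouched; with the new clip>0 guard, the per-tensor step the loop performs boils down to the following sketch (variable names are illustrative stand-ins for the opts.parameters fields, and the element-wise max/min form is equivalent to the sign-and-mask clipping in the file):

% One clipped RMSProp step for a single weight tensor w with gradient g.
v  = mom .* v + (1 - mom) .* g.^2;              % running mean of squared gradients
ng = g ./ (sqrt(v) + epsv) ./ (1 - mom.^t);     % normalized, bias-corrected gradient
ng = max(min(ng, clip), -clip);                 % clip large entries, as in the file
w  = w - lr .* ng - wd .* w;                    % update with weight decay (weights only)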
