added option for ReLU activations
unixpickle committed Apr 25, 2017
1 parent 8f28fdc commit 82d7261
Showing 6 changed files with 82 additions and 18 deletions.
53 changes: 53 additions & 0 deletions activation.go
@@ -0,0 +1,53 @@
package sgdstore

import (
	"github.com/unixpickle/anydiff"
	"github.com/unixpickle/anynet"
	"github.com/unixpickle/anyvec"
)

// Activation is an activation function.
type Activation int

// Supported activation functions.
const (
	Tanh Activation = iota
	ReLU
)

// Forward applies the activation function in the forward
// direction.
func (a Activation) Forward(in anydiff.Res) anydiff.Res {
	switch a {
	case Tanh:
		return anydiff.Tanh(in)
	case ReLU:
		return anydiff.ClipPos(in)
	}
	panic("unsupported activation")
}

// Backward applies backward propagation, given the output
// from the forward pass and the upstream vector.
func (a Activation) Backward(out, upstream anydiff.Res) anydiff.Res {
	switch a {
	case Tanh:
		return anydiff.Mul(anydiff.Complement(anydiff.Square(out)), upstream)
	case ReLU:
		mask := out.Output().Copy()
		anyvec.GreaterThan(mask, mask.Creator().MakeNumeric(0))
		return anydiff.Mul(upstream, anydiff.NewConst(mask))
	}
	panic("unsupported activation")
}

// Layer returns a compatible anynet.Layer.
func (a Activation) Layer() anynet.Layer {
	switch a {
	case Tanh:
		return anynet.Tanh
	case ReLU:
		return anynet.ReLU
	}
	panic("unsupported activation")
}
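Backward expresses each derivative in terms of the forward output, which is why it only needs out and upstream: tanh'(x) = 1 - tanh(x)^2, and the ReLU derivative is 1 exactly where the output is positive (the GreaterThan mask) and 0 elsewhere. A minimal plain-Go sanity check of those two identities, not part of this commit and using no anydiff types:

package main

import (
	"fmt"
	"math"
)

// backward mirrors Activation.Backward on plain slices: it scales each
// upstream value by the activation's derivative, written in terms of
// the forward output.
func backward(relu bool, out, upstream []float64) []float64 {
	grad := make([]float64, len(out))
	for i, o := range out {
		if relu {
			if o > 0 {
				grad[i] = upstream[i] // relu' = 1 where out > 0, else 0
			}
		} else {
			grad[i] = (1 - o*o) * upstream[i] // tanh' = 1 - out^2
		}
	}
	return grad
}

func main() {
	xs := []float64{-1.5, -0.2, 0.3, 2.0}
	upstream := []float64{1, 1, 1, 1}

	tanhOut := make([]float64, len(xs))
	reluOut := make([]float64, len(xs))
	for i, x := range xs {
		tanhOut[i] = math.Tanh(x)
		reluOut[i] = math.Max(0, x)
	}

	fmt.Println("tanh grads:", backward(false, tanhOut, upstream))
	fmt.Println("relu grads:", backward(true, reluOut, upstream))
}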
11 changes: 7 additions & 4 deletions block.go
@@ -20,6 +20,7 @@ func init() {
// Block is an RNN block that uses a Net as its memory.
type Block struct {
InitParams []*anydiff.Var
Activation Activation

// Gates which transform the input into various vectors
// used to train and query the current Net.
@@ -58,7 +59,7 @@ type Block struct {
// queryBatch * layerSizes[len(layerSizes)-1]
//
func LinearBlock(c anyvec.Creator, blockIn, trainBatch, queryBatch, numSteps int,
lrBias float64, layerSizes ...int) *Block {
lrBias float64, activation Activation, layerSizes ...int) *Block {
if len(layerSizes) < 2 {
panic("not enough layer sizes")
} else if trainBatch < 1 || queryBatch < 1 {
@@ -69,14 +70,15 @@ func LinearBlock(c anyvec.Creator, blockIn, trainBatch, queryBatch, numSteps int
TrainInput: anynet.NewFC(c, blockIn, trainBatch*layerSizes[0]),
TrainTarget: anynet.Net{
anynet.NewFC(c, blockIn, trainBatch*layerSizes[len(layerSizes)-1]),
anynet.Tanh,
activation.Layer(),
},
StepSize: anynet.Net{
anynet.NewFC(c, blockIn, 1).AddBias(c.MakeNumeric(math.Log(lrBias))),
anynet.Exp,
},
Query: anynet.NewFC(c, blockIn, queryBatch*layerSizes[0]),
Steps: numSteps,
Query: anynet.NewFC(c, blockIn, queryBatch*layerSizes[0]),
Steps: numSteps,
Activation: activation,
}

layerSize := layerSizes[0]
@@ -147,6 +149,7 @@ func (b *Block) Step(s anyrnn.State, in anyvec.Vector) anyrnn.Res {
net := &Net{
Parameters: anydiff.Fuse(poolReses...),
Num: n,
Activation: b.Activation,
}
batchSize := gateOuts[0].Output().Len() / (net.InSize() * n)
newNet := net.Train(gateOuts[0], gateOuts[1], gateOuts[2], batchSize, b.Steps)
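The experiment models below keep passing sgdstore.Tanh, preserving the old behavior. A minimal sketch of opting into ReLU with the new signature (not part of this commit; it assumes the usual github.com/unixpickle import paths, an anyvec32 creator, and an arbitrary sgdSteps of 3):

package main

import (
	"fmt"

	"github.com/unixpickle/anyvec/anyvec32"
	"github.com/unixpickle/sgdstore"
)

func main() {
	// Same shape as the experiment models below, but with the new
	// ReLU option in place of sgdstore.Tanh.
	c := anyvec32.CurrentCreator()
	block := sgdstore.LinearBlock(c, 384, 16, 2, 3, 0.2,
		sgdstore.ReLU, 32, 256, 32)
	fmt.Println("created block with", len(block.InitParams), "init params")
}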
3 changes: 2 additions & 1 deletion experiments/audioset/model.go
@@ -21,7 +21,8 @@ func learnerBlock(name string, sgdSteps, numFeatures, numOut int) anyrnn.Block {
inLayer,
anyrnn.NewVanilla(c, numFeatures+numOut, 384, anynet.Tanh),
anyrnn.NewVanilla(c, 384, 384, anynet.Tanh),
sgdstore.LinearBlock(c, 384, 16, 2, sgdSteps, 0.2, 32, 256, 32),
sgdstore.LinearBlock(c, 384, 16, 2, sgdSteps, 0.2,
sgdstore.Tanh, 32, 256, 32),
&anyrnn.LayerBlock{
Layer: anynet.Net{
anynet.NewFC(c, 64, 64),
6 changes: 4 additions & 2 deletions experiments/omniglot/model.go
@@ -20,7 +20,8 @@ func NewModel(name string, sgdSteps, outCount int) anyrnn.Block {
normInputLayer(c, outCount, numPixels),
anyrnn.NewVanilla(c, numPixels+outCount, 384, anynet.Tanh),
anyrnn.NewVanilla(c, 384, 384, anynet.Tanh),
sgdstore.LinearBlock(c, 384, 16, 2, sgdSteps, 0.2, 32, 256, 32),
sgdstore.LinearBlock(c, 384, 16, 2, sgdSteps, 0.2, sgdstore.Tanh,
32, 256, 32),
&anyrnn.LayerBlock{
Layer: anynet.Net{
anynet.NewFC(c, 64, 64),
@@ -37,7 +38,8 @@ func NewModel(name string, sgdSteps, outCount int) anyrnn.Block {
anyrnn.NewVanilla(c, 384, 384, anynet.Tanh),
&anyrnn.Parallel{
Block1: &anyrnn.LayerBlock{Layer: anynet.Net{}},
Block2: sgdstore.LinearBlock(c, 384, 16, 2, sgdSteps, 0.2, 32, 256, 32),
Block2: sgdstore.LinearBlock(c, 384, 16, 2, sgdSteps, 0.2,
sgdstore.Tanh, 32, 256, 32),
Mixer: &anynet.AddMixer{
In1: anynet.NewFC(c, 384, 64),
In2: anynet.NewFC(c, 64, 64),
3 changes: 2 additions & 1 deletion experiments/poly_approx/model.go
@@ -26,7 +26,8 @@ func NewModel(name string, sgdSteps int) anyrnn.Block {
anynet.Tanh,
},
},
sgdstore.LinearBlock(c, 32, 2, 2, sgdSteps, 1, 16, 32, 16),
sgdstore.LinearBlock(c, 32, 2, 2, sgdSteps, 1, sgdstore.Tanh,
16, 32, 16),
},
},
&anyrnn.LayerBlock{Layer: anynet.NewFC(c, 32, 1)},
24 changes: 14 additions & 10 deletions net.go
@@ -9,7 +9,7 @@ import (
// NetBatch is a batch of dynamic feed-forward multi-layer
// perceptrons.
//
// Each layer is implicitly followed by a tanh.
// Each layer is implicitly followed by an activation.
type Net struct {
// Parameters stores the weights and biases of the
// network.
@@ -23,6 +23,9 @@ type Net struct {

// Num is the number of networks in the batch.
Num int

// Activation is the activation function.
Activation Activation
}

// Apply applies the networks to a batch of input batches,
@@ -33,7 +36,7 @@ func (n *Net) Apply(inBatch anydiff.Res, batchSize int) anydiff.Res {
panic("mismatching bias and weight count")
}
for i := 0; i < len(params); i += 2 {
inBatch = applyLayer(params[i], params[i+1], inBatch, batchSize, n.Num)
inBatch = n.applyLayer(params[i], params[i+1], inBatch, batchSize, n.Num)
}
return inBatch
})
@@ -78,7 +81,7 @@ func (n *Net) Train(inBatch, target, stepSize anydiff.Res, batchSize,
// caller.
func (n *Net) step(inBatch, target, stepSize anydiff.Res, batchSize int) *Net {
newParams := anydiff.PoolMulti(n.Parameters, func(params []anydiff.Res) anydiff.MultiRes {
grad := applyBackprop(params, inBatch, target, batchSize, n.Num)
grad := n.applyBackprop(params, inBatch, target, batchSize, n.Num)
return anydiff.PoolMulti(grad, func(grads []anydiff.Res) anydiff.MultiRes {
var newParams []anydiff.Res
for i, g := range grads[1:] {
@@ -97,17 +100,18 @@ func (n *Net) step(inBatch, target, stepSize anydiff.Res, batchSize int) *Net {
}
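
The loop over grads[1:] above pairs each parameter with its gradient (grads[0] is the input gradient, per applyBackprop's doc comment), and the update hidden in the collapsed region is conceptually one step of gradient descent scaled by the learned step size. A rough plain-Go sketch of that rule, with a hypothetical helper and no anydiff types:

package sgdsketch

// sgdStep moves each parameter against its gradient by stepSize.
func sgdStep(params, grads []float64, stepSize float64) []float64 {
	updated := make([]float64, len(params))
	for i, p := range params {
		updated[i] = p - stepSize*grads[i]
	}
	return updated
}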

// applyLayer applies a single layer.
func applyLayer(weights, biases, inBatch anydiff.Res, batchSize, numNets int) anydiff.Res {
func (n *Net) applyLayer(weights, biases, inBatch anydiff.Res, batchSize,
numNets int) anydiff.Res {
inMat, weightMat := layerMats(weights, biases, inBatch, batchSize, numNets)
inBatch = anydiff.BatchedMatMul(false, true, inMat, weightMat).Data
return anydiff.Tanh(batchedAddRepeated(inBatch, biases, numNets))
return n.Activation.Forward(batchedAddRepeated(inBatch, biases, numNets))
}
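
Ignoring the batching handled by BatchedMatMul and batchedAddRepeated, each layer computes activation(x · Wᵀ + b), with one weight row per output unit. A single-network, plain-float64 sketch of that computation (a hypothetical helper, not part of the package):

package layersketch

// denseLayer computes act(x·Wᵀ + b) for one network:
// out[j] = act(sum_k x[k]*weights[j][k] + bias[j]).
func denseLayer(x []float64, weights [][]float64, bias []float64,
	act func(float64) float64) []float64 {
	out := make([]float64, len(weights))
	for j, row := range weights {
		sum := bias[j]
		for k, w := range row {
			sum += x[k] * w
		}
		out[j] = act(sum)
	}
	return out
}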

// applyBackprop applies the networks and performs
// backward-propagation.
// The result is [inGrad, param1Grad, param2Grad, ...].
// The caller should pool the input parameters.
func applyBackprop(params []anydiff.Res, in, target anydiff.Res,
func (n *Net) applyBackprop(params []anydiff.Res, in, target anydiff.Res,
batchSize, numNets int) anydiff.MultiRes {
if len(params) == 0 {
scaler := target.Output().Creator().MakeNumeric(
@@ -122,13 +126,13 @@ func applyBackprop(params []anydiff.Res, in, target anydiff.Res,
inMat, weightMat := layerMats(params[0], params[1], in, batchSize, numNets)
matOut := anydiff.BatchedMatMul(false, true, inMat, weightMat).Data
biasOut := batchedAddRepeated(matOut, params[1], numNets)
tanhOut := anydiff.Tanh(biasOut)
return anydiff.PoolFork(tanhOut, func(tanhOut anydiff.Res) anydiff.MultiRes {
nextOut := applyBackprop(params[2:], tanhOut, target, batchSize, numNets)
actOut := n.Activation.Forward(biasOut)
return anydiff.PoolFork(actOut, func(actOut anydiff.Res) anydiff.MultiRes {
nextOut := n.applyBackprop(params[2:], actOut, target, batchSize, numNets)
return anydiff.PoolMulti(nextOut, func(x []anydiff.Res) anydiff.MultiRes {
outGrad := x[0]
laterGrads := x[1:]
pg := anydiff.Mul(anydiff.Complement(anydiff.Square(tanhOut)), outGrad)
pg := n.Activation.Backward(actOut, outGrad)
return anydiff.PoolFork(pg, func(pg anydiff.Res) anydiff.MultiRes {
productGrad := &anydiff.MatrixBatch{
Data: pg,
