added option for ReLU activations
unixpickle committed Apr 25, 2017
1 parent 8f28fdc commit 82d7261
Showing 6 changed files with 82 additions and 18 deletions.
53 changes: 53 additions & 0 deletions activation.go
@@ -0,0 +1,53 @@
package sgdstore

import (
	"github.com/unixpickle/anydiff"
	"github.com/unixpickle/anynet"
	"github.com/unixpickle/anyvec"
)

// Activation is an activation function.
type Activation int

// Supported activation functions.
const (
	Tanh Activation = iota
	ReLU
)

// Forward applies the activation function in the forward
// direction.
func (a Activation) Forward(in anydiff.Res) anydiff.Res {
	switch a {
	case Tanh:
		return anydiff.Tanh(in)
	case ReLU:
		return anydiff.ClipPos(in)
	}
	panic("unsupported activation")
}

// Backward applies backward propagation, given the output
// from the forward pass and the upstream vector.
func (a Activation) Backward(out, upstream anydiff.Res) anydiff.Res {
	switch a {
	case Tanh:
		return anydiff.Mul(anydiff.Complement(anydiff.Square(out)), upstream)
	case ReLU:
		mask := out.Output().Copy()
		anyvec.GreaterThan(mask, mask.Creator().MakeNumeric(0))
		return anydiff.Mul(upstream, anydiff.NewConst(mask))
	}
	panic("unsupported activation")
}

// Layer returns a compatible anynet.Layer.
func (a Activation) Layer() anynet.Layer {
	switch a {
	case Tanh:
		return anynet.Tanh
	case ReLU:
		return anynet.ReLU
	}
	panic("unsupported activation")
}
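Backward expresses each derivative in terms of the forward output, which is why it only needs out and upstream: tanh'(x) = 1 - tanh(x)^2, and the ReLU derivative is 1 exactly where the output is positive (the GreaterThan mask) and 0 elsewhere. A minimal plain-Go sanity check of those two identities, not part of this commit and using no anydiff types:

package main

import (
	"fmt"
	"math"
)

// backward mirrors Activation.Backward on plain slices: it scales each
// upstream value by the activation's derivative, written in terms of
// the forward output.
func backward(relu bool, out, upstream []float64) []float64 {
	grad := make([]float64, len(out))
	for i, o := range out {
		if relu {
			if o > 0 {
				grad[i] = upstream[i] // relu' = 1 where out > 0, else 0
			}
		} else {
			grad[i] = (1 - o*o) * upstream[i] // tanh' = 1 - out^2
		}
	}
	return grad
}

func main() {
	xs := []float64{-1.5, -0.2, 0.3, 2.0}
	upstream := []float64{1, 1, 1, 1}

	tanhOut := make([]float64, len(xs))
	reluOut := make([]float64, len(xs))
	for i, x := range xs {
		tanhOut[i] = math.Tanh(x)
		reluOut[i] = math.Max(0, x)
	}

	fmt.Println("tanh grads:", backward(false, tanhOut, upstream))
	fmt.Println("relu grads:", backward(true, reluOut, upstream))
}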
11 changes: 7 additions & 4 deletions block.go
@@ -20,6 +20,7 @@ func init() {
// Block is an RNN block that uses a Net as its memory.
type Block struct {
InitParams []*anydiff.Var
Activation Activation

// Gates which transform the input into various vectors
// used to train and query the current Net.
@@ -58,7 +59,7 @@ type Block struct {
// queryBatch * layerSizes[len(layerSizes)-1]
//
func LinearBlock(c anyvec.Creator, blockIn, trainBatch, queryBatch, numSteps int,
lrBias float64, layerSizes ...int) *Block {
lrBias float64, activation Activation, layerSizes ...int) *Block {
if len(layerSizes) < 2 {
panic("not enough layer sizes")
} else if trainBatch < 1 || queryBatch < 1 {
@@ -69,14 +70,15 @@ func LinearBlock(c anyvec.Creator, blockIn, trainBatch, queryBatch, numSteps int
TrainInput: anynet.NewFC(c, blockIn, trainBatch*layerSizes[0]),
TrainTarget: anynet.Net{
anynet.NewFC(c, blockIn, trainBatch*layerSizes[len(layerSizes)-1]),
anynet.Tanh,
activation.Layer(),
},
StepSize: anynet.Net{
anynet.NewFC(c, blockIn, 1).AddBias(c.MakeNumeric(math.Log(lrBias))),
anynet.Exp,
},
Query: anynet.NewFC(c, blockIn, queryBatch*layerSizes[0]),
Steps: numSteps,
Query: anynet.NewFC(c, blockIn, queryBatch*layerSizes[0]),
Steps: numSteps,
Activation: activation,
}

layerSize := layerSizes[0]
@@ -147,6 +149,7 @@ func (b *Block) Step(s anyrnn.State, in anyvec.Vector) anyrnn.Res {
net := &Net{
Parameters: anydiff.Fuse(poolReses...),
Num: n,
Activation: b.Activation,
}
batchSize := gateOuts[0].Output().Len() / (net.InSize() * n)
newNet := net.Train(gateOuts[0], gateOuts[1], gateOuts[2], batchSize, b.Steps)
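The experiment models below keep passing sgdstore.Tanh, preserving the old behavior. A minimal sketch of opting into ReLU with the new signature (not part of this commit; it assumes the usual github.com/unixpickle import paths, an anyvec32 creator, and an arbitrary sgdSteps of 3):

package main

import (
	"fmt"

	"github.com/unixpickle/anyvec/anyvec32"
	"github.com/unixpickle/sgdstore"
)

func main() {
	// Same shape as the experiment models below, but with the new
	// ReLU option in place of sgdstore.Tanh.
	c := anyvec32.CurrentCreator()
	block := sgdstore.LinearBlock(c, 384, 16, 2, 3, 0.2,
		sgdstore.ReLU, 32, 256, 32)
	fmt.Println("created block with", len(block.InitParams), "init params")
}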
3 changes: 2 additions & 1 deletion experiments/audioset/model.go
@@ -21,7 +21,8 @@ func learnerBlock(name string, sgdSteps, numFeatures, numOut int) anyrnn.Block {
inLayer,
anyrnn.NewVanilla(c, numFeatures+numOut, 384, anynet.Tanh),
anyrnn.NewVanilla(c, 384, 384, anynet.Tanh),
sgdstore.LinearBlock(c, 384, 16, 2, sgdSteps, 0.2, 32, 256, 32),
sgdstore.LinearBlock(c, 384, 16, 2, sgdSteps, 0.2,
sgdstore.Tanh, 32, 256, 32),
&anyrnn.LayerBlock{
Layer: anynet.Net{
anynet.NewFC(c, 64, 64),
6 changes: 4 additions & 2 deletions experiments/omniglot/model.go
@@ -20,7 +20,8 @@ func NewModel(name string, sgdSteps, outCount int) anyrnn.Block {
normInputLayer(c, outCount, numPixels),
anyrnn.NewVanilla(c, numPixels+outCount, 384, anynet.Tanh),
anyrnn.NewVanilla(c, 384, 384, anynet.Tanh),
sgdstore.LinearBlock(c, 384, 16, 2, sgdSteps, 0.2, 32, 256, 32),
sgdstore.LinearBlock(c, 384, 16, 2, sgdSteps, 0.2, sgdstore.Tanh,
32, 256, 32),
&anyrnn.LayerBlock{
Layer: anynet.Net{
anynet.NewFC(c, 64, 64),
@@ -37,7 +38,8 @@ func NewModel(name string, sgdSteps, outCount int) anyrnn.Block {
anyrnn.NewVanilla(c, 384, 384, anynet.Tanh),
&anyrnn.Parallel{
Block1: &anyrnn.LayerBlock{Layer: anynet.Net{}},
Block2: sgdstore.LinearBlock(c, 384, 16, 2, sgdSteps, 0.2, 32, 256, 32),
Block2: sgdstore.LinearBlock(c, 384, 16, 2, sgdSteps, 0.2,
sgdstore.Tanh, 32, 256, 32),
Mixer: &anynet.AddMixer{
In1: anynet.NewFC(c, 384, 64),
In2: anynet.NewFC(c, 64, 64),
3 changes: 2 additions & 1 deletion experiments/poly_approx/model.go
@@ -26,7 +26,8 @@ func NewModel(name string, sgdSteps int) anyrnn.Block {
anynet.Tanh,
},
},
sgdstore.LinearBlock(c, 32, 2, 2, sgdSteps, 1, 16, 32, 16),
sgdstore.LinearBlock(c, 32, 2, 2, sgdSteps, 1, sgdstore.Tanh,
16, 32, 16),
},
},
&anyrnn.LayerBlock{Layer: anynet.NewFC(c, 32, 1)},
24 changes: 14 additions & 10 deletions net.go
@@ -9,7 +9,7 @@ import (
// NetBatch is a batch of dynamic feed-forward multi-layer
// perceptrons.
//
// Each layer is implicitly followed by a tanh.
// Each layer is implicitly followed by an activation.
type Net struct {
// Parameters stores the weights and biases of the
// network.
@@ -23,6 +23,9 @@ type Net struct {

// Num is the number of networks in the batch.
Num int

// Activation is the activation function.
Activation Activation
}

// Apply applies the networks to a batch of input batches,
@@ -33,7 +36,7 @@ func (n *Net) Apply(inBatch anydiff.Res, batchSize int) anydiff.Res {
panic("mismatching bias and weight count")
}
for i := 0; i < len(params); i += 2 {
inBatch = applyLayer(params[i], params[i+1], inBatch, batchSize, n.Num)
inBatch = n.applyLayer(params[i], params[i+1], inBatch, batchSize, n.Num)
}
return inBatch
})
@@ -78,7 +81,7 @@ func (n *Net) Train(inBatch, target, stepSize anydiff.Res, batchSize,
// caller.
func (n *Net) step(inBatch, target, stepSize anydiff.Res, batchSize int) *Net {
newParams := anydiff.PoolMulti(n.Parameters, func(params []anydiff.Res) anydiff.MultiRes {
grad := applyBackprop(params, inBatch, target, batchSize, n.Num)
grad := n.applyBackprop(params, inBatch, target, batchSize, n.Num)
return anydiff.PoolMulti(grad, func(grads []anydiff.Res) anydiff.MultiRes {
var newParams []anydiff.Res
for i, g := range grads[1:] {
@@ -97,17 +100,18 @@ func (n *Net) step(inBatch, target, stepSize anydiff.Res, batchSize int) *Net {
}
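
The loop over grads[1:] above pairs each parameter with its gradient (grads[0] is the input gradient, per applyBackprop's doc comment), and the update hidden in the collapsed region is conceptually one step of gradient descent scaled by the learned step size. A rough plain-Go sketch of that rule, with a hypothetical helper and no anydiff types:

package sgdsketch

// sgdStep moves each parameter against its gradient by stepSize.
func sgdStep(params, grads []float64, stepSize float64) []float64 {
	updated := make([]float64, len(params))
	for i, p := range params {
		updated[i] = p - stepSize*grads[i]
	}
	return updated
}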

// applyLayer applies a single layer.
func applyLayer(weights, biases, inBatch anydiff.Res, batchSize, numNets int) anydiff.Res {
func (n *Net) applyLayer(weights, biases, inBatch anydiff.Res, batchSize,
numNets int) anydiff.Res {
inMat, weightMat := layerMats(weights, biases, inBatch, batchSize, numNets)
inBatch = anydiff.BatchedMatMul(false, true, inMat, weightMat).Data
return anydiff.Tanh(batchedAddRepeated(inBatch, biases, numNets))
return n.Activation.Forward(batchedAddRepeated(inBatch, biases, numNets))
}
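
Ignoring the batching handled by BatchedMatMul and batchedAddRepeated, each layer computes activation(x · Wᵀ + b), with one weight row per output unit. A single-network, plain-float64 sketch of that computation (a hypothetical helper, not part of the package):

package layersketch

// denseLayer computes act(x·Wᵀ + b) for one network:
// out[j] = act(sum_k x[k]*weights[j][k] + bias[j]).
func denseLayer(x []float64, weights [][]float64, bias []float64,
	act func(float64) float64) []float64 {
	out := make([]float64, len(weights))
	for j, row := range weights {
		sum := bias[j]
		for k, w := range row {
			sum += x[k] * w
		}
		out[j] = act(sum)
	}
	return out
}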

// applyBackprop applies the networks and performs
// backward-propagation.
// The result is [inGrad, param1Grad, param2Grad, ...].
// The caller should pool the input parameters.
func applyBackprop(params []anydiff.Res, in, target anydiff.Res,
func (n *Net) applyBackprop(params []anydiff.Res, in, target anydiff.Res,
batchSize, numNets int) anydiff.MultiRes {
if len(params) == 0 {
scaler := target.Output().Creator().MakeNumeric(
@@ -122,13 +126,13 @@ func applyBackprop(params []anydiff.Res, in, target anydiff.Res,
inMat, weightMat := layerMats(params[0], params[1], in, batchSize, numNets)
matOut := anydiff.BatchedMatMul(false, true, inMat, weightMat).Data
biasOut := batchedAddRepeated(matOut, params[1], numNets)
tanhOut := anydiff.Tanh(biasOut)
return anydiff.PoolFork(tanhOut, func(tanhOut anydiff.Res) anydiff.MultiRes {
nextOut := applyBackprop(params[2:], tanhOut, target, batchSize, numNets)
actOut := n.Activation.Forward(biasOut)
return anydiff.PoolFork(actOut, func(actOut anydiff.Res) anydiff.MultiRes {
nextOut := n.applyBackprop(params[2:], actOut, target, batchSize, numNets)
return anydiff.PoolMulti(nextOut, func(x []anydiff.Res) anydiff.MultiRes {
outGrad := x[0]
laterGrads := x[1:]
pg := anydiff.Mul(anydiff.Complement(anydiff.Square(tanhOut)), outGrad)
pg := n.Activation.Backward(actOut, outGrad)
return anydiff.PoolFork(pg, func(pg anydiff.Res) anydiff.MultiRes {
productGrad := &anydiff.MatrixBatch{
Data: pg,
