Gluon PReLU, ELU, SELU, Swish (apache#9662)
* prelu, elu, selu, swish

* update

* fix infer shape

* update infer shape

* update
szha authored and zheng-da committed Jun 28, 2018
1 parent 559ccff commit 547c1d0
Showing 8 changed files with 285 additions and 84 deletions.
3 changes: 2 additions & 1 deletion cpp-package/example/mlp.cpp
@@ -61,14 +61,15 @@ void MLP() {
vector<Symbol> biases(nLayers);
vector<Symbol> outputs(nLayers);

Symbol null_sym;
for (int i = 0; i < nLayers; i++) {
string istr = to_string(i);
weights[i] = Symbol::Variable(string("w") + istr);
biases[i] = Symbol::Variable(string("b") + istr);
Symbol fc = FullyConnected(string("fc") + istr,
i == 0? sym_x : outputs[i-1],
weights[i], biases[i], layerSizes[i]);
outputs[i] = LeakyReLU(string("act") + istr, fc, LeakyReLUActType::kLeaky);
outputs[i] = LeakyReLU(string("act") + istr, fc, null_sym, LeakyReLUActType::kLeaky);
}
auto sym_out = SoftmaxOutput("softmax", outputs[nLayers - 1], sym_label);

2 changes: 2 additions & 0 deletions python/mxnet/gluon/nn/__init__.py
@@ -24,3 +24,5 @@
from .basic_layers import *

from .conv_layers import *

from .activations import *
209 changes: 209 additions & 0 deletions python/mxnet/gluon/nn/activations.py
@@ -0,0 +1,209 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# coding: utf-8
# pylint: disable= arguments-differ
"""Basic neural network layers."""
__all__ = ['Activation', 'LeakyReLU', 'PReLU', 'ELU', 'SELU', 'Swish']

from ... import initializer
from ..block import HybridBlock


class Activation(HybridBlock):
r"""Applies an activation function to input.
Parameters
----------
activation : str
Name of activation function to use.
See :func:`~mxnet.ndarray.Activation` for available choices.
Inputs:
- **data**: input tensor with arbitrary shape.
Outputs:
- **out**: output tensor with the same shape as `data`.
"""
def __init__(self, activation, **kwargs):
self._act_type = activation
super(Activation, self).__init__(**kwargs)

def _alias(self):
return self._act_type

def hybrid_forward(self, F, x):
return F.Activation(x, act_type=self._act_type, name='fwd')

def __repr__(self):
s = '{name}({_act_type})'
return s.format(name=self.__class__.__name__,
**self.__dict__)


class LeakyReLU(HybridBlock):
r"""Leaky version of a Rectified Linear Unit.
It allows a small gradient when the unit is not active
.. math::
f\left(x\right) = \left\{
\begin{array}{lr}
\alpha x & : x \lt 0 \\
x & : x \geq 0 \\
\end{array}
\right.\\
Parameters
----------
alpha : float
slope coefficient for the negative half axis. Must be >= 0.
Inputs:
- **data**: input tensor with arbitrary shape.
Outputs:
- **out**: output tensor with the same shape as `data`.
"""
def __init__(self, alpha, **kwargs):
assert alpha >= 0, "Slope coefficient for LeakyReLU must be no less than 0."
super(LeakyReLU, self).__init__(**kwargs)
self._alpha = alpha

def hybrid_forward(self, F, x):
return F.LeakyReLU(x, act_type='leaky', slope=self._alpha, name='fwd')

def __repr__(self):
s = '{name}({alpha})'
return s.format(name=self.__class__.__name__,
alpha=self._alpha)


class PReLU(HybridBlock):
r"""Parametric leaky version of a Rectified Linear Unit.
From the `"Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification" <https://arxiv.org/abs/1502.01852>`_ paper.
It learns a slope coefficient for the negative half axis when the unit is not active
.. math::
f\left(x\right) = \left\{
\begin{array}{lr}
\alpha x & : x \lt 0 \\
x & : x \geq 0 \\
\end{array}
\right.\\
where alpha is a learned parameter.
Parameters
----------
alpha_initializer : Initializer
Initializer for the learnable `alpha` parameter.
Inputs:
- **data**: input tensor with arbitrary shape.
Outputs:
- **out**: output tensor with the same shape as `data`.
"""
def __init__(self, alpha_initializer=initializer.Constant(0.25), **kwargs):
super(PReLU, self).__init__(**kwargs)
with self.name_scope():
self.alpha = self.params.get('alpha', shape=(1,), init=alpha_initializer)

def hybrid_forward(self, F, x, alpha):
return F.LeakyReLU(x, gamma=alpha, act_type='prelu', name='fwd')


class ELU(HybridBlock):
r"""
Exponential Linear Unit (ELU)
"Fast and Accurate Deep Network Learning by Exponential Linear Units", Clevert et al, 2016
https://arxiv.org/abs/1511.07289
Published as a conference paper at ICLR 2016
Parameters
----------
alpha : float
The alpha parameter, as described by Clevert et al., 2016
Inputs:
- **data**: input tensor with arbitrary shape.
Outputs:
- **out**: output tensor with the same shape as `data`.
"""
def __init__(self, alpha=1.0, **kwargs):
super(ELU, self).__init__(**kwargs)
self._alpha = alpha

def hybrid_forward(self, F, x):
return F.where(x > 0, x, self._alpha * (F.exp(x) - 1.0))


class SELU(HybridBlock):
r"""
Scaled Exponential Linear Unit (SELU)
"Self-Normalizing Neural Networks", Klambauer et al, 2017
https://arxiv.org/abs/1706.02515
Inputs:
- **data**: input tensor with arbitrary shape.
Outputs:
- **out**: output tensor with the same shape as `data`.
"""
def __init__(self, **kwargs):
super(SELU, self).__init__(**kwargs)
self._scale = 1.0507009873554804934193349852946
self._alpha = 1.6732632423543772848170429916717

def hybrid_forward(self, F, x):
return self._scale * F.where(x > 0, x, self._alpha * (F.exp(x) - 1.0))


class Swish(HybridBlock):
r"""
Swish activation function (Ramachandran et al., 2017)
https://arxiv.org/pdf/1710.05941.pdf
Parameters
----------
beta : float
The beta parameter in swish(x) = x * sigmoid(beta*x).
Inputs:
- **data**: input tensor with arbitrary shape.
Outputs:
- **out**: output tensor with the same shape as `data`.
"""

def __init__(self, beta=1.0, **kwargs):
super(Swish, self).__init__(**kwargs)
self._beta = beta

def hybrid_forward(self, F, x):
return x * F.sigmoid(self._beta * x, name='fwd')
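
For reference, here is a minimal usage sketch of the new blocks (not part of this commit; it assumes an MXNet build that already includes these changes):

import mxnet as mx
from mxnet.gluon import nn

net = nn.HybridSequential()
with net.name_scope():
    net.add(nn.Dense(16))
    net.add(nn.PReLU())          # learnable scalar alpha, initialized to 0.25
    net.add(nn.Dense(16))
    net.add(nn.ELU(alpha=1.0))   # alpha * (exp(x) - 1) for x < 0
    net.add(nn.Dense(16))
    net.add(nn.SELU())           # fixed scale and alpha from Klambauer et al.
    net.add(nn.Dense(1))
    net.add(nn.Swish(beta=1.0))  # x * sigmoid(beta * x)
net.initialize()
net.hybridize()                  # every block above is a HybridBlock
out = net(mx.nd.random.uniform(shape=(4, 8)))
print(out.shape)                 # (4, 1)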
78 changes: 3 additions & 75 deletions python/mxnet/gluon/nn/basic_layers.py
@@ -18,12 +18,12 @@
# coding: utf-8
# pylint: disable= arguments-differ
"""Basic neural network layers."""
__all__ = ['Sequential', 'HybridSequential', 'Dense', 'Activation',
'Dropout', 'BatchNorm', 'InstanceNorm', 'LeakyReLU', 'Embedding',
'Flatten', 'Lambda', 'HybridLambda']
__all__ = ['Sequential', 'HybridSequential', 'Dense', 'Dropout', 'Embedding',
'BatchNorm', 'InstanceNorm', 'Flatten', 'Lambda', 'HybridLambda']
import warnings
import numpy as np

from .activations import Activation
from ..block import Block, HybridBlock
from ..utils import _indent
from ... import nd, sym
@@ -216,38 +216,6 @@ def __repr__(self):
layout='{0} -> {1}'.format(shape[1] if shape[1] else None, shape[0]))


class Activation(HybridBlock):
r"""Applies an activation function to input.
Parameters
----------
activation : str
Name of activation function to use.
See :func:`~mxnet.ndarray.Activation` for available choices.
Inputs:
- **data**: input tensor with arbitrary shape.
Outputs:
- **out**: output tensor with the same shape as `data`.
"""
def __init__(self, activation, **kwargs):
self._act_type = activation
super(Activation, self).__init__(**kwargs)

def _alias(self):
return self._act_type

def hybrid_forward(self, F, x):
return F.Activation(x, act_type=self._act_type, name='fwd')

def __repr__(self):
s = '{name}({_act_type})'
return s.format(name=self.__class__.__name__,
**self.__dict__)


class Dropout(HybridBlock):
"""Applies Dropout to the input.
@@ -380,46 +348,6 @@ def __repr__(self):
for k, v in self._kwargs.items()]))


class LeakyReLU(HybridBlock):
r"""Leaky version of a Rectified Linear Unit.
It allows a small gradient when the unit is not active
.. math::
f\left(x\right) = \left\{
\begin{array}{lr}
\alpha x & : x \lt 0 \\
x & : x \geq 0 \\
\end{array}
\right.\\
Parameters
----------
alpha : float
slope coefficient for the negative half axis. Must be >= 0.
Inputs:
- **data**: input tensor with arbitrary shape.
Outputs:
- **out**: output tensor with the same shape as `data`.
"""
def __init__(self, alpha, **kwargs):
assert alpha >= 0, "Slope coefficient for LeakyReLU must be no less than 0."
super(LeakyReLU, self).__init__(**kwargs)
self._alpha = alpha

def hybrid_forward(self, F, x):
return F.LeakyReLU(x, act_type='leaky', slope=self._alpha, name='fwd')

def __repr__(self):
s = '{name}({alpha})'
return s.format(name=self.__class__.__name__,
alpha=self._alpha)


class Embedding(HybridBlock):
r"""Turns non-negative integers (indexes/tokens) into dense vectors
of fixed size. eg. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]
2 changes: 1 addition & 1 deletion python/mxnet/gluon/nn/conv_layers.py
@@ -29,7 +29,7 @@
from ..block import HybridBlock
from ... import symbol
from ...base import numeric_types
from .basic_layers import Activation
from .activations import Activation


def _infer_weight_shape(op_name, data_shape, kwargs):
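
Since nn/__init__.py now pulls in activations (see above), moving Activation and LeakyReLU out of basic_layers.py should be invisible to user code. A quick sketch of the unchanged public import path (illustrative only, not part of the diff):

import mxnet as mx
from mxnet.gluon import nn

# Both blocks still resolve through the public mxnet.gluon.nn namespace,
# even though their definitions now live in nn/activations.py.
act = nn.Activation('relu')
lrelu = nn.LeakyReLU(alpha=0.1)
x = mx.nd.array([[-2.0, 0.0, 3.0]])
print(act(x))    # values: [[0. 0. 3.]]
print(lrelu(x))  # values: [[-0.2  0.  3.]]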
33 changes: 27 additions & 6 deletions src/operator/leaky_relu-inl.h
@@ -111,8 +111,13 @@ class LeakyReLUOp : public Operator {
}
case leakyrelu::kPReLU: {
weight = in_data[leakyrelu::kGamma].get<xpu, 1, real_t>(s);
Assign(out, req[leakyrelu::kOut],
F<mshadow_op::xelu>(data, mshadow::expr::broadcast<1>(weight, out.shape_)));
if (weight.shape_.Size() == 1) {
Assign(out, req[leakyrelu::kOut],
F<mshadow_op::xelu>(data, mshadow::expr::broadcast_scalar(weight, out.shape_)));
} else {
Assign(out, req[leakyrelu::kOut],
F<mshadow_op::xelu>(data, mshadow::expr::broadcast<1>(weight, out.shape_)));
}
break;
}
case leakyrelu::kRReLU: {
@@ -177,9 +182,21 @@
case leakyrelu::kPReLU: {
weight = in_data[leakyrelu::kGamma].get<xpu, 1, real_t>(s);
grad_weight = in_grad[leakyrelu::kGamma].get<xpu, 1, real_t>(s);
grad_weight = sumall_except_dim<1>(F<prelu_grad>(data) * grad);
gdata = F<mshadow_op::xelu_grad>(data, mshadow::expr::broadcast<1>(weight, data.shape_))
* grad;
if (weight.shape_.Size() == 1) {
Shape<4> gshape = Shape4(1, grad.shape_[0], grad.shape_[1], grad.shape_[2]);
Assign(grad_weight, req[leakyrelu::kGamma],
sumall_except_dim<0>(reshape(F<prelu_grad>(data) * grad, gshape)));
Assign(gdata, req[leakyrelu::kData],
F<mshadow_op::xelu_grad>(data,
mshadow::expr::broadcast_scalar(weight, data.shape_))
* grad);
} else {
Assign(grad_weight, req[leakyrelu::kGamma],
sumall_except_dim<1>(F<prelu_grad>(data) * grad));
Assign(gdata, req[leakyrelu::kData],
F<mshadow_op::xelu_grad>(data, mshadow::expr::broadcast<1>(weight, data.shape_))
* grad);
}
break;
}
case leakyrelu::kRReLU: {
@@ -225,7 +242,11 @@ class LeakyReLUProp : public OperatorProperty {
const TShape &dshape = in_shape->at(leakyrelu::kData);
if (dshape.ndim() == 0) return false;
if (param_.act_type == leakyrelu::kPReLU) {
in_shape->at(leakyrelu::kGamma) = TShape(Shape1(dshape[1]));
const TShape &gshape = in_shape->at(leakyrelu::kGamma);
if (gshape.ndim() == 1 && gshape.Size() == 1)
in_shape->at(leakyrelu::kGamma) = TShape(Shape1(1));
else
in_shape->at(leakyrelu::kGamma) = TShape(Shape1(dshape[1]));
}
out_shape->clear();
out_shape->push_back(dshape);
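
With this change, the prelu mode of LeakyReLU accepts either the usual per-channel gamma of shape (num_channels,) or a scalar gamma of shape (1,), which is broadcast over the whole input; the scalar form is what the new gluon.nn.PReLU block uses. A small NDArray-level sketch of both shapes (illustrative only; assumes a build containing this change):

import mxnet as mx

x = mx.nd.random.uniform(-1, 1, shape=(2, 3, 4, 4))

# Per-channel gamma: one learned slope per channel (axis 1), as before.
y_channel = mx.nd.LeakyReLU(data=x, gamma=mx.nd.full((3,), 0.25), act_type='prelu')

# Scalar gamma: a single learned slope broadcast over every element (new path).
y_scalar = mx.nd.LeakyReLU(data=x, gamma=mx.nd.array([0.25]), act_type='prelu')

print(y_channel.shape, y_scalar.shape)  # both (2, 3, 4, 4)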
