diff --git a/example/tutorial_imdb_fasttext.py b/example/tutorial_imdb_fasttext.py
index 250c1cb5c..e4a009113 100644
--- a/example/tutorial_imdb_fasttext.py
+++ b/example/tutorial_imdb_fasttext.py
@@ -71,7 +71,7 @@ def __init__(self, vocab_size, embedding_size, n_labels):
             tf.int32, shape=[None], name='labels')
 
         # Network structure
-        network = AverageEmbeddingInputlayer(
+        network = AverageEmbeddingInputLayer(
            self.inputs, self.vocab_size, self.embedding_size)
         self.network = DenseLayer(network, self.n_labels)
 
diff --git a/tensorlayer/layers.py b/tensorlayer/layers.py
index 3cf4096e8..a9c4b21dc 100755
--- a/tensorlayer/layers.py
+++ b/tensorlayer/layers.py
@@ -651,30 +651,34 @@ def __init__(
         self.all_drop = {}
 
 
-class AverageEmbeddingInputlayer(Layer):
-    """The :class:`AverageEmbeddingInputlayer` class is for FastText Embedding for sentence classification, see `[1] <https://arxiv.org/abs/1607.01759>`_.
+class AverageEmbeddingInputLayer(Layer):
+    """:class:`AverageEmbeddingInputLayer` averages over the embeddings of its inputs.
+
+    :class:`AverageEmbeddingInputLayer` can be used as the input layer
+    for models such as DAN [1] and FastText [2].
 
     Parameters
     ------------
-    inputs : input placeholder or tensor; zeros are paddings
+    inputs : input placeholder or tensor
     vocabulary_size : an integer, the size of vocabulary
     embedding_size : an integer, the dimension of embedding vectors
+    pad_value : an integer, the scalar pad value used in inputs
     name : a string, the name of the layer
     embeddings_initializer : the initializer of the embedding matrix
     embeddings_kwargs : kwargs to get embedding matrix variable
 
     References
     ------------
-    - [1] Joulin, A., Grave, E., Bojanowski, P., & Mikolov, T. (2016). `Bag of Tricks for Efficient Text Classification. <https://arxiv.org/abs/1607.01759>`_
-    - [2] Recht, B., Re, C., Wright, S., & Niu, F. (2011). `Hogwild: A Lock-Free Approach to Parallelizing Stochastic Gradient Descent. <https://arxiv.org/abs/1106.5730>`_ In NPIS 2011 (pp. 693–701).
-    - [3] `TensorFlow Candidate Sampling <https://www.tensorflow.org/extras/candidate_sampling.pdf>`_
+    - [1] Iyyer, M., Manjunatha, V., Boyd-Graber, J., & Daumé III, H. (2015). Deep Unordered Composition Rivals Syntactic Methods for Text Classification. In Association for Computational Linguistics.
+    - [2] Joulin, A., Grave, E., Bojanowski, P., & Mikolov, T. (2016). `Bag of Tricks for Efficient Text Classification. <https://arxiv.org/abs/1607.01759>`_
 
     """
     def __init__(
             self,
             inputs,
             vocabulary_size,
             embedding_size,
-            name='fasttext_layer',
+            pad_value=0,
+            name='average_embedding_layer',
             embeddings_initializer=tf.random_uniform_initializer(-0.1, 0.1),
-            embeddings_kwargs={}
-    ):#None):
+            embeddings_kwargs=None,
+    ):
         super().__init__(name=name)
 
         if inputs.get_shape().ndims != 2:
@@ -690,29 +694,24 @@ def __init__(
             name='embeddings',
             shape=(vocabulary_size, embedding_size),
             initializer=embeddings_initializer,
-            # **(embeddings_kwargs or {}),
-            **embeddings_kwargs)
+            **(embeddings_kwargs or {}),
+        )
 
         word_embeddings = tf.nn.embedding_lookup(
             self.embeddings,
             self.inputs,
             name='word_embeddings',
         )
-
-        # Masks used to ignore padding words
-        masks = tf.expand_dims(
-            tf.sign(self.inputs),
-            axis=-1,
-            name='masks',
-        )
-        sum_word_embeddings = tf.reduce_sum(
-            word_embeddings * tf.cast(masks, tf.float32),
-            axis=1,
+        # Zero out embeddings of pad value
+        masks = tf.not_equal(self.inputs, pad_value, name='masks')
+        word_embeddings *= tf.cast(
+            tf.expand_dims(masks, axis=-1),
+            tf.float32,
         )
+        sum_word_embeddings = tf.reduce_sum(word_embeddings, axis=1)
 
         # Count number of non-padding words in each sentence
-        # Used to commute average word embeddings in sentences
         sentence_lengths = tf.count_nonzero(
-            self.inputs,
+            masks,
             axis=1,
             keep_dims=True,
             dtype=tf.float32,
@@ -721,7 +720,7 @@ def __init__(
 
         sentence_embeddings = tf.divide(
             sum_word_embeddings,
-            sentence_lengths,
+            sentence_lengths + 1e-8,  # Add epsilon to avoid dividing by 0
             name='sentence_embeddings'
         )
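
For reference, a minimal usage sketch of the renamed layer with the new pad_value argument, in the spirit of the tutorial change above. This is not part of the patch: the vocabulary size, sequence length, and unit count are made-up placeholder values, and TensorLayer/TensorFlow 1.x APIs are assumed.

import tensorflow as tf
from tensorlayer.layers import AverageEmbeddingInputLayer, DenseLayer

# Batch of token-id sequences, padded to a fixed length with id 0.
inputs = tf.placeholder(tf.int32, shape=[None, 100], name='inputs')

# Embeddings at pad positions are masked out before averaging, so the
# pad token no longer has to be id 0 -- any scalar pad_value works.
network = AverageEmbeddingInputLayer(
    inputs,
    vocabulary_size=100000,  # placeholder value
    embedding_size=50,       # placeholder value
    pad_value=0,
)
network = DenseLayer(network, n_units=2)  # e.g. binary sentiment labels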