diff --git a/docs/api_docs/python/_redirects.yaml b/docs/api_docs/python/_redirects.yaml
new file mode 100644
index 0000000000..62246f6804
--- /dev/null
+++ b/docs/api_docs/python/_redirects.yaml
@@ -0,0 +1,127 @@
+redirects:
+- from: /api_docs/python/tfa/image/distance_transform/euclidean_dist_transform
+ to: /api_docs/python/tfa/image/euclidean_dist_transform
+- from: /api_docs/python/tfa/image/distort_image_ops/adjust_hsv_in_yiq
+ to: /api_docs/python/tfa/image/adjust_hsv_in_yiq
+- from: /api_docs/python/tfa/image/distort_image_ops/random_hsv_in_yiq
+ to: /api_docs/python/tfa/image/random_hsv_in_yiq
+- from: /api_docs/python/tfa/image/filters/mean_filter2d
+ to: /api_docs/python/tfa/image/mean_filter2d
+- from: /api_docs/python/tfa/image/filters/median_filter2d
+ to: /api_docs/python/tfa/image/median_filter2d
+- from: /api_docs/python/tfa/image/transform_ops/rotate
+ to: /api_docs/python/tfa/image/rotate
+- from: /api_docs/python/tfa/image/transform_ops/transform
+ to: /api_docs/python/tfa/image/transform
+- from: /api_docs/python/tfa/layers/maxout/Maxout
+ to: /api_docs/python/tfa/layers/Maxout
+- from: /api_docs/python/tfa/layers/normalizations/GroupNormalization
+ to: /api_docs/python/tfa/layers/GroupNormalization
+- from: /api_docs/python/tfa/layers/normalizations/InstanceNormalization
+ to: /api_docs/python/tfa/layers/InstanceNormalization
+- from: /api_docs/python/tfa/layers/poincare/PoincareNormalize
+ to: /api_docs/python/tfa/layers/PoincareNormalize
+- from: /api_docs/python/tfa/layers/sparsemax/Sparsemax
+ to: /api_docs/python/tfa/layers/Sparsemax
+- from: /api_docs/python/tfa/layers/sparsemax/sparsemax
+ to: /api_docs/python/tfa/activations/sparsemax
+- from: /api_docs/python/tfa/layers/wrappers/WeightNormalization
+ to: /api_docs/python/tfa/layers/WeightNormalization
+- from: /api_docs/python/tfa/losses/contrastive/ContrastiveLoss
+ to: /api_docs/python/tfa/losses/ContrastiveLoss
+- from: /api_docs/python/tfa/losses/contrastive/contrastive_loss
+ to: /api_docs/python/tfa/losses/contrastive_loss
+- from: /api_docs/python/tfa/losses/focal_loss/SigmoidFocalCrossEntropy
+ to: /api_docs/python/tfa/losses/SigmoidFocalCrossEntropy
+- from: /api_docs/python/tfa/losses/focal_loss/sigmoid_focal_crossentropy
+ to: /api_docs/python/tfa/losses/sigmoid_focal_crossentropy
+- from: /api_docs/python/tfa/losses/lifted/LiftedStructLoss
+ to: /api_docs/python/tfa/losses/LiftedStructLoss
+- from: /api_docs/python/tfa/losses/lifted/lifted_struct_loss
+ to: /api_docs/python/tfa/losses/lifted_struct_loss
+- from: /api_docs/python/tfa/losses/triplet/TripletSemiHardLoss
+ to: /api_docs/python/tfa/losses/TripletSemiHardLoss
+- from: /api_docs/python/tfa/losses/triplet/triplet_semihard_loss
+ to: /api_docs/python/tfa/losses/triplet_semihard_loss
+- from: /api_docs/python/tfa/metrics/cohens_kappa/CohenKappa
+ to: /api_docs/python/tfa/metrics/CohenKappa
+- from: /api_docs/python/tfa/optimizers/lazy_adam/LazyAdam
+ to: /api_docs/python/tfa/optimizers/LazyAdam
+- from: /api_docs/python/tfa/optimizers/moving_average/MovingAverage
+ to: /api_docs/python/tfa/optimizers/MovingAverage
+- from: /api_docs/python/tfa/optimizers/weight_decay_optimizers/AdamW
+ to: /api_docs/python/tfa/optimizers/AdamW
+- from: /api_docs/python/tfa/optimizers/weight_decay_optimizers/SGDW
+ to: /api_docs/python/tfa/optimizers/SGDW
+- from: /api_docs/python/tfa/optimizers/weight_decay_optimizers/extend_with_decoupled_weight_decay
+ to: /api_docs/python/tfa/optimizers/extend_with_decoupled_weight_decay
+- from: /api_docs/python/tfa/rnn/cell/LayerNormLSTMCell
+ to: /api_docs/python/tfa/rnn/LayerNormLSTMCell
+- from: /api_docs/python/tfa/rnn/cell/NASCell
+ to: /api_docs/python/tfa/rnn/NASCell
+- from: /api_docs/python/tfa/seq2seq/attention_wrapper/AttentionMechanism
+ to: /api_docs/python/tfa/seq2seq/AttentionMechanism
+- from: /api_docs/python/tfa/seq2seq/attention_wrapper/AttentionWrapper
+ to: /api_docs/python/tfa/seq2seq/AttentionWrapper
+- from: /api_docs/python/tfa/seq2seq/attention_wrapper/AttentionWrapperState
+ to: /api_docs/python/tfa/seq2seq/AttentionWrapperState
+- from: /api_docs/python/tfa/seq2seq/attention_wrapper/BahdanauAttention
+ to: /api_docs/python/tfa/seq2seq/BahdanauAttention
+- from: /api_docs/python/tfa/seq2seq/attention_wrapper/BahdanauMonotonicAttention
+ to: /api_docs/python/tfa/seq2seq/BahdanauMonotonicAttention
+- from: /api_docs/python/tfa/seq2seq/attention_wrapper/LuongAttention
+ to: /api_docs/python/tfa/seq2seq/LuongAttention
+- from: /api_docs/python/tfa/seq2seq/attention_wrapper/LuongMonotonicAttention
+ to: /api_docs/python/tfa/seq2seq/LuongMonotonicAttention
+- from: /api_docs/python/tfa/seq2seq/attention_wrapper/hardmax
+ to: /api_docs/python/tfa/seq2seq/hardmax
+- from: /api_docs/python/tfa/seq2seq/attention_wrapper/monotonic_attention
+ to: /api_docs/python/tfa/seq2seq/monotonic_attention
+- from: /api_docs/python/tfa/seq2seq/attention_wrapper/safe_cumprod
+ to: /api_docs/python/tfa/seq2seq/safe_cumprod
+- from: /api_docs/python/tfa/seq2seq/basic_decoder/BasicDecoder
+ to: /api_docs/python/tfa/seq2seq/BasicDecoder
+- from: /api_docs/python/tfa/seq2seq/basic_decoder/BasicDecoderOutput
+ to: /api_docs/python/tfa/seq2seq/BasicDecoderOutput
+- from: /api_docs/python/tfa/seq2seq/beam_search_decoder/BeamSearchDecoder
+ to: /api_docs/python/tfa/seq2seq/BeamSearchDecoder
+- from: /api_docs/python/tfa/seq2seq/beam_search_decoder/BeamSearchDecoderOutput
+ to: /api_docs/python/tfa/seq2seq/BeamSearchDecoderOutput
+- from: /api_docs/python/tfa/seq2seq/beam_search_decoder/BeamSearchDecoderState
+ to: /api_docs/python/tfa/seq2seq/BeamSearchDecoderState
+- from: /api_docs/python/tfa/seq2seq/beam_search_decoder/FinalBeamSearchDecoderOutput
+ to: /api_docs/python/tfa/seq2seq/FinalBeamSearchDecoderOutput
+- from: /api_docs/python/tfa/seq2seq/beam_search_decoder/gather_tree_from_array
+ to: /api_docs/python/tfa/seq2seq/gather_tree_from_array
+- from: /api_docs/python/tfa/seq2seq/beam_search_decoder/tile_batch
+ to: /api_docs/python/tfa/seq2seq/tile_batch
+- from: /api_docs/python/tfa/seq2seq/decoder/BaseDecoder
+ to: /api_docs/python/tfa/seq2seq/BaseDecoder
+- from: /api_docs/python/tfa/seq2seq/decoder/Decoder
+ to: /api_docs/python/tfa/seq2seq/Decoder
+- from: /api_docs/python/tfa/seq2seq/decoder/dynamic_decode
+ to: /api_docs/python/tfa/seq2seq/dynamic_decode
+- from: /api_docs/python/tfa/seq2seq/loss/SequenceLoss
+ to: /api_docs/python/tfa/seq2seq/SequenceLoss
+- from: /api_docs/python/tfa/seq2seq/loss/sequence_loss
+ to: /api_docs/python/tfa/seq2seq/sequence_loss
+- from: /api_docs/python/tfa/seq2seq/sampler/CustomSampler
+ to: /api_docs/python/tfa/seq2seq/CustomSampler
+- from: /api_docs/python/tfa/seq2seq/sampler/GreedyEmbeddingSampler
+ to: /api_docs/python/tfa/seq2seq/GreedyEmbeddingSampler
+- from: /api_docs/python/tfa/seq2seq/sampler/InferenceSampler
+ to: /api_docs/python/tfa/seq2seq/InferenceSampler
+- from: /api_docs/python/tfa/seq2seq/sampler/SampleEmbeddingSampler
+ to: /api_docs/python/tfa/seq2seq/SampleEmbeddingSampler
+- from: /api_docs/python/tfa/seq2seq/sampler/Sampler
+ to: /api_docs/python/tfa/seq2seq/Sampler
+- from: /api_docs/python/tfa/seq2seq/sampler/ScheduledEmbeddingTrainingSampler
+ to: /api_docs/python/tfa/seq2seq/ScheduledEmbeddingTrainingSampler
+- from: /api_docs/python/tfa/seq2seq/sampler/ScheduledOutputTrainingSampler
+ to: /api_docs/python/tfa/seq2seq/ScheduledOutputTrainingSampler
+- from: /api_docs/python/tfa/seq2seq/sampler/TrainingSampler
+ to: /api_docs/python/tfa/seq2seq/TrainingSampler
+- from: /api_docs/python/tfa/text/skip_gram_ops/skip_gram_sample
+ to: /api_docs/python/tfa/text/skip_gram_sample
+- from: /api_docs/python/tfa/text/skip_gram_ops/skip_gram_sample_with_text_vocab
+ to: /api_docs/python/tfa/text/skip_gram_sample_with_text_vocab
diff --git a/docs/api_docs/python/_toc.yaml b/docs/api_docs/python/_toc.yaml
new file mode 100644
index 0000000000..eb2e325ef8
--- /dev/null
+++ b/docs/api_docs/python/_toc.yaml
@@ -0,0 +1,302 @@
+# Automatically generated file; please do not edit
+toc:
+ - title: tfa
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa
+ - title: tfa.activations
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/activations
+ - title: sparsemax
+ path: /api_docs/python/tfa/activations/sparsemax
+ - title: tfa.image
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/image
+ - title: adjust_hsv_in_yiq
+ path: /api_docs/python/tfa/image/adjust_hsv_in_yiq
+ - title: dense_image_warp
+ path: /api_docs/python/tfa/image/dense_image_warp
+ - title: euclidean_dist_transform
+ path: /api_docs/python/tfa/image/euclidean_dist_transform
+ - title: interpolate_bilinear
+ path: /api_docs/python/tfa/image/interpolate_bilinear
+ - title: mean_filter2d
+ path: /api_docs/python/tfa/image/mean_filter2d
+ - title: median_filter2d
+ path: /api_docs/python/tfa/image/median_filter2d
+ - title: random_hsv_in_yiq
+ path: /api_docs/python/tfa/image/random_hsv_in_yiq
+ - title: rotate
+ path: /api_docs/python/tfa/image/rotate
+ - title: transform
+ path: /api_docs/python/tfa/image/transform
+ - title: distance_transform
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/image/distance_transform
+ - title: distort_image_ops
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/image/distort_image_ops
+ - title: filters
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/image/filters
+ - title: transform_ops
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/image/transform_ops
+ - title: angles_to_projective_transforms
+ path: /api_docs/python/tfa/image/transform_ops/angles_to_projective_transforms
+ - title: compose_transforms
+ path: /api_docs/python/tfa/image/transform_ops/compose_transforms
+ - title: flat_transforms_to_matrices
+ path: /api_docs/python/tfa/image/transform_ops/flat_transforms_to_matrices
+ - title: matrices_to_flat_transforms
+ path: /api_docs/python/tfa/image/transform_ops/matrices_to_flat_transforms
+ - title: tfa.layers
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/layers
+ - title: GroupNormalization
+ path: /api_docs/python/tfa/layers/GroupNormalization
+ - title: InstanceNormalization
+ path: /api_docs/python/tfa/layers/InstanceNormalization
+ - title: Maxout
+ path: /api_docs/python/tfa/layers/Maxout
+ - title: PoincareNormalize
+ path: /api_docs/python/tfa/layers/PoincareNormalize
+ - title: Sparsemax
+ path: /api_docs/python/tfa/layers/Sparsemax
+ - title: WeightNormalization
+ path: /api_docs/python/tfa/layers/WeightNormalization
+ - title: maxout
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/layers/maxout
+ - title: normalizations
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/layers/normalizations
+ - title: poincare
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/layers/poincare
+ - title: sparsemax
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/layers/sparsemax
+ - title: wrappers
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/layers/wrappers
+ - title: tfa.losses
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/losses
+ - title: ContrastiveLoss
+ path: /api_docs/python/tfa/losses/ContrastiveLoss
+ - title: contrastive_loss
+ path: /api_docs/python/tfa/losses/contrastive_loss
+ - title: LiftedStructLoss
+ path: /api_docs/python/tfa/losses/LiftedStructLoss
+ - title: lifted_struct_loss
+ path: /api_docs/python/tfa/losses/lifted_struct_loss
+ - title: SigmoidFocalCrossEntropy
+ path: /api_docs/python/tfa/losses/SigmoidFocalCrossEntropy
+ - title: sigmoid_focal_crossentropy
+ path: /api_docs/python/tfa/losses/sigmoid_focal_crossentropy
+ - title: SparsemaxLoss
+ path: /api_docs/python/tfa/losses/SparsemaxLoss
+ - title: sparsemax_loss
+ path: /api_docs/python/tfa/losses/sparsemax_loss
+ - title: TripletSemiHardLoss
+ path: /api_docs/python/tfa/losses/TripletSemiHardLoss
+ - title: triplet_semihard_loss
+ path: /api_docs/python/tfa/losses/triplet_semihard_loss
+ - title: contrastive
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/losses/contrastive
+ - title: focal_loss
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/losses/focal_loss
+ - title: lifted
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/losses/lifted
+ - title: metric_learning
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/losses/metric_learning
+ - title: pairwise_distance
+ path: /api_docs/python/tfa/losses/metric_learning/pairwise_distance
+ - title: triplet
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/losses/triplet
+ - title: tfa.metrics
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/metrics
+ - title: CohenKappa
+ path: /api_docs/python/tfa/metrics/CohenKappa
+ - title: cohens_kappa
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/metrics/cohens_kappa
+ - title: tfa.optimizers
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/optimizers
+ - title: AdamW
+ path: /api_docs/python/tfa/optimizers/AdamW
+ - title: extend_with_decoupled_weight_decay
+ path: /api_docs/python/tfa/optimizers/extend_with_decoupled_weight_decay
+ - title: LazyAdam
+ path: /api_docs/python/tfa/optimizers/LazyAdam
+ - title: MovingAverage
+ path: /api_docs/python/tfa/optimizers/MovingAverage
+ - title: SGDW
+ path: /api_docs/python/tfa/optimizers/SGDW
+ - title: lazy_adam
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/optimizers/lazy_adam
+ - title: moving_average
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/optimizers/moving_average
+ - title: weight_decay_optimizers
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/optimizers/weight_decay_optimizers
+ - title: DecoupledWeightDecayExtension
+ path: /api_docs/python/tfa/optimizers/weight_decay_optimizers/DecoupledWeightDecayExtension
+ - title: tfa.rnn
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/rnn
+ - title: LayerNormLSTMCell
+ path: /api_docs/python/tfa/rnn/LayerNormLSTMCell
+ - title: NASCell
+ path: /api_docs/python/tfa/rnn/NASCell
+ - title: cell
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/rnn/cell
+ - title: tfa.seq2seq
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/seq2seq
+ - title: AttentionMechanism
+ path: /api_docs/python/tfa/seq2seq/AttentionMechanism
+ - title: AttentionWrapper
+ path: /api_docs/python/tfa/seq2seq/AttentionWrapper
+ - title: AttentionWrapperState
+ path: /api_docs/python/tfa/seq2seq/AttentionWrapperState
+ - title: BahdanauAttention
+ path: /api_docs/python/tfa/seq2seq/BahdanauAttention
+ - title: BahdanauMonotonicAttention
+ path: /api_docs/python/tfa/seq2seq/BahdanauMonotonicAttention
+ - title: BaseDecoder
+ path: /api_docs/python/tfa/seq2seq/BaseDecoder
+ - title: BasicDecoder
+ path: /api_docs/python/tfa/seq2seq/BasicDecoder
+ - title: BasicDecoderOutput
+ path: /api_docs/python/tfa/seq2seq/BasicDecoderOutput
+ - title: BeamSearchDecoder
+ path: /api_docs/python/tfa/seq2seq/BeamSearchDecoder
+ - title: BeamSearchDecoderOutput
+ path: /api_docs/python/tfa/seq2seq/BeamSearchDecoderOutput
+ - title: BeamSearchDecoderState
+ path: /api_docs/python/tfa/seq2seq/BeamSearchDecoderState
+ - title: CustomSampler
+ path: /api_docs/python/tfa/seq2seq/CustomSampler
+ - title: Decoder
+ path: /api_docs/python/tfa/seq2seq/Decoder
+ - title: dynamic_decode
+ path: /api_docs/python/tfa/seq2seq/dynamic_decode
+ - title: FinalBeamSearchDecoderOutput
+ path: /api_docs/python/tfa/seq2seq/FinalBeamSearchDecoderOutput
+ - title: gather_tree_from_array
+ path: /api_docs/python/tfa/seq2seq/gather_tree_from_array
+ - title: GreedyEmbeddingSampler
+ path: /api_docs/python/tfa/seq2seq/GreedyEmbeddingSampler
+ - title: hardmax
+ path: /api_docs/python/tfa/seq2seq/hardmax
+ - title: InferenceSampler
+ path: /api_docs/python/tfa/seq2seq/InferenceSampler
+ - title: LuongAttention
+ path: /api_docs/python/tfa/seq2seq/LuongAttention
+ - title: LuongMonotonicAttention
+ path: /api_docs/python/tfa/seq2seq/LuongMonotonicAttention
+ - title: monotonic_attention
+ path: /api_docs/python/tfa/seq2seq/monotonic_attention
+ - title: safe_cumprod
+ path: /api_docs/python/tfa/seq2seq/safe_cumprod
+ - title: SampleEmbeddingSampler
+ path: /api_docs/python/tfa/seq2seq/SampleEmbeddingSampler
+ - title: Sampler
+ path: /api_docs/python/tfa/seq2seq/Sampler
+ - title: ScheduledEmbeddingTrainingSampler
+ path: /api_docs/python/tfa/seq2seq/ScheduledEmbeddingTrainingSampler
+ - title: ScheduledOutputTrainingSampler
+ path: /api_docs/python/tfa/seq2seq/ScheduledOutputTrainingSampler
+ - title: SequenceLoss
+ path: /api_docs/python/tfa/seq2seq/SequenceLoss
+ - title: sequence_loss
+ path: /api_docs/python/tfa/seq2seq/sequence_loss
+ - title: tile_batch
+ path: /api_docs/python/tfa/seq2seq/tile_batch
+ - title: TrainingSampler
+ path: /api_docs/python/tfa/seq2seq/TrainingSampler
+ - title: attention_wrapper
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/seq2seq/attention_wrapper
+ - title: basic_decoder
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/seq2seq/basic_decoder
+ - title: beam_search_decoder
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/seq2seq/beam_search_decoder
+ - title: attention_probs_from_attn_state
+ path: /api_docs/python/tfa/seq2seq/beam_search_decoder/attention_probs_from_attn_state
+ - title: BeamSearchDecoderMixin
+ path: /api_docs/python/tfa/seq2seq/beam_search_decoder/BeamSearchDecoderMixin
+ - title: get_attention_probs
+ path: /api_docs/python/tfa/seq2seq/beam_search_decoder/get_attention_probs
+ - title: decoder
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/seq2seq/decoder
+ - title: loss
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/seq2seq/loss
+ - title: sampler
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/seq2seq/sampler
+ - title: bernoulli_sample
+ path: /api_docs/python/tfa/seq2seq/sampler/bernoulli_sample
+ - title: categorical_sample
+ path: /api_docs/python/tfa/seq2seq/sampler/categorical_sample
+ - title: tfa.text
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/text
+ - title: skip_gram_sample
+ path: /api_docs/python/tfa/text/skip_gram_sample
+ - title: skip_gram_sample_with_text_vocab
+ path: /api_docs/python/tfa/text/skip_gram_sample_with_text_vocab
+ - title: skip_gram_ops
+ section:
+ - title: Overview
+ path: /api_docs/python/tfa/text/skip_gram_ops
diff --git a/docs/api_docs/python/index.md b/docs/api_docs/python/index.md
new file mode 100644
index 0000000000..c95e0535a4
--- /dev/null
+++ b/docs/api_docs/python/index.md
@@ -0,0 +1,179 @@
+# All symbols in TensorFlow Addons
+
+* tfa
+* tfa.activations
+* tfa.activations.sparsemax
+* tfa.image
+* tfa.image.adjust_hsv_in_yiq
+* tfa.image.dense_image_warp
+* tfa.image.distance_transform
+* tfa.image.distance_transform.euclidean_dist_transform
+* tfa.image.distort_image_ops
+* tfa.image.distort_image_ops.adjust_hsv_in_yiq
+* tfa.image.distort_image_ops.random_hsv_in_yiq
+* tfa.image.euclidean_dist_transform
+* tfa.image.filters
+* tfa.image.filters.mean_filter2d
+* tfa.image.filters.median_filter2d
+* tfa.image.interpolate_bilinear
+* tfa.image.mean_filter2d
+* tfa.image.median_filter2d
+* tfa.image.random_hsv_in_yiq
+* tfa.image.rotate
+* tfa.image.transform
+* tfa.image.transform_ops
+* tfa.image.transform_ops.angles_to_projective_transforms
+* tfa.image.transform_ops.compose_transforms
+* tfa.image.transform_ops.flat_transforms_to_matrices
+* tfa.image.transform_ops.matrices_to_flat_transforms
+* tfa.image.transform_ops.rotate
+* tfa.image.transform_ops.transform
+* tfa.layers
+* tfa.layers.GroupNormalization
+* tfa.layers.InstanceNormalization
+* tfa.layers.Maxout
+* tfa.layers.PoincareNormalize
+* tfa.layers.Sparsemax
+* tfa.layers.WeightNormalization
+* tfa.layers.maxout
+* tfa.layers.maxout.Maxout
+* tfa.layers.normalizations
+* tfa.layers.normalizations.GroupNormalization
+* tfa.layers.normalizations.InstanceNormalization
+* tfa.layers.poincare
+* tfa.layers.poincare.PoincareNormalize
+* tfa.layers.sparsemax
+* tfa.layers.sparsemax.Sparsemax
+* tfa.layers.sparsemax.sparsemax
+* tfa.layers.wrappers
+* tfa.layers.wrappers.WeightNormalization
+* tfa.losses
+* tfa.losses.ContrastiveLoss
+* tfa.losses.LiftedStructLoss
+* tfa.losses.SigmoidFocalCrossEntropy
+* tfa.losses.SparsemaxLoss
+* tfa.losses.TripletSemiHardLoss
+* tfa.losses.contrastive
+* tfa.losses.contrastive.ContrastiveLoss
+* tfa.losses.contrastive.contrastive_loss
+* tfa.losses.contrastive_loss
+* tfa.losses.focal_loss
+* tfa.losses.focal_loss.SigmoidFocalCrossEntropy
+* tfa.losses.focal_loss.sigmoid_focal_crossentropy
+* tfa.losses.lifted
+* tfa.losses.lifted.LiftedStructLoss
+* tfa.losses.lifted.lifted_struct_loss
+* tfa.losses.lifted_struct_loss
+* tfa.losses.metric_learning
+* tfa.losses.metric_learning.pairwise_distance
+* tfa.losses.sigmoid_focal_crossentropy
+* tfa.losses.sparsemax_loss
+* tfa.losses.triplet
+* tfa.losses.triplet.TripletSemiHardLoss
+* tfa.losses.triplet.triplet_semihard_loss
+* tfa.losses.triplet_semihard_loss
+* tfa.metrics
+* tfa.metrics.CohenKappa
+* tfa.metrics.cohens_kappa
+* tfa.metrics.cohens_kappa.CohenKappa
+* tfa.optimizers
+* tfa.optimizers.AdamW
+* tfa.optimizers.LazyAdam
+* tfa.optimizers.MovingAverage
+* tfa.optimizers.SGDW
+* tfa.optimizers.extend_with_decoupled_weight_decay
+* tfa.optimizers.lazy_adam
+* tfa.optimizers.lazy_adam.LazyAdam
+* tfa.optimizers.moving_average
+* tfa.optimizers.moving_average.MovingAverage
+* tfa.optimizers.weight_decay_optimizers
+* tfa.optimizers.weight_decay_optimizers.AdamW
+* tfa.optimizers.weight_decay_optimizers.DecoupledWeightDecayExtension
+* tfa.optimizers.weight_decay_optimizers.SGDW
+* tfa.optimizers.weight_decay_optimizers.extend_with_decoupled_weight_decay
+* tfa.rnn
+* tfa.rnn.LayerNormLSTMCell
+* tfa.rnn.NASCell
+* tfa.rnn.cell
+* tfa.rnn.cell.LayerNormLSTMCell
+* tfa.rnn.cell.NASCell
+* tfa.seq2seq
+* tfa.seq2seq.AttentionMechanism
+* tfa.seq2seq.AttentionWrapper
+* tfa.seq2seq.AttentionWrapperState
+* tfa.seq2seq.BahdanauAttention
+* tfa.seq2seq.BahdanauMonotonicAttention
+* tfa.seq2seq.BaseDecoder
+* tfa.seq2seq.BasicDecoder
+* tfa.seq2seq.BasicDecoderOutput
+* tfa.seq2seq.BeamSearchDecoder
+* tfa.seq2seq.BeamSearchDecoderOutput
+* tfa.seq2seq.BeamSearchDecoderState
+* tfa.seq2seq.CustomSampler
+* tfa.seq2seq.Decoder
+* tfa.seq2seq.FinalBeamSearchDecoderOutput
+* tfa.seq2seq.GreedyEmbeddingSampler
+* tfa.seq2seq.InferenceSampler
+* tfa.seq2seq.LuongAttention
+* tfa.seq2seq.LuongMonotonicAttention
+* tfa.seq2seq.SampleEmbeddingSampler
+* tfa.seq2seq.Sampler
+* tfa.seq2seq.ScheduledEmbeddingTrainingSampler
+* tfa.seq2seq.ScheduledOutputTrainingSampler
+* tfa.seq2seq.SequenceLoss
+* tfa.seq2seq.TrainingSampler
+* tfa.seq2seq.attention_wrapper
+* tfa.seq2seq.attention_wrapper.AttentionMechanism
+* tfa.seq2seq.attention_wrapper.AttentionWrapper
+* tfa.seq2seq.attention_wrapper.AttentionWrapperState
+* tfa.seq2seq.attention_wrapper.BahdanauAttention
+* tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention
+* tfa.seq2seq.attention_wrapper.LuongAttention
+* tfa.seq2seq.attention_wrapper.LuongMonotonicAttention
+* tfa.seq2seq.attention_wrapper.hardmax
+* tfa.seq2seq.attention_wrapper.monotonic_attention
+* tfa.seq2seq.attention_wrapper.safe_cumprod
+* tfa.seq2seq.basic_decoder
+* tfa.seq2seq.basic_decoder.BasicDecoder
+* tfa.seq2seq.basic_decoder.BasicDecoderOutput
+* tfa.seq2seq.beam_search_decoder
+* tfa.seq2seq.beam_search_decoder.BeamSearchDecoder
+* tfa.seq2seq.beam_search_decoder.BeamSearchDecoderMixin
+* tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput
+* tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState
+* tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput
+* tfa.seq2seq.beam_search_decoder.attention_probs_from_attn_state
+* tfa.seq2seq.beam_search_decoder.gather_tree_from_array
+* tfa.seq2seq.beam_search_decoder.get_attention_probs
+* tfa.seq2seq.beam_search_decoder.tile_batch
+* tfa.seq2seq.decoder
+* tfa.seq2seq.decoder.BaseDecoder
+* tfa.seq2seq.decoder.Decoder
+* tfa.seq2seq.decoder.dynamic_decode
+* tfa.seq2seq.dynamic_decode
+* tfa.seq2seq.gather_tree_from_array
+* tfa.seq2seq.hardmax
+* tfa.seq2seq.loss
+* tfa.seq2seq.loss.SequenceLoss
+* tfa.seq2seq.loss.sequence_loss
+* tfa.seq2seq.monotonic_attention
+* tfa.seq2seq.safe_cumprod
+* tfa.seq2seq.sampler
+* tfa.seq2seq.sampler.CustomSampler
+* tfa.seq2seq.sampler.GreedyEmbeddingSampler
+* tfa.seq2seq.sampler.InferenceSampler
+* tfa.seq2seq.sampler.SampleEmbeddingSampler
+* tfa.seq2seq.sampler.Sampler
+* tfa.seq2seq.sampler.ScheduledEmbeddingTrainingSampler
+* tfa.seq2seq.sampler.ScheduledOutputTrainingSampler
+* tfa.seq2seq.sampler.TrainingSampler
+* tfa.seq2seq.sampler.bernoulli_sample
+* tfa.seq2seq.sampler.categorical_sample
+* tfa.seq2seq.sequence_loss
+* tfa.seq2seq.tile_batch
+* tfa.text
+* tfa.text.skip_gram_ops
+* tfa.text.skip_gram_ops.skip_gram_sample
+* tfa.text.skip_gram_ops.skip_gram_sample_with_text_vocab
+* tfa.text.skip_gram_sample
+* tfa.text.skip_gram_sample_with_text_vocab
\ No newline at end of file
diff --git a/docs/api_docs/python/tfa.md b/docs/api_docs/python/tfa.md
new file mode 100644
index 0000000000..a4086ba235
--- /dev/null
+++ b/docs/api_docs/python/tfa.md
@@ -0,0 +1,36 @@
+
+
+
+
+
+# Module: tfa
+
+Useful extra functionality for TensorFlow, maintained by SIG-addons.
+
+
+
+Defined in [`__init__.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/__init__.py).
+
+
+
+
+## Modules
+
+[`activations`](./tfa/activations.md) module: A module containing activation routines.
+
+[`image`](./tfa/image.md) module: Image manipulation ops.
+
+[`layers`](./tfa/layers.md) module: Additional layers that conform to the Keras API.
+
+[`losses`](./tfa/losses.md) module: Additional losses that conform to the Keras API.
+
+[`metrics`](./tfa/metrics.md) module: A module containing metrics that conform to the Keras API.
+
+[`optimizers`](./tfa/optimizers.md) module: Additional optimizers that conform to the Keras API.
+
+[`rnn`](./tfa/rnn.md) module: Customized RNN cells.
+
+[`seq2seq`](./tfa/seq2seq.md) module: Ops for building neural network sequence-to-sequence decoders and losses.
+
+[`text`](./tfa/text.md) module: Text-processing ops.
+
diff --git a/docs/api_docs/python/tfa/_api_cache.json b/docs/api_docs/python/tfa/_api_cache.json
new file mode 100644
index 0000000000..1b18603244
--- /dev/null
+++ b/docs/api_docs/python/tfa/_api_cache.json
@@ -0,0 +1,3805 @@
+{
+ "duplicate_of": {
+ "tfa.image.absolute_import": "tfa.activations.absolute_import",
+ "tfa.image.distance_transform.absolute_import": "tfa.activations.absolute_import",
+ "tfa.image.distance_transform.division": "tfa.activations.division",
+ "tfa.image.distance_transform.euclidean_dist_transform": "tfa.image.euclidean_dist_transform",
+ "tfa.image.distance_transform.print_function": "tfa.activations.print_function",
+ "tfa.image.distort_image_ops.absolute_import": "tfa.activations.absolute_import",
+ "tfa.image.distort_image_ops.adjust_hsv_in_yiq": "tfa.image.adjust_hsv_in_yiq",
+ "tfa.image.distort_image_ops.division": "tfa.activations.division",
+ "tfa.image.distort_image_ops.print_function": "tfa.activations.print_function",
+ "tfa.image.distort_image_ops.random_hsv_in_yiq": "tfa.image.random_hsv_in_yiq",
+ "tfa.image.division": "tfa.activations.division",
+ "tfa.image.filters.absolute_import": "tfa.activations.absolute_import",
+ "tfa.image.filters.division": "tfa.activations.division",
+ "tfa.image.filters.mean_filter2d": "tfa.image.mean_filter2d",
+ "tfa.image.filters.median_filter2d": "tfa.image.median_filter2d",
+ "tfa.image.filters.print_function": "tfa.activations.print_function",
+ "tfa.image.print_function": "tfa.activations.print_function",
+ "tfa.image.transform_ops.absolute_import": "tfa.activations.absolute_import",
+ "tfa.image.transform_ops.division": "tfa.activations.division",
+ "tfa.image.transform_ops.print_function": "tfa.activations.print_function",
+ "tfa.image.transform_ops.rotate": "tfa.image.rotate",
+ "tfa.image.transform_ops.transform": "tfa.image.transform",
+ "tfa.layers.InstanceNormalization.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.layers.InstanceNormalization.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.layers.InstanceNormalization.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.layers.InstanceNormalization.input": "tfa.layers.GroupNormalization.input",
+ "tfa.layers.InstanceNormalization.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.layers.InstanceNormalization.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.layers.InstanceNormalization.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.layers.InstanceNormalization.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.layers.InstanceNormalization.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.layers.InstanceNormalization.name": "tfa.layers.GroupNormalization.name",
+ "tfa.layers.InstanceNormalization.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.layers.InstanceNormalization.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.layers.InstanceNormalization.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.layers.InstanceNormalization.output": "tfa.layers.GroupNormalization.output",
+ "tfa.layers.InstanceNormalization.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.layers.InstanceNormalization.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.layers.InstanceNormalization.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.layers.InstanceNormalization.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.layers.InstanceNormalization.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.layers.InstanceNormalization.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.layers.InstanceNormalization.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.layers.InstanceNormalization.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.layers.InstanceNormalization.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.layers.Maxout.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.layers.Maxout.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.layers.Maxout.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.layers.Maxout.input": "tfa.layers.GroupNormalization.input",
+ "tfa.layers.Maxout.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.layers.Maxout.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.layers.Maxout.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.layers.Maxout.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.layers.Maxout.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.layers.Maxout.name": "tfa.layers.GroupNormalization.name",
+ "tfa.layers.Maxout.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.layers.Maxout.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.layers.Maxout.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.layers.Maxout.output": "tfa.layers.GroupNormalization.output",
+ "tfa.layers.Maxout.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.layers.Maxout.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.layers.Maxout.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.layers.Maxout.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.layers.Maxout.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.layers.Maxout.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.layers.Maxout.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.layers.Maxout.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.layers.Maxout.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.layers.PoincareNormalize.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.layers.PoincareNormalize.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.layers.PoincareNormalize.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.layers.PoincareNormalize.input": "tfa.layers.GroupNormalization.input",
+ "tfa.layers.PoincareNormalize.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.layers.PoincareNormalize.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.layers.PoincareNormalize.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.layers.PoincareNormalize.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.layers.PoincareNormalize.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.layers.PoincareNormalize.name": "tfa.layers.GroupNormalization.name",
+ "tfa.layers.PoincareNormalize.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.layers.PoincareNormalize.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.layers.PoincareNormalize.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.layers.PoincareNormalize.output": "tfa.layers.GroupNormalization.output",
+ "tfa.layers.PoincareNormalize.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.layers.PoincareNormalize.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.layers.PoincareNormalize.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.layers.PoincareNormalize.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.layers.PoincareNormalize.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.layers.PoincareNormalize.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.layers.PoincareNormalize.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.layers.PoincareNormalize.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.layers.PoincareNormalize.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.layers.Sparsemax.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.layers.Sparsemax.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.layers.Sparsemax.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.layers.Sparsemax.input": "tfa.layers.GroupNormalization.input",
+ "tfa.layers.Sparsemax.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.layers.Sparsemax.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.layers.Sparsemax.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.layers.Sparsemax.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.layers.Sparsemax.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.layers.Sparsemax.name": "tfa.layers.GroupNormalization.name",
+ "tfa.layers.Sparsemax.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.layers.Sparsemax.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.layers.Sparsemax.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.layers.Sparsemax.output": "tfa.layers.GroupNormalization.output",
+ "tfa.layers.Sparsemax.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.layers.Sparsemax.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.layers.Sparsemax.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.layers.Sparsemax.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.layers.Sparsemax.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.layers.Sparsemax.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.layers.Sparsemax.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.layers.Sparsemax.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.layers.Sparsemax.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.layers.WeightNormalization.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.layers.WeightNormalization.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.layers.WeightNormalization.input": "tfa.layers.GroupNormalization.input",
+ "tfa.layers.WeightNormalization.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.layers.WeightNormalization.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.layers.WeightNormalization.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.layers.WeightNormalization.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.layers.WeightNormalization.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.layers.WeightNormalization.name": "tfa.layers.GroupNormalization.name",
+ "tfa.layers.WeightNormalization.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.layers.WeightNormalization.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.layers.WeightNormalization.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.layers.WeightNormalization.output": "tfa.layers.GroupNormalization.output",
+ "tfa.layers.WeightNormalization.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.layers.WeightNormalization.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.layers.WeightNormalization.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.layers.WeightNormalization.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.layers.WeightNormalization.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.layers.WeightNormalization.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.layers.WeightNormalization.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.layers.WeightNormalization.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.layers.WeightNormalization.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.layers.absolute_import": "tfa.activations.absolute_import",
+ "tfa.layers.division": "tfa.activations.division",
+ "tfa.layers.maxout.Maxout": "tfa.layers.Maxout",
+ "tfa.layers.maxout.Maxout.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.layers.maxout.Maxout.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.layers.maxout.Maxout.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.layers.maxout.Maxout.input": "tfa.layers.GroupNormalization.input",
+ "tfa.layers.maxout.Maxout.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.layers.maxout.Maxout.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.layers.maxout.Maxout.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.layers.maxout.Maxout.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.layers.maxout.Maxout.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.layers.maxout.Maxout.name": "tfa.layers.GroupNormalization.name",
+ "tfa.layers.maxout.Maxout.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.layers.maxout.Maxout.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.layers.maxout.Maxout.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.layers.maxout.Maxout.output": "tfa.layers.GroupNormalization.output",
+ "tfa.layers.maxout.Maxout.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.layers.maxout.Maxout.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.layers.maxout.Maxout.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.layers.maxout.Maxout.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.layers.maxout.Maxout.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.layers.maxout.Maxout.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.layers.maxout.Maxout.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.layers.maxout.Maxout.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.layers.maxout.Maxout.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.layers.maxout.absolute_import": "tfa.activations.absolute_import",
+ "tfa.layers.maxout.division": "tfa.activations.division",
+ "tfa.layers.maxout.print_function": "tfa.activations.print_function",
+ "tfa.layers.normalizations.GroupNormalization": "tfa.layers.GroupNormalization",
+ "tfa.layers.normalizations.GroupNormalization.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.layers.normalizations.GroupNormalization.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.layers.normalizations.GroupNormalization.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.layers.normalizations.GroupNormalization.input": "tfa.layers.GroupNormalization.input",
+ "tfa.layers.normalizations.GroupNormalization.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.layers.normalizations.GroupNormalization.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.layers.normalizations.GroupNormalization.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.layers.normalizations.GroupNormalization.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.layers.normalizations.GroupNormalization.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.layers.normalizations.GroupNormalization.name": "tfa.layers.GroupNormalization.name",
+ "tfa.layers.normalizations.GroupNormalization.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.layers.normalizations.GroupNormalization.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.layers.normalizations.GroupNormalization.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.layers.normalizations.GroupNormalization.output": "tfa.layers.GroupNormalization.output",
+ "tfa.layers.normalizations.GroupNormalization.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.layers.normalizations.GroupNormalization.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.layers.normalizations.GroupNormalization.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.layers.normalizations.GroupNormalization.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.layers.normalizations.GroupNormalization.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.layers.normalizations.GroupNormalization.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.layers.normalizations.GroupNormalization.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.layers.normalizations.GroupNormalization.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.layers.normalizations.GroupNormalization.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.layers.normalizations.InstanceNormalization": "tfa.layers.InstanceNormalization",
+ "tfa.layers.normalizations.InstanceNormalization.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.layers.normalizations.InstanceNormalization.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.layers.normalizations.InstanceNormalization.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.layers.normalizations.InstanceNormalization.input": "tfa.layers.GroupNormalization.input",
+ "tfa.layers.normalizations.InstanceNormalization.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.layers.normalizations.InstanceNormalization.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.layers.normalizations.InstanceNormalization.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.layers.normalizations.InstanceNormalization.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.layers.normalizations.InstanceNormalization.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.layers.normalizations.InstanceNormalization.name": "tfa.layers.GroupNormalization.name",
+ "tfa.layers.normalizations.InstanceNormalization.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.layers.normalizations.InstanceNormalization.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.layers.normalizations.InstanceNormalization.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.layers.normalizations.InstanceNormalization.output": "tfa.layers.GroupNormalization.output",
+ "tfa.layers.normalizations.InstanceNormalization.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.layers.normalizations.InstanceNormalization.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.layers.normalizations.InstanceNormalization.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.layers.normalizations.InstanceNormalization.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.layers.normalizations.InstanceNormalization.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.layers.normalizations.InstanceNormalization.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.layers.normalizations.InstanceNormalization.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.layers.normalizations.InstanceNormalization.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.layers.normalizations.InstanceNormalization.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.layers.normalizations.absolute_import": "tfa.activations.absolute_import",
+ "tfa.layers.normalizations.division": "tfa.activations.division",
+ "tfa.layers.normalizations.print_function": "tfa.activations.print_function",
+ "tfa.layers.poincare.PoincareNormalize": "tfa.layers.PoincareNormalize",
+ "tfa.layers.poincare.PoincareNormalize.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.layers.poincare.PoincareNormalize.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.layers.poincare.PoincareNormalize.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.layers.poincare.PoincareNormalize.input": "tfa.layers.GroupNormalization.input",
+ "tfa.layers.poincare.PoincareNormalize.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.layers.poincare.PoincareNormalize.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.layers.poincare.PoincareNormalize.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.layers.poincare.PoincareNormalize.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.layers.poincare.PoincareNormalize.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.layers.poincare.PoincareNormalize.name": "tfa.layers.GroupNormalization.name",
+ "tfa.layers.poincare.PoincareNormalize.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.layers.poincare.PoincareNormalize.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.layers.poincare.PoincareNormalize.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.layers.poincare.PoincareNormalize.output": "tfa.layers.GroupNormalization.output",
+ "tfa.layers.poincare.PoincareNormalize.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.layers.poincare.PoincareNormalize.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.layers.poincare.PoincareNormalize.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.layers.poincare.PoincareNormalize.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.layers.poincare.PoincareNormalize.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.layers.poincare.PoincareNormalize.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.layers.poincare.PoincareNormalize.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.layers.poincare.PoincareNormalize.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.layers.poincare.PoincareNormalize.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.layers.poincare.absolute_import": "tfa.activations.absolute_import",
+ "tfa.layers.poincare.division": "tfa.activations.division",
+ "tfa.layers.poincare.print_function": "tfa.activations.print_function",
+ "tfa.layers.print_function": "tfa.activations.print_function",
+ "tfa.layers.sparsemax.Sparsemax": "tfa.layers.Sparsemax",
+ "tfa.layers.sparsemax.Sparsemax.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.layers.sparsemax.Sparsemax.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.layers.sparsemax.Sparsemax.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.layers.sparsemax.Sparsemax.input": "tfa.layers.GroupNormalization.input",
+ "tfa.layers.sparsemax.Sparsemax.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.layers.sparsemax.Sparsemax.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.layers.sparsemax.Sparsemax.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.layers.sparsemax.Sparsemax.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.layers.sparsemax.Sparsemax.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.layers.sparsemax.Sparsemax.name": "tfa.layers.GroupNormalization.name",
+ "tfa.layers.sparsemax.Sparsemax.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.layers.sparsemax.Sparsemax.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.layers.sparsemax.Sparsemax.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.layers.sparsemax.Sparsemax.output": "tfa.layers.GroupNormalization.output",
+ "tfa.layers.sparsemax.Sparsemax.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.layers.sparsemax.Sparsemax.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.layers.sparsemax.Sparsemax.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.layers.sparsemax.Sparsemax.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.layers.sparsemax.Sparsemax.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.layers.sparsemax.Sparsemax.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.layers.sparsemax.Sparsemax.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.layers.sparsemax.Sparsemax.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.layers.sparsemax.Sparsemax.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.layers.sparsemax.absolute_import": "tfa.activations.absolute_import",
+ "tfa.layers.sparsemax.division": "tfa.activations.division",
+ "tfa.layers.sparsemax.print_function": "tfa.activations.print_function",
+ "tfa.layers.sparsemax.sparsemax": "tfa.activations.sparsemax",
+ "tfa.layers.wrappers.WeightNormalization": "tfa.layers.WeightNormalization",
+ "tfa.layers.wrappers.WeightNormalization.activity_regularizer": "tfa.layers.WeightNormalization.activity_regularizer",
+ "tfa.layers.wrappers.WeightNormalization.call": "tfa.layers.WeightNormalization.call",
+ "tfa.layers.wrappers.WeightNormalization.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.layers.wrappers.WeightNormalization.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.layers.wrappers.WeightNormalization.input": "tfa.layers.GroupNormalization.input",
+ "tfa.layers.wrappers.WeightNormalization.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.layers.wrappers.WeightNormalization.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.layers.wrappers.WeightNormalization.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.layers.wrappers.WeightNormalization.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.layers.wrappers.WeightNormalization.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.layers.wrappers.WeightNormalization.name": "tfa.layers.GroupNormalization.name",
+ "tfa.layers.wrappers.WeightNormalization.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.layers.wrappers.WeightNormalization.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.layers.wrappers.WeightNormalization.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.layers.wrappers.WeightNormalization.output": "tfa.layers.GroupNormalization.output",
+ "tfa.layers.wrappers.WeightNormalization.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.layers.wrappers.WeightNormalization.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.layers.wrappers.WeightNormalization.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.layers.wrappers.WeightNormalization.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.layers.wrappers.WeightNormalization.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.layers.wrappers.WeightNormalization.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.layers.wrappers.WeightNormalization.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.layers.wrappers.WeightNormalization.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.layers.wrappers.WeightNormalization.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.layers.wrappers.absolute_import": "tfa.activations.absolute_import",
+ "tfa.layers.wrappers.division": "tfa.activations.division",
+ "tfa.layers.wrappers.print_function": "tfa.activations.print_function",
+ "tfa.losses.absolute_import": "tfa.activations.absolute_import",
+ "tfa.losses.contrastive.ContrastiveLoss": "tfa.losses.ContrastiveLoss",
+ "tfa.losses.contrastive.absolute_import": "tfa.activations.absolute_import",
+ "tfa.losses.contrastive.contrastive_loss": "tfa.losses.contrastive_loss",
+ "tfa.losses.contrastive.division": "tfa.activations.division",
+ "tfa.losses.contrastive.print_function": "tfa.activations.print_function",
+ "tfa.losses.division": "tfa.activations.division",
+ "tfa.losses.focal_loss.SigmoidFocalCrossEntropy": "tfa.losses.SigmoidFocalCrossEntropy",
+ "tfa.losses.focal_loss.absolute_import": "tfa.activations.absolute_import",
+ "tfa.losses.focal_loss.division": "tfa.activations.division",
+ "tfa.losses.focal_loss.print_function": "tfa.activations.print_function",
+ "tfa.losses.focal_loss.sigmoid_focal_crossentropy": "tfa.losses.sigmoid_focal_crossentropy",
+ "tfa.losses.lifted.LiftedStructLoss": "tfa.losses.LiftedStructLoss",
+ "tfa.losses.lifted.absolute_import": "tfa.activations.absolute_import",
+ "tfa.losses.lifted.division": "tfa.activations.division",
+ "tfa.losses.lifted.lifted_struct_loss": "tfa.losses.lifted_struct_loss",
+ "tfa.losses.lifted.print_function": "tfa.activations.print_function",
+ "tfa.losses.metric_learning.absolute_import": "tfa.activations.absolute_import",
+ "tfa.losses.metric_learning.division": "tfa.activations.division",
+ "tfa.losses.metric_learning.print_function": "tfa.activations.print_function",
+ "tfa.losses.print_function": "tfa.activations.print_function",
+ "tfa.losses.triplet.TripletSemiHardLoss": "tfa.losses.TripletSemiHardLoss",
+ "tfa.losses.triplet.absolute_import": "tfa.activations.absolute_import",
+ "tfa.losses.triplet.division": "tfa.activations.division",
+ "tfa.losses.triplet.print_function": "tfa.activations.print_function",
+ "tfa.losses.triplet.triplet_semihard_loss": "tfa.losses.triplet_semihard_loss",
+ "tfa.metrics.CohenKappa.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.metrics.CohenKappa.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.metrics.CohenKappa.input": "tfa.layers.GroupNormalization.input",
+ "tfa.metrics.CohenKappa.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.metrics.CohenKappa.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.metrics.CohenKappa.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.metrics.CohenKappa.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.metrics.CohenKappa.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.metrics.CohenKappa.name": "tfa.layers.GroupNormalization.name",
+ "tfa.metrics.CohenKappa.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.metrics.CohenKappa.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.metrics.CohenKappa.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.metrics.CohenKappa.output": "tfa.layers.GroupNormalization.output",
+ "tfa.metrics.CohenKappa.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.metrics.CohenKappa.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.metrics.CohenKappa.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.metrics.CohenKappa.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.metrics.CohenKappa.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.metrics.CohenKappa.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.metrics.CohenKappa.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.metrics.CohenKappa.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.metrics.CohenKappa.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.metrics.absolute_import": "tfa.activations.absolute_import",
+ "tfa.metrics.cohens_kappa.CohenKappa": "tfa.metrics.CohenKappa",
+ "tfa.metrics.cohens_kappa.CohenKappa.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.metrics.cohens_kappa.CohenKappa.dtype": "tfa.metrics.CohenKappa.dtype",
+ "tfa.metrics.cohens_kappa.CohenKappa.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.metrics.cohens_kappa.CohenKappa.input": "tfa.layers.GroupNormalization.input",
+ "tfa.metrics.cohens_kappa.CohenKappa.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.metrics.cohens_kappa.CohenKappa.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.metrics.cohens_kappa.CohenKappa.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.metrics.cohens_kappa.CohenKappa.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.metrics.cohens_kappa.CohenKappa.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.metrics.cohens_kappa.CohenKappa.name": "tfa.layers.GroupNormalization.name",
+ "tfa.metrics.cohens_kappa.CohenKappa.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.metrics.cohens_kappa.CohenKappa.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.metrics.cohens_kappa.CohenKappa.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.metrics.cohens_kappa.CohenKappa.output": "tfa.layers.GroupNormalization.output",
+ "tfa.metrics.cohens_kappa.CohenKappa.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.metrics.cohens_kappa.CohenKappa.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.metrics.cohens_kappa.CohenKappa.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.metrics.cohens_kappa.CohenKappa.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.metrics.cohens_kappa.CohenKappa.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.metrics.cohens_kappa.CohenKappa.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.metrics.cohens_kappa.CohenKappa.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.metrics.cohens_kappa.CohenKappa.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.metrics.cohens_kappa.CohenKappa.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.metrics.cohens_kappa.absolute_import": "tfa.activations.absolute_import",
+ "tfa.metrics.cohens_kappa.division": "tfa.activations.division",
+ "tfa.metrics.cohens_kappa.print_function": "tfa.activations.print_function",
+ "tfa.metrics.division": "tfa.activations.division",
+ "tfa.metrics.print_function": "tfa.activations.print_function",
+ "tfa.optimizers.LazyAdam.iterations": "tfa.optimizers.AdamW.iterations",
+ "tfa.optimizers.LazyAdam.weights": "tfa.optimizers.AdamW.weights",
+ "tfa.optimizers.MovingAverage.iterations": "tfa.optimizers.AdamW.iterations",
+ "tfa.optimizers.SGDW.iterations": "tfa.optimizers.AdamW.iterations",
+ "tfa.optimizers.SGDW.weights": "tfa.optimizers.AdamW.weights",
+ "tfa.optimizers.absolute_import": "tfa.activations.absolute_import",
+ "tfa.optimizers.division": "tfa.activations.division",
+ "tfa.optimizers.lazy_adam.LazyAdam": "tfa.optimizers.LazyAdam",
+ "tfa.optimizers.lazy_adam.LazyAdam.iterations": "tfa.optimizers.AdamW.iterations",
+ "tfa.optimizers.lazy_adam.LazyAdam.weights": "tfa.optimizers.AdamW.weights",
+ "tfa.optimizers.lazy_adam.absolute_import": "tfa.activations.absolute_import",
+ "tfa.optimizers.lazy_adam.division": "tfa.activations.division",
+ "tfa.optimizers.lazy_adam.print_function": "tfa.activations.print_function",
+ "tfa.optimizers.moving_average.MovingAverage": "tfa.optimizers.MovingAverage",
+ "tfa.optimizers.moving_average.MovingAverage.iterations": "tfa.optimizers.AdamW.iterations",
+ "tfa.optimizers.moving_average.MovingAverage.weights": "tfa.optimizers.MovingAverage.weights",
+ "tfa.optimizers.moving_average.absolute_import": "tfa.activations.absolute_import",
+ "tfa.optimizers.moving_average.division": "tfa.activations.division",
+ "tfa.optimizers.moving_average.print_function": "tfa.activations.print_function",
+ "tfa.optimizers.print_function": "tfa.activations.print_function",
+ "tfa.optimizers.weight_decay_optimizers.AdamW": "tfa.optimizers.AdamW",
+ "tfa.optimizers.weight_decay_optimizers.AdamW.iterations": "tfa.optimizers.AdamW.iterations",
+ "tfa.optimizers.weight_decay_optimizers.AdamW.weights": "tfa.optimizers.AdamW.weights",
+ "tfa.optimizers.weight_decay_optimizers.SGDW": "tfa.optimizers.SGDW",
+ "tfa.optimizers.weight_decay_optimizers.SGDW.iterations": "tfa.optimizers.AdamW.iterations",
+ "tfa.optimizers.weight_decay_optimizers.SGDW.weights": "tfa.optimizers.AdamW.weights",
+ "tfa.optimizers.weight_decay_optimizers.absolute_import": "tfa.activations.absolute_import",
+ "tfa.optimizers.weight_decay_optimizers.division": "tfa.activations.division",
+ "tfa.optimizers.weight_decay_optimizers.extend_with_decoupled_weight_decay": "tfa.optimizers.extend_with_decoupled_weight_decay",
+ "tfa.optimizers.weight_decay_optimizers.print_function": "tfa.activations.print_function",
+ "tfa.rnn.LayerNormLSTMCell.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.rnn.LayerNormLSTMCell.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.rnn.LayerNormLSTMCell.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.rnn.LayerNormLSTMCell.input": "tfa.layers.GroupNormalization.input",
+ "tfa.rnn.LayerNormLSTMCell.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.rnn.LayerNormLSTMCell.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.rnn.LayerNormLSTMCell.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.rnn.LayerNormLSTMCell.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.rnn.LayerNormLSTMCell.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.rnn.LayerNormLSTMCell.name": "tfa.layers.GroupNormalization.name",
+ "tfa.rnn.LayerNormLSTMCell.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.rnn.LayerNormLSTMCell.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.rnn.LayerNormLSTMCell.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.rnn.LayerNormLSTMCell.output": "tfa.layers.GroupNormalization.output",
+ "tfa.rnn.LayerNormLSTMCell.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.rnn.LayerNormLSTMCell.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.rnn.LayerNormLSTMCell.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.rnn.LayerNormLSTMCell.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.rnn.LayerNormLSTMCell.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.rnn.LayerNormLSTMCell.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.rnn.LayerNormLSTMCell.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.rnn.LayerNormLSTMCell.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.rnn.LayerNormLSTMCell.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.rnn.NASCell.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.rnn.NASCell.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.rnn.NASCell.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.rnn.NASCell.input": "tfa.layers.GroupNormalization.input",
+ "tfa.rnn.NASCell.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.rnn.NASCell.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.rnn.NASCell.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.rnn.NASCell.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.rnn.NASCell.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.rnn.NASCell.name": "tfa.layers.GroupNormalization.name",
+ "tfa.rnn.NASCell.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.rnn.NASCell.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.rnn.NASCell.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.rnn.NASCell.output": "tfa.layers.GroupNormalization.output",
+ "tfa.rnn.NASCell.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.rnn.NASCell.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.rnn.NASCell.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.rnn.NASCell.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.rnn.NASCell.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.rnn.NASCell.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.rnn.NASCell.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.rnn.NASCell.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.rnn.NASCell.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.rnn.absolute_import": "tfa.activations.absolute_import",
+ "tfa.rnn.cell.LayerNormLSTMCell": "tfa.rnn.LayerNormLSTMCell",
+ "tfa.rnn.cell.LayerNormLSTMCell.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.rnn.cell.LayerNormLSTMCell.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.rnn.cell.LayerNormLSTMCell.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.rnn.cell.LayerNormLSTMCell.input": "tfa.layers.GroupNormalization.input",
+ "tfa.rnn.cell.LayerNormLSTMCell.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.rnn.cell.LayerNormLSTMCell.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.rnn.cell.LayerNormLSTMCell.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.rnn.cell.LayerNormLSTMCell.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.rnn.cell.LayerNormLSTMCell.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.rnn.cell.LayerNormLSTMCell.name": "tfa.layers.GroupNormalization.name",
+ "tfa.rnn.cell.LayerNormLSTMCell.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.rnn.cell.LayerNormLSTMCell.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.rnn.cell.LayerNormLSTMCell.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.rnn.cell.LayerNormLSTMCell.output": "tfa.layers.GroupNormalization.output",
+ "tfa.rnn.cell.LayerNormLSTMCell.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.rnn.cell.LayerNormLSTMCell.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.rnn.cell.LayerNormLSTMCell.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.rnn.cell.LayerNormLSTMCell.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.rnn.cell.LayerNormLSTMCell.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.rnn.cell.LayerNormLSTMCell.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.rnn.cell.LayerNormLSTMCell.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.rnn.cell.LayerNormLSTMCell.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.rnn.cell.LayerNormLSTMCell.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.rnn.cell.NASCell": "tfa.rnn.NASCell",
+ "tfa.rnn.cell.NASCell.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.rnn.cell.NASCell.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.rnn.cell.NASCell.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.rnn.cell.NASCell.input": "tfa.layers.GroupNormalization.input",
+ "tfa.rnn.cell.NASCell.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.rnn.cell.NASCell.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.rnn.cell.NASCell.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.rnn.cell.NASCell.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.rnn.cell.NASCell.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.rnn.cell.NASCell.name": "tfa.layers.GroupNormalization.name",
+ "tfa.rnn.cell.NASCell.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.rnn.cell.NASCell.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.rnn.cell.NASCell.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.rnn.cell.NASCell.output": "tfa.layers.GroupNormalization.output",
+ "tfa.rnn.cell.NASCell.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.rnn.cell.NASCell.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.rnn.cell.NASCell.output_size": "tfa.rnn.NASCell.output_size",
+ "tfa.rnn.cell.NASCell.state_size": "tfa.rnn.NASCell.state_size",
+ "tfa.rnn.cell.NASCell.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.rnn.cell.NASCell.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.rnn.cell.NASCell.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.rnn.cell.NASCell.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.rnn.cell.NASCell.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.rnn.cell.NASCell.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.rnn.cell.NASCell.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.rnn.cell.absolute_import": "tfa.activations.absolute_import",
+ "tfa.rnn.cell.division": "tfa.activations.division",
+ "tfa.rnn.cell.print_function": "tfa.activations.print_function",
+ "tfa.rnn.division": "tfa.activations.division",
+ "tfa.rnn.print_function": "tfa.activations.print_function",
+ "tfa.seq2seq.AttentionWrapper.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.seq2seq.AttentionWrapper.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.seq2seq.AttentionWrapper.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.seq2seq.AttentionWrapper.input": "tfa.layers.GroupNormalization.input",
+ "tfa.seq2seq.AttentionWrapper.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.seq2seq.AttentionWrapper.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.seq2seq.AttentionWrapper.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.seq2seq.AttentionWrapper.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.seq2seq.AttentionWrapper.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.seq2seq.AttentionWrapper.name": "tfa.layers.GroupNormalization.name",
+ "tfa.seq2seq.AttentionWrapper.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.seq2seq.AttentionWrapper.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.seq2seq.AttentionWrapper.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.seq2seq.AttentionWrapper.output": "tfa.layers.GroupNormalization.output",
+ "tfa.seq2seq.AttentionWrapper.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.seq2seq.AttentionWrapper.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.seq2seq.AttentionWrapper.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.seq2seq.AttentionWrapper.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.seq2seq.AttentionWrapper.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.seq2seq.AttentionWrapper.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.seq2seq.AttentionWrapper.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.seq2seq.AttentionWrapper.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.seq2seq.AttentionWrapper.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.seq2seq.AttentionWrapperState.__init__": "tfa.seq2seq.AttentionMechanism.__init__",
+ "tfa.seq2seq.BahdanauAttention.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.seq2seq.BahdanauAttention.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.seq2seq.BahdanauAttention.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.seq2seq.BahdanauAttention.input": "tfa.layers.GroupNormalization.input",
+ "tfa.seq2seq.BahdanauAttention.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.seq2seq.BahdanauAttention.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.seq2seq.BahdanauAttention.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.seq2seq.BahdanauAttention.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.seq2seq.BahdanauAttention.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.seq2seq.BahdanauAttention.name": "tfa.layers.GroupNormalization.name",
+ "tfa.seq2seq.BahdanauAttention.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.seq2seq.BahdanauAttention.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.seq2seq.BahdanauAttention.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.seq2seq.BahdanauAttention.output": "tfa.layers.GroupNormalization.output",
+ "tfa.seq2seq.BahdanauAttention.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.seq2seq.BahdanauAttention.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.seq2seq.BahdanauAttention.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.seq2seq.BahdanauAttention.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.seq2seq.BahdanauAttention.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.seq2seq.BahdanauAttention.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.seq2seq.BahdanauAttention.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.seq2seq.BahdanauAttention.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.seq2seq.BahdanauAttention.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.seq2seq.BahdanauMonotonicAttention.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.seq2seq.BahdanauMonotonicAttention.alignments_size": "tfa.seq2seq.BahdanauAttention.alignments_size",
+ "tfa.seq2seq.BahdanauMonotonicAttention.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.seq2seq.BahdanauMonotonicAttention.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.seq2seq.BahdanauMonotonicAttention.input": "tfa.layers.GroupNormalization.input",
+ "tfa.seq2seq.BahdanauMonotonicAttention.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.seq2seq.BahdanauMonotonicAttention.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.seq2seq.BahdanauMonotonicAttention.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.seq2seq.BahdanauMonotonicAttention.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.seq2seq.BahdanauMonotonicAttention.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.seq2seq.BahdanauMonotonicAttention.name": "tfa.layers.GroupNormalization.name",
+ "tfa.seq2seq.BahdanauMonotonicAttention.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.seq2seq.BahdanauMonotonicAttention.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.seq2seq.BahdanauMonotonicAttention.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.seq2seq.BahdanauMonotonicAttention.output": "tfa.layers.GroupNormalization.output",
+ "tfa.seq2seq.BahdanauMonotonicAttention.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.seq2seq.BahdanauMonotonicAttention.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.seq2seq.BahdanauMonotonicAttention.state_size": "tfa.seq2seq.BahdanauAttention.state_size",
+ "tfa.seq2seq.BahdanauMonotonicAttention.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.seq2seq.BahdanauMonotonicAttention.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.seq2seq.BahdanauMonotonicAttention.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.seq2seq.BahdanauMonotonicAttention.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.seq2seq.BahdanauMonotonicAttention.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.seq2seq.BahdanauMonotonicAttention.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.seq2seq.BahdanauMonotonicAttention.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.seq2seq.BaseDecoder.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.seq2seq.BaseDecoder.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.seq2seq.BaseDecoder.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.seq2seq.BaseDecoder.input": "tfa.layers.GroupNormalization.input",
+ "tfa.seq2seq.BaseDecoder.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.seq2seq.BaseDecoder.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.seq2seq.BaseDecoder.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.seq2seq.BaseDecoder.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.seq2seq.BaseDecoder.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.seq2seq.BaseDecoder.name": "tfa.layers.GroupNormalization.name",
+ "tfa.seq2seq.BaseDecoder.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.seq2seq.BaseDecoder.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.seq2seq.BaseDecoder.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.seq2seq.BaseDecoder.output": "tfa.layers.GroupNormalization.output",
+ "tfa.seq2seq.BaseDecoder.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.seq2seq.BaseDecoder.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.seq2seq.BaseDecoder.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.seq2seq.BaseDecoder.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.seq2seq.BaseDecoder.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.seq2seq.BaseDecoder.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.seq2seq.BaseDecoder.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.seq2seq.BaseDecoder.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.seq2seq.BaseDecoder.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.seq2seq.BasicDecoder.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.seq2seq.BasicDecoder.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.seq2seq.BasicDecoder.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.seq2seq.BasicDecoder.input": "tfa.layers.GroupNormalization.input",
+ "tfa.seq2seq.BasicDecoder.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.seq2seq.BasicDecoder.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.seq2seq.BasicDecoder.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.seq2seq.BasicDecoder.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.seq2seq.BasicDecoder.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.seq2seq.BasicDecoder.name": "tfa.layers.GroupNormalization.name",
+ "tfa.seq2seq.BasicDecoder.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.seq2seq.BasicDecoder.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.seq2seq.BasicDecoder.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.seq2seq.BasicDecoder.output": "tfa.layers.GroupNormalization.output",
+ "tfa.seq2seq.BasicDecoder.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.seq2seq.BasicDecoder.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.seq2seq.BasicDecoder.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.seq2seq.BasicDecoder.tracks_own_finished": "tfa.seq2seq.BaseDecoder.tracks_own_finished",
+ "tfa.seq2seq.BasicDecoder.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.seq2seq.BasicDecoder.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.seq2seq.BasicDecoder.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.seq2seq.BasicDecoder.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.seq2seq.BasicDecoder.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.seq2seq.BasicDecoder.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.seq2seq.BasicDecoderOutput.__add__": "tfa.seq2seq.AttentionWrapperState.__add__",
+ "tfa.seq2seq.BasicDecoderOutput.__contains__": "tfa.seq2seq.AttentionWrapperState.__contains__",
+ "tfa.seq2seq.BasicDecoderOutput.__eq__": "tfa.seq2seq.AttentionWrapperState.__eq__",
+ "tfa.seq2seq.BasicDecoderOutput.__ge__": "tfa.seq2seq.AttentionWrapperState.__ge__",
+ "tfa.seq2seq.BasicDecoderOutput.__getitem__": "tfa.seq2seq.AttentionWrapperState.__getitem__",
+ "tfa.seq2seq.BasicDecoderOutput.__gt__": "tfa.seq2seq.AttentionWrapperState.__gt__",
+ "tfa.seq2seq.BasicDecoderOutput.__init__": "tfa.seq2seq.AttentionMechanism.__init__",
+ "tfa.seq2seq.BasicDecoderOutput.__iter__": "tfa.seq2seq.AttentionWrapperState.__iter__",
+ "tfa.seq2seq.BasicDecoderOutput.__le__": "tfa.seq2seq.AttentionWrapperState.__le__",
+ "tfa.seq2seq.BasicDecoderOutput.__len__": "tfa.seq2seq.AttentionWrapperState.__len__",
+ "tfa.seq2seq.BasicDecoderOutput.__lt__": "tfa.seq2seq.AttentionWrapperState.__lt__",
+ "tfa.seq2seq.BasicDecoderOutput.__mul__": "tfa.seq2seq.AttentionWrapperState.__mul__",
+ "tfa.seq2seq.BasicDecoderOutput.__ne__": "tfa.seq2seq.AttentionWrapperState.__ne__",
+ "tfa.seq2seq.BasicDecoderOutput.__rmul__": "tfa.seq2seq.AttentionWrapperState.__rmul__",
+ "tfa.seq2seq.BasicDecoderOutput.count": "tfa.seq2seq.AttentionWrapperState.count",
+ "tfa.seq2seq.BasicDecoderOutput.index": "tfa.seq2seq.AttentionWrapperState.index",
+ "tfa.seq2seq.BeamSearchDecoder.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.seq2seq.BeamSearchDecoder.batch_size": "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderMixin.batch_size",
+ "tfa.seq2seq.BeamSearchDecoder.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.seq2seq.BeamSearchDecoder.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.seq2seq.BeamSearchDecoder.input": "tfa.layers.GroupNormalization.input",
+ "tfa.seq2seq.BeamSearchDecoder.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.seq2seq.BeamSearchDecoder.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.seq2seq.BeamSearchDecoder.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.seq2seq.BeamSearchDecoder.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.seq2seq.BeamSearchDecoder.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.seq2seq.BeamSearchDecoder.name": "tfa.layers.GroupNormalization.name",
+ "tfa.seq2seq.BeamSearchDecoder.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.seq2seq.BeamSearchDecoder.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.seq2seq.BeamSearchDecoder.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.seq2seq.BeamSearchDecoder.output": "tfa.layers.GroupNormalization.output",
+ "tfa.seq2seq.BeamSearchDecoder.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.seq2seq.BeamSearchDecoder.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.seq2seq.BeamSearchDecoder.output_size": "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderMixin.output_size",
+ "tfa.seq2seq.BeamSearchDecoder.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.seq2seq.BeamSearchDecoder.tracks_own_finished": "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderMixin.tracks_own_finished",
+ "tfa.seq2seq.BeamSearchDecoder.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.seq2seq.BeamSearchDecoder.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.seq2seq.BeamSearchDecoder.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.seq2seq.BeamSearchDecoder.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.seq2seq.BeamSearchDecoder.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.seq2seq.BeamSearchDecoder.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.seq2seq.BeamSearchDecoderOutput.__add__": "tfa.seq2seq.AttentionWrapperState.__add__",
+ "tfa.seq2seq.BeamSearchDecoderOutput.__contains__": "tfa.seq2seq.AttentionWrapperState.__contains__",
+ "tfa.seq2seq.BeamSearchDecoderOutput.__eq__": "tfa.seq2seq.AttentionWrapperState.__eq__",
+ "tfa.seq2seq.BeamSearchDecoderOutput.__ge__": "tfa.seq2seq.AttentionWrapperState.__ge__",
+ "tfa.seq2seq.BeamSearchDecoderOutput.__getitem__": "tfa.seq2seq.AttentionWrapperState.__getitem__",
+ "tfa.seq2seq.BeamSearchDecoderOutput.__gt__": "tfa.seq2seq.AttentionWrapperState.__gt__",
+ "tfa.seq2seq.BeamSearchDecoderOutput.__init__": "tfa.seq2seq.AttentionMechanism.__init__",
+ "tfa.seq2seq.BeamSearchDecoderOutput.__iter__": "tfa.seq2seq.AttentionWrapperState.__iter__",
+ "tfa.seq2seq.BeamSearchDecoderOutput.__le__": "tfa.seq2seq.AttentionWrapperState.__le__",
+ "tfa.seq2seq.BeamSearchDecoderOutput.__len__": "tfa.seq2seq.AttentionWrapperState.__len__",
+ "tfa.seq2seq.BeamSearchDecoderOutput.__lt__": "tfa.seq2seq.AttentionWrapperState.__lt__",
+ "tfa.seq2seq.BeamSearchDecoderOutput.__mul__": "tfa.seq2seq.AttentionWrapperState.__mul__",
+ "tfa.seq2seq.BeamSearchDecoderOutput.__ne__": "tfa.seq2seq.AttentionWrapperState.__ne__",
+ "tfa.seq2seq.BeamSearchDecoderOutput.__rmul__": "tfa.seq2seq.AttentionWrapperState.__rmul__",
+ "tfa.seq2seq.BeamSearchDecoderOutput.count": "tfa.seq2seq.AttentionWrapperState.count",
+ "tfa.seq2seq.BeamSearchDecoderOutput.index": "tfa.seq2seq.AttentionWrapperState.index",
+ "tfa.seq2seq.BeamSearchDecoderState.__add__": "tfa.seq2seq.AttentionWrapperState.__add__",
+ "tfa.seq2seq.BeamSearchDecoderState.__contains__": "tfa.seq2seq.AttentionWrapperState.__contains__",
+ "tfa.seq2seq.BeamSearchDecoderState.__eq__": "tfa.seq2seq.AttentionWrapperState.__eq__",
+ "tfa.seq2seq.BeamSearchDecoderState.__ge__": "tfa.seq2seq.AttentionWrapperState.__ge__",
+ "tfa.seq2seq.BeamSearchDecoderState.__getitem__": "tfa.seq2seq.AttentionWrapperState.__getitem__",
+ "tfa.seq2seq.BeamSearchDecoderState.__gt__": "tfa.seq2seq.AttentionWrapperState.__gt__",
+ "tfa.seq2seq.BeamSearchDecoderState.__init__": "tfa.seq2seq.AttentionMechanism.__init__",
+ "tfa.seq2seq.BeamSearchDecoderState.__iter__": "tfa.seq2seq.AttentionWrapperState.__iter__",
+ "tfa.seq2seq.BeamSearchDecoderState.__le__": "tfa.seq2seq.AttentionWrapperState.__le__",
+ "tfa.seq2seq.BeamSearchDecoderState.__len__": "tfa.seq2seq.AttentionWrapperState.__len__",
+ "tfa.seq2seq.BeamSearchDecoderState.__lt__": "tfa.seq2seq.AttentionWrapperState.__lt__",
+ "tfa.seq2seq.BeamSearchDecoderState.__mul__": "tfa.seq2seq.AttentionWrapperState.__mul__",
+ "tfa.seq2seq.BeamSearchDecoderState.__ne__": "tfa.seq2seq.AttentionWrapperState.__ne__",
+ "tfa.seq2seq.BeamSearchDecoderState.__rmul__": "tfa.seq2seq.AttentionWrapperState.__rmul__",
+ "tfa.seq2seq.BeamSearchDecoderState.count": "tfa.seq2seq.AttentionWrapperState.count",
+ "tfa.seq2seq.BeamSearchDecoderState.index": "tfa.seq2seq.AttentionWrapperState.index",
+ "tfa.seq2seq.Decoder.__init__": "tfa.seq2seq.AttentionMechanism.__init__",
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__add__": "tfa.seq2seq.AttentionWrapperState.__add__",
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__contains__": "tfa.seq2seq.AttentionWrapperState.__contains__",
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__eq__": "tfa.seq2seq.AttentionWrapperState.__eq__",
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__ge__": "tfa.seq2seq.AttentionWrapperState.__ge__",
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__getitem__": "tfa.seq2seq.AttentionWrapperState.__getitem__",
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__gt__": "tfa.seq2seq.AttentionWrapperState.__gt__",
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__init__": "tfa.seq2seq.AttentionMechanism.__init__",
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__iter__": "tfa.seq2seq.AttentionWrapperState.__iter__",
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__le__": "tfa.seq2seq.AttentionWrapperState.__le__",
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__len__": "tfa.seq2seq.AttentionWrapperState.__len__",
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__lt__": "tfa.seq2seq.AttentionWrapperState.__lt__",
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__mul__": "tfa.seq2seq.AttentionWrapperState.__mul__",
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__ne__": "tfa.seq2seq.AttentionWrapperState.__ne__",
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__rmul__": "tfa.seq2seq.AttentionWrapperState.__rmul__",
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.count": "tfa.seq2seq.AttentionWrapperState.count",
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.index": "tfa.seq2seq.AttentionWrapperState.index",
+ "tfa.seq2seq.LuongAttention.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.seq2seq.LuongAttention.alignments_size": "tfa.seq2seq.BahdanauAttention.alignments_size",
+ "tfa.seq2seq.LuongAttention.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.seq2seq.LuongAttention.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.seq2seq.LuongAttention.input": "tfa.layers.GroupNormalization.input",
+ "tfa.seq2seq.LuongAttention.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.seq2seq.LuongAttention.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.seq2seq.LuongAttention.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.seq2seq.LuongAttention.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.seq2seq.LuongAttention.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.seq2seq.LuongAttention.name": "tfa.layers.GroupNormalization.name",
+ "tfa.seq2seq.LuongAttention.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.seq2seq.LuongAttention.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.seq2seq.LuongAttention.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.seq2seq.LuongAttention.output": "tfa.layers.GroupNormalization.output",
+ "tfa.seq2seq.LuongAttention.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.seq2seq.LuongAttention.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.seq2seq.LuongAttention.state_size": "tfa.seq2seq.BahdanauAttention.state_size",
+ "tfa.seq2seq.LuongAttention.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.seq2seq.LuongAttention.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.seq2seq.LuongAttention.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.seq2seq.LuongAttention.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.seq2seq.LuongAttention.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.seq2seq.LuongAttention.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.seq2seq.LuongAttention.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.seq2seq.LuongMonotonicAttention.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.seq2seq.LuongMonotonicAttention.alignments_size": "tfa.seq2seq.BahdanauAttention.alignments_size",
+ "tfa.seq2seq.LuongMonotonicAttention.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.seq2seq.LuongMonotonicAttention.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.seq2seq.LuongMonotonicAttention.input": "tfa.layers.GroupNormalization.input",
+ "tfa.seq2seq.LuongMonotonicAttention.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.seq2seq.LuongMonotonicAttention.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.seq2seq.LuongMonotonicAttention.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.seq2seq.LuongMonotonicAttention.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.seq2seq.LuongMonotonicAttention.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.seq2seq.LuongMonotonicAttention.name": "tfa.layers.GroupNormalization.name",
+ "tfa.seq2seq.LuongMonotonicAttention.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.seq2seq.LuongMonotonicAttention.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.seq2seq.LuongMonotonicAttention.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.seq2seq.LuongMonotonicAttention.output": "tfa.layers.GroupNormalization.output",
+ "tfa.seq2seq.LuongMonotonicAttention.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.seq2seq.LuongMonotonicAttention.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.seq2seq.LuongMonotonicAttention.state_size": "tfa.seq2seq.BahdanauAttention.state_size",
+ "tfa.seq2seq.LuongMonotonicAttention.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.seq2seq.LuongMonotonicAttention.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.seq2seq.LuongMonotonicAttention.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.seq2seq.LuongMonotonicAttention.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.seq2seq.LuongMonotonicAttention.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.seq2seq.LuongMonotonicAttention.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.seq2seq.LuongMonotonicAttention.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.seq2seq.SampleEmbeddingSampler.batch_size": "tfa.seq2seq.GreedyEmbeddingSampler.batch_size",
+ "tfa.seq2seq.SampleEmbeddingSampler.sample_ids_dtype": "tfa.seq2seq.GreedyEmbeddingSampler.sample_ids_dtype",
+ "tfa.seq2seq.SampleEmbeddingSampler.sample_ids_shape": "tfa.seq2seq.GreedyEmbeddingSampler.sample_ids_shape",
+ "tfa.seq2seq.Sampler.__init__": "tfa.seq2seq.AttentionMechanism.__init__",
+ "tfa.seq2seq.ScheduledEmbeddingTrainingSampler.batch_size": "tfa.seq2seq.TrainingSampler.batch_size",
+ "tfa.seq2seq.ScheduledEmbeddingTrainingSampler.sample_ids_dtype": "tfa.seq2seq.TrainingSampler.sample_ids_dtype",
+ "tfa.seq2seq.ScheduledEmbeddingTrainingSampler.sample_ids_shape": "tfa.seq2seq.TrainingSampler.sample_ids_shape",
+ "tfa.seq2seq.ScheduledOutputTrainingSampler.batch_size": "tfa.seq2seq.TrainingSampler.batch_size",
+ "tfa.seq2seq.ScheduledOutputTrainingSampler.sample_ids_dtype": "tfa.seq2seq.TrainingSampler.sample_ids_dtype",
+ "tfa.seq2seq.ScheduledOutputTrainingSampler.sample_ids_shape": "tfa.seq2seq.TrainingSampler.sample_ids_shape",
+ "tfa.seq2seq.absolute_import": "tfa.activations.absolute_import",
+ "tfa.seq2seq.attention_wrapper.AttentionMechanism": "tfa.seq2seq.AttentionMechanism",
+ "tfa.seq2seq.attention_wrapper.AttentionMechanism.__init__": "tfa.seq2seq.AttentionMechanism.__init__",
+ "tfa.seq2seq.attention_wrapper.AttentionMechanism.alignments_size": "tfa.seq2seq.AttentionMechanism.alignments_size",
+ "tfa.seq2seq.attention_wrapper.AttentionMechanism.state_size": "tfa.seq2seq.AttentionMechanism.state_size",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper": "tfa.seq2seq.AttentionWrapper",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.input": "tfa.layers.GroupNormalization.input",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.name": "tfa.layers.GroupNormalization.name",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.output": "tfa.layers.GroupNormalization.output",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.output_size": "tfa.seq2seq.AttentionWrapper.output_size",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.state_size": "tfa.seq2seq.AttentionWrapper.state_size",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState": "tfa.seq2seq.AttentionWrapperState",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__add__": "tfa.seq2seq.AttentionWrapperState.__add__",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__contains__": "tfa.seq2seq.AttentionWrapperState.__contains__",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__eq__": "tfa.seq2seq.AttentionWrapperState.__eq__",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__ge__": "tfa.seq2seq.AttentionWrapperState.__ge__",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__getitem__": "tfa.seq2seq.AttentionWrapperState.__getitem__",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__gt__": "tfa.seq2seq.AttentionWrapperState.__gt__",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__init__": "tfa.seq2seq.AttentionMechanism.__init__",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__iter__": "tfa.seq2seq.AttentionWrapperState.__iter__",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__le__": "tfa.seq2seq.AttentionWrapperState.__le__",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__len__": "tfa.seq2seq.AttentionWrapperState.__len__",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__lt__": "tfa.seq2seq.AttentionWrapperState.__lt__",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__mul__": "tfa.seq2seq.AttentionWrapperState.__mul__",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__ne__": "tfa.seq2seq.AttentionWrapperState.__ne__",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__rmul__": "tfa.seq2seq.AttentionWrapperState.__rmul__",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.alignment_history": "tfa.seq2seq.AttentionWrapperState.alignment_history",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.alignments": "tfa.seq2seq.AttentionWrapperState.alignments",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.attention": "tfa.seq2seq.AttentionWrapperState.attention",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.attention_state": "tfa.seq2seq.AttentionWrapperState.attention_state",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.cell_state": "tfa.seq2seq.AttentionWrapperState.cell_state",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.count": "tfa.seq2seq.AttentionWrapperState.count",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.index": "tfa.seq2seq.AttentionWrapperState.index",
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.time": "tfa.seq2seq.AttentionWrapperState.time",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention": "tfa.seq2seq.BahdanauAttention",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.alignments_size": "tfa.seq2seq.BahdanauAttention.alignments_size",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.input": "tfa.layers.GroupNormalization.input",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.name": "tfa.layers.GroupNormalization.name",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.output": "tfa.layers.GroupNormalization.output",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.state_size": "tfa.seq2seq.BahdanauAttention.state_size",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention": "tfa.seq2seq.BahdanauMonotonicAttention",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.alignments_size": "tfa.seq2seq.BahdanauAttention.alignments_size",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.input": "tfa.layers.GroupNormalization.input",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.name": "tfa.layers.GroupNormalization.name",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.output": "tfa.layers.GroupNormalization.output",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.state_size": "tfa.seq2seq.BahdanauAttention.state_size",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.seq2seq.attention_wrapper.LuongAttention": "tfa.seq2seq.LuongAttention",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.alignments_size": "tfa.seq2seq.BahdanauAttention.alignments_size",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.input": "tfa.layers.GroupNormalization.input",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.name": "tfa.layers.GroupNormalization.name",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.output": "tfa.layers.GroupNormalization.output",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.state_size": "tfa.seq2seq.BahdanauAttention.state_size",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.seq2seq.attention_wrapper.LuongAttention.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention": "tfa.seq2seq.LuongMonotonicAttention",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.alignments_size": "tfa.seq2seq.BahdanauAttention.alignments_size",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.input": "tfa.layers.GroupNormalization.input",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.name": "tfa.layers.GroupNormalization.name",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.output": "tfa.layers.GroupNormalization.output",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.state_size": "tfa.seq2seq.BahdanauAttention.state_size",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.seq2seq.attention_wrapper.absolute_import": "tfa.activations.absolute_import",
+ "tfa.seq2seq.attention_wrapper.division": "tfa.activations.division",
+ "tfa.seq2seq.attention_wrapper.hardmax": "tfa.seq2seq.hardmax",
+ "tfa.seq2seq.attention_wrapper.monotonic_attention": "tfa.seq2seq.monotonic_attention",
+ "tfa.seq2seq.attention_wrapper.print_function": "tfa.activations.print_function",
+ "tfa.seq2seq.attention_wrapper.safe_cumprod": "tfa.seq2seq.safe_cumprod",
+ "tfa.seq2seq.basic_decoder.BasicDecoder": "tfa.seq2seq.BasicDecoder",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.batch_size": "tfa.seq2seq.BasicDecoder.batch_size",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.input": "tfa.layers.GroupNormalization.input",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.name": "tfa.layers.GroupNormalization.name",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.output": "tfa.layers.GroupNormalization.output",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.output_dtype": "tfa.seq2seq.BasicDecoder.output_dtype",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.output_size": "tfa.seq2seq.BasicDecoder.output_size",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.tracks_own_finished": "tfa.seq2seq.BaseDecoder.tracks_own_finished",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.seq2seq.basic_decoder.BasicDecoder.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput": "tfa.seq2seq.BasicDecoderOutput",
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__add__": "tfa.seq2seq.AttentionWrapperState.__add__",
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__contains__": "tfa.seq2seq.AttentionWrapperState.__contains__",
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__eq__": "tfa.seq2seq.AttentionWrapperState.__eq__",
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__ge__": "tfa.seq2seq.AttentionWrapperState.__ge__",
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__getitem__": "tfa.seq2seq.AttentionWrapperState.__getitem__",
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__gt__": "tfa.seq2seq.AttentionWrapperState.__gt__",
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__init__": "tfa.seq2seq.AttentionMechanism.__init__",
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__iter__": "tfa.seq2seq.AttentionWrapperState.__iter__",
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__le__": "tfa.seq2seq.AttentionWrapperState.__le__",
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__len__": "tfa.seq2seq.AttentionWrapperState.__len__",
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__lt__": "tfa.seq2seq.AttentionWrapperState.__lt__",
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__mul__": "tfa.seq2seq.AttentionWrapperState.__mul__",
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__ne__": "tfa.seq2seq.AttentionWrapperState.__ne__",
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__rmul__": "tfa.seq2seq.AttentionWrapperState.__rmul__",
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.count": "tfa.seq2seq.AttentionWrapperState.count",
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.index": "tfa.seq2seq.AttentionWrapperState.index",
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.rnn_output": "tfa.seq2seq.BasicDecoderOutput.rnn_output",
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.sample_id": "tfa.seq2seq.BasicDecoderOutput.sample_id",
+ "tfa.seq2seq.basic_decoder.absolute_import": "tfa.activations.absolute_import",
+ "tfa.seq2seq.basic_decoder.division": "tfa.activations.division",
+ "tfa.seq2seq.basic_decoder.print_function": "tfa.activations.print_function",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder": "tfa.seq2seq.BeamSearchDecoder",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.batch_size": "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderMixin.batch_size",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.input": "tfa.layers.GroupNormalization.input",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.name": "tfa.layers.GroupNormalization.name",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.output": "tfa.layers.GroupNormalization.output",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.output_dtype": "tfa.seq2seq.BeamSearchDecoder.output_dtype",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.output_size": "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderMixin.output_size",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.tracks_own_finished": "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderMixin.tracks_own_finished",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput": "tfa.seq2seq.BeamSearchDecoderOutput",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__add__": "tfa.seq2seq.AttentionWrapperState.__add__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__contains__": "tfa.seq2seq.AttentionWrapperState.__contains__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__eq__": "tfa.seq2seq.AttentionWrapperState.__eq__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__ge__": "tfa.seq2seq.AttentionWrapperState.__ge__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__getitem__": "tfa.seq2seq.AttentionWrapperState.__getitem__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__gt__": "tfa.seq2seq.AttentionWrapperState.__gt__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__init__": "tfa.seq2seq.AttentionMechanism.__init__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__iter__": "tfa.seq2seq.AttentionWrapperState.__iter__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__le__": "tfa.seq2seq.AttentionWrapperState.__le__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__len__": "tfa.seq2seq.AttentionWrapperState.__len__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__lt__": "tfa.seq2seq.AttentionWrapperState.__lt__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__mul__": "tfa.seq2seq.AttentionWrapperState.__mul__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__ne__": "tfa.seq2seq.AttentionWrapperState.__ne__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__rmul__": "tfa.seq2seq.AttentionWrapperState.__rmul__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.count": "tfa.seq2seq.AttentionWrapperState.count",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.index": "tfa.seq2seq.AttentionWrapperState.index",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.parent_ids": "tfa.seq2seq.BeamSearchDecoderOutput.parent_ids",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.predicted_ids": "tfa.seq2seq.BeamSearchDecoderOutput.predicted_ids",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.scores": "tfa.seq2seq.BeamSearchDecoderOutput.scores",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState": "tfa.seq2seq.BeamSearchDecoderState",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__add__": "tfa.seq2seq.AttentionWrapperState.__add__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__contains__": "tfa.seq2seq.AttentionWrapperState.__contains__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__eq__": "tfa.seq2seq.AttentionWrapperState.__eq__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__ge__": "tfa.seq2seq.AttentionWrapperState.__ge__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__getitem__": "tfa.seq2seq.AttentionWrapperState.__getitem__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__gt__": "tfa.seq2seq.AttentionWrapperState.__gt__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__init__": "tfa.seq2seq.AttentionMechanism.__init__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__iter__": "tfa.seq2seq.AttentionWrapperState.__iter__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__le__": "tfa.seq2seq.AttentionWrapperState.__le__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__len__": "tfa.seq2seq.AttentionWrapperState.__len__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__lt__": "tfa.seq2seq.AttentionWrapperState.__lt__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__mul__": "tfa.seq2seq.AttentionWrapperState.__mul__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__ne__": "tfa.seq2seq.AttentionWrapperState.__ne__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__rmul__": "tfa.seq2seq.AttentionWrapperState.__rmul__",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.accumulated_attention_probs": "tfa.seq2seq.BeamSearchDecoderState.accumulated_attention_probs",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.cell_state": "tfa.seq2seq.BeamSearchDecoderState.cell_state",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.count": "tfa.seq2seq.AttentionWrapperState.count",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.finished": "tfa.seq2seq.BeamSearchDecoderState.finished",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.index": "tfa.seq2seq.AttentionWrapperState.index",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.lengths": "tfa.seq2seq.BeamSearchDecoderState.lengths",
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.log_probs": "tfa.seq2seq.BeamSearchDecoderState.log_probs",
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput": "tfa.seq2seq.FinalBeamSearchDecoderOutput",
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__add__": "tfa.seq2seq.AttentionWrapperState.__add__",
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__contains__": "tfa.seq2seq.AttentionWrapperState.__contains__",
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__eq__": "tfa.seq2seq.AttentionWrapperState.__eq__",
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__ge__": "tfa.seq2seq.AttentionWrapperState.__ge__",
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__getitem__": "tfa.seq2seq.AttentionWrapperState.__getitem__",
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__gt__": "tfa.seq2seq.AttentionWrapperState.__gt__",
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__init__": "tfa.seq2seq.AttentionMechanism.__init__",
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__iter__": "tfa.seq2seq.AttentionWrapperState.__iter__",
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__le__": "tfa.seq2seq.AttentionWrapperState.__le__",
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__len__": "tfa.seq2seq.AttentionWrapperState.__len__",
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__lt__": "tfa.seq2seq.AttentionWrapperState.__lt__",
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__mul__": "tfa.seq2seq.AttentionWrapperState.__mul__",
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__ne__": "tfa.seq2seq.AttentionWrapperState.__ne__",
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__rmul__": "tfa.seq2seq.AttentionWrapperState.__rmul__",
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.beam_search_decoder_output": "tfa.seq2seq.FinalBeamSearchDecoderOutput.beam_search_decoder_output",
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.count": "tfa.seq2seq.AttentionWrapperState.count",
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.index": "tfa.seq2seq.AttentionWrapperState.index",
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.predicted_ids": "tfa.seq2seq.FinalBeamSearchDecoderOutput.predicted_ids",
+ "tfa.seq2seq.beam_search_decoder.absolute_import": "tfa.activations.absolute_import",
+ "tfa.seq2seq.beam_search_decoder.division": "tfa.activations.division",
+ "tfa.seq2seq.beam_search_decoder.gather_tree_from_array": "tfa.seq2seq.gather_tree_from_array",
+ "tfa.seq2seq.beam_search_decoder.print_function": "tfa.activations.print_function",
+ "tfa.seq2seq.beam_search_decoder.tile_batch": "tfa.seq2seq.tile_batch",
+ "tfa.seq2seq.decoder.BaseDecoder": "tfa.seq2seq.BaseDecoder",
+ "tfa.seq2seq.decoder.BaseDecoder.activity_regularizer": "tfa.layers.GroupNormalization.activity_regularizer",
+ "tfa.seq2seq.decoder.BaseDecoder.batch_size": "tfa.seq2seq.BaseDecoder.batch_size",
+ "tfa.seq2seq.decoder.BaseDecoder.dtype": "tfa.layers.GroupNormalization.dtype",
+ "tfa.seq2seq.decoder.BaseDecoder.dynamic": "tfa.layers.GroupNormalization.dynamic",
+ "tfa.seq2seq.decoder.BaseDecoder.input": "tfa.layers.GroupNormalization.input",
+ "tfa.seq2seq.decoder.BaseDecoder.input_mask": "tfa.layers.GroupNormalization.input_mask",
+ "tfa.seq2seq.decoder.BaseDecoder.input_shape": "tfa.layers.GroupNormalization.input_shape",
+ "tfa.seq2seq.decoder.BaseDecoder.input_spec": "tfa.layers.GroupNormalization.input_spec",
+ "tfa.seq2seq.decoder.BaseDecoder.losses": "tfa.layers.GroupNormalization.losses",
+ "tfa.seq2seq.decoder.BaseDecoder.metrics": "tfa.layers.GroupNormalization.metrics",
+ "tfa.seq2seq.decoder.BaseDecoder.name": "tfa.layers.GroupNormalization.name",
+ "tfa.seq2seq.decoder.BaseDecoder.name_scope": "tfa.layers.GroupNormalization.name_scope",
+ "tfa.seq2seq.decoder.BaseDecoder.non_trainable_variables": "tfa.layers.GroupNormalization.non_trainable_variables",
+ "tfa.seq2seq.decoder.BaseDecoder.non_trainable_weights": "tfa.layers.GroupNormalization.non_trainable_weights",
+ "tfa.seq2seq.decoder.BaseDecoder.output": "tfa.layers.GroupNormalization.output",
+ "tfa.seq2seq.decoder.BaseDecoder.output_dtype": "tfa.seq2seq.BaseDecoder.output_dtype",
+ "tfa.seq2seq.decoder.BaseDecoder.output_mask": "tfa.layers.GroupNormalization.output_mask",
+ "tfa.seq2seq.decoder.BaseDecoder.output_shape": "tfa.layers.GroupNormalization.output_shape",
+ "tfa.seq2seq.decoder.BaseDecoder.output_size": "tfa.seq2seq.BaseDecoder.output_size",
+ "tfa.seq2seq.decoder.BaseDecoder.submodules": "tfa.layers.GroupNormalization.submodules",
+ "tfa.seq2seq.decoder.BaseDecoder.tracks_own_finished": "tfa.seq2seq.BaseDecoder.tracks_own_finished",
+ "tfa.seq2seq.decoder.BaseDecoder.trainable": "tfa.layers.GroupNormalization.trainable",
+ "tfa.seq2seq.decoder.BaseDecoder.trainable_variables": "tfa.layers.GroupNormalization.trainable_variables",
+ "tfa.seq2seq.decoder.BaseDecoder.trainable_weights": "tfa.layers.GroupNormalization.trainable_weights",
+ "tfa.seq2seq.decoder.BaseDecoder.updates": "tfa.layers.GroupNormalization.updates",
+ "tfa.seq2seq.decoder.BaseDecoder.variables": "tfa.layers.GroupNormalization.variables",
+ "tfa.seq2seq.decoder.BaseDecoder.weights": "tfa.layers.GroupNormalization.weights",
+ "tfa.seq2seq.decoder.Decoder": "tfa.seq2seq.Decoder",
+ "tfa.seq2seq.decoder.Decoder.__init__": "tfa.seq2seq.AttentionMechanism.__init__",
+ "tfa.seq2seq.decoder.Decoder.batch_size": "tfa.seq2seq.Decoder.batch_size",
+ "tfa.seq2seq.decoder.Decoder.output_dtype": "tfa.seq2seq.Decoder.output_dtype",
+ "tfa.seq2seq.decoder.Decoder.output_size": "tfa.seq2seq.Decoder.output_size",
+ "tfa.seq2seq.decoder.Decoder.tracks_own_finished": "tfa.seq2seq.Decoder.tracks_own_finished",
+ "tfa.seq2seq.decoder.absolute_import": "tfa.activations.absolute_import",
+ "tfa.seq2seq.decoder.division": "tfa.activations.division",
+ "tfa.seq2seq.decoder.dynamic_decode": "tfa.seq2seq.dynamic_decode",
+ "tfa.seq2seq.decoder.print_function": "tfa.activations.print_function",
+ "tfa.seq2seq.division": "tfa.activations.division",
+ "tfa.seq2seq.loss.SequenceLoss": "tfa.seq2seq.SequenceLoss",
+ "tfa.seq2seq.loss.absolute_import": "tfa.activations.absolute_import",
+ "tfa.seq2seq.loss.division": "tfa.activations.division",
+ "tfa.seq2seq.loss.print_function": "tfa.activations.print_function",
+ "tfa.seq2seq.loss.sequence_loss": "tfa.seq2seq.sequence_loss",
+ "tfa.seq2seq.print_function": "tfa.activations.print_function",
+ "tfa.seq2seq.sampler.CustomSampler": "tfa.seq2seq.CustomSampler",
+ "tfa.seq2seq.sampler.CustomSampler.batch_size": "tfa.seq2seq.CustomSampler.batch_size",
+ "tfa.seq2seq.sampler.CustomSampler.sample_ids_dtype": "tfa.seq2seq.CustomSampler.sample_ids_dtype",
+ "tfa.seq2seq.sampler.CustomSampler.sample_ids_shape": "tfa.seq2seq.CustomSampler.sample_ids_shape",
+ "tfa.seq2seq.sampler.GreedyEmbeddingSampler": "tfa.seq2seq.GreedyEmbeddingSampler",
+ "tfa.seq2seq.sampler.GreedyEmbeddingSampler.batch_size": "tfa.seq2seq.GreedyEmbeddingSampler.batch_size",
+ "tfa.seq2seq.sampler.GreedyEmbeddingSampler.sample_ids_dtype": "tfa.seq2seq.GreedyEmbeddingSampler.sample_ids_dtype",
+ "tfa.seq2seq.sampler.GreedyEmbeddingSampler.sample_ids_shape": "tfa.seq2seq.GreedyEmbeddingSampler.sample_ids_shape",
+ "tfa.seq2seq.sampler.InferenceSampler": "tfa.seq2seq.InferenceSampler",
+ "tfa.seq2seq.sampler.InferenceSampler.batch_size": "tfa.seq2seq.InferenceSampler.batch_size",
+ "tfa.seq2seq.sampler.InferenceSampler.sample_ids_dtype": "tfa.seq2seq.InferenceSampler.sample_ids_dtype",
+ "tfa.seq2seq.sampler.InferenceSampler.sample_ids_shape": "tfa.seq2seq.InferenceSampler.sample_ids_shape",
+ "tfa.seq2seq.sampler.SampleEmbeddingSampler": "tfa.seq2seq.SampleEmbeddingSampler",
+ "tfa.seq2seq.sampler.SampleEmbeddingSampler.batch_size": "tfa.seq2seq.GreedyEmbeddingSampler.batch_size",
+ "tfa.seq2seq.sampler.SampleEmbeddingSampler.sample_ids_dtype": "tfa.seq2seq.GreedyEmbeddingSampler.sample_ids_dtype",
+ "tfa.seq2seq.sampler.SampleEmbeddingSampler.sample_ids_shape": "tfa.seq2seq.GreedyEmbeddingSampler.sample_ids_shape",
+ "tfa.seq2seq.sampler.Sampler": "tfa.seq2seq.Sampler",
+ "tfa.seq2seq.sampler.Sampler.__init__": "tfa.seq2seq.AttentionMechanism.__init__",
+ "tfa.seq2seq.sampler.Sampler.batch_size": "tfa.seq2seq.Sampler.batch_size",
+ "tfa.seq2seq.sampler.Sampler.sample_ids_dtype": "tfa.seq2seq.Sampler.sample_ids_dtype",
+ "tfa.seq2seq.sampler.Sampler.sample_ids_shape": "tfa.seq2seq.Sampler.sample_ids_shape",
+ "tfa.seq2seq.sampler.ScheduledEmbeddingTrainingSampler": "tfa.seq2seq.ScheduledEmbeddingTrainingSampler",
+ "tfa.seq2seq.sampler.ScheduledEmbeddingTrainingSampler.batch_size": "tfa.seq2seq.TrainingSampler.batch_size",
+ "tfa.seq2seq.sampler.ScheduledEmbeddingTrainingSampler.sample_ids_dtype": "tfa.seq2seq.TrainingSampler.sample_ids_dtype",
+ "tfa.seq2seq.sampler.ScheduledEmbeddingTrainingSampler.sample_ids_shape": "tfa.seq2seq.TrainingSampler.sample_ids_shape",
+ "tfa.seq2seq.sampler.ScheduledOutputTrainingSampler": "tfa.seq2seq.ScheduledOutputTrainingSampler",
+ "tfa.seq2seq.sampler.ScheduledOutputTrainingSampler.batch_size": "tfa.seq2seq.TrainingSampler.batch_size",
+ "tfa.seq2seq.sampler.ScheduledOutputTrainingSampler.sample_ids_dtype": "tfa.seq2seq.TrainingSampler.sample_ids_dtype",
+ "tfa.seq2seq.sampler.ScheduledOutputTrainingSampler.sample_ids_shape": "tfa.seq2seq.TrainingSampler.sample_ids_shape",
+ "tfa.seq2seq.sampler.TrainingSampler": "tfa.seq2seq.TrainingSampler",
+ "tfa.seq2seq.sampler.TrainingSampler.batch_size": "tfa.seq2seq.TrainingSampler.batch_size",
+ "tfa.seq2seq.sampler.TrainingSampler.sample_ids_dtype": "tfa.seq2seq.TrainingSampler.sample_ids_dtype",
+ "tfa.seq2seq.sampler.TrainingSampler.sample_ids_shape": "tfa.seq2seq.TrainingSampler.sample_ids_shape",
+ "tfa.seq2seq.sampler.absolute_import": "tfa.activations.absolute_import",
+ "tfa.seq2seq.sampler.division": "tfa.activations.division",
+ "tfa.seq2seq.sampler.print_function": "tfa.activations.print_function",
+ "tfa.text.absolute_import": "tfa.activations.absolute_import",
+ "tfa.text.division": "tfa.activations.division",
+ "tfa.text.print_function": "tfa.activations.print_function",
+ "tfa.text.skip_gram_ops.absolute_import": "tfa.activations.absolute_import",
+ "tfa.text.skip_gram_ops.division": "tfa.activations.division",
+ "tfa.text.skip_gram_ops.print_function": "tfa.activations.print_function",
+ "tfa.text.skip_gram_ops.skip_gram_sample": "tfa.text.skip_gram_sample",
+ "tfa.text.skip_gram_ops.skip_gram_sample_with_text_vocab": "tfa.text.skip_gram_sample_with_text_vocab"
+ },
+ "is_fragment": {
+ "tfa": false,
+ "tfa.activations": false,
+ "tfa.activations.absolute_import": true,
+ "tfa.activations.division": true,
+ "tfa.activations.print_function": true,
+ "tfa.activations.sparsemax": false,
+ "tfa.image": false,
+ "tfa.image.absolute_import": true,
+ "tfa.image.adjust_hsv_in_yiq": false,
+ "tfa.image.dense_image_warp": false,
+ "tfa.image.distance_transform": false,
+ "tfa.image.distance_transform.absolute_import": true,
+ "tfa.image.distance_transform.division": true,
+ "tfa.image.distance_transform.euclidean_dist_transform": false,
+ "tfa.image.distance_transform.print_function": true,
+ "tfa.image.distort_image_ops": false,
+ "tfa.image.distort_image_ops.absolute_import": true,
+ "tfa.image.distort_image_ops.adjust_hsv_in_yiq": false,
+ "tfa.image.distort_image_ops.division": true,
+ "tfa.image.distort_image_ops.print_function": true,
+ "tfa.image.distort_image_ops.random_hsv_in_yiq": false,
+ "tfa.image.division": true,
+ "tfa.image.euclidean_dist_transform": false,
+ "tfa.image.filters": false,
+ "tfa.image.filters.absolute_import": true,
+ "tfa.image.filters.division": true,
+ "tfa.image.filters.mean_filter2d": false,
+ "tfa.image.filters.median_filter2d": false,
+ "tfa.image.filters.print_function": true,
+ "tfa.image.interpolate_bilinear": false,
+ "tfa.image.mean_filter2d": false,
+ "tfa.image.median_filter2d": false,
+ "tfa.image.print_function": true,
+ "tfa.image.random_hsv_in_yiq": false,
+ "tfa.image.rotate": false,
+ "tfa.image.transform": false,
+ "tfa.image.transform_ops": false,
+ "tfa.image.transform_ops.absolute_import": true,
+ "tfa.image.transform_ops.angles_to_projective_transforms": false,
+ "tfa.image.transform_ops.compose_transforms": false,
+ "tfa.image.transform_ops.division": true,
+ "tfa.image.transform_ops.flat_transforms_to_matrices": false,
+ "tfa.image.transform_ops.matrices_to_flat_transforms": false,
+ "tfa.image.transform_ops.print_function": true,
+ "tfa.image.transform_ops.rotate": false,
+ "tfa.image.transform_ops.transform": false,
+ "tfa.layers": false,
+ "tfa.layers.GroupNormalization": false,
+ "tfa.layers.GroupNormalization.__call__": true,
+ "tfa.layers.GroupNormalization.__init__": true,
+ "tfa.layers.GroupNormalization.activity_regularizer": true,
+ "tfa.layers.GroupNormalization.add_loss": true,
+ "tfa.layers.GroupNormalization.add_metric": true,
+ "tfa.layers.GroupNormalization.add_update": true,
+ "tfa.layers.GroupNormalization.add_variable": true,
+ "tfa.layers.GroupNormalization.add_weight": true,
+ "tfa.layers.GroupNormalization.apply": true,
+ "tfa.layers.GroupNormalization.build": true,
+ "tfa.layers.GroupNormalization.call": true,
+ "tfa.layers.GroupNormalization.compute_mask": true,
+ "tfa.layers.GroupNormalization.compute_output_shape": true,
+ "tfa.layers.GroupNormalization.compute_output_signature": true,
+ "tfa.layers.GroupNormalization.count_params": true,
+ "tfa.layers.GroupNormalization.dtype": true,
+ "tfa.layers.GroupNormalization.dynamic": true,
+ "tfa.layers.GroupNormalization.from_config": true,
+ "tfa.layers.GroupNormalization.get_config": true,
+ "tfa.layers.GroupNormalization.get_input_at": true,
+ "tfa.layers.GroupNormalization.get_input_mask_at": true,
+ "tfa.layers.GroupNormalization.get_input_shape_at": true,
+ "tfa.layers.GroupNormalization.get_losses_for": true,
+ "tfa.layers.GroupNormalization.get_output_at": true,
+ "tfa.layers.GroupNormalization.get_output_mask_at": true,
+ "tfa.layers.GroupNormalization.get_output_shape_at": true,
+ "tfa.layers.GroupNormalization.get_updates_for": true,
+ "tfa.layers.GroupNormalization.get_weights": true,
+ "tfa.layers.GroupNormalization.input": true,
+ "tfa.layers.GroupNormalization.input_mask": true,
+ "tfa.layers.GroupNormalization.input_shape": true,
+ "tfa.layers.GroupNormalization.input_spec": true,
+ "tfa.layers.GroupNormalization.losses": true,
+ "tfa.layers.GroupNormalization.metrics": true,
+ "tfa.layers.GroupNormalization.name": true,
+ "tfa.layers.GroupNormalization.name_scope": true,
+ "tfa.layers.GroupNormalization.non_trainable_variables": true,
+ "tfa.layers.GroupNormalization.non_trainable_weights": true,
+ "tfa.layers.GroupNormalization.output": true,
+ "tfa.layers.GroupNormalization.output_mask": true,
+ "tfa.layers.GroupNormalization.output_shape": true,
+ "tfa.layers.GroupNormalization.set_weights": true,
+ "tfa.layers.GroupNormalization.submodules": true,
+ "tfa.layers.GroupNormalization.trainable": true,
+ "tfa.layers.GroupNormalization.trainable_variables": true,
+ "tfa.layers.GroupNormalization.trainable_weights": true,
+ "tfa.layers.GroupNormalization.updates": true,
+ "tfa.layers.GroupNormalization.variables": true,
+ "tfa.layers.GroupNormalization.weights": true,
+ "tfa.layers.GroupNormalization.with_name_scope": true,
+ "tfa.layers.InstanceNormalization": false,
+ "tfa.layers.InstanceNormalization.__call__": true,
+ "tfa.layers.InstanceNormalization.__init__": true,
+ "tfa.layers.InstanceNormalization.activity_regularizer": true,
+ "tfa.layers.InstanceNormalization.add_loss": true,
+ "tfa.layers.InstanceNormalization.add_metric": true,
+ "tfa.layers.InstanceNormalization.add_update": true,
+ "tfa.layers.InstanceNormalization.add_variable": true,
+ "tfa.layers.InstanceNormalization.add_weight": true,
+ "tfa.layers.InstanceNormalization.apply": true,
+ "tfa.layers.InstanceNormalization.build": true,
+ "tfa.layers.InstanceNormalization.call": true,
+ "tfa.layers.InstanceNormalization.compute_mask": true,
+ "tfa.layers.InstanceNormalization.compute_output_shape": true,
+ "tfa.layers.InstanceNormalization.compute_output_signature": true,
+ "tfa.layers.InstanceNormalization.count_params": true,
+ "tfa.layers.InstanceNormalization.dtype": true,
+ "tfa.layers.InstanceNormalization.dynamic": true,
+ "tfa.layers.InstanceNormalization.from_config": true,
+ "tfa.layers.InstanceNormalization.get_config": true,
+ "tfa.layers.InstanceNormalization.get_input_at": true,
+ "tfa.layers.InstanceNormalization.get_input_mask_at": true,
+ "tfa.layers.InstanceNormalization.get_input_shape_at": true,
+ "tfa.layers.InstanceNormalization.get_losses_for": true,
+ "tfa.layers.InstanceNormalization.get_output_at": true,
+ "tfa.layers.InstanceNormalization.get_output_mask_at": true,
+ "tfa.layers.InstanceNormalization.get_output_shape_at": true,
+ "tfa.layers.InstanceNormalization.get_updates_for": true,
+ "tfa.layers.InstanceNormalization.get_weights": true,
+ "tfa.layers.InstanceNormalization.input": true,
+ "tfa.layers.InstanceNormalization.input_mask": true,
+ "tfa.layers.InstanceNormalization.input_shape": true,
+ "tfa.layers.InstanceNormalization.input_spec": true,
+ "tfa.layers.InstanceNormalization.losses": true,
+ "tfa.layers.InstanceNormalization.metrics": true,
+ "tfa.layers.InstanceNormalization.name": true,
+ "tfa.layers.InstanceNormalization.name_scope": true,
+ "tfa.layers.InstanceNormalization.non_trainable_variables": true,
+ "tfa.layers.InstanceNormalization.non_trainable_weights": true,
+ "tfa.layers.InstanceNormalization.output": true,
+ "tfa.layers.InstanceNormalization.output_mask": true,
+ "tfa.layers.InstanceNormalization.output_shape": true,
+ "tfa.layers.InstanceNormalization.set_weights": true,
+ "tfa.layers.InstanceNormalization.submodules": true,
+ "tfa.layers.InstanceNormalization.trainable": true,
+ "tfa.layers.InstanceNormalization.trainable_variables": true,
+ "tfa.layers.InstanceNormalization.trainable_weights": true,
+ "tfa.layers.InstanceNormalization.updates": true,
+ "tfa.layers.InstanceNormalization.variables": true,
+ "tfa.layers.InstanceNormalization.weights": true,
+ "tfa.layers.InstanceNormalization.with_name_scope": true,
+ "tfa.layers.Maxout": false,
+ "tfa.layers.Maxout.__call__": true,
+ "tfa.layers.Maxout.__init__": true,
+ "tfa.layers.Maxout.activity_regularizer": true,
+ "tfa.layers.Maxout.add_loss": true,
+ "tfa.layers.Maxout.add_metric": true,
+ "tfa.layers.Maxout.add_update": true,
+ "tfa.layers.Maxout.add_variable": true,
+ "tfa.layers.Maxout.add_weight": true,
+ "tfa.layers.Maxout.apply": true,
+ "tfa.layers.Maxout.build": true,
+ "tfa.layers.Maxout.call": true,
+ "tfa.layers.Maxout.compute_mask": true,
+ "tfa.layers.Maxout.compute_output_shape": true,
+ "tfa.layers.Maxout.compute_output_signature": true,
+ "tfa.layers.Maxout.count_params": true,
+ "tfa.layers.Maxout.dtype": true,
+ "tfa.layers.Maxout.dynamic": true,
+ "tfa.layers.Maxout.from_config": true,
+ "tfa.layers.Maxout.get_config": true,
+ "tfa.layers.Maxout.get_input_at": true,
+ "tfa.layers.Maxout.get_input_mask_at": true,
+ "tfa.layers.Maxout.get_input_shape_at": true,
+ "tfa.layers.Maxout.get_losses_for": true,
+ "tfa.layers.Maxout.get_output_at": true,
+ "tfa.layers.Maxout.get_output_mask_at": true,
+ "tfa.layers.Maxout.get_output_shape_at": true,
+ "tfa.layers.Maxout.get_updates_for": true,
+ "tfa.layers.Maxout.get_weights": true,
+ "tfa.layers.Maxout.input": true,
+ "tfa.layers.Maxout.input_mask": true,
+ "tfa.layers.Maxout.input_shape": true,
+ "tfa.layers.Maxout.input_spec": true,
+ "tfa.layers.Maxout.losses": true,
+ "tfa.layers.Maxout.metrics": true,
+ "tfa.layers.Maxout.name": true,
+ "tfa.layers.Maxout.name_scope": true,
+ "tfa.layers.Maxout.non_trainable_variables": true,
+ "tfa.layers.Maxout.non_trainable_weights": true,
+ "tfa.layers.Maxout.output": true,
+ "tfa.layers.Maxout.output_mask": true,
+ "tfa.layers.Maxout.output_shape": true,
+ "tfa.layers.Maxout.set_weights": true,
+ "tfa.layers.Maxout.submodules": true,
+ "tfa.layers.Maxout.trainable": true,
+ "tfa.layers.Maxout.trainable_variables": true,
+ "tfa.layers.Maxout.trainable_weights": true,
+ "tfa.layers.Maxout.updates": true,
+ "tfa.layers.Maxout.variables": true,
+ "tfa.layers.Maxout.weights": true,
+ "tfa.layers.Maxout.with_name_scope": true,
+ "tfa.layers.PoincareNormalize": false,
+ "tfa.layers.PoincareNormalize.__call__": true,
+ "tfa.layers.PoincareNormalize.__init__": true,
+ "tfa.layers.PoincareNormalize.activity_regularizer": true,
+ "tfa.layers.PoincareNormalize.add_loss": true,
+ "tfa.layers.PoincareNormalize.add_metric": true,
+ "tfa.layers.PoincareNormalize.add_update": true,
+ "tfa.layers.PoincareNormalize.add_variable": true,
+ "tfa.layers.PoincareNormalize.add_weight": true,
+ "tfa.layers.PoincareNormalize.apply": true,
+ "tfa.layers.PoincareNormalize.build": true,
+ "tfa.layers.PoincareNormalize.call": true,
+ "tfa.layers.PoincareNormalize.compute_mask": true,
+ "tfa.layers.PoincareNormalize.compute_output_shape": true,
+ "tfa.layers.PoincareNormalize.compute_output_signature": true,
+ "tfa.layers.PoincareNormalize.count_params": true,
+ "tfa.layers.PoincareNormalize.dtype": true,
+ "tfa.layers.PoincareNormalize.dynamic": true,
+ "tfa.layers.PoincareNormalize.from_config": true,
+ "tfa.layers.PoincareNormalize.get_config": true,
+ "tfa.layers.PoincareNormalize.get_input_at": true,
+ "tfa.layers.PoincareNormalize.get_input_mask_at": true,
+ "tfa.layers.PoincareNormalize.get_input_shape_at": true,
+ "tfa.layers.PoincareNormalize.get_losses_for": true,
+ "tfa.layers.PoincareNormalize.get_output_at": true,
+ "tfa.layers.PoincareNormalize.get_output_mask_at": true,
+ "tfa.layers.PoincareNormalize.get_output_shape_at": true,
+ "tfa.layers.PoincareNormalize.get_updates_for": true,
+ "tfa.layers.PoincareNormalize.get_weights": true,
+ "tfa.layers.PoincareNormalize.input": true,
+ "tfa.layers.PoincareNormalize.input_mask": true,
+ "tfa.layers.PoincareNormalize.input_shape": true,
+ "tfa.layers.PoincareNormalize.input_spec": true,
+ "tfa.layers.PoincareNormalize.losses": true,
+ "tfa.layers.PoincareNormalize.metrics": true,
+ "tfa.layers.PoincareNormalize.name": true,
+ "tfa.layers.PoincareNormalize.name_scope": true,
+ "tfa.layers.PoincareNormalize.non_trainable_variables": true,
+ "tfa.layers.PoincareNormalize.non_trainable_weights": true,
+ "tfa.layers.PoincareNormalize.output": true,
+ "tfa.layers.PoincareNormalize.output_mask": true,
+ "tfa.layers.PoincareNormalize.output_shape": true,
+ "tfa.layers.PoincareNormalize.set_weights": true,
+ "tfa.layers.PoincareNormalize.submodules": true,
+ "tfa.layers.PoincareNormalize.trainable": true,
+ "tfa.layers.PoincareNormalize.trainable_variables": true,
+ "tfa.layers.PoincareNormalize.trainable_weights": true,
+ "tfa.layers.PoincareNormalize.updates": true,
+ "tfa.layers.PoincareNormalize.variables": true,
+ "tfa.layers.PoincareNormalize.weights": true,
+ "tfa.layers.PoincareNormalize.with_name_scope": true,
+ "tfa.layers.Sparsemax": false,
+ "tfa.layers.Sparsemax.__call__": true,
+ "tfa.layers.Sparsemax.__init__": true,
+ "tfa.layers.Sparsemax.activity_regularizer": true,
+ "tfa.layers.Sparsemax.add_loss": true,
+ "tfa.layers.Sparsemax.add_metric": true,
+ "tfa.layers.Sparsemax.add_update": true,
+ "tfa.layers.Sparsemax.add_variable": true,
+ "tfa.layers.Sparsemax.add_weight": true,
+ "tfa.layers.Sparsemax.apply": true,
+ "tfa.layers.Sparsemax.build": true,
+ "tfa.layers.Sparsemax.call": true,
+ "tfa.layers.Sparsemax.compute_mask": true,
+ "tfa.layers.Sparsemax.compute_output_shape": true,
+ "tfa.layers.Sparsemax.compute_output_signature": true,
+ "tfa.layers.Sparsemax.count_params": true,
+ "tfa.layers.Sparsemax.dtype": true,
+ "tfa.layers.Sparsemax.dynamic": true,
+ "tfa.layers.Sparsemax.from_config": true,
+ "tfa.layers.Sparsemax.get_config": true,
+ "tfa.layers.Sparsemax.get_input_at": true,
+ "tfa.layers.Sparsemax.get_input_mask_at": true,
+ "tfa.layers.Sparsemax.get_input_shape_at": true,
+ "tfa.layers.Sparsemax.get_losses_for": true,
+ "tfa.layers.Sparsemax.get_output_at": true,
+ "tfa.layers.Sparsemax.get_output_mask_at": true,
+ "tfa.layers.Sparsemax.get_output_shape_at": true,
+ "tfa.layers.Sparsemax.get_updates_for": true,
+ "tfa.layers.Sparsemax.get_weights": true,
+ "tfa.layers.Sparsemax.input": true,
+ "tfa.layers.Sparsemax.input_mask": true,
+ "tfa.layers.Sparsemax.input_shape": true,
+ "tfa.layers.Sparsemax.input_spec": true,
+ "tfa.layers.Sparsemax.losses": true,
+ "tfa.layers.Sparsemax.metrics": true,
+ "tfa.layers.Sparsemax.name": true,
+ "tfa.layers.Sparsemax.name_scope": true,
+ "tfa.layers.Sparsemax.non_trainable_variables": true,
+ "tfa.layers.Sparsemax.non_trainable_weights": true,
+ "tfa.layers.Sparsemax.output": true,
+ "tfa.layers.Sparsemax.output_mask": true,
+ "tfa.layers.Sparsemax.output_shape": true,
+ "tfa.layers.Sparsemax.set_weights": true,
+ "tfa.layers.Sparsemax.submodules": true,
+ "tfa.layers.Sparsemax.trainable": true,
+ "tfa.layers.Sparsemax.trainable_variables": true,
+ "tfa.layers.Sparsemax.trainable_weights": true,
+ "tfa.layers.Sparsemax.updates": true,
+ "tfa.layers.Sparsemax.variables": true,
+ "tfa.layers.Sparsemax.weights": true,
+ "tfa.layers.Sparsemax.with_name_scope": true,
+ "tfa.layers.WeightNormalization": false,
+ "tfa.layers.WeightNormalization.__call__": true,
+ "tfa.layers.WeightNormalization.__init__": true,
+ "tfa.layers.WeightNormalization.activity_regularizer": true,
+ "tfa.layers.WeightNormalization.add_loss": true,
+ "tfa.layers.WeightNormalization.add_metric": true,
+ "tfa.layers.WeightNormalization.add_update": true,
+ "tfa.layers.WeightNormalization.add_variable": true,
+ "tfa.layers.WeightNormalization.add_weight": true,
+ "tfa.layers.WeightNormalization.apply": true,
+ "tfa.layers.WeightNormalization.build": true,
+ "tfa.layers.WeightNormalization.call": true,
+ "tfa.layers.WeightNormalization.compute_mask": true,
+ "tfa.layers.WeightNormalization.compute_output_shape": true,
+ "tfa.layers.WeightNormalization.compute_output_signature": true,
+ "tfa.layers.WeightNormalization.count_params": true,
+ "tfa.layers.WeightNormalization.dtype": true,
+ "tfa.layers.WeightNormalization.dynamic": true,
+ "tfa.layers.WeightNormalization.from_config": true,
+ "tfa.layers.WeightNormalization.get_config": true,
+ "tfa.layers.WeightNormalization.get_input_at": true,
+ "tfa.layers.WeightNormalization.get_input_mask_at": true,
+ "tfa.layers.WeightNormalization.get_input_shape_at": true,
+ "tfa.layers.WeightNormalization.get_losses_for": true,
+ "tfa.layers.WeightNormalization.get_output_at": true,
+ "tfa.layers.WeightNormalization.get_output_mask_at": true,
+ "tfa.layers.WeightNormalization.get_output_shape_at": true,
+ "tfa.layers.WeightNormalization.get_updates_for": true,
+ "tfa.layers.WeightNormalization.get_weights": true,
+ "tfa.layers.WeightNormalization.input": true,
+ "tfa.layers.WeightNormalization.input_mask": true,
+ "tfa.layers.WeightNormalization.input_shape": true,
+ "tfa.layers.WeightNormalization.input_spec": true,
+ "tfa.layers.WeightNormalization.losses": true,
+ "tfa.layers.WeightNormalization.metrics": true,
+ "tfa.layers.WeightNormalization.name": true,
+ "tfa.layers.WeightNormalization.name_scope": true,
+ "tfa.layers.WeightNormalization.non_trainable_variables": true,
+ "tfa.layers.WeightNormalization.non_trainable_weights": true,
+ "tfa.layers.WeightNormalization.output": true,
+ "tfa.layers.WeightNormalization.output_mask": true,
+ "tfa.layers.WeightNormalization.output_shape": true,
+ "tfa.layers.WeightNormalization.set_weights": true,
+ "tfa.layers.WeightNormalization.submodules": true,
+ "tfa.layers.WeightNormalization.trainable": true,
+ "tfa.layers.WeightNormalization.trainable_variables": true,
+ "tfa.layers.WeightNormalization.trainable_weights": true,
+ "tfa.layers.WeightNormalization.updates": true,
+ "tfa.layers.WeightNormalization.variables": true,
+ "tfa.layers.WeightNormalization.weights": true,
+ "tfa.layers.WeightNormalization.with_name_scope": true,
+ "tfa.layers.absolute_import": true,
+ "tfa.layers.division": true,
+ "tfa.layers.maxout": false,
+ "tfa.layers.maxout.Maxout": false,
+ "tfa.layers.maxout.Maxout.__call__": true,
+ "tfa.layers.maxout.Maxout.__init__": true,
+ "tfa.layers.maxout.Maxout.activity_regularizer": true,
+ "tfa.layers.maxout.Maxout.add_loss": true,
+ "tfa.layers.maxout.Maxout.add_metric": true,
+ "tfa.layers.maxout.Maxout.add_update": true,
+ "tfa.layers.maxout.Maxout.add_variable": true,
+ "tfa.layers.maxout.Maxout.add_weight": true,
+ "tfa.layers.maxout.Maxout.apply": true,
+ "tfa.layers.maxout.Maxout.build": true,
+ "tfa.layers.maxout.Maxout.call": true,
+ "tfa.layers.maxout.Maxout.compute_mask": true,
+ "tfa.layers.maxout.Maxout.compute_output_shape": true,
+ "tfa.layers.maxout.Maxout.compute_output_signature": true,
+ "tfa.layers.maxout.Maxout.count_params": true,
+ "tfa.layers.maxout.Maxout.dtype": true,
+ "tfa.layers.maxout.Maxout.dynamic": true,
+ "tfa.layers.maxout.Maxout.from_config": true,
+ "tfa.layers.maxout.Maxout.get_config": true,
+ "tfa.layers.maxout.Maxout.get_input_at": true,
+ "tfa.layers.maxout.Maxout.get_input_mask_at": true,
+ "tfa.layers.maxout.Maxout.get_input_shape_at": true,
+ "tfa.layers.maxout.Maxout.get_losses_for": true,
+ "tfa.layers.maxout.Maxout.get_output_at": true,
+ "tfa.layers.maxout.Maxout.get_output_mask_at": true,
+ "tfa.layers.maxout.Maxout.get_output_shape_at": true,
+ "tfa.layers.maxout.Maxout.get_updates_for": true,
+ "tfa.layers.maxout.Maxout.get_weights": true,
+ "tfa.layers.maxout.Maxout.input": true,
+ "tfa.layers.maxout.Maxout.input_mask": true,
+ "tfa.layers.maxout.Maxout.input_shape": true,
+ "tfa.layers.maxout.Maxout.input_spec": true,
+ "tfa.layers.maxout.Maxout.losses": true,
+ "tfa.layers.maxout.Maxout.metrics": true,
+ "tfa.layers.maxout.Maxout.name": true,
+ "tfa.layers.maxout.Maxout.name_scope": true,
+ "tfa.layers.maxout.Maxout.non_trainable_variables": true,
+ "tfa.layers.maxout.Maxout.non_trainable_weights": true,
+ "tfa.layers.maxout.Maxout.output": true,
+ "tfa.layers.maxout.Maxout.output_mask": true,
+ "tfa.layers.maxout.Maxout.output_shape": true,
+ "tfa.layers.maxout.Maxout.set_weights": true,
+ "tfa.layers.maxout.Maxout.submodules": true,
+ "tfa.layers.maxout.Maxout.trainable": true,
+ "tfa.layers.maxout.Maxout.trainable_variables": true,
+ "tfa.layers.maxout.Maxout.trainable_weights": true,
+ "tfa.layers.maxout.Maxout.updates": true,
+ "tfa.layers.maxout.Maxout.variables": true,
+ "tfa.layers.maxout.Maxout.weights": true,
+ "tfa.layers.maxout.Maxout.with_name_scope": true,
+ "tfa.layers.maxout.absolute_import": true,
+ "tfa.layers.maxout.division": true,
+ "tfa.layers.maxout.print_function": true,
+ "tfa.layers.normalizations": false,
+ "tfa.layers.normalizations.GroupNormalization": false,
+ "tfa.layers.normalizations.GroupNormalization.__call__": true,
+ "tfa.layers.normalizations.GroupNormalization.__init__": true,
+ "tfa.layers.normalizations.GroupNormalization.activity_regularizer": true,
+ "tfa.layers.normalizations.GroupNormalization.add_loss": true,
+ "tfa.layers.normalizations.GroupNormalization.add_metric": true,
+ "tfa.layers.normalizations.GroupNormalization.add_update": true,
+ "tfa.layers.normalizations.GroupNormalization.add_variable": true,
+ "tfa.layers.normalizations.GroupNormalization.add_weight": true,
+ "tfa.layers.normalizations.GroupNormalization.apply": true,
+ "tfa.layers.normalizations.GroupNormalization.build": true,
+ "tfa.layers.normalizations.GroupNormalization.call": true,
+ "tfa.layers.normalizations.GroupNormalization.compute_mask": true,
+ "tfa.layers.normalizations.GroupNormalization.compute_output_shape": true,
+ "tfa.layers.normalizations.GroupNormalization.compute_output_signature": true,
+ "tfa.layers.normalizations.GroupNormalization.count_params": true,
+ "tfa.layers.normalizations.GroupNormalization.dtype": true,
+ "tfa.layers.normalizations.GroupNormalization.dynamic": true,
+ "tfa.layers.normalizations.GroupNormalization.from_config": true,
+ "tfa.layers.normalizations.GroupNormalization.get_config": true,
+ "tfa.layers.normalizations.GroupNormalization.get_input_at": true,
+ "tfa.layers.normalizations.GroupNormalization.get_input_mask_at": true,
+ "tfa.layers.normalizations.GroupNormalization.get_input_shape_at": true,
+ "tfa.layers.normalizations.GroupNormalization.get_losses_for": true,
+ "tfa.layers.normalizations.GroupNormalization.get_output_at": true,
+ "tfa.layers.normalizations.GroupNormalization.get_output_mask_at": true,
+ "tfa.layers.normalizations.GroupNormalization.get_output_shape_at": true,
+ "tfa.layers.normalizations.GroupNormalization.get_updates_for": true,
+ "tfa.layers.normalizations.GroupNormalization.get_weights": true,
+ "tfa.layers.normalizations.GroupNormalization.input": true,
+ "tfa.layers.normalizations.GroupNormalization.input_mask": true,
+ "tfa.layers.normalizations.GroupNormalization.input_shape": true,
+ "tfa.layers.normalizations.GroupNormalization.input_spec": true,
+ "tfa.layers.normalizations.GroupNormalization.losses": true,
+ "tfa.layers.normalizations.GroupNormalization.metrics": true,
+ "tfa.layers.normalizations.GroupNormalization.name": true,
+ "tfa.layers.normalizations.GroupNormalization.name_scope": true,
+ "tfa.layers.normalizations.GroupNormalization.non_trainable_variables": true,
+ "tfa.layers.normalizations.GroupNormalization.non_trainable_weights": true,
+ "tfa.layers.normalizations.GroupNormalization.output": true,
+ "tfa.layers.normalizations.GroupNormalization.output_mask": true,
+ "tfa.layers.normalizations.GroupNormalization.output_shape": true,
+ "tfa.layers.normalizations.GroupNormalization.set_weights": true,
+ "tfa.layers.normalizations.GroupNormalization.submodules": true,
+ "tfa.layers.normalizations.GroupNormalization.trainable": true,
+ "tfa.layers.normalizations.GroupNormalization.trainable_variables": true,
+ "tfa.layers.normalizations.GroupNormalization.trainable_weights": true,
+ "tfa.layers.normalizations.GroupNormalization.updates": true,
+ "tfa.layers.normalizations.GroupNormalization.variables": true,
+ "tfa.layers.normalizations.GroupNormalization.weights": true,
+ "tfa.layers.normalizations.GroupNormalization.with_name_scope": true,
+ "tfa.layers.normalizations.InstanceNormalization": false,
+ "tfa.layers.normalizations.InstanceNormalization.__call__": true,
+ "tfa.layers.normalizations.InstanceNormalization.__init__": true,
+ "tfa.layers.normalizations.InstanceNormalization.activity_regularizer": true,
+ "tfa.layers.normalizations.InstanceNormalization.add_loss": true,
+ "tfa.layers.normalizations.InstanceNormalization.add_metric": true,
+ "tfa.layers.normalizations.InstanceNormalization.add_update": true,
+ "tfa.layers.normalizations.InstanceNormalization.add_variable": true,
+ "tfa.layers.normalizations.InstanceNormalization.add_weight": true,
+ "tfa.layers.normalizations.InstanceNormalization.apply": true,
+ "tfa.layers.normalizations.InstanceNormalization.build": true,
+ "tfa.layers.normalizations.InstanceNormalization.call": true,
+ "tfa.layers.normalizations.InstanceNormalization.compute_mask": true,
+ "tfa.layers.normalizations.InstanceNormalization.compute_output_shape": true,
+ "tfa.layers.normalizations.InstanceNormalization.compute_output_signature": true,
+ "tfa.layers.normalizations.InstanceNormalization.count_params": true,
+ "tfa.layers.normalizations.InstanceNormalization.dtype": true,
+ "tfa.layers.normalizations.InstanceNormalization.dynamic": true,
+ "tfa.layers.normalizations.InstanceNormalization.from_config": true,
+ "tfa.layers.normalizations.InstanceNormalization.get_config": true,
+ "tfa.layers.normalizations.InstanceNormalization.get_input_at": true,
+ "tfa.layers.normalizations.InstanceNormalization.get_input_mask_at": true,
+ "tfa.layers.normalizations.InstanceNormalization.get_input_shape_at": true,
+ "tfa.layers.normalizations.InstanceNormalization.get_losses_for": true,
+ "tfa.layers.normalizations.InstanceNormalization.get_output_at": true,
+ "tfa.layers.normalizations.InstanceNormalization.get_output_mask_at": true,
+ "tfa.layers.normalizations.InstanceNormalization.get_output_shape_at": true,
+ "tfa.layers.normalizations.InstanceNormalization.get_updates_for": true,
+ "tfa.layers.normalizations.InstanceNormalization.get_weights": true,
+ "tfa.layers.normalizations.InstanceNormalization.input": true,
+ "tfa.layers.normalizations.InstanceNormalization.input_mask": true,
+ "tfa.layers.normalizations.InstanceNormalization.input_shape": true,
+ "tfa.layers.normalizations.InstanceNormalization.input_spec": true,
+ "tfa.layers.normalizations.InstanceNormalization.losses": true,
+ "tfa.layers.normalizations.InstanceNormalization.metrics": true,
+ "tfa.layers.normalizations.InstanceNormalization.name": true,
+ "tfa.layers.normalizations.InstanceNormalization.name_scope": true,
+ "tfa.layers.normalizations.InstanceNormalization.non_trainable_variables": true,
+ "tfa.layers.normalizations.InstanceNormalization.non_trainable_weights": true,
+ "tfa.layers.normalizations.InstanceNormalization.output": true,
+ "tfa.layers.normalizations.InstanceNormalization.output_mask": true,
+ "tfa.layers.normalizations.InstanceNormalization.output_shape": true,
+ "tfa.layers.normalizations.InstanceNormalization.set_weights": true,
+ "tfa.layers.normalizations.InstanceNormalization.submodules": true,
+ "tfa.layers.normalizations.InstanceNormalization.trainable": true,
+ "tfa.layers.normalizations.InstanceNormalization.trainable_variables": true,
+ "tfa.layers.normalizations.InstanceNormalization.trainable_weights": true,
+ "tfa.layers.normalizations.InstanceNormalization.updates": true,
+ "tfa.layers.normalizations.InstanceNormalization.variables": true,
+ "tfa.layers.normalizations.InstanceNormalization.weights": true,
+ "tfa.layers.normalizations.InstanceNormalization.with_name_scope": true,
+ "tfa.layers.normalizations.absolute_import": true,
+ "tfa.layers.normalizations.division": true,
+ "tfa.layers.normalizations.print_function": true,
+ "tfa.layers.poincare": false,
+ "tfa.layers.poincare.PoincareNormalize": false,
+ "tfa.layers.poincare.PoincareNormalize.__call__": true,
+ "tfa.layers.poincare.PoincareNormalize.__init__": true,
+ "tfa.layers.poincare.PoincareNormalize.activity_regularizer": true,
+ "tfa.layers.poincare.PoincareNormalize.add_loss": true,
+ "tfa.layers.poincare.PoincareNormalize.add_metric": true,
+ "tfa.layers.poincare.PoincareNormalize.add_update": true,
+ "tfa.layers.poincare.PoincareNormalize.add_variable": true,
+ "tfa.layers.poincare.PoincareNormalize.add_weight": true,
+ "tfa.layers.poincare.PoincareNormalize.apply": true,
+ "tfa.layers.poincare.PoincareNormalize.build": true,
+ "tfa.layers.poincare.PoincareNormalize.call": true,
+ "tfa.layers.poincare.PoincareNormalize.compute_mask": true,
+ "tfa.layers.poincare.PoincareNormalize.compute_output_shape": true,
+ "tfa.layers.poincare.PoincareNormalize.compute_output_signature": true,
+ "tfa.layers.poincare.PoincareNormalize.count_params": true,
+ "tfa.layers.poincare.PoincareNormalize.dtype": true,
+ "tfa.layers.poincare.PoincareNormalize.dynamic": true,
+ "tfa.layers.poincare.PoincareNormalize.from_config": true,
+ "tfa.layers.poincare.PoincareNormalize.get_config": true,
+ "tfa.layers.poincare.PoincareNormalize.get_input_at": true,
+ "tfa.layers.poincare.PoincareNormalize.get_input_mask_at": true,
+ "tfa.layers.poincare.PoincareNormalize.get_input_shape_at": true,
+ "tfa.layers.poincare.PoincareNormalize.get_losses_for": true,
+ "tfa.layers.poincare.PoincareNormalize.get_output_at": true,
+ "tfa.layers.poincare.PoincareNormalize.get_output_mask_at": true,
+ "tfa.layers.poincare.PoincareNormalize.get_output_shape_at": true,
+ "tfa.layers.poincare.PoincareNormalize.get_updates_for": true,
+ "tfa.layers.poincare.PoincareNormalize.get_weights": true,
+ "tfa.layers.poincare.PoincareNormalize.input": true,
+ "tfa.layers.poincare.PoincareNormalize.input_mask": true,
+ "tfa.layers.poincare.PoincareNormalize.input_shape": true,
+ "tfa.layers.poincare.PoincareNormalize.input_spec": true,
+ "tfa.layers.poincare.PoincareNormalize.losses": true,
+ "tfa.layers.poincare.PoincareNormalize.metrics": true,
+ "tfa.layers.poincare.PoincareNormalize.name": true,
+ "tfa.layers.poincare.PoincareNormalize.name_scope": true,
+ "tfa.layers.poincare.PoincareNormalize.non_trainable_variables": true,
+ "tfa.layers.poincare.PoincareNormalize.non_trainable_weights": true,
+ "tfa.layers.poincare.PoincareNormalize.output": true,
+ "tfa.layers.poincare.PoincareNormalize.output_mask": true,
+ "tfa.layers.poincare.PoincareNormalize.output_shape": true,
+ "tfa.layers.poincare.PoincareNormalize.set_weights": true,
+ "tfa.layers.poincare.PoincareNormalize.submodules": true,
+ "tfa.layers.poincare.PoincareNormalize.trainable": true,
+ "tfa.layers.poincare.PoincareNormalize.trainable_variables": true,
+ "tfa.layers.poincare.PoincareNormalize.trainable_weights": true,
+ "tfa.layers.poincare.PoincareNormalize.updates": true,
+ "tfa.layers.poincare.PoincareNormalize.variables": true,
+ "tfa.layers.poincare.PoincareNormalize.weights": true,
+ "tfa.layers.poincare.PoincareNormalize.with_name_scope": true,
+ "tfa.layers.poincare.absolute_import": true,
+ "tfa.layers.poincare.division": true,
+ "tfa.layers.poincare.print_function": true,
+ "tfa.layers.print_function": true,
+ "tfa.layers.sparsemax": false,
+ "tfa.layers.sparsemax.Sparsemax": false,
+ "tfa.layers.sparsemax.Sparsemax.__call__": true,
+ "tfa.layers.sparsemax.Sparsemax.__init__": true,
+ "tfa.layers.sparsemax.Sparsemax.activity_regularizer": true,
+ "tfa.layers.sparsemax.Sparsemax.add_loss": true,
+ "tfa.layers.sparsemax.Sparsemax.add_metric": true,
+ "tfa.layers.sparsemax.Sparsemax.add_update": true,
+ "tfa.layers.sparsemax.Sparsemax.add_variable": true,
+ "tfa.layers.sparsemax.Sparsemax.add_weight": true,
+ "tfa.layers.sparsemax.Sparsemax.apply": true,
+ "tfa.layers.sparsemax.Sparsemax.build": true,
+ "tfa.layers.sparsemax.Sparsemax.call": true,
+ "tfa.layers.sparsemax.Sparsemax.compute_mask": true,
+ "tfa.layers.sparsemax.Sparsemax.compute_output_shape": true,
+ "tfa.layers.sparsemax.Sparsemax.compute_output_signature": true,
+ "tfa.layers.sparsemax.Sparsemax.count_params": true,
+ "tfa.layers.sparsemax.Sparsemax.dtype": true,
+ "tfa.layers.sparsemax.Sparsemax.dynamic": true,
+ "tfa.layers.sparsemax.Sparsemax.from_config": true,
+ "tfa.layers.sparsemax.Sparsemax.get_config": true,
+ "tfa.layers.sparsemax.Sparsemax.get_input_at": true,
+ "tfa.layers.sparsemax.Sparsemax.get_input_mask_at": true,
+ "tfa.layers.sparsemax.Sparsemax.get_input_shape_at": true,
+ "tfa.layers.sparsemax.Sparsemax.get_losses_for": true,
+ "tfa.layers.sparsemax.Sparsemax.get_output_at": true,
+ "tfa.layers.sparsemax.Sparsemax.get_output_mask_at": true,
+ "tfa.layers.sparsemax.Sparsemax.get_output_shape_at": true,
+ "tfa.layers.sparsemax.Sparsemax.get_updates_for": true,
+ "tfa.layers.sparsemax.Sparsemax.get_weights": true,
+ "tfa.layers.sparsemax.Sparsemax.input": true,
+ "tfa.layers.sparsemax.Sparsemax.input_mask": true,
+ "tfa.layers.sparsemax.Sparsemax.input_shape": true,
+ "tfa.layers.sparsemax.Sparsemax.input_spec": true,
+ "tfa.layers.sparsemax.Sparsemax.losses": true,
+ "tfa.layers.sparsemax.Sparsemax.metrics": true,
+ "tfa.layers.sparsemax.Sparsemax.name": true,
+ "tfa.layers.sparsemax.Sparsemax.name_scope": true,
+ "tfa.layers.sparsemax.Sparsemax.non_trainable_variables": true,
+ "tfa.layers.sparsemax.Sparsemax.non_trainable_weights": true,
+ "tfa.layers.sparsemax.Sparsemax.output": true,
+ "tfa.layers.sparsemax.Sparsemax.output_mask": true,
+ "tfa.layers.sparsemax.Sparsemax.output_shape": true,
+ "tfa.layers.sparsemax.Sparsemax.set_weights": true,
+ "tfa.layers.sparsemax.Sparsemax.submodules": true,
+ "tfa.layers.sparsemax.Sparsemax.trainable": true,
+ "tfa.layers.sparsemax.Sparsemax.trainable_variables": true,
+ "tfa.layers.sparsemax.Sparsemax.trainable_weights": true,
+ "tfa.layers.sparsemax.Sparsemax.updates": true,
+ "tfa.layers.sparsemax.Sparsemax.variables": true,
+ "tfa.layers.sparsemax.Sparsemax.weights": true,
+ "tfa.layers.sparsemax.Sparsemax.with_name_scope": true,
+ "tfa.layers.sparsemax.absolute_import": true,
+ "tfa.layers.sparsemax.division": true,
+ "tfa.layers.sparsemax.print_function": true,
+ "tfa.layers.sparsemax.sparsemax": false,
+ "tfa.layers.wrappers": false,
+ "tfa.layers.wrappers.WeightNormalization": false,
+ "tfa.layers.wrappers.WeightNormalization.__call__": true,
+ "tfa.layers.wrappers.WeightNormalization.__init__": true,
+ "tfa.layers.wrappers.WeightNormalization.activity_regularizer": true,
+ "tfa.layers.wrappers.WeightNormalization.add_loss": true,
+ "tfa.layers.wrappers.WeightNormalization.add_metric": true,
+ "tfa.layers.wrappers.WeightNormalization.add_update": true,
+ "tfa.layers.wrappers.WeightNormalization.add_variable": true,
+ "tfa.layers.wrappers.WeightNormalization.add_weight": true,
+ "tfa.layers.wrappers.WeightNormalization.apply": true,
+ "tfa.layers.wrappers.WeightNormalization.build": true,
+ "tfa.layers.wrappers.WeightNormalization.call": true,
+ "tfa.layers.wrappers.WeightNormalization.compute_mask": true,
+ "tfa.layers.wrappers.WeightNormalization.compute_output_shape": true,
+ "tfa.layers.wrappers.WeightNormalization.compute_output_signature": true,
+ "tfa.layers.wrappers.WeightNormalization.count_params": true,
+ "tfa.layers.wrappers.WeightNormalization.dtype": true,
+ "tfa.layers.wrappers.WeightNormalization.dynamic": true,
+ "tfa.layers.wrappers.WeightNormalization.from_config": true,
+ "tfa.layers.wrappers.WeightNormalization.get_config": true,
+ "tfa.layers.wrappers.WeightNormalization.get_input_at": true,
+ "tfa.layers.wrappers.WeightNormalization.get_input_mask_at": true,
+ "tfa.layers.wrappers.WeightNormalization.get_input_shape_at": true,
+ "tfa.layers.wrappers.WeightNormalization.get_losses_for": true,
+ "tfa.layers.wrappers.WeightNormalization.get_output_at": true,
+ "tfa.layers.wrappers.WeightNormalization.get_output_mask_at": true,
+ "tfa.layers.wrappers.WeightNormalization.get_output_shape_at": true,
+ "tfa.layers.wrappers.WeightNormalization.get_updates_for": true,
+ "tfa.layers.wrappers.WeightNormalization.get_weights": true,
+ "tfa.layers.wrappers.WeightNormalization.input": true,
+ "tfa.layers.wrappers.WeightNormalization.input_mask": true,
+ "tfa.layers.wrappers.WeightNormalization.input_shape": true,
+ "tfa.layers.wrappers.WeightNormalization.input_spec": true,
+ "tfa.layers.wrappers.WeightNormalization.losses": true,
+ "tfa.layers.wrappers.WeightNormalization.metrics": true,
+ "tfa.layers.wrappers.WeightNormalization.name": true,
+ "tfa.layers.wrappers.WeightNormalization.name_scope": true,
+ "tfa.layers.wrappers.WeightNormalization.non_trainable_variables": true,
+ "tfa.layers.wrappers.WeightNormalization.non_trainable_weights": true,
+ "tfa.layers.wrappers.WeightNormalization.output": true,
+ "tfa.layers.wrappers.WeightNormalization.output_mask": true,
+ "tfa.layers.wrappers.WeightNormalization.output_shape": true,
+ "tfa.layers.wrappers.WeightNormalization.set_weights": true,
+ "tfa.layers.wrappers.WeightNormalization.submodules": true,
+ "tfa.layers.wrappers.WeightNormalization.trainable": true,
+ "tfa.layers.wrappers.WeightNormalization.trainable_variables": true,
+ "tfa.layers.wrappers.WeightNormalization.trainable_weights": true,
+ "tfa.layers.wrappers.WeightNormalization.updates": true,
+ "tfa.layers.wrappers.WeightNormalization.variables": true,
+ "tfa.layers.wrappers.WeightNormalization.weights": true,
+ "tfa.layers.wrappers.WeightNormalization.with_name_scope": true,
+ "tfa.layers.wrappers.absolute_import": true,
+ "tfa.layers.wrappers.division": true,
+ "tfa.layers.wrappers.print_function": true,
+ "tfa.losses": false,
+ "tfa.losses.ContrastiveLoss": false,
+ "tfa.losses.ContrastiveLoss.__call__": true,
+ "tfa.losses.ContrastiveLoss.__init__": true,
+ "tfa.losses.ContrastiveLoss.call": true,
+ "tfa.losses.ContrastiveLoss.from_config": true,
+ "tfa.losses.ContrastiveLoss.get_config": true,
+ "tfa.losses.LiftedStructLoss": false,
+ "tfa.losses.LiftedStructLoss.__call__": true,
+ "tfa.losses.LiftedStructLoss.__init__": true,
+ "tfa.losses.LiftedStructLoss.call": true,
+ "tfa.losses.LiftedStructLoss.from_config": true,
+ "tfa.losses.LiftedStructLoss.get_config": true,
+ "tfa.losses.SigmoidFocalCrossEntropy": false,
+ "tfa.losses.SigmoidFocalCrossEntropy.__call__": true,
+ "tfa.losses.SigmoidFocalCrossEntropy.__init__": true,
+ "tfa.losses.SigmoidFocalCrossEntropy.call": true,
+ "tfa.losses.SigmoidFocalCrossEntropy.from_config": true,
+ "tfa.losses.SigmoidFocalCrossEntropy.get_config": true,
+ "tfa.losses.SparsemaxLoss": false,
+ "tfa.losses.SparsemaxLoss.__call__": true,
+ "tfa.losses.SparsemaxLoss.__init__": true,
+ "tfa.losses.SparsemaxLoss.call": true,
+ "tfa.losses.SparsemaxLoss.from_config": true,
+ "tfa.losses.SparsemaxLoss.get_config": true,
+ "tfa.losses.TripletSemiHardLoss": false,
+ "tfa.losses.TripletSemiHardLoss.__call__": true,
+ "tfa.losses.TripletSemiHardLoss.__init__": true,
+ "tfa.losses.TripletSemiHardLoss.call": true,
+ "tfa.losses.TripletSemiHardLoss.from_config": true,
+ "tfa.losses.TripletSemiHardLoss.get_config": true,
+ "tfa.losses.absolute_import": true,
+ "tfa.losses.contrastive": false,
+ "tfa.losses.contrastive.ContrastiveLoss": false,
+ "tfa.losses.contrastive.ContrastiveLoss.__call__": true,
+ "tfa.losses.contrastive.ContrastiveLoss.__init__": true,
+ "tfa.losses.contrastive.ContrastiveLoss.call": true,
+ "tfa.losses.contrastive.ContrastiveLoss.from_config": true,
+ "tfa.losses.contrastive.ContrastiveLoss.get_config": true,
+ "tfa.losses.contrastive.absolute_import": true,
+ "tfa.losses.contrastive.contrastive_loss": false,
+ "tfa.losses.contrastive.division": true,
+ "tfa.losses.contrastive.print_function": true,
+ "tfa.losses.contrastive_loss": false,
+ "tfa.losses.division": true,
+ "tfa.losses.focal_loss": false,
+ "tfa.losses.focal_loss.SigmoidFocalCrossEntropy": false,
+ "tfa.losses.focal_loss.SigmoidFocalCrossEntropy.__call__": true,
+ "tfa.losses.focal_loss.SigmoidFocalCrossEntropy.__init__": true,
+ "tfa.losses.focal_loss.SigmoidFocalCrossEntropy.call": true,
+ "tfa.losses.focal_loss.SigmoidFocalCrossEntropy.from_config": true,
+ "tfa.losses.focal_loss.SigmoidFocalCrossEntropy.get_config": true,
+ "tfa.losses.focal_loss.absolute_import": true,
+ "tfa.losses.focal_loss.division": true,
+ "tfa.losses.focal_loss.print_function": true,
+ "tfa.losses.focal_loss.sigmoid_focal_crossentropy": false,
+ "tfa.losses.lifted": false,
+ "tfa.losses.lifted.LiftedStructLoss": false,
+ "tfa.losses.lifted.LiftedStructLoss.__call__": true,
+ "tfa.losses.lifted.LiftedStructLoss.__init__": true,
+ "tfa.losses.lifted.LiftedStructLoss.call": true,
+ "tfa.losses.lifted.LiftedStructLoss.from_config": true,
+ "tfa.losses.lifted.LiftedStructLoss.get_config": true,
+ "tfa.losses.lifted.absolute_import": true,
+ "tfa.losses.lifted.division": true,
+ "tfa.losses.lifted.lifted_struct_loss": false,
+ "tfa.losses.lifted.print_function": true,
+ "tfa.losses.lifted_struct_loss": false,
+ "tfa.losses.metric_learning": false,
+ "tfa.losses.metric_learning.absolute_import": true,
+ "tfa.losses.metric_learning.division": true,
+ "tfa.losses.metric_learning.pairwise_distance": false,
+ "tfa.losses.metric_learning.print_function": true,
+ "tfa.losses.print_function": true,
+ "tfa.losses.sigmoid_focal_crossentropy": false,
+ "tfa.losses.sparsemax_loss": false,
+ "tfa.losses.triplet": false,
+ "tfa.losses.triplet.TripletSemiHardLoss": false,
+ "tfa.losses.triplet.TripletSemiHardLoss.__call__": true,
+ "tfa.losses.triplet.TripletSemiHardLoss.__init__": true,
+ "tfa.losses.triplet.TripletSemiHardLoss.call": true,
+ "tfa.losses.triplet.TripletSemiHardLoss.from_config": true,
+ "tfa.losses.triplet.TripletSemiHardLoss.get_config": true,
+ "tfa.losses.triplet.absolute_import": true,
+ "tfa.losses.triplet.division": true,
+ "tfa.losses.triplet.print_function": true,
+ "tfa.losses.triplet.triplet_semihard_loss": false,
+ "tfa.losses.triplet_semihard_loss": false,
+ "tfa.metrics": false,
+ "tfa.metrics.CohenKappa": false,
+ "tfa.metrics.CohenKappa.__call__": true,
+ "tfa.metrics.CohenKappa.__init__": true,
+ "tfa.metrics.CohenKappa.activity_regularizer": true,
+ "tfa.metrics.CohenKappa.add_loss": true,
+ "tfa.metrics.CohenKappa.add_metric": true,
+ "tfa.metrics.CohenKappa.add_update": true,
+ "tfa.metrics.CohenKappa.add_variable": true,
+ "tfa.metrics.CohenKappa.add_weight": true,
+ "tfa.metrics.CohenKappa.apply": true,
+ "tfa.metrics.CohenKappa.build": true,
+ "tfa.metrics.CohenKappa.call": true,
+ "tfa.metrics.CohenKappa.compute_mask": true,
+ "tfa.metrics.CohenKappa.compute_output_shape": true,
+ "tfa.metrics.CohenKappa.compute_output_signature": true,
+ "tfa.metrics.CohenKappa.count_params": true,
+ "tfa.metrics.CohenKappa.dtype": true,
+ "tfa.metrics.CohenKappa.dynamic": true,
+ "tfa.metrics.CohenKappa.from_config": true,
+ "tfa.metrics.CohenKappa.get_config": true,
+ "tfa.metrics.CohenKappa.get_input_at": true,
+ "tfa.metrics.CohenKappa.get_input_mask_at": true,
+ "tfa.metrics.CohenKappa.get_input_shape_at": true,
+ "tfa.metrics.CohenKappa.get_losses_for": true,
+ "tfa.metrics.CohenKappa.get_output_at": true,
+ "tfa.metrics.CohenKappa.get_output_mask_at": true,
+ "tfa.metrics.CohenKappa.get_output_shape_at": true,
+ "tfa.metrics.CohenKappa.get_updates_for": true,
+ "tfa.metrics.CohenKappa.get_weights": true,
+ "tfa.metrics.CohenKappa.input": true,
+ "tfa.metrics.CohenKappa.input_mask": true,
+ "tfa.metrics.CohenKappa.input_shape": true,
+ "tfa.metrics.CohenKappa.input_spec": true,
+ "tfa.metrics.CohenKappa.losses": true,
+ "tfa.metrics.CohenKappa.metrics": true,
+ "tfa.metrics.CohenKappa.name": true,
+ "tfa.metrics.CohenKappa.name_scope": true,
+ "tfa.metrics.CohenKappa.non_trainable_variables": true,
+ "tfa.metrics.CohenKappa.non_trainable_weights": true,
+ "tfa.metrics.CohenKappa.output": true,
+ "tfa.metrics.CohenKappa.output_mask": true,
+ "tfa.metrics.CohenKappa.output_shape": true,
+ "tfa.metrics.CohenKappa.reset_states": true,
+ "tfa.metrics.CohenKappa.result": true,
+ "tfa.metrics.CohenKappa.set_weights": true,
+ "tfa.metrics.CohenKappa.submodules": true,
+ "tfa.metrics.CohenKappa.trainable": true,
+ "tfa.metrics.CohenKappa.trainable_variables": true,
+ "tfa.metrics.CohenKappa.trainable_weights": true,
+ "tfa.metrics.CohenKappa.update_state": true,
+ "tfa.metrics.CohenKappa.updates": true,
+ "tfa.metrics.CohenKappa.variables": true,
+ "tfa.metrics.CohenKappa.weights": true,
+ "tfa.metrics.CohenKappa.with_name_scope": true,
+ "tfa.metrics.absolute_import": true,
+ "tfa.metrics.cohens_kappa": false,
+ "tfa.metrics.cohens_kappa.CohenKappa": false,
+ "tfa.metrics.cohens_kappa.CohenKappa.__call__": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.__init__": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.activity_regularizer": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.add_loss": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.add_metric": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.add_update": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.add_variable": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.add_weight": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.apply": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.build": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.call": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.compute_mask": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.compute_output_shape": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.compute_output_signature": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.count_params": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.dtype": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.dynamic": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.from_config": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.get_config": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.get_input_at": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.get_input_mask_at": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.get_input_shape_at": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.get_losses_for": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.get_output_at": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.get_output_mask_at": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.get_output_shape_at": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.get_updates_for": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.get_weights": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.input": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.input_mask": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.input_shape": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.input_spec": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.losses": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.metrics": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.name": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.name_scope": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.non_trainable_variables": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.non_trainable_weights": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.output": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.output_mask": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.output_shape": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.reset_states": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.result": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.set_weights": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.submodules": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.trainable": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.trainable_variables": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.trainable_weights": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.update_state": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.updates": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.variables": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.weights": true,
+ "tfa.metrics.cohens_kappa.CohenKappa.with_name_scope": true,
+ "tfa.metrics.cohens_kappa.absolute_import": true,
+ "tfa.metrics.cohens_kappa.division": true,
+ "tfa.metrics.cohens_kappa.print_function": true,
+ "tfa.metrics.division": true,
+ "tfa.metrics.print_function": true,
+ "tfa.optimizers": false,
+ "tfa.optimizers.AdamW": false,
+ "tfa.optimizers.AdamW.__init__": true,
+ "tfa.optimizers.AdamW.add_slot": true,
+ "tfa.optimizers.AdamW.add_weight": true,
+ "tfa.optimizers.AdamW.apply_gradients": true,
+ "tfa.optimizers.AdamW.from_config": true,
+ "tfa.optimizers.AdamW.get_config": true,
+ "tfa.optimizers.AdamW.get_gradients": true,
+ "tfa.optimizers.AdamW.get_slot": true,
+ "tfa.optimizers.AdamW.get_slot_names": true,
+ "tfa.optimizers.AdamW.get_updates": true,
+ "tfa.optimizers.AdamW.get_weights": true,
+ "tfa.optimizers.AdamW.iterations": true,
+ "tfa.optimizers.AdamW.minimize": true,
+ "tfa.optimizers.AdamW.set_weights": true,
+ "tfa.optimizers.AdamW.variables": true,
+ "tfa.optimizers.AdamW.weights": true,
+ "tfa.optimizers.LazyAdam": false,
+ "tfa.optimizers.LazyAdam.__init__": true,
+ "tfa.optimizers.LazyAdam.add_slot": true,
+ "tfa.optimizers.LazyAdam.add_weight": true,
+ "tfa.optimizers.LazyAdam.apply_gradients": true,
+ "tfa.optimizers.LazyAdam.from_config": true,
+ "tfa.optimizers.LazyAdam.get_config": true,
+ "tfa.optimizers.LazyAdam.get_gradients": true,
+ "tfa.optimizers.LazyAdam.get_slot": true,
+ "tfa.optimizers.LazyAdam.get_slot_names": true,
+ "tfa.optimizers.LazyAdam.get_updates": true,
+ "tfa.optimizers.LazyAdam.get_weights": true,
+ "tfa.optimizers.LazyAdam.iterations": true,
+ "tfa.optimizers.LazyAdam.minimize": true,
+ "tfa.optimizers.LazyAdam.set_weights": true,
+ "tfa.optimizers.LazyAdam.variables": true,
+ "tfa.optimizers.LazyAdam.weights": true,
+ "tfa.optimizers.MovingAverage": false,
+ "tfa.optimizers.MovingAverage.__init__": true,
+ "tfa.optimizers.MovingAverage.add_slot": true,
+ "tfa.optimizers.MovingAverage.add_weight": true,
+ "tfa.optimizers.MovingAverage.apply_gradients": true,
+ "tfa.optimizers.MovingAverage.assign_average_vars": true,
+ "tfa.optimizers.MovingAverage.from_config": true,
+ "tfa.optimizers.MovingAverage.get_config": true,
+ "tfa.optimizers.MovingAverage.get_gradients": true,
+ "tfa.optimizers.MovingAverage.get_slot": true,
+ "tfa.optimizers.MovingAverage.get_slot_names": true,
+ "tfa.optimizers.MovingAverage.get_updates": true,
+ "tfa.optimizers.MovingAverage.get_weights": true,
+ "tfa.optimizers.MovingAverage.iterations": true,
+ "tfa.optimizers.MovingAverage.minimize": true,
+ "tfa.optimizers.MovingAverage.set_weights": true,
+ "tfa.optimizers.MovingAverage.variables": true,
+ "tfa.optimizers.MovingAverage.weights": true,
+ "tfa.optimizers.SGDW": false,
+ "tfa.optimizers.SGDW.__init__": true,
+ "tfa.optimizers.SGDW.add_slot": true,
+ "tfa.optimizers.SGDW.add_weight": true,
+ "tfa.optimizers.SGDW.apply_gradients": true,
+ "tfa.optimizers.SGDW.from_config": true,
+ "tfa.optimizers.SGDW.get_config": true,
+ "tfa.optimizers.SGDW.get_gradients": true,
+ "tfa.optimizers.SGDW.get_slot": true,
+ "tfa.optimizers.SGDW.get_slot_names": true,
+ "tfa.optimizers.SGDW.get_updates": true,
+ "tfa.optimizers.SGDW.get_weights": true,
+ "tfa.optimizers.SGDW.iterations": true,
+ "tfa.optimizers.SGDW.minimize": true,
+ "tfa.optimizers.SGDW.set_weights": true,
+ "tfa.optimizers.SGDW.variables": true,
+ "tfa.optimizers.SGDW.weights": true,
+ "tfa.optimizers.absolute_import": true,
+ "tfa.optimizers.division": true,
+ "tfa.optimizers.extend_with_decoupled_weight_decay": false,
+ "tfa.optimizers.lazy_adam": false,
+ "tfa.optimizers.lazy_adam.LazyAdam": false,
+ "tfa.optimizers.lazy_adam.LazyAdam.__init__": true,
+ "tfa.optimizers.lazy_adam.LazyAdam.add_slot": true,
+ "tfa.optimizers.lazy_adam.LazyAdam.add_weight": true,
+ "tfa.optimizers.lazy_adam.LazyAdam.apply_gradients": true,
+ "tfa.optimizers.lazy_adam.LazyAdam.from_config": true,
+ "tfa.optimizers.lazy_adam.LazyAdam.get_config": true,
+ "tfa.optimizers.lazy_adam.LazyAdam.get_gradients": true,
+ "tfa.optimizers.lazy_adam.LazyAdam.get_slot": true,
+ "tfa.optimizers.lazy_adam.LazyAdam.get_slot_names": true,
+ "tfa.optimizers.lazy_adam.LazyAdam.get_updates": true,
+ "tfa.optimizers.lazy_adam.LazyAdam.get_weights": true,
+ "tfa.optimizers.lazy_adam.LazyAdam.iterations": true,
+ "tfa.optimizers.lazy_adam.LazyAdam.minimize": true,
+ "tfa.optimizers.lazy_adam.LazyAdam.set_weights": true,
+ "tfa.optimizers.lazy_adam.LazyAdam.variables": true,
+ "tfa.optimizers.lazy_adam.LazyAdam.weights": true,
+ "tfa.optimizers.lazy_adam.absolute_import": true,
+ "tfa.optimizers.lazy_adam.division": true,
+ "tfa.optimizers.lazy_adam.print_function": true,
+ "tfa.optimizers.moving_average": false,
+ "tfa.optimizers.moving_average.MovingAverage": false,
+ "tfa.optimizers.moving_average.MovingAverage.__init__": true,
+ "tfa.optimizers.moving_average.MovingAverage.add_slot": true,
+ "tfa.optimizers.moving_average.MovingAverage.add_weight": true,
+ "tfa.optimizers.moving_average.MovingAverage.apply_gradients": true,
+ "tfa.optimizers.moving_average.MovingAverage.assign_average_vars": true,
+ "tfa.optimizers.moving_average.MovingAverage.from_config": true,
+ "tfa.optimizers.moving_average.MovingAverage.get_config": true,
+ "tfa.optimizers.moving_average.MovingAverage.get_gradients": true,
+ "tfa.optimizers.moving_average.MovingAverage.get_slot": true,
+ "tfa.optimizers.moving_average.MovingAverage.get_slot_names": true,
+ "tfa.optimizers.moving_average.MovingAverage.get_updates": true,
+ "tfa.optimizers.moving_average.MovingAverage.get_weights": true,
+ "tfa.optimizers.moving_average.MovingAverage.iterations": true,
+ "tfa.optimizers.moving_average.MovingAverage.minimize": true,
+ "tfa.optimizers.moving_average.MovingAverage.set_weights": true,
+ "tfa.optimizers.moving_average.MovingAverage.variables": true,
+ "tfa.optimizers.moving_average.MovingAverage.weights": true,
+ "tfa.optimizers.moving_average.absolute_import": true,
+ "tfa.optimizers.moving_average.division": true,
+ "tfa.optimizers.moving_average.print_function": true,
+ "tfa.optimizers.print_function": true,
+ "tfa.optimizers.weight_decay_optimizers": false,
+ "tfa.optimizers.weight_decay_optimizers.AdamW": false,
+ "tfa.optimizers.weight_decay_optimizers.AdamW.__init__": true,
+ "tfa.optimizers.weight_decay_optimizers.AdamW.add_slot": true,
+ "tfa.optimizers.weight_decay_optimizers.AdamW.add_weight": true,
+ "tfa.optimizers.weight_decay_optimizers.AdamW.apply_gradients": true,
+ "tfa.optimizers.weight_decay_optimizers.AdamW.from_config": true,
+ "tfa.optimizers.weight_decay_optimizers.AdamW.get_config": true,
+ "tfa.optimizers.weight_decay_optimizers.AdamW.get_gradients": true,
+ "tfa.optimizers.weight_decay_optimizers.AdamW.get_slot": true,
+ "tfa.optimizers.weight_decay_optimizers.AdamW.get_slot_names": true,
+ "tfa.optimizers.weight_decay_optimizers.AdamW.get_updates": true,
+ "tfa.optimizers.weight_decay_optimizers.AdamW.get_weights": true,
+ "tfa.optimizers.weight_decay_optimizers.AdamW.iterations": true,
+ "tfa.optimizers.weight_decay_optimizers.AdamW.minimize": true,
+ "tfa.optimizers.weight_decay_optimizers.AdamW.set_weights": true,
+ "tfa.optimizers.weight_decay_optimizers.AdamW.variables": true,
+ "tfa.optimizers.weight_decay_optimizers.AdamW.weights": true,
+ "tfa.optimizers.weight_decay_optimizers.DecoupledWeightDecayExtension": false,
+ "tfa.optimizers.weight_decay_optimizers.DecoupledWeightDecayExtension.__init__": true,
+ "tfa.optimizers.weight_decay_optimizers.DecoupledWeightDecayExtension.apply_gradients": true,
+ "tfa.optimizers.weight_decay_optimizers.DecoupledWeightDecayExtension.get_config": true,
+ "tfa.optimizers.weight_decay_optimizers.DecoupledWeightDecayExtension.minimize": true,
+ "tfa.optimizers.weight_decay_optimizers.SGDW": false,
+ "tfa.optimizers.weight_decay_optimizers.SGDW.__init__": true,
+ "tfa.optimizers.weight_decay_optimizers.SGDW.add_slot": true,
+ "tfa.optimizers.weight_decay_optimizers.SGDW.add_weight": true,
+ "tfa.optimizers.weight_decay_optimizers.SGDW.apply_gradients": true,
+ "tfa.optimizers.weight_decay_optimizers.SGDW.from_config": true,
+ "tfa.optimizers.weight_decay_optimizers.SGDW.get_config": true,
+ "tfa.optimizers.weight_decay_optimizers.SGDW.get_gradients": true,
+ "tfa.optimizers.weight_decay_optimizers.SGDW.get_slot": true,
+ "tfa.optimizers.weight_decay_optimizers.SGDW.get_slot_names": true,
+ "tfa.optimizers.weight_decay_optimizers.SGDW.get_updates": true,
+ "tfa.optimizers.weight_decay_optimizers.SGDW.get_weights": true,
+ "tfa.optimizers.weight_decay_optimizers.SGDW.iterations": true,
+ "tfa.optimizers.weight_decay_optimizers.SGDW.minimize": true,
+ "tfa.optimizers.weight_decay_optimizers.SGDW.set_weights": true,
+ "tfa.optimizers.weight_decay_optimizers.SGDW.variables": true,
+ "tfa.optimizers.weight_decay_optimizers.SGDW.weights": true,
+ "tfa.optimizers.weight_decay_optimizers.absolute_import": true,
+ "tfa.optimizers.weight_decay_optimizers.division": true,
+ "tfa.optimizers.weight_decay_optimizers.extend_with_decoupled_weight_decay": false,
+ "tfa.optimizers.weight_decay_optimizers.print_function": true,
+ "tfa.rnn": false,
+ "tfa.rnn.LayerNormLSTMCell": false,
+ "tfa.rnn.LayerNormLSTMCell.__call__": true,
+ "tfa.rnn.LayerNormLSTMCell.__init__": true,
+ "tfa.rnn.LayerNormLSTMCell.activity_regularizer": true,
+ "tfa.rnn.LayerNormLSTMCell.add_loss": true,
+ "tfa.rnn.LayerNormLSTMCell.add_metric": true,
+ "tfa.rnn.LayerNormLSTMCell.add_update": true,
+ "tfa.rnn.LayerNormLSTMCell.add_variable": true,
+ "tfa.rnn.LayerNormLSTMCell.add_weight": true,
+ "tfa.rnn.LayerNormLSTMCell.apply": true,
+ "tfa.rnn.LayerNormLSTMCell.build": true,
+ "tfa.rnn.LayerNormLSTMCell.call": true,
+ "tfa.rnn.LayerNormLSTMCell.compute_mask": true,
+ "tfa.rnn.LayerNormLSTMCell.compute_output_shape": true,
+ "tfa.rnn.LayerNormLSTMCell.compute_output_signature": true,
+ "tfa.rnn.LayerNormLSTMCell.count_params": true,
+ "tfa.rnn.LayerNormLSTMCell.dtype": true,
+ "tfa.rnn.LayerNormLSTMCell.dynamic": true,
+ "tfa.rnn.LayerNormLSTMCell.from_config": true,
+ "tfa.rnn.LayerNormLSTMCell.get_config": true,
+ "tfa.rnn.LayerNormLSTMCell.get_dropout_mask_for_cell": true,
+ "tfa.rnn.LayerNormLSTMCell.get_initial_state": true,
+ "tfa.rnn.LayerNormLSTMCell.get_input_at": true,
+ "tfa.rnn.LayerNormLSTMCell.get_input_mask_at": true,
+ "tfa.rnn.LayerNormLSTMCell.get_input_shape_at": true,
+ "tfa.rnn.LayerNormLSTMCell.get_losses_for": true,
+ "tfa.rnn.LayerNormLSTMCell.get_output_at": true,
+ "tfa.rnn.LayerNormLSTMCell.get_output_mask_at": true,
+ "tfa.rnn.LayerNormLSTMCell.get_output_shape_at": true,
+ "tfa.rnn.LayerNormLSTMCell.get_recurrent_dropout_mask_for_cell": true,
+ "tfa.rnn.LayerNormLSTMCell.get_updates_for": true,
+ "tfa.rnn.LayerNormLSTMCell.get_weights": true,
+ "tfa.rnn.LayerNormLSTMCell.input": true,
+ "tfa.rnn.LayerNormLSTMCell.input_mask": true,
+ "tfa.rnn.LayerNormLSTMCell.input_shape": true,
+ "tfa.rnn.LayerNormLSTMCell.input_spec": true,
+ "tfa.rnn.LayerNormLSTMCell.losses": true,
+ "tfa.rnn.LayerNormLSTMCell.metrics": true,
+ "tfa.rnn.LayerNormLSTMCell.name": true,
+ "tfa.rnn.LayerNormLSTMCell.name_scope": true,
+ "tfa.rnn.LayerNormLSTMCell.non_trainable_variables": true,
+ "tfa.rnn.LayerNormLSTMCell.non_trainable_weights": true,
+ "tfa.rnn.LayerNormLSTMCell.output": true,
+ "tfa.rnn.LayerNormLSTMCell.output_mask": true,
+ "tfa.rnn.LayerNormLSTMCell.output_shape": true,
+ "tfa.rnn.LayerNormLSTMCell.reset_dropout_mask": true,
+ "tfa.rnn.LayerNormLSTMCell.reset_recurrent_dropout_mask": true,
+ "tfa.rnn.LayerNormLSTMCell.set_weights": true,
+ "tfa.rnn.LayerNormLSTMCell.submodules": true,
+ "tfa.rnn.LayerNormLSTMCell.trainable": true,
+ "tfa.rnn.LayerNormLSTMCell.trainable_variables": true,
+ "tfa.rnn.LayerNormLSTMCell.trainable_weights": true,
+ "tfa.rnn.LayerNormLSTMCell.updates": true,
+ "tfa.rnn.LayerNormLSTMCell.variables": true,
+ "tfa.rnn.LayerNormLSTMCell.weights": true,
+ "tfa.rnn.LayerNormLSTMCell.with_name_scope": true,
+ "tfa.rnn.NASCell": false,
+ "tfa.rnn.NASCell.__call__": true,
+ "tfa.rnn.NASCell.__init__": true,
+ "tfa.rnn.NASCell.activity_regularizer": true,
+ "tfa.rnn.NASCell.add_loss": true,
+ "tfa.rnn.NASCell.add_metric": true,
+ "tfa.rnn.NASCell.add_update": true,
+ "tfa.rnn.NASCell.add_variable": true,
+ "tfa.rnn.NASCell.add_weight": true,
+ "tfa.rnn.NASCell.apply": true,
+ "tfa.rnn.NASCell.build": true,
+ "tfa.rnn.NASCell.call": true,
+ "tfa.rnn.NASCell.compute_mask": true,
+ "tfa.rnn.NASCell.compute_output_shape": true,
+ "tfa.rnn.NASCell.compute_output_signature": true,
+ "tfa.rnn.NASCell.count_params": true,
+ "tfa.rnn.NASCell.dtype": true,
+ "tfa.rnn.NASCell.dynamic": true,
+ "tfa.rnn.NASCell.from_config": true,
+ "tfa.rnn.NASCell.get_config": true,
+ "tfa.rnn.NASCell.get_initial_state": true,
+ "tfa.rnn.NASCell.get_input_at": true,
+ "tfa.rnn.NASCell.get_input_mask_at": true,
+ "tfa.rnn.NASCell.get_input_shape_at": true,
+ "tfa.rnn.NASCell.get_losses_for": true,
+ "tfa.rnn.NASCell.get_output_at": true,
+ "tfa.rnn.NASCell.get_output_mask_at": true,
+ "tfa.rnn.NASCell.get_output_shape_at": true,
+ "tfa.rnn.NASCell.get_updates_for": true,
+ "tfa.rnn.NASCell.get_weights": true,
+ "tfa.rnn.NASCell.input": true,
+ "tfa.rnn.NASCell.input_mask": true,
+ "tfa.rnn.NASCell.input_shape": true,
+ "tfa.rnn.NASCell.input_spec": true,
+ "tfa.rnn.NASCell.losses": true,
+ "tfa.rnn.NASCell.metrics": true,
+ "tfa.rnn.NASCell.name": true,
+ "tfa.rnn.NASCell.name_scope": true,
+ "tfa.rnn.NASCell.non_trainable_variables": true,
+ "tfa.rnn.NASCell.non_trainable_weights": true,
+ "tfa.rnn.NASCell.output": true,
+ "tfa.rnn.NASCell.output_mask": true,
+ "tfa.rnn.NASCell.output_shape": true,
+ "tfa.rnn.NASCell.output_size": true,
+ "tfa.rnn.NASCell.set_weights": true,
+ "tfa.rnn.NASCell.state_size": true,
+ "tfa.rnn.NASCell.submodules": true,
+ "tfa.rnn.NASCell.trainable": true,
+ "tfa.rnn.NASCell.trainable_variables": true,
+ "tfa.rnn.NASCell.trainable_weights": true,
+ "tfa.rnn.NASCell.updates": true,
+ "tfa.rnn.NASCell.variables": true,
+ "tfa.rnn.NASCell.weights": true,
+ "tfa.rnn.NASCell.with_name_scope": true,
+ "tfa.rnn.absolute_import": true,
+ "tfa.rnn.cell": false,
+ "tfa.rnn.cell.LayerNormLSTMCell": false,
+ "tfa.rnn.cell.LayerNormLSTMCell.__call__": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.__init__": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.activity_regularizer": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.add_loss": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.add_metric": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.add_update": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.add_variable": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.add_weight": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.apply": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.build": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.call": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.compute_mask": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.compute_output_shape": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.compute_output_signature": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.count_params": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.dtype": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.dynamic": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.from_config": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.get_config": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.get_dropout_mask_for_cell": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.get_initial_state": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.get_input_at": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.get_input_mask_at": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.get_input_shape_at": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.get_losses_for": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.get_output_at": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.get_output_mask_at": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.get_output_shape_at": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.get_recurrent_dropout_mask_for_cell": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.get_updates_for": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.get_weights": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.input": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.input_mask": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.input_shape": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.input_spec": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.losses": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.metrics": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.name": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.name_scope": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.non_trainable_variables": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.non_trainable_weights": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.output": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.output_mask": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.output_shape": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.reset_dropout_mask": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.reset_recurrent_dropout_mask": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.set_weights": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.submodules": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.trainable": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.trainable_variables": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.trainable_weights": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.updates": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.variables": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.weights": true,
+ "tfa.rnn.cell.LayerNormLSTMCell.with_name_scope": true,
+ "tfa.rnn.cell.NASCell": false,
+ "tfa.rnn.cell.NASCell.__call__": true,
+ "tfa.rnn.cell.NASCell.__init__": true,
+ "tfa.rnn.cell.NASCell.activity_regularizer": true,
+ "tfa.rnn.cell.NASCell.add_loss": true,
+ "tfa.rnn.cell.NASCell.add_metric": true,
+ "tfa.rnn.cell.NASCell.add_update": true,
+ "tfa.rnn.cell.NASCell.add_variable": true,
+ "tfa.rnn.cell.NASCell.add_weight": true,
+ "tfa.rnn.cell.NASCell.apply": true,
+ "tfa.rnn.cell.NASCell.build": true,
+ "tfa.rnn.cell.NASCell.call": true,
+ "tfa.rnn.cell.NASCell.compute_mask": true,
+ "tfa.rnn.cell.NASCell.compute_output_shape": true,
+ "tfa.rnn.cell.NASCell.compute_output_signature": true,
+ "tfa.rnn.cell.NASCell.count_params": true,
+ "tfa.rnn.cell.NASCell.dtype": true,
+ "tfa.rnn.cell.NASCell.dynamic": true,
+ "tfa.rnn.cell.NASCell.from_config": true,
+ "tfa.rnn.cell.NASCell.get_config": true,
+ "tfa.rnn.cell.NASCell.get_initial_state": true,
+ "tfa.rnn.cell.NASCell.get_input_at": true,
+ "tfa.rnn.cell.NASCell.get_input_mask_at": true,
+ "tfa.rnn.cell.NASCell.get_input_shape_at": true,
+ "tfa.rnn.cell.NASCell.get_losses_for": true,
+ "tfa.rnn.cell.NASCell.get_output_at": true,
+ "tfa.rnn.cell.NASCell.get_output_mask_at": true,
+ "tfa.rnn.cell.NASCell.get_output_shape_at": true,
+ "tfa.rnn.cell.NASCell.get_updates_for": true,
+ "tfa.rnn.cell.NASCell.get_weights": true,
+ "tfa.rnn.cell.NASCell.input": true,
+ "tfa.rnn.cell.NASCell.input_mask": true,
+ "tfa.rnn.cell.NASCell.input_shape": true,
+ "tfa.rnn.cell.NASCell.input_spec": true,
+ "tfa.rnn.cell.NASCell.losses": true,
+ "tfa.rnn.cell.NASCell.metrics": true,
+ "tfa.rnn.cell.NASCell.name": true,
+ "tfa.rnn.cell.NASCell.name_scope": true,
+ "tfa.rnn.cell.NASCell.non_trainable_variables": true,
+ "tfa.rnn.cell.NASCell.non_trainable_weights": true,
+ "tfa.rnn.cell.NASCell.output": true,
+ "tfa.rnn.cell.NASCell.output_mask": true,
+ "tfa.rnn.cell.NASCell.output_shape": true,
+ "tfa.rnn.cell.NASCell.output_size": true,
+ "tfa.rnn.cell.NASCell.set_weights": true,
+ "tfa.rnn.cell.NASCell.state_size": true,
+ "tfa.rnn.cell.NASCell.submodules": true,
+ "tfa.rnn.cell.NASCell.trainable": true,
+ "tfa.rnn.cell.NASCell.trainable_variables": true,
+ "tfa.rnn.cell.NASCell.trainable_weights": true,
+ "tfa.rnn.cell.NASCell.updates": true,
+ "tfa.rnn.cell.NASCell.variables": true,
+ "tfa.rnn.cell.NASCell.weights": true,
+ "tfa.rnn.cell.NASCell.with_name_scope": true,
+ "tfa.rnn.cell.absolute_import": true,
+ "tfa.rnn.cell.division": true,
+ "tfa.rnn.cell.print_function": true,
+ "tfa.rnn.division": true,
+ "tfa.rnn.print_function": true,
+ "tfa.seq2seq": false,
+ "tfa.seq2seq.AttentionMechanism": false,
+ "tfa.seq2seq.AttentionMechanism.__init__": true,
+ "tfa.seq2seq.AttentionMechanism.alignments_size": true,
+ "tfa.seq2seq.AttentionMechanism.state_size": true,
+ "tfa.seq2seq.AttentionWrapper": false,
+ "tfa.seq2seq.AttentionWrapper.__call__": true,
+ "tfa.seq2seq.AttentionWrapper.__init__": true,
+ "tfa.seq2seq.AttentionWrapper.activity_regularizer": true,
+ "tfa.seq2seq.AttentionWrapper.add_loss": true,
+ "tfa.seq2seq.AttentionWrapper.add_metric": true,
+ "tfa.seq2seq.AttentionWrapper.add_update": true,
+ "tfa.seq2seq.AttentionWrapper.add_variable": true,
+ "tfa.seq2seq.AttentionWrapper.add_weight": true,
+ "tfa.seq2seq.AttentionWrapper.apply": true,
+ "tfa.seq2seq.AttentionWrapper.build": true,
+ "tfa.seq2seq.AttentionWrapper.call": true,
+ "tfa.seq2seq.AttentionWrapper.compute_mask": true,
+ "tfa.seq2seq.AttentionWrapper.compute_output_shape": true,
+ "tfa.seq2seq.AttentionWrapper.compute_output_signature": true,
+ "tfa.seq2seq.AttentionWrapper.count_params": true,
+ "tfa.seq2seq.AttentionWrapper.dtype": true,
+ "tfa.seq2seq.AttentionWrapper.dynamic": true,
+ "tfa.seq2seq.AttentionWrapper.from_config": true,
+ "tfa.seq2seq.AttentionWrapper.get_config": true,
+ "tfa.seq2seq.AttentionWrapper.get_initial_state": true,
+ "tfa.seq2seq.AttentionWrapper.get_input_at": true,
+ "tfa.seq2seq.AttentionWrapper.get_input_mask_at": true,
+ "tfa.seq2seq.AttentionWrapper.get_input_shape_at": true,
+ "tfa.seq2seq.AttentionWrapper.get_losses_for": true,
+ "tfa.seq2seq.AttentionWrapper.get_output_at": true,
+ "tfa.seq2seq.AttentionWrapper.get_output_mask_at": true,
+ "tfa.seq2seq.AttentionWrapper.get_output_shape_at": true,
+ "tfa.seq2seq.AttentionWrapper.get_updates_for": true,
+ "tfa.seq2seq.AttentionWrapper.get_weights": true,
+ "tfa.seq2seq.AttentionWrapper.input": true,
+ "tfa.seq2seq.AttentionWrapper.input_mask": true,
+ "tfa.seq2seq.AttentionWrapper.input_shape": true,
+ "tfa.seq2seq.AttentionWrapper.input_spec": true,
+ "tfa.seq2seq.AttentionWrapper.losses": true,
+ "tfa.seq2seq.AttentionWrapper.metrics": true,
+ "tfa.seq2seq.AttentionWrapper.name": true,
+ "tfa.seq2seq.AttentionWrapper.name_scope": true,
+ "tfa.seq2seq.AttentionWrapper.non_trainable_variables": true,
+ "tfa.seq2seq.AttentionWrapper.non_trainable_weights": true,
+ "tfa.seq2seq.AttentionWrapper.output": true,
+ "tfa.seq2seq.AttentionWrapper.output_mask": true,
+ "tfa.seq2seq.AttentionWrapper.output_shape": true,
+ "tfa.seq2seq.AttentionWrapper.output_size": true,
+ "tfa.seq2seq.AttentionWrapper.set_weights": true,
+ "tfa.seq2seq.AttentionWrapper.state_size": true,
+ "tfa.seq2seq.AttentionWrapper.submodules": true,
+ "tfa.seq2seq.AttentionWrapper.trainable": true,
+ "tfa.seq2seq.AttentionWrapper.trainable_variables": true,
+ "tfa.seq2seq.AttentionWrapper.trainable_weights": true,
+ "tfa.seq2seq.AttentionWrapper.updates": true,
+ "tfa.seq2seq.AttentionWrapper.variables": true,
+ "tfa.seq2seq.AttentionWrapper.weights": true,
+ "tfa.seq2seq.AttentionWrapper.with_name_scope": true,
+ "tfa.seq2seq.AttentionWrapperState": false,
+ "tfa.seq2seq.AttentionWrapperState.__add__": true,
+ "tfa.seq2seq.AttentionWrapperState.__contains__": true,
+ "tfa.seq2seq.AttentionWrapperState.__eq__": true,
+ "tfa.seq2seq.AttentionWrapperState.__ge__": true,
+ "tfa.seq2seq.AttentionWrapperState.__getitem__": true,
+ "tfa.seq2seq.AttentionWrapperState.__gt__": true,
+ "tfa.seq2seq.AttentionWrapperState.__init__": true,
+ "tfa.seq2seq.AttentionWrapperState.__iter__": true,
+ "tfa.seq2seq.AttentionWrapperState.__le__": true,
+ "tfa.seq2seq.AttentionWrapperState.__len__": true,
+ "tfa.seq2seq.AttentionWrapperState.__lt__": true,
+ "tfa.seq2seq.AttentionWrapperState.__mul__": true,
+ "tfa.seq2seq.AttentionWrapperState.__ne__": true,
+ "tfa.seq2seq.AttentionWrapperState.__rmul__": true,
+ "tfa.seq2seq.AttentionWrapperState.alignment_history": true,
+ "tfa.seq2seq.AttentionWrapperState.alignments": true,
+ "tfa.seq2seq.AttentionWrapperState.attention": true,
+ "tfa.seq2seq.AttentionWrapperState.attention_state": true,
+ "tfa.seq2seq.AttentionWrapperState.cell_state": true,
+ "tfa.seq2seq.AttentionWrapperState.clone": true,
+ "tfa.seq2seq.AttentionWrapperState.count": true,
+ "tfa.seq2seq.AttentionWrapperState.index": true,
+ "tfa.seq2seq.AttentionWrapperState.time": true,
+ "tfa.seq2seq.BahdanauAttention": false,
+ "tfa.seq2seq.BahdanauAttention.__call__": true,
+ "tfa.seq2seq.BahdanauAttention.__init__": true,
+ "tfa.seq2seq.BahdanauAttention.activity_regularizer": true,
+ "tfa.seq2seq.BahdanauAttention.add_loss": true,
+ "tfa.seq2seq.BahdanauAttention.add_metric": true,
+ "tfa.seq2seq.BahdanauAttention.add_update": true,
+ "tfa.seq2seq.BahdanauAttention.add_variable": true,
+ "tfa.seq2seq.BahdanauAttention.add_weight": true,
+ "tfa.seq2seq.BahdanauAttention.alignments_size": true,
+ "tfa.seq2seq.BahdanauAttention.apply": true,
+ "tfa.seq2seq.BahdanauAttention.build": true,
+ "tfa.seq2seq.BahdanauAttention.call": true,
+ "tfa.seq2seq.BahdanauAttention.compute_mask": true,
+ "tfa.seq2seq.BahdanauAttention.compute_output_shape": true,
+ "tfa.seq2seq.BahdanauAttention.compute_output_signature": true,
+ "tfa.seq2seq.BahdanauAttention.count_params": true,
+ "tfa.seq2seq.BahdanauAttention.deserialize_inner_layer_from_config": true,
+ "tfa.seq2seq.BahdanauAttention.dtype": true,
+ "tfa.seq2seq.BahdanauAttention.dynamic": true,
+ "tfa.seq2seq.BahdanauAttention.from_config": true,
+ "tfa.seq2seq.BahdanauAttention.get_config": true,
+ "tfa.seq2seq.BahdanauAttention.get_input_at": true,
+ "tfa.seq2seq.BahdanauAttention.get_input_mask_at": true,
+ "tfa.seq2seq.BahdanauAttention.get_input_shape_at": true,
+ "tfa.seq2seq.BahdanauAttention.get_losses_for": true,
+ "tfa.seq2seq.BahdanauAttention.get_output_at": true,
+ "tfa.seq2seq.BahdanauAttention.get_output_mask_at": true,
+ "tfa.seq2seq.BahdanauAttention.get_output_shape_at": true,
+ "tfa.seq2seq.BahdanauAttention.get_updates_for": true,
+ "tfa.seq2seq.BahdanauAttention.get_weights": true,
+ "tfa.seq2seq.BahdanauAttention.initial_alignments": true,
+ "tfa.seq2seq.BahdanauAttention.initial_state": true,
+ "tfa.seq2seq.BahdanauAttention.input": true,
+ "tfa.seq2seq.BahdanauAttention.input_mask": true,
+ "tfa.seq2seq.BahdanauAttention.input_shape": true,
+ "tfa.seq2seq.BahdanauAttention.input_spec": true,
+ "tfa.seq2seq.BahdanauAttention.losses": true,
+ "tfa.seq2seq.BahdanauAttention.metrics": true,
+ "tfa.seq2seq.BahdanauAttention.name": true,
+ "tfa.seq2seq.BahdanauAttention.name_scope": true,
+ "tfa.seq2seq.BahdanauAttention.non_trainable_variables": true,
+ "tfa.seq2seq.BahdanauAttention.non_trainable_weights": true,
+ "tfa.seq2seq.BahdanauAttention.output": true,
+ "tfa.seq2seq.BahdanauAttention.output_mask": true,
+ "tfa.seq2seq.BahdanauAttention.output_shape": true,
+ "tfa.seq2seq.BahdanauAttention.set_weights": true,
+ "tfa.seq2seq.BahdanauAttention.state_size": true,
+ "tfa.seq2seq.BahdanauAttention.submodules": true,
+ "tfa.seq2seq.BahdanauAttention.trainable": true,
+ "tfa.seq2seq.BahdanauAttention.trainable_variables": true,
+ "tfa.seq2seq.BahdanauAttention.trainable_weights": true,
+ "tfa.seq2seq.BahdanauAttention.updates": true,
+ "tfa.seq2seq.BahdanauAttention.variables": true,
+ "tfa.seq2seq.BahdanauAttention.weights": true,
+ "tfa.seq2seq.BahdanauAttention.with_name_scope": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention": false,
+ "tfa.seq2seq.BahdanauMonotonicAttention.__call__": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.__init__": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.activity_regularizer": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.add_loss": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.add_metric": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.add_update": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.add_variable": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.add_weight": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.alignments_size": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.apply": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.build": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.call": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.compute_mask": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.compute_output_shape": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.compute_output_signature": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.count_params": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.deserialize_inner_layer_from_config": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.dtype": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.dynamic": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.from_config": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.get_config": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.get_input_at": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.get_input_mask_at": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.get_input_shape_at": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.get_losses_for": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.get_output_at": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.get_output_mask_at": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.get_output_shape_at": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.get_updates_for": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.get_weights": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.initial_alignments": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.initial_state": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.input": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.input_mask": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.input_shape": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.input_spec": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.losses": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.metrics": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.name": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.name_scope": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.non_trainable_variables": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.non_trainable_weights": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.output": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.output_mask": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.output_shape": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.set_weights": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.state_size": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.submodules": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.trainable": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.trainable_variables": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.trainable_weights": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.updates": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.variables": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.weights": true,
+ "tfa.seq2seq.BahdanauMonotonicAttention.with_name_scope": true,
+ "tfa.seq2seq.BaseDecoder": false,
+ "tfa.seq2seq.BaseDecoder.__call__": true,
+ "tfa.seq2seq.BaseDecoder.__init__": true,
+ "tfa.seq2seq.BaseDecoder.activity_regularizer": true,
+ "tfa.seq2seq.BaseDecoder.add_loss": true,
+ "tfa.seq2seq.BaseDecoder.add_metric": true,
+ "tfa.seq2seq.BaseDecoder.add_update": true,
+ "tfa.seq2seq.BaseDecoder.add_variable": true,
+ "tfa.seq2seq.BaseDecoder.add_weight": true,
+ "tfa.seq2seq.BaseDecoder.apply": true,
+ "tfa.seq2seq.BaseDecoder.batch_size": true,
+ "tfa.seq2seq.BaseDecoder.build": true,
+ "tfa.seq2seq.BaseDecoder.call": true,
+ "tfa.seq2seq.BaseDecoder.compute_mask": true,
+ "tfa.seq2seq.BaseDecoder.compute_output_shape": true,
+ "tfa.seq2seq.BaseDecoder.compute_output_signature": true,
+ "tfa.seq2seq.BaseDecoder.count_params": true,
+ "tfa.seq2seq.BaseDecoder.dtype": true,
+ "tfa.seq2seq.BaseDecoder.dynamic": true,
+ "tfa.seq2seq.BaseDecoder.finalize": true,
+ "tfa.seq2seq.BaseDecoder.from_config": true,
+ "tfa.seq2seq.BaseDecoder.get_config": true,
+ "tfa.seq2seq.BaseDecoder.get_input_at": true,
+ "tfa.seq2seq.BaseDecoder.get_input_mask_at": true,
+ "tfa.seq2seq.BaseDecoder.get_input_shape_at": true,
+ "tfa.seq2seq.BaseDecoder.get_losses_for": true,
+ "tfa.seq2seq.BaseDecoder.get_output_at": true,
+ "tfa.seq2seq.BaseDecoder.get_output_mask_at": true,
+ "tfa.seq2seq.BaseDecoder.get_output_shape_at": true,
+ "tfa.seq2seq.BaseDecoder.get_updates_for": true,
+ "tfa.seq2seq.BaseDecoder.get_weights": true,
+ "tfa.seq2seq.BaseDecoder.initialize": true,
+ "tfa.seq2seq.BaseDecoder.input": true,
+ "tfa.seq2seq.BaseDecoder.input_mask": true,
+ "tfa.seq2seq.BaseDecoder.input_shape": true,
+ "tfa.seq2seq.BaseDecoder.input_spec": true,
+ "tfa.seq2seq.BaseDecoder.losses": true,
+ "tfa.seq2seq.BaseDecoder.metrics": true,
+ "tfa.seq2seq.BaseDecoder.name": true,
+ "tfa.seq2seq.BaseDecoder.name_scope": true,
+ "tfa.seq2seq.BaseDecoder.non_trainable_variables": true,
+ "tfa.seq2seq.BaseDecoder.non_trainable_weights": true,
+ "tfa.seq2seq.BaseDecoder.output": true,
+ "tfa.seq2seq.BaseDecoder.output_dtype": true,
+ "tfa.seq2seq.BaseDecoder.output_mask": true,
+ "tfa.seq2seq.BaseDecoder.output_shape": true,
+ "tfa.seq2seq.BaseDecoder.output_size": true,
+ "tfa.seq2seq.BaseDecoder.set_weights": true,
+ "tfa.seq2seq.BaseDecoder.step": true,
+ "tfa.seq2seq.BaseDecoder.submodules": true,
+ "tfa.seq2seq.BaseDecoder.tracks_own_finished": true,
+ "tfa.seq2seq.BaseDecoder.trainable": true,
+ "tfa.seq2seq.BaseDecoder.trainable_variables": true,
+ "tfa.seq2seq.BaseDecoder.trainable_weights": true,
+ "tfa.seq2seq.BaseDecoder.updates": true,
+ "tfa.seq2seq.BaseDecoder.variables": true,
+ "tfa.seq2seq.BaseDecoder.weights": true,
+ "tfa.seq2seq.BaseDecoder.with_name_scope": true,
+ "tfa.seq2seq.BasicDecoder": false,
+ "tfa.seq2seq.BasicDecoder.__call__": true,
+ "tfa.seq2seq.BasicDecoder.__init__": true,
+ "tfa.seq2seq.BasicDecoder.activity_regularizer": true,
+ "tfa.seq2seq.BasicDecoder.add_loss": true,
+ "tfa.seq2seq.BasicDecoder.add_metric": true,
+ "tfa.seq2seq.BasicDecoder.add_update": true,
+ "tfa.seq2seq.BasicDecoder.add_variable": true,
+ "tfa.seq2seq.BasicDecoder.add_weight": true,
+ "tfa.seq2seq.BasicDecoder.apply": true,
+ "tfa.seq2seq.BasicDecoder.batch_size": true,
+ "tfa.seq2seq.BasicDecoder.build": true,
+ "tfa.seq2seq.BasicDecoder.call": true,
+ "tfa.seq2seq.BasicDecoder.compute_mask": true,
+ "tfa.seq2seq.BasicDecoder.compute_output_shape": true,
+ "tfa.seq2seq.BasicDecoder.compute_output_signature": true,
+ "tfa.seq2seq.BasicDecoder.count_params": true,
+ "tfa.seq2seq.BasicDecoder.dtype": true,
+ "tfa.seq2seq.BasicDecoder.dynamic": true,
+ "tfa.seq2seq.BasicDecoder.finalize": true,
+ "tfa.seq2seq.BasicDecoder.from_config": true,
+ "tfa.seq2seq.BasicDecoder.get_config": true,
+ "tfa.seq2seq.BasicDecoder.get_input_at": true,
+ "tfa.seq2seq.BasicDecoder.get_input_mask_at": true,
+ "tfa.seq2seq.BasicDecoder.get_input_shape_at": true,
+ "tfa.seq2seq.BasicDecoder.get_losses_for": true,
+ "tfa.seq2seq.BasicDecoder.get_output_at": true,
+ "tfa.seq2seq.BasicDecoder.get_output_mask_at": true,
+ "tfa.seq2seq.BasicDecoder.get_output_shape_at": true,
+ "tfa.seq2seq.BasicDecoder.get_updates_for": true,
+ "tfa.seq2seq.BasicDecoder.get_weights": true,
+ "tfa.seq2seq.BasicDecoder.initialize": true,
+ "tfa.seq2seq.BasicDecoder.input": true,
+ "tfa.seq2seq.BasicDecoder.input_mask": true,
+ "tfa.seq2seq.BasicDecoder.input_shape": true,
+ "tfa.seq2seq.BasicDecoder.input_spec": true,
+ "tfa.seq2seq.BasicDecoder.losses": true,
+ "tfa.seq2seq.BasicDecoder.metrics": true,
+ "tfa.seq2seq.BasicDecoder.name": true,
+ "tfa.seq2seq.BasicDecoder.name_scope": true,
+ "tfa.seq2seq.BasicDecoder.non_trainable_variables": true,
+ "tfa.seq2seq.BasicDecoder.non_trainable_weights": true,
+ "tfa.seq2seq.BasicDecoder.output": true,
+ "tfa.seq2seq.BasicDecoder.output_dtype": true,
+ "tfa.seq2seq.BasicDecoder.output_mask": true,
+ "tfa.seq2seq.BasicDecoder.output_shape": true,
+ "tfa.seq2seq.BasicDecoder.output_size": true,
+ "tfa.seq2seq.BasicDecoder.set_weights": true,
+ "tfa.seq2seq.BasicDecoder.step": true,
+ "tfa.seq2seq.BasicDecoder.submodules": true,
+ "tfa.seq2seq.BasicDecoder.tracks_own_finished": true,
+ "tfa.seq2seq.BasicDecoder.trainable": true,
+ "tfa.seq2seq.BasicDecoder.trainable_variables": true,
+ "tfa.seq2seq.BasicDecoder.trainable_weights": true,
+ "tfa.seq2seq.BasicDecoder.updates": true,
+ "tfa.seq2seq.BasicDecoder.variables": true,
+ "tfa.seq2seq.BasicDecoder.weights": true,
+ "tfa.seq2seq.BasicDecoder.with_name_scope": true,
+ "tfa.seq2seq.BasicDecoderOutput": false,
+ "tfa.seq2seq.BasicDecoderOutput.__add__": true,
+ "tfa.seq2seq.BasicDecoderOutput.__contains__": true,
+ "tfa.seq2seq.BasicDecoderOutput.__eq__": true,
+ "tfa.seq2seq.BasicDecoderOutput.__ge__": true,
+ "tfa.seq2seq.BasicDecoderOutput.__getitem__": true,
+ "tfa.seq2seq.BasicDecoderOutput.__gt__": true,
+ "tfa.seq2seq.BasicDecoderOutput.__init__": true,
+ "tfa.seq2seq.BasicDecoderOutput.__iter__": true,
+ "tfa.seq2seq.BasicDecoderOutput.__le__": true,
+ "tfa.seq2seq.BasicDecoderOutput.__len__": true,
+ "tfa.seq2seq.BasicDecoderOutput.__lt__": true,
+ "tfa.seq2seq.BasicDecoderOutput.__mul__": true,
+ "tfa.seq2seq.BasicDecoderOutput.__ne__": true,
+ "tfa.seq2seq.BasicDecoderOutput.__rmul__": true,
+ "tfa.seq2seq.BasicDecoderOutput.count": true,
+ "tfa.seq2seq.BasicDecoderOutput.index": true,
+ "tfa.seq2seq.BasicDecoderOutput.rnn_output": true,
+ "tfa.seq2seq.BasicDecoderOutput.sample_id": true,
+ "tfa.seq2seq.BeamSearchDecoder": false,
+ "tfa.seq2seq.BeamSearchDecoder.__call__": true,
+ "tfa.seq2seq.BeamSearchDecoder.__init__": true,
+ "tfa.seq2seq.BeamSearchDecoder.activity_regularizer": true,
+ "tfa.seq2seq.BeamSearchDecoder.add_loss": true,
+ "tfa.seq2seq.BeamSearchDecoder.add_metric": true,
+ "tfa.seq2seq.BeamSearchDecoder.add_update": true,
+ "tfa.seq2seq.BeamSearchDecoder.add_variable": true,
+ "tfa.seq2seq.BeamSearchDecoder.add_weight": true,
+ "tfa.seq2seq.BeamSearchDecoder.apply": true,
+ "tfa.seq2seq.BeamSearchDecoder.batch_size": true,
+ "tfa.seq2seq.BeamSearchDecoder.build": true,
+ "tfa.seq2seq.BeamSearchDecoder.call": true,
+ "tfa.seq2seq.BeamSearchDecoder.compute_mask": true,
+ "tfa.seq2seq.BeamSearchDecoder.compute_output_shape": true,
+ "tfa.seq2seq.BeamSearchDecoder.compute_output_signature": true,
+ "tfa.seq2seq.BeamSearchDecoder.count_params": true,
+ "tfa.seq2seq.BeamSearchDecoder.dtype": true,
+ "tfa.seq2seq.BeamSearchDecoder.dynamic": true,
+ "tfa.seq2seq.BeamSearchDecoder.finalize": true,
+ "tfa.seq2seq.BeamSearchDecoder.from_config": true,
+ "tfa.seq2seq.BeamSearchDecoder.get_config": true,
+ "tfa.seq2seq.BeamSearchDecoder.get_input_at": true,
+ "tfa.seq2seq.BeamSearchDecoder.get_input_mask_at": true,
+ "tfa.seq2seq.BeamSearchDecoder.get_input_shape_at": true,
+ "tfa.seq2seq.BeamSearchDecoder.get_losses_for": true,
+ "tfa.seq2seq.BeamSearchDecoder.get_output_at": true,
+ "tfa.seq2seq.BeamSearchDecoder.get_output_mask_at": true,
+ "tfa.seq2seq.BeamSearchDecoder.get_output_shape_at": true,
+ "tfa.seq2seq.BeamSearchDecoder.get_updates_for": true,
+ "tfa.seq2seq.BeamSearchDecoder.get_weights": true,
+ "tfa.seq2seq.BeamSearchDecoder.initialize": true,
+ "tfa.seq2seq.BeamSearchDecoder.input": true,
+ "tfa.seq2seq.BeamSearchDecoder.input_mask": true,
+ "tfa.seq2seq.BeamSearchDecoder.input_shape": true,
+ "tfa.seq2seq.BeamSearchDecoder.input_spec": true,
+ "tfa.seq2seq.BeamSearchDecoder.losses": true,
+ "tfa.seq2seq.BeamSearchDecoder.metrics": true,
+ "tfa.seq2seq.BeamSearchDecoder.name": true,
+ "tfa.seq2seq.BeamSearchDecoder.name_scope": true,
+ "tfa.seq2seq.BeamSearchDecoder.non_trainable_variables": true,
+ "tfa.seq2seq.BeamSearchDecoder.non_trainable_weights": true,
+ "tfa.seq2seq.BeamSearchDecoder.output": true,
+ "tfa.seq2seq.BeamSearchDecoder.output_dtype": true,
+ "tfa.seq2seq.BeamSearchDecoder.output_mask": true,
+ "tfa.seq2seq.BeamSearchDecoder.output_shape": true,
+ "tfa.seq2seq.BeamSearchDecoder.output_size": true,
+ "tfa.seq2seq.BeamSearchDecoder.set_weights": true,
+ "tfa.seq2seq.BeamSearchDecoder.step": true,
+ "tfa.seq2seq.BeamSearchDecoder.submodules": true,
+ "tfa.seq2seq.BeamSearchDecoder.tracks_own_finished": true,
+ "tfa.seq2seq.BeamSearchDecoder.trainable": true,
+ "tfa.seq2seq.BeamSearchDecoder.trainable_variables": true,
+ "tfa.seq2seq.BeamSearchDecoder.trainable_weights": true,
+ "tfa.seq2seq.BeamSearchDecoder.updates": true,
+ "tfa.seq2seq.BeamSearchDecoder.variables": true,
+ "tfa.seq2seq.BeamSearchDecoder.weights": true,
+ "tfa.seq2seq.BeamSearchDecoder.with_name_scope": true,
+ "tfa.seq2seq.BeamSearchDecoderOutput": false,
+ "tfa.seq2seq.BeamSearchDecoderOutput.__add__": true,
+ "tfa.seq2seq.BeamSearchDecoderOutput.__contains__": true,
+ "tfa.seq2seq.BeamSearchDecoderOutput.__eq__": true,
+ "tfa.seq2seq.BeamSearchDecoderOutput.__ge__": true,
+ "tfa.seq2seq.BeamSearchDecoderOutput.__getitem__": true,
+ "tfa.seq2seq.BeamSearchDecoderOutput.__gt__": true,
+ "tfa.seq2seq.BeamSearchDecoderOutput.__init__": true,
+ "tfa.seq2seq.BeamSearchDecoderOutput.__iter__": true,
+ "tfa.seq2seq.BeamSearchDecoderOutput.__le__": true,
+ "tfa.seq2seq.BeamSearchDecoderOutput.__len__": true,
+ "tfa.seq2seq.BeamSearchDecoderOutput.__lt__": true,
+ "tfa.seq2seq.BeamSearchDecoderOutput.__mul__": true,
+ "tfa.seq2seq.BeamSearchDecoderOutput.__ne__": true,
+ "tfa.seq2seq.BeamSearchDecoderOutput.__rmul__": true,
+ "tfa.seq2seq.BeamSearchDecoderOutput.count": true,
+ "tfa.seq2seq.BeamSearchDecoderOutput.index": true,
+ "tfa.seq2seq.BeamSearchDecoderOutput.parent_ids": true,
+ "tfa.seq2seq.BeamSearchDecoderOutput.predicted_ids": true,
+ "tfa.seq2seq.BeamSearchDecoderOutput.scores": true,
+ "tfa.seq2seq.BeamSearchDecoderState": false,
+ "tfa.seq2seq.BeamSearchDecoderState.__add__": true,
+ "tfa.seq2seq.BeamSearchDecoderState.__contains__": true,
+ "tfa.seq2seq.BeamSearchDecoderState.__eq__": true,
+ "tfa.seq2seq.BeamSearchDecoderState.__ge__": true,
+ "tfa.seq2seq.BeamSearchDecoderState.__getitem__": true,
+ "tfa.seq2seq.BeamSearchDecoderState.__gt__": true,
+ "tfa.seq2seq.BeamSearchDecoderState.__init__": true,
+ "tfa.seq2seq.BeamSearchDecoderState.__iter__": true,
+ "tfa.seq2seq.BeamSearchDecoderState.__le__": true,
+ "tfa.seq2seq.BeamSearchDecoderState.__len__": true,
+ "tfa.seq2seq.BeamSearchDecoderState.__lt__": true,
+ "tfa.seq2seq.BeamSearchDecoderState.__mul__": true,
+ "tfa.seq2seq.BeamSearchDecoderState.__ne__": true,
+ "tfa.seq2seq.BeamSearchDecoderState.__rmul__": true,
+ "tfa.seq2seq.BeamSearchDecoderState.accumulated_attention_probs": true,
+ "tfa.seq2seq.BeamSearchDecoderState.cell_state": true,
+ "tfa.seq2seq.BeamSearchDecoderState.count": true,
+ "tfa.seq2seq.BeamSearchDecoderState.finished": true,
+ "tfa.seq2seq.BeamSearchDecoderState.index": true,
+ "tfa.seq2seq.BeamSearchDecoderState.lengths": true,
+ "tfa.seq2seq.BeamSearchDecoderState.log_probs": true,
+ "tfa.seq2seq.CustomSampler": false,
+ "tfa.seq2seq.CustomSampler.__init__": true,
+ "tfa.seq2seq.CustomSampler.batch_size": true,
+ "tfa.seq2seq.CustomSampler.initialize": true,
+ "tfa.seq2seq.CustomSampler.next_inputs": true,
+ "tfa.seq2seq.CustomSampler.sample": true,
+ "tfa.seq2seq.CustomSampler.sample_ids_dtype": true,
+ "tfa.seq2seq.CustomSampler.sample_ids_shape": true,
+ "tfa.seq2seq.Decoder": false,
+ "tfa.seq2seq.Decoder.__init__": true,
+ "tfa.seq2seq.Decoder.batch_size": true,
+ "tfa.seq2seq.Decoder.finalize": true,
+ "tfa.seq2seq.Decoder.initialize": true,
+ "tfa.seq2seq.Decoder.output_dtype": true,
+ "tfa.seq2seq.Decoder.output_size": true,
+ "tfa.seq2seq.Decoder.step": true,
+ "tfa.seq2seq.Decoder.tracks_own_finished": true,
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput": false,
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__add__": true,
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__contains__": true,
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__eq__": true,
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__ge__": true,
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__getitem__": true,
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__gt__": true,
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__init__": true,
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__iter__": true,
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__le__": true,
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__len__": true,
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__lt__": true,
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__mul__": true,
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__ne__": true,
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.__rmul__": true,
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.beam_search_decoder_output": true,
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.count": true,
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.index": true,
+ "tfa.seq2seq.FinalBeamSearchDecoderOutput.predicted_ids": true,
+ "tfa.seq2seq.GreedyEmbeddingSampler": false,
+ "tfa.seq2seq.GreedyEmbeddingSampler.__init__": true,
+ "tfa.seq2seq.GreedyEmbeddingSampler.batch_size": true,
+ "tfa.seq2seq.GreedyEmbeddingSampler.initialize": true,
+ "tfa.seq2seq.GreedyEmbeddingSampler.next_inputs": true,
+ "tfa.seq2seq.GreedyEmbeddingSampler.sample": true,
+ "tfa.seq2seq.GreedyEmbeddingSampler.sample_ids_dtype": true,
+ "tfa.seq2seq.GreedyEmbeddingSampler.sample_ids_shape": true,
+ "tfa.seq2seq.InferenceSampler": false,
+ "tfa.seq2seq.InferenceSampler.__init__": true,
+ "tfa.seq2seq.InferenceSampler.batch_size": true,
+ "tfa.seq2seq.InferenceSampler.initialize": true,
+ "tfa.seq2seq.InferenceSampler.next_inputs": true,
+ "tfa.seq2seq.InferenceSampler.sample": true,
+ "tfa.seq2seq.InferenceSampler.sample_ids_dtype": true,
+ "tfa.seq2seq.InferenceSampler.sample_ids_shape": true,
+ "tfa.seq2seq.LuongAttention": false,
+ "tfa.seq2seq.LuongAttention.__call__": true,
+ "tfa.seq2seq.LuongAttention.__init__": true,
+ "tfa.seq2seq.LuongAttention.activity_regularizer": true,
+ "tfa.seq2seq.LuongAttention.add_loss": true,
+ "tfa.seq2seq.LuongAttention.add_metric": true,
+ "tfa.seq2seq.LuongAttention.add_update": true,
+ "tfa.seq2seq.LuongAttention.add_variable": true,
+ "tfa.seq2seq.LuongAttention.add_weight": true,
+ "tfa.seq2seq.LuongAttention.alignments_size": true,
+ "tfa.seq2seq.LuongAttention.apply": true,
+ "tfa.seq2seq.LuongAttention.build": true,
+ "tfa.seq2seq.LuongAttention.call": true,
+ "tfa.seq2seq.LuongAttention.compute_mask": true,
+ "tfa.seq2seq.LuongAttention.compute_output_shape": true,
+ "tfa.seq2seq.LuongAttention.compute_output_signature": true,
+ "tfa.seq2seq.LuongAttention.count_params": true,
+ "tfa.seq2seq.LuongAttention.deserialize_inner_layer_from_config": true,
+ "tfa.seq2seq.LuongAttention.dtype": true,
+ "tfa.seq2seq.LuongAttention.dynamic": true,
+ "tfa.seq2seq.LuongAttention.from_config": true,
+ "tfa.seq2seq.LuongAttention.get_config": true,
+ "tfa.seq2seq.LuongAttention.get_input_at": true,
+ "tfa.seq2seq.LuongAttention.get_input_mask_at": true,
+ "tfa.seq2seq.LuongAttention.get_input_shape_at": true,
+ "tfa.seq2seq.LuongAttention.get_losses_for": true,
+ "tfa.seq2seq.LuongAttention.get_output_at": true,
+ "tfa.seq2seq.LuongAttention.get_output_mask_at": true,
+ "tfa.seq2seq.LuongAttention.get_output_shape_at": true,
+ "tfa.seq2seq.LuongAttention.get_updates_for": true,
+ "tfa.seq2seq.LuongAttention.get_weights": true,
+ "tfa.seq2seq.LuongAttention.initial_alignments": true,
+ "tfa.seq2seq.LuongAttention.initial_state": true,
+ "tfa.seq2seq.LuongAttention.input": true,
+ "tfa.seq2seq.LuongAttention.input_mask": true,
+ "tfa.seq2seq.LuongAttention.input_shape": true,
+ "tfa.seq2seq.LuongAttention.input_spec": true,
+ "tfa.seq2seq.LuongAttention.losses": true,
+ "tfa.seq2seq.LuongAttention.metrics": true,
+ "tfa.seq2seq.LuongAttention.name": true,
+ "tfa.seq2seq.LuongAttention.name_scope": true,
+ "tfa.seq2seq.LuongAttention.non_trainable_variables": true,
+ "tfa.seq2seq.LuongAttention.non_trainable_weights": true,
+ "tfa.seq2seq.LuongAttention.output": true,
+ "tfa.seq2seq.LuongAttention.output_mask": true,
+ "tfa.seq2seq.LuongAttention.output_shape": true,
+ "tfa.seq2seq.LuongAttention.set_weights": true,
+ "tfa.seq2seq.LuongAttention.state_size": true,
+ "tfa.seq2seq.LuongAttention.submodules": true,
+ "tfa.seq2seq.LuongAttention.trainable": true,
+ "tfa.seq2seq.LuongAttention.trainable_variables": true,
+ "tfa.seq2seq.LuongAttention.trainable_weights": true,
+ "tfa.seq2seq.LuongAttention.updates": true,
+ "tfa.seq2seq.LuongAttention.variables": true,
+ "tfa.seq2seq.LuongAttention.weights": true,
+ "tfa.seq2seq.LuongAttention.with_name_scope": true,
+ "tfa.seq2seq.LuongMonotonicAttention": false,
+ "tfa.seq2seq.LuongMonotonicAttention.__call__": true,
+ "tfa.seq2seq.LuongMonotonicAttention.__init__": true,
+ "tfa.seq2seq.LuongMonotonicAttention.activity_regularizer": true,
+ "tfa.seq2seq.LuongMonotonicAttention.add_loss": true,
+ "tfa.seq2seq.LuongMonotonicAttention.add_metric": true,
+ "tfa.seq2seq.LuongMonotonicAttention.add_update": true,
+ "tfa.seq2seq.LuongMonotonicAttention.add_variable": true,
+ "tfa.seq2seq.LuongMonotonicAttention.add_weight": true,
+ "tfa.seq2seq.LuongMonotonicAttention.alignments_size": true,
+ "tfa.seq2seq.LuongMonotonicAttention.apply": true,
+ "tfa.seq2seq.LuongMonotonicAttention.build": true,
+ "tfa.seq2seq.LuongMonotonicAttention.call": true,
+ "tfa.seq2seq.LuongMonotonicAttention.compute_mask": true,
+ "tfa.seq2seq.LuongMonotonicAttention.compute_output_shape": true,
+ "tfa.seq2seq.LuongMonotonicAttention.compute_output_signature": true,
+ "tfa.seq2seq.LuongMonotonicAttention.count_params": true,
+ "tfa.seq2seq.LuongMonotonicAttention.deserialize_inner_layer_from_config": true,
+ "tfa.seq2seq.LuongMonotonicAttention.dtype": true,
+ "tfa.seq2seq.LuongMonotonicAttention.dynamic": true,
+ "tfa.seq2seq.LuongMonotonicAttention.from_config": true,
+ "tfa.seq2seq.LuongMonotonicAttention.get_config": true,
+ "tfa.seq2seq.LuongMonotonicAttention.get_input_at": true,
+ "tfa.seq2seq.LuongMonotonicAttention.get_input_mask_at": true,
+ "tfa.seq2seq.LuongMonotonicAttention.get_input_shape_at": true,
+ "tfa.seq2seq.LuongMonotonicAttention.get_losses_for": true,
+ "tfa.seq2seq.LuongMonotonicAttention.get_output_at": true,
+ "tfa.seq2seq.LuongMonotonicAttention.get_output_mask_at": true,
+ "tfa.seq2seq.LuongMonotonicAttention.get_output_shape_at": true,
+ "tfa.seq2seq.LuongMonotonicAttention.get_updates_for": true,
+ "tfa.seq2seq.LuongMonotonicAttention.get_weights": true,
+ "tfa.seq2seq.LuongMonotonicAttention.initial_alignments": true,
+ "tfa.seq2seq.LuongMonotonicAttention.initial_state": true,
+ "tfa.seq2seq.LuongMonotonicAttention.input": true,
+ "tfa.seq2seq.LuongMonotonicAttention.input_mask": true,
+ "tfa.seq2seq.LuongMonotonicAttention.input_shape": true,
+ "tfa.seq2seq.LuongMonotonicAttention.input_spec": true,
+ "tfa.seq2seq.LuongMonotonicAttention.losses": true,
+ "tfa.seq2seq.LuongMonotonicAttention.metrics": true,
+ "tfa.seq2seq.LuongMonotonicAttention.name": true,
+ "tfa.seq2seq.LuongMonotonicAttention.name_scope": true,
+ "tfa.seq2seq.LuongMonotonicAttention.non_trainable_variables": true,
+ "tfa.seq2seq.LuongMonotonicAttention.non_trainable_weights": true,
+ "tfa.seq2seq.LuongMonotonicAttention.output": true,
+ "tfa.seq2seq.LuongMonotonicAttention.output_mask": true,
+ "tfa.seq2seq.LuongMonotonicAttention.output_shape": true,
+ "tfa.seq2seq.LuongMonotonicAttention.set_weights": true,
+ "tfa.seq2seq.LuongMonotonicAttention.state_size": true,
+ "tfa.seq2seq.LuongMonotonicAttention.submodules": true,
+ "tfa.seq2seq.LuongMonotonicAttention.trainable": true,
+ "tfa.seq2seq.LuongMonotonicAttention.trainable_variables": true,
+ "tfa.seq2seq.LuongMonotonicAttention.trainable_weights": true,
+ "tfa.seq2seq.LuongMonotonicAttention.updates": true,
+ "tfa.seq2seq.LuongMonotonicAttention.variables": true,
+ "tfa.seq2seq.LuongMonotonicAttention.weights": true,
+ "tfa.seq2seq.LuongMonotonicAttention.with_name_scope": true,
+ "tfa.seq2seq.SampleEmbeddingSampler": false,
+ "tfa.seq2seq.SampleEmbeddingSampler.__init__": true,
+ "tfa.seq2seq.SampleEmbeddingSampler.batch_size": true,
+ "tfa.seq2seq.SampleEmbeddingSampler.initialize": true,
+ "tfa.seq2seq.SampleEmbeddingSampler.next_inputs": true,
+ "tfa.seq2seq.SampleEmbeddingSampler.sample": true,
+ "tfa.seq2seq.SampleEmbeddingSampler.sample_ids_dtype": true,
+ "tfa.seq2seq.SampleEmbeddingSampler.sample_ids_shape": true,
+ "tfa.seq2seq.Sampler": false,
+ "tfa.seq2seq.Sampler.__init__": true,
+ "tfa.seq2seq.Sampler.batch_size": true,
+ "tfa.seq2seq.Sampler.initialize": true,
+ "tfa.seq2seq.Sampler.next_inputs": true,
+ "tfa.seq2seq.Sampler.sample": true,
+ "tfa.seq2seq.Sampler.sample_ids_dtype": true,
+ "tfa.seq2seq.Sampler.sample_ids_shape": true,
+ "tfa.seq2seq.ScheduledEmbeddingTrainingSampler": false,
+ "tfa.seq2seq.ScheduledEmbeddingTrainingSampler.__init__": true,
+ "tfa.seq2seq.ScheduledEmbeddingTrainingSampler.batch_size": true,
+ "tfa.seq2seq.ScheduledEmbeddingTrainingSampler.initialize": true,
+ "tfa.seq2seq.ScheduledEmbeddingTrainingSampler.next_inputs": true,
+ "tfa.seq2seq.ScheduledEmbeddingTrainingSampler.sample": true,
+ "tfa.seq2seq.ScheduledEmbeddingTrainingSampler.sample_ids_dtype": true,
+ "tfa.seq2seq.ScheduledEmbeddingTrainingSampler.sample_ids_shape": true,
+ "tfa.seq2seq.ScheduledOutputTrainingSampler": false,
+ "tfa.seq2seq.ScheduledOutputTrainingSampler.__init__": true,
+ "tfa.seq2seq.ScheduledOutputTrainingSampler.batch_size": true,
+ "tfa.seq2seq.ScheduledOutputTrainingSampler.initialize": true,
+ "tfa.seq2seq.ScheduledOutputTrainingSampler.next_inputs": true,
+ "tfa.seq2seq.ScheduledOutputTrainingSampler.sample": true,
+ "tfa.seq2seq.ScheduledOutputTrainingSampler.sample_ids_dtype": true,
+ "tfa.seq2seq.ScheduledOutputTrainingSampler.sample_ids_shape": true,
+ "tfa.seq2seq.SequenceLoss": false,
+ "tfa.seq2seq.SequenceLoss.__call__": true,
+ "tfa.seq2seq.SequenceLoss.__init__": true,
+ "tfa.seq2seq.SequenceLoss.call": true,
+ "tfa.seq2seq.SequenceLoss.from_config": true,
+ "tfa.seq2seq.SequenceLoss.get_config": true,
+ "tfa.seq2seq.TrainingSampler": false,
+ "tfa.seq2seq.TrainingSampler.__init__": true,
+ "tfa.seq2seq.TrainingSampler.batch_size": true,
+ "tfa.seq2seq.TrainingSampler.initialize": true,
+ "tfa.seq2seq.TrainingSampler.next_inputs": true,
+ "tfa.seq2seq.TrainingSampler.sample": true,
+ "tfa.seq2seq.TrainingSampler.sample_ids_dtype": true,
+ "tfa.seq2seq.TrainingSampler.sample_ids_shape": true,
+ "tfa.seq2seq.absolute_import": true,
+ "tfa.seq2seq.attention_wrapper": false,
+ "tfa.seq2seq.attention_wrapper.AttentionMechanism": false,
+ "tfa.seq2seq.attention_wrapper.AttentionMechanism.__init__": true,
+ "tfa.seq2seq.attention_wrapper.AttentionMechanism.alignments_size": true,
+ "tfa.seq2seq.attention_wrapper.AttentionMechanism.state_size": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper": false,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.__call__": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.__init__": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.activity_regularizer": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.add_loss": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.add_metric": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.add_update": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.add_variable": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.add_weight": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.apply": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.build": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.call": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.compute_mask": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.compute_output_shape": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.compute_output_signature": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.count_params": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.dtype": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.dynamic": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.from_config": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.get_config": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.get_initial_state": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.get_input_at": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.get_input_mask_at": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.get_input_shape_at": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.get_losses_for": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.get_output_at": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.get_output_mask_at": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.get_output_shape_at": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.get_updates_for": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.get_weights": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.input": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.input_mask": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.input_shape": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.input_spec": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.losses": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.metrics": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.name": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.name_scope": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.non_trainable_variables": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.non_trainable_weights": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.output": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.output_mask": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.output_shape": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.output_size": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.set_weights": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.state_size": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.submodules": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.trainable": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.trainable_variables": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.trainable_weights": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.updates": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.variables": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.weights": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapper.with_name_scope": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState": false,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__add__": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__contains__": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__eq__": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__ge__": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__getitem__": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__gt__": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__init__": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__iter__": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__le__": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__len__": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__lt__": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__mul__": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__ne__": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.__rmul__": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.alignment_history": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.alignments": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.attention": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.attention_state": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.cell_state": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.clone": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.count": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.index": true,
+ "tfa.seq2seq.attention_wrapper.AttentionWrapperState.time": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention": false,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.__call__": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.__init__": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.activity_regularizer": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.add_loss": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.add_metric": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.add_update": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.add_variable": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.add_weight": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.alignments_size": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.apply": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.build": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.call": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.compute_mask": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.compute_output_shape": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.compute_output_signature": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.count_params": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.deserialize_inner_layer_from_config": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.dtype": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.dynamic": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.from_config": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.get_config": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.get_input_at": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.get_input_mask_at": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.get_input_shape_at": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.get_losses_for": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.get_output_at": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.get_output_mask_at": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.get_output_shape_at": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.get_updates_for": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.get_weights": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.initial_alignments": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.initial_state": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.input": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.input_mask": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.input_shape": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.input_spec": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.losses": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.metrics": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.name": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.name_scope": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.non_trainable_variables": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.non_trainable_weights": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.output": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.output_mask": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.output_shape": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.set_weights": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.state_size": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.submodules": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.trainable": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.trainable_variables": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.trainable_weights": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.updates": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.variables": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.weights": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauAttention.with_name_scope": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention": false,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.__call__": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.__init__": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.activity_regularizer": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.add_loss": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.add_metric": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.add_update": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.add_variable": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.add_weight": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.alignments_size": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.apply": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.build": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.call": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.compute_mask": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.compute_output_shape": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.compute_output_signature": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.count_params": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.deserialize_inner_layer_from_config": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.dtype": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.dynamic": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.from_config": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.get_config": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.get_input_at": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.get_input_mask_at": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.get_input_shape_at": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.get_losses_for": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.get_output_at": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.get_output_mask_at": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.get_output_shape_at": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.get_updates_for": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.get_weights": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.initial_alignments": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.initial_state": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.input": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.input_mask": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.input_shape": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.input_spec": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.losses": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.metrics": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.name": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.name_scope": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.non_trainable_variables": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.non_trainable_weights": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.output": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.output_mask": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.output_shape": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.set_weights": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.state_size": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.submodules": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.trainable": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.trainable_variables": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.trainable_weights": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.updates": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.variables": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.weights": true,
+ "tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention.with_name_scope": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention": false,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.__call__": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.__init__": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.activity_regularizer": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.add_loss": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.add_metric": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.add_update": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.add_variable": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.add_weight": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.alignments_size": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.apply": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.build": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.call": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.compute_mask": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.compute_output_shape": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.compute_output_signature": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.count_params": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.deserialize_inner_layer_from_config": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.dtype": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.dynamic": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.from_config": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.get_config": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.get_input_at": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.get_input_mask_at": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.get_input_shape_at": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.get_losses_for": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.get_output_at": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.get_output_mask_at": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.get_output_shape_at": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.get_updates_for": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.get_weights": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.initial_alignments": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.initial_state": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.input": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.input_mask": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.input_shape": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.input_spec": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.losses": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.metrics": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.name": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.name_scope": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.non_trainable_variables": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.non_trainable_weights": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.output": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.output_mask": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.output_shape": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.set_weights": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.state_size": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.submodules": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.trainable": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.trainable_variables": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.trainable_weights": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.updates": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.variables": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.weights": true,
+ "tfa.seq2seq.attention_wrapper.LuongAttention.with_name_scope": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention": false,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.__call__": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.__init__": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.activity_regularizer": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.add_loss": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.add_metric": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.add_update": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.add_variable": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.add_weight": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.alignments_size": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.apply": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.build": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.call": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.compute_mask": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.compute_output_shape": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.compute_output_signature": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.count_params": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.deserialize_inner_layer_from_config": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.dtype": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.dynamic": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.from_config": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.get_config": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.get_input_at": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.get_input_mask_at": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.get_input_shape_at": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.get_losses_for": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.get_output_at": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.get_output_mask_at": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.get_output_shape_at": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.get_updates_for": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.get_weights": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.initial_alignments": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.initial_state": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.input": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.input_mask": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.input_shape": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.input_spec": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.losses": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.metrics": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.name": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.name_scope": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.non_trainable_variables": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.non_trainable_weights": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.output": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.output_mask": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.output_shape": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.set_weights": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.state_size": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.submodules": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.trainable": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.trainable_variables": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.trainable_weights": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.updates": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.variables": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.weights": true,
+ "tfa.seq2seq.attention_wrapper.LuongMonotonicAttention.with_name_scope": true,
+ "tfa.seq2seq.attention_wrapper.absolute_import": true,
+ "tfa.seq2seq.attention_wrapper.division": true,
+ "tfa.seq2seq.attention_wrapper.hardmax": false,
+ "tfa.seq2seq.attention_wrapper.monotonic_attention": false,
+ "tfa.seq2seq.attention_wrapper.print_function": true,
+ "tfa.seq2seq.attention_wrapper.safe_cumprod": false,
+ "tfa.seq2seq.basic_decoder": false,
+ "tfa.seq2seq.basic_decoder.BasicDecoder": false,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.__call__": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.__init__": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.activity_regularizer": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.add_loss": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.add_metric": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.add_update": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.add_variable": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.add_weight": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.apply": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.batch_size": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.build": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.call": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.compute_mask": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.compute_output_shape": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.compute_output_signature": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.count_params": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.dtype": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.dynamic": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.finalize": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.from_config": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.get_config": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.get_input_at": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.get_input_mask_at": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.get_input_shape_at": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.get_losses_for": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.get_output_at": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.get_output_mask_at": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.get_output_shape_at": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.get_updates_for": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.get_weights": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.initialize": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.input": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.input_mask": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.input_shape": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.input_spec": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.losses": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.metrics": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.name": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.name_scope": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.non_trainable_variables": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.non_trainable_weights": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.output": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.output_dtype": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.output_mask": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.output_shape": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.output_size": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.set_weights": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.step": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.submodules": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.tracks_own_finished": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.trainable": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.trainable_variables": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.trainable_weights": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.updates": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.variables": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.weights": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoder.with_name_scope": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput": false,
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__add__": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__contains__": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__eq__": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__ge__": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__getitem__": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__gt__": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__init__": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__iter__": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__le__": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__len__": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__lt__": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__mul__": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__ne__": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.__rmul__": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.count": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.index": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.rnn_output": true,
+ "tfa.seq2seq.basic_decoder.BasicDecoderOutput.sample_id": true,
+ "tfa.seq2seq.basic_decoder.absolute_import": true,
+ "tfa.seq2seq.basic_decoder.division": true,
+ "tfa.seq2seq.basic_decoder.print_function": true,
+ "tfa.seq2seq.beam_search_decoder": false,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder": false,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.__call__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.__init__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.activity_regularizer": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.add_loss": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.add_metric": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.add_update": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.add_variable": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.add_weight": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.apply": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.batch_size": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.build": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.call": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.compute_mask": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.compute_output_shape": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.compute_output_signature": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.count_params": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.dtype": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.dynamic": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.finalize": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.from_config": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.get_config": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.get_input_at": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.get_input_mask_at": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.get_input_shape_at": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.get_losses_for": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.get_output_at": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.get_output_mask_at": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.get_output_shape_at": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.get_updates_for": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.get_weights": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.initialize": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.input": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.input_mask": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.input_shape": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.input_spec": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.losses": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.metrics": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.name": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.name_scope": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.non_trainable_variables": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.non_trainable_weights": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.output": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.output_dtype": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.output_mask": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.output_shape": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.output_size": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.set_weights": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.step": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.submodules": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.tracks_own_finished": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.trainable": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.trainable_variables": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.trainable_weights": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.updates": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.variables": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.weights": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoder.with_name_scope": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderMixin": false,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderMixin.__init__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderMixin.batch_size": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderMixin.finalize": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderMixin.output_size": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderMixin.step": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderMixin.tracks_own_finished": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput": false,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__add__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__contains__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__eq__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__ge__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__getitem__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__gt__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__init__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__iter__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__le__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__len__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__lt__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__mul__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__ne__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.__rmul__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.count": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.index": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.parent_ids": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.predicted_ids": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput.scores": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState": false,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__add__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__contains__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__eq__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__ge__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__getitem__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__gt__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__init__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__iter__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__le__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__len__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__lt__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__mul__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__ne__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.__rmul__": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.accumulated_attention_probs": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.cell_state": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.count": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.finished": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.index": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.lengths": true,
+ "tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState.log_probs": true,
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput": false,
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__add__": true,
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__contains__": true,
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__eq__": true,
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__ge__": true,
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__getitem__": true,
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__gt__": true,
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__init__": true,
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__iter__": true,
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__le__": true,
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__len__": true,
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__lt__": true,
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__mul__": true,
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__ne__": true,
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.__rmul__": true,
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.beam_search_decoder_output": true,
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.count": true,
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.index": true,
+ "tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput.predicted_ids": true,
+ "tfa.seq2seq.beam_search_decoder.absolute_import": true,
+ "tfa.seq2seq.beam_search_decoder.attention_probs_from_attn_state": false,
+ "tfa.seq2seq.beam_search_decoder.division": true,
+ "tfa.seq2seq.beam_search_decoder.gather_tree_from_array": false,
+ "tfa.seq2seq.beam_search_decoder.get_attention_probs": false,
+ "tfa.seq2seq.beam_search_decoder.print_function": true,
+ "tfa.seq2seq.beam_search_decoder.tile_batch": false,
+ "tfa.seq2seq.decoder": false,
+ "tfa.seq2seq.decoder.BaseDecoder": false,
+ "tfa.seq2seq.decoder.BaseDecoder.__call__": true,
+ "tfa.seq2seq.decoder.BaseDecoder.__init__": true,
+ "tfa.seq2seq.decoder.BaseDecoder.activity_regularizer": true,
+ "tfa.seq2seq.decoder.BaseDecoder.add_loss": true,
+ "tfa.seq2seq.decoder.BaseDecoder.add_metric": true,
+ "tfa.seq2seq.decoder.BaseDecoder.add_update": true,
+ "tfa.seq2seq.decoder.BaseDecoder.add_variable": true,
+ "tfa.seq2seq.decoder.BaseDecoder.add_weight": true,
+ "tfa.seq2seq.decoder.BaseDecoder.apply": true,
+ "tfa.seq2seq.decoder.BaseDecoder.batch_size": true,
+ "tfa.seq2seq.decoder.BaseDecoder.build": true,
+ "tfa.seq2seq.decoder.BaseDecoder.call": true,
+ "tfa.seq2seq.decoder.BaseDecoder.compute_mask": true,
+ "tfa.seq2seq.decoder.BaseDecoder.compute_output_shape": true,
+ "tfa.seq2seq.decoder.BaseDecoder.compute_output_signature": true,
+ "tfa.seq2seq.decoder.BaseDecoder.count_params": true,
+ "tfa.seq2seq.decoder.BaseDecoder.dtype": true,
+ "tfa.seq2seq.decoder.BaseDecoder.dynamic": true,
+ "tfa.seq2seq.decoder.BaseDecoder.finalize": true,
+ "tfa.seq2seq.decoder.BaseDecoder.from_config": true,
+ "tfa.seq2seq.decoder.BaseDecoder.get_config": true,
+ "tfa.seq2seq.decoder.BaseDecoder.get_input_at": true,
+ "tfa.seq2seq.decoder.BaseDecoder.get_input_mask_at": true,
+ "tfa.seq2seq.decoder.BaseDecoder.get_input_shape_at": true,
+ "tfa.seq2seq.decoder.BaseDecoder.get_losses_for": true,
+ "tfa.seq2seq.decoder.BaseDecoder.get_output_at": true,
+ "tfa.seq2seq.decoder.BaseDecoder.get_output_mask_at": true,
+ "tfa.seq2seq.decoder.BaseDecoder.get_output_shape_at": true,
+ "tfa.seq2seq.decoder.BaseDecoder.get_updates_for": true,
+ "tfa.seq2seq.decoder.BaseDecoder.get_weights": true,
+ "tfa.seq2seq.decoder.BaseDecoder.initialize": true,
+ "tfa.seq2seq.decoder.BaseDecoder.input": true,
+ "tfa.seq2seq.decoder.BaseDecoder.input_mask": true,
+ "tfa.seq2seq.decoder.BaseDecoder.input_shape": true,
+ "tfa.seq2seq.decoder.BaseDecoder.input_spec": true,
+ "tfa.seq2seq.decoder.BaseDecoder.losses": true,
+ "tfa.seq2seq.decoder.BaseDecoder.metrics": true,
+ "tfa.seq2seq.decoder.BaseDecoder.name": true,
+ "tfa.seq2seq.decoder.BaseDecoder.name_scope": true,
+ "tfa.seq2seq.decoder.BaseDecoder.non_trainable_variables": true,
+ "tfa.seq2seq.decoder.BaseDecoder.non_trainable_weights": true,
+ "tfa.seq2seq.decoder.BaseDecoder.output": true,
+ "tfa.seq2seq.decoder.BaseDecoder.output_dtype": true,
+ "tfa.seq2seq.decoder.BaseDecoder.output_mask": true,
+ "tfa.seq2seq.decoder.BaseDecoder.output_shape": true,
+ "tfa.seq2seq.decoder.BaseDecoder.output_size": true,
+ "tfa.seq2seq.decoder.BaseDecoder.set_weights": true,
+ "tfa.seq2seq.decoder.BaseDecoder.step": true,
+ "tfa.seq2seq.decoder.BaseDecoder.submodules": true,
+ "tfa.seq2seq.decoder.BaseDecoder.tracks_own_finished": true,
+ "tfa.seq2seq.decoder.BaseDecoder.trainable": true,
+ "tfa.seq2seq.decoder.BaseDecoder.trainable_variables": true,
+ "tfa.seq2seq.decoder.BaseDecoder.trainable_weights": true,
+ "tfa.seq2seq.decoder.BaseDecoder.updates": true,
+ "tfa.seq2seq.decoder.BaseDecoder.variables": true,
+ "tfa.seq2seq.decoder.BaseDecoder.weights": true,
+ "tfa.seq2seq.decoder.BaseDecoder.with_name_scope": true,
+ "tfa.seq2seq.decoder.Decoder": false,
+ "tfa.seq2seq.decoder.Decoder.__init__": true,
+ "tfa.seq2seq.decoder.Decoder.batch_size": true,
+ "tfa.seq2seq.decoder.Decoder.finalize": true,
+ "tfa.seq2seq.decoder.Decoder.initialize": true,
+ "tfa.seq2seq.decoder.Decoder.output_dtype": true,
+ "tfa.seq2seq.decoder.Decoder.output_size": true,
+ "tfa.seq2seq.decoder.Decoder.step": true,
+ "tfa.seq2seq.decoder.Decoder.tracks_own_finished": true,
+ "tfa.seq2seq.decoder.absolute_import": true,
+ "tfa.seq2seq.decoder.division": true,
+ "tfa.seq2seq.decoder.dynamic_decode": false,
+ "tfa.seq2seq.decoder.print_function": true,
+ "tfa.seq2seq.division": true,
+ "tfa.seq2seq.dynamic_decode": false,
+ "tfa.seq2seq.gather_tree_from_array": false,
+ "tfa.seq2seq.hardmax": false,
+ "tfa.seq2seq.loss": false,
+ "tfa.seq2seq.loss.SequenceLoss": false,
+ "tfa.seq2seq.loss.SequenceLoss.__call__": true,
+ "tfa.seq2seq.loss.SequenceLoss.__init__": true,
+ "tfa.seq2seq.loss.SequenceLoss.call": true,
+ "tfa.seq2seq.loss.SequenceLoss.from_config": true,
+ "tfa.seq2seq.loss.SequenceLoss.get_config": true,
+ "tfa.seq2seq.loss.absolute_import": true,
+ "tfa.seq2seq.loss.division": true,
+ "tfa.seq2seq.loss.print_function": true,
+ "tfa.seq2seq.loss.sequence_loss": false,
+ "tfa.seq2seq.monotonic_attention": false,
+ "tfa.seq2seq.print_function": true,
+ "tfa.seq2seq.safe_cumprod": false,
+ "tfa.seq2seq.sampler": false,
+ "tfa.seq2seq.sampler.CustomSampler": false,
+ "tfa.seq2seq.sampler.CustomSampler.__init__": true,
+ "tfa.seq2seq.sampler.CustomSampler.batch_size": true,
+ "tfa.seq2seq.sampler.CustomSampler.initialize": true,
+ "tfa.seq2seq.sampler.CustomSampler.next_inputs": true,
+ "tfa.seq2seq.sampler.CustomSampler.sample": true,
+ "tfa.seq2seq.sampler.CustomSampler.sample_ids_dtype": true,
+ "tfa.seq2seq.sampler.CustomSampler.sample_ids_shape": true,
+ "tfa.seq2seq.sampler.GreedyEmbeddingSampler": false,
+ "tfa.seq2seq.sampler.GreedyEmbeddingSampler.__init__": true,
+ "tfa.seq2seq.sampler.GreedyEmbeddingSampler.batch_size": true,
+ "tfa.seq2seq.sampler.GreedyEmbeddingSampler.initialize": true,
+ "tfa.seq2seq.sampler.GreedyEmbeddingSampler.next_inputs": true,
+ "tfa.seq2seq.sampler.GreedyEmbeddingSampler.sample": true,
+ "tfa.seq2seq.sampler.GreedyEmbeddingSampler.sample_ids_dtype": true,
+ "tfa.seq2seq.sampler.GreedyEmbeddingSampler.sample_ids_shape": true,
+ "tfa.seq2seq.sampler.InferenceSampler": false,
+ "tfa.seq2seq.sampler.InferenceSampler.__init__": true,
+ "tfa.seq2seq.sampler.InferenceSampler.batch_size": true,
+ "tfa.seq2seq.sampler.InferenceSampler.initialize": true,
+ "tfa.seq2seq.sampler.InferenceSampler.next_inputs": true,
+ "tfa.seq2seq.sampler.InferenceSampler.sample": true,
+ "tfa.seq2seq.sampler.InferenceSampler.sample_ids_dtype": true,
+ "tfa.seq2seq.sampler.InferenceSampler.sample_ids_shape": true,
+ "tfa.seq2seq.sampler.SampleEmbeddingSampler": false,
+ "tfa.seq2seq.sampler.SampleEmbeddingSampler.__init__": true,
+ "tfa.seq2seq.sampler.SampleEmbeddingSampler.batch_size": true,
+ "tfa.seq2seq.sampler.SampleEmbeddingSampler.initialize": true,
+ "tfa.seq2seq.sampler.SampleEmbeddingSampler.next_inputs": true,
+ "tfa.seq2seq.sampler.SampleEmbeddingSampler.sample": true,
+ "tfa.seq2seq.sampler.SampleEmbeddingSampler.sample_ids_dtype": true,
+ "tfa.seq2seq.sampler.SampleEmbeddingSampler.sample_ids_shape": true,
+ "tfa.seq2seq.sampler.Sampler": false,
+ "tfa.seq2seq.sampler.Sampler.__init__": true,
+ "tfa.seq2seq.sampler.Sampler.batch_size": true,
+ "tfa.seq2seq.sampler.Sampler.initialize": true,
+ "tfa.seq2seq.sampler.Sampler.next_inputs": true,
+ "tfa.seq2seq.sampler.Sampler.sample": true,
+ "tfa.seq2seq.sampler.Sampler.sample_ids_dtype": true,
+ "tfa.seq2seq.sampler.Sampler.sample_ids_shape": true,
+ "tfa.seq2seq.sampler.ScheduledEmbeddingTrainingSampler": false,
+ "tfa.seq2seq.sampler.ScheduledEmbeddingTrainingSampler.__init__": true,
+ "tfa.seq2seq.sampler.ScheduledEmbeddingTrainingSampler.batch_size": true,
+ "tfa.seq2seq.sampler.ScheduledEmbeddingTrainingSampler.initialize": true,
+ "tfa.seq2seq.sampler.ScheduledEmbeddingTrainingSampler.next_inputs": true,
+ "tfa.seq2seq.sampler.ScheduledEmbeddingTrainingSampler.sample": true,
+ "tfa.seq2seq.sampler.ScheduledEmbeddingTrainingSampler.sample_ids_dtype": true,
+ "tfa.seq2seq.sampler.ScheduledEmbeddingTrainingSampler.sample_ids_shape": true,
+ "tfa.seq2seq.sampler.ScheduledOutputTrainingSampler": false,
+ "tfa.seq2seq.sampler.ScheduledOutputTrainingSampler.__init__": true,
+ "tfa.seq2seq.sampler.ScheduledOutputTrainingSampler.batch_size": true,
+ "tfa.seq2seq.sampler.ScheduledOutputTrainingSampler.initialize": true,
+ "tfa.seq2seq.sampler.ScheduledOutputTrainingSampler.next_inputs": true,
+ "tfa.seq2seq.sampler.ScheduledOutputTrainingSampler.sample": true,
+ "tfa.seq2seq.sampler.ScheduledOutputTrainingSampler.sample_ids_dtype": true,
+ "tfa.seq2seq.sampler.ScheduledOutputTrainingSampler.sample_ids_shape": true,
+ "tfa.seq2seq.sampler.TrainingSampler": false,
+ "tfa.seq2seq.sampler.TrainingSampler.__init__": true,
+ "tfa.seq2seq.sampler.TrainingSampler.batch_size": true,
+ "tfa.seq2seq.sampler.TrainingSampler.initialize": true,
+ "tfa.seq2seq.sampler.TrainingSampler.next_inputs": true,
+ "tfa.seq2seq.sampler.TrainingSampler.sample": true,
+ "tfa.seq2seq.sampler.TrainingSampler.sample_ids_dtype": true,
+ "tfa.seq2seq.sampler.TrainingSampler.sample_ids_shape": true,
+ "tfa.seq2seq.sampler.absolute_import": true,
+ "tfa.seq2seq.sampler.bernoulli_sample": false,
+ "tfa.seq2seq.sampler.categorical_sample": false,
+ "tfa.seq2seq.sampler.division": true,
+ "tfa.seq2seq.sampler.print_function": true,
+ "tfa.seq2seq.sequence_loss": false,
+ "tfa.seq2seq.tile_batch": false,
+ "tfa.text": false,
+ "tfa.text.absolute_import": true,
+ "tfa.text.division": true,
+ "tfa.text.print_function": true,
+ "tfa.text.skip_gram_ops": false,
+ "tfa.text.skip_gram_ops.absolute_import": true,
+ "tfa.text.skip_gram_ops.division": true,
+ "tfa.text.skip_gram_ops.print_function": true,
+ "tfa.text.skip_gram_ops.skip_gram_sample": false,
+ "tfa.text.skip_gram_ops.skip_gram_sample_with_text_vocab": false,
+ "tfa.text.skip_gram_sample": false,
+ "tfa.text.skip_gram_sample_with_text_vocab": false
+ },
+ "py_module_names": [
+ "tfa"
+ ]
+}
\ No newline at end of file
diff --git a/docs/api_docs/python/tfa/activations.md b/docs/api_docs/python/tfa/activations.md
new file mode 100644
index 0000000000..f688c75501
--- /dev/null
+++ b/docs/api_docs/python/tfa/activations.md
@@ -0,0 +1,20 @@
+
+
+
+
+
+# Module: tfa.activations
+
+A module containing activation routines.
+
+
+
+Defined in [`activations/__init__.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/activations/__init__.py).
+
+
+
+
+## Functions
+
+[`sparsemax(...)`](../tfa/activations/sparsemax.md): Sparsemax activation function [1].
+
diff --git a/docs/api_docs/python/tfa/activations/sparsemax.md b/docs/api_docs/python/tfa/activations/sparsemax.md
new file mode 100644
index 0000000000..d3b19f68d7
--- /dev/null
+++ b/docs/api_docs/python/tfa/activations/sparsemax.md
@@ -0,0 +1,50 @@
+
+
+
+
+
+# tfa.activations.sparsemax
+
+Sparsemax activation function [1].
+
+### Aliases:
+
+* `tfa.activations.sparsemax`
+* `tfa.layers.sparsemax.sparsemax`
+
+``` python
+tfa.activations.sparsemax(
+ logits,
+ axis=-1,
+ name=None
+)
+```
+
+
+
+Defined in [`activations/sparsemax.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/activations/sparsemax.py).
+
+
+
+For each batch `i` and class `j` we have
+ $$\mathrm{sparsemax}[i, j] = \max(\mathrm{logits}[i, j] - \tau(\mathrm{logits}[i, :]), 0)$$
+
+[1]: https://arxiv.org/abs/1602.02068
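+
+#### Example:
+
+A minimal illustrative sketch (the values are chosen to show that
+sparsemax, unlike softmax, can produce exact zeros):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+logits = tf.constant([[1.0, 2.0, 3.0]])
+probs = tfa.activations.sparsemax(logits)
+# probs ==> [[0.0, 0.0, 1.0]]
+```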
+
+#### Args:
+
+
+* `logits`: Input tensor.
+* `axis`: Integer, axis along which the sparsemax operation is applied.
+* `name`: A name for the operation (optional).
+
+#### Returns:
+
+Tensor, output of sparsemax transformation. Has the same type and
+shape as `logits`.
+
+
+#### Raises:
+
+
+* `ValueError`: In case `dim(logits) == 1`.
\ No newline at end of file
diff --git a/docs/api_docs/python/tfa/image.md b/docs/api_docs/python/tfa/image.md
new file mode 100644
index 0000000000..5ac3048ce8
--- /dev/null
+++ b/docs/api_docs/python/tfa/image.md
@@ -0,0 +1,46 @@
+
+
+
+
+
+# Module: tfa.image
+
+Image manipulation ops.
+
+
+
+Defined in [`image/__init__.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/image/__init__.py).
+
+
+
+
+## Modules
+
+[`distance_transform`](../tfa/image/distance_transform.md) module: Distance transform ops.
+
+[`distort_image_ops`](../tfa/image/distort_image_ops.md) module: Python layer for distort_image_ops.
+
+[`filters`](../tfa/image/filters.md) module
+
+[`transform_ops`](../tfa/image/transform_ops.md) module: Image transform ops.
+
+## Functions
+
+[`adjust_hsv_in_yiq(...)`](../tfa/image/adjust_hsv_in_yiq.md): Adjust hue, saturation, value of an RGB image in YIQ color space.
+
+[`dense_image_warp(...)`](../tfa/image/dense_image_warp.md): Image warping using per-pixel flow vectors.
+
+[`euclidean_dist_transform(...)`](../tfa/image/euclidean_dist_transform.md): Applies Euclidean distance transform(s) to the image(s).
+
+[`interpolate_bilinear(...)`](../tfa/image/interpolate_bilinear.md): Similar to Matlab's interp2 function.
+
+[`mean_filter2d(...)`](../tfa/image/mean_filter2d.md): Perform mean filtering on image(s).
+
+[`median_filter2d(...)`](../tfa/image/median_filter2d.md): Perform median filtering on image(s).
+
+[`random_hsv_in_yiq(...)`](../tfa/image/random_hsv_in_yiq.md): Adjust hue, saturation, value of an RGB image randomly in YIQ color space.
+
+[`rotate(...)`](../tfa/image/rotate.md): Rotate image(s) counterclockwise by the passed angle(s) in radians.
+
+[`transform(...)`](../tfa/image/transform.md): Applies the given transform(s) to the image(s).
+
diff --git a/docs/api_docs/python/tfa/image/adjust_hsv_in_yiq.md b/docs/api_docs/python/tfa/image/adjust_hsv_in_yiq.md
new file mode 100644
index 0000000000..fa5ff5eb71
--- /dev/null
+++ b/docs/api_docs/python/tfa/image/adjust_hsv_in_yiq.md
@@ -0,0 +1,55 @@
+
+
+
+
+
+# tfa.image.adjust_hsv_in_yiq
+
+Adjust hue, saturation, value of an RGB image in YIQ color space.
+
+### Aliases:
+
+* `tfa.image.adjust_hsv_in_yiq`
+* `tfa.image.distort_image_ops.adjust_hsv_in_yiq`
+
+``` python
+tfa.image.adjust_hsv_in_yiq(
+ image,
+ delta_hue=0,
+ scale_saturation=1,
+ scale_value=1,
+ name=None
+)
+```
+
+
+
+Defined in [`image/distort_image_ops.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/image/distort_image_ops.py).
+
+
+
+This is a convenience method that converts an RGB image to float
+representation, converts it to YIQ, rotates the color around the
+Y channel by delta_hue in radians, scales the chrominance channels
+(I, Q) by scale_saturation, scales all channels (Y, I, Q) by scale_value,
+converts back to RGB, and then back to the original data type.
+
+`image` is an RGB image. The image hue is adjusted by converting the
+image to YIQ, rotating around the luminance channel (Y) by
+`delta_hue` in radians, multiplying the chrominance channels (I, Q) by
+`scale_saturation`, and multiplying all channels (Y, I, Q) by
+`scale_value`. The image is then converted back to RGB.
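+
+#### Example:
+
+A minimal usage sketch (the adjustment amounts are illustrative):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+image = tf.random.uniform([64, 64, 3])  # RGB image with values in [0, 1]
+adjusted = tfa.image.adjust_hsv_in_yiq(
+    image, delta_hue=0.2, scale_saturation=0.5, scale_value=0.8)
+```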
+
+#### Args:
+
+
+* `image`: RGB image or images. Size of the last dimension must be 3.
+* `delta_hue`: float, the hue rotation amount, in radians.
+* `scale_saturation`: float, factor to multiply the saturation by.
+* `scale_value`: float, factor to multiply the value by.
+* `name`: A name for this operation (optional).
+
+
+#### Returns:
+
+Adjusted image(s), same shape and dtype as `image`.
diff --git a/docs/api_docs/python/tfa/image/dense_image_warp.md b/docs/api_docs/python/tfa/image/dense_image_warp.md
new file mode 100644
index 0000000000..0650d89ee6
--- /dev/null
+++ b/docs/api_docs/python/tfa/image/dense_image_warp.md
@@ -0,0 +1,58 @@
+
+
+
+
+
+# tfa.image.dense_image_warp
+
+Image warping using per-pixel flow vectors.
+
+``` python
+tfa.image.dense_image_warp(
+ image,
+ flow,
+ name=None
+)
+```
+
+
+
+Defined in [`image/dense_image_warp.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/image/dense_image_warp.py).
+
+
+
+Apply a non-linear warp to the image, where the warp is specified by a
+dense flow field of offset vectors that define the correspondences of
+pixel values in the output image back to locations in the source image.
+Specifically, the pixel value at output[b, j, i, c] is
+images[b, j - flow[b, j, i, 0], i - flow[b, j, i, 1], c].
+
+The locations specified by this formula do not necessarily map to an int
+index. Therefore, the pixel value is obtained by bilinear
+interpolation of the 4 nearest pixels around
+(b, j - flow[b, j, i, 0], i - flow[b, j, i, 1]). For locations outside
+of the image, we use the nearest pixel values at the image boundary.
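+
+#### Example:
+
+A minimal sketch: a zero flow field leaves the image unchanged:
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+image = tf.random.uniform([1, 4, 4, 3])
+flow = tf.zeros([1, 4, 4, 2])  # zero offsets: output equals input
+warped = tfa.image.dense_image_warp(image, flow)
+```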
+
+#### Args:
+
+
+* `image`: 4-D float `Tensor` with shape `[batch, height, width, channels]`.
+* `flow`: A 4-D float `Tensor` with shape `[batch, height, width, 2]`.
+* `name`: A name for the operation (optional).
+
+Note that image and flow can be of type tf.half, tf.float32, or
+tf.float64, and do not necessarily have to be the same type.
+
+
+#### Returns:
+
+A 4-D float `Tensor` with shape `[batch, height, width, channels]`
+  and same type as input image.
+
+
+
+#### Raises:
+
+
+* `ValueError`: if height < 2 or width < 2 or the inputs have the wrong
+ number of dimensions.
\ No newline at end of file
diff --git a/docs/api_docs/python/tfa/image/distance_transform.md b/docs/api_docs/python/tfa/image/distance_transform.md
new file mode 100644
index 0000000000..3fe00d0dfc
--- /dev/null
+++ b/docs/api_docs/python/tfa/image/distance_transform.md
@@ -0,0 +1,20 @@
+
+
+
+
+
+# Module: tfa.image.distance_transform
+
+Distance transform ops.
+
+
+
+Defined in [`image/distance_transform.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/image/distance_transform.py).
+
+
+
+
+## Functions
+
+[`euclidean_dist_transform(...)`](../../tfa/image/euclidean_dist_transform.md): Applies Euclidean distance transform(s) to the image(s).
+
diff --git a/docs/api_docs/python/tfa/image/distort_image_ops.md b/docs/api_docs/python/tfa/image/distort_image_ops.md
new file mode 100644
index 0000000000..526cd491d4
--- /dev/null
+++ b/docs/api_docs/python/tfa/image/distort_image_ops.md
@@ -0,0 +1,22 @@
+
+
+
+
+
+# Module: tfa.image.distort_image_ops
+
+Python layer for distort_image_ops.
+
+
+
+Defined in [`image/distort_image_ops.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/image/distort_image_ops.py).
+
+
+
+
+## Functions
+
+[`adjust_hsv_in_yiq(...)`](../../tfa/image/adjust_hsv_in_yiq.md): Adjust hue, saturation, value of an RGB image in YIQ color space.
+
+[`random_hsv_in_yiq(...)`](../../tfa/image/random_hsv_in_yiq.md): Adjust hue, saturation, value of an RGB image randomly in YIQ color
+
diff --git a/docs/api_docs/python/tfa/image/euclidean_dist_transform.md b/docs/api_docs/python/tfa/image/euclidean_dist_transform.md
new file mode 100644
index 0000000000..f862c54f9a
--- /dev/null
+++ b/docs/api_docs/python/tfa/image/euclidean_dist_transform.md
@@ -0,0 +1,53 @@
+
+
+
+
+
+# tfa.image.euclidean_dist_transform
+
+Applies Euclidean distance transform(s) to the image(s).
+
+### Aliases:
+
+* `tfa.image.distance_transform.euclidean_dist_transform`
+* `tfa.image.euclidean_dist_transform`
+
+``` python
+tfa.image.euclidean_dist_transform(
+ images,
+ dtype=tf.float32,
+ name=None
+)
+```
+
+
+
+Defined in [`image/distance_transform.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/image/distance_transform.py).
+
+
+
+
+#### Args:
+
+
+* `images`: A tensor of shape (num_images, num_rows, num_columns, 1) (NHWC),
+  or (num_rows, num_columns, 1) (HWC). The rank must be statically known
+  (the shape is not `TensorShape(None)`).
+* `dtype`: DType of the output tensor.
+* `name`: The name of the op.
+
+
+#### Returns:
+
+Image(s) with the type `dtype` and same shape as `images`, with the
+transform applied. If a tensor of all ones is given as input, the
+output tensor will be filled with the max value of the `dtype`.
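+
+#### Example:
+
+A minimal sketch of the all-ones case described above:
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+# With no zero-valued (background) pixels, every output pixel is filled
+# with the maximum value of the output dtype.
+images = tf.ones([1, 5, 5, 1], dtype=tf.uint8)
+dist = tfa.image.euclidean_dist_transform(images, dtype=tf.float32)
+```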
+
+
+
+#### Raises:
+
+
+* `TypeError`: If `images` is not tf.uint8, or `dtype` is not floating point.
+* `ValueError`: If `images` has more than one channel, or `images` is not of
+  rank 3 or 4.
\ No newline at end of file
diff --git a/docs/api_docs/python/tfa/image/filters.md b/docs/api_docs/python/tfa/image/filters.md
new file mode 100644
index 0000000000..ee8f4931cf
--- /dev/null
+++ b/docs/api_docs/python/tfa/image/filters.md
@@ -0,0 +1,22 @@
+
+
+
+
+
+# Module: tfa.image.filters
+
+
+
+
+
+Defined in [`image/filters.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/image/filters.py).
+
+
+
+
+## Functions
+
+[`mean_filter2d(...)`](../../tfa/image/mean_filter2d.md): Perform mean filtering on image(s).
+
+[`median_filter2d(...)`](../../tfa/image/median_filter2d.md): Perform median filtering on image(s).
+
diff --git a/docs/api_docs/python/tfa/image/interpolate_bilinear.md b/docs/api_docs/python/tfa/image/interpolate_bilinear.md
new file mode 100644
index 0000000000..e2abbf5875
--- /dev/null
+++ b/docs/api_docs/python/tfa/image/interpolate_bilinear.md
@@ -0,0 +1,48 @@
+
+
+
+
+
+# tfa.image.interpolate_bilinear
+
+Similar to Matlab's interp2 function.
+
+``` python
+tfa.image.interpolate_bilinear(
+ grid,
+ query_points,
+ indexing='ij',
+ name=None
+)
+```
+
+
+
+Defined in [`image/dense_image_warp.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/image/dense_image_warp.py).
+
+
+
+Finds values for query points on a grid using bilinear interpolation.
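+
+#### Example:
+
+A minimal sketch querying the center of a 2x2 grid:
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+grid = tf.reshape(tf.range(4, dtype=tf.float32), [1, 2, 2, 1])
+# With "ij" indexing, (0.5, 0.5) is the center of the 2x2 grid.
+query_points = tf.constant([[[0.5, 0.5]]])
+values = tfa.image.interpolate_bilinear(grid, query_points)
+# values ==> [[[1.5]]], the average of the four corner values 0, 1, 2, 3
+```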
+
+#### Args:
+
+
+* `grid`: a 4-D float `Tensor` of shape `[batch, height, width, channels]`.
+* `query_points`: a 3-D float `Tensor` of N points with shape
+ `[batch, N, 2]`.
+* `indexing`: whether the query points are specified as row and column (ij),
+ or Cartesian coordinates (xy).
+* `name`: a name for the operation (optional).
+
+
+#### Returns:
+
+
+* `values`: a 3-D `Tensor` with shape `[batch, N, channels]`
+
+
+#### Raises:
+
+
+* `ValueError`: if the indexing mode is invalid, or if the shape of the
+  inputs is invalid.
\ No newline at end of file
diff --git a/docs/api_docs/python/tfa/image/mean_filter2d.md b/docs/api_docs/python/tfa/image/mean_filter2d.md
new file mode 100644
index 0000000000..dd6e3c8e95
--- /dev/null
+++ b/docs/api_docs/python/tfa/image/mean_filter2d.md
@@ -0,0 +1,58 @@
+
+
+
+
+
+# tfa.image.mean_filter2d
+
+Perform mean filtering on image(s).
+
+### Aliases:
+
+* `tfa.image.filters.mean_filter2d`
+* `tfa.image.mean_filter2d`
+
+``` python
+tfa.image.mean_filter2d(
+ image,
+ filter_shape=(3, 3),
+ padding='REFLECT',
+ constant_values=0,
+ name=None
+)
+```
+
+
+
+Defined in [`image/filters.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/image/filters.py).
+
+
+
+
+#### Args:
+
+
+* `image`: Either a 3-D `Tensor` of shape `[height, width, channels]`,
+ or a 4-D `Tensor` of shape `[batch_size, height, width, channels]`.
+* `filter_shape`: An `integer` or `tuple`/`list` of 2 integers, specifying
+ the height and width of the 2-D mean filter. Can be a single integer
+ to specify the same value for all spatial dimensions.
+* `padding`: A `string`, one of "REFLECT", "CONSTANT", or "SYMMETRIC".
+ The type of padding algorithm to use, which is compatible with
+ `mode` argument in `tf.pad`. For more details, please refer to
+ https://www.tensorflow.org/api_docs/python/tf/pad.
+* `constant_values`: A `scalar`, the pad value to use in "CONSTANT"
+ padding mode.
+* `name`: A name for this operation (optional).
+
+#### Returns:
+
+3-D or 4-D `Tensor` of the same dtype as input.
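+
+#### Example:
+
+A minimal usage sketch (shapes are illustrative):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+image = tf.random.uniform([1, 28, 28, 3])
+smoothed = tfa.image.mean_filter2d(image, filter_shape=(3, 3),
+                                   padding='REFLECT')
+```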
+
+
+#### Raises:
+
+
+* `ValueError`: If `image` is not 3 or 4-dimensional,
+ if `padding` is other than "REFLECT", "CONSTANT" or "SYMMETRIC",
+ or if `filter_shape` is invalid.
\ No newline at end of file
diff --git a/docs/api_docs/python/tfa/image/median_filter2d.md b/docs/api_docs/python/tfa/image/median_filter2d.md
new file mode 100644
index 0000000000..5d250f4245
--- /dev/null
+++ b/docs/api_docs/python/tfa/image/median_filter2d.md
@@ -0,0 +1,45 @@
+
+
+
+
+
+# tfa.image.median_filter2d
+
+Perform median filtering on image(s).
+
+### Aliases:
+
+* `tfa.image.filters.median_filter2d`
+* `tfa.image.median_filter2d`
+
+``` python
+tfa.image.median_filter2d(
+ image,
+ filter_shape=(3, 3),
+ name=None
+)
+```
+
+
+
+Defined in [`image/filters.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/image/filters.py).
+
+
+The filter shape can be user-specified. This method accepts both images
+whose pixel values lie in [0, 255] and images whose pixel values lie in
+[0.0, 1.0].
+
+#### Args:
+
+
+* `image`: A 3-D `Tensor` of type `float32`, `int32`, `float64` or
+  `int64`, and of shape `[rows, columns, channels]`.
+* `filter_shape`: Optional. A tuple of 2 integers (R, C), where R is the
+  number of rows and C is the number of columns of the filter, giving a
+  filter of shape (R, C). Default value: (3, 3).
+* `name`: The name of the op.
+
+
+#### Returns:
+
+A 3-D median-filtered image tensor of shape `[rows, columns, channels]`
+and type `int32`. Pixel values of the returned tensor range from 0 to 255.
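+
+#### Example:
+
+A minimal usage sketch (values are illustrative):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+image = tf.random.uniform([28, 28, 3], maxval=255, dtype=tf.int32)
+denoised = tfa.image.median_filter2d(image, filter_shape=(3, 3))
+```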
\ No newline at end of file
diff --git a/docs/api_docs/python/tfa/image/random_hsv_in_yiq.md b/docs/api_docs/python/tfa/image/random_hsv_in_yiq.md
new file mode 100644
index 0000000000..808ca51d6e
--- /dev/null
+++ b/docs/api_docs/python/tfa/image/random_hsv_in_yiq.md
@@ -0,0 +1,68 @@
+
+
+
+
+
+# tfa.image.random_hsv_in_yiq
+
+Adjust hue, saturation, value of an RGB image randomly in YIQ color space.
+
+### Aliases:
+
+* `tfa.image.distort_image_ops.random_hsv_in_yiq`
+* `tfa.image.random_hsv_in_yiq`
+
+``` python
+tfa.image.random_hsv_in_yiq(
+ image,
+ max_delta_hue=0,
+ lower_saturation=1,
+ upper_saturation=1,
+ lower_value=1,
+ upper_value=1,
+ seed=None,
+ name=None
+)
+```
+
+
+
+Defined in [`image/distort_image_ops.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/image/distort_image_ops.py).
+
+
+
+Equivalent to `adjust_hsv_in_yiq()` but uses a `delta_hue` randomly
+picked in the interval `[-max_delta_hue, max_delta_hue]`, a
+`scale_saturation` randomly picked in the interval
+`[lower_saturation, upper_saturation]`, and a `scale_value`
+randomly picked in the interval `[lower_value, upper_value]`.
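+
+#### Example:
+
+A minimal usage sketch (the bounds are illustrative):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+image = tf.random.uniform([32, 32, 3])
+distorted = tfa.image.random_hsv_in_yiq(
+    image, max_delta_hue=0.5,
+    lower_saturation=0.5, upper_saturation=1.5,
+    lower_value=0.8, upper_value=1.2, seed=42)
+```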
+
+#### Args:
+
+
+* `image`: RGB image or images. Size of the last dimension must be 3.
+* `max_delta_hue`: float. Maximum value for the random delta_hue. Passing 0
+ disables adjusting hue.
+* `lower_saturation`: float. Lower bound for the random scale_saturation.
+* `upper_saturation`: float. Upper bound for the random scale_saturation.
+* `lower_value`: float. Lower bound for the random scale_value.
+* `upper_value`: float. Upper bound for the random scale_value.
+* `seed`: An operation-specific seed. It will be used in conjunction
+ with the graph-level seed to determine the real seeds that will be
+ used in this operation. Please see the documentation of
+ set_random_seed for its interaction with the graph-level random seed.
+* `name`: A name for this operation (optional).
+
+
+#### Returns:
+
+3-D float tensor of shape `[height, width, channels]`.
+
+
+
+#### Raises:
+
+
+* `ValueError`: if `max_delta_hue`, `lower_saturation`, `upper_saturation`,
+  `lower_value`, or `upper_value` is invalid.
\ No newline at end of file
diff --git a/docs/api_docs/python/tfa/image/rotate.md b/docs/api_docs/python/tfa/image/rotate.md
new file mode 100644
index 0000000000..63da52f409
--- /dev/null
+++ b/docs/api_docs/python/tfa/image/rotate.md
@@ -0,0 +1,57 @@
+
+
+
+
+
+# tfa.image.rotate
+
+Rotate image(s) counterclockwise by the passed angle(s) in radians.
+
+### Aliases:
+
+* `tfa.image.rotate`
+* `tfa.image.transform_ops.rotate`
+
+``` python
+tfa.image.rotate(
+ images,
+ angles,
+ interpolation='NEAREST',
+ name=None
+)
+```
+
+
+
+Defined in [`image/transform_ops.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/image/transform_ops.py).
+
+
+
+
+#### Args:
+
+
+* `images`: A tensor of shape
+ (num_images, num_rows, num_columns, num_channels)
+ (NHWC), (num_rows, num_columns, num_channels) (HWC), or
+ (num_rows, num_columns) (HW). The rank must be statically known (the
+  shape is not `TensorShape(None)`).
+* `angles`: A scalar angle to rotate all images by, or (if images has rank 4)
+ a vector of length num_images, with an angle for each image in the
+ batch.
+* `interpolation`: Interpolation mode. Supported values: "NEAREST",
+ "BILINEAR".
+* `name`: The name of the op.
+
+
+#### Returns:
+
+Image(s) with the same type and shape as `images`, rotated by the given
+angle(s). Empty space due to the rotation will be filled with zeros.
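+
+#### Example:
+
+A minimal usage sketch (the angle is illustrative):
+
+``` python
+import math
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+image = tf.random.uniform([1, 32, 32, 3])
+rotated = tfa.image.rotate(image, angles=math.pi / 4)  # 45 degrees CCW
+```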
+
+
+
+#### Raises:
+
+
+* `TypeError`: If `images` is an invalid type.
\ No newline at end of file
diff --git a/docs/api_docs/python/tfa/image/transform.md b/docs/api_docs/python/tfa/image/transform.md
new file mode 100644
index 0000000000..63bc05ac0e
--- /dev/null
+++ b/docs/api_docs/python/tfa/image/transform.md
@@ -0,0 +1,67 @@
+
+
+
+
+
+# tfa.image.transform
+
+Applies the given transform(s) to the image(s).
+
+### Aliases:
+
+* `tfa.image.transform`
+* `tfa.image.transform_ops.transform`
+
+``` python
+tfa.image.transform(
+ images,
+ transforms,
+ interpolation='NEAREST',
+ output_shape=None,
+ name=None
+)
+```
+
+
+
+Defined in [`image/transform_ops.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/image/transform_ops.py).
+
+
+
+
+#### Args:
+
+
+* `images`: A tensor of shape (num_images, num_rows, num_columns,
+ num_channels) (NHWC), (num_rows, num_columns, num_channels) (HWC), or
+ (num_rows, num_columns) (HW). The rank must be statically known (the
+  shape is not `TensorShape(None)`).
+* `transforms`: Projective transform matrix/matrices. A vector of length 8 or
+ tensor of size N x 8. If one row of transforms is
+ [a0, a1, a2, b0, b1, b2, c0, c1], then it maps the *output* point
+ `(x, y)` to a transformed *input* point
+ `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`,
+ where `k = c0 x + c1 y + 1`. The transforms are *inverted* compared to
+ the transform mapping input points to output points. Note that
+ gradients are not backpropagated into transformation parameters.
+* `interpolation`: Interpolation mode.
+ Supported values: "NEAREST", "BILINEAR".
+* `output_shape`: Output dimension after the transform, [height, width].
+  If None, output is the same size as input image.
+* `name`: The name of the op.
+
+
+#### Returns:
+
+Image(s) with the same type and shape as `images`, with the given
+transform(s) applied. Transformed coordinates outside of the input image
+will be filled with zeros.
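+
+#### Example:
+
+A minimal sketch applying the identity transform:
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+image = tf.random.uniform([1, 8, 8, 3])
+# [a0, a1, a2, b0, b1, b2, c0, c1] = [1, 0, 0, 0, 1, 0, 0, 0] maps every
+# output point to the same input point, so the image is unchanged.
+same = tfa.image.transform(image, [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0])
+```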
+
+
+
+#### Raises:
+
+
+* `TypeError`: If `images` is an invalid type.
+* `ValueError`: If output shape is not a 1-D int32 Tensor.
\ No newline at end of file
diff --git a/docs/api_docs/python/tfa/image/transform_ops.md b/docs/api_docs/python/tfa/image/transform_ops.md
new file mode 100644
index 0000000000..c163e8799e
--- /dev/null
+++ b/docs/api_docs/python/tfa/image/transform_ops.md
@@ -0,0 +1,30 @@
+
+
+
+
+
+# Module: tfa.image.transform_ops
+
+Image transform ops.
+
+
+
+Defined in [`image/transform_ops.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/image/transform_ops.py).
+
+
+
+
+## Functions
+
+[`angles_to_projective_transforms(...)`](../../tfa/image/transform_ops/angles_to_projective_transforms.md): Returns projective transform(s) for the given angle(s).
+
+[`compose_transforms(...)`](../../tfa/image/transform_ops/compose_transforms.md): Composes the transforms tensors.
+
+[`flat_transforms_to_matrices(...)`](../../tfa/image/transform_ops/flat_transforms_to_matrices.md): Converts projective transforms to affine matrices.
+
+[`matrices_to_flat_transforms(...)`](../../tfa/image/transform_ops/matrices_to_flat_transforms.md): Converts affine matrices to projective transforms.
+
+[`rotate(...)`](../../tfa/image/rotate.md): Rotate image(s) counterclockwise by the passed angle(s) in radians.
+
+[`transform(...)`](../../tfa/image/transform.md): Applies the given transform(s) to the image(s).
+
diff --git a/docs/api_docs/python/tfa/image/transform_ops/angles_to_projective_transforms.md b/docs/api_docs/python/tfa/image/transform_ops/angles_to_projective_transforms.md
new file mode 100644
index 0000000000..49421a051c
--- /dev/null
+++ b/docs/api_docs/python/tfa/image/transform_ops/angles_to_projective_transforms.md
@@ -0,0 +1,39 @@
+
+
+
+
+
+# tfa.image.transform_ops.angles_to_projective_transforms
+
+Returns projective transform(s) for the given angle(s).
+
+``` python
+tfa.image.transform_ops.angles_to_projective_transforms(
+ angles,
+ image_height,
+ image_width,
+ name=None
+)
+```
+
+
+
+Defined in [`image/transform_ops.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/image/transform_ops.py).
+
+
+
+
+#### Args:
+
+
+* `angles`: A scalar angle to rotate all images by, or (for batches of
+ images) a vector with an angle to rotate each image in the batch. The
+  rank must be statically known (the shape is not `TensorShape(None)`).
+* `image_height`: Height of the image(s) to be transformed.
+* `image_width`: Width of the image(s) to be transformed.
+* `name`: The name of the op (optional).
+
+
+#### Returns:
+
+A tensor of shape (num_images, 8). Projective transforms which can be
+given to `transform` op.
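+
+#### Example:
+
+A minimal sketch feeding the result into `transform`:
+
+``` python
+import math
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+image = tf.random.uniform([1, 16, 16, 3])
+transforms = tfa.image.transform_ops.angles_to_projective_transforms(
+    math.pi / 2, image_height=16, image_width=16)
+rotated = tfa.image.transform(image, transforms)
+```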
diff --git a/docs/api_docs/python/tfa/image/transform_ops/compose_transforms.md b/docs/api_docs/python/tfa/image/transform_ops/compose_transforms.md
new file mode 100644
index 0000000000..c574234aae
--- /dev/null
+++ b/docs/api_docs/python/tfa/image/transform_ops/compose_transforms.md
@@ -0,0 +1,38 @@
+
+
+
+
+
+# tfa.image.transform_ops.compose_transforms
+
+Composes the transforms tensors.
+
+``` python
+tfa.image.transform_ops.compose_transforms(
+ transforms,
+ name=None
+)
+```
+
+
+
+Defined in [`image/transform_ops.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/image/transform_ops.py).
+
+
+
+
+#### Args:
+
+
+* `transforms`: List of image projective transforms to be composed. Each
+ transform is length 8 (single transform) or shape (N, 8) (batched
+ transforms). The shapes of all inputs must be equal, and at least one
+ input must be given.
+* `name`: The name for the op.
+
+
+#### Returns:
+
+A composed transform tensor. When passed to `transform` op,
+ equivalent to applying each of the given transforms to the image in
+ order.
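+
+#### Example:
+
+A minimal sketch composing two flat transforms (values are illustrative):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+identity = tf.constant([[1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]])
+shift = tf.constant([[1.0, 0.0, -2.0, 0.0, 1.0, 0.0, 0.0, 0.0]])
+# Equivalent to applying `identity`, then `shift`, to an image.
+combined = tfa.image.transform_ops.compose_transforms([identity, shift])
+```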
diff --git a/docs/api_docs/python/tfa/image/transform_ops/flat_transforms_to_matrices.md b/docs/api_docs/python/tfa/image/transform_ops/flat_transforms_to_matrices.md
new file mode 100644
index 0000000000..e16562a78f
--- /dev/null
+++ b/docs/api_docs/python/tfa/image/transform_ops/flat_transforms_to_matrices.md
@@ -0,0 +1,45 @@
+
+
+
+
+
+# tfa.image.transform_ops.flat_transforms_to_matrices
+
+Converts projective transforms to affine matrices.
+
+``` python
+tfa.image.transform_ops.flat_transforms_to_matrices(
+ transforms,
+ name=None
+)
+```
+
+
+
+Defined in [`image/transform_ops.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/image/transform_ops.py).
+
+
+
+Note that the output matrices map output coordinates to input coordinates.
+For the forward transformation matrix, call `tf.linalg.inv` on the result.
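+
+#### Example:
+
+A minimal sketch: the flat identity transform maps to the 3x3 identity
+matrix:
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+flat = tf.constant([[1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]])
+matrices = tfa.image.transform_ops.flat_transforms_to_matrices(flat)
+# matrices ==> [[[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]]]
+```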
+
+#### Args:
+
+
+* `transforms`: Vector of length 8, or batches of transforms with shape
+ `(N, 8)`.
+* `name`: The name for the op.
+
+
+#### Returns:
+
+3D tensor of matrices with shape `(N, 3, 3)`. The output matrices map the
+ *output coordinates* (in homogeneous coordinates) of each transform to
+ the corresponding *input coordinates*.
+
+
+
+#### Raises:
+
+
+* `ValueError`: If `transforms` have an invalid shape.
\ No newline at end of file
diff --git a/docs/api_docs/python/tfa/image/transform_ops/matrices_to_flat_transforms.md b/docs/api_docs/python/tfa/image/transform_ops/matrices_to_flat_transforms.md
new file mode 100644
index 0000000000..f7c79c8e2f
--- /dev/null
+++ b/docs/api_docs/python/tfa/image/transform_ops/matrices_to_flat_transforms.md
@@ -0,0 +1,46 @@
+
+
+
+
+
+# tfa.image.transform_ops.matrices_to_flat_transforms
+
+Converts affine matrices to projective transforms.
+
+``` python
+tfa.image.transform_ops.matrices_to_flat_transforms(
+ transform_matrices,
+ name=None
+)
+```
+
+
+
+Defined in [`image/transform_ops.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/image/transform_ops.py).
+
+
+
+Note that we expect matrices that map output coordinates to input
+coordinates. To convert forward transformation matrices,
+call `tf.linalg.inv` on the matrices and use the result here.
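+
+#### Example:
+
+A minimal sketch: the 3x3 identity matrix maps to the flat identity
+transform:
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+matrix = tf.eye(3)  # identity mapping in homogeneous coordinates
+flat = tfa.image.transform_ops.matrices_to_flat_transforms(matrix)
+# flat ==> [[1., 0., 0., 0., 1., 0., 0., 0.]]
+```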
+
+#### Args:
+
+
+* `transform_matrices`: One or more affine transformation matrices, for the
+ reverse transformation in homogeneous coordinates. Shape `(3, 3)` or
+ `(N, 3, 3)`.
+* `name`: The name for the op.
+
+
+#### Returns:
+
+2D tensor of flat transforms with shape `(N, 8)`, which may be passed
+into `transform` op.
+
+
+
+#### Raises:
+
+
+* `ValueError`: If `transform_matrices` have an invalid shape.
\ No newline at end of file
diff --git a/docs/api_docs/python/tfa/layers.md b/docs/api_docs/python/tfa/layers.md
new file mode 100644
index 0000000000..10f9af0f22
--- /dev/null
+++ b/docs/api_docs/python/tfa/layers.md
@@ -0,0 +1,42 @@
+
+
+
+
+
+# Module: tfa.layers
+
+Additional layers that conform to Keras API.
+
+
+
+Defined in [`layers/__init__.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/layers/__init__.py).
+
+
+
+
+## Modules
+
+[`maxout`](../tfa/layers/maxout.md) module: Implementing Maxout layer.
+
+[`normalizations`](../tfa/layers/normalizations.md) module
+
+[`poincare`](../tfa/layers/poincare.md) module: Implementing PoincareNormalize layer.
+
+[`sparsemax`](../tfa/layers/sparsemax.md) module
+
+[`wrappers`](../tfa/layers/wrappers.md) module
+
+## Classes
+
+[`class GroupNormalization`](../tfa/layers/GroupNormalization.md): Group normalization layer.
+
+[`class InstanceNormalization`](../tfa/layers/InstanceNormalization.md): Instance normalization layer.
+
+[`class Maxout`](../tfa/layers/Maxout.md): Applies Maxout to the input.
+
+[`class PoincareNormalize`](../tfa/layers/PoincareNormalize.md): Project into the Poincare ball with norm <= 1.0 - epsilon.
+
+[`class Sparsemax`](../tfa/layers/Sparsemax.md): Sparsemax activation function [1].
+
+[`class WeightNormalization`](../tfa/layers/WeightNormalization.md): This wrapper reparameterizes a layer by decoupling the weight's magnitude and direction.
+
diff --git a/docs/api_docs/python/tfa/layers/GroupNormalization.md b/docs/api_docs/python/tfa/layers/GroupNormalization.md
new file mode 100644
index 0000000000..0eb32cf473
--- /dev/null
+++ b/docs/api_docs/python/tfa/layers/GroupNormalization.md
@@ -0,0 +1,852 @@
+
+
+
+
+# tfa.layers.GroupNormalization
+
+## Class `GroupNormalization`
+
+Group normalization layer.
+
+
+
+### Aliases:
+
+* Class `tfa.layers.GroupNormalization`
+* Class `tfa.layers.normalizations.GroupNormalization`
+
+
+
+Defined in [`layers/normalizations.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/layers/normalizations.py).
+
+
+
+Group Normalization divides the channels into groups and computes
+within each group the mean and variance for normalization.
+Empirically, its accuracy is more stable than batch norm in a wide
+range of small batch sizes, if learning rate is adjusted linearly
+with batch sizes.
+
+Relation to Layer Normalization:
+If the number of groups is set to 1, then this operation becomes identical
+to Layer Normalization.
+
+Relation to Instance Normalization:
+If the number of groups is set to the
+input dimension (number of groups is equal
+to number of channels), then this operation becomes
+identical to Instance Normalization.
+
+#### Arguments:
+
+
+* `groups`: Integer, the number of groups for Group Normalization. Can
+  be in the range [1, N] where N is the input dimension. The input
+  dimension must be divisible by the number of groups.
+* `axis`: Integer, the axis that should be normalized.
+* `epsilon`: Small float added to variance to avoid dividing by zero.
+* `center`: If True, add offset of `beta` to normalized tensor.
+  If False, `beta` is ignored.
+* `scale`: If True, multiply by `gamma`. If False, `gamma` is not used.
+* `beta_initializer`: Initializer for the beta weight.
+* `gamma_initializer`: Initializer for the gamma weight.
+* `beta_regularizer`: Optional regularizer for the beta weight.
+* `gamma_regularizer`: Optional regularizer for the gamma weight.
+* `beta_constraint`: Optional constraint for the beta weight.
+* `gamma_constraint`: Optional constraint for the gamma weight.
+
+
+#### Input shape:
+
+Arbitrary. Use the keyword argument `input_shape`
+(tuple of integers, does not include the samples axis)
+when using this layer as the first layer in a model.
+
+
+#### Output shape:
+
+Same shape as input.
+
+
+#### References:
+
+- [Group Normalization](https://arxiv.org/abs/1803.08494)
+
+__init__
+
+``` python
+__init__(
+ groups=2,
+ axis=-1,
+ epsilon=0.001,
+ center=True,
+ scale=True,
+ beta_initializer='zeros',
+ gamma_initializer='ones',
+ beta_regularizer=None,
+ gamma_regularizer=None,
+ beta_constraint=None,
+ gamma_constraint=None,
+ **kwargs
+)
+```
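+
+A minimal usage sketch (layer sizes are illustrative; the channel count
+must be divisible by `groups`):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+model = tf.keras.Sequential([
+    tf.keras.layers.Conv2D(16, 3, input_shape=(28, 28, 1)),
+    tfa.layers.GroupNormalization(groups=4, axis=-1),  # 4 groups of 4 channels
+])
+```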
+
+
+
+
+
+
+## Properties
+
+activity_regularizer
+
+Optional regularizer function for the output of this layer.
+
+
+dtype
+
+
+
+
+dynamic
+
+
+
+
+
+input
+
+Retrieves the input tensor(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input tensor or list of input tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+* `AttributeError`: If no inbound nodes are found.
+
+
+input_mask
+
+Retrieves the input mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input mask tensor (potentially None) or list of input
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+
+input_shape
+
+Retrieves the input shape(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer, or if all inputs
+have the same shape.
+
+#### Returns:
+
+Input shape, as an integer shape tuple
+(or list of shape tuples, one tuple per input tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined input_shape.
+* `RuntimeError`: if called in Eager mode.
+
+
+
+
+
+
+losses
+
+Losses which are associated with this `Layer`.
+
+Variable regularization tensors are created when this property is accessed,
+so it is eager safe: accessing `losses` under a `tf.GradientTape` will
+propagate gradients back to the corresponding variables.
+
+#### Returns:
+
+A list of tensors.
+
+
+metrics
+
+
+
+
+name
+
+
+
+
+name_scope
+
+Returns a `tf.name_scope` instance for this class.
+
+
+non_trainable_variables
+
+
+
+
+non_trainable_weights
+
+
+
+
+output
+
+Retrieves the output tensor(s) of a layer.
+
+Only applicable if the layer has exactly one output,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output tensor or list of output tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to more than one incoming
+ layers.
+* `RuntimeError`: if called in Eager mode.
+
+output_mask
+
+Retrieves the output mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output mask tensor (potentially None) or list of output
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+output_shape
+
+Retrieves the output shape(s) of a layer.
+
+Only applicable if the layer has one output,
+or if all outputs have the same shape.
+
+#### Returns:
+
+Output shape, as an integer shape tuple
+(or list of shape tuples, one tuple per output tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined output shape.
+* `RuntimeError`: if called in Eager mode.
+
+submodules
+
+Sequence of all sub-modules.
+
+Submodules are modules which are properties of this module, or found as
+properties of modules which are properties of this module (and so on).
+
+```
+a = tf.Module()
+b = tf.Module()
+c = tf.Module()
+a.b = b
+b.c = c
+assert list(a.submodules) == [b, c]
+assert list(b.submodules) == [c]
+assert list(c.submodules) == []
+```
+
+#### Returns:
+
+A sequence of all submodules.
+
+
+trainable
+
+
+
+
+trainable_variables
+
+
+
+
+trainable_weights
+
+
+
+
+updates
+
+
+
+
+variables
+
+Returns the list of all layer variables/weights.
+
+Alias of `self.weights`.
+
+#### Returns:
+
+A list of variables.
+
+
+weights
+
+Returns the list of all layer variables/weights.
+
+
+#### Returns:
+
+A list of variables.
+
+
+
+
+## Methods
+
+__call__
+
+``` python
+__call__(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Wraps `call`, applying pre- and post-processing steps.
+
+
+#### Arguments:
+
+
+* `inputs`: input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+
+#### Note:
+
+- The following optional keyword arguments are reserved for specific uses:
+ * `training`: Boolean scalar tensor of Python boolean indicating
+ whether the `call` is meant for training or inference.
+ * `mask`: Boolean input mask.
+- If the layer's `call` method takes a `mask` argument (as some Keras
+  layers do), its default value will be set to the mask generated
+  for `inputs` by the previous layer (if `inputs` did come from
+  a layer that generated a corresponding mask, i.e. if it came from
+  a Keras layer with masking support).
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer's `call` method returns None (an invalid value).
+
+apply
+
+``` python
+apply(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Apply the layer on an input.
+
+This is an alias of `self.__call__`.
+
+#### Arguments:
+
+
+* `inputs`: Input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+build
+
+``` python
+build(input_shape)
+```
+
+
+
+
+compute_mask
+
+``` python
+compute_mask(
+ inputs,
+ mask=None
+)
+```
+
+Computes an output mask tensor.
+
+
+#### Arguments:
+
+
+* `inputs`: Tensor or list of tensors.
+* `mask`: Tensor or list of tensors.
+
+
+#### Returns:
+
+None or a tensor (or list of tensors,
+ one per output tensor of the layer).
+
+
+compute_output_shape
+
+``` python
+compute_output_shape(input_shape)
+```
+
+
+
+
+count_params
+
+``` python
+count_params()
+```
+
+Count the total number of scalars composing the weights.
+
+
+#### Returns:
+
+An integer count.
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer isn't yet built
+ (in which case its weights aren't yet defined).
+
+from_config
+
+``` python
+from_config(
+ cls,
+ config
+)
+```
+
+Creates a layer from its config.
+
+This method is the reverse of `get_config`,
+capable of instantiating the same layer from the config
+dictionary. It does not handle layer connectivity
+(handled by Network), nor weights (handled by `set_weights`).
+
+#### Arguments:
+
+
+* `config`: A Python dictionary, typically the
+ output of get_config.
+
+
+#### Returns:
+
+A layer instance.
+
+
+get_config
+
+``` python
+get_config()
+```
+
+
+
+
+
+get_input_at
+
+``` python
+get_input_at(node_index)
+```
+
+Retrieves the input tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+
+get_input_mask_at
+
+``` python
+get_input_mask_at(node_index)
+```
+
+Retrieves the input mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple inputs).
+
+
+
+get_input_shape_at
+
+``` python
+get_input_shape_at(node_index)
+```
+
+Retrieves the input shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_losses_for
+
+``` python
+get_losses_for(inputs)
+```
+
+Retrieves losses relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of loss tensors of the layer that depend on `inputs`.
+
+
+get_output_at
+
+``` python
+get_output_at(node_index)
+```
+
+Retrieves the output tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_output_mask_at
+
+``` python
+get_output_mask_at(node_index)
+```
+
+Retrieves the output mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple outputs).
+
+
+get_output_shape_at
+
+``` python
+get_output_shape_at(node_index)
+```
+
+Retrieves the output shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_updates_for
+
+``` python
+get_updates_for(inputs)
+```
+
+Retrieves updates relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of update ops of the layer that depend on `inputs`.
+
+
+get_weights
+
+``` python
+get_weights()
+```
+
+Returns the current weights of the layer.
+
+
+#### Returns:
+
+Weights values as a list of numpy arrays.
+
+
+set_weights
+
+``` python
+set_weights(weights)
+```
+
+Sets the weights of the layer, from Numpy arrays.
+
+
+#### Arguments:
+
+
+* `weights`: a list of Numpy arrays. The number
+ of arrays and their shape must match
+ number of the dimensions of the weights
+ of the layer (i.e. it should match the
+ output of `get_weights`).
+
+
+#### Raises:
+
+
+* `ValueError`: If the provided weights list does not match the
+ layer's specifications.
+
+with_name_scope
+
+``` python
+with_name_scope(
+ cls,
+ method
+)
+```
+
+Decorator to automatically enter the module name scope.
+
+```
+class MyModule(tf.Module):
+ @tf.Module.with_name_scope
+ def __call__(self, x):
+ if not hasattr(self, 'w'):
+ self.w = tf.Variable(tf.random.normal([x.shape[1], 64]))
+ return tf.matmul(x, self.w)
+```
+
+Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose
+names include the module name:
+
+```
+mod = MyModule()
+mod(tf.ones([8, 32]))
+# ==> <tf.Tensor: shape=(8, 64), dtype=float32, ...>
+mod.w
+# ==> <tf.Variable 'my_module/w:0' shape=(32, 64), dtype=float32, ...>
+```
+
+#### Args:
+
+
+* `method`: The method to wrap.
+
+
+#### Returns:
+
+The original method wrapped such that it enters the module's name scope.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/layers/InstanceNormalization.md b/docs/api_docs/python/tfa/layers/InstanceNormalization.md
new file mode 100644
index 0000000000..8046e4bee7
--- /dev/null
+++ b/docs/api_docs/python/tfa/layers/InstanceNormalization.md
@@ -0,0 +1,828 @@
+
+
+
+
+# tfa.layers.InstanceNormalization
+
+## Class `InstanceNormalization`
+
+Instance normalization layer.
+
+Inherits From: [`GroupNormalization`](../../tfa/layers/GroupNormalization.md)
+
+### Aliases:
+
+* Class `tfa.layers.InstanceNormalization`
+* Class `tfa.layers.normalizations.InstanceNormalization`
+
+
+
+Defined in [`layers/normalizations.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/layers/normalizations.py).
+
+
+
+Instance Normalization is a specific case of `GroupNormalization`, since
+it normalizes all features of one channel. The group size is equal to the
+channel size. Empirically, its accuracy is more stable than batch norm in a
+wide range of small batch sizes, if learning rate is adjusted linearly
+with batch sizes.
+
+#### Arguments:
+
+
+* `axis`: Integer, the axis that should be normalized.
+* `epsilon`: Small float added to variance to avoid dividing by zero.
+* `center`: If True, add offset of `beta` to normalized tensor.
+  If False, `beta` is ignored.
+* `scale`: If True, multiply by `gamma`. If False, `gamma` is not used.
+* `beta_initializer`: Initializer for the beta weight.
+* `gamma_initializer`: Initializer for the gamma weight.
+* `beta_regularizer`: Optional regularizer for the beta weight.
+* `gamma_regularizer`: Optional regularizer for the gamma weight.
+* `beta_constraint`: Optional constraint for the beta weight.
+* `gamma_constraint`: Optional constraint for the gamma weight.
+
+
+#### Input shape:
+
+Arbitrary. Use the keyword argument `input_shape`
+(tuple of integers, does not include the samples axis)
+when using this layer as the first layer in a model.
+
+
+#### Output shape:
+
+Same shape as input.
+
+
+#### References:
+
+- [Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022)
+
+__init__
+
+``` python
+__init__(**kwargs)
+```
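+
+A minimal usage sketch (layer sizes are illustrative):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+model = tf.keras.Sequential([
+    tf.keras.layers.Conv2D(8, 3, input_shape=(32, 32, 3)),
+    tfa.layers.InstanceNormalization(axis=-1),  # one group per channel
+])
+```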
+
+
+
+
+
+
+## Properties
+
+activity_regularizer
+
+Optional regularizer function for the output of this layer.
+
+
+dtype
+
+
+
+
+dynamic
+
+
+
+
+
+input
+
+Retrieves the input tensor(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input tensor or list of input tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+* `AttributeError`: If no inbound nodes are found.
+
+
+input_mask
+
+Retrieves the input mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input mask tensor (potentially None) or list of input
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+
+input_shape
+
+Retrieves the input shape(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer, or if all inputs
+have the same shape.
+
+#### Returns:
+
+Input shape, as an integer shape tuple
+(or list of shape tuples, one tuple per input tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined input_shape.
+* `RuntimeError`: if called in Eager mode.
+
+
+
+
+
+
+losses
+
+Losses which are associated with this `Layer`.
+
+Variable regularization tensors are created when this property is accessed,
+so it is eager safe: accessing `losses` under a `tf.GradientTape` will
+propagate gradients back to the corresponding variables.
+
+#### Returns:
+
+A list of tensors.
+
+
+metrics
+
+
+
+
+name
+
+
+
+
+name_scope
+
+Returns a `tf.name_scope` instance for this class.
+
+
+non_trainable_variables
+
+
+
+
+non_trainable_weights
+
+
+
+
+output
+
+Retrieves the output tensor(s) of a layer.
+
+Only applicable if the layer has exactly one output,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output tensor or list of output tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to more than one incoming
+ layers.
+* `RuntimeError`: if called in Eager mode.
+
+output_mask
+
+Retrieves the output mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output mask tensor (potentially None) or list of output
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+output_shape
+
+Retrieves the output shape(s) of a layer.
+
+Only applicable if the layer has one output,
+or if all outputs have the same shape.
+
+#### Returns:
+
+Output shape, as an integer shape tuple
+(or list of shape tuples, one tuple per output tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined output shape.
+* `RuntimeError`: if called in Eager mode.
+
+submodules
+
+Sequence of all sub-modules.
+
+Submodules are modules which are properties of this module, or found as
+properties of modules which are properties of this module (and so on).
+
+```
+a = tf.Module()
+b = tf.Module()
+c = tf.Module()
+a.b = b
+b.c = c
+assert list(a.submodules) == [b, c]
+assert list(b.submodules) == [c]
+assert list(c.submodules) == []
+```
+
+#### Returns:
+
+A sequence of all submodules.
+
+
+trainable
+
+
+
+
+trainable_variables
+
+
+
+
+trainable_weights
+
+
+
+
+updates
+
+
+
+
+variables
+
+Returns the list of all layer variables/weights.
+
+Alias of `self.weights`.
+
+#### Returns:
+
+A list of variables.
+
+
+weights
+
+Returns the list of all layer variables/weights.
+
+
+#### Returns:
+
+A list of variables.
+
+
+
+
+## Methods
+
+__call__
+
+``` python
+__call__(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Wraps `call`, applying pre- and post-processing steps.
+
+
+#### Arguments:
+
+
+* `inputs`: input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+
+#### Note:
+
+- The following optional keyword arguments are reserved for specific uses:
+ * `training`: Boolean scalar tensor of Python boolean indicating
+ whether the `call` is meant for training or inference.
+ * `mask`: Boolean input mask.
+- If the layer's `call` method takes a `mask` argument (as some Keras
+  layers do), its default value will be set to the mask generated
+  for `inputs` by the previous layer (if `inputs` did come from
+  a layer that generated a corresponding mask, i.e. if it came from
+  a Keras layer with masking support).
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer's `call` method returns None (an invalid value).
+
+apply
+
+``` python
+apply(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Apply the layer on an input.
+
+This is an alias of `self.__call__`.
+
+#### Arguments:
+
+
+* `inputs`: Input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+build
+
+``` python
+build(input_shape)
+```
+
+
+
+
+compute_mask
+
+``` python
+compute_mask(
+ inputs,
+ mask=None
+)
+```
+
+Computes an output mask tensor.
+
+
+#### Arguments:
+
+
+* `inputs`: Tensor or list of tensors.
+* `mask`: Tensor or list of tensors.
+
+
+#### Returns:
+
+None or a tensor (or list of tensors,
+ one per output tensor of the layer).
+
+
+compute_output_shape
+
+``` python
+compute_output_shape(input_shape)
+```
+
+
+
+
+count_params
+
+``` python
+count_params()
+```
+
+Count the total number of scalars composing the weights.
+
+
+#### Returns:
+
+An integer count.
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer isn't yet built
+ (in which case its weights aren't yet defined).
+
+from_config
+
+``` python
+from_config(
+ cls,
+ config
+)
+```
+
+Creates a layer from its config.
+
+This method is the reverse of `get_config`,
+capable of instantiating the same layer from the config
+dictionary. It does not handle layer connectivity
+(handled by Network), nor weights (handled by `set_weights`).
+
+#### Arguments:
+
+
+* `config`: A Python dictionary, typically the
+ output of get_config.
+
+
+#### Returns:
+
+A layer instance.
+
+
+get_config
+
+``` python
+get_config()
+```
+
+
+
+
+
+get_input_at
+
+``` python
+get_input_at(node_index)
+```
+
+Retrieves the input tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+
+get_input_mask_at
+
+``` python
+get_input_mask_at(node_index)
+```
+
+Retrieves the input mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple inputs).
+
+
+
+get_input_shape_at
+
+``` python
+get_input_shape_at(node_index)
+```
+
+Retrieves the input shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_losses_for
+
+``` python
+get_losses_for(inputs)
+```
+
+Retrieves losses relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of loss tensors of the layer that depend on `inputs`.
+
+
+get_output_at
+
+``` python
+get_output_at(node_index)
+```
+
+Retrieves the output tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_output_mask_at
+
+``` python
+get_output_mask_at(node_index)
+```
+
+Retrieves the output mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple outputs).
+
+
+get_output_shape_at
+
+``` python
+get_output_shape_at(node_index)
+```
+
+Retrieves the output shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_updates_for
+
+``` python
+get_updates_for(inputs)
+```
+
+Retrieves updates relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of update ops of the layer that depend on `inputs`.
+
+
+get_weights
+
+``` python
+get_weights()
+```
+
+Returns the current weights of the layer.
+
+
+#### Returns:
+
+Weights values as a list of numpy arrays.
+
+
+set_weights
+
+``` python
+set_weights(weights)
+```
+
+Sets the weights of the layer, from Numpy arrays.
+
+
+#### Arguments:
+
+
+* `weights`: a list of Numpy arrays. The number
+ of arrays and their shape must match
+ number of the dimensions of the weights
+ of the layer (i.e. it should match the
+ output of `get_weights`).
+
+
+#### Raises:
+
+
+* `ValueError`: If the provided weights list does not match the
+ layer's specifications.
+
+with_name_scope
+
+``` python
+with_name_scope(
+ cls,
+ method
+)
+```
+
+Decorator to automatically enter the module name scope.
+
+```
+class MyModule(tf.Module):
+ @tf.Module.with_name_scope
+ def __call__(self, x):
+ if not hasattr(self, 'w'):
+ self.w = tf.Variable(tf.random.normal([x.shape[1], 64]))
+ return tf.matmul(x, self.w)
+```
+
+Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose
+names included the module name:
+
+```
+mod = MyModule()
+mod(tf.ones([8, 32]))
+# ==> <tf.Tensor: ...>
+mod.w
+# ==> <tf.Variable ...'my_module/w:0'>
+```
+
+#### Args:
+
+
+* `method`: The method to wrap.
+
+
+#### Returns:
+
+The original method wrapped such that it enters the module's name scope.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/layers/Maxout.md b/docs/api_docs/python/tfa/layers/Maxout.md
new file mode 100644
index 0000000000..c5651e4bd1
--- /dev/null
+++ b/docs/api_docs/python/tfa/layers/Maxout.md
@@ -0,0 +1,839 @@
+
+
+# tfa.layers.Maxout
+
+## Class `Maxout`
+
+Applies Maxout to the input.
+
+
+
+### Aliases:
+
+* Class `tfa.layers.Maxout`
+* Class `tfa.layers.maxout.Maxout`
+
+
+
+Defined in [`layers/maxout.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/layers/maxout.py).
+
+
+
+"Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron
+Courville, Yoshua Bengio. https://arxiv.org/abs/1302.4389
+
+Usually the operation is performed in the filter/channel dimension. This
+can also be used after Dense layers to reduce number of features.
+
+#### Arguments:
+
+
+* `num_units`: Specifies how many features will remain after maxout
+ in the `axis` dimension (usually channel).
+ This must be a factor of the number of input features.
+* `axis`: The dimension where max pooling will be performed. Default is the
+ last dimension.
+
+
+#### Input shape:
+
+nD tensor with shape: `(batch_size, ..., axis_dim, ...)`.
+
+
+
+#### Output shape:
+
+nD tensor with shape: `(batch_size, ..., num_units, ...)`.
+
+
+__init__
+
+``` python
+__init__(
+ num_units,
+ axis=-1,
+ **kwargs
+)
+```
+
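+A minimal usage sketch (the alias `tfa` for `tensorflow_addons` and the
+shapes below are illustrative, not part of the API above):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+# 8 input features with num_units=4: the max is taken over groups of 2.
+x = tf.random.normal([16, 8])
+y = tfa.layers.Maxout(num_units=4)(x)
+print(y.shape)  # (16, 4)
+```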
+
+
+
+
+
+## Properties
+
+activity_regularizer
+
+Optional regularizer function for the output of this layer.
+
+
+dtype
+
+
+
+
+dynamic
+
+
+
+
+
+input
+
+Retrieves the input tensor(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input tensor or list of input tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+* `RuntimeError`: If called in Eager mode.
+* `AttributeError`: If no inbound nodes are found.
+
+
+input_mask
+
+Retrieves the input mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input mask tensor (potentially None) or list of input
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+
+
+input_shape
+
+Retrieves the input shape(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer, or if all inputs
+have the same shape.
+
+#### Returns:
+
+Input shape, as an integer shape tuple
+(or list of shape tuples, one tuple per input tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined input_shape.
+* `RuntimeError`: if called in Eager mode.
+
+
+
+
+
+
+losses
+
+Losses which are associated with this `Layer`.
+
+Variable regularization tensors are created when this property is accessed,
+so it is eager safe: accessing `losses` under a `tf.GradientTape` will
+propagate gradients back to the corresponding variables.
+
+#### Returns:
+
+A list of tensors.
+
+
+metrics
+
+
+
+
+name
+
+
+
+
+name_scope
+
+Returns a `tf.name_scope` instance for this class.
+
+
+non_trainable_variables
+
+
+
+
+non_trainable_weights
+
+
+
+
+output
+
+Retrieves the output tensor(s) of a layer.
+
+Only applicable if the layer has exactly one output,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output tensor or list of output tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to more than one incoming
+ layer.
+* `RuntimeError`: if called in Eager mode.
+
+output_mask
+
+Retrieves the output mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output mask tensor (potentially None) or list of output
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+
+output_shape
+
+Retrieves the output shape(s) of a layer.
+
+Only applicable if the layer has one output,
+or if all outputs have the same shape.
+
+#### Returns:
+
+Output shape, as an integer shape tuple
+(or list of shape tuples, one tuple per output tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined output shape.
+* `RuntimeError`: if called in Eager mode.
+
+submodules
+
+Sequence of all sub-modules.
+
+Submodules are modules which are properties of this module, or found as
+properties of modules which are properties of this module (and so on).
+
+```
+a = tf.Module()
+b = tf.Module()
+c = tf.Module()
+a.b = b
+b.c = c
+assert list(a.submodules) == [b, c]
+assert list(b.submodules) == [c]
+assert list(c.submodules) == []
+```
+
+#### Returns:
+
+A sequence of all submodules.
+
+
+trainable
+
+
+
+
+trainable_variables
+
+
+
+
+trainable_weights
+
+
+
+
+updates
+
+
+
+
+variables
+
+Returns the list of all layer variables/weights.
+
+Alias of `self.weights`.
+
+#### Returns:
+
+A list of variables.
+
+
+weights
+
+Returns the list of all layer variables/weights.
+
+
+#### Returns:
+
+A list of variables.
+
+
+
+
+## Methods
+
+__call__
+
+``` python
+__call__(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Wraps `call`, applying pre- and post-processing steps.
+
+
+#### Arguments:
+
+
+* `inputs`: input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+
+#### Note:
+
+- The following optional keyword arguments are reserved for specific uses:
+ * `training`: Boolean scalar tensor or Python boolean indicating
+ whether the `call` is meant for training or inference.
+ * `mask`: Boolean input mask.
+- If the layer's `call` method takes a `mask` argument (as some Keras
+ layers do), its default value will be set to the mask generated
+ for `inputs` by the previous layer (if `inputs` did come from
+ a layer that generated a corresponding mask, i.e. if it came from
+ a Keras layer with masking support).
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer's `call` method returns None (an invalid value).
+
+apply
+
+``` python
+apply(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Applies the layer to an input.
+
+This is an alias of `self.__call__`.
+
+#### Arguments:
+
+
+* `inputs`: Input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+build
+
+``` python
+build(input_shape)
+```
+
+Creates the variables of the layer (optional, for subclass implementers).
+
+This is a method that implementers of subclasses of `Layer` or `Model`
+can override if they need a state-creation step in-between
+layer instantiation and layer call.
+
+This is typically used to create the weights of `Layer` subclasses.
+
+#### Arguments:
+
+
+* `input_shape`: Instance of `TensorShape`, or list of instances of
+ `TensorShape` if the layer expects a list of inputs
+ (one instance per input).
+
+compute_mask
+
+``` python
+compute_mask(
+ inputs,
+ mask=None
+)
+```
+
+Computes an output mask tensor.
+
+
+#### Arguments:
+
+
+* `inputs`: Tensor or list of tensors.
+* `mask`: Tensor or list of tensors.
+
+
+#### Returns:
+
+None or a tensor (or list of tensors,
+ one per output tensor of the layer).
+
+
+compute_output_shape
+
+``` python
+compute_output_shape(input_shape)
+```
+
+
+
+
+count_params
+
+``` python
+count_params()
+```
+
+Count the total number of scalars composing the weights.
+
+
+#### Returns:
+
+An integer count.
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer isn't yet built
+ (in which case its weights aren't yet defined).
+
+from_config
+
+``` python
+from_config(
+ cls,
+ config
+)
+```
+
+Creates a layer from its config.
+
+This method is the reverse of `get_config`,
+capable of instantiating the same layer from the config
+dictionary. It does not handle layer connectivity
+(handled by Network), nor weights (handled by `set_weights`).
+
+#### Arguments:
+
+
+* `config`: A Python dictionary, typically the
+ output of get_config.
+
+
+#### Returns:
+
+A layer instance.
+
+
+get_config
+
+``` python
+get_config()
+```
+
+
+
+
+
+get_input_at
+
+``` python
+get_input_at(node_index)
+```
+
+Retrieves the input tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+
+get_input_mask_at
+
+``` python
+get_input_mask_at(node_index)
+```
+
+Retrieves the input mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple inputs).
+
+
+
+get_input_shape_at
+
+``` python
+get_input_shape_at(node_index)
+```
+
+Retrieves the input shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_losses_for
+
+``` python
+get_losses_for(inputs)
+```
+
+Retrieves losses relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of loss tensors of the layer that depend on `inputs`.
+
+
+get_output_at
+
+``` python
+get_output_at(node_index)
+```
+
+Retrieves the output tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_output_mask_at
+
+``` python
+get_output_mask_at(node_index)
+```
+
+Retrieves the output mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple outputs).
+
+
+get_output_shape_at
+
+``` python
+get_output_shape_at(node_index)
+```
+
+Retrieves the output shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_updates_for
+
+``` python
+get_updates_for(inputs)
+```
+
+Retrieves updates relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of update ops of the layer that depend on `inputs`.
+
+
+get_weights
+
+``` python
+get_weights()
+```
+
+Returns the current weights of the layer.
+
+
+#### Returns:
+
+Weights values as a list of numpy arrays.
+
+
+set_weights
+
+``` python
+set_weights(weights)
+```
+
+Sets the weights of the layer, from Numpy arrays.
+
+
+#### Arguments:
+
+
+* `weights`: a list of Numpy arrays. The number
+ of arrays and their shapes must match the
+ number and shapes of the weights of the
+ layer (i.e. it should match the
+ output of `get_weights`).
+
+
+#### Raises:
+
+
+* `ValueError`: If the provided weights list does not match the
+ layer's specifications.
+
+with_name_scope
+
+``` python
+with_name_scope(
+ cls,
+ method
+)
+```
+
+Decorator to automatically enter the module name scope.
+
+```
+class MyModule(tf.Module):
+ @tf.Module.with_name_scope
+ def __call__(self, x):
+ if not hasattr(self, 'w'):
+ self.w = tf.Variable(tf.random.normal([x.shape[1], 64]))
+ return tf.matmul(x, self.w)
+```
+
+Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose
+names included the module name:
+
+```
+mod = MyModule()
+mod(tf.ones([8, 32]))
+# ==> <tf.Tensor: ...>
+mod.w
+# ==> <tf.Variable ...'my_module/w:0'>
+```
+
+#### Args:
+
+
+* `method`: The method to wrap.
+
+
+#### Returns:
+
+The original method wrapped such that it enters the module's name scope.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/layers/PoincareNormalize.md b/docs/api_docs/python/tfa/layers/PoincareNormalize.md
new file mode 100644
index 0000000000..3e8713f410
--- /dev/null
+++ b/docs/api_docs/python/tfa/layers/PoincareNormalize.md
@@ -0,0 +1,833 @@
+
+
+# tfa.layers.PoincareNormalize
+
+## Class `PoincareNormalize`
+
+Project into the Poincare ball with norm <= 1.0 - epsilon.
+
+
+
+### Aliases:
+
+* Class `tfa.layers.PoincareNormalize`
+* Class `tfa.layers.poincare.PoincareNormalize`
+
+
+
+Defined in [`layers/poincare.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/layers/poincare.py).
+
+
+
+See: https://en.wikipedia.org/wiki/Poincare_ball_model
+
+Used in "Poincare Embeddings for Learning Hierarchical Representations"
+by Maximilian Nickel and Douwe Kiela: https://arxiv.org/pdf/1705.08039.pdf
+
+For a 1-D tensor with `axis = 0`, computes
+
+    output = (x * (1 - epsilon)) / ||x||    if ||x|| > 1 - epsilon
+    output = x                              otherwise
+
+For `x` with more dimensions, independently normalizes each 1-D slice along
+dimension `axis`.
+
+#### Arguments:
+
+
+* `axis`: Axis along which to normalize. A scalar or a vector of integers.
+* `epsilon`: A small deviation from the edge of the unit sphere for
+ numerical stability.
+
+__init__
+
+``` python
+__init__(
+ axis=1,
+ epsilon=1e-05,
+ **kwargs
+)
+```
+
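+A minimal sketch of the projection (the printed values are approximate
+and assume the default `epsilon`):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+x = tf.constant([[3.0, 4.0]])  # ||x|| = 5 > 1 - epsilon
+y = tfa.layers.PoincareNormalize(axis=1)(x)
+print(y)                   # ~[[0.6, 0.8]], i.e. x rescaled to norm 1 - epsilon
+print(tf.norm(y, axis=1))  # ~0.99999
+```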
+
+
+
+
+
+## Properties
+
+activity_regularizer
+
+Optional regularizer function for the output of this layer.
+
+
+dtype
+
+
+
+
+dynamic
+
+
+
+
+
+input
+
+Retrieves the input tensor(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input tensor or list of input tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+* `RuntimeError`: If called in Eager mode.
+* `AttributeError`: If no inbound nodes are found.
+
+
+input_mask
+
+Retrieves the input mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input mask tensor (potentially None) or list of input
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+
+
+input_shape
+
+Retrieves the input shape(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer, or if all inputs
+have the same shape.
+
+#### Returns:
+
+Input shape, as an integer shape tuple
+(or list of shape tuples, one tuple per input tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined input_shape.
+* `RuntimeError`: if called in Eager mode.
+
+
+
+
+
+
+losses
+
+Losses which are associated with this `Layer`.
+
+Variable regularization tensors are created when this property is accessed,
+so it is eager safe: accessing `losses` under a `tf.GradientTape` will
+propagate gradients back to the corresponding variables.
+
+#### Returns:
+
+A list of tensors.
+
+
+metrics
+
+
+
+
+name
+
+
+
+
+name_scope
+
+Returns a `tf.name_scope` instance for this class.
+
+
+non_trainable_variables
+
+
+
+
+non_trainable_weights
+
+
+
+
+output
+
+Retrieves the output tensor(s) of a layer.
+
+Only applicable if the layer has exactly one output,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output tensor or list of output tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to more than one incoming
+ layer.
+* `RuntimeError`: if called in Eager mode.
+
+output_mask
+
+Retrieves the output mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output mask tensor (potentially None) or list of output
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+
+output_shape
+
+Retrieves the output shape(s) of a layer.
+
+Only applicable if the layer has one output,
+or if all outputs have the same shape.
+
+#### Returns:
+
+Output shape, as an integer shape tuple
+(or list of shape tuples, one tuple per output tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined output shape.
+* `RuntimeError`: if called in Eager mode.
+
+submodules
+
+Sequence of all sub-modules.
+
+Submodules are modules which are properties of this module, or found as
+properties of modules which are properties of this module (and so on).
+
+```
+a = tf.Module()
+b = tf.Module()
+c = tf.Module()
+a.b = b
+b.c = c
+assert list(a.submodules) == [b, c]
+assert list(b.submodules) == [c]
+assert list(c.submodules) == []
+```
+
+#### Returns:
+
+A sequence of all submodules.
+
+
+trainable
+
+
+
+
+trainable_variables
+
+
+
+
+trainable_weights
+
+
+
+
+updates
+
+
+
+
+variables
+
+Returns the list of all layer variables/weights.
+
+Alias of `self.weights`.
+
+#### Returns:
+
+A list of variables.
+
+
+weights
+
+Returns the list of all layer variables/weights.
+
+
+#### Returns:
+
+A list of variables.
+
+
+
+
+## Methods
+
+__call__
+
+``` python
+__call__(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Wraps `call`, applying pre- and post-processing steps.
+
+
+#### Arguments:
+
+
+* `inputs`: input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+
+#### Note:
+
+- The following optional keyword arguments are reserved for specific uses:
+ * `training`: Boolean scalar tensor or Python boolean indicating
+ whether the `call` is meant for training or inference.
+ * `mask`: Boolean input mask.
+- If the layer's `call` method takes a `mask` argument (as some Keras
+ layers do), its default value will be set to the mask generated
+ for `inputs` by the previous layer (if `inputs` did come from
+ a layer that generated a corresponding mask, i.e. if it came from
+ a Keras layer with masking support).
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer's `call` method returns None (an invalid value).
+
+apply
+
+``` python
+apply(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Applies the layer to an input.
+
+This is an alias of `self.__call__`.
+
+#### Arguments:
+
+
+* `inputs`: Input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+build
+
+``` python
+build(input_shape)
+```
+
+Creates the variables of the layer (optional, for subclass implementers).
+
+This is a method that implementers of subclasses of `Layer` or `Model`
+can override if they need a state-creation step in-between
+layer instantiation and layer call.
+
+This is typically used to create the weights of `Layer` subclasses.
+
+#### Arguments:
+
+
+* `input_shape`: Instance of `TensorShape`, or list of instances of
+ `TensorShape` if the layer expects a list of inputs
+ (one instance per input).
+
+compute_mask
+
+``` python
+compute_mask(
+ inputs,
+ mask=None
+)
+```
+
+Computes an output mask tensor.
+
+
+#### Arguments:
+
+
+* `inputs`: Tensor or list of tensors.
+* `mask`: Tensor or list of tensors.
+
+
+#### Returns:
+
+None or a tensor (or list of tensors,
+ one per output tensor of the layer).
+
+
+compute_output_shape
+
+``` python
+compute_output_shape(input_shape)
+```
+
+
+
+
+count_params
+
+``` python
+count_params()
+```
+
+Count the total number of scalars composing the weights.
+
+
+#### Returns:
+
+An integer count.
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer isn't yet built
+ (in which case its weights aren't yet defined).
+
+from_config
+
+``` python
+from_config(
+ cls,
+ config
+)
+```
+
+Creates a layer from its config.
+
+This method is the reverse of `get_config`,
+capable of instantiating the same layer from the config
+dictionary. It does not handle layer connectivity
+(handled by Network), nor weights (handled by `set_weights`).
+
+#### Arguments:
+
+
+* `config`: A Python dictionary, typically the
+ output of get_config.
+
+
+#### Returns:
+
+A layer instance.
+
+
+get_config
+
+``` python
+get_config()
+```
+
+
+
+
+
+get_input_at
+
+``` python
+get_input_at(node_index)
+```
+
+Retrieves the input tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+
+get_input_mask_at
+
+``` python
+get_input_mask_at(node_index)
+```
+
+Retrieves the input mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple inputs).
+
+
+
+get_input_shape_at
+
+``` python
+get_input_shape_at(node_index)
+```
+
+Retrieves the input shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_losses_for
+
+``` python
+get_losses_for(inputs)
+```
+
+Retrieves losses relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of loss tensors of the layer that depend on `inputs`.
+
+
+get_output_at
+
+``` python
+get_output_at(node_index)
+```
+
+Retrieves the output tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_output_mask_at
+
+``` python
+get_output_mask_at(node_index)
+```
+
+Retrieves the output mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple outputs).
+
+
+get_output_shape_at
+
+``` python
+get_output_shape_at(node_index)
+```
+
+Retrieves the output shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_updates_for
+
+``` python
+get_updates_for(inputs)
+```
+
+Retrieves updates relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of update ops of the layer that depend on `inputs`.
+
+
+get_weights
+
+``` python
+get_weights()
+```
+
+Returns the current weights of the layer.
+
+
+#### Returns:
+
+Weights values as a list of numpy arrays.
+
+
+set_weights
+
+``` python
+set_weights(weights)
+```
+
+Sets the weights of the layer, from Numpy arrays.
+
+
+#### Arguments:
+
+
+* `weights`: a list of Numpy arrays. The number
+ of arrays and their shapes must match the
+ number and shapes of the weights of the
+ layer (i.e. it should match the
+ output of `get_weights`).
+
+
+#### Raises:
+
+
+* `ValueError`: If the provided weights list does not match the
+ layer's specifications.
+
+with_name_scope
+
+``` python
+with_name_scope(
+ cls,
+ method
+)
+```
+
+Decorator to automatically enter the module name scope.
+
+```
+class MyModule(tf.Module):
+ @tf.Module.with_name_scope
+ def __call__(self, x):
+ if not hasattr(self, 'w'):
+ self.w = tf.Variable(tf.random.normal([x.shape[1], 64]))
+ return tf.matmul(x, self.w)
+```
+
+Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose
+names included the module name:
+
+```
+mod = MyModule()
+mod(tf.ones([8, 32]))
+# ==> <tf.Tensor: ...>
+mod.w
+# ==> <tf.Variable ...'my_module/w:0'>
+```
+
+#### Args:
+
+
+* `method`: The method to wrap.
+
+
+#### Returns:
+
+The original method wrapped such that it enters the module's name scope.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/layers/Sparsemax.md b/docs/api_docs/python/tfa/layers/Sparsemax.md
new file mode 100644
index 0000000000..3c136b3676
--- /dev/null
+++ b/docs/api_docs/python/tfa/layers/Sparsemax.md
@@ -0,0 +1,820 @@
+
+
+# tfa.layers.Sparsemax
+
+## Class `Sparsemax`
+
+Sparsemax activation function [1].
+
+
+
+### Aliases:
+
+* Class `tfa.layers.Sparsemax`
+* Class `tfa.layers.sparsemax.Sparsemax`
+
+
+
+Defined in [`layers/sparsemax.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/layers/sparsemax.py).
+
+
+
+The output shape is the same as the input shape.
+
+[1]: https://arxiv.org/abs/1602.02068
+
+#### Arguments:
+
+
+* `axis`: Integer, axis along which the sparsemax normalization is applied.
+
+__init__
+
+``` python
+__init__(
+ axis=-1,
+ **kwargs
+)
+```
+
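+A minimal sketch (the expected output is worked out from the sparsemax
+definition in [1]; the logits are illustrative):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+logits = tf.constant([[1.0, 1.5, 2.0]])
+probs = tfa.layers.Sparsemax()(logits)
+# Unlike softmax, sparsemax can assign exactly zero probability:
+print(probs)  # [[0.   0.25 0.75]]
+```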
+
+
+
+
+
+## Properties
+
+activity_regularizer
+
+Optional regularizer function for the output of this layer.
+
+
+dtype
+
+
+
+
+dynamic
+
+
+
+
+
+input
+
+Retrieves the input tensor(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input tensor or list of input tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+* `RuntimeError`: If called in Eager mode.
+* `AttributeError`: If no inbound nodes are found.
+
+
+input_mask
+
+Retrieves the input mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input mask tensor (potentially None) or list of input
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+
+
+input_shape
+
+Retrieves the input shape(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer, or if all inputs
+have the same shape.
+
+#### Returns:
+
+Input shape, as an integer shape tuple
+(or list of shape tuples, one tuple per input tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined input_shape.
+* `RuntimeError`: if called in Eager mode.
+
+
+
+
+
+
+losses
+
+Losses which are associated with this `Layer`.
+
+Variable regularization tensors are created when this property is accessed,
+so it is eager safe: accessing `losses` under a `tf.GradientTape` will
+propagate gradients back to the corresponding variables.
+
+#### Returns:
+
+A list of tensors.
+
+
+metrics
+
+
+
+
+name
+
+
+
+
+name_scope
+
+Returns a `tf.name_scope` instance for this class.
+
+
+non_trainable_variables
+
+
+
+
+non_trainable_weights
+
+
+
+
+output
+
+Retrieves the output tensor(s) of a layer.
+
+Only applicable if the layer has exactly one output,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output tensor or list of output tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to more than one incoming
+ layer.
+* `RuntimeError`: if called in Eager mode.
+
+output_mask
+
+Retrieves the output mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output mask tensor (potentially None) or list of output
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+
+output_shape
+
+Retrieves the output shape(s) of a layer.
+
+Only applicable if the layer has one output,
+or if all outputs have the same shape.
+
+#### Returns:
+
+Output shape, as an integer shape tuple
+(or list of shape tuples, one tuple per output tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined output shape.
+* `RuntimeError`: if called in Eager mode.
+
+submodules
+
+Sequence of all sub-modules.
+
+Submodules are modules which are properties of this module, or found as
+properties of modules which are properties of this module (and so on).
+
+```
+a = tf.Module()
+b = tf.Module()
+c = tf.Module()
+a.b = b
+b.c = c
+assert list(a.submodules) == [b, c]
+assert list(b.submodules) == [c]
+assert list(c.submodules) == []
+```
+
+#### Returns:
+
+A sequence of all submodules.
+
+
+trainable
+
+
+
+
+trainable_variables
+
+
+
+
+trainable_weights
+
+
+
+
+updates
+
+
+
+
+variables
+
+Returns the list of all layer variables/weights.
+
+Alias of `self.weights`.
+
+#### Returns:
+
+A list of variables.
+
+
+weights
+
+Returns the list of all layer variables/weights.
+
+
+#### Returns:
+
+A list of variables.
+
+
+
+
+## Methods
+
+__call__
+
+``` python
+__call__(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Wraps `call`, applying pre- and post-processing steps.
+
+
+#### Arguments:
+
+
+* `inputs`: input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+
+#### Note:
+
+- The following optional keyword arguments are reserved for specific uses:
+ * `training`: Boolean scalar tensor or Python boolean indicating
+ whether the `call` is meant for training or inference.
+ * `mask`: Boolean input mask.
+- If the layer's `call` method takes a `mask` argument (as some Keras
+ layers do), its default value will be set to the mask generated
+ for `inputs` by the previous layer (if `inputs` did come from
+ a layer that generated a corresponding mask, i.e. if it came from
+ a Keras layer with masking support).
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer's `call` method returns None (an invalid value).
+
+apply
+
+``` python
+apply(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Applies the layer to an input.
+
+This is an alias of `self.__call__`.
+
+#### Arguments:
+
+
+* `inputs`: Input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+build
+
+``` python
+build(input_shape)
+```
+
+Creates the variables of the layer (optional, for subclass implementers).
+
+This is a method that implementers of subclasses of `Layer` or `Model`
+can override if they need a state-creation step in-between
+layer instantiation and layer call.
+
+This is typically used to create the weights of `Layer` subclasses.
+
+#### Arguments:
+
+
+* `input_shape`: Instance of `TensorShape`, or list of instances of
+ `TensorShape` if the layer expects a list of inputs
+ (one instance per input).
+
+compute_mask
+
+``` python
+compute_mask(
+ inputs,
+ mask=None
+)
+```
+
+Computes an output mask tensor.
+
+
+#### Arguments:
+
+
+* `inputs`: Tensor or list of tensors.
+* `mask`: Tensor or list of tensors.
+
+
+#### Returns:
+
+None or a tensor (or list of tensors,
+ one per output tensor of the layer).
+
+
+compute_output_shape
+
+``` python
+compute_output_shape(input_shape)
+```
+
+
+
+
+count_params
+
+``` python
+count_params()
+```
+
+Count the total number of scalars composing the weights.
+
+
+#### Returns:
+
+An integer count.
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer isn't yet built
+ (in which case its weights aren't yet defined).
+
+from_config
+
+``` python
+from_config(
+ cls,
+ config
+)
+```
+
+Creates a layer from its config.
+
+This method is the reverse of `get_config`,
+capable of instantiating the same layer from the config
+dictionary. It does not handle layer connectivity
+(handled by Network), nor weights (handled by `set_weights`).
+
+#### Arguments:
+
+
+* `config`: A Python dictionary, typically the
+ output of get_config.
+
+
+#### Returns:
+
+A layer instance.
+
+
+get_config
+
+``` python
+get_config()
+```
+
+
+
+
+
+get_input_at
+
+``` python
+get_input_at(node_index)
+```
+
+Retrieves the input tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+
+get_input_mask_at
+
+``` python
+get_input_mask_at(node_index)
+```
+
+Retrieves the input mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple inputs).
+
+
+
+get_input_shape_at
+
+``` python
+get_input_shape_at(node_index)
+```
+
+Retrieves the input shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_losses_for
+
+``` python
+get_losses_for(inputs)
+```
+
+Retrieves losses relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of loss tensors of the layer that depend on `inputs`.
+
+
+get_output_at
+
+``` python
+get_output_at(node_index)
+```
+
+Retrieves the output tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_output_mask_at
+
+``` python
+get_output_mask_at(node_index)
+```
+
+Retrieves the output mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple outputs).
+
+
+get_output_shape_at
+
+``` python
+get_output_shape_at(node_index)
+```
+
+Retrieves the output shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_updates_for
+
+``` python
+get_updates_for(inputs)
+```
+
+Retrieves updates relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of update ops of the layer that depend on `inputs`.
+
+
+get_weights
+
+``` python
+get_weights()
+```
+
+Returns the current weights of the layer.
+
+
+#### Returns:
+
+Weights values as a list of numpy arrays.
+
+
+set_weights
+
+``` python
+set_weights(weights)
+```
+
+Sets the weights of the layer, from Numpy arrays.
+
+
+#### Arguments:
+
+
+* `weights`: a list of Numpy arrays. The number
+ of arrays and their shapes must match the
+ number and shapes of the weights of the
+ layer (i.e. it should match the
+ output of `get_weights`).
+
+
+#### Raises:
+
+
+* `ValueError`: If the provided weights list does not match the
+ layer's specifications.
+
+with_name_scope
+
+``` python
+with_name_scope(
+ cls,
+ method
+)
+```
+
+Decorator to automatically enter the module name scope.
+
+```
+class MyModule(tf.Module):
+ @tf.Module.with_name_scope
+ def __call__(self, x):
+ if not hasattr(self, 'w'):
+ self.w = tf.Variable(tf.random.normal([x.shape[1], 64]))
+ return tf.matmul(x, self.w)
+```
+
+Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose
+names included the module name:
+
+```
+mod = MyModule()
+mod(tf.ones([8, 32]))
+# ==> <tf.Tensor: ...>
+mod.w
+# ==> <tf.Variable ...'my_module/w:0'>
+```
+
+#### Args:
+
+
+* `method`: The method to wrap.
+
+
+#### Returns:
+
+The original method wrapped such that it enters the module's name scope.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/layers/WeightNormalization.md b/docs/api_docs/python/tfa/layers/WeightNormalization.md
new file mode 100644
index 0000000000..0722840c90
--- /dev/null
+++ b/docs/api_docs/python/tfa/layers/WeightNormalization.md
@@ -0,0 +1,815 @@
+
+
+# tfa.layers.WeightNormalization
+
+## Class `WeightNormalization`
+
+This wrapper reparameterizes a layer by decoupling the weight's
+magnitude and direction.
+
+
+
+### Aliases:
+
+* Class `tfa.layers.WeightNormalization`
+* Class `tfa.layers.wrappers.WeightNormalization`
+
+
+
+Defined in [`layers/wrappers.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/layers/wrappers.py).
+
+
+This speeds up convergence by improving the
+conditioning of the optimization problem.
+
+See "Weight Normalization: A Simple Reparameterization to Accelerate
+Training of Deep Neural Networks" by Tim Salimans and Diederik P. Kingma
+(2016): https://arxiv.org/abs/1602.07868
+
+The `WeightNormalization` wrapper works for Keras and TF layers.
+```python
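+ # Sketch only: `x` is assumed to be an input tensor, e.g.
+ # x = tf.keras.Input(shape=(32, 32, 3)), and `n_classes` the number of
+ # output classes; neither is defined by this snippet.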
+ net = WeightNormalization(
+ tf.keras.layers.Conv2D(2, 2, activation='relu'),
+ input_shape=(32, 32, 3),
+ data_init=True)(x)
+ net = WeightNormalization(
+ tf.keras.layers.Conv2D(16, 5, activation='relu'),
+ data_init=True)(net)
+ net = WeightNormalization(
+ tf.keras.layers.Dense(120, activation='relu'),
+ data_init=True)(net)
+ net = WeightNormalization(
+ tf.keras.layers.Dense(n_classes),
+ data_init=True)(net)
+```
+#### Arguments:
+
+
+* `layer`: A layer instance.
+* `data_init`: If `True`, use data-dependent variable initialization.
+
+
+#### Raises:
+
+
+* `ValueError`: If not initialized with a `Layer` instance.
+* `ValueError`: If `Layer` does not contain a `kernel` of weights.
+* `NotImplementedError`: If `data_init` is `True` and running graph execution.
+
+__init__
+
+``` python
+__init__(
+ layer,
+ data_init=True,
+ **kwargs
+)
+```
+
+
+
+
+
+
+## Properties
+
+activity_regularizer
+
+
+
+
+dtype
+
+
+
+
+dynamic
+
+
+
+
+
+input
+
+Retrieves the input tensor(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input tensor or list of input tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+* `RuntimeError`: If called in Eager mode.
+* `AttributeError`: If no inbound nodes are found.
+
+
+input_mask
+
+Retrieves the input mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input mask tensor (potentially None) or list of input
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+
+
+input_shape
+
+Retrieves the input shape(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer, or if all inputs
+have the same shape.
+
+#### Returns:
+
+Input shape, as an integer shape tuple
+(or list of shape tuples, one tuple per input tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined input_shape.
+* `RuntimeError`: if called in Eager mode.
+
+
+
+
+
+
+losses
+
+Losses which are associated with this `Layer`.
+
+Variable regularization tensors are created when this property is accessed,
+so it is eager safe: accessing `losses` under a `tf.GradientTape` will
+propagate gradients back to the corresponding variables.
+
+#### Returns:
+
+A list of tensors.
+
+
+metrics
+
+
+
+
+name
+
+
+
+
+name_scope
+
+Returns a `tf.name_scope` instance for this class.
+
+
+non_trainable_variables
+
+
+
+
+non_trainable_weights
+
+
+
+
+output
+
+Retrieves the output tensor(s) of a layer.
+
+Only applicable if the layer has exactly one output,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output tensor or list of output tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to more than one incoming
+ layer.
+* `RuntimeError`: if called in Eager mode.
+
+output_mask
+
+Retrieves the output mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output mask tensor (potentially None) or list of output
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+
+output_shape
+
+Retrieves the output shape(s) of a layer.
+
+Only applicable if the layer has one output,
+or if all outputs have the same shape.
+
+#### Returns:
+
+Output shape, as an integer shape tuple
+(or list of shape tuples, one tuple per output tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined output shape.
+* `RuntimeError`: if called in Eager mode.
+
+submodules
+
+Sequence of all sub-modules.
+
+Submodules are modules which are properties of this module, or found as
+properties of modules which are properties of this module (and so on).
+
+```
+a = tf.Module()
+b = tf.Module()
+c = tf.Module()
+a.b = b
+b.c = c
+assert list(a.submodules) == [b, c]
+assert list(b.submodules) == [c]
+assert list(c.submodules) == []
+```
+
+#### Returns:
+
+A sequence of all submodules.
+
+
+trainable
+
+
+
+
+trainable_variables
+
+
+
+
+trainable_weights
+
+
+
+
+updates
+
+
+
+
+variables
+
+Returns the list of all layer variables/weights.
+
+Alias of `self.weights`.
+
+#### Returns:
+
+A list of variables.
+
+
+weights
+
+Returns the list of all layer variables/weights.
+
+
+#### Returns:
+
+A list of variables.
+
+
+
+
+## Methods
+
+__call__
+
+``` python
+__call__(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Wraps `call`, applying pre- and post-processing steps.
+
+
+#### Arguments:
+
+
+* `inputs`: input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+
+#### Note:
+
+- The following optional keyword arguments are reserved for specific uses:
+ * `training`: Boolean scalar tensor or Python boolean indicating
+ whether the `call` is meant for training or inference.
+ * `mask`: Boolean input mask.
+- If the layer's `call` method takes a `mask` argument (as some Keras
+ layers do), its default value will be set to the mask generated
+ for `inputs` by the previous layer (if `inputs` did come from
+ a layer that generated a corresponding mask, i.e. if it came from
+ a Keras layer with masking support).
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer's `call` method returns None (an invalid value).
+
+apply
+
+``` python
+apply(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Applies the layer to an input.
+
+This is an alias of `self.__call__`.
+
+#### Arguments:
+
+
+* `inputs`: Input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+build
+
+``` python
+build(input_shape)
+```
+
+Builds the wrapped `Layer`.
+
+
+compute_mask
+
+``` python
+compute_mask(
+ inputs,
+ mask=None
+)
+```
+
+Computes an output mask tensor.
+
+
+#### Arguments:
+
+
+* `inputs`: Tensor or list of tensors.
+* `mask`: Tensor or list of tensors.
+
+
+#### Returns:
+
+None or a tensor (or list of tensors,
+ one per output tensor of the layer).
+
+
+compute_output_shape
+
+``` python
+compute_output_shape(input_shape)
+```
+
+
+
+
+count_params
+
+``` python
+count_params()
+```
+
+Count the total number of scalars composing the weights.
+
+
+#### Returns:
+
+An integer count.
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer isn't yet built
+ (in which case its weights aren't yet defined).
+
+from_config
+
+``` python
+from_config(
+ cls,
+ config,
+ custom_objects=None
+)
+```
+
+
+
+
+get_config
+
+``` python
+get_config()
+```
+
+
+
+
+
+get_input_at
+
+``` python
+get_input_at(node_index)
+```
+
+Retrieves the input tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+
+get_input_mask_at
+
+``` python
+get_input_mask_at(node_index)
+```
+
+Retrieves the input mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple inputs).
+
+
+
+get_input_shape_at
+
+``` python
+get_input_shape_at(node_index)
+```
+
+Retrieves the input shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_losses_for
+
+``` python
+get_losses_for(inputs)
+```
+
+Retrieves losses relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of loss tensors of the layer that depend on `inputs`.
+
+
+get_output_at
+
+``` python
+get_output_at(node_index)
+```
+
+Retrieves the output tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_output_mask_at
+
+``` python
+get_output_mask_at(node_index)
+```
+
+Retrieves the output mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple outputs).
+
+
+get_output_shape_at
+
+``` python
+get_output_shape_at(node_index)
+```
+
+Retrieves the output shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_updates_for
+
+``` python
+get_updates_for(inputs)
+```
+
+Retrieves updates relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of update ops of the layer that depend on `inputs`.
+
+
+get_weights
+
+``` python
+get_weights()
+```
+
+Returns the current weights of the layer.
+
+
+#### Returns:
+
+Weights values as a list of numpy arrays.
+
+
+set_weights
+
+``` python
+set_weights(weights)
+```
+
+Sets the weights of the layer, from Numpy arrays.
+
+
+#### Arguments:
+
+
+* `weights`: a list of Numpy arrays. The number
+ of arrays and their shapes must match the
+ number and shapes of the weights of the
+ layer (i.e. it should match the
+ output of `get_weights`).
+
+
+#### Raises:
+
+
+* `ValueError`: If the provided weights list does not match the
+ layer's specifications.
+
+with_name_scope
+
+``` python
+with_name_scope(
+ cls,
+ method
+)
+```
+
+Decorator to automatically enter the module name scope.
+
+```
+class MyModule(tf.Module):
+ @tf.Module.with_name_scope
+ def __call__(self, x):
+ if not hasattr(self, 'w'):
+ self.w = tf.Variable(tf.random.normal([x.shape[1], 64]))
+ return tf.matmul(x, self.w)
+```
+
+Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose
+names included the module name:
+
+```
+mod = MyModule()
+mod(tf.ones([8, 32]))
+# ==> <tf.Tensor: ...>
+mod.w
+# ==> <tf.Variable ...'my_module/w:0'>
+```
+
+#### Args:
+
+
+* `method`: The method to wrap.
+
+
+#### Returns:
+
+The original method wrapped such that it enters the module's name scope.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/layers/maxout.md b/docs/api_docs/python/tfa/layers/maxout.md
new file mode 100644
index 0000000000..549999941b
--- /dev/null
+++ b/docs/api_docs/python/tfa/layers/maxout.md
@@ -0,0 +1,20 @@
+
+
+
+
+
+# Module: tfa.layers.maxout
+
+Implementing Maxout layer.
+
+
+
+Defined in [`layers/maxout.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/layers/maxout.py).
+
+
+
+
+## Classes
+
+[`class Maxout`](../../tfa/layers/Maxout.md): Applies Maxout to the input.
+
diff --git a/docs/api_docs/python/tfa/layers/normalizations.md b/docs/api_docs/python/tfa/layers/normalizations.md
new file mode 100644
index 0000000000..e0daf2a090
--- /dev/null
+++ b/docs/api_docs/python/tfa/layers/normalizations.md
@@ -0,0 +1,22 @@
+
+
+
+
+
+# Module: tfa.layers.normalizations
+
+
+
+
+
+Defined in [`layers/normalizations.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/layers/normalizations.py).
+
+
+
+
+## Classes
+
+[`class GroupNormalization`](../../tfa/layers/GroupNormalization.md): Group normalization layer.
+
+[`class InstanceNormalization`](../../tfa/layers/InstanceNormalization.md): Instance normalization layer.
+
diff --git a/docs/api_docs/python/tfa/layers/poincare.md b/docs/api_docs/python/tfa/layers/poincare.md
new file mode 100644
index 0000000000..62db1071b0
--- /dev/null
+++ b/docs/api_docs/python/tfa/layers/poincare.md
@@ -0,0 +1,20 @@
+
+
+
+
+
+# Module: tfa.layers.poincare
+
+Implementing PoincareNormalize layer.
+
+
+
+Defined in [`layers/poincare.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/layers/poincare.py).
+
+
+
+
+## Classes
+
+[`class PoincareNormalize`](../../tfa/layers/PoincareNormalize.md): Project into the Poincare ball with norm <= 1.0 - epsilon.
+
diff --git a/docs/api_docs/python/tfa/layers/sparsemax.md b/docs/api_docs/python/tfa/layers/sparsemax.md
new file mode 100644
index 0000000000..9820c5c185
--- /dev/null
+++ b/docs/api_docs/python/tfa/layers/sparsemax.md
@@ -0,0 +1,24 @@
+
+
+
+
+
+# Module: tfa.layers.sparsemax
+
+
+
+
+
+Defined in [`layers/sparsemax.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/layers/sparsemax.py).
+
+
+
+
+## Classes
+
+[`class Sparsemax`](../../tfa/layers/Sparsemax.md): Sparsemax activation function [1].
+
+## Functions
+
+[`sparsemax(...)`](../../tfa/activations/sparsemax.md): Sparsemax activation function [1].
+
diff --git a/docs/api_docs/python/tfa/layers/wrappers.md b/docs/api_docs/python/tfa/layers/wrappers.md
new file mode 100644
index 0000000000..6ed1fdf9ee
--- /dev/null
+++ b/docs/api_docs/python/tfa/layers/wrappers.md
@@ -0,0 +1,20 @@
+
+
+
+
+
+# Module: tfa.layers.wrappers
+
+
+
+
+
+Defined in [`layers/wrappers.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/layers/wrappers.py).
+
+
+
+
+## Classes
+
+[`class WeightNormalization`](../../tfa/layers/WeightNormalization.md): This wrapper reparameterizes a layer by decoupling the weight's magnitude and direction.
+
diff --git a/docs/api_docs/python/tfa/losses.md b/docs/api_docs/python/tfa/losses.md
new file mode 100644
index 0000000000..07c74a8db3
--- /dev/null
+++ b/docs/api_docs/python/tfa/losses.md
@@ -0,0 +1,52 @@
+
+
+
+
+
+# Module: tfa.losses
+
+Additional losses that conform to Keras API.
+
+
+
+Defined in [`losses/__init__.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/losses/__init__.py).
+
+
+
+
+## Modules
+
+[`contrastive`](../tfa/losses/contrastive.md) module: Implements contrastive loss.
+
+[`focal_loss`](../tfa/losses/focal_loss.md) module: Implements Focal loss.
+
+[`lifted`](../tfa/losses/lifted.md) module: Implements lifted_struct_loss.
+
+[`metric_learning`](../tfa/losses/metric_learning.md) module: Functions of metric learning.
+
+[`triplet`](../tfa/losses/triplet.md) module: Implements triplet loss.
+
+## Classes
+
+[`class ContrastiveLoss`](../tfa/losses/ContrastiveLoss.md): Computes the contrastive loss between `y_true` and `y_pred`.
+
+[`class LiftedStructLoss`](../tfa/losses/LiftedStructLoss.md): Computes the lifted structured loss.
+
+[`class SigmoidFocalCrossEntropy`](../tfa/losses/SigmoidFocalCrossEntropy.md): Implements the focal loss function.
+
+[`class SparsemaxLoss`](../tfa/losses/SparsemaxLoss.md): Sparsemax loss function.
+
+[`class TripletSemiHardLoss`](../tfa/losses/TripletSemiHardLoss.md): Computes the triplet loss with semi-hard negative mining.
+
+## Functions
+
+[`contrastive_loss(...)`](../tfa/losses/contrastive_loss.md): Computes the contrastive loss between `y_true` and `y_pred`.
+
+[`lifted_struct_loss(...)`](../tfa/losses/lifted_struct_loss.md): Computes the lifted structured loss.
+
+[`sigmoid_focal_crossentropy(...)`](../tfa/losses/sigmoid_focal_crossentropy.md): Implements the focal loss function.
+
+[`sparsemax_loss(...)`](../tfa/losses/sparsemax_loss.md): Sparsemax loss function [1].
+
+[`triplet_semihard_loss(...)`](../tfa/losses/triplet_semihard_loss.md): Computes the triplet loss with semi-hard negative mining.
+
diff --git a/docs/api_docs/python/tfa/losses/ContrastiveLoss.md b/docs/api_docs/python/tfa/losses/ContrastiveLoss.md
new file mode 100644
index 0000000000..5c9e4dcf48
--- /dev/null
+++ b/docs/api_docs/python/tfa/losses/ContrastiveLoss.md
@@ -0,0 +1,149 @@
+
+
+# tfa.losses.ContrastiveLoss
+
+## Class `ContrastiveLoss`
+
+Computes the contrastive loss between `y_true` and `y_pred`.
+
+
+
+### Aliases:
+
+* Class `tfa.losses.ContrastiveLoss`
+* Class `tfa.losses.contrastive.ContrastiveLoss`
+
+
+
+Defined in [`losses/contrastive.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/losses/contrastive.py).
+
+
+
+This loss encourages the embeddings to be close to each other for
+samples of the same label, and to be at least the margin constant apart
+for samples of different labels.
+
+See: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
+
+We expect labels `y_true` to be provided as a 1-D integer `Tensor`
+with shape [batch_size] of binary integer labels, and `y_pred` as a
+1-D float `Tensor` with shape [batch_size] of distances between two
+embedding matrices.
+
+The euclidean distances `y_pred` between two embedding matrices
+`a` and `b` with shape [batch_size, hidden_size] can be computed
+as follows:
+
+```python
+# y_pred = \sqrt (\sum_i (a[:, i] - b[:, i])^2)
+y_pred = tf.linalg.norm(a - b, axis=1)
+```
+
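+For illustration, a minimal end-to-end sketch; the embeddings, batch and
+feature sizes, and labels below are all made up:
+
+```python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+# Hypothetical paired embeddings, shape [batch_size, hidden_size].
+a = tf.random.normal([4, 128])
+b = tf.random.normal([4, 128])
+
+y_pred = tf.linalg.norm(a - b, axis=1)    # distances, shape [4]
+y_true = tf.constant([1, 1, 0, 0])        # 1 = same label, 0 = different
+
+loss = tfa.losses.ContrastiveLoss(margin=1.0)(y_true, y_pred)
+```
+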
+#### Args:
+
+
+* `margin`: `Float`, margin term in the loss definition.
+ Default value is 1.0.
+* `reduction`: (Optional) Type of `tf.keras.losses.Reduction` to apply.
+ Default value is `SUM_OVER_BATCH_SIZE`.
+* `name`: (Optional) name for the loss.
+
+__init__
+
+``` python
+__init__(
+ margin=1.0,
+ reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE,
+    name='contrastive_loss'
+)
+```
+
+
+
+
+
+
+## Methods
+
+__call__
+
+``` python
+__call__(
+ y_true,
+ y_pred,
+ sample_weight=None
+)
+```
+
+Invokes the `Loss` instance.
+
+
+#### Args:
+
+
+* `y_true`: Ground truth values.
+* `y_pred`: The predicted values.
+* `sample_weight`: Optional `Tensor` whose rank is either 0, or the same rank
+ as `y_true`, or is broadcastable to `y_true`. `sample_weight` acts as a
+ coefficient for the loss. If a scalar is provided, then the loss is
+ simply scaled by the given value. If `sample_weight` is a tensor of size
+ `[batch_size]`, then the total loss for each sample of the batch is
+ rescaled by the corresponding element in the `sample_weight` vector. If
+ the shape of `sample_weight` matches the shape of `y_pred`, then the
+ loss of each measurable element of `y_pred` is scaled by the
+ corresponding value of `sample_weight`.
+
+
+#### Returns:
+
+Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
+ shape as `y_true`; otherwise, it is scalar.
+
+
+
+#### Raises:
+
+
+* `ValueError`: If the shape of `sample_weight` is invalid.
+
+from_config
+
+``` python
+from_config(
+ cls,
+ config
+)
+```
+
+Instantiates a `Loss` from its config (output of `get_config()`).
+
+
+#### Args:
+
+
+* `config`: Output of `get_config()`.
+
+
+#### Returns:
+
+A `Loss` instance.
+
+
+get_config
+
+``` python
+get_config()
+```
+
+
+
+
+
+
diff --git a/docs/api_docs/python/tfa/losses/LiftedStructLoss.md b/docs/api_docs/python/tfa/losses/LiftedStructLoss.md
new file mode 100644
index 0000000000..2f7329e147
--- /dev/null
+++ b/docs/api_docs/python/tfa/losses/LiftedStructLoss.md
@@ -0,0 +1,131 @@
+
+
+
+
+
+
+
+
+
+# tfa.losses.LiftedStructLoss
+
+## Class `LiftedStructLoss`
+
+Computes the lifted structured loss.
+
+
+
+### Aliases:
+
+* Class `tfa.losses.LiftedStructLoss`
+* Class `tfa.losses.lifted.LiftedStructLoss`
+
+
+
+Defined in [`losses/lifted.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/losses/lifted.py).
+
+
+
+The loss encourages the positive distances (between a pair of embeddings
+with the same labels) to be smaller than any negative distances (between
+a pair of embeddings with different labels) in the mini-batch in a way
+that is differentiable with respect to the embedding vectors.
+See: https://arxiv.org/abs/1511.06452.
+
+#### Args:
+
+
+* `margin`: Float, margin term in the loss definition.
+* `name`: Optional name for the op.
+
+__init__
+
+``` python
+__init__(
+ margin=1.0,
+ name=None
+)
+```
+
+
+
+
+
+
+## Methods
+
+__call__
+
+``` python
+__call__(
+ y_true,
+ y_pred,
+ sample_weight=None
+)
+```
+
+Invokes the `Loss` instance.
+
+
+#### Args:
+
+
+* `y_true`: Ground truth values.
+* `y_pred`: The predicted values.
+* `sample_weight`: Optional `Tensor` whose rank is either 0, or the same rank
+ as `y_true`, or is broadcastable to `y_true`. `sample_weight` acts as a
+ coefficient for the loss. If a scalar is provided, then the loss is
+ simply scaled by the given value. If `sample_weight` is a tensor of size
+ `[batch_size]`, then the total loss for each sample of the batch is
+ rescaled by the corresponding element in the `sample_weight` vector. If
+ the shape of `sample_weight` matches the shape of `y_pred`, then the
+ loss of each measurable element of `y_pred` is scaled by the
+ corresponding value of `sample_weight`.
+
+
+#### Returns:
+
+Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
+ shape as `y_true`; otherwise, it is scalar.
+
+
+
+#### Raises:
+
+
+* `ValueError`: If the shape of `sample_weight` is invalid.
+
+from_config
+
+``` python
+from_config(
+ cls,
+ config
+)
+```
+
+Instantiates a `Loss` from its config (output of `get_config()`).
+
+
+#### Args:
+
+
+* `config`: Output of `get_config()`.
+
+
+#### Returns:
+
+A `Loss` instance.
+
+
+get_config
+
+``` python
+get_config()
+```
+
+
+
+
+
+
diff --git a/docs/api_docs/python/tfa/losses/SigmoidFocalCrossEntropy.md b/docs/api_docs/python/tfa/losses/SigmoidFocalCrossEntropy.md
new file mode 100644
index 0000000000..d63ac959fb
--- /dev/null
+++ b/docs/api_docs/python/tfa/losses/SigmoidFocalCrossEntropy.md
@@ -0,0 +1,168 @@
+
+
+
+
+
+
+
+
+
+# tfa.losses.SigmoidFocalCrossEntropy
+
+## Class `SigmoidFocalCrossEntropy`
+
+Implements the focal loss function.
+
+
+
+### Aliases:
+
+* Class `tfa.losses.SigmoidFocalCrossEntropy`
+* Class `tfa.losses.focal_loss.SigmoidFocalCrossEntropy`
+
+
+
+Defined in [`losses/focal_loss.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/losses/focal_loss.py).
+
+
+
+Focal loss was first introduced in the RetinaNet paper
+(https://arxiv.org/pdf/1708.02002.pdf). Focal loss is extremely useful for
+classification when you have highly imbalanced classes. It down-weights
+well-classified examples and focuses on hard examples. The loss value is
+much higher for a sample that is misclassified by the classifier than
+for a well-classified example. One of the
+best use-cases of focal loss is its usage in object detection where the
+imbalance between the background class and other classes is extremely high.
+
+#### Usage:
+
+
+
+```python
+fl = tfa.losses.SigmoidFocalCrossEntropy()
+loss = fl(
+    y_true=[[1.0], [1.0], [0.0]],
+    y_pred=[[0.97], [0.91], [0.03]])
+print('Loss: ', loss.numpy())
+# Loss: [[0.03045921]
+#        [0.09431068]
+#        [0.31471074]]
+```
+Usage with tf.keras API:
+
+```python
+model = tf.keras.Model(inputs, outputs)
+model.compile('sgd', loss=tfa.losses.SigmoidFocalCrossEntropy())
+```
+
+#### Args:
+
+
+* `alpha`: balancing factor, default value is 0.25.
+* `gamma`: modulating factor, default value is 2.0.
+
+#### Returns:
+
+Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
+ shape as `y_true`; otherwise, it is scalar.
+
+
+
+#### Raises:
+
+
+* `ValueError`: If the shape of `sample_weight` is invalid or value of
+ `gamma` is less than zero
+
+__init__
+
+``` python
+__init__(
+ from_logits=False,
+ alpha=0.25,
+ gamma=2.0,
+ reduction=tf.keras.losses.Reduction.NONE,
+ name='sigmoid_focal_crossentropy'
+)
+```
+
+
+
+
+
+
+## Methods
+
+__call__
+
+``` python
+__call__(
+ y_true,
+ y_pred,
+ sample_weight=None
+)
+```
+
+Invokes the `Loss` instance.
+
+
+#### Args:
+
+
+* `y_true`: Ground truth values.
+* `y_pred`: The predicted values.
+* `sample_weight`: Optional `Tensor` whose rank is either 0, or the same rank
+ as `y_true`, or is broadcastable to `y_true`. `sample_weight` acts as a
+ coefficient for the loss. If a scalar is provided, then the loss is
+ simply scaled by the given value. If `sample_weight` is a tensor of size
+ `[batch_size]`, then the total loss for each sample of the batch is
+ rescaled by the corresponding element in the `sample_weight` vector. If
+ the shape of `sample_weight` matches the shape of `y_pred`, then the
+ loss of each measurable element of `y_pred` is scaled by the
+ corresponding value of `sample_weight`.
+
+
+#### Returns:
+
+Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
+ shape as `y_true`; otherwise, it is scalar.
+
+
+
+#### Raises:
+
+
+* `ValueError`: If the shape of `sample_weight` is invalid.
+
+from_config
+
+``` python
+from_config(
+ cls,
+ config
+)
+```
+
+Instantiates a `Loss` from its config (output of `get_config()`).
+
+
+#### Args:
+
+
+* `config`: Output of `get_config()`.
+
+
+#### Returns:
+
+A `Loss` instance.
+
+
+get_config
+
+``` python
+get_config()
+```
+
+
+
+
+
+
diff --git a/docs/api_docs/python/tfa/losses/SparsemaxLoss.md b/docs/api_docs/python/tfa/losses/SparsemaxLoss.md
new file mode 100644
index 0000000000..d58aeafa86
--- /dev/null
+++ b/docs/api_docs/python/tfa/losses/SparsemaxLoss.md
@@ -0,0 +1,133 @@
+
+
+
+
+
+
+
+
+
+# tfa.losses.SparsemaxLoss
+
+## Class `SparsemaxLoss`
+
+Sparsemax loss function.
+
+
+
+
+
+Defined in [`losses/sparsemax_loss.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/losses/sparsemax_loss.py).
+
+
+
+Computes the generalized multi-label classification loss for the sparsemax
+function.
+
+Because the sparsemax loss function needs both the probability output and
+the logits to compute the loss value, `from_logits` must be `True`.
+
+Because it computes the generalized multi-label loss, the shape of both
+`y_pred` and `y_true` must be `[batch_size, num_classes]`.
+
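+A minimal compile-time sketch (the model below is hypothetical; note the
+linear output layer, since `from_logits=True` expects raw logits):
+
+```python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+# Hypothetical classifier emitting raw logits over 10 classes.
+model = tf.keras.Sequential([
+    tf.keras.layers.Dense(64, activation='relu', input_shape=(32,)),
+    tf.keras.layers.Dense(10)   # logits; no softmax/sparsemax here
+])
+model.compile(optimizer='sgd',
+              loss=tfa.losses.SparsemaxLoss(from_logits=True))
+```
+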
+#### Args:
+
+
+* `from_logits`: Whether `y_pred` is expected to be a logits tensor. Default
+ is `True`, meaning `y_pred` is the logits.
+* `reduction`: (Optional) Type of `tf.keras.losses.Reduction` to apply to
+ loss. Default value is `SUM_OVER_BATCH_SIZE`.
+* `name`: Optional name for the op.
+
+__init__
+
+``` python
+__init__(
+ from_logits=True,
+ reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE,
+ name='sparsemax_loss'
+)
+```
+
+
+
+
+
+
+## Methods
+
+__call__
+
+``` python
+__call__(
+ y_true,
+ y_pred,
+ sample_weight=None
+)
+```
+
+Invokes the `Loss` instance.
+
+
+#### Args:
+
+
+* `y_true`: Ground truth values.
+* `y_pred`: The predicted values.
+* `sample_weight`: Optional `Tensor` whose rank is either 0, or the same rank
+ as `y_true`, or is broadcastable to `y_true`. `sample_weight` acts as a
+ coefficient for the loss. If a scalar is provided, then the loss is
+ simply scaled by the given value. If `sample_weight` is a tensor of size
+ `[batch_size]`, then the total loss for each sample of the batch is
+ rescaled by the corresponding element in the `sample_weight` vector. If
+ the shape of `sample_weight` matches the shape of `y_pred`, then the
+ loss of each measurable element of `y_pred` is scaled by the
+ corresponding value of `sample_weight`.
+
+
+#### Returns:
+
+Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
+ shape as `y_true`; otherwise, it is scalar.
+
+
+
+#### Raises:
+
+
+* `ValueError`: If the shape of `sample_weight` is invalid.
+
+from_config
+
+``` python
+from_config(
+ cls,
+ config
+)
+```
+
+Instantiates a `Loss` from its config (output of `get_config()`).
+
+
+#### Args:
+
+
+* `config`: Output of `get_config()`.
+
+
+#### Returns:
+
+A `Loss` instance.
+
+
+get_config
+
+``` python
+get_config()
+```
+
+
+
+
+
+
diff --git a/docs/api_docs/python/tfa/losses/TripletSemiHardLoss.md b/docs/api_docs/python/tfa/losses/TripletSemiHardLoss.md
new file mode 100644
index 0000000000..34e9b9a48a
--- /dev/null
+++ b/docs/api_docs/python/tfa/losses/TripletSemiHardLoss.md
@@ -0,0 +1,136 @@
+
+
+
+
+
+
+
+
+
+# tfa.losses.TripletSemiHardLoss
+
+## Class `TripletSemiHardLoss`
+
+Computes the triplet loss with semi-hard negative mining.
+
+
+
+### Aliases:
+
+* Class `tfa.losses.TripletSemiHardLoss`
+* Class `tfa.losses.triplet.TripletSemiHardLoss`
+
+
+
+Defined in [`losses/triplet.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/losses/triplet.py).
+
+
+
+The loss encourages the positive distances (between a pair of embeddings
+with the same labels) to be smaller than the minimum negative distance
+among those negatives that are at least greater than the positive
+distance plus the margin constant (called semi-hard negatives) in the
+mini-batch. If no such negative exists, the largest negative distance is
+used instead. See: https://arxiv.org/abs/1503.03832.
+
+We expect labels `y_true` to be provided as a 1-D integer `Tensor` with
+shape [batch_size] of multi-class integer labels, and embeddings `y_pred`
+as a 2-D float `Tensor` of l2-normalized embedding vectors.
+
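+A minimal sketch with random embeddings, purely for illustration (in
+practice `y_pred` comes from a model and must be l2-normalized):
+
+```python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+# Illustrative only: 8 samples, 16-dim embeddings, 3 classes.
+y_true = tf.constant([0, 1, 2, 0, 1, 2, 0, 1])
+y_pred = tf.math.l2_normalize(tf.random.normal([8, 16]), axis=1)
+
+loss = tfa.losses.TripletSemiHardLoss(margin=1.0)(y_true, y_pred)
+```
+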
+#### Args:
+
+
+* `margin`: Float, margin term in the loss definition. Default value is 1.0.
+* `name`: Optional name for the op.
+
+__init__
+
+``` python
+__init__(
+ margin=1.0,
+ name=None
+)
+```
+
+
+
+
+
+
+## Methods
+
+__call__
+
+``` python
+__call__(
+ y_true,
+ y_pred,
+ sample_weight=None
+)
+```
+
+Invokes the `Loss` instance.
+
+
+#### Args:
+
+
+* `y_true`: Ground truth values.
+* `y_pred`: The predicted values.
+* `sample_weight`: Optional `Tensor` whose rank is either 0, or the same rank
+ as `y_true`, or is broadcastable to `y_true`. `sample_weight` acts as a
+ coefficient for the loss. If a scalar is provided, then the loss is
+ simply scaled by the given value. If `sample_weight` is a tensor of size
+ `[batch_size]`, then the total loss for each sample of the batch is
+ rescaled by the corresponding element in the `sample_weight` vector. If
+ the shape of `sample_weight` matches the shape of `y_pred`, then the
+ loss of each measurable element of `y_pred` is scaled by the
+ corresponding value of `sample_weight`.
+
+
+#### Returns:
+
+Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
+ shape as `y_true`; otherwise, it is scalar.
+
+
+
+#### Raises:
+
+
+* `ValueError`: If the shape of `sample_weight` is invalid.
+
+from_config
+
+``` python
+from_config(
+ cls,
+ config
+)
+```
+
+Instantiates a `Loss` from its config (output of `get_config()`).
+
+
+#### Args:
+
+
+* `config`: Output of `get_config()`.
+
+
+#### Returns:
+
+A `Loss` instance.
+
+
+get_config
+
+``` python
+get_config()
+```
+
+
+
+
+
+
diff --git a/docs/api_docs/python/tfa/losses/contrastive.md b/docs/api_docs/python/tfa/losses/contrastive.md
new file mode 100644
index 0000000000..2a46f15249
--- /dev/null
+++ b/docs/api_docs/python/tfa/losses/contrastive.md
@@ -0,0 +1,24 @@
+
+
+
+
+
+# Module: tfa.losses.contrastive
+
+Implements contrastive loss.
+
+
+
+Defined in [`losses/contrastive.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/losses/contrastive.py).
+
+
+
+
+## Classes
+
+[`class ContrastiveLoss`](../../tfa/losses/ContrastiveLoss.md): Computes the contrastive loss between `y_true` and `y_pred`.
+
+## Functions
+
+[`contrastive_loss(...)`](../../tfa/losses/contrastive_loss.md): Computes the contrastive loss between `y_true` and `y_pred`.
+
diff --git a/docs/api_docs/python/tfa/losses/contrastive_loss.md b/docs/api_docs/python/tfa/losses/contrastive_loss.md
new file mode 100644
index 0000000000..fbfcb80da7
--- /dev/null
+++ b/docs/api_docs/python/tfa/losses/contrastive_loss.md
@@ -0,0 +1,57 @@
+
+
+
+
+
+# tfa.losses.contrastive_loss
+
+Computes the contrastive loss between `y_true` and `y_pred`.
+
+### Aliases:
+
+* `tfa.losses.contrastive.contrastive_loss`
+* `tfa.losses.contrastive_loss`
+
+``` python
+tfa.losses.contrastive_loss(
+ y_true,
+ y_pred,
+ margin=1.0
+)
+```
+
+
+
+Defined in [`losses/contrastive.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/losses/contrastive.py).
+
+
+
+This loss encourages the embeddings to be close to each other for
+samples of the same label, and to be at least the margin constant apart
+for samples of different labels.
+
+The euclidean distances `y_pred` between two embedding matrices
+`a` and `b` with shape [batch_size, hidden_size] can be computed
+as follows:
+
+```python
+# y_pred = \sqrt (\sum_i (a[:, i] - b[:, i])^2)
+y_pred = tf.linalg.norm(a - b, axis=1)
+```
+
+See: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
+
+#### Args:
+
+
+* `y_true`: 1-D integer `Tensor` with shape [batch_size] of
+ binary labels indicating positive vs negative pair.
+* `y_pred`: 1-D float `Tensor` with shape [batch_size] of
+ distances between two embedding matrices.
+* `margin`: margin term in the loss definition.
+
+
+#### Returns:
+
+
+* `contrastive_loss`: 1-D float `Tensor` with shape [batch_size].
\ No newline at end of file
diff --git a/docs/api_docs/python/tfa/losses/focal_loss.md b/docs/api_docs/python/tfa/losses/focal_loss.md
new file mode 100644
index 0000000000..7cee5cd250
--- /dev/null
+++ b/docs/api_docs/python/tfa/losses/focal_loss.md
@@ -0,0 +1,24 @@
+
+
+
+
+
+# Module: tfa.losses.focal_loss
+
+Implements Focal loss.
+
+
+
+Defined in [`losses/focal_loss.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/losses/focal_loss.py).
+
+
+
+
+## Classes
+
+[`class SigmoidFocalCrossEntropy`](../../tfa/losses/SigmoidFocalCrossEntropy.md): Implements the focal loss function.
+
+## Functions
+
+[`sigmoid_focal_crossentropy(...)`](../../tfa/losses/sigmoid_focal_crossentropy.md): Computes the sigmoid focal cross-entropy loss.
+
diff --git a/docs/api_docs/python/tfa/losses/lifted.md b/docs/api_docs/python/tfa/losses/lifted.md
new file mode 100644
index 0000000000..17ccfdaea1
--- /dev/null
+++ b/docs/api_docs/python/tfa/losses/lifted.md
@@ -0,0 +1,24 @@
+
+
+
+
+
+# Module: tfa.losses.lifted
+
+Implements lifted_struct_loss.
+
+
+
+Defined in [`losses/lifted.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/losses/lifted.py).
+
+
+
+
+## Classes
+
+[`class LiftedStructLoss`](../../tfa/losses/LiftedStructLoss.md): Computes the lifted structured loss.
+
+## Functions
+
+[`lifted_struct_loss(...)`](../../tfa/losses/lifted_struct_loss.md): Computes the lifted structured loss.
+
diff --git a/docs/api_docs/python/tfa/losses/lifted_struct_loss.md b/docs/api_docs/python/tfa/losses/lifted_struct_loss.md
new file mode 100644
index 0000000000..269e1759de
--- /dev/null
+++ b/docs/api_docs/python/tfa/losses/lifted_struct_loss.md
@@ -0,0 +1,43 @@
+
+
+
+
+
+# tfa.losses.lifted_struct_loss
+
+Computes the lifted structured loss.
+
+### Aliases:
+
+* `tfa.losses.lifted.lifted_struct_loss`
+* `tfa.losses.lifted_struct_loss`
+
+``` python
+tfa.losses.lifted_struct_loss(
+ labels,
+ embeddings,
+ margin=1.0
+)
+```
+
+
+
+Defined in [`losses/lifted.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/losses/lifted.py).
+
+
+
+
+#### Args:
+
+
+* `labels`: 1-D tf.int32 `Tensor` with shape [batch_size] of
+ multiclass integer labels.
+* `embeddings`: 2-D float `Tensor` of embedding vectors. Embeddings should
+ not be l2 normalized.
+* `margin`: Float, margin term in the loss definition.
+
+
+#### Returns:
+
+
+* `lifted_loss`: tf.float32 scalar.
\ No newline at end of file
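+
+A minimal sketch with random embeddings (illustrative only; the
+embeddings are deliberately left un-normalized, as required above):
+
+```python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+labels = tf.constant([0, 0, 1, 1], dtype=tf.int32)
+embeddings = tf.random.normal([4, 32])   # not l2-normalized
+loss = tfa.losses.lifted_struct_loss(labels, embeddings, margin=1.0)
+```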
diff --git a/docs/api_docs/python/tfa/losses/metric_learning.md b/docs/api_docs/python/tfa/losses/metric_learning.md
new file mode 100644
index 0000000000..87853e9490
--- /dev/null
+++ b/docs/api_docs/python/tfa/losses/metric_learning.md
@@ -0,0 +1,20 @@
+
+
+
+
+
+# Module: tfa.losses.metric_learning
+
+Functions of metric learning.
+
+
+
+Defined in [`losses/metric_learning.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/losses/metric_learning.py).
+
+
+
+
+## Functions
+
+[`pairwise_distance(...)`](../../tfa/losses/metric_learning/pairwise_distance.md): Computes the pairwise distance matrix with numerical stability.
+
diff --git a/docs/api_docs/python/tfa/losses/metric_learning/pairwise_distance.md b/docs/api_docs/python/tfa/losses/metric_learning/pairwise_distance.md
new file mode 100644
index 0000000000..0b2e1e459a
--- /dev/null
+++ b/docs/api_docs/python/tfa/losses/metric_learning/pairwise_distance.md
@@ -0,0 +1,35 @@
+
+
+
+
+
+# tfa.losses.metric_learning.pairwise_distance
+
+Computes the pairwise distance matrix with numerical stability.
+
+``` python
+tfa.losses.metric_learning.pairwise_distance(
+ feature,
+ squared=False
+)
+```
+
+
+
+Defined in [`losses/metric_learning.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/losses/metric_learning.py).
+
+
+
+output[i, j] = || feature[i, :] - feature[j, :] ||_2
+
+#### Args:
+
+
+* `feature`: 2-D Tensor of size [number of data, feature dimension].
+* `squared`: Boolean, whether or not to square the pairwise distances.
+
+
+#### Returns:
+
+
+* `pairwise_distances`: 2-D Tensor of size [number of data, number of data].
\ No newline at end of file
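+
+For a quick sanity check, a sketch with made-up coordinates (the two
+points form a 3-4-5 triangle, so their distance is 5):
+
+```python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+x = tf.constant([[0.0, 0.0],
+                 [3.0, 4.0]])   # two 2-D feature vectors
+d = tfa.losses.metric_learning.pairwise_distance(x)
+# d is approximately [[0., 5.], [5., 0.]]
+```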
diff --git a/docs/api_docs/python/tfa/losses/sigmoid_focal_crossentropy.md b/docs/api_docs/python/tfa/losses/sigmoid_focal_crossentropy.md
new file mode 100644
index 0000000000..fc27310f35
--- /dev/null
+++ b/docs/api_docs/python/tfa/losses/sigmoid_focal_crossentropy.md
@@ -0,0 +1,38 @@
+
+
+
+
+
+# tfa.losses.sigmoid_focal_crossentropy
+
+Computes the sigmoid focal cross-entropy loss.
+
+### Aliases:
+
+* `tfa.losses.focal_loss.sigmoid_focal_crossentropy`
+* `tfa.losses.sigmoid_focal_crossentropy`
+
+``` python
+tfa.losses.sigmoid_focal_crossentropy(
+ y_true,
+ y_pred,
+ alpha=0.25,
+ gamma=2.0,
+ from_logits=False
+)
+```
+
+
+
+Defined in [`losses/focal_loss.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/losses/focal_loss.py).
+
+
+#### Args:
+
+
+* `y_true`: true targets tensor.
+* `y_pred`: predictions tensor.
+* `alpha`: balancing factor.
+* `gamma`: modulating factor.
+
+#### Returns:
+
+Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the
+same shape as `y_true`; otherwise, it is scalar.
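+
+A minimal sketch with made-up targets and logits (here `from_logits=True`,
+so the second argument holds raw logits):
+
+```python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+y_true = tf.constant([[1.0], [1.0], [0.0]])
+logits = tf.constant([[2.0], [1.0], [-1.5]])   # hypothetical logits
+loss = tfa.losses.sigmoid_focal_crossentropy(
+    y_true, logits, alpha=0.25, gamma=2.0, from_logits=True)
+```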
diff --git a/docs/api_docs/python/tfa/losses/sparsemax_loss.md b/docs/api_docs/python/tfa/losses/sparsemax_loss.md
new file mode 100644
index 0000000000..5152b8ce68
--- /dev/null
+++ b/docs/api_docs/python/tfa/losses/sparsemax_loss.md
@@ -0,0 +1,44 @@
+
+
+
+
+
+# tfa.losses.sparsemax_loss
+
+Sparsemax loss function [1].
+
+``` python
+tfa.losses.sparsemax_loss(
+ logits,
+ sparsemax,
+ labels,
+ name=None
+)
+```
+
+
+
+Defined in [`losses/sparsemax_loss.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/losses/sparsemax_loss.py).
+
+
+
+Computes the generalized multi-label classification loss for the sparsemax
+function. The implementation is a reformulation of the original loss
+function such that it uses the sparsemax probability output instead of the
+internal τ variable. However, the output is identical to the original
+loss function.
+
+[1]: https://arxiv.org/abs/1602.02068
+
+#### Args:
+
+
+* `logits`: A `Tensor`. Must be one of the following types: `float32`,
+ `float64`.
+* `sparsemax`: A `Tensor`. Must have the same type as `logits`.
+* `labels`: A `Tensor`. Must have the same type as `logits`.
+* `name`: A name for the operation (optional).
+
+#### Returns:
+
+A `Tensor`. Has the same type as `logits`.
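+
+A minimal sketch; the `sparsemax` argument is the probability output
+obtained by applying `tfa.activations.sparsemax` to the same logits
+(values below are random and purely illustrative):
+
+```python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+logits = tf.random.normal([2, 5])
+probs = tfa.activations.sparsemax(logits)
+labels = tf.constant([[0., 0., 1., 0., 0.],
+                      [1., 0., 0., 0., 0.]])
+loss = tfa.losses.sparsemax_loss(logits, probs, labels)
+```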
diff --git a/docs/api_docs/python/tfa/losses/triplet.md b/docs/api_docs/python/tfa/losses/triplet.md
new file mode 100644
index 0000000000..0b392ac50e
--- /dev/null
+++ b/docs/api_docs/python/tfa/losses/triplet.md
@@ -0,0 +1,24 @@
+
+
+
+
+
+# Module: tfa.losses.triplet
+
+Implements triplet loss.
+
+
+
+Defined in [`losses/triplet.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/losses/triplet.py).
+
+
+
+
+## Classes
+
+[`class TripletSemiHardLoss`](../../tfa/losses/TripletSemiHardLoss.md): Computes the triplet loss with semi-hard negative mining.
+
+## Functions
+
+[`triplet_semihard_loss(...)`](../../tfa/losses/triplet_semihard_loss.md): Computes the triplet loss with semi-hard negative mining.
+
diff --git a/docs/api_docs/python/tfa/losses/triplet_semihard_loss.md b/docs/api_docs/python/tfa/losses/triplet_semihard_loss.md
new file mode 100644
index 0000000000..0c040e1f40
--- /dev/null
+++ b/docs/api_docs/python/tfa/losses/triplet_semihard_loss.md
@@ -0,0 +1,37 @@
+
+
+
+
+
+# tfa.losses.triplet_semihard_loss
+
+Computes the triplet loss with semi-hard negative mining.
+
+### Aliases:
+
+* `tfa.losses.triplet.triplet_semihard_loss`
+* `tfa.losses.triplet_semihard_loss`
+
+``` python
+tfa.losses.triplet_semihard_loss(
+ y_true,
+ y_pred,
+ margin=1.0
+)
+```
+
+
+
+Defined in [`losses/triplet.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/losses/triplet.py).
+
+
+
+
+#### Args:
+
+
+* `y_true`: 1-D integer `Tensor` with shape [batch_size] of
+ multiclass integer labels.
+* `y_pred`: 2-D float `Tensor` of embedding vectors. Embeddings should
+ be l2 normalized.
+* `margin`: Float, margin term in the loss definition.
\ No newline at end of file
diff --git a/docs/api_docs/python/tfa/metrics.md b/docs/api_docs/python/tfa/metrics.md
new file mode 100644
index 0000000000..ffe2d4da26
--- /dev/null
+++ b/docs/api_docs/python/tfa/metrics.md
@@ -0,0 +1,24 @@
+
+
+
+
+
+# Module: tfa.metrics
+
+A module containing metrics that conform to Keras API.
+
+
+
+Defined in [`metrics/__init__.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/metrics/__init__.py).
+
+
+
+
+## Modules
+
+[`cohens_kappa`](../tfa/metrics/cohens_kappa.md) module: Implements Cohen's Kappa.
+
+## Classes
+
+[`class CohenKappa`](../tfa/metrics/CohenKappa.md): Computes Kappa score between two raters.
+
diff --git a/docs/api_docs/python/tfa/metrics/CohenKappa.md b/docs/api_docs/python/tfa/metrics/CohenKappa.md
new file mode 100644
index 0000000000..b42fcfd6af
--- /dev/null
+++ b/docs/api_docs/python/tfa/metrics/CohenKappa.md
@@ -0,0 +1,911 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# tfa.metrics.CohenKappa
+
+## Class `CohenKappa`
+
+Computes Kappa score between two raters.
+
+
+
+### Aliases:
+
+* Class `tfa.metrics.CohenKappa`
+* Class `tfa.metrics.cohens_kappa.CohenKappa`
+
+
+
+Defined in [`metrics/cohens_kappa.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/metrics/cohens_kappa.py).
+
+
+
+The score lies in the range [-1, 1]. A score of -1 represents
+complete disagreement between two raters whereas a score of 1
+represents complete agreement between the two raters.
+A score of 0 means agreement by chance.
+
+Note: As of now, this implementation considers all labels
+while calculating the Cohen's Kappa score.
+
+#### Usage:
+
+
+```python
+actuals = np.array([4, 4, 3, 4, 2, 4, 1, 1], dtype=np.int32)
+preds = np.array([4, 4, 3, 4, 4, 2, 1, 1], dtype=np.int32)
+
+m = tfa.metrics.CohenKappa(num_classes=5, weightage="quadratic")
+m.update_state(actuals, preds)
+print('Final result: ', m.result().numpy()) # Result: 0.68932
+```
+Usage with tf.keras API:
+
+```python
+model = tf.keras.Model(inputs, outputs)
+model.add_metric(tfa.metrics.CohenKappa(num_classes=5)(outputs))
+model.compile('sgd', loss='mse')
+```
+
+#### Args:
+
+
+* `num_classes`: Number of unique classes in your dataset
+* `weightage`: Weighting to be considered for calculating
+ kappa statistics. A valid value is one of
+ [None, 'linear', 'quadratic']. Defaults to None.
+
+
+#### Returns:
+
+
+* `kappa_score`: float
+ The kappa statistic, which is a number between -1 and 1. The maximum
+ value means complete agreement; zero or lower means chance agreement.
+
+
+#### Raises:
+
+
+* `ValueError`: If the value passed for `weightage` is invalid,
+  i.e. not one of [None, 'linear', 'quadratic'].
+
+__init__
+
+``` python
+__init__(
+ num_classes,
+ name='cohen_kappa',
+ weightage=None,
+ dtype=tf.float32
+)
+```
+
+
+
+
+
+
+## Properties
+
+activity_regularizer
+
+Optional regularizer function for the output of this layer.
+
+
+dtype
+
+
+
+
+dynamic
+
+
+
+
+
+input
+
+Retrieves the input tensor(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input tensor or list of input tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+* `AttributeError`: If no inbound nodes are found.
+
+
+input_mask
+
+Retrieves the input mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input mask tensor (potentially None) or list of input
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+
+input_shape
+
+Retrieves the input shape(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer, or if all inputs
+have the same shape.
+
+#### Returns:
+
+Input shape, as an integer shape tuple
+(or list of shape tuples, one tuple per input tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined input_shape.
+* `RuntimeError`: if called in Eager mode.
+
+
+
+
+
+
+losses
+
+Losses which are associated with this `Layer`.
+
+Variable regularization tensors are created when this property is accessed,
+so it is eager safe: accessing `losses` under a `tf.GradientTape` will
+propagate gradients back to the corresponding variables.
+
+#### Returns:
+
+A list of tensors.
+
+
+metrics
+
+
+
+
+name
+
+
+
+
+name_scope
+
+Returns a `tf.name_scope` instance for this class.
+
+
+non_trainable_variables
+
+
+
+
+non_trainable_weights
+
+
+
+
+output
+
+Retrieves the output tensor(s) of a layer.
+
+Only applicable if the layer has exactly one output,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output tensor or list of output tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to more than one incoming
+ layers.
+* `RuntimeError`: if called in Eager mode.
+
+output_mask
+
+Retrieves the output mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output mask tensor (potentially None) or list of output
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+output_shape
+
+Retrieves the output shape(s) of a layer.
+
+Only applicable if the layer has one output,
+or if all outputs have the same shape.
+
+#### Returns:
+
+Output shape, as an integer shape tuple
+(or list of shape tuples, one tuple per output tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined output shape.
+* `RuntimeError`: if called in Eager mode.
+
+submodules
+
+Sequence of all sub-modules.
+
+Submodules are modules which are properties of this module, or found as
+properties of modules which are properties of this module (and so on).
+
+```
+a = tf.Module()
+b = tf.Module()
+c = tf.Module()
+a.b = b
+b.c = c
+assert list(a.submodules) == [b, c]
+assert list(b.submodules) == [c]
+assert list(c.submodules) == []
+```
+
+#### Returns:
+
+A sequence of all submodules.
+
+
+trainable
+
+
+
+
+trainable_variables
+
+
+
+
+trainable_weights
+
+
+
+
+updates
+
+
+
+
+variables
+
+Returns the list of all layer variables/weights.
+
+Alias of `self.weights`.
+
+#### Returns:
+
+A list of variables.
+
+
+weights
+
+Returns the list of all layer variables/weights.
+
+
+#### Returns:
+
+A list of variables.
+
+
+
+
+## Methods
+
+__call__
+
+``` python
+__call__(
+ *args,
+ **kwargs
+)
+```
+
+Accumulates statistics and then computes metric result value.
+
+
+#### Args:
+
+
+* `*args`, `**kwargs`: A mini-batch of inputs to the Metric,
+  passed on to `update_state()`.
+
+
+#### Returns:
+
+The metric value tensor.
+
+
+apply
+
+``` python
+apply(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Apply the layer on an input.
+
+This is an alias of `self.__call__`.
+
+#### Arguments:
+
+
+* `inputs`: Input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+build
+
+``` python
+build(input_shape)
+```
+
+Creates the variables of the layer (optional, for subclass implementers).
+
+This is a method that implementers of subclasses of `Layer` or `Model`
+can override if they need a state-creation step in-between
+layer instantiation and layer call.
+
+This is typically used to create the weights of `Layer` subclasses.
+
+#### Arguments:
+
+
+* `input_shape`: Instance of `TensorShape`, or list of instances of
+ `TensorShape` if the layer expects a list of inputs
+ (one instance per input).
+
+compute_mask
+
+``` python
+compute_mask(
+ inputs,
+ mask=None
+)
+```
+
+Computes an output mask tensor.
+
+
+#### Arguments:
+
+
+* `inputs`: Tensor or list of tensors.
+* `mask`: Tensor or list of tensors.
+
+
+#### Returns:
+
+None or a tensor (or list of tensors,
+ one per output tensor of the layer).
+
+
+compute_output_shape
+
+``` python
+compute_output_shape(input_shape)
+```
+
+Computes the output shape of the layer.
+
+Assumes that the layer will be built
+to match the input shape provided.
+
+#### Arguments:
+
+
+* `input_shape`: Shape tuple (tuple of integers)
+ or list of shape tuples (one per output tensor of the layer).
+ Shape tuples can include None for free dimensions,
+ instead of an integer.
+
+
+#### Returns:
+
+An output shape tuple.
+
+
+count_params
+
+``` python
+count_params()
+```
+
+Count the total number of scalars composing the weights.
+
+
+#### Returns:
+
+An integer count.
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer isn't yet built
+ (in which case its weights aren't yet defined).
+
+from_config
+
+``` python
+from_config(
+ cls,
+ config
+)
+```
+
+Creates a layer from its config.
+
+This method is the reverse of `get_config`,
+capable of instantiating the same layer from the config
+dictionary. It does not handle layer connectivity
+(handled by Network), nor weights (handled by `set_weights`).
+
+#### Arguments:
+
+
+* `config`: A Python dictionary, typically the
+ output of get_config.
+
+
+#### Returns:
+
+A layer instance.
+
+
+get_config
+
+``` python
+get_config()
+```
+
+Returns the serializable config of the metric.
+
+
+
+get_input_at
+
+``` python
+get_input_at(node_index)
+```
+
+Retrieves the input tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+
+get_input_mask_at
+
+``` python
+get_input_mask_at(node_index)
+```
+
+Retrieves the input mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple inputs).
+
+
+
+get_input_shape_at
+
+``` python
+get_input_shape_at(node_index)
+```
+
+Retrieves the input shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_losses_for
+
+``` python
+get_losses_for(inputs)
+```
+
+Retrieves losses relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of loss tensors of the layer that depend on `inputs`.
+
+
+get_output_at
+
+``` python
+get_output_at(node_index)
+```
+
+Retrieves the output tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_output_mask_at
+
+``` python
+get_output_mask_at(node_index)
+```
+
+Retrieves the output mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple outputs).
+
+
+get_output_shape_at
+
+``` python
+get_output_shape_at(node_index)
+```
+
+Retrieves the output shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_updates_for
+
+``` python
+get_updates_for(inputs)
+```
+
+Retrieves updates relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of update ops of the layer that depend on `inputs`.
+
+
+get_weights
+
+``` python
+get_weights()
+```
+
+Returns the current weights of the layer.
+
+
+#### Returns:
+
+Weights values as a list of numpy arrays.
+
+
+reset_states
+
+``` python
+reset_states()
+```
+
+Resets all of the metric state variables.
+
+
+result
+
+``` python
+result()
+```
+
+
+
+
+set_weights
+
+``` python
+set_weights(weights)
+```
+
+Sets the weights of the layer, from Numpy arrays.
+
+
+#### Arguments:
+
+
+* `weights`: a list of Numpy arrays. The number
+ of arrays and their shape must match
+ number of the dimensions of the weights
+ of the layer (i.e. it should match the
+ output of `get_weights`).
+
+
+#### Raises:
+
+
+* `ValueError`: If the provided weights list does not match the
+ layer's specifications.
+
+update_state
+
+``` python
+update_state(
+ y_true,
+ y_pred,
+ sample_weight=None
+)
+```
+
+Accumulates the confusion matrix condition statistics.
+
+
+#### Args:
+
+
+* `y_true`: array, shape = [n_samples]
+ Labels assigned by the first annotator.
+* `y_pred`: array, shape = [n_samples]
+ Labels assigned by the second annotator. The kappa statistic
+ is symmetric, so swapping ``y_true`` and ``y_pred`` doesn't
+ change the value.
+* `sample_weight`: (Optional) For weighting labels in the confusion
+  matrix. Default is None. The dtype for weights should be the same
+  as the dtype for the confusion matrix. For more details,
+  please check `tf.math.confusion_matrix`.
+
+
+
+#### Returns:
+
+Update op.
+
+
+with_name_scope
+
+``` python
+with_name_scope(
+ cls,
+ method
+)
+```
+
+Decorator to automatically enter the module name scope.
+
+```
+class MyModule(tf.Module):
+ @tf.Module.with_name_scope
+ def __call__(self, x):
+ if not hasattr(self, 'w'):
+ self.w = tf.Variable(tf.random.normal([x.shape[1], 64]))
+ return tf.matmul(x, self.w)
+```
+
+Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose
+names included the module name:
+
+```
+mod = MyModule()
+mod(tf.ones([8, 32]))
+# ==> <tf.Tensor: shape=(8, 64), ...>
+mod.w
+# ==> <tf.Variable 'my_module/w:0' shape=(32, 64), ...>
+```
+
+#### Args:
+
+
+* `method`: The method to wrap.
+
+
+#### Returns:
+
+The original method wrapped such that it enters the module's name scope.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/metrics/cohens_kappa.md b/docs/api_docs/python/tfa/metrics/cohens_kappa.md
new file mode 100644
index 0000000000..7c01111f0a
--- /dev/null
+++ b/docs/api_docs/python/tfa/metrics/cohens_kappa.md
@@ -0,0 +1,20 @@
+
+
+
+
+
+# Module: tfa.metrics.cohens_kappa
+
+Implements Cohen's Kappa.
+
+
+
+Defined in [`metrics/cohens_kappa.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/metrics/cohens_kappa.py).
+
+
+
+
+## Classes
+
+[`class CohenKappa`](../../tfa/metrics/CohenKappa.md): Computes Kappa score between two raters.
+
diff --git a/docs/api_docs/python/tfa/optimizers.md b/docs/api_docs/python/tfa/optimizers.md
new file mode 100644
index 0000000000..3dd4e6e411
--- /dev/null
+++ b/docs/api_docs/python/tfa/optimizers.md
@@ -0,0 +1,38 @@
+
+
+
+
+
+# Module: tfa.optimizers
+
+Additional optimizers that conform to Keras API.
+
+
+
+Defined in [`optimizers/__init__.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/optimizers/__init__.py).
+
+
+
+
+## Modules
+
+[`lazy_adam`](../tfa/optimizers/lazy_adam.md) module: Variant of the Adam optimizer that handles sparse updates more efficiently.
+
+[`moving_average`](../tfa/optimizers/moving_average.md) module
+
+[`weight_decay_optimizers`](../tfa/optimizers/weight_decay_optimizers.md) module: Base class to make optimizers weight decay ready.
+
+## Classes
+
+[`class AdamW`](../tfa/optimizers/AdamW.md): Optimizer that implements the Adam algorithm with weight decay.
+
+[`class LazyAdam`](../tfa/optimizers/LazyAdam.md): Variant of the Adam optimizer that handles sparse updates more efficiently.
+
+[`class MovingAverage`](../tfa/optimizers/MovingAverage.md): Optimizer that computes a moving average of the variables.
+
+[`class SGDW`](../tfa/optimizers/SGDW.md): Optimizer that implements the Momentum algorithm with weight_decay.
+
+## Functions
+
+[`extend_with_decoupled_weight_decay(...)`](../tfa/optimizers/extend_with_decoupled_weight_decay.md): Factory function returning an optimizer class with decoupled weight decay.
+
diff --git a/docs/api_docs/python/tfa/optimizers/AdamW.md b/docs/api_docs/python/tfa/optimizers/AdamW.md
new file mode 100644
index 0000000000..92265900d5
--- /dev/null
+++ b/docs/api_docs/python/tfa/optimizers/AdamW.md
@@ -0,0 +1,384 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# tfa.optimizers.AdamW
+
+## Class `AdamW`
+
+Optimizer that implements the Adam algorithm with weight decay.
+
+Inherits From: [`DecoupledWeightDecayExtension`](../../tfa/optimizers/weight_decay_optimizers/DecoupledWeightDecayExtension.md)
+
+### Aliases:
+
+* Class `tfa.optimizers.AdamW`
+* Class `tfa.optimizers.weight_decay_optimizers.AdamW`
+
+
+
+Defined in [`optimizers/weight_decay_optimizers.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/optimizers/weight_decay_optimizers.py).
+
+
+
+This is an implementation of the AdamW optimizer described in "Decoupled
+Weight Decay Regularization" by Loshchilov & Hutter
+(https://arxiv.org/abs/1711.05101)
+([pdf](https://arxiv.org/pdf/1711.05101.pdf)).
+
+It computes the update step of `tf.keras.optimizers.Adam` and additionally
+decays the variable. Note that this is different from adding L2
+regularization on the variables to the loss: it regularizes variables with
+large gradients more than L2 regularization would, which was shown to yield
+better training loss and generalization error in the paper above.
+
+For further information see the documentation of the Adam Optimizer.
+
+This optimizer can also be instantiated as
+```python
+extend_with_decoupled_weight_decay(tf.keras.optimizers.Adam,
+ weight_decay=weight_decay)
+```
+
+Note: when applying a decay to the learning rate, be sure to manually apply
+the decay to the `weight_decay` as well. For example:
+
+```python
+step = tf.Variable(0, trainable=False)
+schedule = tf.optimizers.schedules.PiecewiseConstantDecay(
+ [10000, 15000], [1e-0, 1e-1, 1e-2])
+# lr and wd can be a function or a tensor
+lr = 1e-1 * schedule(step)
+wd = lambda: 1e-4 * schedule(step)
+
+# ...
+
+optimizer = tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd)
+```
+
+__init__
+
+``` python
+__init__(
+ weight_decay,
+ learning_rate=0.001,
+ beta_1=0.9,
+ beta_2=0.999,
+ epsilon=1e-07,
+ amsgrad=False,
+ name='AdamW',
+ **kwargs
+)
+```
+
+Construct a new AdamW optimizer.
+
+For further information see the documentation of the Adam Optimizer.
+
+#### Args:
+
+
+* `weight_decay`: A Tensor or a floating point value. The weight decay.
+* `learning_rate`: A Tensor or a floating point value. The learning
+ rate.
+* `beta_1`: A float value or a constant float tensor. The exponential
+ decay rate for the 1st moment estimates.
+* `beta_2`: A float value or a constant float tensor. The exponential
+ decay rate for the 2nd moment estimates.
+* `epsilon`: A small constant for numerical stability. This epsilon is
+ "epsilon hat" in the Kingma and Ba paper (in the formula just
+ before Section 2.1), not the epsilon in Algorithm 1 of the
+ paper.
+* `amsgrad`: boolean. Whether to apply AMSGrad variant of this
+ algorithm from the paper "On the Convergence of Adam and
+ beyond".
+* `name`: Optional name for the operations created when applying
+ gradients. Defaults to "AdamW".
+* `**kwargs`: keyword arguments. Allowed to be {`clipnorm`,
+ `clipvalue`, `lr`, `decay`}. `clipnorm` is clip gradients by
+ norm; `clipvalue` is clip gradients by value, `decay` is
+ included for backward compatibility to allow time inverse decay
+ of learning rate. `lr` is included for backward compatibility,
+ recommended to use `learning_rate` instead.
+
+
+
+## Properties
+
+iterations
+
+Variable. The number of training steps this Optimizer has run.
+
+
+weights
+
+Returns variables of this Optimizer based on the order created.
+
+
+
+
+## Methods
+
+add_slot
+
+``` python
+add_slot(
+ var,
+ slot_name,
+ initializer='zeros'
+)
+```
+
+Add a new slot variable for `var`.
+
+
+add_weight
+
+``` python
+add_weight(
+ name,
+ shape,
+ dtype=None,
+ initializer='zeros',
+ trainable=None,
+ synchronization=tf_variables.VariableSynchronization.AUTO,
+ aggregation=tf_variables.VariableAggregation.NONE
+)
+```
+
+
+
+
+apply_gradients
+
+``` python
+apply_gradients(
+ grads_and_vars,
+ name=None,
+ decay_var_list=None
+)
+```
+
+Apply gradients to variables.
+
+This is the second part of `minimize()`. It returns an `Operation` that
+applies gradients.
+
+#### Args:
+
+
+* `grads_and_vars`: List of (gradient, variable) pairs.
+* `name`: Optional name for the returned operation. Defaults to the
+ name passed to the `Optimizer` constructor.
+* `decay_var_list`: Optional list of variables to be decayed. Defaults
+ to all variables in var_list.
+
+#### Returns:
+
+An `Operation` that applies the specified gradients. If
+`global_step` was not None, that operation also increments
+`global_step`.
+
+
+#### Raises:
+
+
+* `TypeError`: If `grads_and_vars` is malformed.
+* `ValueError`: If none of the variables have gradients.
+
+from_config
+
+``` python
+from_config(
+ cls,
+ config,
+ custom_objects=None
+)
+```
+
+Creates an optimizer from its config.
+
+This method is the reverse of `get_config`,
+capable of instantiating the same optimizer from the config
+dictionary.
+
+#### Arguments:
+
+
+* `config`: A Python dictionary, typically the output of get_config.
+* `custom_objects`: A Python dictionary mapping names to additional Python
+ objects used to create this optimizer, such as a function used for a
+ hyperparameter.
+
+
+#### Returns:
+
+An optimizer instance.
+
+
+get_config
+
+``` python
+get_config()
+```
+
+
+
+
+get_gradients
+
+``` python
+get_gradients(
+ loss,
+ params
+)
+```
+
+Returns gradients of `loss` with respect to `params`.
+
+
+#### Arguments:
+
+
+* `loss`: Loss tensor.
+* `params`: List of variables.
+
+
+#### Returns:
+
+List of gradient tensors.
+
+
+
+#### Raises:
+
+
+* `ValueError`: In case any gradient cannot be computed (e.g. if gradient
+ function not implemented).
+
+get_slot
+
+``` python
+get_slot(
+ var,
+ slot_name
+)
+```
+
+
+
+
+get_slot_names
+
+``` python
+get_slot_names()
+```
+
+A list of names for this optimizer's slots.
+
+
+get_updates
+
+``` python
+get_updates(
+ loss,
+ params
+)
+```
+
+
+
+
+get_weights
+
+``` python
+get_weights()
+```
+
+
+
+
+minimize
+
+``` python
+minimize(
+ loss,
+ var_list,
+ grad_loss=None,
+ name=None,
+ decay_var_list=None
+)
+```
+
+Minimize `loss` by updating `var_list`.
+
+This method simply computes gradient using `tf.GradientTape` and calls
+`apply_gradients()`. If you want to process the gradient before
+applying then call `tf.GradientTape` and `apply_gradients()` explicitly
+instead of using this function.
+
+#### Args:
+
+
+* `loss`: A callable taking no arguments which returns the value to
+ minimize.
+* `var_list`: list or tuple of `Variable` objects to update to
+ minimize `loss`, or a callable returning the list or tuple of
+ `Variable` objects. Use callable when the variable list would
+ otherwise be incomplete before `minimize` since the variables
+ are created at the first time `loss` is called.
+* `grad_loss`: Optional. A `Tensor` holding the gradient computed for
+ `loss`.
+* `decay_var_list`: Optional list of variables to be decayed. Defaults
+ to all variables in var_list.
+* `name`: Optional name for the returned operation.
+
+#### Returns:
+
+An Operation that updates the variables in `var_list`. If
+`global_step` was not `None`, that operation also increments
+`global_step`.
+
+
+#### Raises:
+
+
+* `ValueError`: If some of the variables are not `Variable` objects.
+
+set_weights
+
+``` python
+set_weights(weights)
+```
+
+
+
+
+variables
+
+``` python
+variables()
+```
+
+Returns variables of this Optimizer based on the order created.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/optimizers/LazyAdam.md b/docs/api_docs/python/tfa/optimizers/LazyAdam.md
new file mode 100644
index 0000000000..68439b093f
--- /dev/null
+++ b/docs/api_docs/python/tfa/optimizers/LazyAdam.md
@@ -0,0 +1,329 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# tfa.optimizers.LazyAdam
+
+## Class `LazyAdam`
+
+Variant of the Adam optimizer that handles sparse updates more efficiently.
+
+
+
+### Aliases:
+
+* Class `tfa.optimizers.LazyAdam`
+* Class `tfa.optimizers.lazy_adam.LazyAdam`
+
+
+
+Defined in [`optimizers/lazy_adam.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/optimizers/lazy_adam.py).
+
+
+
+The original Adam algorithm maintains two moving-average accumulators for
+each trainable variable; the accumulators are updated at every step.
+This class provides lazier handling of gradient updates for sparse
+variables. It only updates moving-average accumulators for sparse variable
+indices that appear in the current batch, rather than updating the
+accumulators for all indices. Compared with the original Adam optimizer,
+it can provide large improvements in model training throughput for some
+applications. However, it provides slightly different semantics than the
+original Adam algorithm, and may lead to different empirical results.
+
+Note: `amsgrad` is currently not supported, and the argument can only be
+`False`.
+
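+A minimal sketch of a setting where LazyAdam typically helps; the model
+below is hypothetical, and the sparse gradients come from the
+`Embedding` lookup:
+
+```python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+model = tf.keras.Sequential([
+    tf.keras.layers.Embedding(input_dim=10000, output_dim=64),
+    tf.keras.layers.GlobalAveragePooling1D(),
+    tf.keras.layers.Dense(1, activation='sigmoid'),
+])
+model.compile(optimizer=tfa.optimizers.LazyAdam(learning_rate=0.001),
+              loss='binary_crossentropy')
+```
+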
+__init__
+
+``` python
+__init__(
+ learning_rate=0.001,
+ beta_1=0.9,
+ beta_2=0.999,
+ epsilon=1e-07,
+ amsgrad=False,
+ name='LazyAdam',
+ **kwargs
+)
+```
+
+
+
+
+
+
+## Properties
+
+iterations
+
+Variable. The number of training steps this Optimizer has run.
+
+
+weights
+
+Returns variables of this Optimizer based on the order created.
+
+
+
+
+## Methods
+
+add_slot
+
+``` python
+add_slot(
+ var,
+ slot_name,
+ initializer='zeros'
+)
+```
+
+Add a new slot variable for `var`.
+
+
+add_weight
+
+``` python
+add_weight(
+ name,
+ shape,
+ dtype=None,
+ initializer='zeros',
+ trainable=None,
+ synchronization=tf_variables.VariableSynchronization.AUTO,
+ aggregation=tf_variables.VariableAggregation.NONE
+)
+```
+
+
+
+
+apply_gradients
+
+``` python
+apply_gradients(
+ grads_and_vars,
+ name=None
+)
+```
+
+Apply gradients to variables.
+
+This is the second part of `minimize()`. It returns an `Operation` that
+applies gradients.
+
+#### Args:
+
+
+* `grads_and_vars`: List of (gradient, variable) pairs.
+* `name`: Optional name for the returned operation. Defaults to the name
+ passed to the `Optimizer` constructor.
+
+
+#### Returns:
+
+An `Operation` that applies the specified gradients. If `global_step`
+was not None, that operation also increments `global_step`.
+
+
+
+#### Raises:
+
+
+* `TypeError`: If `grads_and_vars` is malformed.
+* `ValueError`: If none of the variables have gradients.
+
+from_config
+
+``` python
+from_config(
+ cls,
+ config,
+ custom_objects=None
+)
+```
+
+Creates an optimizer from its config.
+
+This method is the reverse of `get_config`,
+capable of instantiating the same optimizer from the config
+dictionary.
+
+#### Arguments:
+
+
+* `config`: A Python dictionary, typically the output of get_config.
+* `custom_objects`: A Python dictionary mapping names to additional Python
+ objects used to create this optimizer, such as a function used for a
+ hyperparameter.
+
+
+#### Returns:
+
+An optimizer instance.
+
+
+get_config
+
+``` python
+get_config()
+```
+
+
+
+
+get_gradients
+
+``` python
+get_gradients(
+ loss,
+ params
+)
+```
+
+Returns gradients of `loss` with respect to `params`.
+
+
+#### Arguments:
+
+
+* `loss`: Loss tensor.
+* `params`: List of variables.
+
+
+#### Returns:
+
+List of gradient tensors.
+
+
+
+#### Raises:
+
+
+* `ValueError`: In case any gradient cannot be computed (e.g. if gradient
+ function not implemented).
+
+get_slot
+
+``` python
+get_slot(
+ var,
+ slot_name
+)
+```
+
+
+
+
+get_slot_names
+
+``` python
+get_slot_names()
+```
+
+A list of names for this optimizer's slots.
+
+
+get_updates
+
+``` python
+get_updates(
+ loss,
+ params
+)
+```
+
+
+
+
+get_weights
+
+``` python
+get_weights()
+```
+
+
+
+
+minimize
+
+``` python
+minimize(
+ loss,
+ var_list,
+ grad_loss=None,
+ name=None
+)
+```
+
+Minimize `loss` by updating `var_list`.
+
+This method simply computes gradient using `tf.GradientTape` and calls
+`apply_gradients()`. If you want to process the gradient before applying
+then call `tf.GradientTape` and `apply_gradients()` explicitly instead
+of using this function.
+
+#### Args:
+
+
+* `loss`: A callable taking no arguments which returns the value to minimize.
+* `var_list`: list or tuple of `Variable` objects to update to minimize
+ `loss`, or a callable returning the list or tuple of `Variable` objects.
+ Use callable when the variable list would otherwise be incomplete before
+ `minimize` since the variables are created at the first time `loss` is
+ called.
+* `grad_loss`: Optional. A `Tensor` holding the gradient computed for `loss`.
+* `name`: Optional name for the returned operation.
+
+
+#### Returns:
+
+An Operation that updates the variables in `var_list`. If `global_step`
+was not `None`, that operation also increments `global_step`.
+
+
+
+#### Raises:
+
+
+* `ValueError`: If some of the variables are not `Variable` objects.
+
+set_weights
+
+``` python
+set_weights(weights)
+```
+
+
+
+
+variables
+
+``` python
+variables()
+```
+
+Returns variables of this Optimizer based on the order created.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/optimizers/MovingAverage.md b/docs/api_docs/python/tfa/optimizers/MovingAverage.md
new file mode 100644
index 0000000000..acd7601ee0
--- /dev/null
+++ b/docs/api_docs/python/tfa/optimizers/MovingAverage.md
@@ -0,0 +1,334 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# tfa.optimizers.MovingAverage
+
+## Class `MovingAverage`
+
+Optimizer that computes a moving average of the variables.
+
+
+
+### Aliases:
+
+* Class `tfa.optimizers.MovingAverage`
+* Class `tfa.optimizers.moving_average.MovingAverage`
+
+
+
+Defined in [`optimizers/moving_average.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/optimizers/moving_average.py).
+
+
+
+Empirically it has been found that using the moving average of the trained
+parameters of a deep network is better than using its trained parameters
+directly. This optimizer allows you to compute this moving average and swap
+the variables at save time so that any code outside of the training loop
+will use by default the average values instead of the original ones.
+
+#### Example of usage:
+
+
+
+```python
+opt = tf.keras.optimizers.SGD(learning_rate=0.01)
+opt = tfa.optimizers.MovingAverage(opt)
+```
+
+__init__
+
+``` python
+__init__(
+ optimizer,
+ average_decay=0.1,
+ num_updates=None,
+ sequential_update=True,
+ name='MovingAverage',
+ **kwargs
+)
+```
+
+
+
+
+
+
+## Properties
+
+iterations
+
+Variable. The number of training steps this Optimizer has run.
+
+
+weights
+
+
+
+
+
+
+## Methods
+
+add_slot
+
+``` python
+add_slot(
+ var,
+ slot_name,
+ initializer='zeros'
+)
+```
+
+Add a new slot variable for `var`.
+
+
+add_weight
+
+``` python
+add_weight(
+ name,
+ shape,
+ dtype=None,
+ initializer='zeros',
+ trainable=None,
+ synchronization=tf_variables.VariableSynchronization.AUTO,
+ aggregation=tf_variables.VariableAggregation.NONE
+)
+```
+
+
+
+
+apply_gradients
+
+``` python
+apply_gradients(
+ grads_and_vars,
+ name=None
+)
+```
+
+
+
+
+assign_average_vars
+
+``` python
+assign_average_vars(var_list)
+```
+
+Update variables in var_list with the running mean of the variables.
+
+
+#### Example:
+
+
+```python
+model = tf.keras.Sequential([...])
+opt = tfa.optimizers.MovingAverage(
+ tf.keras.optimizers.SGD(learning_rate=2.0), 0.5)
+
+model.compile(opt, ...)
+model.fit(x, y, ...)
+
+# Update the weights to their mean before saving
+opt.assign_average_vars(model.variables)
+
+model.save('model.h5')
+```
+
+### `from_config`
+
+``` python
+from_config(
+ cls,
+ config,
+ custom_objects=None
+)
+```
+
+Creates an optimizer from its config.
+
+This method is the reverse of `get_config`,
+capable of instantiating the same optimizer from the config
+dictionary.
+
+#### Arguments:
+
+
+* `config`: A Python dictionary, typically the output of get_config.
+* `custom_objects`: A Python dictionary mapping names to additional Python
+ objects used to create this optimizer, such as a function used for a
+ hyperparameter.
+
+
+#### Returns:
+
+An optimizer instance.
+
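+A minimal round-trip sketch (an assumption, not from the original
+docstring; it presumes the config serializes without `custom_objects`):
+
+```python
+config = opt.get_config()
+restored = opt.__class__.from_config(config)
+```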
+
+### `get_config`
+
+``` python
+get_config()
+```
+
+
+
+
+### `get_gradients`
+
+``` python
+get_gradients(
+ loss,
+ params
+)
+```
+
+Returns gradients of `loss` with respect to `params`.
+
+
+#### Arguments:
+
+
+* `loss`: Loss tensor.
+* `params`: List of variables.
+
+
+#### Returns:
+
+List of gradient tensors.
+
+
+
+#### Raises:
+
+
+* `ValueError`: In case any gradient cannot be computed (e.g. if gradient
+ function not implemented).
+
+### `get_slot`
+
+``` python
+get_slot(
+ var,
+ slot_name
+)
+```
+
+
+
+
+### `get_slot_names`
+
+``` python
+get_slot_names()
+```
+
+A list of names for this optimizer's slots.
+
+
+### `get_updates`
+
+``` python
+get_updates(
+ loss,
+ params
+)
+```
+
+
+
+
+### `get_weights`
+
+``` python
+get_weights()
+```
+
+
+
+
+### `minimize`
+
+``` python
+minimize(
+ loss,
+ var_list,
+ grad_loss=None,
+ name=None
+)
+```
+
+Minimize `loss` by updating `var_list`.
+
+This method simply computes the gradients using `tf.GradientTape` and calls
+`apply_gradients()`. If you want to process the gradients before applying
+them, call `tf.GradientTape` and `apply_gradients()` explicitly instead
+of using this function.
+
+#### Args:
+
+
+* `loss`: A callable taking no arguments which returns the value to minimize.
+* `var_list`: list or tuple of `Variable` objects to update to minimize
+ `loss`, or a callable returning the list or tuple of `Variable` objects.
+ Use a callable when the variable list would otherwise be incomplete
+ before `minimize`, since the variables are created the first time
+ `loss` is called.
+* `grad_loss`: Optional. A `Tensor` holding the gradient computed for `loss`.
+* `name`: Optional name for the returned operation.
+
+
+#### Returns:
+
+An Operation that updates the variables in `var_list`. If `global_step`
+was not `None`, that operation also increments `global_step`.
+
+
+
+#### Raises:
+
+
+* `ValueError`: If some of the variables are not `Variable` objects.
+
+### `set_weights`
+
+``` python
+set_weights(weights)
+```
+
+
+
+
+### `variables`
+
+``` python
+variables()
+```
+
+Returns variables of this Optimizer based on the order created.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/optimizers/SGDW.md b/docs/api_docs/python/tfa/optimizers/SGDW.md
new file mode 100644
index 0000000000..a564362933
--- /dev/null
+++ b/docs/api_docs/python/tfa/optimizers/SGDW.md
@@ -0,0 +1,372 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# tfa.optimizers.SGDW
+
+## Class `SGDW`
+
+Optimizer that implements the Momentum algorithm with weight_decay.
+
+Inherits From: [`DecoupledWeightDecayExtension`](../../tfa/optimizers/weight_decay_optimizers/DecoupledWeightDecayExtension.md)
+
+### Aliases:
+
+* Class `tfa.optimizers.SGDW`
+* Class `tfa.optimizers.weight_decay_optimizers.SGDW`
+
+
+
+Defined in [`optimizers/weight_decay_optimizers.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/optimizers/weight_decay_optimizers.py).
+
+
+
+This is an implementation of the SGDW optimizer described in "Decoupled
+Weight Decay Regularization" by Loshchilov & Hutter
+(https://arxiv.org/abs/1711.05101)
+([pdf](https://arxiv.org/pdf/1711.05101.pdf)).
+It computes the update step of `tf.keras.optimizers.SGD` and additionally
+decays the variable. Note that this is different from adding
+L2 regularization on the variables to the loss. Decoupling the weight decay
+from other hyperparameters (in particular the learning rate) simplifies
+hyperparameter search.
+
+For further information see the documentation of the SGD Optimizer.
+
+This optimizer can also be instantiated as
+```python
+SGDW = tfa.optimizers.extend_with_decoupled_weight_decay(
+ tf.keras.optimizers.SGD)
+opt = SGDW(weight_decay=weight_decay)
+```
+
+Note: when applying a decay to the learning rate, be sure to manually apply
+the decay to the `weight_decay` as well. For example:
+
+```python
+step = tf.Variable(0, trainable=False)
+schedule = tf.optimizers.schedules.PiecewiseConstantDecay(
+ [10000, 15000], [1e-0, 1e-1, 1e-2])
+# lr and wd can be a function or a tensor
+lr = 1e-1 * schedule(step)
+wd = lambda: 1e-4 * schedule(step)
+
+# ...
+
+optimizer = tfa.optimizers.SGDW(
+ learning_rate=lr, weight_decay=wd, momentum=0.9)
+```
+
+## `__init__`
+
+``` python
+__init__(
+ weight_decay,
+ learning_rate=0.001,
+ momentum=0.0,
+ nesterov=False,
+ name='SGDW',
+ **kwargs
+)
+```
+
+Construct a new SGDW optimizer.
+
+For further information see the documentation of the SGD Optimizer.
+
+#### Args:
+
+
+* `weight_decay`: A `Tensor` or a floating point value. The factor by
+ which a variable is decayed in the update step.
+* `learning_rate`: float hyperparameter >= 0. Learning rate.
+* `momentum`: float hyperparameter >= 0 that accelerates SGD in the
+ relevant direction and dampens oscillations.
+* `nesterov`: boolean. Whether to apply Nesterov momentum.
+* `name`: Optional name prefix for the operations created when applying
+ gradients. Defaults to 'SGDW'.
+* `**kwargs`: keyword arguments. Allowed to be {`clipnorm`,
+ `clipvalue`, `lr`, `decay`}. `clipnorm` is clip gradients by
+ norm; `clipvalue` is clip gradients by value, `decay` is
+ included for backward compatibility to allow time inverse decay
+ of learning rate. `lr` is included for backward compatibility,
+ recommended to use `learning_rate` instead.
+
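+A minimal construction sketch with illustrative hyperparameter values (not
+part of the original docstring):
+
+```python
+import tensorflow_addons as tfa
+
+opt = tfa.optimizers.SGDW(
+    weight_decay=1e-4, learning_rate=0.1, momentum=0.9, nesterov=True)
+```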
+
+
+## Properties
+
+### `iterations`
+
+Variable. The number of training steps this Optimizer has run.
+
+
+### `weights`
+
+Returns variables of this Optimizer based on the order created.
+
+
+
+
+## Methods
+
+### `add_slot`
+
+``` python
+add_slot(
+ var,
+ slot_name,
+ initializer='zeros'
+)
+```
+
+Add a new slot variable for `var`.
+
+
+### `add_weight`
+
+``` python
+add_weight(
+ name,
+ shape,
+ dtype=None,
+ initializer='zeros',
+ trainable=None,
+ synchronization=tf_variables.VariableSynchronization.AUTO,
+ aggregation=tf_variables.VariableAggregation.NONE
+)
+```
+
+
+
+
+### `apply_gradients`
+
+``` python
+apply_gradients(
+ grads_and_vars,
+ name=None,
+ decay_var_list=None
+)
+```
+
+Apply gradients to variables.
+
+This is the second part of `minimize()`. It returns an `Operation` that
+applies gradients.
+
+#### Args:
+
+
+* `grads_and_vars`: List of (gradient, variable) pairs.
+* `name`: Optional name for the returned operation. Default to the
+ name passed to the `Optimizer` constructor.
+* `decay_var_list`: Optional list of variables to be decayed. Defaults
+ to all variables in var_list.
+
+#### Returns:
+
+An `Operation` that applies the specified gradients. If
+`global_step` was not None, that operation also increments
+`global_step`.
+
+
+#### Raises:
+
+
+* `TypeError`: If `grads_and_vars` is malformed.
+* `ValueError`: If none of the variables have gradients.
+
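+A self-contained sketch of selective decay via `decay_var_list` (the
+variables and loss are illustrative assumptions, not from the original
+docstring):
+
+```python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+opt = tfa.optimizers.SGDW(weight_decay=1e-4, learning_rate=0.1)
+w = tf.Variable([[1.0, 2.0]])
+b = tf.Variable([0.0])
+with tf.GradientTape() as tape:
+    loss = tf.reduce_sum(w) + tf.reduce_sum(b)
+grads = tape.gradient(loss, [w, b])
+# Decay only `w`; `b` is updated from its gradient but not decayed.
+opt.apply_gradients(zip(grads, [w, b]), decay_var_list=[w])
+```
+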
+### `from_config`
+
+``` python
+from_config(
+ cls,
+ config,
+ custom_objects=None
+)
+```
+
+Creates an optimizer from its config.
+
+This method is the reverse of `get_config`,
+capable of instantiating the same optimizer from the config
+dictionary.
+
+#### Arguments:
+
+
+* `config`: A Python dictionary, typically the output of get_config.
+* `custom_objects`: A Python dictionary mapping names to additional Python
+ objects used to create this optimizer, such as a function used for a
+ hyperparameter.
+
+
+#### Returns:
+
+An optimizer instance.
+
+
+### `get_config`
+
+``` python
+get_config()
+```
+
+
+
+
+### `get_gradients`
+
+``` python
+get_gradients(
+ loss,
+ params
+)
+```
+
+Returns gradients of `loss` with respect to `params`.
+
+
+#### Arguments:
+
+
+* `loss`: Loss tensor.
+* `params`: List of variables.
+
+
+#### Returns:
+
+List of gradient tensors.
+
+
+
+#### Raises:
+
+
+* `ValueError`: In case any gradient cannot be computed (e.g. if gradient
+ function not implemented).
+
+### `get_slot`
+
+``` python
+get_slot(
+ var,
+ slot_name
+)
+```
+
+
+
+
+### `get_slot_names`
+
+``` python
+get_slot_names()
+```
+
+A list of names for this optimizer's slots.
+
+
+### `get_updates`
+
+``` python
+get_updates(
+ loss,
+ params
+)
+```
+
+
+
+
+### `get_weights`
+
+``` python
+get_weights()
+```
+
+
+
+
+### `minimize`
+
+``` python
+minimize(
+ loss,
+ var_list,
+ grad_loss=None,
+ name=None,
+ decay_var_list=None
+)
+```
+
+Minimize `loss` by updating `var_list`.
+
+This method simply computes the gradients using `tf.GradientTape` and
+calls `apply_gradients()`. If you want to process the gradients before
+applying them, call `tf.GradientTape` and `apply_gradients()` explicitly
+instead of using this function.
+
+#### Args:
+
+
+* `loss`: A callable taking no arguments which returns the value to
+ minimize.
+* `var_list`: list or tuple of `Variable` objects to update to
+ minimize `loss`, or a callable returning the list or tuple of
+ `Variable` objects. Use a callable when the variable list would
+ otherwise be incomplete before `minimize`, since the variables
+ are created the first time `loss` is called.
+* `grad_loss`: Optional. A `Tensor` holding the gradient computed for
+ `loss`.
+* `decay_var_list`: Optional list of variables to be decayed. Defaults
+ to all variables in var_list.
+* `name`: Optional name for the returned operation.
+
+#### Returns:
+
+An Operation that updates the variables in `var_list`. If
+`global_step` was not `None`, that operation also increments
+`global_step`.
+
+
+#### Raises:
+
+
+* `ValueError`: If some of the variables are not `Variable` objects.
+
+### `set_weights`
+
+``` python
+set_weights(weights)
+```
+
+
+
+
+### `variables`
+
+``` python
+variables()
+```
+
+Returns variables of this Optimizer based on the order created.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/optimizers/extend_with_decoupled_weight_decay.md b/docs/api_docs/python/tfa/optimizers/extend_with_decoupled_weight_decay.md
new file mode 100644
index 0000000000..833f2a8169
--- /dev/null
+++ b/docs/api_docs/python/tfa/optimizers/extend_with_decoupled_weight_decay.md
@@ -0,0 +1,83 @@
+
+
+
+
+
+# tfa.optimizers.extend_with_decoupled_weight_decay
+
+Factory function returning an optimizer class with decoupled weight decay.
+
+### Aliases:
+
+* `tfa.optimizers.extend_with_decoupled_weight_decay`
+* `tfa.optimizers.weight_decay_optimizers.extend_with_decoupled_weight_decay`
+
+``` python
+tfa.optimizers.extend_with_decoupled_weight_decay(base_optimizer)
+```
+
+
+
+Defined in [`optimizers/weight_decay_optimizers.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/optimizers/weight_decay_optimizers.py).
+
+
+
+Returns an optimizer class. An instance of the returned class computes the
+update step of `base_optimizer` and additionally decays the weights.
+E.g., the class returned by
+`extend_with_decoupled_weight_decay(tf.keras.optimizers.Adam)` is
+equivalent to `tfa.optimizers.AdamW`.
+
+The API of the new optimizer class slightly differs from the API of the
+base optimizer:
+- The first argument to the constructor is the weight decay rate.
+- `minimize` and `apply_gradients` accept the optional keyword argument
+ `decay_var_list`, which specifies the variables that should be decayed.
+ If `None`, all variables that are optimized are decayed.
+
+#### Usage example:
+
+
+```python
+# MyAdamW is a new class
+MyAdamW = extend_with_decoupled_weight_decay(tf.keras.optimizers.Adam)
+# Create a MyAdamW object
+optimizer = MyAdamW(weight_decay=0.001, learning_rate=0.001)
+# update var1, var2 but only decay var1
+optimizer.minimize(loss, var_list=[var1, var2], decay_var_list=[var1])
+```
+
+Note: this extension decays weights BEFORE applying the update based
+on the gradient, i.e. this extension only has the desired behaviour for
+optimizers which do not depend on the value of `var` in the update step!
+
+Note: when applying a decay to the learning rate, be sure to manually apply
+the decay to the `weight_decay` as well. For example:
+
+```python
+step = tf.Variable(0, trainable=False)
+schedule = tf.optimizers.schedules.PiecewiseConstantDecay(
+ [10000, 15000], [1e-0, 1e-1, 1e-2])
+# lr and wd can be a function or a tensor
+lr = 1e-1 * schedule(step)
+wd = lambda: 1e-4 * schedule(step)
+
+# ...
+
+optimizer = tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd)
+```
+
+Note: you might want to register your own custom optimizer using
+`tf.keras.utils.get_custom_objects()`.
+
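+A registration sketch for the note above (the class name is a hypothetical
+example; the registry call is the standard Keras custom-object mechanism):
+
+```python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+MyAdamW = tfa.optimizers.extend_with_decoupled_weight_decay(
+    tf.keras.optimizers.Adam)
+tf.keras.utils.get_custom_objects()['MyAdamW'] = MyAdamW
+```
+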
+#### Args:
+
+
+* `base_optimizer`: An optimizer class that inherits from
+ tf.optimizers.Optimizer.
+
+
+#### Returns:
+
+A new optimizer class that inherits from DecoupledWeightDecayExtension
+and base_optimizer.
diff --git a/docs/api_docs/python/tfa/optimizers/lazy_adam.md b/docs/api_docs/python/tfa/optimizers/lazy_adam.md
new file mode 100644
index 0000000000..d4c7a6e1c4
--- /dev/null
+++ b/docs/api_docs/python/tfa/optimizers/lazy_adam.md
@@ -0,0 +1,24 @@
+
+
+
+
+
+# Module: tfa.optimizers.lazy_adam
+
+Variant of the Adam optimizer that handles sparse updates more efficiently.
+
+
+
+Defined in [`optimizers/lazy_adam.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/optimizers/lazy_adam.py).
+
+
+
+Compared with the original Adam optimizer, the one in this file can
+provide a large improvement in model training throughput for some
+applications. However, it provides slightly different semantics than the
+original Adam algorithm, and may lead to different empirical results.
+
+## Classes
+
+[`class LazyAdam`](../../tfa/optimizers/LazyAdam.md): Variant of the Adam optimizer that handles sparse updates more efficiently.
+
diff --git a/docs/api_docs/python/tfa/optimizers/moving_average.md b/docs/api_docs/python/tfa/optimizers/moving_average.md
new file mode 100644
index 0000000000..e069a84e92
--- /dev/null
+++ b/docs/api_docs/python/tfa/optimizers/moving_average.md
@@ -0,0 +1,20 @@
+
+
+
+
+
+# Module: tfa.optimizers.moving_average
+
+
+
+
+
+Defined in [`optimizers/moving_average.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/optimizers/moving_average.py).
+
+
+
+
+## Classes
+
+[`class MovingAverage`](../../tfa/optimizers/MovingAverage.md): Optimizer that computes a moving average of the variables.
+
diff --git a/docs/api_docs/python/tfa/optimizers/weight_decay_optimizers.md b/docs/api_docs/python/tfa/optimizers/weight_decay_optimizers.md
new file mode 100644
index 0000000000..3029d1bb54
--- /dev/null
+++ b/docs/api_docs/python/tfa/optimizers/weight_decay_optimizers.md
@@ -0,0 +1,28 @@
+
+
+
+
+
+# Module: tfa.optimizers.weight_decay_optimizers
+
+Base class to make optimizers weight decay ready.
+
+
+
+Defined in [`optimizers/weight_decay_optimizers.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/optimizers/weight_decay_optimizers.py).
+
+
+
+
+## Classes
+
+[`class AdamW`](../../tfa/optimizers/AdamW.md): Optimizer that implements the Adam algorithm with weight decay.
+
+[`class DecoupledWeightDecayExtension`](../../tfa/optimizers/weight_decay_optimizers/DecoupledWeightDecayExtension.md): This class allows extending optimizers with decoupled weight decay.
+
+[`class SGDW`](../../tfa/optimizers/SGDW.md): Optimizer that implements the Momentum algorithm with weight_decay.
+
+## Functions
+
+[`extend_with_decoupled_weight_decay(...)`](../../tfa/optimizers/extend_with_decoupled_weight_decay.md): Factory function returning an optimizer class with decoupled weight decay.
+
diff --git a/docs/api_docs/python/tfa/optimizers/weight_decay_optimizers/DecoupledWeightDecayExtension.md b/docs/api_docs/python/tfa/optimizers/weight_decay_optimizers/DecoupledWeightDecayExtension.md
new file mode 100644
index 0000000000..e8ff936bde
--- /dev/null
+++ b/docs/api_docs/python/tfa/optimizers/weight_decay_optimizers/DecoupledWeightDecayExtension.md
@@ -0,0 +1,186 @@
+
+
+
+
+
+
+
+
+
+# tfa.optimizers.weight_decay_optimizers.DecoupledWeightDecayExtension
+
+## Class `DecoupledWeightDecayExtension`
+
+This class allows extending optimizers with decoupled weight decay.
+
+
+
+
+
+Defined in [`optimizers/weight_decay_optimizers.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/optimizers/weight_decay_optimizers.py).
+
+
+
+It implements the decoupled weight decay described by Loshchilov & Hutter
+(https://arxiv.org/pdf/1711.05101.pdf), in which the weight decay is
+decoupled from the optimization steps w.r.t. the loss function.
+For SGD variants, this simplifies hyperparameter search since it decouples
+the settings of weight decay and learning rate.
+For adaptive gradient algorithms, it regularizes variables with large
+gradients more than L2 regularization would, which was shown to yield
+better training loss and generalization error in the paper above.
+
+This class alone is not an optimizer but rather extends existing
+optimizers with decoupled weight decay. We explicitly define the two
+examples used in the above paper (SGDW and AdamW), but in general this
+can extend any `OptimizerX` by using
+`extend_with_decoupled_weight_decay(OptimizerX)` and passing
+`weight_decay` to the constructor of the returned class.
+In order for it to work, it must be the first class the Optimizer with
+weight decay inherits from, e.g.
+
+```python
+class AdamW(DecoupledWeightDecayExtension, tf.keras.optimizers.Adam):
+ def __init__(self, weight_decay, *args, **kwargs):
+ super(AdamW, self).__init__(weight_decay, *args, **kwargs)
+```
+
+Note: this extension decays weights BEFORE applying the update based
+on the gradient, i.e. this extension only has the desired behaviour for
+optimizers which do not depend on the value of `var` in the update step!
+
+Note: when applying a decay to the learning rate, be sure to manually apply
+the decay to the `weight_decay` as well. For example:
+
+```python
+step = tf.Variable(0, trainable=False)
+schedule = tf.optimizers.schedules.PiecewiseConstantDecay(
+ [10000, 15000], [1e-0, 1e-1, 1e-2])
+# lr and wd can be a function or a tensor
+lr = 1e-1 * schedule(step)
+wd = lambda: 1e-4 * schedule(step)
+
+# ...
+
+optimizer = tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd)
+```
+
+## `__init__`
+
+``` python
+__init__(
+ weight_decay,
+ **kwargs
+)
+```
+
+Extension class that adds weight decay to an optimizer.
+
+
+#### Args:
+
+
+* `weight_decay`: A `Tensor` or a floating point value, the factor by
+ which a variable is decayed in the update step.
+* `**kwargs`: Additional keyword arguments, forwarded to the constructor
+ of the wrapped optimizer.
+
+
+
+## Methods
+
+### `apply_gradients`
+
+``` python
+apply_gradients(
+ grads_and_vars,
+ name=None,
+ decay_var_list=None
+)
+```
+
+Apply gradients to variables.
+
+This is the second part of `minimize()`. It returns an `Operation` that
+applies gradients.
+
+#### Args:
+
+
+* `grads_and_vars`: List of (gradient, variable) pairs.
+* `name`: Optional name for the returned operation. Default to the
+ name passed to the `Optimizer` constructor.
+* `decay_var_list`: Optional list of variables to be decayed. Defaults
+ to all variables in var_list.
+
+#### Returns:
+
+An `Operation` that applies the specified gradients. If
+`global_step` was not None, that operation also increments
+`global_step`.
+
+
+#### Raises:
+
+
+* `TypeError`: If `grads_and_vars` is malformed.
+* `ValueError`: If none of the variables have gradients.
+
+### `get_config`
+
+``` python
+get_config()
+```
+
+
+
+
+### `minimize`
+
+``` python
+minimize(
+ loss,
+ var_list,
+ grad_loss=None,
+ name=None,
+ decay_var_list=None
+)
+```
+
+Minimize `loss` by updating `var_list`.
+
+This method simply computes the gradients using `tf.GradientTape` and
+calls `apply_gradients()`. If you want to process the gradients before
+applying them, call `tf.GradientTape` and `apply_gradients()` explicitly
+instead of using this function.
+
+#### Args:
+
+
+* `loss`: A callable taking no arguments which returns the value to
+ minimize.
+* `var_list`: list or tuple of `Variable` objects to update to
+ minimize `loss`, or a callable returning the list or tuple of
+ `Variable` objects. Use a callable when the variable list would
+ otherwise be incomplete before `minimize`, since the variables
+ are created the first time `loss` is called.
+* `grad_loss`: Optional. A `Tensor` holding the gradient computed for
+ `loss`.
+* `decay_var_list`: Optional list of variables to be decayed. Defaults
+ to all variables in var_list.
+* `name`: Optional name for the returned operation.
+
+#### Returns:
+
+An Operation that updates the variables in `var_list`. If
+`global_step` was not `None`, that operation also increments
+`global_step`.
+
+
+#### Raises:
+
+
+* `ValueError`: If some of the variables are not `Variable` objects.
+
+
+
diff --git a/docs/api_docs/python/tfa/rnn.md b/docs/api_docs/python/tfa/rnn.md
new file mode 100644
index 0000000000..07eac378ce
--- /dev/null
+++ b/docs/api_docs/python/tfa/rnn.md
@@ -0,0 +1,26 @@
+
+
+
+
+
+# Module: tfa.rnn
+
+Customized RNN cells.
+
+
+
+Defined in [`rnn/__init__.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/rnn/__init__.py).
+
+
+
+
+## Modules
+
+[`cell`](../tfa/rnn/cell.md) module: Module for RNN Cells.
+
+## Classes
+
+[`class LayerNormLSTMCell`](../tfa/rnn/LayerNormLSTMCell.md): LSTM cell with layer normalization and recurrent dropout.
+
+[`class NASCell`](../tfa/rnn/NASCell.md): Neural Architecture Search (NAS) recurrent network cell.
+
diff --git a/docs/api_docs/python/tfa/rnn/LayerNormLSTMCell.md b/docs/api_docs/python/tfa/rnn/LayerNormLSTMCell.md
new file mode 100644
index 0000000000..4d13c7dd0a
--- /dev/null
+++ b/docs/api_docs/python/tfa/rnn/LayerNormLSTMCell.md
@@ -0,0 +1,995 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# tfa.rnn.LayerNormLSTMCell
+
+## Class `LayerNormLSTMCell`
+
+LSTM cell with layer normalization and recurrent dropout.
+
+
+
+### Aliases:
+
+* Class `tfa.rnn.LayerNormLSTMCell`
+* Class `tfa.rnn.cell.LayerNormLSTMCell`
+
+
+
+Defined in [`rnn/cell.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/rnn/cell.py).
+
+
+
+This class adds layer normalization and recurrent dropout to an LSTM unit.
+Layer normalization implementation is based on:
+
+ https://arxiv.org/abs/1607.06450.
+
+"Layer Normalization" Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton
+
+and is applied before the internal nonlinearities.
+Recurrent dropout is based on:
+
+ https://arxiv.org/abs/1603.05118
+
+"Recurrent Dropout without Memory Loss"
+Stanislau Semeniuta, Aliaksei Severyn, Erhardt Barth.
+
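+A minimal usage sketch (not from the original docstring; the unit count and
+input shape are illustrative assumptions):
+
+```python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+cell = tfa.rnn.LayerNormLSTMCell(units=64, recurrent_dropout=0.1)
+layer = tf.keras.layers.RNN(cell)
+# (batch, time, features) -> (batch, units)
+outputs = layer(tf.random.normal([8, 10, 16]))
+```
+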
+## `__init__`
+
+``` python
+__init__(
+ units,
+ activation='tanh',
+ recurrent_activation='sigmoid',
+ use_bias=True,
+ kernel_initializer='glorot_uniform',
+ recurrent_initializer='orthogonal',
+ bias_initializer='zeros',
+ unit_forget_bias=True,
+ kernel_regularizer=None,
+ recurrent_regularizer=None,
+ bias_regularizer=None,
+ kernel_constraint=None,
+ recurrent_constraint=None,
+ bias_constraint=None,
+ dropout=0.0,
+ recurrent_dropout=0.0,
+ norm_gamma_initializer='ones',
+ norm_beta_initializer='zeros',
+ norm_epsilon=0.001,
+ **kwargs
+)
+```
+
+Initializes the LSTM cell.
+
+
+#### Args:
+
+
+* `units`: Positive integer, dimensionality of the output space.
+* `activation`: Activation function to use. Default: hyperbolic tangent
+ (`tanh`). If you pass `None`, no activation is applied (ie.
+ "linear" activation: `a(x) = x`).
+* `recurrent_activation`: Activation function to use for the recurrent
+ step. Default: sigmoid (`sigmoid`). If you pass `None`, no
+ activation is applied (ie. "linear" activation: `a(x) = x`).
+* `use_bias`: Boolean, whether the layer uses a bias vector.
+* `kernel_initializer`: Initializer for the `kernel` weights matrix, used
+ for the linear transformation of the inputs.
+* `recurrent_initializer`: Initializer for the `recurrent_kernel` weights
+ matrix, used for the linear transformation of the recurrent state.
+* `bias_initializer`: Initializer for the bias vector.
+* `unit_forget_bias`: Boolean. If True, add 1 to the bias of the forget
+ gate at initialization. Setting it to true will also force
+ `bias_initializer="zeros"`. This is recommended in [Jozefowicz et
+ al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
+* `kernel_regularizer`: Regularizer function applied to the `kernel`
+ weights matrix.
+* `recurrent_regularizer`: Regularizer function applied to
+ the `recurrent_kernel` weights matrix.
+* `bias_regularizer`: Regularizer function applied to the bias vector.
+* `kernel_constraint`: Constraint function applied to the `kernel`
+ weights matrix.
+* `recurrent_constraint`: Constraint function applied to the
+ `recurrent_kernel` weights matrix.
+* `bias_constraint`: Constraint function applied to the bias vector.
+* `dropout`: Float between 0 and 1. Fraction of the units to drop for the
+ linear transformation of the inputs.
+* `recurrent_dropout`: Float between 0 and 1. Fraction of the units to
+ drop for the linear transformation of the recurrent state.
+* `norm_gamma_initializer`: Initializer for the layer normalization gain
+ initial value.
+* `norm_beta_initializer`: Initializer for the layer normalization shift
+ initial value.
+* `norm_epsilon`: Float, the epsilon value for normalization layers.
+* `**kwargs`: Dict, the other keyword arguments for layer creation.
+
+
+
+## Properties
+
+### `activity_regularizer`
+
+Optional regularizer function for the output of this layer.
+
+
+### `dtype`
+
+
+
+
+### `dynamic`
+
+
+
+
+### `input`
+
+Retrieves the input tensor(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input tensor or list of input tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+* `AttributeError`: If no inbound nodes are found.
+
+### `input_mask`
+
+Retrieves the input mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input mask tensor (potentially None) or list of input
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+### `input_shape`
+
+Retrieves the input shape(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer, or if all inputs
+have the same shape.
+
+#### Returns:
+
+Input shape, as an integer shape tuple
+(or list of shape tuples, one tuple per input tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined input_shape.
+* `RuntimeError`: if called in Eager mode.
+
+
+
+
+
+
+### `losses`
+
+Losses which are associated with this `Layer`.
+
+Variable regularization tensors are created when this property is accessed,
+so it is eager safe: accessing `losses` under a `tf.GradientTape` will
+propagate gradients back to the corresponding variables.
+
+#### Returns:
+
+A list of tensors.
+
+
+### `metrics`
+
+
+
+
+### `name`
+
+
+
+
+### `name_scope`
+
+Returns a `tf.name_scope` instance for this class.
+
+
+### `non_trainable_variables`
+
+
+
+
+### `non_trainable_weights`
+
+
+
+
+### `output`
+
+Retrieves the output tensor(s) of a layer.
+
+Only applicable if the layer has exactly one output,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output tensor or list of output tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to more than one incoming
+ layers.
+* `RuntimeError`: if called in Eager mode.
+
+### `output_mask`
+
+Retrieves the output mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output mask tensor (potentially None) or list of output
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+### `output_shape`
+
+Retrieves the output shape(s) of a layer.
+
+Only applicable if the layer has one output,
+or if all outputs have the same shape.
+
+#### Returns:
+
+Output shape, as an integer shape tuple
+(or list of shape tuples, one tuple per output tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined output shape.
+* `RuntimeError`: if called in Eager mode.
+
+### `submodules`
+
+Sequence of all sub-modules.
+
+Submodules are modules which are properties of this module, or found as
+properties of modules which are properties of this module (and so on).
+
+```
+a = tf.Module()
+b = tf.Module()
+c = tf.Module()
+a.b = b
+b.c = c
+assert list(a.submodules) == [b, c]
+assert list(b.submodules) == [c]
+assert list(c.submodules) == []
+```
+
+#### Returns:
+
+A sequence of all submodules.
+
+
+### `trainable`
+
+
+
+
+### `trainable_variables`
+
+
+
+
+### `trainable_weights`
+
+
+
+
+### `updates`
+
+
+
+
+### `variables`
+
+Returns the list of all layer variables/weights.
+
+Alias of `self.weights`.
+
+#### Returns:
+
+A list of variables.
+
+
+### `weights`
+
+Returns the list of all layer variables/weights.
+
+
+#### Returns:
+
+A list of variables.
+
+
+
+
+## Methods
+
+### `__call__`
+
+``` python
+__call__(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Wraps `call`, applying pre- and post-processing steps.
+
+
+#### Arguments:
+
+
+* `inputs`: input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+
+#### Note:
+
+- The following optional keyword arguments are reserved for specific uses:
+ * `training`: Boolean scalar tensor of Python boolean indicating
+ whether the `call` is meant for training or inference.
+ * `mask`: Boolean input mask.
+- If the layer's `call` method takes a `mask` argument (as some Keras
+ layers do), its default value will be set to the mask generated
+ for `inputs` by the previous layer (if `inputs` did come from
+ a layer that generated a corresponding mask, i.e. if it came from
+ a Keras layer with masking support).
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer's `call` method returns None (an invalid value).
+
+### `apply`
+
+``` python
+apply(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Apply the layer on an input.
+
+This is an alias of `self.__call__`.
+
+#### Arguments:
+
+
+* `inputs`: Input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+### `build`
+
+``` python
+build(input_shape)
+```
+
+
+
+
+### `compute_mask`
+
+``` python
+compute_mask(
+ inputs,
+ mask=None
+)
+```
+
+Computes an output mask tensor.
+
+
+#### Arguments:
+
+
+* `inputs`: Tensor or list of tensors.
+* `mask`: Tensor or list of tensors.
+
+
+#### Returns:
+
+None or a tensor (or list of tensors,
+ one per output tensor of the layer).
+
+
+### `compute_output_shape`
+
+``` python
+compute_output_shape(input_shape)
+```
+
+Computes the output shape of the layer.
+
+Assumes that the layer will be built
+to match the input shape provided.
+
+#### Arguments:
+
+
+* `input_shape`: Shape tuple (tuple of integers)
+ or list of shape tuples (one per output tensor of the layer).
+ Shape tuples can include None for free dimensions,
+ instead of an integer.
+
+
+#### Returns:
+
+An output shape tuple.
+
+
+### `count_params`
+
+``` python
+count_params()
+```
+
+Count the total number of scalars composing the weights.
+
+
+#### Returns:
+
+An integer count.
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer isn't yet built
+ (in which case its weights aren't yet defined).
+
+### `from_config`
+
+``` python
+from_config(
+ cls,
+ config
+)
+```
+
+Creates a layer from its config.
+
+This method is the reverse of `get_config`,
+capable of instantiating the same layer from the config
+dictionary. It does not handle layer connectivity
+(handled by Network), nor weights (handled by `set_weights`).
+
+#### Arguments:
+
+
+* `config`: A Python dictionary, typically the
+ output of get_config.
+
+
+#### Returns:
+
+A layer instance.
+
+
+### `get_config`
+
+``` python
+get_config()
+```
+
+
+
+
+### `get_dropout_mask_for_cell`
+
+``` python
+get_dropout_mask_for_cell(
+ inputs,
+ training,
+ count=1
+)
+```
+
+Get the dropout mask for RNN cell's input.
+
+It will create a mask based on context if there is no existing cached
+mask. If a new mask is generated, it will update the cache in the cell.
+
+#### Args:
+
+
+* `inputs`: the input tensor whose shape will be used to generate the
+ dropout mask.
+* `training`: boolean tensor, whether it's in training mode; dropout will
+ be ignored in non-training mode.
+* `count`: int, how many dropout masks will be generated. Useful for cells
+ that have internal weights fused together.
+
+#### Returns:
+
+List of mask tensors, generated or cached masks based on context.
+
+
+### `get_initial_state`
+
+``` python
+get_initial_state(
+ inputs=None,
+ batch_size=None,
+ dtype=None
+)
+```
+
+
+
+
+### `get_input_at`
+
+``` python
+get_input_at(node_index)
+```
+
+Retrieves the input tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+### `get_input_mask_at`
+
+``` python
+get_input_mask_at(node_index)
+```
+
+Retrieves the input mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple inputs).
+
+
+### `get_input_shape_at`
+
+``` python
+get_input_shape_at(node_index)
+```
+
+Retrieves the input shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+### `get_losses_for`
+
+``` python
+get_losses_for(inputs)
+```
+
+Retrieves losses relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of loss tensors of the layer that depend on `inputs`.
+
+
+### `get_output_at`
+
+``` python
+get_output_at(node_index)
+```
+
+Retrieves the output tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+### `get_output_mask_at`
+
+``` python
+get_output_mask_at(node_index)
+```
+
+Retrieves the output mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple outputs).
+
+
+### `get_output_shape_at`
+
+``` python
+get_output_shape_at(node_index)
+```
+
+Retrieves the output shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+### `get_recurrent_dropout_mask_for_cell`
+
+``` python
+get_recurrent_dropout_mask_for_cell(
+ inputs,
+ training,
+ count=1
+)
+```
+
+Get the recurrent dropout mask for RNN cell.
+
+It will create a mask based on context if there is no existing cached
+mask. If a new mask is generated, it will update the cache in the cell.
+
+#### Args:
+
+
+* `inputs`: the input tensor whose shape will be used to generate the
+ dropout mask.
+* `training`: boolean tensor, whether it's in training mode; dropout will
+ be ignored in non-training mode.
+* `count`: int, how many dropout masks will be generated. Useful for cells
+ that have internal weights fused together.
+
+#### Returns:
+
+List of mask tensors, generated or cached masks based on context.
+
+
+### `get_updates_for`
+
+``` python
+get_updates_for(inputs)
+```
+
+Retrieves updates relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of update ops of the layer that depend on `inputs`.
+
+
+### `get_weights`
+
+``` python
+get_weights()
+```
+
+Returns the current weights of the layer.
+
+
+#### Returns:
+
+Weights values as a list of numpy arrays.
+
+
+### `reset_dropout_mask`
+
+``` python
+reset_dropout_mask()
+```
+
+Reset the cached dropout masks if any.
+
+It is important for the RNN layer to invoke this in its `call()` method so
+that the cached mask is cleared before calling `cell.call()`. The mask
+should be cached across timesteps within the same batch, but shouldn't
+be cached between batches. Otherwise it will introduce unreasonable bias
+against certain indices of data within the batch.
+
+### `reset_recurrent_dropout_mask`
+
+``` python
+reset_recurrent_dropout_mask()
+```
+
+Reset the cached recurrent dropout masks if any.
+
+It is important for the RNN layer to invoke this in its `call()` method so
+that the cached mask is cleared before calling `cell.call()`. The mask
+should be cached across timesteps within the same batch, but shouldn't
+be cached between batches. Otherwise it will introduce unreasonable bias
+against certain indices of data within the batch.
+
+### `set_weights`
+
+``` python
+set_weights(weights)
+```
+
+Sets the weights of the layer, from Numpy arrays.
+
+
+#### Arguments:
+
+
+* `weights`: a list of Numpy arrays. The number
+ of arrays and their shape must match
+ number of the dimensions of the weights
+ of the layer (i.e. it should match the
+ output of `get_weights`).
+
+
+#### Raises:
+
+
+* `ValueError`: If the provided weights list does not match the
+ layer's specifications.
+
+### `with_name_scope`
+
+``` python
+with_name_scope(
+ cls,
+ method
+)
+```
+
+Decorator to automatically enter the module name scope.
+
+```
+class MyModule(tf.Module):
+ @tf.Module.with_name_scope
+ def __call__(self, x):
+ if not hasattr(self, 'w'):
+ self.w = tf.Variable(tf.random.normal([x.shape[1], 64]))
+ return tf.matmul(x, self.w)
+```
+
+Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose
+names included the module name:
+
+```
+mod = MyModule()
+mod(tf.ones([8, 32]))
+# ==> <tf.Tensor: shape=(8, 64), dtype=float32, ...>
+mod.w
+# ==> <tf.Variable 'my_module/w:0' shape=(32, 64), ...>
+```
+
+#### Args:
+
+
+* `method`: The method to wrap.
+
+
+#### Returns:
+
+The original method wrapped such that it enters the module's name scope.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/rnn/NASCell.md b/docs/api_docs/python/tfa/rnn/NASCell.md
new file mode 100644
index 0000000000..eb101fabd8
--- /dev/null
+++ b/docs/api_docs/python/tfa/rnn/NASCell.md
@@ -0,0 +1,871 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# tfa.rnn.NASCell
+
+## Class `NASCell`
+
+Neural Architecture Search (NAS) recurrent network cell.
+
+
+
+### Aliases:
+
+* Class `tfa.rnn.NASCell`
+* Class `tfa.rnn.cell.NASCell`
+
+
+
+Defined in [`rnn/cell.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/rnn/cell.py).
+
+
+
+This implements the recurrent cell from the paper:
+
+ https://arxiv.org/abs/1611.01578
+
+Barret Zoph and Quoc V. Le.
+"Neural Architecture Search with Reinforcement Learning" Proc. ICLR 2017.
+
+The class uses an optional projection layer.
+
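+A minimal usage sketch (not from the original docstring; the sizes are
+illustrative assumptions). With `projection` set, the cell's output size is
+the projection dimension rather than `units`:
+
+```python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+cell = tfa.rnn.NASCell(units=128, projection=64)
+layer = tf.keras.layers.RNN(cell)
+# (batch, time, features) -> (batch, 64)
+outputs = layer(tf.random.normal([4, 20, 32]))
+```
+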
+## `__init__`
+
+``` python
+__init__(
+ units,
+ projection=None,
+ use_bias=False,
+ kernel_initializer='glorot_uniform',
+ recurrent_initializer='glorot_uniform',
+ projection_initializer='glorot_uniform',
+ bias_initializer='zeros',
+ **kwargs
+)
+```
+
+Initialize the parameters for a NAS cell.
+
+
+#### Args:
+
+
+* `units`: int, The number of units in the NAS cell.
+* `projection`: (optional) int, The output dimensionality for the
+ projection matrices. If None, no projection is performed.
+* `use_bias`: (optional) bool, If True then use biases within the cell.
+ This is False by default.
+* `kernel_initializer`: Initializer for kernel weight.
+* `recurrent_initializer`: Initializer for recurrent kernel weight.
+* `projection_initializer`: Initializer for projection weight, used when
+ projection is not None.
+* `bias_initializer`: Initializer for bias, used when use_bias is True.
+* `**kwargs`: Additional keyword arguments.
+
+
+
+## Properties
+
+### `activity_regularizer`
+
+Optional regularizer function for the output of this layer.
+
+
+### `dtype`
+
+
+
+
+### `dynamic`
+
+
+
+
+### `input`
+
+Retrieves the input tensor(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input tensor or list of input tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+* `AttributeError`: If no inbound nodes are found.
+
+### `input_mask`
+
+Retrieves the input mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input mask tensor (potentially None) or list of input
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+### `input_shape`
+
+Retrieves the input shape(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer, or if all inputs
+have the same shape.
+
+#### Returns:
+
+Input shape, as an integer shape tuple
+(or list of shape tuples, one tuple per input tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined input_shape.
+* `RuntimeError`: if called in Eager mode.
+
+
+
+
+
+
+### `losses`
+
+Losses which are associated with this `Layer`.
+
+Variable regularization tensors are created when this property is accessed,
+so it is eager safe: accessing `losses` under a `tf.GradientTape` will
+propagate gradients back to the corresponding variables.
+
+#### Returns:
+
+A list of tensors.
+
+
+### `metrics`
+
+
+
+
+### `name`
+
+
+
+
+### `name_scope`
+
+Returns a `tf.name_scope` instance for this class.
+
+
+### `non_trainable_variables`
+
+
+
+
+### `non_trainable_weights`
+
+
+
+
+### `output`
+
+Retrieves the output tensor(s) of a layer.
+
+Only applicable if the layer has exactly one output,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output tensor or list of output tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to more than one incoming
+ layers.
+* `RuntimeError`: if called in Eager mode.
+
+### `output_mask`
+
+Retrieves the output mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output mask tensor (potentially None) or list of output
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+### `output_shape`
+
+Retrieves the output shape(s) of a layer.
+
+Only applicable if the layer has one output,
+or if all outputs have the same shape.
+
+#### Returns:
+
+Output shape, as an integer shape tuple
+(or list of shape tuples, one tuple per output tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined output shape.
+* `RuntimeError`: if called in Eager mode.
+
+### `output_size`
+
+
+
+
+### `state_size`
+
+
+
+
+### `submodules`
+
+Sequence of all sub-modules.
+
+Submodules are modules which are properties of this module, or found as
+properties of modules which are properties of this module (and so on).
+
+```
+a = tf.Module()
+b = tf.Module()
+c = tf.Module()
+a.b = b
+b.c = c
+assert list(a.submodules) == [b, c]
+assert list(b.submodules) == [c]
+assert list(c.submodules) == []
+```
+
+#### Returns:
+
+A sequence of all submodules.
+
+
+### `trainable`
+
+
+
+
+### `trainable_variables`
+
+
+
+
+### `trainable_weights`
+
+
+
+
+### `updates`
+
+
+
+
+### `variables`
+
+Returns the list of all layer variables/weights.
+
+Alias of `self.weights`.
+
+#### Returns:
+
+A list of variables.
+
+
+### `weights`
+
+Returns the list of all layer variables/weights.
+
+
+#### Returns:
+
+A list of variables.
+
+
+
+
+## Methods
+
+### `__call__`
+
+``` python
+__call__(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Wraps `call`, applying pre- and post-processing steps.
+
+
+#### Arguments:
+
+
+* `inputs`: input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+
+#### Note:
+
+- The following optional keyword arguments are reserved for specific uses:
+ * `training`: Boolean scalar tensor of Python boolean indicating
+ whether the `call` is meant for training or inference.
+ * `mask`: Boolean input mask.
+- If the layer's `call` method takes a `mask` argument (as some Keras
+ layers do), its default value will be set to the mask generated
+ for `inputs` by the previous layer (if `inputs` did come from
+ a layer that generated a corresponding mask, i.e. if it came from
+ a Keras layer with masking support).
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer's `call` method returns None (an invalid value).
+
+### `apply`
+
+``` python
+apply(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Apply the layer on an input.
+
+This is an alias of `self.__call__`.
+
+#### Arguments:
+
+
+* `inputs`: Input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+### `build`
+
+``` python
+build(inputs_shape)
+```
+
+
+
+
+### `compute_mask`
+
+``` python
+compute_mask(
+ inputs,
+ mask=None
+)
+```
+
+Computes an output mask tensor.
+
+
+#### Arguments:
+
+
+* `inputs`: Tensor or list of tensors.
+* `mask`: Tensor or list of tensors.
+
+
+#### Returns:
+
+None or a tensor (or list of tensors,
+ one per output tensor of the layer).
+
+
+### `compute_output_shape`
+
+``` python
+compute_output_shape(input_shape)
+```
+
+Computes the output shape of the layer.
+
+Assumes that the layer will be built
+to match the input shape provided.
+
+#### Arguments:
+
+
+* `input_shape`: Shape tuple (tuple of integers)
+ or list of shape tuples (one per output tensor of the layer).
+ Shape tuples can include None for free dimensions,
+ instead of an integer.
+
+
+#### Returns:
+
+An output shape tuple.
+
+
+### `count_params`
+
+``` python
+count_params()
+```
+
+Count the total number of scalars composing the weights.
+
+
+#### Returns:
+
+An integer count.
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer isn't yet built
+ (in which case its weights aren't yet defined).
+
+### `from_config`
+
+``` python
+from_config(
+ cls,
+ config
+)
+```
+
+Creates a layer from its config.
+
+This method is the reverse of `get_config`,
+capable of instantiating the same layer from the config
+dictionary. It does not handle layer connectivity
+(handled by Network), nor weights (handled by `set_weights`).
+
+#### Arguments:
+
+
+* `config`: A Python dictionary, typically the
+ output of get_config.
+
+
+#### Returns:
+
+A layer instance.
+
+
+### `get_config`
+
+``` python
+get_config()
+```
+
+
+
+
+### `get_initial_state`
+
+``` python
+get_initial_state(
+ inputs=None,
+ batch_size=None,
+ dtype=None
+)
+```
+
+
+
+
+### `get_input_at`
+
+``` python
+get_input_at(node_index)
+```
+
+Retrieves the input tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+### `get_input_mask_at`
+
+``` python
+get_input_mask_at(node_index)
+```
+
+Retrieves the input mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple inputs).
+
+
+### `get_input_shape_at`
+
+``` python
+get_input_shape_at(node_index)
+```
+
+Retrieves the input shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+### `get_losses_for`
+
+``` python
+get_losses_for(inputs)
+```
+
+Retrieves losses relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of loss tensors of the layer that depend on `inputs`.
+
+
+### `get_output_at`
+
+``` python
+get_output_at(node_index)
+```
+
+Retrieves the output tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+### `get_output_mask_at`
+
+``` python
+get_output_mask_at(node_index)
+```
+
+Retrieves the output mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple outputs).
+
+
+### `get_output_shape_at`
+
+``` python
+get_output_shape_at(node_index)
+```
+
+Retrieves the output shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+### `get_updates_for`
+
+``` python
+get_updates_for(inputs)
+```
+
+Retrieves updates relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of update ops of the layer that depend on `inputs`.
+
+
+### `get_weights`
+
+``` python
+get_weights()
+```
+
+Returns the current weights of the layer.
+
+
+#### Returns:
+
+Weights values as a list of numpy arrays.
+
+
+### `set_weights`
+
+``` python
+set_weights(weights)
+```
+
+Sets the weights of the layer, from Numpy arrays.
+
+
+#### Arguments:
+
+
+* `weights`: a list of Numpy arrays. The number
+ of arrays and their shape must match
+ number of the dimensions of the weights
+ of the layer (i.e. it should match the
+ output of `get_weights`).
+
+
+#### Raises:
+
+
+* `ValueError`: If the provided weights list does not match the
+ layer's specifications.
+
+### `with_name_scope`
+
+``` python
+with_name_scope(
+ cls,
+ method
+)
+```
+
+Decorator to automatically enter the module name scope.
+
+```
+class MyModule(tf.Module):
+ @tf.Module.with_name_scope
+ def __call__(self, x):
+ if not hasattr(self, 'w'):
+ self.w = tf.Variable(tf.random.normal([x.shape[1], 64]))
+ return tf.matmul(x, self.w)
+```
+
+Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose
+names included the module name:
+
+```
+mod = MyModule()
+mod(tf.ones([8, 32]))
+# ==> <tf.Tensor: shape=(8, 64), dtype=float32, ...>
+mod.w
+# ==> <tf.Variable 'my_module/w:0' shape=(32, 64), ...>
+```
+
+#### Args:
+
+
+* `method`: The method to wrap.
+
+
+#### Returns:
+
+The original method wrapped such that it enters the module's name scope.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/rnn/cell.md b/docs/api_docs/python/tfa/rnn/cell.md
new file mode 100644
index 0000000000..d271c28ee1
--- /dev/null
+++ b/docs/api_docs/python/tfa/rnn/cell.md
@@ -0,0 +1,22 @@
+
+
+
+
+
+# Module: tfa.rnn.cell
+
+Module for RNN Cells.
+
+
+
+Defined in [`rnn/cell.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/rnn/cell.py).
+
+
+
+
+## Classes
+
+[`class LayerNormLSTMCell`](../../tfa/rnn/LayerNormLSTMCell.md): LSTM cell with layer normalization and recurrent dropout.
+
+[`class NASCell`](../../tfa/rnn/NASCell.md): Neural Architecture Search (NAS) recurrent network cell.
+
diff --git a/docs/api_docs/python/tfa/seq2seq.md b/docs/api_docs/python/tfa/seq2seq.md
new file mode 100644
index 0000000000..6b734a1491
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq.md
@@ -0,0 +1,96 @@
+
+
+
+
+
+# Module: tfa.seq2seq
+
+Ops for building neural network sequence to sequence decoders and losses.
+
+
+
+Defined in [`seq2seq/__init__.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/__init__.py).
+
+
+
+
+## Modules
+
+[`attention_wrapper`](../tfa/seq2seq/attention_wrapper.md) module: A powerful dynamic attention wrapper object.
+
+[`basic_decoder`](../tfa/seq2seq/basic_decoder.md) module: A class of Decoders that may sample to generate the next input.
+
+[`beam_search_decoder`](../tfa/seq2seq/beam_search_decoder.md) module: A decoder that performs beam search.
+
+[`decoder`](../tfa/seq2seq/decoder.md) module: Seq2seq layer operations for use in neural networks.
+
+[`loss`](../tfa/seq2seq/loss.md) module: Seq2seq loss operations for use in sequence models.
+
+[`sampler`](../tfa/seq2seq/sampler.md) module: A library of samplers for use with SamplingDecoders.
+
+## Classes
+
+[`class AttentionMechanism`](../tfa/seq2seq/AttentionMechanism.md)
+
+[`class AttentionWrapper`](../tfa/seq2seq/AttentionWrapper.md): Wraps another `RNNCell` with attention.
+
+[`class AttentionWrapperState`](../tfa/seq2seq/AttentionWrapperState.md): `namedtuple` storing the state of an `AttentionWrapper`.
+
+[`class BahdanauAttention`](../tfa/seq2seq/BahdanauAttention.md): Implements Bahdanau-style (additive) attention.
+
+[`class BahdanauMonotonicAttention`](../tfa/seq2seq/BahdanauMonotonicAttention.md): Monotonic attention mechanism with Bahdanau-style energy function.
+
+[`class BaseDecoder`](../tfa/seq2seq/BaseDecoder.md): An RNN Decoder that is based on a Keras layer.
+
+[`class BasicDecoder`](../tfa/seq2seq/BasicDecoder.md): Basic sampling decoder.
+
+[`class BasicDecoderOutput`](../tfa/seq2seq/BasicDecoderOutput.md)
+
+[`class BeamSearchDecoder`](../tfa/seq2seq/BeamSearchDecoder.md): BeamSearch sampling decoder.
+
+[`class BeamSearchDecoderOutput`](../tfa/seq2seq/BeamSearchDecoderOutput.md)
+
+[`class BeamSearchDecoderState`](../tfa/seq2seq/BeamSearchDecoderState.md)
+
+[`class CustomSampler`](../tfa/seq2seq/CustomSampler.md): Base abstract class that allows the user to customize sampling.
+
+[`class Decoder`](../tfa/seq2seq/Decoder.md): An RNN Decoder abstract interface object.
+
+[`class FinalBeamSearchDecoderOutput`](../tfa/seq2seq/FinalBeamSearchDecoderOutput.md): Final outputs returned by the beam search after all decoding is finished.
+
+[`class GreedyEmbeddingSampler`](../tfa/seq2seq/GreedyEmbeddingSampler.md): A sampler for use during inference.
+
+[`class InferenceSampler`](../tfa/seq2seq/InferenceSampler.md): A helper to use during inference with a custom sampling function.
+
+[`class LuongAttention`](../tfa/seq2seq/LuongAttention.md): Implements Luong-style (multiplicative) attention scoring.
+
+[`class LuongMonotonicAttention`](../tfa/seq2seq/LuongMonotonicAttention.md): Monotonic attention mechanism with Luong-style energy function.
+
+[`class SampleEmbeddingSampler`](../tfa/seq2seq/SampleEmbeddingSampler.md): A sampler for use during inference.
+
+[`class Sampler`](../tfa/seq2seq/Sampler.md): Interface for implementing sampling in seq2seq decoders.
+
+[`class ScheduledEmbeddingTrainingSampler`](../tfa/seq2seq/ScheduledEmbeddingTrainingSampler.md): A training sampler that adds scheduled sampling.
+
+[`class ScheduledOutputTrainingSampler`](../tfa/seq2seq/ScheduledOutputTrainingSampler.md): A training sampler that adds scheduled sampling directly to outputs.
+
+[`class SequenceLoss`](../tfa/seq2seq/SequenceLoss.md): Weighted cross-entropy loss for a sequence of logits.
+
+[`class TrainingSampler`](../tfa/seq2seq/TrainingSampler.md): A Sampler for use during training.
+
+## Functions
+
+[`dynamic_decode(...)`](../tfa/seq2seq/dynamic_decode.md): Perform dynamic decoding with `decoder`.
+
+[`gather_tree_from_array(...)`](../tfa/seq2seq/gather_tree_from_array.md): Calculates the full beams for `TensorArray`s.
+
+[`hardmax(...)`](../tfa/seq2seq/hardmax.md): Returns batched one-hot vectors.
+
+[`monotonic_attention(...)`](../tfa/seq2seq/monotonic_attention.md): Compute monotonic attention distribution from choosing probabilities.
+
+[`safe_cumprod(...)`](../tfa/seq2seq/safe_cumprod.md): Computes cumprod of x in logspace using cumsum to avoid underflow.
+
+[`sequence_loss(...)`](../tfa/seq2seq/sequence_loss.md): Weighted cross-entropy loss for a sequence of logits.
+
+[`tile_batch(...)`](../tfa/seq2seq/tile_batch.md): Tile the batch dimension of a (possibly nested structure of) tensor(s).
+
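+#### Example:
+
+A minimal sketch of wiring these pieces together for teacher-forced
+training (the cell choice, layer sizes, and input shapes below are
+illustrative assumptions, not part of this API reference):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+vocab_size, embedding_dim, units = 100, 16, 32
+batch_size, max_time = 8, 11
+
+# Decoder cell, sampler, and output projection.
+cell = tf.keras.layers.LSTMCell(units)
+sampler = tfa.seq2seq.TrainingSampler()
+decoder = tfa.seq2seq.BasicDecoder(
+    cell, sampler, output_layer=tf.keras.layers.Dense(vocab_size))
+
+# Embedded (teacher-forced) decoder inputs and their lengths.
+inputs = tf.random.normal([batch_size, max_time, embedding_dim])
+sequence_length = tf.fill([batch_size], max_time)
+
+initial_state = cell.get_initial_state(
+    batch_size=batch_size, dtype=tf.float32)
+outputs, _, _ = decoder(
+    inputs, initial_state=initial_state, sequence_length=sequence_length)
+logits = outputs.rnn_output  # [batch_size, max_time, vocab_size]
+```
+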
diff --git a/docs/api_docs/python/tfa/seq2seq/AttentionMechanism.md b/docs/api_docs/python/tfa/seq2seq/AttentionMechanism.md
new file mode 100644
index 0000000000..ef8158bd10
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/AttentionMechanism.md
@@ -0,0 +1,41 @@
+
+
+
+
+
+
+
+# tfa.seq2seq.AttentionMechanism
+
+## Class `AttentionMechanism`
+
+
+
+
+
+### Aliases:
+
+* Class `tfa.seq2seq.AttentionMechanism`
+* Class `tfa.seq2seq.attention_wrapper.AttentionMechanism`
+
+
+
+Defined in [`seq2seq/attention_wrapper.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/attention_wrapper.py).
+
+
+
+
+## Properties
+
+alignments_size
+
+
+
+
+state_size
+
+
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/AttentionWrapper.md b/docs/api_docs/python/tfa/seq2seq/AttentionWrapper.md
new file mode 100644
index 0000000000..f45fe89c7b
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/AttentionWrapper.md
@@ -0,0 +1,1000 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# tfa.seq2seq.AttentionWrapper
+
+## Class `AttentionWrapper`
+
+Wraps another `RNNCell` with attention.
+
+
+
+### Aliases:
+
+* Class `tfa.seq2seq.AttentionWrapper`
+* Class `tfa.seq2seq.attention_wrapper.AttentionWrapper`
+
+
+
+Defined in [`seq2seq/attention_wrapper.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/attention_wrapper.py).
+
+
+
+
+__init__
+
+``` python
+__init__(
+ cell,
+ attention_mechanism,
+ attention_layer_size=None,
+ alignment_history=False,
+ cell_input_fn=None,
+ output_attention=True,
+ initial_cell_state=None,
+ name=None,
+ attention_layer=None,
+ attention_fn=None
+)
+```
+
+Construct the `AttentionWrapper`.
+
+**NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped
+in `AttentionWrapper`, then you must ensure that:
+
+- The encoder output has been tiled to `beam_width` via
+  `tfa.seq2seq.tile_batch` (NOT `tf.tile`).
+- The `batch_size` argument passed to the `get_initial_state` method of
+ this wrapper is equal to `true_batch_size * beam_width`.
+- The initial state created with `get_initial_state` above contains a
+ `cell_state` value containing properly tiled final state from the
+ encoder.
+
+#### An example:
+
+
+
+```
+tiled_encoder_outputs = tfa.seq2seq.tile_batch(
+    encoder_outputs, multiplier=beam_width)
+tiled_encoder_final_state = tfa.seq2seq.tile_batch(
+    encoder_final_state, multiplier=beam_width)
+tiled_sequence_length = tfa.seq2seq.tile_batch(
+    sequence_length, multiplier=beam_width)
+attention_mechanism = MyFavoriteAttentionMechanism(
+    num_units=attention_depth,
+    memory=tiled_encoder_outputs,
+    memory_sequence_length=tiled_sequence_length)
+attention_cell = AttentionWrapper(cell, attention_mechanism, ...)
+decoder_initial_state = attention_cell.get_initial_state(
+ batch_size=true_batch_size * beam_width, dtype=dtype)
+decoder_initial_state = decoder_initial_state.clone(
+ cell_state=tiled_encoder_final_state)
+```
+
+#### Args:
+
+
+* `cell`: An instance of `RNNCell`.
+* `attention_mechanism`: A list of `AttentionMechanism` instances or a
+ single instance.
+* `attention_layer_size`: A list of Python integers or a single Python
+ integer, the depth of the attention (output) layer(s). If None
+ (default), use the context as attention at each time step.
+ Otherwise, feed the context and cell output into the attention
+ layer to generate attention at each time step. If
+ attention_mechanism is a list, attention_layer_size must be a list
+ of the same length. If attention_layer is set, this must be None.
+  If attention_fn is set, it must be guaranteed that the outputs of
+  attention_fn also meet the above requirements.
+* `alignment_history`: Python boolean, whether to store alignment history
+ from all time steps in the final output state (currently stored as
+ a time major `TensorArray` on which you must call `stack()`).
+* `cell_input_fn`: (optional) A `callable`. The default is:
+ `lambda inputs, attention:
+ tf.concat([inputs, attention], -1)`.
+* `output_attention`: Python bool. If `True` (default), the output at
+ each time step is the attention value. This is the behavior of
+ Luong-style attention mechanisms. If `False`, the output at each
+ time step is the output of `cell`. This is the behavior of
+  Bahdanau-style attention mechanisms. In both cases, the
+ `attention` tensor is propagated to the next time step via the
+ state and is used there. This flag only controls whether the
+ attention mechanism is propagated up to the next cell in an RNN
+ stack or to the top RNN output.
+* `initial_cell_state`: The initial state value to use for the cell when
+ the user calls `get_initial_state()`. Note that if this value is
+ provided now, and the user uses a `batch_size` argument of
+ `get_initial_state` which does not match the batch size of
+ `initial_cell_state`, proper behavior is not guaranteed.
+* `name`: Name to use when creating ops.
+* `attention_layer`: A list of `tf.keras.layers.Layer` instances or a
+  single `tf.keras.layers.Layer` instance taking the context
+ and cell output as inputs to generate attention at each time step.
+ If None (default), use the context as attention at each time step.
+ If attention_mechanism is a list, attention_layer must be a list of
+  the same length. If attention_layer_size is set, this must be
+ None.
+* `attention_fn`: An optional callable function that allows users to
+ provide their own customized attention function, which takes input
+ (attention_mechanism, cell_output, attention_state,
+ attention_layer) and outputs (attention, alignments,
+ next_attention_state). If provided, the attention_layer_size should
+ be the size of the outputs of attention_fn.
+
+
+#### Raises:
+
+
+* `TypeError`: `attention_layer_size` is not None and
+ (`attention_mechanism` is a list but `attention_layer_size` is not;
+ or vice versa).
+* `ValueError`: if `attention_layer_size` is not None,
+ `attention_mechanism` is a list, and its length does not match that
+ of `attention_layer_size`; if `attention_layer_size` and
+ `attention_layer` are set simultaneously.
+
+
+
+## Properties
+
+activity_regularizer
+
+Optional regularizer function for the output of this layer.
+
+
+dtype
+
+
+
+
+dynamic
+
+
+
+
+
+input
+
+Retrieves the input tensor(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input tensor or list of input tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+* `RuntimeError`: If called in Eager mode.
+* `AttributeError`: If no inbound nodes are found.
+
+
+input_mask
+
+Retrieves the input mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input mask tensor (potentially None) or list of input
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+
+
+input_shape
+
+Retrieves the input shape(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer, or if all inputs
+have the same shape.
+
+#### Returns:
+
+Input shape, as an integer shape tuple
+(or list of shape tuples, one tuple per input tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined input_shape.
+* `RuntimeError`: if called in Eager mode.
+
+
+
+
+
+
+losses
+
+Losses which are associated with this `Layer`.
+
+Variable regularization tensors are created when this property is accessed,
+so it is eager safe: accessing `losses` under a `tf.GradientTape` will
+propagate gradients back to the corresponding variables.
+
+#### Returns:
+
+A list of tensors.
+
+
+metrics
+
+
+
+
+name
+
+
+
+
+name_scope
+
+Returns a `tf.name_scope` instance for this class.
+
+
+non_trainable_variables
+
+
+
+
+non_trainable_weights
+
+
+
+
+output
+
+Retrieves the output tensor(s) of a layer.
+
+Only applicable if the layer has exactly one output,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output tensor or list of output tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to more than one incoming
+  layer.
+* `RuntimeError`: if called in Eager mode.
+
+output_mask
+
+Retrieves the output mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output mask tensor (potentially None) or list of output
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+
+output_shape
+
+Retrieves the output shape(s) of a layer.
+
+Only applicable if the layer has one output,
+or if all outputs have the same shape.
+
+#### Returns:
+
+Output shape, as an integer shape tuple
+(or list of shape tuples, one tuple per output tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined output shape.
+* `RuntimeError`: if called in Eager mode.
+
+output_size
+
+
+
+
+state_size
+
+The `state_size` property of `AttentionWrapper`.
+
+
+#### Returns:
+
+An `AttentionWrapperState` tuple containing shapes used
+by this object.
+
+
+submodules
+
+Sequence of all sub-modules.
+
+Submodules are modules which are properties of this module, or found as
+properties of modules which are properties of this module (and so on).
+
+```
+a = tf.Module()
+b = tf.Module()
+c = tf.Module()
+a.b = b
+b.c = c
+assert list(a.submodules) == [b, c]
+assert list(b.submodules) == [c]
+assert list(c.submodules) == []
+```
+
+#### Returns:
+
+A sequence of all submodules.
+
+
+trainable
+
+
+
+
+trainable_variables
+
+
+
+
+trainable_weights
+
+
+
+
+updates
+
+
+
+
+variables
+
+Returns the list of all layer variables/weights.
+
+Alias of `self.weights`.
+
+#### Returns:
+
+A list of variables.
+
+
+weights
+
+Returns the list of all layer variables/weights.
+
+
+#### Returns:
+
+A list of variables.
+
+
+
+
+## Methods
+
+__call__
+
+``` python
+__call__(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Wraps `call`, applying pre- and post-processing steps.
+
+
+#### Arguments:
+
+
+* `inputs`: input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+
+#### Note:
+
+- The following optional keyword arguments are reserved for specific uses:
+  * `training`: Boolean scalar tensor or Python boolean indicating
+ whether the `call` is meant for training or inference.
+ * `mask`: Boolean input mask.
+- If the layer's `call` method takes a `mask` argument (as some Keras
+ layers do), its default value will be set to the mask generated
+  for `inputs` by the previous layer (if `inputs` did come from
+  a layer that generated a corresponding mask, i.e. if it came from
+  a Keras layer with masking support).
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer's `call` method returns None (an invalid value).
+
+apply
+
+``` python
+apply(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Apply the layer on an input.
+
+This is an alias of `self.__call__`.
+
+#### Arguments:
+
+
+* `inputs`: Input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+build
+
+``` python
+build(input_shape)
+```
+
+Creates the variables of the layer (optional, for subclass implementers).
+
+This is a method that implementers of subclasses of `Layer` or `Model`
+can override if they need a state-creation step in-between
+layer instantiation and layer call.
+
+This is typically used to create the weights of `Layer` subclasses.
+
+#### Arguments:
+
+
+* `input_shape`: Instance of `TensorShape`, or list of instances of
+ `TensorShape` if the layer expects a list of inputs
+ (one instance per input).
+
+compute_mask
+
+``` python
+compute_mask(
+ inputs,
+ mask=None
+)
+```
+
+Computes an output mask tensor.
+
+
+#### Arguments:
+
+
+* `inputs`: Tensor or list of tensors.
+* `mask`: Tensor or list of tensors.
+
+
+#### Returns:
+
+None or a tensor (or list of tensors,
+ one per output tensor of the layer).
+
+
+compute_output_shape
+
+``` python
+compute_output_shape(input_shape)
+```
+
+Computes the output shape of the layer.
+
+Assumes that the layer will be built
+to match the input shape provided.
+
+#### Arguments:
+
+
+* `input_shape`: Shape tuple (tuple of integers)
+ or list of shape tuples (one per output tensor of the layer).
+ Shape tuples can include None for free dimensions,
+ instead of an integer.
+
+
+#### Returns:
+
+An output shape tuple.
+
+
+count_params
+
+``` python
+count_params()
+```
+
+Count the total number of scalars composing the weights.
+
+
+#### Returns:
+
+An integer count.
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer isn't yet built
+ (in which case its weights aren't yet defined).
+
+from_config
+
+``` python
+from_config(
+ cls,
+ config
+)
+```
+
+Creates a layer from its config.
+
+This method is the reverse of `get_config`,
+capable of instantiating the same layer from the config
+dictionary. It does not handle layer connectivity
+(handled by Network), nor weights (handled by `set_weights`).
+
+#### Arguments:
+
+
+* `config`: A Python dictionary, typically the
+ output of get_config.
+
+
+#### Returns:
+
+A layer instance.
+
+
+get_config
+
+``` python
+get_config()
+```
+
+Returns the config of the layer.
+
+A layer config is a Python dictionary (serializable)
+containing the configuration of a layer.
+The same layer can be reinstantiated later
+(without its trained weights) from this configuration.
+
+The config of a layer does not include connectivity
+information, nor the layer class name. These are handled
+by `Network` (one layer of abstraction above).
+
+#### Returns:
+
+Python dictionary.
+
+
+get_initial_state
+
+``` python
+get_initial_state(
+ inputs=None,
+ batch_size=None,
+ dtype=None
+)
+```
+
+Return an initial (zero) state tuple for this `AttentionWrapper`.
+
+**NOTE** Please see the initializer documentation for details of how
+to call `get_initial_state` if using an `AttentionWrapper` with a
+`BeamSearchDecoder`.
+
+#### Args:
+
+
+* `inputs`: The inputs that will be fed to this cell.
+* `batch_size`: `0D` integer tensor: the batch size.
+* `dtype`: The internal state data type.
+
+
+#### Returns:
+
+An `AttentionWrapperState` tuple containing zeroed out tensors and,
+possibly, empty `TensorArray` objects.
+
+
+
+#### Raises:
+
+
+* `ValueError`: (or, possibly at runtime, InvalidArgument), if
+ `batch_size` does not match the output size of the encoder passed
+ to the wrapper object at initialization time.
+
+
+get_input_at
+
+``` python
+get_input_at(node_index)
+```
+
+Retrieves the input tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+
+get_input_mask_at
+
+``` python
+get_input_mask_at(node_index)
+```
+
+Retrieves the input mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple inputs).
+
+
+
+get_input_shape_at
+
+``` python
+get_input_shape_at(node_index)
+```
+
+Retrieves the input shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_losses_for
+
+``` python
+get_losses_for(inputs)
+```
+
+Retrieves losses relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of loss tensors of the layer that depend on `inputs`.
+
+
+get_output_at
+
+``` python
+get_output_at(node_index)
+```
+
+Retrieves the output tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_output_mask_at
+
+``` python
+get_output_mask_at(node_index)
+```
+
+Retrieves the output mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple outputs).
+
+
+get_output_shape_at
+
+``` python
+get_output_shape_at(node_index)
+```
+
+Retrieves the output shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_updates_for
+
+``` python
+get_updates_for(inputs)
+```
+
+Retrieves updates relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of update ops of the layer that depend on `inputs`.
+
+
+get_weights
+
+``` python
+get_weights()
+```
+
+Returns the current weights of the layer.
+
+
+#### Returns:
+
+Weights values as a list of numpy arrays.
+
+
+set_weights
+
+``` python
+set_weights(weights)
+```
+
+Sets the weights of the layer, from Numpy arrays.
+
+
+#### Arguments:
+
+
+* `weights`: a list of Numpy arrays. The number
+  of arrays and their shapes must match the
+  number and shapes of the weights of the
+  layer (i.e. it should match the output of
+  `get_weights`).
+
+
+#### Raises:
+
+
+* `ValueError`: If the provided weights list does not match the
+ layer's specifications.
+
+with_name_scope
+
+``` python
+with_name_scope(
+ cls,
+ method
+)
+```
+
+Decorator to automatically enter the module name scope.
+
+```
+class MyModule(tf.Module):
+ @tf.Module.with_name_scope
+ def __call__(self, x):
+ if not hasattr(self, 'w'):
+ self.w = tf.Variable(tf.random.normal([x.shape[1], 64]))
+ return tf.matmul(x, self.w)
+```
+
+Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose
+names included the module name:
+
+```
+mod = MyModule()
+mod(tf.ones([8, 32]))
+# ==> <tf.Tensor: ...>
+mod.w
+# ==> <tf.Variable ...'my_module/w:0'>
+```
+
+#### Args:
+
+
+* `method`: The method to wrap.
+
+
+#### Returns:
+
+The original method wrapped such that it enters the module's name scope.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/AttentionWrapperState.md b/docs/api_docs/python/tfa/seq2seq/AttentionWrapperState.md
new file mode 100644
index 0000000000..c1d6d70c41
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/AttentionWrapperState.md
@@ -0,0 +1,123 @@
+
+
+
+
+
+
+
+
+
+
+
+
+# tfa.seq2seq.AttentionWrapperState
+
+## Class `AttentionWrapperState`
+
+`namedtuple` storing the state of an `AttentionWrapper`.
+
+
+
+### Aliases:
+
+* Class `tfa.seq2seq.AttentionWrapperState`
+* Class `tfa.seq2seq.attention_wrapper.AttentionWrapperState`
+
+
+
+Defined in [`seq2seq/attention_wrapper.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/attention_wrapper.py).
+
+
+
+
+#### Contains:
+
+
+- `cell_state`: The state of the wrapped `RNNCell` at the previous time
+ step.
+- `attention`: The attention emitted at the previous time step.
+- `time`: int32 scalar containing the current time step.
+- `alignments`: A single or tuple of `Tensor`(s) containing the
+ alignments emitted at the previous time step for each attention
+ mechanism.
+- `alignment_history`: (if enabled) a single or tuple of `TensorArray`(s)
+ containing alignment matrices from all time steps for each attention
+ mechanism. Call `stack()` on each to convert to a `Tensor`.
+- `attention_state`: A single or tuple of nested objects
+ containing attention mechanism state for each attention mechanism.
+ The objects may contain Tensors or TensorArrays.
+
+
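+For example (a sketch, assuming `alignment_history=True` was set on the
+wrapper and `final_state` is the `AttentionWrapperState` produced by
+decoding), the stored history can be converted to a dense tensor:
+
+``` python
+# TensorArray of per-step alignments -> Tensor shaped
+# [max_time, batch_size, memory_max_time].
+alignments = final_state.alignment_history.stack()
+```
+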
+## Properties
+
+cell_state
+
+
+
+
+attention
+
+
+
+
+time
+
+
+
+
+alignments
+
+
+
+
+alignment_history
+
+
+
+
+attention_state
+
+
+
+
+
+
+## Methods
+
+clone
+
+``` python
+clone(**kwargs)
+```
+
+Clone this object, overriding components provided by kwargs.
+
+The shapes of the new state fields must match the shapes of the
+original state fields. This will be validated, and the original
+fields' shapes will be propagated to the new fields.
+
+#### Example:
+
+
+
+```python
+initial_state = attention_wrapper.get_initial_state(
+ batch_size=..., dtype=...)
+initial_state = initial_state.clone(cell_state=encoder_state)
+```
+
+#### Args:
+
+
+* `**kwargs`: Any properties of the state object to replace in the
+ returned `AttentionWrapperState`.
+
+
+#### Returns:
+
+A new `AttentionWrapperState` whose properties are the same as
+this one, except any overridden properties as provided in `kwargs`.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/BahdanauAttention.md b/docs/api_docs/python/tfa/seq2seq/BahdanauAttention.md
new file mode 100644
index 0000000000..bd20a0647e
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/BahdanauAttention.md
@@ -0,0 +1,927 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# tfa.seq2seq.BahdanauAttention
+
+## Class `BahdanauAttention`
+
+Implements Bahdanau-style (additive) attention.
+
+
+
+### Aliases:
+
+* Class `tfa.seq2seq.BahdanauAttention`
+* Class `tfa.seq2seq.attention_wrapper.BahdanauAttention`
+
+
+
+Defined in [`seq2seq/attention_wrapper.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/attention_wrapper.py).
+
+
+
+This attention has two forms. The first is Bahdanau attention,
+as described in:
+
+Dzmitry Bahdanau, Kyunghyun Cho, Yoshua Bengio.
+"Neural Machine Translation by Jointly Learning to Align and Translate."
+ICLR 2015. https://arxiv.org/abs/1409.0473
+
+The second is the normalized form. This form is inspired by the
+weight normalization article:
+
+Tim Salimans, Diederik P. Kingma.
+"Weight Normalization: A Simple Reparameterization to Accelerate
+ Training of Deep Neural Networks."
+https://arxiv.org/abs/1602.07868
+
+To enable the second form, construct the object with parameter
+`normalize=True`.
+
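+#### Example:
+
+A minimal construction sketch (the shapes and the pairing with an
+`LSTMCell` inside an `AttentionWrapper` are illustrative assumptions,
+not requirements of this class):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+batch_size, max_time, depth, units = 4, 7, 16, 32
+
+# Encoder output used as the attention memory.
+memory = tf.random.normal([batch_size, max_time, depth])
+memory_sequence_length = tf.fill([batch_size], max_time)
+
+attention_mechanism = tfa.seq2seq.BahdanauAttention(
+    units=units,
+    memory=memory,
+    memory_sequence_length=memory_sequence_length,
+    normalize=True)  # enable the weight-normalized (second) form
+
+attention_cell = tfa.seq2seq.AttentionWrapper(
+    tf.keras.layers.LSTMCell(units),
+    attention_mechanism,
+    attention_layer_size=units)
+```
+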
+__init__
+
+``` python
+__init__(
+ units,
+ memory,
+ memory_sequence_length=None,
+ normalize=False,
+ probability_fn='softmax',
+ kernel_initializer='glorot_uniform',
+ dtype=None,
+ name='BahdanauAttention',
+ **kwargs
+)
+```
+
+Construct the Attention mechanism.
+
+
+#### Args:
+
+
+* `units`: The depth of the query mechanism.
+* `memory`: The memory to query; usually the output of an RNN encoder.
+ This tensor should be shaped `[batch_size, max_time, ...]`.
+* `memory_sequence_length`: (optional): Sequence lengths for the batch
+ entries in memory. If provided, the memory tensor rows are masked
+ with zeros for values past the respective sequence lengths.
+* `normalize`: Python boolean. Whether to normalize the energy term.
+* `probability_fn`: (optional) string, the name of the function used to
+  convert the attention score to probabilities. The default is
+  `'softmax'`, which corresponds to `tf.nn.softmax`. The other option
+  is `'hardmax'`, which uses the `hardmax()` function within this
+  module. Any other value will raise a validation error.
+* `kernel_initializer`: (optional), the name of the initializer for the
+ attention kernel.
+* `dtype`: The data type for the query and memory layers of the attention
+ mechanism.
+* `name`: Name to use when creating ops.
+* `**kwargs`: Dictionary that contains other common arguments for layer
+ creation.
+
+
+
+## Properties
+
+activity_regularizer
+
+Optional regularizer function for the output of this layer.
+
+
+alignments_size
+
+
+
+
+dtype
+
+
+
+
+dynamic
+
+
+
+
+
+input
+
+Retrieves the input tensor(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input tensor or list of input tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+* `RuntimeError`: If called in Eager mode.
+* `AttributeError`: If no inbound nodes are found.
+
+
+input_mask
+
+Retrieves the input mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input mask tensor (potentially None) or list of input
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+
+
+input_shape
+
+Retrieves the input shape(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer, or if all inputs
+have the same shape.
+
+#### Returns:
+
+Input shape, as an integer shape tuple
+(or list of shape tuples, one tuple per input tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined input_shape.
+* `RuntimeError`: if called in Eager mode.
+
+
+
+
+
+
+losses
+
+Losses which are associated with this `Layer`.
+
+Variable regularization tensors are created when this property is accessed,
+so it is eager safe: accessing `losses` under a `tf.GradientTape` will
+propagate gradients back to the corresponding variables.
+
+#### Returns:
+
+A list of tensors.
+
+
+metrics
+
+
+
+
+name
+
+
+
+
+name_scope
+
+Returns a `tf.name_scope` instance for this class.
+
+
+non_trainable_variables
+
+
+
+
+non_trainable_weights
+
+
+
+
+output
+
+Retrieves the output tensor(s) of a layer.
+
+Only applicable if the layer has exactly one output,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output tensor or list of output tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to more than one incoming
+  layer.
+* `RuntimeError`: if called in Eager mode.
+
+output_mask
+
+Retrieves the output mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output mask tensor (potentially None) or list of output
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+
+output_shape
+
+Retrieves the output shape(s) of a layer.
+
+Only applicable if the layer has one output,
+or if all outputs have the same shape.
+
+#### Returns:
+
+Output shape, as an integer shape tuple
+(or list of shape tuples, one tuple per output tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined output shape.
+* `RuntimeError`: if called in Eager mode.
+
+state_size
+
+
+
+
+submodules
+
+Sequence of all sub-modules.
+
+Submodules are modules which are properties of this module, or found as
+properties of modules which are properties of this module (and so on).
+
+```
+a = tf.Module()
+b = tf.Module()
+c = tf.Module()
+a.b = b
+b.c = c
+assert list(a.submodules) == [b, c]
+assert list(b.submodules) == [c]
+assert list(c.submodules) == []
+```
+
+#### Returns:
+
+A sequence of all submodules.
+
+
+trainable
+
+
+
+
+trainable_variables
+
+
+
+
+trainable_weights
+
+
+
+
+updates
+
+
+
+
+variables
+
+Returns the list of all layer variables/weights.
+
+Alias of `self.weights`.
+
+#### Returns:
+
+A list of variables.
+
+
+weights
+
+Returns the list of all layer variables/weights.
+
+
+#### Returns:
+
+A list of variables.
+
+
+
+
+## Methods
+
+__call__
+
+``` python
+__call__(
+ inputs,
+ **kwargs
+)
+```
+
+Preprocess the inputs before calling `base_layer.__call__()`.
+
+Note that there are two situations here: one for setting up the memory,
+and one with the actual query and state.
+1. When the memory has not been configured, we just pass all the
+   parameters to `base_layer.__call__()`, which will then invoke
+   `self.call()` with the proper inputs and allow this class to set up
+   the memory.
+2. When the memory has already been set up, the input should contain
+   the query and state, and optionally the processed memory. If the
+   processed memory is not included in the input, we have to append it
+   to the inputs and give it to `base_layer.__call__()`. The processed
+   memory is the output of the first invocation of `self.__call__()`.
+   If we don't add it here, then from the Keras perspective the graph
+   is disconnected, since the output from the previous call is never
+   used.
+
+#### Args:
+
+
+* `inputs`: the input tensors.
+* `**kwargs`: dict, other keyword arguments for `__call__()`.
+
+apply
+
+``` python
+apply(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Apply the layer on an input.
+
+This is an alias of `self.__call__`.
+
+#### Arguments:
+
+
+* `inputs`: Input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+build
+
+``` python
+build(input_shape)
+```
+
+
+
+
+compute_mask
+
+``` python
+compute_mask(
+ inputs,
+ mask=None
+)
+```
+
+
+
+
+compute_output_shape
+
+``` python
+compute_output_shape(input_shape)
+```
+
+Computes the output shape of the layer.
+
+Assumes that the layer will be built
+to match the input shape provided.
+
+#### Arguments:
+
+
+* `input_shape`: Shape tuple (tuple of integers)
+ or list of shape tuples (one per output tensor of the layer).
+ Shape tuples can include None for free dimensions,
+ instead of an integer.
+
+
+#### Returns:
+
+An output shape tuple.
+
+
+count_params
+
+``` python
+count_params()
+```
+
+Count the total number of scalars composing the weights.
+
+
+#### Returns:
+
+An integer count.
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer isn't yet built
+ (in which case its weights aren't yet defined).
+
+deserialize_inner_layer_from_config
+
+``` python
+deserialize_inner_layer_from_config(
+ cls,
+ config,
+ custom_objects
+)
+```
+
+Helper method that reconstructs the query and memory layers from the
+config.
+
+In the `get_config()` method, the query and memory layer configs are
+serialized into dicts for persistence; this method performs the reverse
+action to reconstruct the layers from the config.
+
+#### Args:
+
+
+* `config`: dict, the configs that will be used to reconstruct the
+ object.
+* `custom_objects`: dict mapping class names (or function names) of
+ custom (non-Keras) objects to class/functions.
+
+#### Returns:
+
+
+* `config`: dict, the config with layer instance created, which is ready
+ to be used as init parameters.
+
+from_config
+
+``` python
+@classmethod
+from_config(
+ cls,
+ config,
+ custom_objects=None
+)
+```
+
+
+
+
+get_config
+
+``` python
+get_config()
+```
+
+
+
+
+
+get_input_at
+
+``` python
+get_input_at(node_index)
+```
+
+Retrieves the input tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+
+get_input_mask_at
+
+``` python
+get_input_mask_at(node_index)
+```
+
+Retrieves the input mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple inputs).
+
+
+
+get_input_shape_at
+
+``` python
+get_input_shape_at(node_index)
+```
+
+Retrieves the input shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_losses_for
+
+``` python
+get_losses_for(inputs)
+```
+
+Retrieves losses relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of loss tensors of the layer that depend on `inputs`.
+
+
+get_output_at
+
+``` python
+get_output_at(node_index)
+```
+
+Retrieves the output tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_output_mask_at
+
+``` python
+get_output_mask_at(node_index)
+```
+
+Retrieves the output mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple outputs).
+
+
+get_output_shape_at
+
+``` python
+get_output_shape_at(node_index)
+```
+
+Retrieves the output shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_updates_for
+
+``` python
+get_updates_for(inputs)
+```
+
+Retrieves updates relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of update ops of the layer that depend on `inputs`.
+
+
+get_weights
+
+``` python
+get_weights()
+```
+
+Returns the current weights of the layer.
+
+
+#### Returns:
+
+Weights values as a list of numpy arrays.
+
+
+initial_alignments
+
+``` python
+initial_alignments(
+ batch_size,
+ dtype
+)
+```
+
+Creates the initial alignment values for the `AttentionWrapper`
+class.
+
+This is important for AttentionMechanisms that use the previous
+alignment to calculate the alignment at the next time step
+(e.g. monotonic attention).
+
+The default behavior is to return a tensor of all zeros.
+
+#### Args:
+
+
+* `batch_size`: `int32` scalar, the batch_size.
+* `dtype`: The `dtype`.
+
+
+#### Returns:
+
+A `dtype` tensor shaped `[batch_size, alignments_size]`
+(`alignments_size` is the values' `max_time`).
+
+
+initial_state
+
+``` python
+initial_state(
+ batch_size,
+ dtype
+)
+```
+
+Creates the initial state values for the `AttentionWrapper` class.
+
+This is important for AttentionMechanisms that use the previous
+alignment to calculate the alignment at the next time step
+(e.g. monotonic attention).
+
+The default behavior is to return the same output as
+initial_alignments.
+
+#### Args:
+
+
+* `batch_size`: `int32` scalar, the batch_size.
+* `dtype`: The `dtype`.
+
+
+#### Returns:
+
+A structure of all-zero tensors with shapes as described by
+`state_size`.
+
+
+set_weights
+
+``` python
+set_weights(weights)
+```
+
+Sets the weights of the layer, from Numpy arrays.
+
+
+#### Arguments:
+
+
+* `weights`: a list of Numpy arrays. The number
+  of arrays and their shapes must match the
+  number and shapes of the weights of the
+  layer (i.e. it should match the output of
+  `get_weights`).
+
+
+#### Raises:
+
+
+* `ValueError`: If the provided weights list does not match the
+ layer's specifications.
+
+with_name_scope
+
+``` python
+with_name_scope(
+ cls,
+ method
+)
+```
+
+Decorator to automatically enter the module name scope.
+
+```
+class MyModule(tf.Module):
+ @tf.Module.with_name_scope
+ def __call__(self, x):
+ if not hasattr(self, 'w'):
+ self.w = tf.Variable(tf.random.normal([x.shape[1], 64]))
+ return tf.matmul(x, self.w)
+```
+
+Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose
+names included the module name:
+
+```
+mod = MyModule()
+mod(tf.ones([8, 32]))
+# ==> <tf.Tensor: ...>
+mod.w
+# ==> <tf.Variable ...'my_module/w:0'>
+```
+
+#### Args:
+
+
+* `method`: The method to wrap.
+
+
+#### Returns:
+
+The original method wrapped such that it enters the module's name scope.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/BahdanauMonotonicAttention.md b/docs/api_docs/python/tfa/seq2seq/BahdanauMonotonicAttention.md
new file mode 100644
index 0000000000..58f4e5b883
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/BahdanauMonotonicAttention.md
@@ -0,0 +1,925 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# tfa.seq2seq.BahdanauMonotonicAttention
+
+## Class `BahdanauMonotonicAttention`
+
+Monotonic attention mechanism with Bahdanau-style energy function.
+
+
+
+### Aliases:
+
+* Class `tfa.seq2seq.BahdanauMonotonicAttention`
+* Class `tfa.seq2seq.attention_wrapper.BahdanauMonotonicAttention`
+
+
+
+Defined in [`seq2seq/attention_wrapper.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/attention_wrapper.py).
+
+
+
+This type of attention enforces a monotonic constraint on the attention
+distributions; that is, once the model attends to a given point in the
+memory, it can't attend to any prior points at subsequent output
+timesteps. It achieves this by using `_monotonic_probability_fn` instead
+of softmax to construct its attention distributions. Since the attention
+scores are passed through a sigmoid, a learnable scalar bias parameter is
+applied after the score function and before the sigmoid. Otherwise, it is
+equivalent to BahdanauAttention. This approach is proposed in
+
+Colin Raffel, Minh-Thang Luong, Peter J. Liu, Ron J. Weiss, Douglas Eck,
+"Online and Linear-Time Attention by Enforcing Monotonic Alignments."
+ICML 2017. https://arxiv.org/abs/1704.00784
+
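+#### Example:
+
+A minimal construction sketch (the shapes and hyperparameter values are
+illustrative assumptions; `score_bias_init` is set negative as the
+argument documentation below recommends for long memories):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+batch_size, max_time, depth, units = 4, 50, 16, 32
+
+# Encoder output used as the attention memory.
+memory = tf.random.normal([batch_size, max_time, depth])
+
+attention_mechanism = tfa.seq2seq.BahdanauMonotonicAttention(
+    units=units,
+    memory=memory,
+    sigmoid_noise=1.0,     # pre-sigmoid noise; see _monotonic_probability_fn
+    score_bias_init=-4.0,  # negative init for a long memory
+    mode='parallel')
+```
+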
+__init__
+
+``` python
+__init__(
+ units,
+ memory,
+ memory_sequence_length=None,
+ normalize=False,
+ sigmoid_noise=0.0,
+ sigmoid_noise_seed=None,
+ score_bias_init=0.0,
+ mode='parallel',
+ kernel_initializer='glorot_uniform',
+ dtype=None,
+ name='BahdanauMonotonicAttention',
+ **kwargs
+)
+```
+
+Construct the Attention mechanism.
+
+
+#### Args:
+
+
+* `units`: The depth of the query mechanism.
+* `memory`: The memory to query; usually the output of an RNN encoder.
+ This tensor should be shaped `[batch_size, max_time, ...]`.
+* `memory_sequence_length`: (optional): Sequence lengths for the batch
+ entries in memory. If provided, the memory tensor rows are masked
+ with zeros for values past the respective sequence lengths.
+* `normalize`: Python boolean. Whether to normalize the energy term.
+* `sigmoid_noise`: Standard deviation of pre-sigmoid noise. See the
+ docstring for `_monotonic_probability_fn` for more information.
+* `sigmoid_noise_seed`: (optional) Random seed for pre-sigmoid noise.
+* `score_bias_init`: Initial value for score bias scalar. It's
+ recommended to initialize this to a negative value when the length
+ of the memory is large.
+* `mode`: How to compute the attention distribution. Must be one of
+ 'recursive', 'parallel', or 'hard'. See the docstring for
+  `tfa.seq2seq.monotonic_attention` for more information.
+* `kernel_initializer`: (optional), the name of the initializer for the
+ attention kernel.
+* `dtype`: The data type for the query and memory layers of the attention
+ mechanism.
+* `name`: Name to use when creating ops.
+* `**kwargs`: Dictionary that contains other common arguments for layer
+ creation.
+
+
+
+## Properties
+
+activity_regularizer
+
+Optional regularizer function for the output of this layer.
+
+
+alignments_size
+
+
+
+
+dtype
+
+
+
+
+dynamic
+
+
+
+
+
+input
+
+Retrieves the input tensor(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input tensor or list of input tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+* `RuntimeError`: If called in Eager mode.
+* `AttributeError`: If no inbound nodes are found.
+
+
+input_mask
+
+Retrieves the input mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input mask tensor (potentially None) or list of input
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+
+
+input_shape
+
+Retrieves the input shape(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer, or if all inputs
+have the same shape.
+
+#### Returns:
+
+Input shape, as an integer shape tuple
+(or list of shape tuples, one tuple per input tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined input_shape.
+* `RuntimeError`: if called in Eager mode.
+
+
+
+
+
+
+losses
+
+Losses which are associated with this `Layer`.
+
+Variable regularization tensors are created when this property is accessed,
+so it is eager safe: accessing `losses` under a `tf.GradientTape` will
+propagate gradients back to the corresponding variables.
+
+#### Returns:
+
+A list of tensors.
+
+
+metrics
+
+
+
+
+name
+
+
+
+
+name_scope
+
+Returns a `tf.name_scope` instance for this class.
+
+
+non_trainable_variables
+
+
+
+
+non_trainable_weights
+
+
+
+
+output
+
+Retrieves the output tensor(s) of a layer.
+
+Only applicable if the layer has exactly one output,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output tensor or list of output tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to more than one incoming
+  layer.
+* `RuntimeError`: if called in Eager mode.
+
+output_mask
+
+Retrieves the output mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output mask tensor (potentially None) or list of output
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+
+output_shape
+
+Retrieves the output shape(s) of a layer.
+
+Only applicable if the layer has one output,
+or if all outputs have the same shape.
+
+#### Returns:
+
+Output shape, as an integer shape tuple
+(or list of shape tuples, one tuple per output tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined output shape.
+* `RuntimeError`: if called in Eager mode.
+
+state_size
+
+
+
+
+submodules
+
+Sequence of all sub-modules.
+
+Submodules are modules which are properties of this module, or found as
+properties of modules which are properties of this module (and so on).
+
+```
+a = tf.Module()
+b = tf.Module()
+c = tf.Module()
+a.b = b
+b.c = c
+assert list(a.submodules) == [b, c]
+assert list(b.submodules) == [c]
+assert list(c.submodules) == []
+```
+
+#### Returns:
+
+A sequence of all submodules.
+
+
+trainable
+
+
+
+
+trainable_variables
+
+
+
+
+trainable_weights
+
+
+
+
+updates
+
+
+
+
+variables
+
+Returns the list of all layer variables/weights.
+
+Alias of `self.weights`.
+
+#### Returns:
+
+A list of variables.
+
+
+weights
+
+Returns the list of all layer variables/weights.
+
+
+#### Returns:
+
+A list of variables.
+
+
+
+
+## Methods
+
+__call__
+
+``` python
+__call__(
+ inputs,
+ **kwargs
+)
+```
+
+Preprocess the inputs before calling `base_layer.__call__()`.
+
+Note that there are two situations here: one for setting up the memory,
+and one with the actual query and state.
+1. When the memory has not been configured, we just pass all the
+   parameters to `base_layer.__call__()`, which will then invoke
+   `self.call()` with the proper inputs and allow this class to set up
+   the memory.
+2. When the memory has already been set up, the input should contain
+   the query and state, and optionally the processed memory. If the
+   processed memory is not included in the input, we have to append it
+   to the inputs and give it to `base_layer.__call__()`. The processed
+   memory is the output of the first invocation of `self.__call__()`.
+   If we don't add it here, then from the Keras perspective the graph
+   is disconnected, since the output from the previous call is never
+   used.
+
+#### Args:
+
+
+* `inputs`: the input tensors.
+* `**kwargs`: dict, other keyword arguments for `__call__()`.
+
+apply
+
+``` python
+apply(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Apply the layer on an input.
+
+This is an alias of `self.__call__`.
+
+#### Arguments:
+
+
+* `inputs`: Input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+build
+
+``` python
+build(input_shape)
+```
+
+
+
+
+compute_mask
+
+``` python
+compute_mask(
+ inputs,
+ mask=None
+)
+```
+
+
+
+
+compute_output_shape
+
+``` python
+compute_output_shape(input_shape)
+```
+
+Computes the output shape of the layer.
+
+Assumes that the layer will be built
+to match the input shape provided.
+
+#### Arguments:
+
+
+* `input_shape`: Shape tuple (tuple of integers)
+ or list of shape tuples (one per output tensor of the layer).
+ Shape tuples can include None for free dimensions,
+ instead of an integer.
+
+
+#### Returns:
+
+An output shape tuple.
+
+
+count_params
+
+``` python
+count_params()
+```
+
+Count the total number of scalars composing the weights.
+
+
+#### Returns:
+
+An integer count.
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer isn't yet built
+ (in which case its weights aren't yet defined).
+
+deserialize_inner_layer_from_config
+
+``` python
+deserialize_inner_layer_from_config(
+ cls,
+ config,
+ custom_objects
+)
+```
+
+Helper method that reconstructs the query and memory layers from the
+config.
+
+In the `get_config()` method, the query and memory layer configs are
+serialized into dicts for persistence; this method performs the reverse
+action to reconstruct the layers from the config.
+
+#### Args:
+
+
+* `config`: dict, the configs that will be used to reconstruct the
+ object.
+* `custom_objects`: dict mapping class names (or function names) of
+ custom (non-Keras) objects to class/functions.
+
+#### Returns:
+
+
+* `config`: dict, the config with layer instance created, which is ready
+ to be used as init parameters.
+
+from_config
+
+``` python
+@classmethod
+from_config(
+ cls,
+ config,
+ custom_objects=None
+)
+```
+
+
+
+
+get_config
+
+``` python
+get_config()
+```
+
+
+
+
+
+get_input_at
+
+``` python
+get_input_at(node_index)
+```
+
+Retrieves the input tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+
+get_input_mask_at
+
+``` python
+get_input_mask_at(node_index)
+```
+
+Retrieves the input mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple inputs).
+
+
+
+get_input_shape_at
+
+``` python
+get_input_shape_at(node_index)
+```
+
+Retrieves the input shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_losses_for
+
+``` python
+get_losses_for(inputs)
+```
+
+Retrieves losses relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of loss tensors of the layer that depend on `inputs`.
+
+
+get_output_at
+
+``` python
+get_output_at(node_index)
+```
+
+Retrieves the output tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_output_mask_at
+
+``` python
+get_output_mask_at(node_index)
+```
+
+Retrieves the output mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple outputs).
+
+
+get_output_shape_at
+
+``` python
+get_output_shape_at(node_index)
+```
+
+Retrieves the output shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_updates_for
+
+``` python
+get_updates_for(inputs)
+```
+
+Retrieves updates relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of update ops of the layer that depend on `inputs`.
+
+
+get_weights
+
+``` python
+get_weights()
+```
+
+Returns the current weights of the layer.
+
+
+#### Returns:
+
+Weights values as a list of numpy arrays.
+
+
+initial_alignments
+
+``` python
+initial_alignments(
+ batch_size,
+ dtype
+)
+```
+
+Creates the initial alignment values for the monotonic attentions.
+
+Initializes to Dirac distributions, i.e.
+[1, 0, 0, ...memory length..., 0] for all entries in the batch.
+
+#### Args:
+
+
+* `batch_size`: `int32` scalar, the batch_size.
+* `dtype`: The `dtype`.
+
+
+#### Returns:
+
+A `dtype` tensor shaped `[batch_size, alignments_size]`
+(`alignments_size` is the values' `max_time`).
+
+
+initial_state
+
+``` python
+initial_state(
+ batch_size,
+ dtype
+)
+```
+
+Creates the initial state values for the `AttentionWrapper` class.
+
+This is important for AttentionMechanisms that use the previous
+alignment to calculate the alignment at the next time step
+(e.g. monotonic attention).
+
+The default behavior is to return the same output as
+initial_alignments.
+
+#### Args:
+
+
+* `batch_size`: `int32` scalar, the batch_size.
+* `dtype`: The `dtype`.
+
+
+#### Returns:
+
+A structure of all-zero tensors with shapes as described by
+`state_size`.
+
+
+set_weights
+
+``` python
+set_weights(weights)
+```
+
+Sets the weights of the layer, from Numpy arrays.
+
+
+#### Arguments:
+
+
+* `weights`: a list of Numpy arrays. The number
+  of arrays and their shapes must match the
+  number and shapes of the weights of the
+  layer (i.e. it should match the output of
+  `get_weights`).
+
+
+#### Raises:
+
+
+* `ValueError`: If the provided weights list does not match the
+ layer's specifications.
+
+with_name_scope
+
+``` python
+with_name_scope(
+ cls,
+ method
+)
+```
+
+Decorator to automatically enter the module name scope.
+
+```
+class MyModule(tf.Module):
+ @tf.Module.with_name_scope
+ def __call__(self, x):
+ if not hasattr(self, 'w'):
+ self.w = tf.Variable(tf.random.normal([x.shape[1], 64]))
+ return tf.matmul(x, self.w)
+```
+
+Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose
+names included the module name:
+
+```
+mod = MyModule()
+mod(tf.ones([8, 32]))
+# ==> <tf.Tensor: ...>
+mod.w
+# ==> <tf.Variable ...'my_module/w:0'>
+```
+
+#### Args:
+
+
+* `method`: The method to wrap.
+
+
+#### Returns:
+
+The original method wrapped such that it enters the module's name scope.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/BaseDecoder.md b/docs/api_docs/python/tfa/seq2seq/BaseDecoder.md
new file mode 100644
index 0000000000..6a56b0c256
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/BaseDecoder.md
@@ -0,0 +1,977 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# tfa.seq2seq.BaseDecoder
+
+## Class `BaseDecoder`
+
+An RNN Decoder that is based on a Keras layer.
+
+
+
+### Aliases:
+
+* Class `tfa.seq2seq.BaseDecoder`
+* Class `tfa.seq2seq.decoder.BaseDecoder`
+
+
+
+Defined in [`seq2seq/decoder.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/decoder.py).
+
+
+
+Concepts used by this interface:
+- `inputs`: (structure of) tensors and TensorArrays that is passed as input
+ to the RNNCell composing the decoder, at each time step.
+- `state`: (structure of) tensors and TensorArrays that is passed to the
+ RNNCell instance as the state.
+- `memory`: (structure of) tensors that is usually the full output of the
+  encoder, which will be used by the attention wrapper for the RNNCell.
+- `finished`: boolean tensor telling whether each sequence in the batch is
+ finished.
+- `outputs`: Instance of BasicDecoderOutput. Result of the decoding, at
+ each time step.
+
+__init__
+
+``` python
+__init__(
+ output_time_major=False,
+ impute_finished=False,
+ maximum_iterations=None,
+ parallel_iterations=32,
+ swap_memory=False,
+ **kwargs
+)
+```
+
+
+
+
+
+
+## Properties
+
+activity_regularizer
+
+Optional regularizer function for the output of this layer.
+
+
+batch_size
+
+The batch size of input values.
+
+
+dtype
+
+
+
+
+dynamic
+
+
+
+
+
+
+Retrieves the input tensor(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input tensor or list of input tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+* `AttributeError`: If no inbound nodes are found.
+
+
+
+Retrieves the input mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input mask tensor (potentially None) or list of input
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+
+input_shape
+
+Retrieves the input shape(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer, or if all inputs
+have the same shape.
+
+#### Returns:
+
+Input shape, as an integer shape tuple
+(or list of shape tuples, one tuple per input tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined input_shape.
+* `RuntimeError`: if called in Eager mode.
+
+
+
+
+
+
+losses
+
+Losses which are associated with this `Layer`.
+
+Variable regularization tensors are created when this property is accessed,
+so it is eager safe: accessing `losses` under a `tf.GradientTape` will
+propagate gradients back to the corresponding variables.
+
+#### Returns:
+
+A list of tensors.
+
+
+metrics
+
+
+
+
+name
+
+
+
+
+name_scope
+
+Returns a `tf.name_scope` instance for this class.
+
+
+non_trainable_variables
+
+
+
+
+non_trainable_weights
+
+
+
+
+output
+
+Retrieves the output tensor(s) of a layer.
+
+Only applicable if the layer has exactly one output,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output tensor or list of output tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to more than one incoming
+  layer.
+* `RuntimeError`: if called in Eager mode.
+
+output_dtype
+
+A (possibly nested tuple of...) dtype[s].
+
+
+output_mask
+
+Retrieves the output mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output mask tensor (potentially None) or list of output
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+
+output_shape
+
+Retrieves the output shape(s) of a layer.
+
+Only applicable if the layer has one output,
+or if all outputs have the same shape.
+
+#### Returns:
+
+Output shape, as an integer shape tuple
+(or list of shape tuples, one tuple per output tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined output shape.
+* `RuntimeError`: if called in Eager mode.
+
+output_size
+
+A (possibly nested tuple of...) integer[s] or `TensorShape`
+object[s].
+
+submodules
+
+Sequence of all sub-modules.
+
+Submodules are modules which are properties of this module, or found as
+properties of modules which are properties of this module (and so on).
+
+```
+a = tf.Module()
+b = tf.Module()
+c = tf.Module()
+a.b = b
+b.c = c
+assert list(a.submodules) == [b, c]
+assert list(b.submodules) == [c]
+assert list(c.submodules) == []
+```
+
+#### Returns:
+
+A sequence of all submodules.
+
+
+tracks_own_finished
+
+Describes whether the Decoder keeps track of finished states.
+
+Most decoders will emit a true/false `finished` value independently
+at each time step. In this case, the `dynamic_decode` function keeps
+track of which batch entries are already finished, and performs a
+logical OR to insert new batches to the finished set.
+
+Some decoders, however, shuffle batches / beams between time steps and
+`dynamic_decode` will mix up the finished state across these entries
+because it does not track the reshuffle across time steps. In this
+case, it is up to the decoder to declare that it will keep track of its
+own finished state by setting this property to `True`.
+
+#### Returns:
+
+Python bool.
+
+
+trainable
+
+
+
+
+trainable_variables
+
+
+
+
+trainable_weights
+
+
+
+
+updates
+
+
+
+
+variables
+
+Returns the list of all layer variables/weights.
+
+Alias of `self.weights`.
+
+#### Returns:
+
+A list of variables.
+
+
+weights
+
+Returns the list of all layer variables/weights.
+
+
+#### Returns:
+
+A list of variables.
+
+
+
+
+## Methods
+
+__call__
+
+``` python
+__call__(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Wraps `call`, applying pre- and post-processing steps.
+
+
+#### Arguments:
+
+
+* `inputs`: input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+
+#### Note:
+
+- The following optional keyword arguments are reserved for specific uses:
+  * `training`: Boolean scalar tensor or Python boolean indicating
+    whether the `call` is meant for training or inference.
+  * `mask`: Boolean input mask.
+- If the layer's `call` method takes a `mask` argument (as some Keras
+  layers do), its default value will be set to the mask generated
+  for `inputs` by the previous layer (if `inputs` did come from
+  a layer that generated a corresponding mask, i.e. if it came from
+  a Keras layer with masking support).
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer's `call` method returns None (an invalid value).
+
+apply
+
+``` python
+apply(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Apply the layer on an input.
+
+This is an alias of `self.__call__`.
+
+#### Arguments:
+
+
+* `inputs`: Input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+build
+
+``` python
+build(input_shape)
+```
+
+Creates the variables of the layer (optional, for subclass implementers).
+
+This is a method that implementers of subclasses of `Layer` or `Model`
+can override if they need a state-creation step in-between
+layer instantiation and layer call.
+
+This is typically used to create the weights of `Layer` subclasses.
+
+#### Arguments:
+
+
+* `input_shape`: Instance of `TensorShape`, or list of instances of
+ `TensorShape` if the layer expects a list of inputs
+ (one instance per input).
+
+compute_mask
+
+``` python
+compute_mask(
+ inputs,
+ mask=None
+)
+```
+
+Computes an output mask tensor.
+
+
+#### Arguments:
+
+
+* `inputs`: Tensor or list of tensors.
+* `mask`: Tensor or list of tensors.
+
+
+#### Returns:
+
+None or a tensor (or list of tensors,
+ one per output tensor of the layer).
+
+
+compute_output_shape
+
+``` python
+compute_output_shape(input_shape)
+```
+
+Computes the output shape of the layer.
+
+Assumes that the layer will be built
+to match the input shape provided.
+
+#### Arguments:
+
+
+* `input_shape`: Shape tuple (tuple of integers)
+ or list of shape tuples (one per output tensor of the layer).
+ Shape tuples can include None for free dimensions,
+ instead of an integer.
+
+
+#### Returns:
+
+An output shape tuple.
+
+
+count_params
+
+``` python
+count_params()
+```
+
+Count the total number of scalars composing the weights.
+
+
+#### Returns:
+
+An integer count.
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer isn't yet built
+ (in which case its weights aren't yet defined).
+
+finalize
+
+``` python
+finalize(
+ outputs,
+ final_state,
+ sequence_lengths
+)
+```
+
+
+
+
+from_config
+
+``` python
+from_config(
+ cls,
+ config
+)
+```
+
+Creates a layer from its config.
+
+This method is the reverse of `get_config`,
+capable of instantiating the same layer from the config
+dictionary. It does not handle layer connectivity
+(handled by Network), nor weights (handled by `set_weights`).
+
+#### Arguments:
+
+
+* `config`: A Python dictionary, typically the
+ output of get_config.
+
+
+#### Returns:
+
+A layer instance.
+
+
+get_config
+
+``` python
+get_config()
+```
+
+Returns the config of the layer.
+
+A layer config is a Python dictionary (serializable)
+containing the configuration of a layer.
+The same layer can be reinstantiated later
+(without its trained weights) from this configuration.
+
+The config of a layer does not include connectivity
+information, nor the layer class name. These are handled
+by `Network` (one layer of abstraction above).
+
+#### Returns:
+
+Python dictionary.
+
+
+get_input_at
+
+``` python
+get_input_at(node_index)
+```
+
+Retrieves the input tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_input_mask_at
+
+``` python
+get_input_mask_at(node_index)
+```
+
+Retrieves the input mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple inputs).
+
+
+get_input_shape_at
+
+``` python
+get_input_shape_at(node_index)
+```
+
+Retrieves the input shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_losses_for
+
+``` python
+get_losses_for(inputs)
+```
+
+Retrieves losses relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of loss tensors of the layer that depend on `inputs`.
+
+
+get_output_at
+
+``` python
+get_output_at(node_index)
+```
+
+Retrieves the output tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_output_mask_at
+
+``` python
+get_output_mask_at(node_index)
+```
+
+Retrieves the output mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple outputs).
+
+
+get_output_shape_at
+
+``` python
+get_output_shape_at(node_index)
+```
+
+Retrieves the output shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_updates_for
+
+``` python
+get_updates_for(inputs)
+```
+
+Retrieves updates relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of update ops of the layer that depend on `inputs`.
+
+
+get_weights
+
+``` python
+get_weights()
+```
+
+Returns the current weights of the layer.
+
+
+#### Returns:
+
+Weights values as a list of numpy arrays.
+
+
+initialize
+
+``` python
+initialize(
+ inputs,
+ initial_state=None,
+ **kwargs
+)
+```
+
+Called before any decoding iterations.
+
+This method must compute initial input values and the initial state.
+
+#### Args:
+
+
+* `inputs`: (structure of) tensors that contains the input for the
+  decoder. In the normal case, this is a tensor of shape
+  `[batch, timestep, embedding]`.
+* `initial_state`: (structure of) tensors that contains the initial state
+  for the RNNCell.
+* `**kwargs`: Other arguments passed in from the `layer.call()` method,
+  such as the input `sequence_length` or an input mask.
+
+
+#### Returns:
+
+`(finished, initial_inputs, initial_state)`: initial values of
+'finished' flags, inputs and state.
+
+
+set_weights
+
+``` python
+set_weights(weights)
+```
+
+Sets the weights of the layer, from Numpy arrays.
+
+
+#### Arguments:
+
+
+* `weights`: a list of Numpy arrays. The number
+  of arrays and their shapes must match the
+  number and shapes of the layer's weights
+  (i.e. it should match the output of
+  `get_weights`).
+
+
+#### Raises:
+
+
+* `ValueError`: If the provided weights list does not match the
+ layer's specifications.
+
+step
+
+``` python
+step(
+ time,
+ inputs,
+ state
+)
+```
+
+Called per step of decoding (but only once for dynamic decoding).
+
+
+#### Args:
+
+
+* `time`: Scalar `int32` tensor. Current step number.
+* `inputs`: RNNCell input (possibly nested tuple of) tensor[s] for this
+ time step.
+* `state`: RNNCell state (possibly nested tuple of) tensor[s] from
+ previous time step.
+
+
+#### Returns:
+
+`(outputs, next_state, next_inputs, finished)`: `outputs` is an
+object containing the decoder output, `next_state` is a
+(structure of) state tensors and TensorArrays, `next_inputs` is the
+tensor that should be used as input for the next step, `finished` is
+a boolean tensor telling whether the sequence is complete, for each
+sequence in the batch.
+
+
+with_name_scope
+
+``` python
+with_name_scope(
+ cls,
+ method
+)
+```
+
+Decorator to automatically enter the module name scope.
+
+```
+class MyModule(tf.Module):
+ @tf.Module.with_name_scope
+ def __call__(self, x):
+ if not hasattr(self, 'w'):
+ self.w = tf.Variable(tf.random.normal([x.shape[1], 64]))
+ return tf.matmul(x, self.w)
+```
+
+Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose
+names include the module name:
+
+```
+mod = MyModule()
+mod(tf.ones([8, 32]))
+# ==> <tf.Tensor: shape=(8, 64), ...>
+mod.w
+# ==> <tf.Variable 'my_module/w:0' shape=(32, 64), ...>
+```
+
+#### Args:
+
+
+* `method`: The method to wrap.
+
+
+#### Returns:
+
+The original method wrapped such that it enters the module's name scope.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/BasicDecoder.md b/docs/api_docs/python/tfa/seq2seq/BasicDecoder.md
new file mode 100644
index 0000000000..053bb24011
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/BasicDecoder.md
@@ -0,0 +1,954 @@
+
+# tfa.seq2seq.BasicDecoder
+
+## Class `BasicDecoder`
+
+Basic sampling decoder.
+
+Inherits From: [`BaseDecoder`](../../tfa/seq2seq/BaseDecoder.md)
+
+### Aliases:
+
+* Class `tfa.seq2seq.BasicDecoder`
+* Class `tfa.seq2seq.basic_decoder.BasicDecoder`
+
+
+
+Defined in [`seq2seq/basic_decoder.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/basic_decoder.py).
+
+
+
+
+__init__
+
+``` python
+__init__(
+ cell,
+ sampler,
+ output_layer=None,
+ **kwargs
+)
+```
+
+Initialize BasicDecoder.
+
+
+#### Args:
+
+
+* `cell`: An `RNNCell` instance.
+* `sampler`: A `Sampler` instance.
+* `output_layer`: (Optional) An instance of `tf.keras.layers.Layer`,
+  e.g., `tf.keras.layers.Dense`. Optional layer to apply to the RNN
+  output prior to storing the result or sampling.
+* `**kwargs`: Other keyword arguments for layer creation.
+
+
+#### Raises:
+
+
+* `TypeError`: if `cell`, `sampler` or `output_layer` have an incorrect
+type.
+
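+
+A usage sketch (assuming a toy 10-token vocabulary; `TrainingSampler`
+reads the ground-truth `inputs` back at each step):
+
+```
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+batch_size, max_time, embed_dim, units = 4, 7, 16, 32
+inputs = tf.random.normal([batch_size, max_time, embed_dim])
+
+decoder = tfa.seq2seq.BasicDecoder(
+    cell=tf.keras.layers.LSTMCell(units),
+    sampler=tfa.seq2seq.TrainingSampler(),
+    output_layer=tf.keras.layers.Dense(10))
+
+# Calling the decoder runs dynamic decoding end to end.
+outputs, state, lengths = decoder(
+    inputs,
+    initial_state=decoder.cell.get_initial_state(
+        batch_size=batch_size, dtype=tf.float32),
+    sequence_length=tf.fill([batch_size], max_time))
+outputs.rnn_output  # ==> [batch_size, max_time, 10] logits
+```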
+
+
+## Properties
+
+activity_regularizer
+
+Optional regularizer function for the output of this layer.
+
+
+batch_size
+
+
+
+
+dtype
+
+
+
+
+dynamic
+
+
+
+
+input
+
+Retrieves the input tensor(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input tensor or list of input tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+  more than one incoming layer.
+* `RuntimeError`: If called in Eager mode.
+* `AttributeError`: If no inbound nodes are found.
+
+input_mask
+
+Retrieves the input mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input mask tensor (potentially None) or list of input
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+
+input_shape
+
+Retrieves the input shape(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer, or if all inputs
+have the same shape.
+
+#### Returns:
+
+Input shape, as an integer shape tuple
+(or list of shape tuples, one tuple per input tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined input_shape.
+* `RuntimeError`: if called in Eager mode.
+
+
+
+
+
+
+losses
+
+Losses which are associated with this `Layer`.
+
+Variable regularization tensors are created when this property is accessed,
+so it is eager safe: accessing `losses` under a `tf.GradientTape` will
+propagate gradients back to the corresponding variables.
+
+#### Returns:
+
+A list of tensors.
+
+
+metrics
+
+
+
+
+name
+
+
+
+
+name_scope
+
+Returns a `tf.name_scope` instance for this class.
+
+
+non_trainable_variables
+
+
+
+
+non_trainable_weights
+
+
+
+
+output
+
+Retrieves the output tensor(s) of a layer.
+
+Only applicable if the layer has exactly one output,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output tensor or list of output tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to more than one incoming
+  layer.
+* `RuntimeError`: if called in Eager mode.
+
+output_dtype
+
+
+
+
+output_mask
+
+Retrieves the output mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output mask tensor (potentially None) or list of output
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+
+output_shape
+
+Retrieves the output shape(s) of a layer.
+
+Only applicable if the layer has one output,
+or if all outputs have the same shape.
+
+#### Returns:
+
+Output shape, as an integer shape tuple
+(or list of shape tuples, one tuple per output tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined output shape.
+* `RuntimeError`: if called in Eager mode.
+
+output_size
+
+
+
+
+submodules
+
+Sequence of all sub-modules.
+
+Submodules are modules which are properties of this module, or found as
+properties of modules which are properties of this module (and so on).
+
+```
+a = tf.Module()
+b = tf.Module()
+c = tf.Module()
+a.b = b
+b.c = c
+assert list(a.submodules) == [b, c]
+assert list(b.submodules) == [c]
+assert list(c.submodules) == []
+```
+
+#### Returns:
+
+A sequence of all submodules.
+
+
+tracks_own_finished
+
+Describes whether the Decoder keeps track of finished states.
+
+Most decoders will emit a true/false `finished` value independently
+at each time step. In this case, the `dynamic_decode` function keeps
+track of which batch entries are already finished, and performs a
+logical OR to insert new batches to the finished set.
+
+Some decoders, however, shuffle batches / beams between time steps and
+`dynamic_decode` will mix up the finished state across these entries
+because it does not track the reshuffle across time steps. In this
+case, it is up to the decoder to declare that it will keep track of its
+own finished state by setting this property to `True`.
+
+#### Returns:
+
+Python bool.
+
+
+trainable
+
+
+
+
+trainable_variables
+
+
+
+
+trainable_weights
+
+
+
+
+updates
+
+
+
+
+variables
+
+Returns the list of all layer variables/weights.
+
+Alias of `self.weights`.
+
+#### Returns:
+
+A list of variables.
+
+
+weights
+
+Returns the list of all layer variables/weights.
+
+
+#### Returns:
+
+A list of variables.
+
+
+
+
+## Methods
+
+__call__
+
+``` python
+__call__(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Wraps `call`, applying pre- and post-processing steps.
+
+
+#### Arguments:
+
+
+* `inputs`: input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+
+#### Note:
+
+- The following optional keyword arguments are reserved for specific uses:
+  * `training`: Boolean scalar tensor or Python boolean indicating
+    whether the `call` is meant for training or inference.
+  * `mask`: Boolean input mask.
+- If the layer's `call` method takes a `mask` argument (as some Keras
+  layers do), its default value will be set to the mask generated
+  for `inputs` by the previous layer (if `inputs` did come from
+  a layer that generated a corresponding mask, i.e. if it came from
+  a Keras layer with masking support).
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer's `call` method returns None (an invalid value).
+
+apply
+
+``` python
+apply(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Apply the layer on an input.
+
+This is an alias of `self.__call__`.
+
+#### Arguments:
+
+
+* `inputs`: Input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+build
+
+``` python
+build(input_shape)
+```
+
+Creates the variables of the layer (optional, for subclass implementers).
+
+This is a method that implementers of subclasses of `Layer` or `Model`
+can override if they need a state-creation step in-between
+layer instantiation and layer call.
+
+This is typically used to create the weights of `Layer` subclasses.
+
+#### Arguments:
+
+
+* `input_shape`: Instance of `TensorShape`, or list of instances of
+ `TensorShape` if the layer expects a list of inputs
+ (one instance per input).
+
+compute_mask
+
+``` python
+compute_mask(
+ inputs,
+ mask=None
+)
+```
+
+Computes an output mask tensor.
+
+
+#### Arguments:
+
+
+* `inputs`: Tensor or list of tensors.
+* `mask`: Tensor or list of tensors.
+
+
+#### Returns:
+
+None or a tensor (or list of tensors,
+ one per output tensor of the layer).
+
+
+compute_output_shape
+
+``` python
+compute_output_shape(input_shape)
+```
+
+Computes the output shape of the layer.
+
+Assumes that the layer will be built
+to match the input shape provided.
+
+#### Arguments:
+
+
+* `input_shape`: Shape tuple (tuple of integers)
+ or list of shape tuples (one per output tensor of the layer).
+ Shape tuples can include None for free dimensions,
+ instead of an integer.
+
+
+#### Returns:
+
+An output shape tuple.
+
+
+count_params
+
+``` python
+count_params()
+```
+
+Count the total number of scalars composing the weights.
+
+
+#### Returns:
+
+An integer count.
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer isn't yet built
+ (in which case its weights aren't yet defined).
+
+finalize
+
+``` python
+finalize(
+ outputs,
+ final_state,
+ sequence_lengths
+)
+```
+
+
+
+
+from_config
+
+``` python
+from_config(
+ cls,
+ config
+)
+```
+
+Creates a layer from its config.
+
+This method is the reverse of `get_config`,
+capable of instantiating the same layer from the config
+dictionary. It does not handle layer connectivity
+(handled by Network), nor weights (handled by `set_weights`).
+
+#### Arguments:
+
+
+* `config`: A Python dictionary, typically the
+ output of get_config.
+
+
+#### Returns:
+
+A layer instance.
+
+
+get_config
+
+``` python
+get_config()
+```
+
+Returns the config of the layer.
+
+A layer config is a Python dictionary (serializable)
+containing the configuration of a layer.
+The same layer can be reinstantiated later
+(without its trained weights) from this configuration.
+
+The config of a layer does not include connectivity
+information, nor the layer class name. These are handled
+by `Network` (one layer of abstraction above).
+
+#### Returns:
+
+Python dictionary.
+
+
+get_input_at
+
+``` python
+get_input_at(node_index)
+```
+
+Retrieves the input tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_input_mask_at
+
+``` python
+get_input_mask_at(node_index)
+```
+
+Retrieves the input mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple inputs).
+
+
+get_input_shape_at
+
+``` python
+get_input_shape_at(node_index)
+```
+
+Retrieves the input shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_losses_for
+
+``` python
+get_losses_for(inputs)
+```
+
+Retrieves losses relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of loss tensors of the layer that depend on `inputs`.
+
+
+get_output_at
+
+``` python
+get_output_at(node_index)
+```
+
+Retrieves the output tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_output_mask_at
+
+``` python
+get_output_mask_at(node_index)
+```
+
+Retrieves the output mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple outputs).
+
+
+get_output_shape_at
+
+``` python
+get_output_shape_at(node_index)
+```
+
+Retrieves the output shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_updates_for
+
+``` python
+get_updates_for(inputs)
+```
+
+Retrieves updates relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of update ops of the layer that depend on `inputs`.
+
+
+get_weights
+
+``` python
+get_weights()
+```
+
+Returns the current weights of the layer.
+
+
+#### Returns:
+
+Weights values as a list of numpy arrays.
+
+
+initialize
+
+``` python
+initialize(
+ inputs,
+ initial_state=None,
+ **kwargs
+)
+```
+
+Initialize the decoder.
+
+
+set_weights
+
+``` python
+set_weights(weights)
+```
+
+Sets the weights of the layer, from Numpy arrays.
+
+
+#### Arguments:
+
+
+* `weights`: a list of Numpy arrays. The number
+  of arrays and their shapes must match the
+  number and shapes of the layer's weights
+  (i.e. it should match the output of
+  `get_weights`).
+
+
+#### Raises:
+
+
+* `ValueError`: If the provided weights list does not match the
+ layer's specifications.
+
+step
+
+``` python
+step(
+ time,
+ inputs,
+ state
+)
+```
+
+Perform a decoding step.
+
+
+#### Args:
+
+
+* `time`: scalar `int32` tensor.
+* `inputs`: A (structure of) input tensors.
+* `state`: A (structure of) state tensors and TensorArrays.
+
+
+#### Returns:
+
+`(outputs, next_state, next_inputs, finished)`.
+
+
+with_name_scope
+
+``` python
+with_name_scope(
+ cls,
+ method
+)
+```
+
+Decorator to automatically enter the module name scope.
+
+```
+class MyModule(tf.Module):
+ @tf.Module.with_name_scope
+ def __call__(self, x):
+ if not hasattr(self, 'w'):
+ self.w = tf.Variable(tf.random.normal([x.shape[1], 64]))
+ return tf.matmul(x, self.w)
+```
+
+Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose
+names include the module name:
+
+```
+mod = MyModule()
+mod(tf.ones([8, 32]))
+# ==> <tf.Tensor: shape=(8, 64), ...>
+mod.w
+# ==> <tf.Variable 'my_module/w:0' shape=(32, 64), ...>
+```
+
+#### Args:
+
+
+* `method`: The method to wrap.
+
+
+#### Returns:
+
+The original method wrapped such that it enters the module's name scope.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/BasicDecoderOutput.md b/docs/api_docs/python/tfa/seq2seq/BasicDecoderOutput.md
new file mode 100644
index 0000000000..7008d26ef0
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/BasicDecoderOutput.md
@@ -0,0 +1,41 @@
+
+# tfa.seq2seq.BasicDecoderOutput
+
+## Class `BasicDecoderOutput`
+
+
+
+
+
+### Aliases:
+
+* Class `tfa.seq2seq.BasicDecoderOutput`
+* Class `tfa.seq2seq.basic_decoder.BasicDecoderOutput`
+
+
+
+Defined in [`seq2seq/basic_decoder.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/basic_decoder.py).
+
+
+
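+
+`BasicDecoderOutput` is a namedtuple-style structure emitted by
+`BasicDecoder` at each step (and stacked across time by dynamic decoding).
+A small sketch of constructing and reading one:
+
+```
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+out = tfa.seq2seq.BasicDecoderOutput(
+    rnn_output=tf.zeros([4, 10]),          # per-step logits
+    sample_id=tf.zeros([4], tf.int32))     # per-step sampled token ids
+out.rnn_output.shape, out.sample_id.shape  # ==> (4, 10), (4,)
+```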
+
+## Properties
+
+rnn_output
+
+
+
+
+sample_id
+
+
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/BeamSearchDecoder.md b/docs/api_docs/python/tfa/seq2seq/BeamSearchDecoder.md
new file mode 100644
index 0000000000..406be683f0
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/BeamSearchDecoder.md
@@ -0,0 +1,1043 @@
+
+# tfa.seq2seq.BeamSearchDecoder
+
+## Class `BeamSearchDecoder`
+
+BeamSearch sampling decoder.
+
+Inherits From: [`BeamSearchDecoderMixin`](../../tfa/seq2seq/beam_search_decoder/BeamSearchDecoderMixin.md), [`BaseDecoder`](../../tfa/seq2seq/BaseDecoder.md)
+
+### Aliases:
+
+* Class `tfa.seq2seq.BeamSearchDecoder`
+* Class `tfa.seq2seq.beam_search_decoder.BeamSearchDecoder`
+
+
+
+Defined in [`seq2seq/beam_search_decoder.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/beam_search_decoder.py).
+
+
+
+**NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in
+`AttentionWrapper`, then you must ensure that:
+
+- The encoder output has been tiled to `beam_width` via
+  `tfa.seq2seq.tile_batch` (NOT `tf.tile`).
+- The `batch_size` argument passed to the `get_initial_state` method of
+ this wrapper is equal to `true_batch_size * beam_width`.
+- The initial state created with `get_initial_state` above contains a
+ `cell_state` value containing properly tiled final state from the
+ encoder.
+
+#### An example:
+
+
+
+```
+tiled_encoder_outputs = tfa.seq2seq.tile_batch(
+    encoder_outputs, multiplier=beam_width)
+tiled_encoder_final_state = tfa.seq2seq.tile_batch(
+    encoder_final_state, multiplier=beam_width)
+tiled_sequence_length = tfa.seq2seq.tile_batch(
+    sequence_length, multiplier=beam_width)
+attention_mechanism = MyFavoriteAttentionMechanism(
+    num_units=attention_depth,
+    memory=tiled_encoder_outputs,
+    memory_sequence_length=tiled_sequence_length)
+attention_cell = AttentionWrapper(cell, attention_mechanism, ...)
+decoder_initial_state = attention_cell.get_initial_state(
+ batch_size=true_batch_size * beam_width, dtype=dtype)
+decoder_initial_state = decoder_initial_state.clone(
+ cell_state=tiled_encoder_final_state)
+```
+
+When using `AttentionWrapper`, a coverage penalty is also suggested when
+computing scores (https://arxiv.org/pdf/1609.08144.pdf); it encourages
+the decoding to cover all inputs.
+
+__init__
+
+``` python
+__init__(
+ cell,
+ beam_width,
+ embedding_fn=None,
+ output_layer=None,
+ length_penalty_weight=0.0,
+ coverage_penalty_weight=0.0,
+ reorder_tensor_arrays=True,
+ **kwargs
+)
+```
+
+Initialize the BeamSearchDecoder.
+
+
+#### Args:
+
+
+* `cell`: An `RNNCell` instance.
+* `beam_width`: Python integer, the number of beams.
+* `embedding_fn`: A callable that takes a vector tensor of `ids`
+ (argmax ids).
+* `output_layer`: (Optional) An instance of `tf.keras.layers.Layer`,
+ i.e., `tf.keras.layers.Dense`. Optional layer to apply to the RNN
+ output prior to storing the result or sampling.
+* `length_penalty_weight`: Float weight to penalize length. Disabled with
+ 0.0.
+* `coverage_penalty_weight`: Float weight to penalize the coverage of
+ source sentence. Disabled with 0.0.
+* `reorder_tensor_arrays`: If `True`, `TensorArray`s' elements within the
+ cell state will be reordered according to the beam search path. If
+ the `TensorArray` can be reordered, the stacked form will be
+ returned. Otherwise, the `TensorArray` will be returned as is. Set
+ this flag to `False` if the cell state contains `TensorArray`s that
+ are not amenable to reordering.
+* `**kwargs`: Dict, other keyword arguments for initialization.
+
+
+#### Raises:
+
+
+* `TypeError`: if `cell` is not an instance of `RNNCell`,
+ or `output_layer` is not an instance of `tf.keras.layers.Layer`.
+
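+
+A construction sketch (the sizes and the start/end token ids 1 and 2 are
+made up for illustration):
+
+```
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+vocab_size, units, batch_size, beam_width = 20, 32, 2, 3
+cell = tf.keras.layers.LSTMCell(units)
+embedding = tf.random.normal([vocab_size, units])  # stand-in embeddings
+
+decoder = tfa.seq2seq.BeamSearchDecoder(
+    cell,
+    beam_width=beam_width,
+    output_layer=tf.keras.layers.Dense(vocab_size),
+    maximum_iterations=10)
+
+# Note: the state is created for batch_size * beam_width entries.
+initial_state = cell.get_initial_state(
+    batch_size=batch_size * beam_width, dtype=tf.float32)
+
+outputs, state, lengths = decoder(
+    embedding, tf.fill([batch_size], 1), 2, initial_state)
+outputs.predicted_ids  # ==> [batch_size, T, beam_width], best beam first
+```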
+
+
+## Properties
+
+activity_regularizer
+
+Optional regularizer function for the output of this layer.
+
+
+batch_size
+
+
+
+
+dtype
+
+
+
+
+dynamic
+
+
+
+
+input
+
+Retrieves the input tensor(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input tensor or list of input tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+  more than one incoming layer.
+* `RuntimeError`: If called in Eager mode.
+* `AttributeError`: If no inbound nodes are found.
+
+input_mask
+
+Retrieves the input mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input mask tensor (potentially None) or list of input
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+
+input_shape
+
+Retrieves the input shape(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer, or if all inputs
+have the same shape.
+
+#### Returns:
+
+Input shape, as an integer shape tuple
+(or list of shape tuples, one tuple per input tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined input_shape.
+* `RuntimeError`: if called in Eager mode.
+
+
+
+
+
+
+losses
+
+Losses which are associated with this `Layer`.
+
+Variable regularization tensors are created when this property is accessed,
+so it is eager safe: accessing `losses` under a `tf.GradientTape` will
+propagate gradients back to the corresponding variables.
+
+#### Returns:
+
+A list of tensors.
+
+
+metrics
+
+
+
+
+name
+
+
+
+
+name_scope
+
+Returns a `tf.name_scope` instance for this class.
+
+
+non_trainable_variables
+
+
+
+
+non_trainable_weights
+
+
+
+
+output
+
+Retrieves the output tensor(s) of a layer.
+
+Only applicable if the layer has exactly one output,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output tensor or list of output tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to more than one incoming
+  layer.
+* `RuntimeError`: if called in Eager mode.
+
+output_dtype
+
+
+
+
+output_mask
+
+Retrieves the output mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output mask tensor (potentially None) or list of output
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layer.
+
+output_shape
+
+Retrieves the output shape(s) of a layer.
+
+Only applicable if the layer has one output,
+or if all outputs have the same shape.
+
+#### Returns:
+
+Output shape, as an integer shape tuple
+(or list of shape tuples, one tuple per output tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined output shape.
+* `RuntimeError`: if called in Eager mode.
+
+output_size
+
+
+
+
+submodules
+
+Sequence of all sub-modules.
+
+Submodules are modules which are properties of this module, or found as
+properties of modules which are properties of this module (and so on).
+
+```
+a = tf.Module()
+b = tf.Module()
+c = tf.Module()
+a.b = b
+b.c = c
+assert list(a.submodules) == [b, c]
+assert list(b.submodules) == [c]
+assert list(c.submodules) == []
+```
+
+#### Returns:
+
+A sequence of all submodules.
+
+
+tracks_own_finished
+
+The BeamSearchDecoder shuffles its beams and their finished state.
+
+For this reason, it conflicts with the `dynamic_decode` function's
+tracking of finished states. Setting this property to true avoids
+early stopping of decoding due to mismanagement of the finished state
+in `dynamic_decode`.
+
+#### Returns:
+
+`True`.
+
+
+trainable
+
+
+
+
+trainable_variables
+
+
+
+
+trainable_weights
+
+
+
+
+updates
+
+
+
+
+variables
+
+Returns the list of all layer variables/weights.
+
+Alias of `self.weights`.
+
+#### Returns:
+
+A list of variables.
+
+
+weights
+
+Returns the list of all layer variables/weights.
+
+
+#### Returns:
+
+A list of variables.
+
+
+
+
+## Methods
+
+__call__
+
+``` python
+__call__(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Wraps `call`, applying pre- and post-processing steps.
+
+
+#### Arguments:
+
+
+* `inputs`: input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+
+#### Note:
+
+- The following optional keyword arguments are reserved for specific uses:
+  * `training`: Boolean scalar tensor or Python boolean indicating
+    whether the `call` is meant for training or inference.
+  * `mask`: Boolean input mask.
+- If the layer's `call` method takes a `mask` argument (as some Keras
+  layers do), its default value will be set to the mask generated
+  for `inputs` by the previous layer (if `inputs` did come from
+  a layer that generated a corresponding mask, i.e. if it came from
+  a Keras layer with masking support).
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer's `call` method returns None (an invalid value).
+
+apply
+
+``` python
+apply(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Apply the layer on an input.
+
+This is an alias of `self.__call__`.
+
+#### Arguments:
+
+
+* `inputs`: Input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+build
+
+``` python
+build(input_shape)
+```
+
+Creates the variables of the layer (optional, for subclass implementers).
+
+This is a method that implementers of subclasses of `Layer` or `Model`
+can override if they need a state-creation step in-between
+layer instantiation and layer call.
+
+This is typically used to create the weights of `Layer` subclasses.
+
+#### Arguments:
+
+
+* `input_shape`: Instance of `TensorShape`, or list of instances of
+ `TensorShape` if the layer expects a list of inputs
+ (one instance per input).
+
+compute_mask
+
+``` python
+compute_mask(
+ inputs,
+ mask=None
+)
+```
+
+Computes an output mask tensor.
+
+
+#### Arguments:
+
+
+* `inputs`: Tensor or list of tensors.
+* `mask`: Tensor or list of tensors.
+
+
+#### Returns:
+
+None or a tensor (or list of tensors,
+ one per output tensor of the layer).
+
+
+compute_output_shape
+
+``` python
+compute_output_shape(input_shape)
+```
+
+Computes the output shape of the layer.
+
+Assumes that the layer will be built
+to match the input shape provided.
+
+#### Arguments:
+
+
+* `input_shape`: Shape tuple (tuple of integers)
+ or list of shape tuples (one per output tensor of the layer).
+ Shape tuples can include None for free dimensions,
+ instead of an integer.
+
+
+#### Returns:
+
+An output shape tuple.
+
+
+count_params
+
+``` python
+count_params()
+```
+
+Count the total number of scalars composing the weights.
+
+
+#### Returns:
+
+An integer count.
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer isn't yet built
+ (in which case its weights aren't yet defined).
+
+finalize
+
+``` python
+finalize(
+ outputs,
+ final_state,
+ sequence_lengths
+)
+```
+
+Finalize and return the predicted_ids.
+
+
+#### Args:
+
+
+* `outputs`: An instance of BeamSearchDecoderOutput.
+* `final_state`: An instance of BeamSearchDecoderState. Passed through to
+ the output.
+* `sequence_lengths`: An `int64` tensor shaped
+ `[batch_size, beam_width]`. The sequence lengths determined for
+ each beam during decode. **NOTE** These are ignored; the updated
+ sequence lengths are stored in `final_state.lengths`.
+
+
+#### Returns:
+
+
+* `outputs`: An instance of `FinalBeamSearchDecoderOutput` where the
+ predicted_ids are the result of calling _gather_tree.
+* `final_state`: The same input instance of `BeamSearchDecoderState`.
+
+from_config
+
+``` python
+from_config(
+ cls,
+ config
+)
+```
+
+Creates a layer from its config.
+
+This method is the reverse of `get_config`,
+capable of instantiating the same layer from the config
+dictionary. It does not handle layer connectivity
+(handled by Network), nor weights (handled by `set_weights`).
+
+#### Arguments:
+
+
+* `config`: A Python dictionary, typically the
+ output of get_config.
+
+
+#### Returns:
+
+A layer instance.
+
+
+get_config
+
+``` python
+get_config()
+```
+
+Returns the config of the layer.
+
+A layer config is a Python dictionary (serializable)
+containing the configuration of a layer.
+The same layer can be reinstantiated later
+(without its trained weights) from this configuration.
+
+The config of a layer does not include connectivity
+information, nor the layer class name. These are handled
+by `Network` (one layer of abstraction above).
+
+#### Returns:
+
+Python dictionary.
+
+
+get_input_at
+
+``` python
+get_input_at(node_index)
+```
+
+Retrieves the input tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_input_mask_at
+
+``` python
+get_input_mask_at(node_index)
+```
+
+Retrieves the input mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple inputs).
+
+
+get_input_shape_at
+
+``` python
+get_input_shape_at(node_index)
+```
+
+Retrieves the input shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_losses_for
+
+``` python
+get_losses_for(inputs)
+```
+
+Retrieves losses relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of loss tensors of the layer that depend on `inputs`.
+
+
+get_output_at
+
+``` python
+get_output_at(node_index)
+```
+
+Retrieves the output tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_output_mask_at
+
+``` python
+get_output_mask_at(node_index)
+```
+
+Retrieves the output mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple outputs).
+
+
+get_output_shape_at
+
+``` python
+get_output_shape_at(node_index)
+```
+
+Retrieves the output shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_updates_for
+
+``` python
+get_updates_for(inputs)
+```
+
+Retrieves updates relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of update ops of the layer that depend on `inputs`.
+
+
+get_weights
+
+``` python
+get_weights()
+```
+
+Returns the current weights of the layer.
+
+
+#### Returns:
+
+Weights values as a list of numpy arrays.
+
+
+initialize
+
+``` python
+initialize(
+ embedding,
+ start_tokens,
+ end_token,
+ initial_state
+)
+```
+
+Initialize the decoder.
+
+
+#### Args:
+
+
+* `embedding`: A tensor from the embedding layer output, which is the
+ `params` argument for `embedding_lookup`.
+* `start_tokens`: `int32` vector shaped `[batch_size]`, the start tokens.
+* `end_token`: `int32` scalar, the token that marks end of decoding.
+* `initial_state`: A (possibly nested tuple of...) tensors and
+  TensorArrays.
+
+#### Returns:
+
+`(finished, start_inputs, initial_state)`.
+
+
+#### Raises:
+
+
+* `ValueError`: If `start_tokens` is not a vector or `end_token` is not a
+ scalar.
+
+set_weights
+
+``` python
+set_weights(weights)
+```
+
+Sets the weights of the layer, from Numpy arrays.
+
+
+#### Arguments:
+
+
+* `weights`: a list of Numpy arrays. The number
+  of arrays and their shapes must match the
+  number and shapes of the layer's weights
+  (i.e. it should match the output of
+  `get_weights`).
+
+
+#### Raises:
+
+
+* `ValueError`: If the provided weights list does not match the
+ layer's specifications.
+
+step
+
+``` python
+step(
+ time,
+ inputs,
+ state,
+ name=None
+)
+```
+
+Perform a decoding step.
+
+
+#### Args:
+
+
+* `time`: scalar `int32` tensor.
+* `inputs`: A (structure of) input tensors.
+* `state`: A (structure of) state tensors and TensorArrays.
+* `name`: Name scope for any created operations.
+
+
+#### Returns:
+
+`(outputs, next_state, next_inputs, finished)`.
+
+
+with_name_scope
+
+``` python
+with_name_scope(
+ cls,
+ method
+)
+```
+
+Decorator to automatically enter the module name scope.
+
+```
+class MyModule(tf.Module):
+ @tf.Module.with_name_scope
+ def __call__(self, x):
+ if not hasattr(self, 'w'):
+ self.w = tf.Variable(tf.random.normal([x.shape[1], 64]))
+ return tf.matmul(x, self.w)
+```
+
+Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose
+names include the module name:
+
+```
+mod = MyModule()
+mod(tf.ones([8, 32]))
+# ==> <tf.Tensor: shape=(8, 64), ...>
+mod.w
+# ==> <tf.Variable 'my_module/w:0' shape=(32, 64), ...>
+```
+
+#### Args:
+
+
+* `method`: The method to wrap.
+
+
+#### Returns:
+
+The original method wrapped such that it enters the module's name scope.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/BeamSearchDecoderOutput.md b/docs/api_docs/python/tfa/seq2seq/BeamSearchDecoderOutput.md
new file mode 100644
index 0000000000..c7a3ed1ba3
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/BeamSearchDecoderOutput.md
@@ -0,0 +1,47 @@
+
+# tfa.seq2seq.BeamSearchDecoderOutput
+
+## Class `BeamSearchDecoderOutput`
+
+
+
+
+
+### Aliases:
+
+* Class `tfa.seq2seq.BeamSearchDecoderOutput`
+* Class `tfa.seq2seq.beam_search_decoder.BeamSearchDecoderOutput`
+
+
+
+Defined in [`seq2seq/beam_search_decoder.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/beam_search_decoder.py).
+
+
+
+
+## Properties
+
+scores
+
+
+
+
+predicted_ids
+
+
+
+
+parent_ids
+
+
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/BeamSearchDecoderState.md b/docs/api_docs/python/tfa/seq2seq/BeamSearchDecoderState.md
new file mode 100644
index 0000000000..ee2ff86db4
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/BeamSearchDecoderState.md
@@ -0,0 +1,59 @@
+
+# tfa.seq2seq.BeamSearchDecoderState
+
+## Class `BeamSearchDecoderState`
+
+
+
+
+
+### Aliases:
+
+* Class `tfa.seq2seq.BeamSearchDecoderState`
+* Class `tfa.seq2seq.beam_search_decoder.BeamSearchDecoderState`
+
+
+
+Defined in [`seq2seq/beam_search_decoder.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/beam_search_decoder.py).
+
+
+
+
+## Properties
+
+cell_state
+
+
+
+
+log_probs
+
+
+
+
+finished
+
+
+
+
+lengths
+
+
+
+
+accumulated_attention_probs
+
+
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/CustomSampler.md b/docs/api_docs/python/tfa/seq2seq/CustomSampler.md
new file mode 100644
index 0000000000..be8b08328b
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/CustomSampler.md
@@ -0,0 +1,127 @@
+
+# tfa.seq2seq.CustomSampler
+
+## Class `CustomSampler`
+
+Base abstract class that allows the user to customize sampling.
+
+Inherits From: [`Sampler`](../../tfa/seq2seq/Sampler.md)
+
+### Aliases:
+
+* Class `tfa.seq2seq.CustomSampler`
+* Class `tfa.seq2seq.sampler.CustomSampler`
+
+
+
+Defined in [`seq2seq/sampler.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/sampler.py).
+
+
+
+
+__init__
+
+``` python
+__init__(
+ initialize_fn,
+ sample_fn,
+ next_inputs_fn,
+ sample_ids_shape=None,
+ sample_ids_dtype=None
+)
+```
+
+Initializer.
+
+
+#### Args:
+
+
+* `initialize_fn`: callable that returns `(finished, next_inputs)` for
+ the first iteration.
+* `sample_fn`: callable that takes `(time, outputs, state)` and emits
+ tensor `sample_ids`.
+* `next_inputs_fn`: callable that takes
+ `(time, outputs, state, sample_ids)` and emits
+ `(finished, next_inputs, next_state)`.
+* `sample_ids_shape`: Either a list of integers, or a 1-D Tensor of type
+ `int32`, the shape of each value in the `sample_ids` batch.
+ Defaults to a scalar.
+* `sample_ids_dtype`: The dtype of the `sample_ids` tensor. Defaults to
+ int32.
+
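+
+A construction sketch; the callables follow the signatures documented
+above (the fixed decoding length of 5 steps and the one-hot next inputs
+are arbitrary choices for illustration):
+
+```
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+batch_size, vocab_size = 4, 10
+
+def initialize_fn(inputs):
+    # Assumption: initialize() forwards its `inputs` to this callable.
+    # Returns (finished, next_inputs) for the first iteration.
+    return tf.fill([batch_size], False), inputs
+
+def sample_fn(time, outputs, state):
+    # Greedy: pick the argmax of the decoder output as the sample id.
+    return tf.argmax(outputs, axis=-1, output_type=tf.int32)
+
+def next_inputs_fn(time, outputs, state, sample_ids):
+    # Stop after 5 steps; feed the sampled ids back as one-hot vectors.
+    finished = tf.fill([batch_size], time >= 4)
+    return finished, tf.one_hot(sample_ids, vocab_size), state
+
+sampler = tfa.seq2seq.CustomSampler(initialize_fn, sample_fn, next_inputs_fn)
+```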
+
+
+## Properties
+
+batch_size
+
+
+
+
+sample_ids_dtype
+
+
+
+
+sample_ids_shape
+
+
+
+
+
+
+## Methods
+
+initialize
+
+``` python
+initialize(
+ inputs,
+ **kwargs
+)
+```
+
+
+
+
+next_inputs
+
+``` python
+next_inputs(
+ time,
+ outputs,
+ state,
+ sample_ids
+)
+```
+
+
+
+
+sample
+
+``` python
+sample(
+ time,
+ outputs,
+ state
+)
+```
+
+
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/Decoder.md b/docs/api_docs/python/tfa/seq2seq/Decoder.md
new file mode 100644
index 0000000000..da123f73dc
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/Decoder.md
@@ -0,0 +1,154 @@
+
+# tfa.seq2seq.Decoder
+
+## Class `Decoder`
+
+An RNN Decoder abstract interface object.
+
+
+
+### Aliases:
+
+* Class `tfa.seq2seq.Decoder`
+* Class `tfa.seq2seq.decoder.Decoder`
+
+
+
+Defined in [`seq2seq/decoder.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/decoder.py).
+
+
+
+Concepts used by this interface:
+- `inputs`: (structure of) tensors and TensorArrays that is passed as input
+ to the RNNCell composing the decoder, at each time step.
+- `state`: (structure of) tensors and TensorArrays that is passed to the
+ RNNCell instance as the state.
+- `finished`: boolean tensor telling whether each sequence in the batch is
+ finished.
+- `outputs`: Instance of BasicDecoderOutput. Result of the decoding, at
+ each time step.
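+
+A shape-only outline of the interface (`MyDecoder` is hypothetical, and
+the abstract properties are omitted for brevity):
+
+```
+import tensorflow_addons as tfa
+
+class MyDecoder(tfa.seq2seq.Decoder):
+
+    def initialize(self, name=None):
+        # Returns (finished, initial_inputs, initial_state).
+        raise NotImplementedError
+
+    def step(self, time, inputs, state, name=None):
+        # Returns (outputs, next_state, next_inputs, finished).
+        raise NotImplementedError
+```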
+
+## Properties
+
+batch_size
+
+The batch size of input values.
+
+
+output_dtype
+
+A (possibly nested tuple of...) dtype[s].
+
+
+output_size
+
+A (possibly nested tuple of...) integer[s] or `TensorShape`
+object[s].
+
+tracks_own_finished
+
+Describes whether the Decoder keeps track of finished states.
+
+Most decoders will emit a true/false `finished` value independently
+at each time step. In this case, the `dynamic_decode` function keeps
+track of which batch entries are already finished, and performs a
+logical OR to insert new batches to the finished set.
+
+Some decoders, however, shuffle batches / beams between time steps and
+`dynamic_decode` will mix up the finished state across these entries
+because it does not track the reshuffle across time steps. In this
+case, it is up to the decoder to declare that it will keep track of its
+own finished state by setting this property to `True`.
+
+#### Returns:
+
+Python bool.
+
+
+
+
+## Methods
+
+finalize
+
+``` python
+finalize(
+ outputs,
+ final_state,
+ sequence_lengths
+)
+```
+
+
+
+
+initialize
+
+``` python
+initialize(name=None)
+```
+
+Called before any decoding iterations.
+
+This method must compute initial input values and the initial state.
+
+#### Args:
+
+
+* `name`: Name scope for any created operations.
+
+
+#### Returns:
+
+`(finished, initial_inputs, initial_state)`: initial values of
+'finished' flags, inputs and state.
+
+
+step
+
+``` python
+step(
+ time,
+ inputs,
+ state,
+ name=None
+)
+```
+
+Called per step of decoding (but only once for dynamic decoding).
+
+
+#### Args:
+
+
+* `time`: Scalar `int32` tensor. Current step number.
+* `inputs`: RNNCell input (possibly nested tuple of) tensor[s] for this
+ time step.
+* `state`: RNNCell state (possibly nested tuple of) tensor[s] from
+ previous time step.
+* `name`: Name scope for any created operations.
+
+
+#### Returns:
+
+`(outputs, next_state, next_inputs, finished)`: `outputs` is an
+object containing the decoder output, `next_state` is a (structure
+of) state tensors and TensorArrays, `next_inputs` is the tensor that
+should be used as input for the next step, `finished` is a boolean
+tensor telling whether the sequence is complete, for each sequence in
+the batch.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/FinalBeamSearchDecoderOutput.md b/docs/api_docs/python/tfa/seq2seq/FinalBeamSearchDecoderOutput.md
new file mode 100644
index 0000000000..90d2b3712f
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/FinalBeamSearchDecoderOutput.md
@@ -0,0 +1,50 @@
+
+# tfa.seq2seq.FinalBeamSearchDecoderOutput
+
+## Class `FinalBeamSearchDecoderOutput`
+
+Final outputs returned by the beam search after all decoding is finished.
+
+
+
+### Aliases:
+
+* Class `tfa.seq2seq.FinalBeamSearchDecoderOutput`
+* Class `tfa.seq2seq.beam_search_decoder.FinalBeamSearchDecoderOutput`
+
+
+
+Defined in [`seq2seq/beam_search_decoder.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/beam_search_decoder.py).
+
+
+#### Args:
+
+
+* `predicted_ids`: The final prediction. A tensor of shape
+ `[batch_size, T, beam_width]` (or `[T, batch_size, beam_width]` if
+ `output_time_major` is True). Beams are ordered from best to worst.
+* `beam_search_decoder_output`: An instance of `BeamSearchDecoderOutput` that
+ describes the state of the beam search.
+
+## Properties
+
+predicted_ids
+
+
+
+
+beam_search_decoder_output
+
+
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/GreedyEmbeddingSampler.md b/docs/api_docs/python/tfa/seq2seq/GreedyEmbeddingSampler.md
new file mode 100644
index 0000000000..7f7d726744
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/GreedyEmbeddingSampler.md
@@ -0,0 +1,138 @@
+
+
+
+
+
+
+
+
+
+
+
+
+# tfa.seq2seq.GreedyEmbeddingSampler
+
+## Class `GreedyEmbeddingSampler`
+
+A sampler for use during inference.
+
+Inherits From: [`Sampler`](../../tfa/seq2seq/Sampler.md)
+
+### Aliases:
+
+* Class `tfa.seq2seq.GreedyEmbeddingSampler`
+* Class `tfa.seq2seq.sampler.GreedyEmbeddingSampler`
+
+
+
+Defined in [`seq2seq/sampler.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/sampler.py).
+
+
+
+Uses the argmax of the output (treated as logits) and passes the
+result through an embedding layer to get the next input.
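+
+A usage sketch with `BasicDecoder` (the embedding matrix and the
+start/end token ids 1 and 2 are made up for illustration):
+
+```
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+vocab_size, units, batch_size = 20, 32, 4
+embeddings = tf.random.normal([vocab_size, units])  # stand-in embeddings
+
+decoder = tfa.seq2seq.BasicDecoder(
+    cell=tf.keras.layers.LSTMCell(units),
+    sampler=tfa.seq2seq.GreedyEmbeddingSampler(),
+    output_layer=tf.keras.layers.Dense(vocab_size),
+    maximum_iterations=10)
+
+outputs, state, lengths = decoder(
+    embeddings,  # forwarded to the sampler's initialize()
+    initial_state=decoder.cell.get_initial_state(
+        batch_size=batch_size, dtype=tf.float32),
+    start_tokens=tf.fill([batch_size], 1),
+    end_token=2)
+```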
+
+__init__
+
+``` python
+__init__(embedding_fn=None)
+```
+
+Initializer.
+
+
+#### Args:
+
+
+* `embedding_fn`: An optional callable that takes a vector tensor of `ids`
+  (argmax ids), or the `params` argument for `embedding_lookup`. The
+  returned tensor will be passed to the decoder input. Defaults to using
+  `tf.nn.embedding_lookup`.
+
+
+
+## Properties
+
+batch_size
+
+
+
+
+sample_ids_dtype
+
+
+
+
+sample_ids_shape
+
+
+
+
+
+
+## Methods
+
+initialize
+
+``` python
+initialize(
+ embedding,
+ start_tokens=None,
+ end_token=None
+)
+```
+
+Initialize the GreedyEmbeddingSampler.
+
+
+#### Args:
+
+
+* `embedding`: A tensor that contains the embedding states matrix. It
+  will be used to generate outputs with `start_tokens` and `end_token`.
+  The embedding will be ignored if `embedding_fn` was provided
+  at `__init__()`.
+* `start_tokens`: `int32` vector shaped `[batch_size]`, the start tokens.
+* `end_token`: `int32` scalar, the token that marks end of decoding.
+
+
+#### Returns:
+
+Tuple of two items: `(finished, self.start_inputs)`.
+
+
+#### Raises:
+
+
+* `ValueError`: if `start_tokens` is not a 1D tensor or `end_token` is
+ not a scalar.
+
+
+next_inputs
+
+``` python
+next_inputs(
+ time,
+ outputs,
+ state,
+ sample_ids
+)
+```
+
+next_inputs_fn for the GreedyEmbeddingSampler.
+
+
+sample
+
+``` python
+sample(
+ time,
+ outputs,
+ state
+)
+```
+
+sample for the GreedyEmbeddingSampler.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/InferenceSampler.md b/docs/api_docs/python/tfa/seq2seq/InferenceSampler.md
new file mode 100644
index 0000000000..5c2d4bd043
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/InferenceSampler.md
@@ -0,0 +1,124 @@
+
+
+
+
+
+
+
+
+
+
+
+
+# tfa.seq2seq.InferenceSampler
+
+## Class `InferenceSampler`
+
+A helper to use during inference with a custom sampling function.
+
+Inherits From: [`Sampler`](../../tfa/seq2seq/Sampler.md)
+
+### Aliases:
+
+* Class `tfa.seq2seq.InferenceSampler`
+* Class `tfa.seq2seq.sampler.InferenceSampler`
+
+
+
+Defined in [`seq2seq/sampler.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/sampler.py).
+
+
+
+
+__init__
+
+``` python
+__init__(
+ sample_fn,
+ sample_shape,
+ sample_dtype,
+ end_fn,
+ next_inputs_fn=None
+)
+```
+
+Initializer.
+
+
+#### Args:
+
+
+* `sample_fn`: A callable that takes `outputs` and emits tensor
+ `sample_ids`.
+* `sample_shape`: Either a list of integers, or a 1-D Tensor of type
+  `int32`, the shape of each sample in the batch returned by
+ `sample_fn`.
+* `sample_dtype`: the dtype of the sample returned by `sample_fn`.
+* `end_fn`: A callable that takes `sample_ids` and emits a `bool` vector
+ shaped `[batch_size]` indicating whether each sample is an end
+ token.
+* `next_inputs_fn`: (Optional) A callable that takes `sample_ids` and
+ returns the next batch of inputs. If not provided, `sample_ids` is
+ used as the next batch of inputs.
+
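+A short, hedged sketch of a custom sampling function follows; the
+rounding-based `sample_fn` and the stop condition are illustrative
+assumptions, not behavior prescribed by the API.
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+sampler = tfa.seq2seq.InferenceSampler(
+    sample_fn=lambda outputs: tf.round(outputs[:, :1]),  # assumed rule
+    sample_shape=[1],             # one sampled value per batch entry
+    sample_dtype=tf.float32,
+    end_fn=lambda sample_ids: tf.reduce_all(
+        sample_ids > 0.5, axis=-1))  # assumed per-sample end condition
+```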
+
+
+## Properties
+
+batch_size
+
+
+
+
+sample_ids_dtype
+
+
+
+
+sample_ids_shape
+
+
+
+
+
+
+## Methods
+
+initialize
+
+``` python
+initialize(start_inputs)
+```
+
+
+
+
+
+next_inputs
+
+``` python
+next_inputs(
+ time,
+ outputs,
+ state,
+ sample_ids
+)
+```
+
+
+
+
+sample
+
+``` python
+sample(
+ time,
+ outputs,
+ state
+)
+```
+
+
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/LuongAttention.md b/docs/api_docs/python/tfa/seq2seq/LuongAttention.md
new file mode 100644
index 0000000000..b727880f34
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/LuongAttention.md
@@ -0,0 +1,918 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# tfa.seq2seq.LuongAttention
+
+## Class `LuongAttention`
+
+Implements Luong-style (multiplicative) attention scoring.
+
+
+
+### Aliases:
+
+* Class `tfa.seq2seq.LuongAttention`
+* Class `tfa.seq2seq.attention_wrapper.LuongAttention`
+
+
+
+Defined in [`seq2seq/attention_wrapper.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/attention_wrapper.py).
+
+
+
+This attention has two forms. The first is standard Luong attention,
+as described in:
+
+Minh-Thang Luong, Hieu Pham, Christopher D. Manning.
+[Effective Approaches to Attention-based Neural Machine Translation.
+EMNLP 2015.](https://arxiv.org/abs/1508.04025)
+
+The second is the scaled form inspired partly by the normalized form of
+Bahdanau attention.
+
+To enable the second form, construct the object with parameter
+`scale=True`.
+
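+As a hedged sketch, the scaled form can be enabled as follows; the
+encoder output tensor and its dimensions are assumed for illustration.
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+batch_size, max_time, depth = 4, 7, 32
+encoder_outputs = tf.random.normal([batch_size, max_time, depth])
+
+attention = tfa.seq2seq.LuongAttention(
+    units=depth,
+    memory=encoder_outputs,
+    memory_sequence_length=tf.fill([batch_size], max_time),
+    scale=True)  # enables the scaled (second) form
+```
+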
+__init__
+
+``` python
+__init__(
+ units,
+ memory,
+ memory_sequence_length=None,
+ scale=False,
+ probability_fn='softmax',
+ dtype=None,
+ name='LuongAttention',
+ **kwargs
+)
+```
+
+Construct the attention mechanism.
+
+
+#### Args:
+
+
+* `units`: The depth of the attention mechanism.
+* `memory`: The memory to query; usually the output of an RNN encoder.
+ This tensor should be shaped `[batch_size, max_time, ...]`.
+* `memory_sequence_length`: (optional) Sequence lengths for the batch
+ entries in memory. If provided, the memory tensor rows are masked
+ with zeros for values past the respective sequence lengths.
+* `scale`: Python boolean. Whether to scale the energy term.
+* `probability_fn`: (optional) string, the name of the function used to
+  convert the attention score to probabilities. The default is
+  `softmax`, which maps to `tf.nn.softmax`. The other option is
+  `hardmax`, which uses the hardmax() defined within this module. Any
+  other value will result in a validation error.
+* `dtype`: The data type for the memory layer of the attention mechanism.
+* `name`: Name to use when creating ops.
+* `**kwargs`: Dictionary that contains other common arguments for layer
+ creation.
+
+
+
+## Properties
+
+activity_regularizer
+
+Optional regularizer function for the output of this layer.
+
+
+alignments_size
+
+
+
+
+dtype
+
+
+
+
+dynamic
+
+
+
+
+
+input
+
+Retrieves the input tensor(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input tensor or list of input tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+* `AttributeError`: If no inbound nodes are found.
+
+
+input_mask
+
+Retrieves the input mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input mask tensor (potentially None) or list of input
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+
+input_shape
+
+Retrieves the input shape(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer, or if all inputs
+have the same shape.
+
+#### Returns:
+
+Input shape, as an integer shape tuple
+(or list of shape tuples, one tuple per input tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined input_shape.
+* `RuntimeError`: if called in Eager mode.
+
+
+
+
+
+
+losses
+
+Losses which are associated with this `Layer`.
+
+Variable regularization tensors are created when this property is accessed,
+so it is eager safe: accessing `losses` under a `tf.GradientTape` will
+propagate gradients back to the corresponding variables.
+
+#### Returns:
+
+A list of tensors.
+
+
+metrics
+
+
+
+
+name
+
+
+
+
+name_scope
+
+Returns a `tf.name_scope` instance for this class.
+
+
+non_trainable_variables
+
+
+
+
+non_trainable_weights
+
+
+
+
+output
+
+Retrieves the output tensor(s) of a layer.
+
+Only applicable if the layer has exactly one output,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output tensor or list of output tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to more than one incoming
+ layers.
+* `RuntimeError`: if called in Eager mode.
+
+output_mask
+
+Retrieves the output mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output mask tensor (potentially None) or list of output
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+output_shape
+
+Retrieves the output shape(s) of a layer.
+
+Only applicable if the layer has one output,
+or if all outputs have the same shape.
+
+#### Returns:
+
+Output shape, as an integer shape tuple
+(or list of shape tuples, one tuple per output tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined output shape.
+* `RuntimeError`: if called in Eager mode.
+
+state_size
+
+
+
+
+submodules
+
+Sequence of all sub-modules.
+
+Submodules are modules which are properties of this module, or found as
+properties of modules which are properties of this module (and so on).
+
+```
+a = tf.Module()
+b = tf.Module()
+c = tf.Module()
+a.b = b
+b.c = c
+assert list(a.submodules) == [b, c]
+assert list(b.submodules) == [c]
+assert list(c.submodules) == []
+```
+
+#### Returns:
+
+A sequence of all submodules.
+
+
+trainable
+
+
+
+
+trainable_variables
+
+
+
+
+trainable_weights
+
+
+
+
+updates
+
+
+
+
+variables
+
+Returns the list of all layer variables/weights.
+
+Alias of `self.weights`.
+
+#### Returns:
+
+A list of variables.
+
+
+weights
+
+Returns the list of all layer variables/weights.
+
+
+#### Returns:
+
+A list of variables.
+
+
+
+
+## Methods
+
+__call__
+
+``` python
+__call__(
+ inputs,
+ **kwargs
+)
+```
+
+Preprocess the inputs before calling `base_layer.__call__()`.
+
+Note that there are two situations here: one for setting up the memory,
+and one with the actual query and state.
+1. When the memory has not been configured, we just pass all the
+   parameters to base_layer.__call__(), which will then invoke
+   self.call() with the proper inputs, allowing this class to set up
+   the memory.
+2. When the memory has already been set up, the input should contain
+   the query and state, and optionally the processed memory. If the
+   processed memory is not included in the input, we have to append it
+   to the inputs and give it to base_layer.__call__(). The processed
+   memory is the output of the first invocation of self.__call__(). If
+   we don't add it here, then from the Keras perspective the graph is
+   disconnected, since the output from the previous call is never used.
+
+#### Args:
+
+
+* `inputs`: the input tensors.
+* `**kwargs`: dict, other keyword arguments for `__call__()`.
+
+apply
+
+``` python
+apply(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Apply the layer on an input.
+
+This is an alias of `self.__call__`.
+
+#### Arguments:
+
+
+* `inputs`: Input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+build
+
+``` python
+build(input_shape)
+```
+
+
+
+
+compute_mask
+
+``` python
+compute_mask(
+ inputs,
+ mask=None
+)
+```
+
+
+
+
+compute_output_shape
+
+``` python
+compute_output_shape(input_shape)
+```
+
+Computes the output shape of the layer.
+
+Assumes that the layer will be built to match the input shape provided.
+
+#### Arguments:
+
+
+* `input_shape`: Shape tuple (tuple of integers)
+ or list of shape tuples (one per output tensor of the layer).
+ Shape tuples can include None for free dimensions,
+ instead of an integer.
+
+
+#### Returns:
+
+An output shape tuple.
+
+
+count_params
+
+``` python
+count_params()
+```
+
+Count the total number of scalars composing the weights.
+
+
+#### Returns:
+
+An integer count.
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer isn't yet built
+ (in which case its weights aren't yet defined).
+
+deserialize_inner_layer_from_config
+
+``` python
+deserialize_inner_layer_from_config(
+ cls,
+ config,
+ custom_objects
+)
+```
+
+Helper method that reconstructs the query and memory from the config.
+
+In the get_config() method, the query and memory layer configs are
+serialized into dicts for persistence; this method performs the reverse
+action to reconstruct the layer from the config.
+
+#### Args:
+
+
+* `config`: dict, the configs that will be used to reconstruct the
+ object.
+* `custom_objects`: dict mapping class names (or function names) of
+ custom (non-Keras) objects to class/functions.
+
+#### Returns:
+
+
+* `config`: dict, the config with layer instance created, which is ready
+ to be used as init parameters.
+
+from_config
+
+``` python
+@classmethod
+from_config(
+ cls,
+ config,
+ custom_objects=None
+)
+```
+
+
+
+
+get_config
+
+``` python
+get_config()
+```
+
+
+
+
+
+get_input_at
+
+``` python
+get_input_at(node_index)
+```
+
+Retrieves the input tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+
+get_input_mask_at
+
+``` python
+get_input_mask_at(node_index)
+```
+
+Retrieves the input mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple inputs).
+
+
+
+get_input_shape_at
+
+``` python
+get_input_shape_at(node_index)
+```
+
+Retrieves the input shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_losses_for
+
+``` python
+get_losses_for(inputs)
+```
+
+Retrieves losses relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of loss tensors of the layer that depend on `inputs`.
+
+
+get_output_at
+
+``` python
+get_output_at(node_index)
+```
+
+Retrieves the output tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_output_mask_at
+
+``` python
+get_output_mask_at(node_index)
+```
+
+Retrieves the output mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple outputs).
+
+
+get_output_shape_at
+
+``` python
+get_output_shape_at(node_index)
+```
+
+Retrieves the output shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_updates_for
+
+``` python
+get_updates_for(inputs)
+```
+
+Retrieves updates relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of update ops of the layer that depend on `inputs`.
+
+
+get_weights
+
+``` python
+get_weights()
+```
+
+Returns the current weights of the layer.
+
+
+#### Returns:
+
+Weights values as a list of numpy arrays.
+
+
+initial_alignments
+
+``` python
+initial_alignments(
+ batch_size,
+ dtype
+)
+```
+
+Creates the initial alignment values for the `AttentionWrapper`
+class.
+
+This is important for AttentionMechanisms that use the previous
+alignment to calculate the alignment at the next time step
+(e.g. monotonic attention).
+
+The default behavior is to return a tensor of all zeros.
+
+#### Args:
+
+
+* `batch_size`: `int32` scalar, the batch_size.
+* `dtype`: The `dtype`.
+
+
+#### Returns:
+
+A `dtype` tensor shaped `[batch_size, alignments_size]`
+(`alignments_size` is the values' `max_time`).
+
+
+initial_state
+
+``` python
+initial_state(
+ batch_size,
+ dtype
+)
+```
+
+Creates the initial state values for the `AttentionWrapper` class.
+
+This is important for AttentionMechanisms that use the previous
+alignment to calculate the alignment at the next time step
+(e.g. monotonic attention).
+
+The default behavior is to return the same output as
+initial_alignments.
+
+#### Args:
+
+
+* `batch_size`: `int32` scalar, the batch_size.
+* `dtype`: The `dtype`.
+
+
+#### Returns:
+
+A structure of all-zero tensors with shapes as described by
+`state_size`.
+
+
+set_weights
+
+``` python
+set_weights(weights)
+```
+
+Sets the weights of the layer, from Numpy arrays.
+
+
+#### Arguments:
+
+
+* `weights`: a list of Numpy arrays. The number
+ of arrays and their shape must match
+ number of the dimensions of the weights
+ of the layer (i.e. it should match the
+ output of `get_weights`).
+
+
+#### Raises:
+
+
+* `ValueError`: If the provided weights list does not match the
+ layer's specifications.
+
+with_name_scope
+
+``` python
+with_name_scope(
+ cls,
+ method
+)
+```
+
+Decorator to automatically enter the module name scope.
+
+```
+class MyModule(tf.Module):
+ @tf.Module.with_name_scope
+ def __call__(self, x):
+ if not hasattr(self, 'w'):
+ self.w = tf.Variable(tf.random.normal([x.shape[1], 64]))
+ return tf.matmul(x, self.w)
+```
+
+Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose
+names include the module name:
+
+```
+mod = MyModule()
+mod(tf.ones([8, 32]))
+# ==>
+mod.w
+# ==>
+```
+
+#### Args:
+
+
+* `method`: The method to wrap.
+
+
+#### Returns:
+
+The original method wrapped such that it enters the module's name scope.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/LuongMonotonicAttention.md b/docs/api_docs/python/tfa/seq2seq/LuongMonotonicAttention.md
new file mode 100644
index 0000000000..c1d6721f9c
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/LuongMonotonicAttention.md
@@ -0,0 +1,920 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# tfa.seq2seq.LuongMonotonicAttention
+
+## Class `LuongMonotonicAttention`
+
+Monotonic attention mechanism with Luong-style energy function.
+
+
+
+### Aliases:
+
+* Class `tfa.seq2seq.LuongMonotonicAttention`
+* Class `tfa.seq2seq.attention_wrapper.LuongMonotonicAttention`
+
+
+
+Defined in [`seq2seq/attention_wrapper.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/attention_wrapper.py).
+
+
+
+This type of attention enforces a monotonic constraint on the attention
+distributions; that is, once the model attends to a given point in the
+memory, it can't attend to any prior points at subsequent output timesteps.
+It achieves this by using the _monotonic_probability_fn instead of softmax
+to construct its attention distributions. Otherwise, it is equivalent to
+LuongAttention. This approach is proposed in
+
+[Colin Raffel, Minh-Thang Luong, Peter J. Liu, Ron J. Weiss, Douglas Eck,
+"Online and Linear-Time Attention by Enforcing Monotonic Alignments."
+ICML 2017.](https://arxiv.org/abs/1704.00784)
+
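+A minimal construction sketch follows; the encoder output tensor is an
+assumed placeholder and the hyperparameter values are illustrative only.
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+batch_size, max_time, depth = 4, 7, 32
+encoder_outputs = tf.random.normal([batch_size, max_time, depth])
+
+attention = tfa.seq2seq.LuongMonotonicAttention(
+    units=depth,
+    memory=encoder_outputs,
+    sigmoid_noise=1.0,       # pre-sigmoid noise, often used while training
+    score_bias_init=-4.0,    # negative init suggested for long memories
+    mode='parallel')
+```
+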
+__init__
+
+``` python
+__init__(
+ units,
+ memory,
+ memory_sequence_length=None,
+ scale=False,
+ sigmoid_noise=0.0,
+ sigmoid_noise_seed=None,
+ score_bias_init=0.0,
+ mode='parallel',
+ dtype=None,
+ name='LuongMonotonicAttention',
+ **kwargs
+)
+```
+
+Construct the Attention mechanism.
+
+
+#### Args:
+
+
+* `units`: The depth of the query mechanism.
+* `memory`: The memory to query; usually the output of an RNN encoder.
+ This tensor should be shaped `[batch_size, max_time, ...]`.
+* `memory_sequence_length`: (optional) Sequence lengths for the batch
+ entries in memory. If provided, the memory tensor rows are masked
+ with zeros for values past the respective sequence lengths.
+* `scale`: Python boolean. Whether to scale the energy term.
+* `sigmoid_noise`: Standard deviation of pre-sigmoid noise. See the
+ docstring for `_monotonic_probability_fn` for more information.
+* `sigmoid_noise_seed`: (optional) Random seed for pre-sigmoid noise.
+* `score_bias_init`: Initial value for score bias scalar. It's
+ recommended to initialize this to a negative value when the length
+ of the memory is large.
+* `mode`: How to compute the attention distribution. Must be one of
+ 'recursive', 'parallel', or 'hard'. See the docstring for
+  `tfa.seq2seq.monotonic_attention` for more information.
+* `dtype`: The data type for the query and memory layers of the attention
+ mechanism.
+* `name`: Name to use when creating ops.
+* `**kwargs`: Dictionary that contains other common arguments for layer
+ creation.
+
+
+
+## Properties
+
+activity_regularizer
+
+Optional regularizer function for the output of this layer.
+
+
+alignments_size
+
+
+
+
+dtype
+
+
+
+
+dynamic
+
+
+
+
+
+input
+
+Retrieves the input tensor(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input tensor or list of input tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+* `AttributeError`: If no inbound nodes are found.
+
+
+input_mask
+
+Retrieves the input mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Input mask tensor (potentially None) or list of input
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+
+input_shape
+
+Retrieves the input shape(s) of a layer.
+
+Only applicable if the layer has exactly one input,
+i.e. if it is connected to one incoming layer, or if all inputs
+have the same shape.
+
+#### Returns:
+
+Input shape, as an integer shape tuple
+(or list of shape tuples, one tuple per input tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined input_shape.
+* `RuntimeError`: if called in Eager mode.
+
+
+
+
+
+
+losses
+
+Losses which are associated with this `Layer`.
+
+Variable regularization tensors are created when this property is accessed,
+so it is eager safe: accessing `losses` under a `tf.GradientTape` will
+propagate gradients back to the corresponding variables.
+
+#### Returns:
+
+A list of tensors.
+
+
+metrics
+
+
+
+
+name
+
+
+
+
+name_scope
+
+Returns a `tf.name_scope` instance for this class.
+
+
+non_trainable_variables
+
+
+
+
+non_trainable_weights
+
+
+
+
+output
+
+Retrieves the output tensor(s) of a layer.
+
+Only applicable if the layer has exactly one output,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output tensor or list of output tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to more than one incoming
+ layers.
+* `RuntimeError`: if called in Eager mode.
+
+output_mask
+
+Retrieves the output mask tensor(s) of a layer.
+
+Only applicable if the layer has exactly one inbound node,
+i.e. if it is connected to one incoming layer.
+
+#### Returns:
+
+Output mask tensor (potentially None) or list of output
+mask tensors.
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer is connected to
+more than one incoming layers.
+
+output_shape
+
+Retrieves the output shape(s) of a layer.
+
+Only applicable if the layer has one output,
+or if all outputs have the same shape.
+
+#### Returns:
+
+Output shape, as an integer shape tuple
+(or list of shape tuples, one tuple per output tensor).
+
+
+
+#### Raises:
+
+
+* `AttributeError`: if the layer has no defined output shape.
+* `RuntimeError`: if called in Eager mode.
+
+state_size
+
+
+
+
+submodules
+
+Sequence of all sub-modules.
+
+Submodules are modules which are properties of this module, or found as
+properties of modules which are properties of this module (and so on).
+
+```
+a = tf.Module()
+b = tf.Module()
+c = tf.Module()
+a.b = b
+b.c = c
+assert list(a.submodules) == [b, c]
+assert list(b.submodules) == [c]
+assert list(c.submodules) == []
+```
+
+#### Returns:
+
+A sequence of all submodules.
+
+
+trainable
+
+
+
+
+trainable_variables
+
+
+
+
+trainable_weights
+
+
+
+
+updates
+
+
+
+
+variables
+
+Returns the list of all layer variables/weights.
+
+Alias of `self.weights`.
+
+#### Returns:
+
+A list of variables.
+
+
+weights
+
+Returns the list of all layer variables/weights.
+
+
+#### Returns:
+
+A list of variables.
+
+
+
+
+## Methods
+
+__call__
+
+``` python
+__call__(
+ inputs,
+ **kwargs
+)
+```
+
+Preprocess the inputs before calling `base_layer.__call__()`.
+
+Note that there are two situations here: one for setting up the memory,
+and one with the actual query and state.
+1. When the memory has not been configured, we just pass all the
+   parameters to base_layer.__call__(), which will then invoke
+   self.call() with the proper inputs, allowing this class to set up
+   the memory.
+2. When the memory has already been set up, the input should contain
+   the query and state, and optionally the processed memory. If the
+   processed memory is not included in the input, we have to append it
+   to the inputs and give it to base_layer.__call__(). The processed
+   memory is the output of the first invocation of self.__call__(). If
+   we don't add it here, then from the Keras perspective the graph is
+   disconnected, since the output from the previous call is never used.
+
+#### Args:
+
+
+* `inputs`: the input tensors.
+* `**kwargs`: dict, other keyword arguments for `__call__()`.
+
+apply
+
+``` python
+apply(
+ inputs,
+ *args,
+ **kwargs
+)
+```
+
+Apply the layer on an input.
+
+This is an alias of `self.__call__`.
+
+#### Arguments:
+
+
+* `inputs`: Input tensor(s).
+* `*args`: additional positional arguments to be passed to `self.call`.
+* `**kwargs`: additional keyword arguments to be passed to `self.call`.
+
+
+#### Returns:
+
+Output tensor(s).
+
+
+build
+
+``` python
+build(input_shape)
+```
+
+
+
+
+compute_mask
+
+``` python
+compute_mask(
+ inputs,
+ mask=None
+)
+```
+
+
+
+
+compute_output_shape
+
+``` python
+compute_output_shape(input_shape)
+```
+
+Computes the output shape of the layer.
+
+Assumes that the layer will be built to match the input shape provided.
+
+#### Arguments:
+
+
+* `input_shape`: Shape tuple (tuple of integers)
+ or list of shape tuples (one per output tensor of the layer).
+ Shape tuples can include None for free dimensions,
+ instead of an integer.
+
+
+#### Returns:
+
+An output shape tuple.
+
+
+count_params
+
+``` python
+count_params()
+```
+
+Count the total number of scalars composing the weights.
+
+
+#### Returns:
+
+An integer count.
+
+
+
+#### Raises:
+
+
+* `ValueError`: if the layer isn't yet built
+ (in which case its weights aren't yet defined).
+
+deserialize_inner_layer_from_config
+
+``` python
+deserialize_inner_layer_from_config(
+ cls,
+ config,
+ custom_objects
+)
+```
+
+Helper method that reconstructs the query and memory from the config.
+
+In the get_config() method, the query and memory layer configs are
+serialized into dicts for persistence; this method performs the reverse
+action to reconstruct the layer from the config.
+
+#### Args:
+
+
+* `config`: dict, the configs that will be used to reconstruct the
+ object.
+* `custom_objects`: dict mapping class names (or function names) of
+ custom (non-Keras) objects to class/functions.
+
+#### Returns:
+
+
+* `config`: dict, the config with layer instance created, which is ready
+ to be used as init parameters.
+
+from_config
+
+``` python
+@classmethod
+from_config(
+ cls,
+ config,
+ custom_objects=None
+)
+```
+
+
+
+
+get_config
+
+``` python
+get_config()
+```
+
+
+
+
+
+get_input_at
+
+``` python
+get_input_at(node_index)
+```
+
+Retrieves the input tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+
+get_input_mask_at
+
+``` python
+get_input_mask_at(node_index)
+```
+
+Retrieves the input mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple inputs).
+
+
+
+get_input_shape_at
+
+``` python
+get_input_shape_at(node_index)
+```
+
+Retrieves the input shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple inputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_losses_for
+
+``` python
+get_losses_for(inputs)
+```
+
+Retrieves losses relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of loss tensors of the layer that depend on `inputs`.
+
+
+get_output_at
+
+``` python
+get_output_at(node_index)
+```
+
+Retrieves the output tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A tensor (or list of tensors if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_output_mask_at
+
+``` python
+get_output_mask_at(node_index)
+```
+
+Retrieves the output mask tensor(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A mask tensor
+(or list of tensors if the layer has multiple outputs).
+
+
+get_output_shape_at
+
+``` python
+get_output_shape_at(node_index)
+```
+
+Retrieves the output shape(s) of a layer at a given node.
+
+
+#### Arguments:
+
+
+* `node_index`: Integer, index of the node
+ from which to retrieve the attribute.
+ E.g. `node_index=0` will correspond to the
+ first time the layer was called.
+
+
+#### Returns:
+
+A shape tuple
+(or list of shape tuples if the layer has multiple outputs).
+
+
+
+#### Raises:
+
+
+* `RuntimeError`: If called in Eager mode.
+
+get_updates_for
+
+``` python
+get_updates_for(inputs)
+```
+
+Retrieves updates relevant to a specific set of inputs.
+
+
+#### Arguments:
+
+
+* `inputs`: Input tensor or list/tuple of input tensors.
+
+
+#### Returns:
+
+List of update ops of the layer that depend on `inputs`.
+
+
+get_weights
+
+``` python
+get_weights()
+```
+
+Returns the current weights of the layer.
+
+
+#### Returns:
+
+Weights values as a list of numpy arrays.
+
+
+initial_alignments
+
+``` python
+initial_alignments(
+ batch_size,
+ dtype
+)
+```
+
+Creates the initial alignment values for the monotonic attentions.
+
+Initializes to dirac distributions, i.e.
+[1, 0, 0, ...memory length..., 0] for all entries in the batch.
+
+#### Args:
+
+
+* `batch_size`: `int32` scalar, the batch_size.
+* `dtype`: The `dtype`.
+
+
+#### Returns:
+
+A `dtype` tensor shaped `[batch_size, alignments_size]`
+(`alignments_size` is the values' `max_time`).
+
+
+initial_state
+
+``` python
+initial_state(
+ batch_size,
+ dtype
+)
+```
+
+Creates the initial state values for the `AttentionWrapper` class.
+
+This is important for AttentionMechanisms that use the previous
+alignment to calculate the alignment at the next time step
+(e.g. monotonic attention).
+
+The default behavior is to return the same output as
+initial_alignments.
+
+#### Args:
+
+
+* `batch_size`: `int32` scalar, the batch_size.
+* `dtype`: The `dtype`.
+
+
+#### Returns:
+
+A structure of all-zero tensors with shapes as described by
+`state_size`.
+
+
+set_weights
+
+``` python
+set_weights(weights)
+```
+
+Sets the weights of the layer, from Numpy arrays.
+
+
+#### Arguments:
+
+
+* `weights`: a list of Numpy arrays. The number
+ of arrays and their shape must match
+ number of the dimensions of the weights
+ of the layer (i.e. it should match the
+ output of `get_weights`).
+
+
+#### Raises:
+
+
+* `ValueError`: If the provided weights list does not match the
+ layer's specifications.
+
+with_name_scope
+
+``` python
+with_name_scope(
+ cls,
+ method
+)
+```
+
+Decorator to automatically enter the module name scope.
+
+```
+class MyModule(tf.Module):
+ @tf.Module.with_name_scope
+ def __call__(self, x):
+ if not hasattr(self, 'w'):
+ self.w = tf.Variable(tf.random.normal([x.shape[1], 64]))
+ return tf.matmul(x, self.w)
+```
+
+Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose
+names include the module name:
+
+```
+mod = MyModule()
+mod(tf.ones([8, 32]))
+# ==>
+mod.w
+# ==>
+```
+
+#### Args:
+
+
+* `method`: The method to wrap.
+
+
+#### Returns:
+
+The original method wrapped such that it enters the module's name scope.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/SampleEmbeddingSampler.md b/docs/api_docs/python/tfa/seq2seq/SampleEmbeddingSampler.md
new file mode 100644
index 0000000000..0e5c21769f
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/SampleEmbeddingSampler.md
@@ -0,0 +1,155 @@
+
+
+
+
+
+
+
+
+
+
+
+
+# tfa.seq2seq.SampleEmbeddingSampler
+
+## Class `SampleEmbeddingSampler`
+
+A sampler for use during inference.
+
+Inherits From: [`GreedyEmbeddingSampler`](../../tfa/seq2seq/GreedyEmbeddingSampler.md)
+
+### Aliases:
+
+* Class `tfa.seq2seq.SampleEmbeddingSampler`
+* Class `tfa.seq2seq.sampler.SampleEmbeddingSampler`
+
+
+
+Defined in [`seq2seq/sampler.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/sampler.py).
+
+
+
+Uses sampling (from a distribution) instead of argmax and passes the
+result through an embedding layer to get the next input.
+
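+The sketch below shows how a temperature might be supplied; the value
+0.5 is purely an example, not a recommended setting.
+
+``` python
+import tensorflow_addons as tfa
+
+# Temperatures below 1.0 push sampling towards the argmax; values above
+# 1.0 flatten the distribution and yield more random samples.
+sampler = tfa.seq2seq.SampleEmbeddingSampler(softmax_temperature=0.5)
+```
+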
+__init__
+
+``` python
+__init__(
+ embedding_fn=None,
+ softmax_temperature=None,
+ seed=None
+)
+```
+
+Initializer.
+
+
+#### Args:
+
+
+* `embedding_fn`: (Optional) A callable that takes a vector tensor of
+ `ids` (argmax ids), or the `params` argument for
+ `embedding_lookup`. The returned tensor will be passed to the
+ decoder input.
+* `softmax_temperature`: (Optional) `float32` scalar, value to divide the
+ logits by before computing the softmax. Larger values (above 1.0)
+ result in more random samples, while smaller values push the
+ sampling distribution towards the argmax. Must be strictly greater
+ than 0. Defaults to 1.0.
+* `seed`: (Optional) The sampling seed.
+
+
+#### Raises:
+
+
+* `ValueError`: if `start_tokens` is not a 1D tensor or `end_token` is
+ not a scalar.
+
+
+
+## Properties
+
+batch_size
+
+
+
+
+sample_ids_dtype
+
+
+
+
+sample_ids_shape
+
+
+
+
+
+
+## Methods
+
+initialize
+
+``` python
+initialize(
+ embedding,
+ start_tokens=None,
+ end_token=None
+)
+```
+
+Initialize the GreedyEmbeddingSampler.
+
+
+#### Args:
+
+
+* `embedding`: A tensor that contains the embedding states matrix. It
+  will be used to generate outputs with `start_tokens` and `end_token`.
+  The embedding will be ignored if `embedding_fn` was provided
+  at `__init__()`.
+* `start_tokens`: `int32` vector shaped `[batch_size]`, the start tokens.
+* `end_token`: `int32` scalar, the token that marks end of decoding.
+
+
+#### Returns:
+
+Tuple of two items: `(finished, self.start_inputs)`.
+
+
+#### Raises:
+
+
+* `ValueError`: if `start_tokens` is not a 1D tensor or `end_token` is
+ not a scalar.
+
+
+next_inputs
+
+``` python
+next_inputs(
+ time,
+ outputs,
+ state,
+ sample_ids
+)
+```
+
+next_inputs_fn for the GreedyEmbeddingSampler.
+
+
+sample
+
+``` python
+sample(
+ time,
+ outputs,
+ state
+)
+```
+
+sample for the SampleEmbeddingSampler.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/Sampler.md b/docs/api_docs/python/tfa/seq2seq/Sampler.md
new file mode 100644
index 0000000000..cfd9219217
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/Sampler.md
@@ -0,0 +1,128 @@
+
+
+
+
+
+
+
+
+
+
+
+# tfa.seq2seq.Sampler
+
+## Class `Sampler`
+
+Interface for implementing sampling in seq2seq decoders.
+
+
+
+### Aliases:
+
+* Class `tfa.seq2seq.Sampler`
+* Class `tfa.seq2seq.sampler.Sampler`
+
+
+
+Defined in [`seq2seq/sampler.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/sampler.py).
+
+
+
+Sampler instances are used by `BasicDecoder`. The normal usage of a
+sampler looks like this:
+
+``` python
+sampler = Sampler(init_args)
+(initial_finished, initial_inputs) = sampler.initialize(input_tensors)
+for time_step in range(time):
+    cell_output, cell_state = cell.call(cell_input, previous_state)
+    sample_ids = sampler.sample(time_step, cell_output, cell_state)
+    (finished, next_inputs, next_state) = sampler.next_inputs(
+        time_step, cell_output, cell_state, sample_ids)
+```
+
+Note that tensor inputs should not be fed to the Sampler as `__init__()`
+parameters; instead, they should be fed by decoders via `initialize()`.
+
+## Properties
+
+batch_size
+
+Batch size of tensor returned by `sample`.
+
+Returns a scalar int32 tensor. The return value might not be available
+before the invocation of initialize(); in that case, a ValueError is
+raised.
+
+sample_ids_dtype
+
+DType of tensor returned by `sample`.
+
+Returns a DType. The return value might not be available before the
+invocation of initialize().
+
+sample_ids_shape
+
+Shape of tensor returned by `sample`, excluding the batch dimension.
+
+Returns a `TensorShape`. The return value might not be available
+before the invocation of initialize().
+
+
+
+## Methods
+
+initialize
+
+``` python
+initialize(
+ inputs,
+ **kwargs
+)
+```
+
+Initialize the sampler with the input tensors.
+
+This method is supposed to be invoked only once, before calling any
+other methods of the Sampler.
+
+#### Args:
+
+
+* `inputs`: A (structure of) input tensors; it could be a nested tuple
+  or a single tensor.
+* `**kwargs`: Other kwargs for initialization. It could contain tensors
+  like a mask for the inputs, or non-tensor parameters.
+
+
+#### Returns:
+
+`(initial_finished, initial_inputs)`.
+
+
+
+next_inputs
+
+``` python
+next_inputs(
+ time,
+ outputs,
+ state,
+ sample_ids
+)
+```
+
+Returns `(finished, next_inputs, next_state)`.
+
+
+sample
+
+``` python
+sample(
+ time,
+ outputs,
+ state
+)
+```
+
+Returns `sample_ids`.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/ScheduledEmbeddingTrainingSampler.md b/docs/api_docs/python/tfa/seq2seq/ScheduledEmbeddingTrainingSampler.md
new file mode 100644
index 0000000000..5f4f46f74d
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/ScheduledEmbeddingTrainingSampler.md
@@ -0,0 +1,133 @@
+
+
+
+
+
+
+
+
+
+
+
+
+# tfa.seq2seq.ScheduledEmbeddingTrainingSampler
+
+## Class `ScheduledEmbeddingTrainingSampler`
+
+A training sampler that adds scheduled sampling.
+
+Inherits From: [`TrainingSampler`](../../tfa/seq2seq/TrainingSampler.md)
+
+### Aliases:
+
+* Class `tfa.seq2seq.ScheduledEmbeddingTrainingSampler`
+* Class `tfa.seq2seq.sampler.ScheduledEmbeddingTrainingSampler`
+
+
+
+Defined in [`seq2seq/sampler.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/sampler.py).
+
+
+
+Returns -1s for sample_ids where no sampling took place; valid
+sample id values elsewhere.
+
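+As a hedged sketch, a sampler that samples from the output ids half of
+the time could be constructed as below; the probability and the one-hot
+`embedding_fn` are illustrative assumptions.
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+sampler = tfa.seq2seq.ScheduledEmbeddingTrainingSampler(
+    sampling_probability=tf.constant(0.5),                # assumed schedule
+    embedding_fn=lambda ids: tf.one_hot(ids, depth=100))  # assumed lookup
+```
+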
+__init__
+
+``` python
+__init__(
+ sampling_probability,
+ embedding_fn=None,
+ time_major=False,
+ seed=None,
+ scheduling_seed=None
+)
+```
+
+Initializer.
+
+
+#### Args:
+
+
+* `sampling_probability`: A `float32` 0-D or 1-D tensor: the probability
+ of sampling categorically from the output ids instead of reading
+ directly from the inputs.
+* `embedding_fn`: A callable that takes a vector tensor of `ids`
+ (argmax ids), or the `params` argument for `embedding_lookup`.
+* `time_major`: Python bool. Whether the tensors in `inputs` are time
+ major. If `False` (default), they are assumed to be batch major.
+* `seed`: The sampling seed.
+* `scheduling_seed`: The schedule decision rule sampling seed.
+
+
+#### Raises:
+
+
+* `ValueError`: if `sampling_probability` is not a scalar or vector.
+
+
+
+## Properties
+
+batch_size
+
+
+
+
+sample_ids_dtype
+
+
+
+
+sample_ids_shape
+
+
+
+
+
+
+## Methods
+
+initialize
+
+``` python
+initialize(
+ inputs,
+ sequence_length=None,
+ embedding=None
+)
+```
+
+
+
+
+
+next_inputs
+
+``` python
+next_inputs(
+ time,
+ outputs,
+ state,
+ sample_ids
+)
+```
+
+
+
+
+sample
+
+``` python
+sample(
+ time,
+ outputs,
+ state
+)
+```
+
+
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/ScheduledOutputTrainingSampler.md b/docs/api_docs/python/tfa/seq2seq/ScheduledOutputTrainingSampler.md
new file mode 100644
index 0000000000..fdeaa44a86
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/ScheduledOutputTrainingSampler.md
@@ -0,0 +1,132 @@
+
+
+
+
+
+
+
+
+
+
+
+
+# tfa.seq2seq.ScheduledOutputTrainingSampler
+
+## Class `ScheduledOutputTrainingSampler`
+
+A training sampler that adds scheduled sampling directly to outputs.
+
+Inherits From: [`TrainingSampler`](../../tfa/seq2seq/TrainingSampler.md)
+
+### Aliases:
+
+* Class `tfa.seq2seq.ScheduledOutputTrainingSampler`
+* Class `tfa.seq2seq.sampler.ScheduledOutputTrainingSampler`
+
+
+
+Defined in [`seq2seq/sampler.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/sampler.py).
+
+
+
+Returns False for sample_ids where no sampling took place; True
+elsewhere.
+
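+A minimal construction sketch, with an assumed sampling probability:
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+# With probability 0.25 the RNN outputs are fed back as the next inputs;
+# otherwise the ground-truth inputs are read. The value is illustrative.
+sampler = tfa.seq2seq.ScheduledOutputTrainingSampler(
+    sampling_probability=tf.constant(0.25))
+```
+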
+__init__
+
+``` python
+__init__(
+ sampling_probability,
+ time_major=False,
+ seed=None,
+ next_inputs_fn=None
+)
+```
+
+Initializer.
+
+
+#### Args:
+
+
+* `sampling_probability`: A `float32` scalar tensor: the probability of
+ sampling from the outputs instead of reading directly from the
+ inputs.
+* `time_major`: Python bool. Whether the tensors in `inputs` are time
+ major. If `False` (default), they are assumed to be batch major.
+* `seed`: The sampling seed.
+* `next_inputs_fn`: (Optional) callable to apply to the RNN outputs to
+ create the next input when sampling. If `None` (default), the RNN
+ outputs will be used as the next inputs.
+
+
+#### Raises:
+
+
+* `ValueError`: if `sampling_probability` is not a scalar or vector.
+
+
+
+## Properties
+
+batch_size
+
+
+
+
+sample_ids_dtype
+
+
+
+
+sample_ids_shape
+
+
+
+
+
+
+## Methods
+
+initialize
+
+``` python
+initialize(
+ inputs,
+ sequence_length=None,
+ auxiliary_inputs=None
+)
+```
+
+
+
+
+
+next_inputs
+
+``` python
+next_inputs(
+ time,
+ outputs,
+ state,
+ sample_ids
+)
+```
+
+
+
+
+sample
+
+``` python
+sample(
+ time,
+ outputs,
+ state
+)
+```
+
+
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/SequenceLoss.md b/docs/api_docs/python/tfa/seq2seq/SequenceLoss.md
new file mode 100644
index 0000000000..fe49b5b542
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/SequenceLoss.md
@@ -0,0 +1,96 @@
+
+
+
+
+
+
+
+
+
+# tfa.seq2seq.SequenceLoss
+
+## Class `SequenceLoss`
+
+Weighted cross-entropy loss for a sequence of logits.
+
+
+
+### Aliases:
+
+* Class `tfa.seq2seq.SequenceLoss`
+* Class `tfa.seq2seq.loss.SequenceLoss`
+
+
+
+Defined in [`seq2seq/loss.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/loss.py).
+
+
+
+
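+A hedged usage sketch follows; the logits, targets, and weights are
+random placeholders chosen only to show the expected shapes.
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+batch_size, max_time, vocab_size = 4, 7, 100
+logits = tf.random.normal([batch_size, max_time, vocab_size])
+targets = tf.random.uniform(
+    [batch_size, max_time], maxval=vocab_size, dtype=tf.int64)
+weights = tf.ones([batch_size, max_time])  # zero out padded positions here
+
+loss_fn = tfa.seq2seq.SequenceLoss()
+loss = loss_fn(targets, logits, sample_weight=weights)  # scalar tensor
+```
+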
+__init__
+
+``` python
+__init__(
+ average_across_timesteps=False,
+ average_across_batch=False,
+ sum_over_timesteps=True,
+ sum_over_batch=True,
+ softmax_loss_function=None,
+ name=None
+)
+```
+
+
+
+
+
+
+## Methods
+
+__call__
+
+``` python
+__call__(
+ y_true,
+ y_pred,
+ sample_weight=None
+)
+```
+
+Overrides the parent `__call__` to provide customized reduction
+behavior.
+
+from_config
+
+``` python
+from_config(
+ cls,
+ config
+)
+```
+
+Instantiates a `Loss` from its config (output of `get_config()`).
+
+
+#### Args:
+
+
+* `config`: Output of `get_config()`.
+
+
+#### Returns:
+
+A `Loss` instance.
+
+
+get_config
+
+``` python
+get_config()
+```
+
+
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/TrainingSampler.md b/docs/api_docs/python/tfa/seq2seq/TrainingSampler.md
new file mode 100644
index 0000000000..df8e1300be
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/TrainingSampler.md
@@ -0,0 +1,135 @@
+
+
+
+
+
+
+
+
+
+
+
+
+# tfa.seq2seq.TrainingSampler
+
+## Class `TrainingSampler`
+
+A Sampler for use during training.
+
+Inherits From: [`Sampler`](../../tfa/seq2seq/Sampler.md)
+
+### Aliases:
+
+* Class `tfa.seq2seq.TrainingSampler`
+* Class `tfa.seq2seq.sampler.TrainingSampler`
+
+
+
+Defined in [`seq2seq/sampler.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/sampler.py).
+
+
+
+Only reads inputs.
+
+Returned sample_ids are the argmax of the RNN output logits.
+
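+A short sketch of the training-time flow, with assumed tensor shapes:
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+batch_size, max_time, depth = 4, 7, 16
+decoder_inputs = tf.random.normal([batch_size, max_time, depth])  # assumed
+
+sampler = tfa.seq2seq.TrainingSampler()
+finished, first_inputs = sampler.initialize(
+    decoder_inputs,
+    sequence_length=tf.fill([batch_size], max_time))
+```
+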
+__init__
+
+``` python
+__init__(time_major=False)
+```
+
+Initializer.
+
+
+#### Args:
+
+
+* `time_major`: Python bool. Whether the tensors in `inputs` are time
+ major. If `False` (default), they are assumed to be batch major.
+
+
+#### Raises:
+
+
+* `ValueError`: if `sequence_length` is not a 1D tensor.
+
+
+
+## Properties
+
+batch_size
+
+
+
+
+sample_ids_dtype
+
+
+
+
+sample_ids_shape
+
+
+
+
+
+
+## Methods
+
+initialize
+
+``` python
+initialize(
+ inputs,
+ sequence_length=None
+)
+```
+
+Initialize the TrainingSampler.
+
+
+#### Args:
+
+
+* `inputs`: A (structure of) input tensors.
+* `sequence_length`: An int32 vector tensor.
+
+
+#### Returns:
+
+`(finished, next_inputs)`, a tuple of two items. The first item is a
+  boolean vector indicating whether each item in the batch has
+  finished. The second item is the first slice of the input data along
+  the timestep dimension (usually the second dim of the input).
+
+
+
+next_inputs
+
+``` python
+next_inputs(
+ time,
+ outputs,
+ state,
+ sample_ids
+)
+```
+
+
+
+
+sample
+
+``` python
+sample(
+ time,
+ outputs,
+ state
+)
+```
+
+
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/attention_wrapper.md b/docs/api_docs/python/tfa/seq2seq/attention_wrapper.md
new file mode 100644
index 0000000000..23a464ed62
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/attention_wrapper.md
@@ -0,0 +1,40 @@
+
+
+
+
+
+# Module: tfa.seq2seq.attention_wrapper
+
+A powerful dynamic attention wrapper object.
+
+
+
+Defined in [`seq2seq/attention_wrapper.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/attention_wrapper.py).
+
+
+
+
+## Classes
+
+[`class AttentionMechanism`](../../tfa/seq2seq/AttentionMechanism.md)
+
+[`class AttentionWrapper`](../../tfa/seq2seq/AttentionWrapper.md): Wraps another `RNNCell` with attention.
+
+[`class AttentionWrapperState`](../../tfa/seq2seq/AttentionWrapperState.md): `namedtuple` storing the state of a `AttentionWrapper`.
+
+[`class BahdanauAttention`](../../tfa/seq2seq/BahdanauAttention.md): Implements Bahdanau-style (additive) attention.
+
+[`class BahdanauMonotonicAttention`](../../tfa/seq2seq/BahdanauMonotonicAttention.md): Monotonic attention mechanism with Bahdanau-style energy function.
+
+[`class LuongAttention`](../../tfa/seq2seq/LuongAttention.md): Implements Luong-style (multiplicative) attention scoring.
+
+[`class LuongMonotonicAttention`](../../tfa/seq2seq/LuongMonotonicAttention.md): Monotonic attention mechanism with Luong-style energy function.
+
+## Functions
+
+[`hardmax(...)`](../../tfa/seq2seq/hardmax.md): Returns batched one-hot vectors.
+
+[`monotonic_attention(...)`](../../tfa/seq2seq/monotonic_attention.md): Compute monotonic attention distribution from choosing probabilities.
+
+[`safe_cumprod(...)`](../../tfa/seq2seq/safe_cumprod.md): Computes cumprod of x in logspace using cumsum to avoid underflow.
+
diff --git a/docs/api_docs/python/tfa/seq2seq/basic_decoder.md b/docs/api_docs/python/tfa/seq2seq/basic_decoder.md
new file mode 100644
index 0000000000..8cc74c9532
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/basic_decoder.md
@@ -0,0 +1,22 @@
+
+
+
+
+
+# Module: tfa.seq2seq.basic_decoder
+
+A class of Decoders that may sample to generate the next input.
+
+
+
+Defined in [`seq2seq/basic_decoder.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/basic_decoder.py).
+
+
+
+
+## Classes
+
+[`class BasicDecoder`](../../tfa/seq2seq/BasicDecoder.md): Basic sampling decoder.
+
+[`class BasicDecoderOutput`](../../tfa/seq2seq/BasicDecoderOutput.md)
+
diff --git a/docs/api_docs/python/tfa/seq2seq/beam_search_decoder.md b/docs/api_docs/python/tfa/seq2seq/beam_search_decoder.md
new file mode 100644
index 0000000000..0a359ea60e
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/beam_search_decoder.md
@@ -0,0 +1,38 @@
+
+
+
+
+
+# Module: tfa.seq2seq.beam_search_decoder
+
+A decoder that performs beam search.
+
+
+
+Defined in [`seq2seq/beam_search_decoder.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/beam_search_decoder.py).
+
+
+
+
+## Classes
+
+[`class BeamSearchDecoder`](../../tfa/seq2seq/BeamSearchDecoder.md): BeamSearch sampling decoder.
+
+[`class BeamSearchDecoderMixin`](../../tfa/seq2seq/beam_search_decoder/BeamSearchDecoderMixin.md): BeamSearchDecoderMixin contains the common methods for BeamSearchDecoder.
+
+[`class BeamSearchDecoderOutput`](../../tfa/seq2seq/BeamSearchDecoderOutput.md)
+
+[`class BeamSearchDecoderState`](../../tfa/seq2seq/BeamSearchDecoderState.md)
+
+[`class FinalBeamSearchDecoderOutput`](../../tfa/seq2seq/FinalBeamSearchDecoderOutput.md): Final outputs returned by the beam search after all decoding is finished.
+
+## Functions
+
+[`attention_probs_from_attn_state(...)`](../../tfa/seq2seq/beam_search_decoder/attention_probs_from_attn_state.md): Calculates the average attention probabilities.
+
+[`gather_tree_from_array(...)`](../../tfa/seq2seq/gather_tree_from_array.md): Calculates the full beams for `TensorArray`s.
+
+[`get_attention_probs(...)`](../../tfa/seq2seq/beam_search_decoder/get_attention_probs.md): Get attention probabilities from the cell state.
+
+[`tile_batch(...)`](../../tfa/seq2seq/tile_batch.md): Tile the batch dimension of a (possibly nested structure of) tensor(s) `t`.
+
diff --git a/docs/api_docs/python/tfa/seq2seq/beam_search_decoder/BeamSearchDecoderMixin.md b/docs/api_docs/python/tfa/seq2seq/beam_search_decoder/BeamSearchDecoderMixin.md
new file mode 100644
index 0000000000..52efa9249f
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/beam_search_decoder/BeamSearchDecoderMixin.md
@@ -0,0 +1,168 @@
+
+
+
+
+
+
+
+
+
+
+
+# tfa.seq2seq.beam_search_decoder.BeamSearchDecoderMixin
+
+## Class `BeamSearchDecoderMixin`
+
+BeamSearchDecoderMixin contains the common methods for BeamSearchDecoder.
+
+
+
+
+
+Defined in [`seq2seq/beam_search_decoder.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/beam_search_decoder.py).
+
+
+
+It is expected to be used as a base class for a concrete
+BeamSearchDecoder. Since this is a mixin class, it is expected to be
+used together with another class as the base.
+
+__init__
+
+``` python
+__init__(
+ cell,
+ beam_width,
+ output_layer=None,
+ length_penalty_weight=0.0,
+ coverage_penalty_weight=0.0,
+ reorder_tensor_arrays=True,
+ **kwargs
+)
+```
+
+Initialize the BeamSearchDecoderMixin.
+
+
+#### Args:
+
+
+* `cell`: An `RNNCell` instance.
+* `beam_width`: Python integer, the number of beams.
+* `output_layer`: (Optional) An instance of `tf.keras.layers.Layer`,
+ i.e., `tf.keras.layers.Dense`. Optional layer to apply to the RNN
+ output prior to storing the result or sampling.
+* `length_penalty_weight`: Float weight to penalize length. Disabled with
+ 0.0.
+* `coverage_penalty_weight`: Float weight to penalize the coverage of
+ source sentence. Disabled with 0.0.
+* `reorder_tensor_arrays`: If `True`, `TensorArray`s' elements within the
+ cell state will be reordered according to the beam search path. If
+ the `TensorArray` can be reordered, the stacked form will be
+ returned. Otherwise, the `TensorArray` will be returned as is. Set
+ this flag to `False` if the cell state contains `TensorArray`s that
+ are not amenable to reordering.
+* `**kwargs`: Dict, other keyword arguments for parent class.
+
+
+#### Raises:
+
+
+* `TypeError`: if `cell` is not an instance of `RNNCell`,
+ or `output_layer` is not an instance of `tf.keras.layers.Layer`.
+
+
+
+## Properties
+
+batch_size
+
+
+
+
+output_size
+
+
+
+
+tracks_own_finished
+
+The BeamSearchDecoder shuffles its beams and their finished state.
+
+For this reason, it conflicts with the `dynamic_decode` function's
+tracking of finished states. Setting this property to true avoids
+early stopping of decoding due to mismanagement of the finished state
+in `dynamic_decode`.
+
+#### Returns:
+
+`True`.
+
+
+
+
+## Methods
+
+finalize
+
+``` python
+finalize(
+ outputs,
+ final_state,
+ sequence_lengths
+)
+```
+
+Finalize and return the predicted_ids.
+
+
+#### Args:
+
+
+* `outputs`: An instance of BeamSearchDecoderOutput.
+* `final_state`: An instance of BeamSearchDecoderState. Passed through to
+ the output.
+* `sequence_lengths`: An `int64` tensor shaped
+ `[batch_size, beam_width]`. The sequence lengths determined for
+ each beam during decode. **NOTE** These are ignored; the updated
+ sequence lengths are stored in `final_state.lengths`.
+
+
+#### Returns:
+
+
+* `outputs`: An instance of `FinalBeamSearchDecoderOutput` where the
+ predicted_ids are the result of calling _gather_tree.
+* `final_state`: The same input instance of `BeamSearchDecoderState`.
+
+step
+
+``` python
+step(
+ time,
+ inputs,
+ state,
+ name=None
+)
+```
+
+Perform a decoding step.
+
+
+#### Args:
+
+
+* `time`: scalar `int32` tensor.
+* `inputs`: A (structure of) input tensors.
+* `state`: A (structure of) state tensors and TensorArrays.
+* `name`: Name scope for any created operations.
+
+
+#### Returns:
+
+`(outputs, next_state, next_inputs, finished)`.
+
+
+
+
diff --git a/docs/api_docs/python/tfa/seq2seq/beam_search_decoder/attention_probs_from_attn_state.md b/docs/api_docs/python/tfa/seq2seq/beam_search_decoder/attention_probs_from_attn_state.md
new file mode 100644
index 0000000000..c5e8495d04
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/beam_search_decoder/attention_probs_from_attn_state.md
@@ -0,0 +1,31 @@
+
+
+
+
+
+# tfa.seq2seq.beam_search_decoder.attention_probs_from_attn_state
+
+Calculates the average attention probabilities.
+
+``` python
+tfa.seq2seq.beam_search_decoder.attention_probs_from_attn_state(attention_state)
+```
+
+
+
+Defined in [`seq2seq/beam_search_decoder.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/beam_search_decoder.py).
+
+
+
+
+#### Args:
+
+
+* `attention_state`: An instance of `AttentionWrapperState`.
+
+
+#### Returns:
+
+The attention probabilities in the given AttentionWrapperState.
+If there are multiple attention mechanisms, the average value across all
+attention mechanisms is returned.
diff --git a/docs/api_docs/python/tfa/seq2seq/beam_search_decoder/get_attention_probs.md b/docs/api_docs/python/tfa/seq2seq/beam_search_decoder/get_attention_probs.md
new file mode 100644
index 0000000000..b21fd9b665
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/beam_search_decoder/get_attention_probs.md
@@ -0,0 +1,44 @@
+
+
+
+
+
+# tfa.seq2seq.beam_search_decoder.get_attention_probs
+
+Get attention probabilities from the cell state.
+
+``` python
+tfa.seq2seq.beam_search_decoder.get_attention_probs(
+ next_cell_state,
+ coverage_penalty_weight
+)
+```
+
+
+
+Defined in [`seq2seq/beam_search_decoder.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/beam_search_decoder.py).
+
+
+
+
+#### Args:
+
+
+* `next_cell_state`: The next state from the cell, e.g. an instance of
+ AttentionWrapperState if the cell is attentional.
+* `coverage_penalty_weight`: Float weight to penalize the coverage of source
+ sentence. Disabled with 0.0.
+
+
+#### Returns:
+
+The attention probabilities with shape
+ `[batch_size, beam_width, max_time]` if coverage penalty is enabled.
+ Otherwise, returns None.
+
+
+
+#### Raises:
+
+
+* `ValueError`: If no cell is attentional but coverage penalty is enabled.
\ No newline at end of file
diff --git a/docs/api_docs/python/tfa/seq2seq/decoder.md b/docs/api_docs/python/tfa/seq2seq/decoder.md
new file mode 100644
index 0000000000..7537ec70f1
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/decoder.md
@@ -0,0 +1,26 @@
+
+
+
+
+
+# Module: tfa.seq2seq.decoder
+
+Seq2seq layer operations for use in neural networks.
+
+
+
+Defined in [`seq2seq/decoder.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/decoder.py).
+
+
+
+
+## Classes
+
+[`class BaseDecoder`](../../tfa/seq2seq/BaseDecoder.md): An RNN Decoder that is based on a Keras layer.
+
+[`class Decoder`](../../tfa/seq2seq/Decoder.md): An RNN Decoder abstract interface object.
+
+## Functions
+
+[`dynamic_decode(...)`](../../tfa/seq2seq/dynamic_decode.md): Perform dynamic decoding with `decoder`.
+
diff --git a/docs/api_docs/python/tfa/seq2seq/dynamic_decode.md b/docs/api_docs/python/tfa/seq2seq/dynamic_decode.md
new file mode 100644
index 0000000000..005c496a9c
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/dynamic_decode.md
@@ -0,0 +1,70 @@
+
+
+
+
+
+# tfa.seq2seq.dynamic_decode
+
+Perform dynamic decoding with `decoder`.
+
+### Aliases:
+
+* `tfa.seq2seq.decoder.dynamic_decode`
+* `tfa.seq2seq.dynamic_decode`
+
+``` python
+tfa.seq2seq.dynamic_decode(
+ decoder,
+ output_time_major=False,
+ impute_finished=False,
+ maximum_iterations=None,
+ parallel_iterations=32,
+ swap_memory=False,
+ scope=None,
+ **kwargs
+)
+```
+
+
+
+Defined in [`seq2seq/decoder.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/decoder.py).
+
+
+
+Calls initialize() once and step() repeatedly on the Decoder object.
+
+#### Args:
+
+
+* `decoder`: A `Decoder` instance.
+* `output_time_major`: Python boolean. Default: `False` (batch major). If
+ `True`, outputs are returned as time major tensors (this mode is
+ faster). Otherwise, outputs are returned as batch major tensors (this
+ adds extra time to the computation).
+* `impute_finished`: Python boolean. If `True`, then states for batch
+ entries which are marked as finished get copied through and the
+ corresponding outputs get zeroed out. This causes some slowdown at
+ each time step, but ensures that the final state and outputs have
+ the correct values and that backprop ignores time steps that were
+ marked as finished.
+* `maximum_iterations`: `int32` scalar, maximum allowed number of decoding
+ steps. Default is `None` (decode until the decoder is fully done).
+* `parallel_iterations`: Argument passed to `tf.while_loop`.
+* `swap_memory`: Argument passed to `tf.while_loop`.
+* `scope`: Optional variable scope to use.
+* `**kwargs`: dict, other keyword arguments for `dynamic_decode`. It may
+ contain arguments used to initialize a `BaseDecoder`, which takes all
+ tensor inputs during `call()`.
+
+
+#### Returns:
+
+`(final_outputs, final_state, final_sequence_lengths)`.
+
+
+
+#### Raises:
+
+
+* `TypeError`: if `decoder` is not an instance of `Decoder`.
+* `ValueError`: if `maximum_iterations` is provided but is not a scalar.
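+
+#### Example:
+
+A minimal sketch, assuming a `tfa.seq2seq.BasicDecoder` with a
+`TrainingSampler`, and assuming that `BaseDecoder` initialization arguments
+are forwarded through `**kwargs` as `decoder_init_input` /
+`decoder_init_kwargs` (names and sizes here are illustrative):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+batch_size, max_time, units, vocab_size = 4, 7, 16, 100
+
+sampler = tfa.seq2seq.TrainingSampler()
+cell = tf.keras.layers.LSTMCell(units)
+decoder = tfa.seq2seq.BasicDecoder(
+    cell, sampler, output_layer=tf.keras.layers.Dense(vocab_size))
+
+inputs = tf.random.normal([batch_size, max_time, units])
+initial_state = cell.get_initial_state(
+    batch_size=batch_size, dtype=tf.float32)
+
+final_outputs, final_state, final_lengths = tfa.seq2seq.dynamic_decode(
+    decoder,
+    maximum_iterations=max_time,
+    decoder_init_input=inputs,
+    decoder_init_kwargs={
+        'initial_state': initial_state,
+        'sequence_length': tf.fill([batch_size], max_time),
+    })
+# final_outputs.rnn_output has shape [batch_size, max_time, vocab_size].
+```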
\ No newline at end of file
diff --git a/docs/api_docs/python/tfa/seq2seq/gather_tree_from_array.md b/docs/api_docs/python/tfa/seq2seq/gather_tree_from_array.md
new file mode 100644
index 0000000000..8c884c4258
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/gather_tree_from_array.md
@@ -0,0 +1,44 @@
+
+
+
+
+
+# tfa.seq2seq.gather_tree_from_array
+
+Calculates the full beams for `TensorArray`s.
+
+### Aliases:
+
+* `tfa.seq2seq.beam_search_decoder.gather_tree_from_array`
+* `tfa.seq2seq.gather_tree_from_array`
+
+``` python
+tfa.seq2seq.gather_tree_from_array(
+ t,
+ parent_ids,
+ sequence_length
+)
+```
+
+
+
+Defined in [`seq2seq/beam_search_decoder.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/beam_search_decoder.py).
+
+
+
+
+#### Args:
+
+
+* `t`: A stacked `TensorArray` of size `max_time` that contains `Tensor`s of
+ shape `[batch_size, beam_width, s]` or `[batch_size * beam_width, s]`
+ where `s` is the depth shape.
+* `parent_ids`: The parent ids of shape `[max_time, batch_size, beam_width]`.
+* `sequence_length`: The sequence length of shape `[batch_size, beam_width]`.
+
+
+#### Returns:
+
+A `Tensor` which is a stacked `TensorArray` of the same size and type as
+`t` and where beams are sorted in each `Tensor` according to
+`parent_ids`.
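+
+#### Example:
+
+A shape-level sketch (sizes are hypothetical; all-zero `parent_ids` simply
+keep each beam in place):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+max_time, batch_size, beam_width, depth = 3, 2, 4, 5
+t = tf.random.normal([max_time, batch_size, beam_width, depth])
+parent_ids = tf.zeros([max_time, batch_size, beam_width], dtype=tf.int32)
+sequence_length = tf.fill([batch_size, beam_width], max_time)
+
+sorted_t = tfa.seq2seq.gather_tree_from_array(t, parent_ids, sequence_length)
+# sorted_t has the same shape as t, with beams in each Tensor reordered
+# according to parent_ids.
+```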
diff --git a/docs/api_docs/python/tfa/seq2seq/hardmax.md b/docs/api_docs/python/tfa/seq2seq/hardmax.md
new file mode 100644
index 0000000000..ca6aed2e15
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/hardmax.md
@@ -0,0 +1,38 @@
+
+
+
+
+
+# tfa.seq2seq.hardmax
+
+Returns batched one-hot vectors.
+
+### Aliases:
+
+* `tfa.seq2seq.attention_wrapper.hardmax`
+* `tfa.seq2seq.hardmax`
+
+``` python
+tfa.seq2seq.hardmax(
+ logits,
+ name=None
+)
+```
+
+
+
+Defined in [`seq2seq/attention_wrapper.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/attention_wrapper.py).
+
+
+
+The depth index containing the `1` is that of the maximum logit value.
+
+#### Args:
+
+
+* `logits`: A batch tensor of logit values.
+* `name`: Name to use when creating ops.
+
+#### Returns:
+
+A batched one-hot tensor.
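+
+#### Example:
+
+A quick sketch of the behavior (the commented values are what one would
+expect, not captured output):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+logits = tf.constant([[1.0, 3.0, 2.0],
+                      [4.0, 0.5, 0.5]])
+tfa.seq2seq.hardmax(logits)
+# -> [[0., 1., 0.],
+#     [1., 0., 0.]]
+```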
diff --git a/docs/api_docs/python/tfa/seq2seq/loss.md b/docs/api_docs/python/tfa/seq2seq/loss.md
new file mode 100644
index 0000000000..1408f71d38
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/loss.md
@@ -0,0 +1,24 @@
+
+
+
+
+
+# Module: tfa.seq2seq.loss
+
+Seq2seq loss operations for use in sequence models.
+
+
+
+Defined in [`seq2seq/loss.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/loss.py).
+
+
+
+
+## Classes
+
+[`class SequenceLoss`](../../tfa/seq2seq/SequenceLoss.md): Weighted cross-entropy loss for a sequence of logits.
+
+## Functions
+
+[`sequence_loss(...)`](../../tfa/seq2seq/sequence_loss.md): Weighted cross-entropy loss for a sequence of logits.
+
diff --git a/docs/api_docs/python/tfa/seq2seq/monotonic_attention.md b/docs/api_docs/python/tfa/seq2seq/monotonic_attention.md
new file mode 100644
index 0000000000..2392214a12
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/monotonic_attention.md
@@ -0,0 +1,73 @@
+
+
+
+
+
+# tfa.seq2seq.monotonic_attention
+
+Compute monotonic attention distribution from choosing probabilities.
+
+### Aliases:
+
+* `tfa.seq2seq.attention_wrapper.monotonic_attention`
+* `tfa.seq2seq.monotonic_attention`
+
+``` python
+tfa.seq2seq.monotonic_attention(
+ p_choose_i,
+ previous_attention,
+ mode
+)
+```
+
+
+
+Defined in [`seq2seq/attention_wrapper.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/attention_wrapper.py).
+
+
+
+Monotonic attention implies that the input sequence is processed in an
+explicitly left-to-right manner when generating the output sequence. In
+addition, once an input sequence element is attended to at a given output
+timestep, elements occurring before it cannot be attended to at subsequent
+output timesteps. This function generates attention distributions
+according to these assumptions. For more information, see `Online and
+Linear-Time Attention by Enforcing Monotonic Alignments`.
+
+#### Args:
+
+
+* `p_choose_i`: Probability of choosing input sequence/memory element i.
+ Should be of shape (batch_size, input_sequence_length), and should all
+ be in the range [0, 1].
+* `previous_attention`: The attention distribution from the previous output
+ timestep. Should be of shape (batch_size, input_sequence_length). For
+ the first output timestep, previous_attention[n] should be
+ [1, 0, 0, ..., 0] for all n in [0, ... batch_size - 1].
+* `mode`: How to compute the attention distribution. Must be one of
+ 'recursive', 'parallel', or 'hard'.
+ * 'recursive' uses tf.scan to recursively compute the distribution.
+ This is slowest but is exact, general, and does not suffer from
+ numerical instabilities.
+ * 'parallel' uses parallelized cumulative-sum and cumulative-product
+ operations to compute a closed-form solution to the recurrence
+ relation defining the attention distribution. This makes it more
+ efficient than 'recursive', but it requires numerical checks which
+ make the distribution non-exact. This can be a problem in
+ particular when input_sequence_length is long and/or p_choose_i has
+ entries very close to 0 or 1.
+ * 'hard' requires that the probabilities in p_choose_i are all either
+ 0 or 1, and subsequently uses a more efficient and exact solution.
+
+
+#### Returns:
+
+A tensor of shape (batch_size, input_sequence_length) representing the
+attention distributions for each sequence in the batch.
+
+
+
+#### Raises:
+
+
+* `ValueError`: If `mode` is not one of 'recursive', 'parallel', or 'hard'.
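+
+#### Example:
+
+A minimal sketch for a first output timestep, using the 'parallel' mode
+(batch size and sequence length are hypothetical):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+batch_size, seq_len = 2, 4
+p_choose_i = tf.random.uniform([batch_size, seq_len])  # values in [0, 1]
+# At the first output timestep, attention is [1, 0, ..., 0] per batch entry.
+previous_attention = tf.one_hot(
+    tf.zeros([batch_size], dtype=tf.int32), seq_len)
+
+attention = tfa.seq2seq.monotonic_attention(
+    p_choose_i, previous_attention, mode='parallel')
+# attention has shape [batch_size, seq_len], one distribution per sequence.
+```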
\ No newline at end of file
diff --git a/docs/api_docs/python/tfa/seq2seq/safe_cumprod.md b/docs/api_docs/python/tfa/seq2seq/safe_cumprod.md
new file mode 100644
index 0000000000..ceee331bb7
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/safe_cumprod.md
@@ -0,0 +1,44 @@
+
+
+
+
+
+# tfa.seq2seq.safe_cumprod
+
+Computes cumprod of x in logspace using cumsum to avoid underflow.
+
+### Aliases:
+
+* `tfa.seq2seq.attention_wrapper.safe_cumprod`
+* `tfa.seq2seq.safe_cumprod`
+
+``` python
+tfa.seq2seq.safe_cumprod(
+ x,
+ *args,
+ **kwargs
+)
+```
+
+
+
+Defined in [`seq2seq/attention_wrapper.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/attention_wrapper.py).
+
+
+
+The cumprod function and its gradient can result in numerical instabilities
+when its argument has very small and/or zero values. As long as the
+argument is all positive, we can instead compute the cumulative product as
+exp(cumsum(log(x))). This function can be called identically to
+tf.cumprod.
+
+#### Args:
+
+
+* `x`: Tensor to take the cumulative product of.
+* `*args`: Passed on to cumsum; these are identical to those in cumprod.
+* `**kwargs`: Passed on to cumsum; these are identical to those in cumprod.
+
+#### Returns:
+
+Cumulative product of x.
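+
+#### Example:
+
+A small sketch; extra arguments mirror those of `tf.cumprod` and are passed
+on to the underlying cumsum:
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+x = tf.constant([0.9, 1e-7, 0.5, 0.3])
+
+# Approximately tf.math.cumprod(x), but computed as exp(cumsum(log(x)))
+# so that tiny positive values do not underflow.
+tfa.seq2seq.safe_cumprod(x)
+
+# The usual cumprod-style keyword arguments are accepted:
+tfa.seq2seq.safe_cumprod(x, exclusive=True)
+```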
diff --git a/docs/api_docs/python/tfa/seq2seq/sampler.md b/docs/api_docs/python/tfa/seq2seq/sampler.md
new file mode 100644
index 0000000000..c544e9d4ec
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/sampler.md
@@ -0,0 +1,40 @@
+
+
+
+
+
+# Module: tfa.seq2seq.sampler
+
+A library of samplers for use with SamplingDecoders.
+
+
+
+Defined in [`seq2seq/sampler.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/sampler.py).
+
+
+
+
+## Classes
+
+[`class CustomSampler`](../../tfa/seq2seq/CustomSampler.md): Base abstract class that allows the user to customize sampling.
+
+[`class GreedyEmbeddingSampler`](../../tfa/seq2seq/GreedyEmbeddingSampler.md): A sampler for use during inference.
+
+[`class InferenceSampler`](../../tfa/seq2seq/InferenceSampler.md): A helper to use during inference with a custom sampling function.
+
+[`class SampleEmbeddingSampler`](../../tfa/seq2seq/SampleEmbeddingSampler.md): A sampler for use during inference.
+
+[`class Sampler`](../../tfa/seq2seq/Sampler.md): Interface for implementing sampling in seq2seq decoders.
+
+[`class ScheduledEmbeddingTrainingSampler`](../../tfa/seq2seq/ScheduledEmbeddingTrainingSampler.md): A training sampler that adds scheduled sampling.
+
+[`class ScheduledOutputTrainingSampler`](../../tfa/seq2seq/ScheduledOutputTrainingSampler.md): A training sampler that adds scheduled sampling directly to outputs.
+
+[`class TrainingSampler`](../../tfa/seq2seq/TrainingSampler.md): A Sampler for use during training.
+
+## Functions
+
+[`bernoulli_sample(...)`](../../tfa/seq2seq/sampler/bernoulli_sample.md): Samples from Bernoulli distribution.
+
+[`categorical_sample(...)`](../../tfa/seq2seq/sampler/categorical_sample.md): Samples from categorical distribution.
+
diff --git a/docs/api_docs/python/tfa/seq2seq/sampler/bernoulli_sample.md b/docs/api_docs/python/tfa/seq2seq/sampler/bernoulli_sample.md
new file mode 100644
index 0000000000..bca0a10698
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/sampler/bernoulli_sample.md
@@ -0,0 +1,24 @@
+
+
+
+
+
+# tfa.seq2seq.sampler.bernoulli_sample
+
+Samples from Bernoulli distribution.
+
+``` python
+tfa.seq2seq.sampler.bernoulli_sample(
+ probs=None,
+ logits=None,
+ dtype=tf.int32,
+ sample_shape=(),
+ seed=None
+)
+```
+
+
+
+Defined in [`seq2seq/sampler.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/sampler.py).
+
+
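+#### Example:
+
+A minimal usage sketch (values are illustrative; one of `probs` or `logits`
+is expected):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+probs = tf.constant([0.1, 0.5, 0.9])
+samples = tfa.seq2seq.sampler.bernoulli_sample(probs=probs, seed=42)
+# samples is an int32 tensor of 0s and 1s, one draw per probability.
+```
+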
diff --git a/docs/api_docs/python/tfa/seq2seq/sampler/categorical_sample.md b/docs/api_docs/python/tfa/seq2seq/sampler/categorical_sample.md
new file mode 100644
index 0000000000..d13ef6b34e
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/sampler/categorical_sample.md
@@ -0,0 +1,23 @@
+
+
+
+
+
+# tfa.seq2seq.sampler.categorical_sample
+
+Samples from categorical distribution.
+
+``` python
+tfa.seq2seq.sampler.categorical_sample(
+ logits,
+ dtype=tf.int32,
+ sample_shape=(),
+ seed=None
+)
+```
+
+
+
+Defined in [`seq2seq/sampler.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/sampler.py).
+
+
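+#### Example:
+
+A minimal usage sketch (logits are illustrative):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+logits = tf.math.log([[0.1, 0.3, 0.6],
+                      [0.8, 0.1, 0.1]])
+ids = tfa.seq2seq.sampler.categorical_sample(logits=logits, seed=7)
+# ids is an int32 tensor with one sampled class id per row of logits.
+```
+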
diff --git a/docs/api_docs/python/tfa/seq2seq/sequence_loss.md b/docs/api_docs/python/tfa/seq2seq/sequence_loss.md
new file mode 100644
index 0000000000..5fe50b724a
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/sequence_loss.md
@@ -0,0 +1,95 @@
+
+
+
+
+
+# tfa.seq2seq.sequence_loss
+
+Weighted cross-entropy loss for a sequence of logits.
+
+### Aliases:
+
+* `tfa.seq2seq.loss.sequence_loss`
+* `tfa.seq2seq.sequence_loss`
+
+``` python
+tfa.seq2seq.sequence_loss(
+ logits,
+ targets,
+ weights,
+ average_across_timesteps=True,
+ average_across_batch=True,
+ sum_over_timesteps=False,
+ sum_over_batch=False,
+ softmax_loss_function=None,
+ name=None
+)
+```
+
+
+
+Defined in [`seq2seq/loss.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/loss.py).
+
+
+
+Depending on the values of `average_across_timesteps` /
+`sum_over_timesteps` and `average_across_batch` / `sum_over_batch`, the
+returned Tensor will have rank 0, 1, or 2 as these arguments reduce the
+cross-entropy at each target, which has shape
+`[batch_size, sequence_length]`, over their respective dimensions. For
+example, if `average_across_timesteps` is `True` and `average_across_batch`
+is `False`, then the returned Tensor will have shape `[batch_size]`.
+
+Note that `average_across_timesteps` and `sum_over_timesteps` cannot both
+be `True` at the same time; the same holds for `average_across_batch` and
+`sum_over_batch`.
+
+The recommended loss reduction in TF 2.0 is summation over elements
+instead of a weighted average. Users are therefore encouraged to use
+`sum_over_timesteps` and `sum_over_batch` for reduction.
+
+#### Args:
+
+
+* `logits`: A Tensor of shape
+ `[batch_size, sequence_length, num_decoder_symbols]` and dtype float.
+ The logits correspond to the prediction across all classes at each
+ timestep.
+* `targets`: A Tensor of shape `[batch_size, sequence_length]` and dtype
+ int. The target represents the true class at each timestep.
+* `weights`: A Tensor of shape `[batch_size, sequence_length]` and dtype
+ float. `weights` constitutes the weighting of each prediction in the
+ sequence. When using `weights` as masking, set all valid timesteps to 1
+ and all padded timesteps to 0, e.g. a mask returned by
+ `tf.sequence_mask`.
+* `average_across_timesteps`: If set, sum the cost across the sequence
+ dimension and divide the cost by the total label weight across
+ timesteps.
+* `average_across_batch`: If set, sum the cost across the batch dimension and
+ divide the returned cost by the batch size.
+* `sum_over_timesteps`: If set, sum the cost across the sequence dimension
+ and divide by the size of the sequence. Note that any element with 0
+ weights will be excluded from size calculation.
+* `sum_over_batch`: If set, sum the cost across the batch dimension and
+ divide the total cost by the batch size. Note that any element with 0
+ weights will be excluded from size calculation.
+* `softmax_loss_function`: Function (labels, logits) -> loss-batch
+ to be used instead of the standard softmax (the default if this is
+ None). **Note that to avoid confusion, it is required for the function
+ to accept named arguments.**
+* `name`: Optional name for this operation, defaults to "sequence_loss".
+
+
+#### Returns:
+
+A float Tensor of rank 0, 1, or 2 depending on the
+`average_across_timesteps` and `average_across_batch` arguments. By
+default, it has rank 0 (scalar) and is the weighted average cross-entropy
+(log-perplexity) per symbol.
+
+
+
+#### Raises:
+
+
+* `ValueError`: If `logits` does not have 3 dimensions, `targets` does not
+ have 2 dimensions, or `weights` does not have 2 dimensions.
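+
+#### Example:
+
+A minimal sketch using the TF 2.0-style `sum_over` reduction (sizes are
+hypothetical):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+batch_size, seq_len, vocab_size = 2, 5, 10
+logits = tf.random.normal([batch_size, seq_len, vocab_size])
+targets = tf.random.uniform(
+    [batch_size, seq_len], maxval=vocab_size, dtype=tf.int32)
+# Mask: the first sequence has 5 valid steps, the second has 3.
+weights = tf.sequence_mask([5, 3], maxlen=seq_len, dtype=tf.float32)
+
+loss = tfa.seq2seq.sequence_loss(
+    logits, targets, weights,
+    average_across_timesteps=False,
+    average_across_batch=False,
+    sum_over_timesteps=True,
+    sum_over_batch=True)
+# loss is a rank-0 (scalar) Tensor.
+```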
\ No newline at end of file
diff --git a/docs/api_docs/python/tfa/seq2seq/tile_batch.md b/docs/api_docs/python/tfa/seq2seq/tile_batch.md
new file mode 100644
index 0000000000..16a849718a
--- /dev/null
+++ b/docs/api_docs/python/tfa/seq2seq/tile_batch.md
@@ -0,0 +1,56 @@
+
+
+
+
+
+# tfa.seq2seq.tile_batch
+
+Tile the batch dimension of a (possibly nested structure of) tensor(s) `t`.
+
+### Aliases:
+
+* `tfa.seq2seq.beam_search_decoder.tile_batch`
+* `tfa.seq2seq.tile_batch`
+
+``` python
+tfa.seq2seq.tile_batch(
+ t,
+ multiplier,
+ name=None
+)
+```
+
+
+
+Defined in [`seq2seq/beam_search_decoder.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/seq2seq/beam_search_decoder.py).
+
+
+
+For each tensor t in a (possibly nested structure) of tensors,
+this function takes a tensor t shaped `[batch_size, s0, s1, ...]` composed
+of minibatch entries `t[0], ..., t[batch_size - 1]` and tiles it to have a
+shape `[batch_size * multiplier, s0, s1, ...]` composed of minibatch
+entries `t[0], t[0], ..., t[1], t[1], ...` where each minibatch entry is
+repeated `multiplier` times.
+
+#### Args:
+
+
+* `t`: `Tensor` shaped `[batch_size, ...]`.
+* `multiplier`: Python int.
+* `name`: Name scope for any created operations.
+
+
+#### Returns:
+
+A (possibly nested structure of) `Tensor` shaped
+`[batch_size * multiplier, ...]`.
+
+
+
+#### Raises:
+
+
+* `ValueError`: if tensor(s) `t` do not have a statically known rank or
+the rank is < 1.
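+
+#### Example:
+
+A small sketch (the commented result is what one would expect from the
+description above):
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+t = tf.constant([[1, 2],
+                 [3, 4]])              # batch_size = 2
+tfa.seq2seq.tile_batch(t, multiplier=3)
+# -> [[1, 2], [1, 2], [1, 2],
+#     [3, 4], [3, 4], [3, 4]]          # shape [6, 2]
+```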
\ No newline at end of file
diff --git a/docs/api_docs/python/tfa/text.md b/docs/api_docs/python/tfa/text.md
new file mode 100644
index 0000000000..8b5f6fbbda
--- /dev/null
+++ b/docs/api_docs/python/tfa/text.md
@@ -0,0 +1,26 @@
+
+
+
+
+
+# Module: tfa.text
+
+Text-processing ops.
+
+
+
+Defined in [`text/__init__.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/text/__init__.py).
+
+
+
+
+## Modules
+
+[`skip_gram_ops`](../tfa/text/skip_gram_ops.md) module: Skip-gram sampling ops from https://arxiv.org/abs/1301.3781.
+
+## Functions
+
+[`skip_gram_sample(...)`](../tfa/text/skip_gram_sample.md): Generates skip-gram token and label paired Tensors from the input tensor.
+
+[`skip_gram_sample_with_text_vocab(...)`](../tfa/text/skip_gram_sample_with_text_vocab.md): Skip-gram sampling with a text vocabulary file.
+
diff --git a/docs/api_docs/python/tfa/text/skip_gram_ops.md b/docs/api_docs/python/tfa/text/skip_gram_ops.md
new file mode 100644
index 0000000000..a8d6cf949f
--- /dev/null
+++ b/docs/api_docs/python/tfa/text/skip_gram_ops.md
@@ -0,0 +1,22 @@
+
+
+
+
+
+# Module: tfa.text.skip_gram_ops
+
+Skip-gram sampling ops from https://arxiv.org/abs/1301.3781.
+
+
+
+Defined in [`text/skip_gram_ops.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/text/skip_gram_ops.py).
+
+
+
+
+## Functions
+
+[`skip_gram_sample(...)`](../../tfa/text/skip_gram_sample.md): Generates skip-gram token and label paired Tensors from the input tensor.
+
+[`skip_gram_sample_with_text_vocab(...)`](../../tfa/text/skip_gram_sample_with_text_vocab.md): Skip-gram sampling with a text vocabulary file.
+
diff --git a/docs/api_docs/python/tfa/text/skip_gram_sample.md b/docs/api_docs/python/tfa/text/skip_gram_sample.md
new file mode 100644
index 0000000000..763cd12b20
--- /dev/null
+++ b/docs/api_docs/python/tfa/text/skip_gram_sample.md
@@ -0,0 +1,143 @@
+
+
+
+
+
+# tfa.text.skip_gram_sample
+
+Generates skip-gram token and label paired Tensors from the input tensor.
+
+### Aliases:
+
+* `tfa.text.skip_gram_ops.skip_gram_sample`
+* `tfa.text.skip_gram_sample`
+
+``` python
+tfa.text.skip_gram_sample(
+ input_tensor,
+ min_skips=1,
+ max_skips=5,
+ start=0,
+ limit=-1,
+ emit_self_as_target=False,
+ vocab_freq_table=None,
+ vocab_min_count=None,
+ vocab_subsampling=None,
+ corpus_size=None,
+ batch_size=None,
+ batch_capacity=None,
+ seed=None,
+ name=None
+)
+```
+
+
+
+Defined in [`text/skip_gram_ops.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/text/skip_gram_ops.py).
+
+
+
+Generates skip-gram `("token", "label")` pairs using each element in the
+rank-1 `input_tensor` as a token. The window size used for each token will
+be randomly selected from the range specified by `[min_skips, max_skips]`,
+inclusive. See https://arxiv.org/abs/1301.3781 for more details about
+skip-gram.
+
+For example, given `input_tensor = ["the", "quick", "brown", "fox",
+"jumps"]`, `min_skips = 1`, `max_skips = 2`, `emit_self_as_target = False`,
+the output `(tokens, labels)` pairs for the token "quick" will be randomly
+selected from either `(tokens=["quick", "quick"], labels=["the", "brown"])`
+for 1 skip, or `(tokens=["quick", "quick", "quick"],
+labels=["the", "brown", "fox"])` for 2 skips.
+
+If `emit_self_as_target = True`, each token will also be emitted as a label
+for itself. From the previous example, the output will be either
+`(tokens=["quick", "quick", "quick"], labels=["the", "quick", "brown"])`
+for 1 skip, or `(tokens=["quick", "quick", "quick", "quick"],
+labels=["the", "quick", "brown", "fox"])` for 2 skips.
+
+The same process is repeated for each element of `input_tensor` and
+concatenated together into the two output rank-1 `Tensors` (one for all the
+tokens, another for all the labels).
+
+If `vocab_freq_table` is specified, tokens in `input_tensor` that are not
+present in the vocabulary are discarded. Tokens whose frequency counts are
+below `vocab_min_count` are also discarded. Tokens whose frequency
+proportions in the corpus exceed `vocab_subsampling` may be randomly
+down-sampled. See Eq. 5 in http://arxiv.org/abs/1310.4546 for more details
+about subsampling.
+
+Due to the random window sizes used for each token, the lengths of the
+outputs are non-deterministic, unless `batch_size` is specified to batch
+the outputs to always return `Tensors` of length `batch_size`.
+
+#### Args:
+
+
+* `input_tensor`: A rank-1 `Tensor` from which to generate skip-gram
+ candidates.
+* `min_skips`: `int` or scalar `Tensor` specifying the minimum window size to
+ randomly use for each token. Must be >= 0 and <= `max_skips`. If
+ `min_skips` and `max_skips` are both 0, the only label output will be
+ the token itself when `emit_self_as_target = True`; otherwise, no
+ output is produced.
+* `max_skips`: `int` or scalar `Tensor` specifying the maximum window size to
+ randomly use for each token. Must be >= 0.
+* `start`: `int` or scalar `Tensor` specifying the position in
+ `input_tensor` from which to start generating skip-gram candidates.
+* `limit`: `int` or scalar `Tensor` specifying the maximum number of
+ elements in `input_tensor` to use in generating skip-gram candidates.
+ -1 means to use the rest of the `Tensor` after `start`.
+* `emit_self_as_target`: `bool` or scalar `Tensor` specifying whether to emit
+ each token as a label for itself.
+* `vocab_freq_table`: (Optional) A lookup table (subclass of
+ `lookup.InitializableLookupTableBase`) that maps tokens to their raw
+ frequency counts. If specified, any token in `input_tensor` that is not
+ found in `vocab_freq_table` will be filtered out before generating
+ skip-gram candidates. While this will typically map to integer raw
+ frequency counts, it could also map to float frequency proportions.
+ `vocab_min_count` and `corpus_size` should be in the same units
+ as this.
+* `vocab_min_count`: (Optional) `int`, `float`, or scalar `Tensor` specifying
+ minimum frequency threshold (from `vocab_freq_table`) for a token to be
+ kept in `input_tensor`. If this is specified, `vocab_freq_table` must
+ also be specified - and they should both be in the same units.
+* `vocab_subsampling`: (Optional) `float` specifying frequency proportion
+ threshold for tokens from `input_tensor`. Tokens that occur more
+ frequently (based on the ratio of the token's `vocab_freq_table` value
+ to the `corpus_size`) will be randomly down-sampled. Reasonable
+ starting values may be around 1e-3 or 1e-5. If this is specified, both
+ `vocab_freq_table` and `corpus_size` must also be specified. See Eq. 5
+ in http://arxiv.org/abs/1310.4546 for more details.
+* `corpus_size`: (Optional) `int`, `float`, or scalar `Tensor` specifying the
+ total number of tokens in the corpus (e.g., sum of all the frequency
+ counts of `vocab_freq_table`). Used with `vocab_subsampling` for
+ down-sampling frequently occurring tokens. If this is specified,
+ `vocab_freq_table` and `vocab_subsampling` must also be specified.
+* `batch_size`: (Optional) `int` specifying batch size of returned `Tensors`.
+* `batch_capacity`: (Optional) `int` specifying batch capacity for the queue
+ used for batching returned `Tensors`. Only has an effect if
+ `batch_size` > 0. Defaults to 100 * `batch_size` if not specified.
+* `seed`: (Optional) `int` used to create a random seed for window size and
+ subsampling. See `set_random_seed` docs for behavior.
+* `name`: (Optional) A `string` name or a name scope for the operations.
+
+
+#### Returns:
+
+A `tuple` containing (token, label) `Tensors`. Each output `Tensor` is of
+rank-1 and has the same type as `input_tensor`. The `Tensors` will be of
+length `batch_size`; if `batch_size` is not specified, they will be of
+random length, though they will be in sync with each other as long as
+they are evaluated together.
+
+
+
+#### Raises:
+
+
+* `ValueError`: If `vocab_freq_table` is not provided, but `vocab_min_count`,
+ `vocab_subsampling`, or `corpus_size` is specified. Also raised if only
+ one of `vocab_subsampling` and `corpus_size` is specified (they must be
+ both present or both absent).
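+
+#### Example:
+
+A minimal sketch; the emitted pairs vary with the randomly chosen window
+sizes:
+
+``` python
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+input_tensor = tf.constant(["the", "quick", "brown", "fox", "jumps"])
+tokens, labels = tfa.text.skip_gram_sample(
+    input_tensor, min_skips=1, max_skips=2, seed=42)
+# tokens and labels are aligned rank-1 string tensors, e.g.
+# tokens = ["the", "the", "quick", ...]
+# labels = ["quick", "brown", "the", ...]
+```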
\ No newline at end of file
diff --git a/docs/api_docs/python/tfa/text/skip_gram_sample_with_text_vocab.md b/docs/api_docs/python/tfa/text/skip_gram_sample_with_text_vocab.md
new file mode 100644
index 0000000000..e5543eee6f
--- /dev/null
+++ b/docs/api_docs/python/tfa/text/skip_gram_sample_with_text_vocab.md
@@ -0,0 +1,138 @@
+
+
+
+
+
+# tfa.text.skip_gram_sample_with_text_vocab
+
+Skip-gram sampling with a text vocabulary file.
+
+### Aliases:
+
+* `tfa.text.skip_gram_ops.skip_gram_sample_with_text_vocab`
+* `tfa.text.skip_gram_sample_with_text_vocab`
+
+``` python
+tfa.text.skip_gram_sample_with_text_vocab(
+ input_tensor,
+ vocab_freq_file,
+ vocab_token_index=0,
+ vocab_token_dtype=tf.dtypes.string,
+ vocab_freq_index=1,
+ vocab_freq_dtype=tf.dtypes.float64,
+ vocab_delimiter=',',
+ vocab_min_count=0,
+ vocab_subsampling=None,
+ corpus_size=None,
+ min_skips=1,
+ max_skips=5,
+ start=0,
+ limit=-1,
+ emit_self_as_target=False,
+ batch_size=None,
+ batch_capacity=None,
+ seed=None,
+ name=None
+)
+```
+
+
+
+Defined in [`text/skip_gram_ops.py`](https://github.com/tensorflow/addons/tree/0.4-release/tensorflow_addons/text/skip_gram_ops.py).
+
+
+
+Wrapper around `skip_gram_sample()` for use with a text vocabulary file.
+The vocabulary file is expected to be a plain-text file, with lines of
+`vocab_delimiter`-separated columns. The `vocab_token_index` column should
+contain the vocabulary term, while the `vocab_freq_index` column should
+contain the number of times that term occurs in the corpus. For example,
+with a text vocabulary file of:
+
+```
+bonjour,fr,42
+hello,en,777
+hola,es,99
+```
+
+You should set `vocab_delimiter=","`, `vocab_token_index=0`, and
+`vocab_freq_index=2`.
+
+See `skip_gram_sample()` documentation for more details about the skip-gram
+sampling process.
+
+#### Args:
+
+
+* `input_tensor`: A rank-1 `Tensor` from which to generate skip-gram candidates.
+* `vocab_freq_file`: `string` specifying full file path to the text vocab file.
+* `vocab_token_index`: `int` specifying which column in the text vocab file
+ contains the tokens.
+* `vocab_token_dtype`: `DType` specifying the format of the tokens in the text vocab file.
+* `vocab_freq_index`: `int` specifying which column in the text vocab file
+ contains the frequency counts of the tokens.
+* `vocab_freq_dtype`: `DType` specifying the format of the frequency counts
+ in the text vocab file.
+* `vocab_delimiter`: `string` specifying the delimiter used in the text vocab
+ file.
+* `vocab_min_count`: `int`, `float`, or scalar `Tensor` specifying
+ minimum frequency threshold (from `vocab_freq_file`) for a token to be
+ kept in `input_tensor`. This should correspond with `vocab_freq_dtype`.
+* `vocab_subsampling`: (Optional) `float` specifying frequency proportion
+ threshold for tokens from `input_tensor`. Tokens that occur more
+ frequently will be randomly down-sampled. Reasonable starting values
+ may be around 1e-3 or 1e-5. See Eq. 5 in http://arxiv.org/abs/1310.4546
+ for more details.
+* `corpus_size`: (Optional) `int`, `float`, or scalar `Tensor` specifying the
+ total number of tokens in the corpus (e.g., sum of all the frequency
+ counts of `vocab_freq_file`). Used with `vocab_subsampling` for
+ down-sampling frequently occurring tokens. If this is specified,
+ `vocab_freq_file` and `vocab_subsampling` must also be specified.
+ If `corpus_size` is needed but not supplied, then it will be calculated
+ from `vocab_freq_file`. You might want to supply your own value if you
+ have already eliminated infrequent tokens from your vocabulary files
+ (where frequency < vocab_min_count) to save memory in the internal
+ token lookup table. Otherwise, the unused tokens' variables will waste
+ memory. The user-supplied `corpus_size` value must be greater than or
+ equal to the sum of all the frequency counts of `vocab_freq_file`.
+* `min_skips`: `int` or scalar `Tensor` specifying the minimum window size to
+ randomly use for each token. Must be >= 0 and <= `max_skips`. If
+ `min_skips` and `max_skips` are both 0, the only label output will
+ be the token itself.
+* `max_skips`: `int` or scalar `Tensor` specifying the maximum window size to
+ randomly use for each token. Must be >= 0.
+* `start`: `int` or scalar `Tensor` specifying the position in `input_tensor`
+ from which to start generating skip-gram candidates.
+* `limit`: `int` or scalar `Tensor` specifying the maximum number of elements
+ in `input_tensor` to use in generating skip-gram candidates. -1 means
+ to use the rest of the `Tensor` after `start`.
+* `emit_self_as_target`: `bool` or scalar `Tensor` specifying whether to emit
+ each token as a label for itself.
+* `batch_size`: (Optional) `int` specifying batch size of returned `Tensors`.
+* `batch_capacity`: (Optional) `int` specifying batch capacity for the queue
+ used for batching returned `Tensors`. Only has an effect if
+ `batch_size` > 0. Defaults to 100 * `batch_size` if not specified.
+* `seed`: (Optional) `int` used to create a random seed for window size and
+ subsampling. See `set_random_seed` docs for behavior.
+* `name`: (Optional) A `string` name or a name scope for the operations.
+
+
+#### Returns:
+
+A `tuple` containing (token, label) `Tensors`. Each output `Tensor` is of
+rank-1 and has the same type as `input_tensor`. The `Tensors` will be of
+length `batch_size`; if `batch_size` is not specified, they will be of
+random length, though they will be in sync with each other as long as
+they are evaluated together.
+
+
+
+#### Raises:
+
+
+* `ValueError`: If `vocab_token_index` or `vocab_freq_index` is less than 0
+ or exceeds the number of columns in `vocab_freq_file`.
+ If `vocab_token_index` and `vocab_freq_index` are both set to the same
+ column. If any token in `vocab_freq_file` has a negative frequency.
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 8b3c4f69bc..0349c9f771 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
-tf-nightly-2.0-preview
\ No newline at end of file
+tensorflow==2.0.0b1
\ No newline at end of file
diff --git a/tensorflow_addons/version.py b/tensorflow_addons/version.py
index 008d683d2e..1c210762c6 100644
--- a/tensorflow_addons/version.py
+++ b/tensorflow_addons/version.py
@@ -27,7 +27,7 @@
# stable release (indicated by `_VERSION_SUFFIX = ''`). Outside the context of a
# release branch, the current version is by default assumed to be a
# 'development' version, labeled 'dev'.
-_VERSION_SUFFIX = 'dev'
+_VERSION_SUFFIX = ''
# Example, '0.1.0-dev'
__version__ = '.'.join([
diff --git a/tools/ci_build/builds/release_linux.sh b/tools/ci_build/builds/release_linux.sh
index e6403153e6..873ae704d4 100755
--- a/tools/ci_build/builds/release_linux.sh
+++ b/tools/ci_build/builds/release_linux.sh
@@ -38,7 +38,7 @@ for version in ${PYTHON_VERSIONS}; do
build_pip_pkg
# Package Whl
- bazel-bin/build_pip_pkg artifacts --nightly
+ bazel-bin/build_pip_pkg artifacts
# Uncomment and use this command for release branches
#bazel-bin/build_pip_pkg artifacts
diff --git a/tools/ci_build/builds/release_macos.sh b/tools/ci_build/builds/release_macos.sh
index 4baeba68db..9e8c97ad04 100644
--- a/tools/ci_build/builds/release_macos.sh
+++ b/tools/ci_build/builds/release_macos.sh
@@ -41,7 +41,7 @@ for version in ${PYTHON_VERSIONS}; do
build_pip_pkg
# Package Whl
- bazel-bin/build_pip_pkg artifacts --nightly
+ bazel-bin/build_pip_pkg artifacts
# Uncomment and use this command for release branches
#bazel-bin/build_pip_pkg artifacts