diff --git a/README.md b/README.md
index 755d080b6..cd9ab9331 100644
--- a/README.md
+++ b/README.md
@@ -154,7 +154,7 @@ For all translation problems, we suggest to try the Transformer model:
 this should reach a BLEU score of about 28 on the English-German data-set,
 which is close to state-of-the art. If training on a single GPU, try the
 `--hparams_set=transformer_base_single_gpu` setting. For very good results
-or larger data-sets (e.g., for English-French)m, try the big model
+or larger data-sets (e.g., for English-French), try the big model
 with `--hparams_set=transformer_big`.

 ## Basics
diff --git a/docs/distributed_training.md b/docs/distributed_training.md
index 9ed9778da..95b499f87 100644
--- a/docs/distributed_training.md
+++ b/docs/distributed_training.md
@@ -5,7 +5,7 @@ training.

 T2T uses TensorFlow Estimators and so distributed training is configured with
 the `TF_CONFIG` environment variable that is read by the
-[RunConfig](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/learn/python/learn/estimators/run_config.py)
+[RunConfig](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/estimator/run_config.py)
 along with a set of flags.

 ## `TF_CONFIG`
diff --git a/docs/index.md b/docs/index.md
index 8860e03b7..b7d0236c9 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -69,7 +69,7 @@ For language modeling, we have these data-sets in T2T:
 * LM1B (a billion-word corpus): `--problems=languagemodel_lm1b32k` for
   subword-level modeling and `--problems=languagemodel_lm1b_characters`
   for character-level modeling.
-
+
 We suggest to start with `--model=transformer` on this task and use
 `--hparams_set=transformer_small` for PTB and
 `--hparams_set=transformer_base` for LM1B.
@@ -95,7 +95,7 @@ For speech-to-text, we have these data-sets in T2T:
 For summarizing longer text into shorter one we have these data-sets:
 * CNN/DailyMail articles summarized into a few sentences:
   `--problems=summarize_cnn_dailymail32k`
-
+
 We suggest to use `--model=transformer` and
 `--hparams_set=transformer_prepend` for this task.
 This yields good ROUGE scores.
@@ -118,5 +118,5 @@ For all translation problems, we suggest to try the Transformer model:
 this should reach a BLEU score of about 28 on the English-German data-set,
 which is close to state-of-the art. If training on a single GPU, try the
 `--hparams_set=transformer_base_single_gpu` setting. For very good results
-or larger data-sets (e.g., for English-French)m, try the big model
+or larger data-sets (e.g., for English-French), try the big model
 with `--hparams_set=transformer_big`.
diff --git a/tensor2tensor/models/lstm.py b/tensor2tensor/models/lstm.py
index 9c6144e9f..d05c1f599 100644
--- a/tensor2tensor/models/lstm.py
+++ b/tensor2tensor/models/lstm.py
@@ -65,7 +65,7 @@ def dropout_lstm_cell():
     attention_mechanism_class = tf.contrib.seq2seq.BahdanauAttention
   else:
     raise ValueError("Unknown hparams.attention_mechanism = %s, must be "
-                     "luong or bahdanu." % hparams.attention_mechanism)
+                     "luong or bahdanau." % hparams.attention_mechanism)
   attention_mechanism = attention_mechanism_class(
       hparams.hidden_size, encoder_outputs)

@@ -338,7 +338,7 @@ def lstm_attention():

 @registry.register_hparams
 def lstm_bahdanau_attention_multi():
-  """Multi-head Bahdanu attention."""
+  """Multi-head Bahdanau attention."""
   hparams = lstm_bahdanau_attention()
   hparams.num_heads = 4
   return hparams
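
As background for the RunConfig link corrected above: a TensorFlow Estimator RunConfig determines each process's role in distributed training by parsing the TF_CONFIG environment variable, a JSON object with "cluster" and "task" fields. The sketch below only illustrates that general format; the host addresses and job layout are assumed placeholders, not values taken from the T2T docs.

# Sketch: the kind of TF_CONFIG value that tf.estimator's RunConfig parses.
# The host addresses and job names here are illustrative placeholders.
import json
import os

tf_config = {
    "cluster": {
        "master": ["10.0.0.1:2222"],   # chief trainer
        "ps": ["10.0.0.2:2222"],       # parameter server(s)
        "worker": ["10.0.0.3:2222"],   # additional worker(s)
    },
    # Which member of the cluster this particular process is.
    "task": {"type": "master", "index": 0},
}
os.environ["TF_CONFIG"] = json.dumps(tf_config)

# Once TF_CONFIG is set, constructing a RunConfig in this process picks up
# the cluster spec and task assignment from the environment.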