This notebook is adapted from [WuJiaocan's GitHub repository](https://github.com/WuJiaocan/tensorflow).

In [0]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [0]:
import tensorflow as tf

### 1. Create Vocabulary

In [0]:
# Parallel corpus files read line by line by read_data:
# src_path is the source (English) side, tgt_path the target (French) side.
src_path = 'en_clean.txt'
tgt_path = 'fr_clean.txt'

In [0]:
def read_data(txt_path, num_examples):
  """Read the first `num_examples` lines of a UTF-8 text file.

  Each line has its first 7 and last 6 characters removed before any
  remaining newline characters are stripped from both ends.
  NOTE(review): the fixed offsets presumably remove per-line start/end
  markers plus the trailing newline -- confirm against the corpus format.

  Args:
    txt_path: path of the text file to read.
    num_examples: maximum number of lines to return. `None` or a negative
      value keeps ordinary list-slice semantics (`text[:num_examples]`).

  Returns:
    A list of trimmed lines, in file order.
  """
  from itertools import islice

  with open(txt_path, 'r', encoding="utf-8") as f:
    if num_examples is None or num_examples < 0:
      # Slice semantics for None / negative counts need every line.
      return [line[7:-6].strip('\n') for line in f][:num_examples]
    # Stop reading as soon as enough lines were collected instead of
    # loading the whole corpus into memory first.
    return [line[7:-6].strip('\n') for line in islice(f, num_examples)]

In [0]:
# Load at most the first 100,000 lines of each side of the corpus.
src = read_data(src_path, 100000)
tgt = read_data(tgt_path, 100000)

In [5]:
# Report how many sentences were loaded for each side of the corpus.
print('size of number of source sentences from dataset {}'.format(len(src)))
# The target count must come from `tgt`; it previously printed len(src) again.
print('size of number of target sentences from dataset {}'.format(len(tgt)))

size of number of source sentences from dataset 100000
size of number of target sentences from dataset 100000


In [17]:
# Preview the first five source sentences.
for i in range(5):
  print(src[i])

 global health where do physiotherapy and rehabilitation research fit ? 
 carabin 
 comment on the misappropriation of bibliographical references in science . the example of anti aging medicine 
 anti aging medicine , a science based , essential medicine 
 underwater dive in fresh water complicated by a cardiorespiratory arrest on obstructive shock 


In [18]:
# Preview the first five target sentences.
for i in range(5):
  print(tgt[i])

 la place des cheveux et des poils dans les rituels et le sacre 
 les carabins 
 du detournement des references bibliographiques en science . l exemple de la medecine anti age 
 la medecine anti age , une medecine scientifique , indispensable 
 plongee subaquatique en eau douce compliquee d un arret cardiorespiratoire sur choc obstructif 


In [0]:
import codecs
import collections
from operator import itemgetter

def create_vocab(RAW_DATA, VOCAB_OUTPUT):
    """Build a frequency-ranked vocabulary and write it to disk.

    Every sentence is stripped, lower-cased and split on whitespace. The
    resulting words are ranked by descending count (ties keep first-seen
    order) and prefixed with the special tokens
    '<pad>', '<sos>', '<eos>', '<unk>' (ids 0-3 by position).

    Args:
        RAW_DATA: iterable of sentence strings.
        VOCAB_OUTPUT: path of the vocabulary file; one word per line, UTF-8.

    Returns:
        The vocabulary as a list, special tokens first.
    """
    frequencies = collections.Counter()
    for sentence in RAW_DATA:
        frequencies.update(sentence.strip().lower().split())

    # most_common() sorts by count, highest first, and is stable for ties.
    ranked = [token for token, _ in frequencies.most_common()]
    vocabulary = ['<pad>', '<sos>', '<eos>', '<unk>'] + ranked

    with codecs.open(VOCAB_OUTPUT, "w", encoding="utf-8") as vocab_file:
        vocab_file.writelines(token + "\n" for token in vocabulary)
    return vocabulary

In [0]:
# Build one vocabulary per language; each call also writes the vocab file to disk.
vocab_src = create_vocab(src, 'vocab.en')
vocab_tgt = create_vocab(tgt, 'vocab.fr')

In [8]:
# Vocabulary sizes include the four special tokens added by create_vocab.
vocab_size_src = len(vocab_src)
vocab_size_tgt = len(vocab_tgt)

print('size of source vocab {}'.format(vocab_size_src))
print('size of target vocab {}'.format(vocab_size_tgt))

size of source vocab 40396
size of target vocab 49303


In [13]:
# The vocabulary is sorted by descending frequency, so the tail holds the rarest words.
print(vocab_src[-5:])

['referendum', 'countering', 'discouragement', 'mefenamic', 'stereolithography']


In [15]:
# The vocabulary is sorted by descending frequency, so the tail holds the rarest words.
print(vocab_tgt[-5:])

['stereolithographie', 'generatives', 'verfahren', 'zahntechnik', 'aortocoronariens']


### 2. Convert Text to Numbers

In [0]:
import codecs
import sys

def text_to_int(RAW_DATA, VOCAB, OUTPUT_DATA):
  """Encode sentences as space-separated vocabulary ids, one sentence per line.

  A word's id is its zero-based line number in the vocabulary file. Each
  sentence is stripped, split on whitespace and terminated with "<eos>".
  Lookup tries the exact word first, then its lower-cased form (the
  vocabulary built by create_vocab is lower-cased), and finally falls back
  to "<unk>".

  Args:
    RAW_DATA: iterable of sentence strings.
    VOCAB: path of a UTF-8 vocabulary file with one word per line.
    OUTPUT_DATA: path of the UTF-8 file that receives the encoded sentences.

  Raises:
    KeyError: if a word is unknown and the vocabulary has no "<unk>" entry,
      or if the vocabulary has no "<eos>" entry.
  """
  with codecs.open(VOCAB, "r", encoding="utf-8") as f_vocab:
    word_to_id = {w.strip(): idx for idx, w in enumerate(f_vocab.readlines())}

  def get_id(word):
    if word in word_to_id:
      return word_to_id[word]
    lowered = word.lower()
    if lowered in word_to_id:
      return word_to_id[lowered]
    return word_to_id["<unk>"]

  # `with` guarantees the output file is closed even if a lookup raises.
  with codecs.open(OUTPUT_DATA, "w", encoding="utf-8") as fout:
    for line in RAW_DATA:
      words = line.strip().split() + ["<eos>"]
      fout.write(" ".join(str(get_id(w)) for w in words) + "\n")

In [0]:
# Encode both sides of the corpus as id sequences; these files are the training input.
text_to_int(src, 'vocab.en', 'train.en')
text_to_int(tgt, 'vocab.fr', 'train.fr')

### 3. Set Hyper Parameters

In [0]:
# Id-encoded training files written by text_to_int.
SRC_TRAIN_DATA = "train.en"
TRG_TRAIN_DATA = "train.fr"
# Path prefix handed to saver.save() for checkpoints.
CHECKPOINT_PATH = "./INFO7374"  

# Width of the LSTM cells, the embeddings and the attention layer.
HIDDEN_SIZE = 256
DECODER_LAYERS = 2                    # number of stacked LSTM layers in the decoder
SRC_VOCAB_SIZE = vocab_size_src       
TRG_VOCAB_SIZE = vocab_size_tgt       
BATCH_SIZE = 128
NUM_EPOCH = 20 
# Passed to tf.nn.dropout for the source and target embeddings.
KEEP_PROB = 0.9
MAX_GRAD_NORM = 5                      # global-norm clipping threshold, to control gradient explosion
SHARE_EMB_AND_SOFTMAX = True           # share weights between softmax layer and target embedding layer
LEARNING_RATE = 0.002

MAX_LEN = 100   # max length of a sentence; longer pairs are filtered out
SOS_ID  = 1    # <sos> ID in target vocab

### 4. Create Training Dataset

In [0]:
def MakeDataset(file_path):
    """Turn a file of space-separated integer ids into a tf.data dataset.

    Each line of `file_path` becomes one element `(ids, length)`, where
    `ids` is the int32 vector parsed from the line and `length` is the
    number of ids on that line.
    """
    def _tokenize(line):
        # Split one text line into its whitespace-separated tokens.
        return tf.string_split([line]).values

    def _to_ids(tokens):
        # Parse the string tokens as int32 word ids.
        return tf.string_to_number(tokens, tf.int32)

    def _with_length(ids):
        # Keep the sentence length next to the sentence itself.
        return (ids, tf.size(ids))

    lines = tf.data.TextLineDataset(file_path)
    return lines.map(_tokenize).map(_to_ids).map(_with_length)

def MakeSrcTrgDataset(src_path, trg_path, batch_size):
    """Build the padded, shuffled training dataset of sentence pairs.

    Each batch element has the structure
    `((src_input, src_len), (trg_input, trg_label, trg_len))`:
      * `trg_label` is the target sentence as stored in the file ("X Y Z <eos>"),
      * `trg_input` is the same sentence shifted right with SOS_ID in front
        ("<sos> X Y Z").
    Pairs where either side has length <= 1 or > MAX_LEN are dropped.

    Args:
        src_path: path of the id-encoded source file.
        trg_path: path of the id-encoded target file.
        batch_size: number of sentence pairs per padded batch.

    Returns:
        The batched dataset.
    """
    # Element layout after zipping: ((src_ids, src_len), (trg_ids, trg_len)).
    paired = tf.data.Dataset.zip((MakeDataset(src_path), MakeDataset(trg_path)))

    def _length_ok(length):
        # True for sentences that are neither empty (only <eos>) nor too long.
        return tf.logical_and(
            tf.greater(length, 1), tf.less_equal(length, MAX_LEN))

    def FilterLength(src_tuple, trg_tuple):
        (_, src_len), (_, trg_len) = src_tuple, trg_tuple
        return tf.logical_and(_length_ok(src_len), _length_ok(trg_len))

    def MakeTrgInput(src_tuple, trg_tuple):
        (src_input, src_len), (trg_label, trg_len) = src_tuple, trg_tuple
        # Drop the trailing <eos> and prepend <sos> to get the decoder input.
        trg_input = tf.concat([[SOS_ID], trg_label[:-1]], axis=0)
        return ((src_input, src_len), (trg_input, trg_label, trg_len))

    # Sentences are variable-length vectors; lengths are scalars.
    sentence_shape = tf.TensorShape([None])
    length_shape = tf.TensorShape([])
    padded_shapes = (
        (sentence_shape, length_shape),
        (sentence_shape, sentence_shape, length_shape))

    return (paired
            .filter(FilterLength)
            .map(MakeTrgInput)
            .shuffle(10000)
            .padded_batch(batch_size, padded_shapes))

### 5. Define the NMT Model

In [0]:
class NMTModel(object):
  """
  Attention-based sequence-to-sequence translation model (TensorFlow 1.x graph API).

  The encoder is a single-layer bidirectional LSTM; the decoder is a stack of
  DECODER_LAYERS LSTM cells wrapped with Luong attention over the encoder
  outputs. `__init__` creates the cells and variables, `forward` builds the
  training graph.
  """
  # Create the cells and variables used by the model.
  def __init__(self):
      """Create the RNN cells, embeddings and softmax parameters.

      Sizes come from the module-level constants HIDDEN_SIZE, DECODER_LAYERS,
      SRC_VOCAB_SIZE, TRG_VOCAB_SIZE and SHARE_EMB_AND_SOFTMAX.
      """
      # RNN cells used by the encoder and the decoder.

      # Forward and backward cell of the single-layer bidirectional encoder.
      self.enc_cell_fw = tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE)
      self.enc_cell_bw = tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE)      

      # Decoder: DECODER_LAYERS stacked basic LSTM cells.
      self.dec_cell = tf.nn.rnn_cell.MultiRNNCell(
        [tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE) for _ in range(DECODER_LAYERS)])

      # Word embeddings for the source and target language.
      self.src_embedding = tf.get_variable(
          "src_emb", [SRC_VOCAB_SIZE, HIDDEN_SIZE])
      self.trg_embedding = tf.get_variable(
          "trg_emb", [TRG_VOCAB_SIZE, HIDDEN_SIZE])

      # Softmax layer parameters. When sharing is enabled the projection is the
      # transposed target embedding, so no separate weight variable is created.
      if SHARE_EMB_AND_SOFTMAX:
         self.softmax_weight = tf.transpose(self.trg_embedding)
      else:
         self.softmax_weight = tf.get_variable(
             "weight", [HIDDEN_SIZE, TRG_VOCAB_SIZE])
      self.softmax_bias = tf.get_variable(
          "softmax_bias", [TRG_VOCAB_SIZE])

  # Build the forward graph.
  # src_input, src_size, trg_input, trg_label, trg_size are from MakeSrcTrgDataset

  def forward(self, src_input, src_size, trg_input, trg_label, trg_size, LEARNING_RATE):
      """Build the training graph for one batch.

      Args:
          src_input: source word ids; the first dimension is used as batch size.
          src_size: source sentence lengths, used as RNN sequence length and
              as attention memory length.
          trg_input: decoder input word ids ("<sos> X Y Z").
          trg_label: decoder target word ids ("X Y Z <eos>").
          trg_size: target sentence lengths, used as decoder sequence length
              and to mask padding in the loss.
          LEARNING_RATE: learning rate given to the Adam optimizer (shadows
              the module-level constant of the same name).

      Returns:
          (cost_per_token, train_op): the masked cross-entropy averaged over
          non-padding target tokens, and the op applying one clipped
          gradient update.
      """
      batch_size = tf.shape(src_input)[0]

      # Look up embeddings for the integer word ids.
      src_emb = tf.nn.embedding_lookup(self.src_embedding, src_input)
      trg_emb = tf.nn.embedding_lookup(self.trg_embedding, trg_input)

      # Apply dropout to both embedded inputs (KEEP_PROB is the dropout argument).
      src_emb = tf.nn.dropout(src_emb, KEEP_PROB)
      trg_emb = tf.nn.dropout(trg_emb, KEEP_PROB)

      with tf.variable_scope("encoder"):
          # enc_outputs: a tuple of two tensors, each tensor > [batch_size, max_time, HIDDEN_SIZE]
          # this will be needed when calculating attention

          # enc_state: a tuple of two LSTMStateTuple tensors, each tensor > [batch_size, HIDDEN_SIZE]
          # (enc_state is not used further in this method)
          enc_outputs, enc_state = tf.nn.bidirectional_dynamic_rnn(
              self.enc_cell_fw, self.enc_cell_bw, src_emb, src_size, 
              dtype=tf.float32)
          # Concatenate the forward and backward outputs along the last axis.
          enc_outputs = tf.concat([enc_outputs[0], enc_outputs[1]], -1)     

      with tf.variable_scope("decoder"):

          # Alternative attention mechanism, kept for reference:
#           attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
#               HIDDEN_SIZE, enc_outputs,
#               memory_sequence_length=src_size)

          attention_mechanism = tf.contrib.seq2seq.LuongAttention(
              HIDDEN_SIZE, enc_outputs,
              memory_sequence_length=src_size)

          # Wrap self.dec_cell with the attention mechanism; the attention layer
          # output has HIDDEN_SIZE units, matching the reshape and softmax below.
          attention_cell = tf.contrib.seq2seq.AttentionWrapper(
              self.dec_cell, attention_mechanism,
              attention_layer_size=HIDDEN_SIZE)

          # Run the decoder with attention_cell and dynamic_rnn.
          # No initial_state is passed, so the encoder's final state is not fed
          # to the decoder: it relies entirely on attention as its information source.
          dec_outputs, _ = tf.nn.dynamic_rnn(
              attention_cell, trg_emb, trg_size, dtype=tf.float32)

      # Per-token cross-entropy (log perplexity) over the flattened decoder outputs.
      output = tf.reshape(dec_outputs, [-1, HIDDEN_SIZE])
      logits = tf.matmul(output, self.softmax_weight) + self.softmax_bias
      loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=tf.reshape(trg_label, [-1]), logits=logits)

      # Give padding positions weight 0 when computing the average loss.
      label_weights = tf.sequence_mask(
          trg_size, maxlen=tf.shape(trg_label)[1], dtype=tf.float32)
      label_weights = tf.reshape(label_weights, [-1])
      cost = tf.reduce_sum(loss * label_weights)
      cost_per_token = cost / tf.reduce_sum(label_weights)

      # Backpropagation over every trainable variable in the default graph.
      trainable_variables = tf.trainable_variables()

      # Gradients are taken of the summed cost divided by the batch size
      # (per-sentence loss), not of cost_per_token.
      grads = tf.gradients(cost / tf.to_float(batch_size),
                           trainable_variables)
      
      # Clip by global norm to limit exploding gradients.
      grads, _ = tf.clip_by_global_norm(grads, MAX_GRAD_NORM)
      
      # Alternative optimizer, kept for reference:
#       optimizer = tf.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE)
      optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
      
      train_op = optimizer.apply_gradients(
          zip(grads, trainable_variables))
      
      return cost_per_token, train_op

### 6. Training

In [0]:
def run_epoch(session, cost_op, train_op, saver, step):
    """Run training steps until the input iterator is exhausted.

    Args:
        session: session used to run the ops.
        cost_op: tensor holding the per-token cost.
        train_op: op applying one optimizer update.
        saver: saver used for periodic checkpoints.
        step: global step counter at the start of the epoch.

    Returns:
        The global step counter after the epoch.
    """
    log_every, save_every = 10, 200
    try:
        while True:
            # One optimizer update; also fetch the loss for logging.
            cost, _ = session.run([cost_op, train_op])
            if step % log_every == 0:
                print("After %d steps, per token cost is %.3f" % (step, cost))
            if step % save_every == 0:
                saver.save(session, CHECKPOINT_PATH, global_step=step)
            step += 1
    except tf.errors.OutOfRangeError:
        # Raised once the dataset iterator has no more batches: epoch finished.
        pass
    return step

def main():
    """Build the training graph and train the model for NUM_EPOCH epochs.

    Resets the default graph, creates the model variables under the
    "nmt_model" scope, wires the dataset iterator into the forward graph and
    runs one pass over the data per epoch. Checkpoints are written to
    CHECKPOINT_PATH every 200 steps by run_epoch and once more after the
    last epoch.
    """
    initializer = tf.random_uniform_initializer(-0.05, 0.05)

    tf.reset_default_graph()

    with tf.variable_scope("nmt_model", reuse=None,
                           initializer=initializer):
        train_model = NMTModel()

    # Input pipeline; the iterator is re-initialized at the start of every epoch.
    data = MakeSrcTrgDataset(SRC_TRAIN_DATA, TRG_TRAIN_DATA, BATCH_SIZE)
    iterator = data.make_initializable_iterator()
    (src_input, src_size), (trg_input, trg_label, trg_size) = iterator.get_next()

    # Forward graph, loss and training op.
    cost_op, train_op = train_model.forward(src_input, src_size, trg_input,
                                            trg_label, trg_size, LEARNING_RATE)

    # Train.
    saver = tf.train.Saver()
    tf.add_to_collection('train_op', train_op)
    tf.add_to_collection('cost_op', cost_op)
    step = 0
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        for i in range(NUM_EPOCH):
            print("In iteration: %d" % (i + 1))
            sess.run(iterator.initializer)
            step = run_epoch(sess, cost_op, train_op, saver, step)

        # Periodic saves only fire on multiples of 200 steps, so without this
        # the updates made after the last multiple would never be persisted.
        saver.save(sess, CHECKPOINT_PATH, global_step=step)
            
# Start training when the cell (or an exported script) is executed directly.
if __name__ == "__main__":
    main()

Instructions for updating:
This class is deprecated, please use tf.nn.rnn_cell.LSTMCell, which supports all the feature this cell currently has. Please replace the existing code with tf.nn.rnn_cell.LSTMCell(name='basic_lstm_cell').
In iteration: 1
After 0 steps, per token cost is 10.803
After 10 steps, per token cost is 9.373
After 20 steps, per token cost is 7.310
After 30 steps, per token cost is 7.130
After 40 steps, per token cost is 7.014
After 50 steps, per token cost is 6.863
After 60 steps, per token cost is 7.053
After 70 steps, per token cost is 7.056
After 80 steps, per token cost is 6.931
After 90 steps, per token cost is 6.663
After 100 steps, per token cost is 6.582
After 110 steps, per token cost is 6.374
After 120 steps, per token cost is 6.409
After 130 steps, per token cost is 6.392
After 140 steps, per token cost is 6.352
After 150 steps, per token cost is 6.130
After 160 steps, per token cost is 6.424
After 170 steps, per token cost is 6.280
After 180 steps, per tok

After 1870 steps, per token cost is 2.406
After 1880 steps, per token cost is 2.270
After 1890 steps, per token cost is 2.265
After 1900 steps, per token cost is 2.657
After 1910 steps, per token cost is 2.497
After 1920 steps, per token cost is 2.608
After 1930 steps, per token cost is 2.647
After 1940 steps, per token cost is 2.431
After 1950 steps, per token cost is 2.501
After 1960 steps, per token cost is 2.659
After 1970 steps, per token cost is 2.321
After 1980 steps, per token cost is 2.236
After 1990 steps, per token cost is 2.380
After 2000 steps, per token cost is 2.450
After 2010 steps, per token cost is 2.274
After 2020 steps, per token cost is 2.371
After 2030 steps, per token cost is 2.459
After 2040 steps, per token cost is 2.356
After 2050 steps, per token cost is 2.116
After 2060 steps, per token cost is 2.213
After 2070 steps, per token cost is 2.259
After 2080 steps, per token cost is 2.359
After 2090 steps, per token cost is 2.371
After 2100 steps, per token cost i

After 3820 steps, per token cost is 1.363
After 3830 steps, per token cost is 1.346
After 3840 steps, per token cost is 1.383
After 3850 steps, per token cost is 1.599
After 3860 steps, per token cost is 1.433
After 3870 steps, per token cost is 1.522
After 3880 steps, per token cost is 1.595
After 3890 steps, per token cost is 1.290
After 3900 steps, per token cost is 1.488
In iteration: 6
After 3910 steps, per token cost is 1.385
After 3920 steps, per token cost is 1.554
After 3930 steps, per token cost is 1.425
After 3940 steps, per token cost is 1.378
After 3950 steps, per token cost is 1.196
After 3960 steps, per token cost is 1.334
After 3970 steps, per token cost is 1.332
After 3980 steps, per token cost is 1.333
After 3990 steps, per token cost is 1.411
After 4000 steps, per token cost is 1.656
After 4010 steps, per token cost is 1.267
After 4020 steps, per token cost is 1.337
After 4030 steps, per token cost is 1.320
After 4040 steps, per token cost is 1.279
After 4050 steps, 

After 5760 steps, per token cost is 1.072
After 5770 steps, per token cost is 1.137
After 5780 steps, per token cost is 1.166
After 5790 steps, per token cost is 1.292
After 5800 steps, per token cost is 1.149
After 5810 steps, per token cost is 1.181
After 5820 steps, per token cost is 1.025
After 5830 steps, per token cost is 1.151
After 5840 steps, per token cost is 1.161
After 5850 steps, per token cost is 1.000
After 5860 steps, per token cost is 1.101
After 5870 steps, per token cost is 1.130
After 5880 steps, per token cost is 1.048
After 5890 steps, per token cost is 0.853
After 5900 steps, per token cost is 0.966
After 5910 steps, per token cost is 1.314
After 5920 steps, per token cost is 1.243
After 5930 steps, per token cost is 1.067
After 5940 steps, per token cost is 1.311
After 5950 steps, per token cost is 1.157
After 5960 steps, per token cost is 1.035
After 5970 steps, per token cost is 1.127
After 5980 steps, per token cost is 0.972
After 5990 steps, per token cost i

After 7710 steps, per token cost is 0.798
After 7720 steps, per token cost is 0.854
After 7730 steps, per token cost is 0.810
After 7740 steps, per token cost is 0.729
After 7750 steps, per token cost is 0.817
After 7760 steps, per token cost is 0.751
After 7770 steps, per token cost is 0.746
After 7780 steps, per token cost is 0.735
After 7790 steps, per token cost is 0.794
After 7800 steps, per token cost is 0.649
After 7810 steps, per token cost is 0.659
In iteration: 11
After 7820 steps, per token cost is 0.872
After 7830 steps, per token cost is 0.826
After 7840 steps, per token cost is 0.841
After 7850 steps, per token cost is 0.814
After 7860 steps, per token cost is 0.773
After 7870 steps, per token cost is 0.728
After 7880 steps, per token cost is 0.919
After 7890 steps, per token cost is 0.856
After 7900 steps, per token cost is 0.781
After 7910 steps, per token cost is 0.798
After 7920 steps, per token cost is 0.920
After 7930 steps, per token cost is 0.743
After 7940 steps,

After 9650 steps, per token cost is 0.548
After 9660 steps, per token cost is 0.693
After 9670 steps, per token cost is 0.737
After 9680 steps, per token cost is 0.663
After 9690 steps, per token cost is 0.682
After 9700 steps, per token cost is 0.698
After 9710 steps, per token cost is 0.647
After 9720 steps, per token cost is 0.697
After 9730 steps, per token cost is 0.558
After 9740 steps, per token cost is 0.822
After 9750 steps, per token cost is 0.673
After 9760 steps, per token cost is 0.725
After 9770 steps, per token cost is 0.557
After 9780 steps, per token cost is 0.766
After 9790 steps, per token cost is 0.659
After 9800 steps, per token cost is 0.629
After 9810 steps, per token cost is 0.713
After 9820 steps, per token cost is 0.578
After 9830 steps, per token cost is 0.660
After 9840 steps, per token cost is 0.650
After 9850 steps, per token cost is 0.701
After 9860 steps, per token cost is 0.658
After 9870 steps, per token cost is 0.721
After 9880 steps, per token cost i

After 11560 steps, per token cost is 0.646
After 11570 steps, per token cost is 0.550
After 11580 steps, per token cost is 0.541
After 11590 steps, per token cost is 0.611
After 11600 steps, per token cost is 0.566
After 11610 steps, per token cost is 0.435
After 11620 steps, per token cost is 0.553
After 11630 steps, per token cost is 0.564
After 11640 steps, per token cost is 0.541
After 11650 steps, per token cost is 0.498
After 11660 steps, per token cost is 0.547
After 11670 steps, per token cost is 0.497
After 11680 steps, per token cost is 0.488
After 11690 steps, per token cost is 0.493
After 11700 steps, per token cost is 0.594
After 11710 steps, per token cost is 0.580
After 11720 steps, per token cost is 0.524
In iteration: 16
After 11730 steps, per token cost is 0.566
After 11740 steps, per token cost is 0.516
After 11750 steps, per token cost is 0.703
After 11760 steps, per token cost is 0.514
After 11770 steps, per token cost is 0.496
After 11780 steps, per token cost is 