<a href="https://colab.research.google.com/github/wenxuan0923/My-notes/blob/master/Tensor2Tensor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Transformer model for language translation

In this note I use the **Tensor2Tensor** (T2T) package, a library of deep learning models developed by the Google Brain team, to implement the **Transformer** architecture proposed in the paper <a href='https://arxiv.org/abs/1706.03762' target='_blank'>Attention Is All You Need</a> for English-Chinese translation. The greatest thing about implementing the Transformer with T2T is its ability to visualize the multi-head attention layers.

- The model is built under the environment of Google Colab with GPU enabled.

- Python code is used for data generation and model training. One can choose to use commands in terminal instead.

- Please refer to this <a href='https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/notebooks/Transformer_translate.ipynb' target='_blank'> tutorial </a> for more details.

- An implementation of the Transformer using pure Keras can be found <a href='#'>here</a>.

### Initialization


In [0]:
import os
import sys
import numpy as np
import collections
import matplotlib.pyplot as plt
# Colab-only TensorFlow version selector
if 'google.colab' in sys.modules: 
  %tensorflow_version 1.x
import tensorflow as tf
from tensor2tensor import models
from tensor2tensor import problems
from tensor2tensor.layers import common_layers
from tensor2tensor.utils import trainer_lib
from tensor2tensor.utils import t2t_model
from tensor2tensor.utils import registry
from tensor2tensor.utils import metrics
from tensor2tensor.utils.trainer_lib import (create_hparams, 
                                             create_run_config, 
                                             create_experiment)
# Enable TF Eager execution.
# `tf.contrib` only exists in TensorFlow 1.x, which is why the 1.x runtime is
# selected before TensorFlow is imported.
tfe = tf.contrib.eager
tfe.enable_eager_execution()
# Shorthand for the estimator mode keys (Modes.PREDICT / Modes.EVAL are used
# further down when building and running the model).
Modes = tf.estimator.ModeKeys

In [0]:
# Working directories for the T2T pipeline.
# Not all of them are required -- keep only the ones that suit your needs.

DATA_DIR = os.path.expanduser("/t2t/data")                 # training data
TMP_DIR = os.path.expanduser("/t2t/tmp")                   # scratch space used while generating data
TRAIN_DIR = os.path.expanduser("/t2t/train")               # model checkpoints
EXPORT_DIR = os.path.expanduser("/t2t/export")             # exported model for production
TRANSLATIONS_DIR = os.path.expanduser("/t2t/translation")  # translated sequences
EVENT_DIR = os.path.expanduser("/t2t/event")               # used when testing the BLEU score
USR_DIR = os.path.expanduser("/t2t/user")                  # our own data that we want to add

# Create every directory in one pass, in the same order as declared above.
for t2t_dir in (DATA_DIR, TMP_DIR, TRAIN_DIR, EXPORT_DIR,
                TRANSLATIONS_DIR, EVENT_DIR, USR_DIR):
  tf.gfile.MakeDirs(t2t_dir)

This will generate the folders below:
<center><img src='https://drive.google.com/uc?id=1vldUGgC5SaNVjlhXCLcOaAErVckdElGq'></img></center>

### Initialize parameters

In [0]:
# Name of the registered T2T problem (dataset + vocabulary).
# Run `problems.available()` to list every registered problem.
# This one is an English-Chinese dataset with an 8192-entry vocabulary.
PROBLEM = 'translate_enzh_wmt8k'

# Name of the registered T2T model.
# Run `registry.list_models()` to list every registered model.
MODEL = 'transformer'

# Name of the hyper-parameter set.
# Start with 'transformer_base', or with 'transformer_base_single_gpu'
# if training on a single GPU.
HPARAMS = 'transformer_base_single_gpu'

### Data Generation

In [0]:
# The data will be stored in the data folder we just created
%%time
t2t_problem = problems.problem(PROBLEM)
t2t_problem.generate_data(DATA_DIR, TMP_DIR)

### Train the model

In [0]:
# Training configuration. These values are consumed when the run config and
# the experiment are created.
train_steps = 50000           # Total number of train steps for all Epochs
eval_steps = 20               # Number of steps to perform for each evaluation
# NOTE(review): for T2T text problems batch_size is presumably counted in
# subword tokens rather than sentences -- confirm against the T2T docs.
batch_size = 1000       
save_checkpoints_steps = 50   # Save checkpoints every 50 steps
ALPHA = 0.1                   # Learning rate
# NOTE(review): `schedule` is not passed to anything in the visible cells.
schedule = "continuous_train_and_eval"

# Init Hparams object from the registered hyper-parameter set
hparams = create_hparams(HPARAMS)

# Override the defaults with the values chosen above
hparams.batch_size = batch_size
hparams.learning_rate = ALPHA
# To inspect all Hparams, run:
#   import json; print(json.loads(hparams.to_json()))

In [0]:
# Run configuration: checkpoints are written to TRAIN_DIR every
# `save_checkpoints_steps` steps.
RUN_CONFIG = create_run_config(
    model_dir=TRAIN_DIR,
    model_name=MODEL,
    save_checkpoints_steps=save_checkpoints_steps,
)

# Bundle model, problem, data location and step counts into one experiment.
tensorflow_exp_fn = create_experiment(
    run_config=RUN_CONFIG,
    hparams=hparams,
    model_name=MODEL,
    problem_name=PROBLEM,
    data_dir=DATA_DIR,
    train_steps=train_steps,
    eval_steps=eval_steps,
    use_xla=True,  # For acceleration
)

# Run the experiment's train-and-evaluate loop.
tensorflow_exp_fn.train_and_evaluate()

### Prediction of sentence

In [36]:
# Get the encoders from the problem
encoders = t2t_problem.feature_encoders(DATA_DIR)

# Get the latest checkpoint from the directory the training run wrote to.
# Use TRAIN_DIR rather than a path relative to the current working directory,
# so the lookup does not depend on where the notebook happens to be running.
ckpt_path = tf.train.latest_checkpoint(TRAIN_DIR)
if ckpt_path is None:
  # latest_checkpoint returns None when the directory holds no checkpoint;
  # fail loudly here instead of carrying None into the cells below.
  raise ValueError("No checkpoint found in %s -- train the model first." % TRAIN_DIR)
print(ckpt_path)

# Build the model in PREDICT mode.
# NOTE: this rebinds `hparams`, replacing the object used for training, now
# with the problem and data directory attached.
hparams = create_hparams(HPARAMS, data_dir=DATA_DIR, problem_name=PROBLEM)
translate_model = registry.model(MODEL)(hparams, Modes.PREDICT)

def translate(inputs):
  """Translate one source sentence and return the decoded target string."""
  features = encode(inputs)
  # Run inference inside the restore context so that variables created by the
  # model are initialised from the checkpoint at `ckpt_path`.
  with tfe.restore_variables_on_create(ckpt_path):
    inference_result = translate_model.infer(features)
    predicted_ids = inference_result["outputs"]
  return decode(predicted_ids)

def encode(input_str, output_str=None):
  """Turn a source sentence into the features dict expected at inference time.

  `output_str` is accepted but not used.
  """
  token_ids = encoders["inputs"].encode(input_str)
  token_ids = token_ids + [1]  # append the EOS id
  # The ids are reshaped to [batch=1, length, 1].
  return {"inputs": tf.reshape(token_ids, [1, -1, 1])}

def decode(integers):
  """Model output ids to str.

  The ids are flattened, cut at the first EOS id (1) if one is present, and
  decoded with the "targets" encoder.
  """
  # Flatten with ravel instead of squeeze: squeeze turns a one-token result
  # into a 0-d array, which can neither be listed nor iterated by the encoder.
  ids = np.ravel(integers).tolist()
  if 1 in ids:
    ids = ids[:ids.index(1)]  # drop EOS and everything after it
  return encoders["targets"].decode(ids)

/t2t/train/model.ckpt-9300


In [128]:
# A few English sentences to translate.
inputs = ["I think they will never come back to the US.", 
          "Human rights is the first priority.",
          'Everyone should have health insurance.',
          'Approval rate of Trump has greatly decreased.']

# NOTE: `sentence` and `output` stay bound to the LAST pair after this loop;
# the attention-visualisation cell further down reads them.
for sentence in inputs:
  output = translate(sentence)
  # ANSI escape codes colour the "Inputs:" / "Outputs:" labels.
  print("\33[34m Inputs:\33[30m %s" % sentence)
  print("\033[35m Outputs:\33[30m %s" % output)
  print()

[34m Inputs:[30m I think they will never come back to the US.
[35m Outputs:[30m 我认为他们永远不会回到美国。

[34m Inputs:[30m Human rights is the first priority.
[35m Outputs:[30m 人权权利是第一个问题。

[34m Inputs:[30m Everyone should have health insurance.
[35m Outputs:[30m 每个人应该有健康保险。

[34m Inputs:[30m Approval rate of Trump has greatly decreased.
[35m Outputs:[30m 特朗普的利率已经大幅下降。



In [0]:
from tensor2tensor.visualization import attention
from tensor2tensor.data_generators import text_encoder

# Upper bound on the rows/columns kept from each attention matrix by resize().
SIZE = 35

def encode_eval(input_str, output_str):
  """Build the features dict (source and target sentence) for an eval pass.

  Args:
    input_str: source sentence, encoded with the "inputs" encoder.
    output_str: target sentence, encoded with the "targets" encoder.

  Returns:
    Dict with "inputs" and "targets"; each is the encoded ids plus the EOS
    id (1), reshaped to [1, length, 1, 1].
  """
  inputs = tf.reshape(encoders["inputs"].encode(input_str) + [1], [1, -1, 1, 1])  # Make it 4D.
  # The target sentence has to go through the "targets" encoder -- the same
  # one used to decode model output and to tokenize it for display.
  outputs = tf.reshape(encoders["targets"].encode(output_str) + [1], [1, -1, 1, 1])  # Make it 4D.
  return {"inputs": inputs, "targets": outputs}

def get_att_mats():
  """Collect the resized attention weights of every hidden layer.

  Returns:
    (enc_atts, dec_atts, encdec_atts): three lists with one entry per layer,
    holding encoder self-attention, decoder self-attention and
    encoder-decoder attention respectively.
  """
  # Keys under which the model stores its attention weights, per layer index.
  key_templates = (
      "transformer/body/encoder/layer_%i/self_attention/multihead_attention/dot_product_attention",
      "transformer/body/decoder/layer_%i/self_attention/multihead_attention/dot_product_attention",
      "transformer/body/decoder/layer_%i/encdec_attention/multihead_attention/dot_product_attention",
  )
  collected = tuple([] for _ in key_templates)

  for layer in range(hparams.num_hidden_layers):
    # Look up all three matrices first (element 0 of each entry), then resize.
    layer_mats = [translate_model.attention_weights[template % layer][0]
                  for template in key_templates]
    for bucket, mat in zip(collected, layer_mats):
      bucket.append(resize(mat))

  enc_atts, dec_atts, encdec_atts = collected
  return enc_atts, dec_atts, encdec_atts

def resize(np_mat):
  """Crop an attention matrix to SIZE x SIZE and normalise it over axis 0.

  The input is indexed with three axes; the result gains a leading axis of
  length 1.
  """
  # Keep at most SIZE entries along the last two axes.
  cropped = np_mat[:, :SIZE, :SIZE]
  # Sum over the first axis (the original comment calls it "heads") ...
  axis0_totals = np.sum(cropped, axis=0)
  # ... and divide by that sum so the values along axis 0 are normalised.
  normalised = cropped / axis0_totals[np.newaxis, :]
  dims = normalised.shape
  # Add extra dim for viz code to work.
  return np.reshape(normalised, (1, dims[0], dims[1], dims[2]))

def to_tokens(ids, is_input=True):
  """Map subword ids to their display strings.

  Args:
    ids: subword ids; singleton dimensions are squeezed away.
    is_input: use the 'inputs' vocabulary when True, 'targets' otherwise.

  Returns:
    List of strings. Ids 0, 1 and -1 become '<PAD>', '<EOS>' and '<NULL>';
    every other id is looked up in the chosen vocabulary.
  """
  # squeeze alone turns a single id into a 0-d array that cannot be iterated;
  # atleast_1d keeps the one-token case working.
  ids = np.atleast_1d(np.squeeze(ids))
  vocab_key = 'inputs' if is_input else 'targets'
  subtokenizer = hparams.problem_hparams.vocabulary[vocab_key]
  special_tokens = {0: '<PAD>', 1: '<EOS>', -1: '<NULL>'}
  tokens = []
  for _id in ids:
    if _id == 0 or _id == 1 or _id == -1:
      tokens.append(special_tokens[int(_id)])
    else:
      tokens.append(subtokenizer._subtoken_id_to_subtoken_string(_id))
  return tokens

In [0]:
def call_html():
  """Display an HTML snippet that loads require.js and configures its paths.

  The snippet registers `base`, `d3` and `jquery` with requirejs.
  """
  import IPython
  requirejs_setup = '''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.8/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''
  html_snippet = IPython.core.display.HTML(requirejs_setup)
  display(html_snippet)

In [133]:
# Convert inputs and outputs to subwords.
# NOTE: `sentence` and `output` are the values left behind by the translation
# loop above, i.e. its last sentence and that sentence's translation.
inp_text = to_tokens(encoders["inputs"].encode(sentence))
out_text = to_tokens(encoders["targets"].encode(output), is_input=False)

# Run eval to collect attention weights
example = encode_eval(sentence, output)
# `ckpt_path` already is the checkpoint prefix returned by
# tf.train.latest_checkpoint, so pass it straight through (as translate() does).
# Feeding it to latest_checkpoint again would treat it as a directory.
with tfe.restore_variables_on_create(ckpt_path):
  translate_model.set_mode(Modes.EVAL)
  translate_model(example)
# Get normalized attention weights for each layer
enc_atts, dec_atts, encdec_atts = get_att_mats()

# Load require.js / d3 / jquery first, then render the attention view.
call_html()
attention.show(inp_text, out_text, enc_atts, dec_atts, encdec_atts)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>