This notebook is constructed from https://github.com/Kyubyong/transformer for my study.

Dataset is http://www.phontron.com/kftt/.

## Download data.

In [1]:
import os

if not os.path.exists("./corpora-jp-en"):
    os.makedirs("./corpora-jp-en", exist_ok=True)
    !wget -qO- --show-progress http://www.phontron.com/kftt/download/kftt-data-1.0.tar.gz | tar xz; mv kftt-data-1.0 corpora-jp-en

## Set hyperparameters.

In [2]:
class Hyperparams:
    '''Hyperparameters'''
    # data
    source_train = './corpora-jp-en/kftt-data-1.0/data/tok/kyoto-train.ja'
    target_train = './corpora-jp-en/kftt-data-1.0/data/tok/kyoto-train.en'
    source_test = './corpora-jp-en/kftt-data-1.0/data/tok/kyoto-test.ja'
    target_test = './corpora-jp-en/kftt-data-1.0/data/tok/kyoto-test.en'
    
    # training
    batch_size = 32 # alias = N
    lr = 0.0001 # learning rate. In paper, learning rate is adjusted to the global step.
    logdir = 'logdir-jp-en' # log directory
    
    # model
    maxlen = 10 # Maximum number of words in a sentence. alias = T.  Feel free to increase this if you are ambitious.
    min_cnt = 20 # words whose occurred less than min_cnt are encoded as <UNK>.
    hidden_units = 512 # alias = C
    num_blocks = 6 # number of encoder/decoder blocks
    num_epochs = 20
    num_heads = 8
    dropout_rate = 0.1
    sinusoid = False # If True, use sinusoid. If false, positional embedding.

In [3]:
hp = Hyperparams()

## Define functions for data handling and modeling.

In [4]:
from __future__ import print_function
import tensorflow as tf
import numpy as np
import codecs
import os
import regex
from collections import Counter

def make_vocab(fpath
               , fname
              , is_ja = False):
    '''Constructs vocabulary.
    
    Args:
      fpath: A string. Input file path.
      fname: A string. Output file name.
    
    Writes vocabulary line by line to `preprocessed/fname`
    '''  
    text = codecs.open(fpath, 'r', 'utf-8').read()
    if is_ja == False:
        text = regex.sub("[^\s\p{Latin}']", "", text)
    words = text.split()
    word2cnt = Counter(words)
    
    if not os.path.exists('preprocessed-jp-en'): os.mkdir('preprocessed-jp-en')
    
    with codecs.open('preprocessed-jp-en/{}'.format(fname), 'w', 'utf-8') as fout:
        fout.write("{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n".format("<PAD>", "<UNK>", "<S>", "</S>"))
        for word, cnt in word2cnt.most_common(len(word2cnt)):
            fout.write(u"{}\t{}\n".format(word, cnt))

In [5]:
%%time

if not os.path.exists("./preprocessed-jp-en/ja.vocab.tsv"):
    make_vocab(hp.source_train, "ja.vocab.tsv", is_ja=True)
    make_vocab(hp.target_train, "en.vocab.tsv")
    print("Done")
else:
    print("File already exists.")

File already exists.
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.42 ms


In [6]:
def normalize(inputs
              , epsilon = 1e-8
              , scope="ln"
              , reuse=None):
    '''Applies layer normalization.
    
    Args:
      inputs: A tensor with 2 or more dimensions, where the first dimension has `batch_size`.
      epsilon: A floating number. A very small number for preventing ZeroDivision Error.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer by the same name.
      
    Returns:
      A tensor with the same shape and data dtype as `inputs`.
    '''
    with tf.variable_scope(scope, reuse=reuse):
        inputs_shape = inputs.get_shape()
        params_shape = inputs_shape[-1:]
    
        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta= tf.Variable(tf.zeros(params_shape)) #Is this necessary?
        gamma = tf.Variable(tf.ones(params_shape))
        normalized = (inputs - mean) / ( (variance + epsilon) ** (.5) )
        outputs = gamma * normalized + beta
        
    return outputs

In [7]:
inputs = tf.constant([[1.0, 2.0, 3.0],[4.0, 8.0, 16.0]], name="test")
outputs = normalize(inputs, scope='test')
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(outputs))

[[-1.2247448   0.          1.2247448 ]
 [-1.0690448  -0.26726115  1.3363062 ]]


In [8]:
def embedding(inputs
              , vocab_size
              , num_units
              , zero_pad=True
              , scale=True
              , scope="embedding"
              , reuse=None):
    '''Embeds a given tensor.
    Args:
      inputs: A `Tensor` with type `int32` or `int64` containing the ids to be looked up in `lookup table`.
      vocab_size: An int. Vocabulary size.
      num_units: An int. Number of embedding hidden units.
      zero_pad: A boolean. If True, all the values of the fist row (id 0) should be constant zeros.
      scale: A boolean. If True. the outputs is multiplied by sqrt num_units.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer by the same name.
    Returns:
      A `Tensor` with one more rank than inputs's. The last dimensionality should be `num_units`.
        
    For example,
    
    ```
    import tensorflow as tf
    
    inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
    outputs = embedding(inputs, 6, 2, zero_pad=True)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print sess.run(outputs)
    >>
    [[[ 0.          0.        ]
      [ 0.09754146  0.67385566]
      [ 0.37864095 -0.35689294]]
     [[-1.01329422 -1.09939694]
      [ 0.7521342   0.38203377]
      [-0.04973143 -0.06210355]]]
    ```
    
    ```
    import tensorflow as tf
    
    inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
    outputs = embedding(inputs, 6, 2, zero_pad=False)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print sess.run(outputs)
    >>
    [[[-0.19172323 -0.39159766]
      [-0.43212751 -0.66207761]
      [ 1.03452027 -0.26704335]]
     [[-0.11634696 -0.35983452]
      [ 0.50208133  0.53509563]
      [ 1.22204471 -0.96587461]]]    
    ```    
    '''
    with tf.variable_scope(scope, reuse=reuse):
        lookup_table = tf.get_variable('lookup_table'
                                       , dtype=tf.float32
                                       , shape=[vocab_size, num_units]
                                       , initializer=tf.contrib.layers.xavier_initializer())
        if zero_pad:
            lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), lookup_table[1:, :]), 0)
        outputs = tf.nn.embedding_lookup(lookup_table, inputs)
        
        if scale:
            outputs = outputs * (num_units ** 0.5) 
            
    return outputs

In [9]:
inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
outputs = embedding(inputs, 6, 2, zero_pad=False)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print( sess.run(outputs) )

[[[-0.68886787  0.70741284]
  [-0.13518551  0.9180102 ]
  [-1.0187594   0.36321706]]

 [[-0.5569595  -1.1951034 ]
  [-0.50075996  1.1593267 ]
  [ 0.78773326 -0.8533228 ]]]


In [10]:
inputs

<tf.Tensor 'Reshape:0' shape=(2, 3) dtype=int32>

In [11]:
outputs.get_shape

<bound method Tensor.get_shape of <tf.Tensor 'embedding/mul:0' shape=(2, 3, 2) dtype=float32>>

The last dimension 2 is for num_units.

In [12]:
def positional_encoding(inputs
                        , num_units
                        , zero_pad=True
                        , scale=True
                        , scope="positional_encoding"
                        , reuse=None):
    '''Sinusoidal Positional_Encoding.
    Args:
      inputs: A 2d Tensor with shape of (N, T).
      num_units: Output dimensionality
      zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zero
      scale: Boolean. If True, the output will be multiplied by sqrt num_units(check details from paper)
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer by the same name.
    Returns:
        A 'Tensor' with one more rank than inputs's, with the dimensionality should be 'num_units'
    '''

    N, T = inputs.get_shape().as_list()
    with tf.variable_scope(scope, reuse=reuse):
        position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1])

        # First part of the PE function: sin and cos argument
        position_enc = np.array([ [pos / np.power(10000, 2.*i/num_units) for i in range(num_units)] for pos in range(T) ])

        # Second part, apply the cosine to even columns and sin to odds.
        position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # dim 2i
        position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # dim 2i+1

        # Convert to a tensor
        lookup_table = tf.convert_to_tensor(position_enc)

        if zero_pad:
            lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), lookup_table[1:, :]), 0)
        outputs = tf.nn.embedding_lookup(lookup_table, position_ind)

        if scale:
            outputs = outputs * num_units**0.5

        return outputs

In [13]:
inputs = tf.constant([[0, 0, 0, 0],[0, 1, 0, 0],[0, 0, 1, 0]])
outputs = positional_encoding(inputs, 2, zero_pad=False)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print( sess.run(outputs) )

[[[0.         1.41421356]
  [1.19001968 1.41421356]
  [1.28594075 1.41421353]
  [0.19957383 1.4142135 ]]

 [[0.         1.41421356]
  [1.19001968 1.41421356]
  [1.28594075 1.41421353]
  [0.19957383 1.4142135 ]]

 [[0.         1.41421356]
  [1.19001968 1.41421356]
  [1.28594075 1.41421353]
  [0.19957383 1.4142135 ]]]


In [14]:
inputs.get_shape

<bound method Tensor.get_shape of <tf.Tensor 'Const:0' shape=(3, 4) dtype=int32>>

In [15]:
np.array([ [pos / np.power(10000, 2.*i/4) for i in range(4)] for pos in range(3) ])

array([[0.e+00, 0.e+00, 0.e+00, 0.e+00],
       [1.e+00, 1.e-02, 1.e-04, 1.e-06],
       [2.e+00, 2.e-02, 2.e-04, 2.e-06]])

In [16]:
def multihead_attention(queries, 
                        keys, 
                        num_units=None, 
                        num_heads=8, 
                        dropout_rate=0,
                        is_training=True,
                        causality=False,
                        scope="multihead_attention", 
                        reuse=None):
    '''Applies multihead attention.
    
    Args:
      queries: A 3d tensor with shape of [N, T_q, C_q].
      keys: A 3d tensor with shape of [N, T_k, C_k].
      num_units: A scalar. Attention size.
      dropout_rate: A floating point number.
      is_training: Boolean. Controller of mechanism for dropout.
      causality: Boolean. If true, units that reference the future are masked. 
      num_heads: An int. Number of heads.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer by the same name.
        
    Returns
      A 3d tensor with shape of (N, T_q, C)  
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Set the fall back option for num_units
        if num_units is None:
            num_units = queries.get_shape().as_list[-1]
        
        # Linear projections
        Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu) # (N, T_q, C)
        K = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C)
        V = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # (N, T_k, C)
        
        # Split and concat
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # (h*N, T_q, C/h) 
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 

        # Multiplication
        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # (h*N, T_q, T_k) < Q_ * K_^T
        
        # Scale
        outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5)
        
        # Key Masking
        key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1))) # (N, T_k) < get sign < compute abs < sum over hidden units
        key_masks = tf.tile(key_masks, [num_heads, 1]) # (h*N, T_k)
        key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1]) # (h*N, T_q, T_k)
        
        paddings = tf.ones_like(outputs)*(-2**32+1)
        outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs) # (h*N, T_q, T_k)
  
        # Causality = Future blinding
        if causality:
            diag_vals = tf.ones_like(outputs[0, :, :]) # (T_q, T_k)
            tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense() # (T_q, T_k) < LinearOperator acting like a [batch] square lower triangular matrix
            masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1]) # (h*N, T_q, T_k)
   
            paddings = tf.ones_like(masks)*(-2**32+1)
            outputs = tf.where(tf.equal(masks, 0), paddings, outputs) # (h*N, T_q, T_k)
  
        # Activation
        outputs = tf.nn.softmax(outputs) # (h*N, T_q, T_k)
         
        # Query Masking
        query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1))) # (N, T_q)
        query_masks = tf.tile(query_masks, [num_heads, 1]) # (h*N, T_q)
        query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]]) # (h*N, T_q, T_k)
        outputs *= query_masks # broadcasting. (N, T_q, C)
          
        # Dropouts
        outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training))
               
        # Weighted sum
        outputs = tf.matmul(outputs, V_) # ( h*N, T_q, C/h) < masked(Q_ * K_^T) * V_
        
        # Restore shape
        outputs = tf.concat( tf.split(outputs, num_heads, axis=0), axis=2 ) # (N, T_q, C)
              
        # Residual connection
        outputs += queries
              
        # Normalize
        outputs = normalize(outputs) # (N, T_q, C)
 
    return outputs

In [17]:
def feedforward(inputs
                , num_units=[2048, 512] #Two hidden layers
                , scope="multihead_attention"
                , reuse=None):
    '''Point-wise feed forward net.
    
    Args:
      inputs: A 3d tensor with shape of [N, T, C].
      num_units: A list of two integers.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer by the same name.
        
    Returns:
      A 3d tensor with the same shape and dtype as inputs
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Inner layer
        params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1,
                  "activation": tf.nn.relu, "use_bias": True}
        outputs = tf.layers.conv1d(**params)
        
        # Readout layer
        params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1,
                  "activation": None, "use_bias": True}
        outputs = tf.layers.conv1d(**params)
        
        # Residual connection
        outputs += inputs
        
        # Normalize
        outputs = normalize(outputs)
    
    return outputs

In [18]:
def label_smoothing(inputs
                    , epsilon=0.1):
    '''Applies label smoothing. See https://arxiv.org/abs/1512.00567.
    
    Args:
      inputs: A 3d tensor with shape of [N, T, V], where V is the number of vocabulary.
      epsilon: Smoothing rate.
    
    For example,
    
    ```
    import tensorflow as tf
    inputs = tf.convert_to_tensor([[[0, 0, 1], 
       [0, 1, 0],
       [1, 0, 0]],
      [[1, 0, 0],
       [1, 0, 0],
       [0, 1, 0]]], tf.float32)
       
    outputs = label_smoothing(inputs)
    
    with tf.Session() as sess:
        print(sess.run([outputs]))
    
    >>
    [array([[[ 0.03333334,  0.03333334,  0.93333334],
        [ 0.03333334,  0.93333334,  0.03333334],
        [ 0.93333334,  0.03333334,  0.03333334]],
       [[ 0.93333334,  0.03333334,  0.03333334],
        [ 0.93333334,  0.03333334,  0.03333334],
        [ 0.03333334,  0.93333334,  0.03333334]]], dtype=float32)]   
    ```    
    '''
    K = inputs.get_shape().as_list()[-1] # number of channels
    return ((1-epsilon) * inputs) + (epsilon / K)

## Load data.

In [47]:
def load_ja_vocab():
    vocab = [line.split()[0] for line in codecs.open('preprocessed-jp-en/ja.vocab.tsv', 'r', 'utf-8').read().splitlines() if int(line.split()[1])>=hp.min_cnt]
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx2word = {idx: word for idx, word in enumerate(vocab)}
    return word2idx, idx2word

def load_en_vocab():
    vocab = [line.split()[0] for line in codecs.open('preprocessed-jp-en/en.vocab.tsv', 'r', 'utf-8').read().splitlines() if int(line.split()[1])>=hp.min_cnt]
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx2word = {idx: word for idx, word in enumerate(vocab)}
    return word2idx, idx2word

def create_data(source_sents, target_sents): 
    ja2idx, idx2ja = load_ja_vocab()
    en2idx, idx2en = load_en_vocab()
    
    # Index
    x_list, y_list, Sources, Targets = [], [], [], []
    for source_sent, target_sent in zip(source_sents, target_sents):
        x = [ja2idx.get(word, 1) for word in (source_sent + u" </S>").split()] # 1: OOV, </S>: End of Text
        y = [en2idx.get(word, 1) for word in (target_sent + u" </S>").split()] 
        if max(len(x), len(y)) <=hp.maxlen:
            x_list.append(np.array(x))
            y_list.append(np.array(y))
            Sources.append(source_sent)
            Targets.append(target_sent)
    
    # Pad      
    X = np.zeros([len(x_list), hp.maxlen], np.int32)
    Y = np.zeros([len(y_list), hp.maxlen], np.int32)
    for i, (x, y) in enumerate(zip(x_list, y_list)):
        X[i] = np.lib.pad(x, [0, hp.maxlen-len(x)], 'constant', constant_values=(0, 0))
        Y[i] = np.lib.pad(y, [0, hp.maxlen-len(y)], 'constant', constant_values=(0, 0))
    
    return X, Y, Sources, Targets

def load_train_data():
    ja_sents = [line for line in codecs.open(hp.source_train, 'r', 'utf-8').read().split("\n") if line and line[0] != "<"]
    en_sents = [regex.sub("[^\s\p{Latin}']", "", line) for line in codecs.open(hp.target_train, 'r', 'utf-8').read().split("\n") if line and line[0] != "<"]
    
    X, Y, Sources, Targets = create_data(ja_sents, en_sents)
    return X, Y
    
def load_test_data():
    ja_sents = [line for line in codecs.open(hp.source_test, 'r', 'utf-8').read().split("\n") if line and line[0] != "<"]
    en_sents = [regex.sub("[^\s\p{Latin}']", "", line) for line in codecs.open(hp.target_test, 'r', 'utf-8').read().split("\n") if line and line[0] != "<"]
        
    X, Y, Sources, Targets = create_data(ja_sents, en_sents)
    return X, Sources, Targets

def get_batch_data():
    # Load data
    X, Y = load_train_data()
    
    # calc total batch count
    num_batch = len(X) // hp.batch_size
    
    # Convert to tensor
    X = tf.convert_to_tensor(X, tf.int32)
    Y = tf.convert_to_tensor(Y, tf.int32)
    
    # Create Queues
    input_queues = tf.train.slice_input_producer([X, Y])
            
    # create batch queues
    x, y = tf.train.shuffle_batch(input_queues,
                                num_threads=8,
                                batch_size=hp.batch_size, 
                                capacity=hp.batch_size*64,   
                                min_after_dequeue=hp.batch_size*32, 
                                allow_smaller_final_batch=False)
    
    return x, y, num_batch # (N, T), (N, T), ()

## Define computational graph.

In [20]:
import tqdm

class Graph():
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data() # (N, T)
            else: # inference
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            # define decoder inputs
            self.decoder_inputs = tf.concat((tf.ones_like(self.y[:, :1])*2, self.y[:, :-1]), -1) # 2:<S>

            # Load vocabulary    
            de2idx, idx2de = load_ja_vocab()
            en2idx, idx2en = load_en_vocab()
            
            # Encoder
            with tf.variable_scope("encoder"):
                ## Embedding
                self.enc = embedding(self.x, 
                                      vocab_size=len(de2idx), 
                                      num_units=hp.hidden_units, 
                                      scale=True,
                                      scope="enc_embed")
                
                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                      num_units=hp.hidden_units, 
                                      zero_pad=False, 
                                      scale=False,
                                      scope="enc_pe")
                else:
                    self.enc += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]),
                                      vocab_size=hp.maxlen, 
                                      num_units=hp.hidden_units, 
                                      zero_pad=False, 
                                      scale=False,
                                      scope="enc_pe")
                    
                 
                ## Dropout
                self.enc = tf.layers.dropout(self.enc, 
                                            rate=hp.dropout_rate, 
                                            training=tf.convert_to_tensor(is_training))
                
                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### Multihead Attention
                        self.enc = multihead_attention(queries=self.enc, 
                                                        keys=self.enc, 
                                                        num_units=hp.hidden_units, 
                                                        num_heads=hp.num_heads, 
                                                        dropout_rate=hp.dropout_rate,
                                                        is_training=is_training,
                                                        causality=False)
                        
                        ### Feed Forward
                        self.enc = feedforward(self.enc, num_units=[4*hp.hidden_units, hp.hidden_units])
            
            # Decoder
            with tf.variable_scope("decoder"):
                ## Embedding
                self.dec = embedding(self.decoder_inputs, 
                                      vocab_size=len(en2idx), 
                                      num_units=hp.hidden_units,
                                      scale=True, 
                                      scope="dec_embed")
                
                ## Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                      vocab_size=hp.maxlen, 
                                      num_units=hp.hidden_units, 
                                      zero_pad=False, 
                                      scale=False,
                                      scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0), [tf.shape(self.decoder_inputs)[0], 1]),
                                      vocab_size=hp.maxlen, 
                                      num_units=hp.hidden_units, 
                                      zero_pad=False, 
                                      scale=False,
                                      scope="dec_pe")
                
                ## Dropout
                self.dec = tf.layers.dropout(self.dec, 
                                            rate=hp.dropout_rate, 
                                            training=tf.convert_to_tensor(is_training))
                
                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec = multihead_attention(queries=self.dec, 
                                                        keys=self.dec, 
                                                        num_units=hp.hidden_units, 
                                                        num_heads=hp.num_heads, 
                                                        dropout_rate=hp.dropout_rate,
                                                        is_training=is_training,
                                                        causality=True, 
                                                        scope="self_attention")
                        
                        ## Multihead Attention ( vanilla attention)
                        self.dec = multihead_attention(queries=self.dec, 
                                                        keys=self.enc, 
                                                        num_units=hp.hidden_units, 
                                                        num_heads=hp.num_heads,
                                                        dropout_rate=hp.dropout_rate,
                                                        is_training=is_training, 
                                                        causality=False,
                                                        scope="vanilla_attention")
                        
                        ## Feed Forward
                        self.dec = feedforward(self.dec, num_units=[4*hp.hidden_units, hp.hidden_units])
                
            # Final linear projection
            self.logits = tf.layers.dense(self.dec, len(en2idx))
            self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y))*self.istarget)/ (tf.reduce_sum(self.istarget))
            tf.summary.scalar('acc', self.acc)
                
            if is_training:  
                # Loss
                self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(self.loss*self.istarget) / (tf.reduce_sum(self.istarget))
               
                # Training Scheme
                self.global_step = tf.Variable(0, name='global_step', trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
                self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)
                   
                # Summary 
                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()

## Train the model.

In [21]:
# # Load vocabulary    
# ja2idx, idx2ja = load_ja_vocab()
# en2idx, idx2en = load_en_vocab()
# 
# for key in list(ja2idx.keys())[0:5]:
#     print(key, ja2idx[key])

In [23]:
# Construct graph
g = Graph("train"); print("Graph loaded")

# Start session
sv = tf.train.Supervisor(graph=g.graph, 
                         logdir=hp.logdir,
                         save_model_secs=0)

with sv.managed_session() as sess:
    for epoch in range(1, hp.num_epochs+1): 
        if sv.should_stop(): break
        for step in tqdm.tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'):
            sess.run(g.train_op)

        gs = sess.run(g.global_step)   
        sv.saver.save(sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs))

print("Done")    

Instructions for updating:
Use `argmax` instead
Graph loaded
INFO:tensorflow:Starting standard services.
INFO:tensorflow:Starting queue runners.


  0%|                                         | 0/1283 [00:00<?, ?b/s]

INFO:tensorflow:global_step/sec: 0
INFO:tensorflow:Recording summary at step 0.


  3%|▊                             | 36/1283 [01:57<1:08:04,  3.28s/b]

INFO:tensorflow:global_step/sec: 0.310886
INFO:tensorflow:Recording summary at step 36.


  6%|█▊                            | 75/1283 [03:59<1:04:24,  3.20s/b]

INFO:tensorflow:global_step/sec: 0.324975
INFO:tensorflow:Recording summary at step 75.


  9%|██▌                          | 113/1283 [06:00<1:02:12,  3.19s/b]

INFO:tensorflow:global_step/sec: 0.316655
INFO:tensorflow:Recording summary at step 113.


 12%|███▍                         | 150/1283 [07:57<1:00:10,  3.19s/b]

INFO:tensorflow:global_step/sec: 0.308369
INFO:tensorflow:Recording summary at step 150.


 15%|████▌                          | 189/1283 [09:59<57:47,  3.17s/b]

INFO:tensorflow:global_step/sec: 0.325
INFO:tensorflow:Recording summary at step 189.


 18%|█████▌                         | 228/1283 [11:59<55:28,  3.15s/b]

INFO:tensorflow:global_step/sec: 0.325
INFO:tensorflow:Recording summary at step 228.


 21%|██████▍                        | 268/1283 [13:58<52:56,  3.13s/b]

INFO:tensorflow:global_step/sec: 0.333334
INFO:tensorflow:Recording summary at step 268.


 24%|███████▍                       | 307/1283 [15:57<50:43,  3.12s/b]

INFO:tensorflow:global_step/sec: 0.324988
INFO:tensorflow:Recording summary at step 307.


 27%|████████▎                      | 346/1283 [17:59<48:44,  3.12s/b]

INFO:tensorflow:global_step/sec: 0.325012
INFO:tensorflow:Recording summary at step 346.


 30%|█████████▎                     | 384/1283 [20:00<46:50,  3.13s/b]

INFO:tensorflow:Recording summary at step 384.


 33%|██████████▏                    | 423/1283 [22:00<44:44,  3.12s/b]

INFO:tensorflow:Recording summary at step 423.


 36%|███████████▏                   | 461/1283 [23:58<42:45,  3.12s/b]

INFO:tensorflow:Recording summary at step 461.


 39%|████████████                   | 500/1283 [25:57<40:39,  3.12s/b]

INFO:tensorflow:Recording summary at step 500.


 42%|█████████████                  | 539/1283 [27:58<38:36,  3.11s/b]

INFO:tensorflow:Recording summary at step 539.


 45%|█████████████▉                 | 577/1283 [29:58<36:40,  3.12s/b]

INFO:tensorflow:Recording summary at step 577.


 48%|██████████████▉                | 616/1283 [31:58<34:37,  3.11s/b]

INFO:tensorflow:Recording summary at step 616.


 51%|███████████████▊               | 655/1283 [33:58<32:34,  3.11s/b]

INFO:tensorflow:Recording summary at step 655.


 54%|████████████████▊              | 694/1283 [35:58<30:31,  3.11s/b]

INFO:tensorflow:Recording summary at step 694.


 56%|█████████████████▍             | 721/1283 [37:22<29:07,  3.11s/b]

INFO:tensorflow:Recording summary at step 721.
INFO:tensorflow:Recording summary at step 721.
INFO:tensorflow:Recording summary at step 721.
INFO:tensorflow:Recording summary at step 721.


 59%|██████████████████▏            | 753/1283 [46:00<32:23,  3.67s/b]

INFO:tensorflow:Recording summary at step 753.


 62%|███████████████████            | 791/1283 [47:58<29:50,  3.64s/b]

INFO:tensorflow:Recording summary at step 791.


 65%|████████████████████           | 831/1283 [49:59<27:11,  3.61s/b]

INFO:tensorflow:Recording summary at step 831.


 68%|█████████████████████          | 870/1283 [51:57<24:39,  3.58s/b]

INFO:tensorflow:Recording summary at step 870.


 71%|█████████████████████▉         | 910/1283 [53:59<22:07,  3.56s/b]

INFO:tensorflow:Recording summary at step 910.


 74%|██████████████████████▉        | 949/1283 [55:57<19:41,  3.54s/b]

INFO:tensorflow:Recording summary at step 949.


 77%|███████████████████████▉       | 989/1283 [57:59<17:14,  3.52s/b]

INFO:tensorflow:Recording summary at step 989.


 80%|████████████████████████      | 1028/1283 [59:58<14:52,  3.50s/b]

INFO:tensorflow:Recording summary at step 1028.


 83%|███████████████████████▎    | 1067/1283 [1:02:00<12:33,  3.49s/b]

INFO:tensorflow:Recording summary at step 1067.


 86%|████████████████████████▏   | 1106/1283 [1:03:59<10:14,  3.47s/b]

INFO:tensorflow:Recording summary at step 1106.


 89%|█████████████████████████   | 1146/1283 [1:06:00<07:53,  3.46s/b]

INFO:tensorflow:Recording summary at step 1146.


 92%|█████████████████████████▊  | 1185/1283 [1:07:58<05:37,  3.44s/b]

INFO:tensorflow:Recording summary at step 1185.


 95%|██████████████████████████▋ | 1222/1283 [1:09:59<03:29,  3.44s/b]

INFO:tensorflow:Recording summary at step 1222.


 98%|███████████████████████████▌| 1261/1283 [1:11:59<01:15,  3.43s/b]

INFO:tensorflow:Recording summary at step 1261.


  1%|▎                             | 15/1283 [00:46<1:05:26,  3.10s/b]

INFO:tensorflow:Recording summary at step 1298.


  4%|█▎                            | 54/1283 [02:45<1:02:51,  3.07s/b]

INFO:tensorflow:Recording summary at step 1337.


  7%|██▏                           | 94/1283 [04:47<1:00:34,  3.06s/b]

INFO:tensorflow:Recording summary at step 1377.


  8%|██▌                            | 108/1283 [05:30<59:53,  3.06s/b]

KeyboardInterrupt: 

## Evalute the trained model.

In [55]:
from nltk.translate.bleu_score import corpus_bleu

def eval(): 
    # Load graph
    g = Graph(is_training=False)
    print("Graph loaded")
    
    # Load data
    X, Sources, Targets = load_test_data()
    ja2idx, idx2ja = load_ja_vocab()
    en2idx, idx2en = load_en_vocab()
          
    # Start session         
    with g.graph.as_default():    
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            ## Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print("Restored!")
              
            ## Get model name
            mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name
             
            ## Inference
            if not os.path.exists('results-jp-en'): os.mkdir('results-jp-en')
            with codecs.open("results-jp-en/" + mname, "w", "utf-8") as fout:
                list_of_refs, hypotheses = [], []
                for i in range(len(X) // hp.batch_size):
                     
                    ### Get mini-batches
                    x = X[i*hp.batch_size: (i+1)*hp.batch_size]
                    sources = Sources[i*hp.batch_size: (i+1)*hp.batch_size]
                    targets = Targets[i*hp.batch_size: (i+1)*hp.batch_size]
                     
                    ### Autoregressive inference
                    preds = np.zeros((hp.batch_size, hp.maxlen), np.int32)
                    for j in range(hp.maxlen):
                        _preds = sess.run(g.preds, {g.x: x, g.y: preds})
                        preds[:, j] = _preds[:, j]
                     
                    ### Write to file
                    for source, target, pred in zip(sources, targets, preds): # sentence-wise
                        got = " ".join(idx2en[idx] for idx in pred).split("</S>")[0].strip()
                        fout.write("- source: " + source +"\n")
                        fout.write("- expected: " + target + "\n")
                        fout.write("- got: " + got + "\n\n")
                        fout.flush()
                          
                        # bleu score
                        ref = target.split()
                        hypothesis = got.split()
                        if len(ref) > 3 and len(hypothesis) > 3:
                            list_of_refs.append([ref])
                            hypotheses.append(hypothesis)
                            
                ## Calculate bleu score
                score = corpus_bleu(list_of_refs, hypotheses)
                fout.write("Bleu Score = " + str(100*score))

In [56]:
eval()
print("Done")

Graph loaded
INFO:tensorflow:Starting standard services.
INFO:tensorflow:Starting queue runners.
INFO:tensorflow:Restoring parameters from logdir-jp-en/model_epoch_01_gs_1283
Restored!
Done


Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
