In [0]:
# Mount Google Drive so the project files are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')
import os
# Work from the project's `main` directory: later cells use paths relative to
# it (e.g. '../vocab/char.txt', '../data/train_pos.txt', '../model/...').
os.chdir('/content/gdrive/My Drive/finch/tensorflow1/multi_turn_rewrite/chinese/main')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Select the TensorFlow 1.x runtime (Colab magic) and install texar, which is
# imported below as `texar.tf`.
# NOTE(review): the texar version is not pinned — consider pinning for reproducibility.
%tensorflow_version 1.x
!pip install texar

TensorFlow 1.x selected.


In [0]:
import tensorflow as tf
import texar.tf as tx

import numpy as np
import pprint
import logging

from pathlib import Path
from modified_beam_search_decoder import BeamSearchDecoder

# Report the TensorFlow version in use and whether TensorFlow can see a GPU.
print("TensorFlow Version", tf.__version__)
print('GPU Enabled:', tf.test.is_gpu_available())

TensorFlow Version 1.15.2
GPU Enabled: True


In [0]:
def get_vocab(f_path):
  """Build a token -> index mapping from a vocabulary file.

  Each line of the file holds one token and the token's index is its
  zero-based line number. Only the trailing newline is removed, so tokens
  that are themselves whitespace are preserved.

  Args:
    f_path: Path to the vocabulary file (one token per line, UTF-8 encoded).

  Returns:
    dict mapping each token string to its integer line index. If a token
    occurs on several lines, the last occurrence wins.
  """
  # Decode explicitly as UTF-8: the vocabulary contains non-ASCII characters
  # and the platform's default encoding is not guaranteed to be UTF-8.
  with open(f_path, encoding='utf-8') as f:
    return {line.rstrip('\n'): i for i, line in enumerate(f)}

In [0]:
def align_pad(li):
  """Right-pad every sequence in `li` with 0 up to the longest length, in place.

  Args:
    li: list of lists of ints. The inner lists are mutated directly; an empty
      outer list is accepted and left untouched.

  Returns:
    None. Callers rely on the in-place mutation of the inner lists.
  """
  if not li:
    # max() raises ValueError on an empty sequence; there is nothing to pad.
    return
  max_len = max(len(sent) for sent in li)
  for sent in li:
    if len(sent) < max_len:
      # `+=` on a list extends the caller's object rather than rebinding it.
      sent += [0] * (max_len - len(sent))

In [0]:
def data_generator(f_paths, params):
  """Yield model-ready examples from tab-separated rewrite files.

  Every input line must contain exactly four tab-separated fields: two history
  turns, the current query and the target rewrite. Characters are mapped to
  ids through `params['char2idx']`; characters missing from the vocabulary all
  map to the single id `len(params['char2idx'])`.

  Args:
    f_paths: iterable of paths to UTF-8 text files to read, in order.
    params: dict providing the 'char2idx' mapping.

  Yields:
    `({'history': [h1, h2], 'query': q}, (a_in, a_out))` where the two history
    turns are zero-padded to equal length, `q` ends with id 2, `a_in` starts
    with id 1 and `a_out` ends with id 2.

  Raises:
    ValueError: if a line does not split into exactly four fields.
  """
  char2idx = params['char2idx']
  unk_id = len(char2idx)  # shared id for out-of-vocabulary characters

  def char2idx_fn(text):
    return [char2idx.get(c, unk_id) for c in text]

  for f_path in f_paths:
    # Explicit UTF-8: the data is Chinese text and the platform default
    # encoding is not guaranteed to be UTF-8.
    with open(f_path, encoding='utf-8') as f:
      print('Reading', f_path)
      for line in f:
        line = line.rstrip()
        h1, h2, q, a = line.split('\t')
        h1, h2, q, a = char2idx_fn(h1), char2idx_fn(h2), char2idx_fn(q), char2idx_fn(a)
        # Keep only target ids that occur somewhere in the source (history or
        # query). A set gives O(1) membership instead of rebuilding and
        # scanning the concatenated list for every target character.
        source_ids = set(h1) | set(h2) | set(q)
        a = [c for c in a if c in source_ids]
        # Pad after filtering so padding zeros never count as source ids.
        align_pad([h1, h2])
        a_in = [1] + a
        a_out = a + [2]
        q = q + [2]
        yield ({'history': [h1, h2], 'query': q}, (a_in, a_out))

In [0]:
def dataset(is_training, params):
  """Build the batched `tf.data.Dataset` for training or evaluation.

  Examples come from `data_generator`, reading `params['train_path']` when
  training and `params['test_path']` otherwise. Training data is additionally
  shuffled with a buffer of `params['buffer_size']`. In both cases batches of
  `params['batch_size']` are zero-padded and prefetched.

  Args:
    is_training: whether to build the (shuffled) training pipeline.
    params: hyper-parameter dict (paths, buffer_size, batch_size, char2idx).

  Returns:
    A dataset of `({'history', 'query'}, (dec_inputs, dec_outputs))` batches.
  """
  element_shapes = ({'history': [None, None], 'query': [None]}, ([None], [None]))
  element_types = ({'history': tf.int32, 'query': tf.int32}, (tf.int32, tf.int32))
  pad_values = ({'history': 0, 'query': 0}, (0, 0))

  path_key = 'train_path' if is_training else 'test_path'
  ds = tf.data.Dataset.from_generator(
    lambda: data_generator(params[path_key], params),
    output_shapes=element_shapes,
    output_types=element_types)

  # Only the training pipeline shuffles; evaluation keeps file order so that
  # predictions line up with labels.
  if is_training:
    ds = ds.shuffle(params['buffer_size'])

  batched = ds.padded_batch(params['batch_size'], element_shapes, pad_values)
  return batched.prefetch(tf.data.experimental.AUTOTUNE)

In [0]:
def clip_grads(loss):
  """Return (gradient, variable) pairs with gradients clipped by global norm.

  Gradients of `loss` are taken with respect to every trainable variable and
  clipped to the global norm `params['clip_norm']` (read from the module-level
  `params`). The variable list is pretty-printed as a side effect.

  Args:
    loss: scalar loss tensor.

  Returns:
    A zip iterator of (clipped_gradient, variable) pairs, suitable for
    `Optimizer.apply_gradients`. Being an iterator, it can be consumed once.
  """
  train_vars = tf.trainable_variables()
  pprint.pprint(train_vars)
  raw_grads = tf.gradients(loss, train_vars)
  clipped, _unused_norm = tf.clip_by_global_norm(raw_grads, params['clip_norm'])
  return zip(clipped, train_vars)


def rnn_cell():
  """Create the decoder's recurrent cell from the module-level `params`.

  With `params['dec_layers']` of 1 (or less) a single GRU cell is returned.
  Otherwise a `MultiRNNCell` is built in which every layer except the last is
  wrapped so that its output is the concatenation of its input and its GRU
  output; the last layer is a plain GRU cell.

  Returns:
    A `GRUCell` or a `MultiRNNCell` of GRU cells.
  """
  num_layers = params['dec_layers']

  def make_gru():
    return tf.nn.rnn_cell.GRUCell(
      params['hidden_units'],
      kernel_initializer=tf.orthogonal_initializer())

  if not num_layers > 1:
    return make_gru()

  def concat_residual(cell_in, cell_out):
    # "Residual" here concatenates input and output instead of adding them.
    return tf.concat((cell_in, cell_out), -1)

  # Lower layers first (wrapped), then the plain top layer.
  stacked = [
    tf.nn.rnn_cell.ResidualWrapper(make_gru(), residual_fn=concat_residual)
    for _ in range(num_layers - 1)
  ]
  stacked.append(make_gru())
  return tf.nn.rnn_cell.MultiRNNCell(stacked)

  
def dec_cell(enc_out, q_enc_len):
  """Build the decoder cell with four Bahdanau attention mechanisms.

  The cell from `rnn_cell()` is wrapped so that it attends over the history
  and query encodings as well as their soft-aligned counterparts.

  Args:
    enc_out: 4-tuple `(h_enc_out, q_enc_out, h_, q_)` of attention memories:
      history encoding, query encoding, and the two soft-aligned variants.
    q_enc_len: lengths used to mask the two query-side memories.

  Returns:
    A `tf.contrib.seq2seq.AttentionWrapper` around `rnn_cell()`.
  """
  h_enc_out, q_enc_out, h_, q_ = enc_out

  # History memories are attended without a length mask
  # (memory_sequence_length=None); the history is several turns laid end to
  # end, so a single per-example length would not describe its padding.
  attn_h = tf.contrib.seq2seq.BahdanauAttention(
    num_units = params['hidden_units'],
    memory = h_enc_out,
    memory_sequence_length = None)
  
  # Query memories are masked with the query lengths.
  attn_q = tf.contrib.seq2seq.BahdanauAttention(
    num_units = params['hidden_units'],
    memory = q_enc_out,
    memory_sequence_length = q_enc_len)
  
  # Attention over the soft-aligned history representation.
  attn_h_ = tf.contrib.seq2seq.BahdanauAttention(
    num_units = params['hidden_units'],
    memory = h_,
    memory_sequence_length = None)
  
  # Attention over the soft-aligned query representation.
  attn_q_ = tf.contrib.seq2seq.BahdanauAttention(
    num_units = params['hidden_units'],
    memory = q_,
    memory_sequence_length = q_enc_len)
  
  # Each mechanism gets its own attention layer of hidden_units//4 units.
  # NOTE(review): the Pointer layer reshapes the decoder output to
  # `hidden_units`, so this presumably relies on hidden_units being divisible
  # by 4 — confirm before changing hidden_units.
  return tf.contrib.seq2seq.AttentionWrapper(
    cell = rnn_cell(),
    attention_mechanism = [attn_h_, attn_q_, attn_h, attn_q],
    attention_layer_size = 4 * [params['hidden_units']//4])
    

class Pointer(tf.layers.Layer):
  """Copy ("pointer") distribution over the vocabulary.

  The decoder output is scored against every encoder position by dot product,
  the scores are soft-maxed over positions, and each position's probability is
  scattered onto the vocabulary id found at that position.
  """

  def __init__(self, encoder_ids, encoder_out, vocab_size, is_beam_search):
    """Store the source ids/encodings the layer points into.

    Args:
      encoder_ids: source token ids, one row per (possibly beam-tiled) example.
      encoder_out: encoder representations aligned with `encoder_ids`.
      vocab_size: size of the output distribution.
      is_beam_search: whether `call` receives beam-shaped inputs.
    """
    super().__init__()
    self.encoder_ids = tf.cast(encoder_ids, tf.int32)
    self.encoder_out = encoder_out
    self.vocab_size = vocab_size
    self.is_beam_search = is_beam_search

  def call(self, inputs):
    """Map decoder outputs to a distribution over vocabulary ids."""
    src_len = tf.shape(self.encoder_ids)[1]
    leading_dim = tf.shape(inputs)[0]
    # Under beam search the inputs carry a separate beam axis; flatten it so
    # rows line up with the beam-tiled encoder tensors.
    if self.is_beam_search:
      flat_batch = leading_dim * params['beam_width']
    else:
      flat_batch = leading_dim
    dec_vec = tf.reshape(inputs, (flat_batch, params['hidden_units']))

    # Dot-product score of the decoder vector with every source position.
    scores = tf.squeeze(
      tf.matmul(self.encoder_out, tf.expand_dims(dec_vec, -1)), -1)
    position_probs = tf.nn.softmax(scores)

    # Pair each source position with its row number to form
    # (row, vocab_id) scatter coordinates.
    row_ids = tf.range(0, flat_batch)
    row_ids = tf.tile(tf.expand_dims(row_ids, axis=1), [1, src_len])
    scatter_idx = tf.stack([row_ids, self.encoder_ids], axis=2)

    vocab_dist = tf.scatter_nd(
      scatter_idx, position_probs, (flat_batch, self.vocab_size))
    if not self.is_beam_search:
      return vocab_dist
    # Restore the beam axis expected by the beam-search decoder.
    return tf.reshape(
      vocab_dist, (leading_dim, params['beam_width'], self.vocab_size))

  def compute_output_shape(self, input_shape):
    """Same leading dimensions as the input, last dimension `vocab_size`."""
    leading = input_shape[:-1]
    return leading.concatenate(self.vocab_size)


class OutputProj(tf.layers.Layer):
  """Decoder output layer mixing a history pointer and a query pointer.

  A learned sigmoid gate (one unit, no bias) decides, per decoding step, how
  much probability mass comes from copying history tokens versus query tokens.
  """

  def __init__(self, h_encoder_ids, q_encoder_ids, h_enc_out, q_enc_out, vocab_size, is_beam_search):
    """Create one `Pointer` over the history and one over the query."""
    super().__init__()
    pointer_kwargs = dict(vocab_size=vocab_size, is_beam_search=is_beam_search)
    self.h_pointer = Pointer(h_encoder_ids, h_enc_out, **pointer_kwargs)
    self.q_pointer = Pointer(q_encoder_ids, q_enc_out, **pointer_kwargs)
    self.vocab_size = vocab_size

  def build(self, input_shape):
    """Create the scalar mixing gate."""
    self.gate_fc = tf.layers.Dense(1, tf.sigmoid, use_bias=False)
    super().build(input_shape)

  def call(self, inputs):
    """Return `gate * history_dist + (1 - gate) * query_dist`."""
    history_dist = self.h_pointer(inputs)
    query_dist = self.q_pointer(inputs)
    history_weight = self.gate_fc(inputs)
    return history_weight * history_dist + (1 - history_weight) * query_dist

  def compute_output_shape(self, input_shape):
    """Same leading dimensions as the input, last dimension `vocab_size`."""
    leading = input_shape[:-1]
    return leading.concatenate(self.vocab_size)

In [0]:
def bigru_encode(encoder, x, mask):
  """Run a bidirectional encoder and join its two final states.

  Args:
    encoder: callable returning `(outputs, forward_state, backward_state)`
      when invoked as `encoder(x, mask=mask)`.
    x: encoder input.
    mask: mask forwarded to the encoder.

  Returns:
    `(outputs, state)` where `state` is the forward and backward final states
    concatenated along the last axis.
  """
  outputs, fw_state, bw_state = encoder(x, mask=mask)
  return outputs, tf.concat((fw_state, bw_state), axis=-1)


def teach_forcing(labels, embedding, enc_out, enc_len, enc_state, batch_sz, params, is_training, encoder_ids):
  """Decode with teacher forcing and return the per-step output distributions.

  Args:
    labels: pair `(dec_inputs, dec_outputs)` of id tensors; only `dec_inputs`
      is used here (fed to the decoder as ground-truth inputs).
    embedding: embedding matrix used to look up `dec_inputs`.
    enc_out: 4-tuple `(h_enc_out, q_enc_out, h_, q_)` of encoder outputs.
    enc_len: pair whose second item is the query length tensor.
    enc_state: tensor used as the decoder cell's initial state.
    batch_sz: batch size used to build the zero state.
    params: hyper-parameter dict ('char2idx', 'dropout_rate').
    is_training: enables dropout on the decoder inputs.
    encoder_ids: pair `(history_ids, query_ids)` the pointers copy from.

  Returns:
    `decoder_output.rnn_output`, i.e. the decoder outputs after `OutputProj`.
  """
  h_enc_out, q_enc_out, h_, q_ = enc_out
  h_ids, q_ids = encoder_ids
  _, q_enc_len = enc_len
  # Output size is the vocabulary plus one extra id for unknown characters.
  output_proj = OutputProj(h_ids, q_ids, h_enc_out, q_enc_out, len(params['char2idx'])+1, is_beam_search=False)

  dec_inputs, dec_outputs = labels
  # Id 0 is padding, so the number of non-zero ids is the sequence length.
  dec_seq_len = tf.count_nonzero(dec_inputs, 1, dtype=tf.int32)
  dec_inputs = tf.nn.embedding_lookup(embedding, dec_inputs)
  dec_inputs = tf.layers.dropout(dec_inputs, params['dropout_rate'], training=is_training)
  cell = dec_cell((h_enc_out, q_enc_out, h_, q_), q_enc_len)
  
  # Start from the wrapper's zero state but seed the inner cell state with the
  # encoder state.
  init_state = cell.zero_state(batch_sz, tf.float32).clone(
    cell_state=enc_state)
  
  helper = tf.contrib.seq2seq.TrainingHelper(
    inputs = dec_inputs,
    sequence_length = dec_seq_len,)
  decoder = tf.contrib.seq2seq.BasicDecoder(
    cell = cell,
    helper = helper,
    initial_state = init_state,
    output_layer = output_proj)
  # Decode no further than the longest target in the batch.
  decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
    decoder = decoder,
    maximum_iterations = tf.reduce_max(dec_seq_len))
  
  return decoder_output.rnn_output


def beam_search(embedding, enc_out, enc_len, enc_state, batch_sz, params, encoder_ids):
  """Decode with beam search and return one id sequence per example.

  Every encoder-side tensor is tiled `params['beam_width']` times so that it
  lines up with the beam-expanded decoder batch.

  Args:
    embedding: embedding matrix handed to the beam-search decoder.
    enc_out: 4-tuple `(h_enc_out, q_enc_out, h_, q_)` of encoder outputs.
    enc_len: pair whose second item is the query length tensor.
    enc_state: tensor used as the decoder cell's initial state.
    batch_sz: un-tiled batch size.
    params: hyper-parameter dict ('beam_width', 'max_len', 'char2idx').
    encoder_ids: pair `(history_ids, query_ids)` the pointers copy from.

  Returns:
    `predicted_ids[:, :, 0]`: the ids of beam index 0 for every example.
  """
  h_enc_out, q_enc_out, h_, q_ = enc_out
  h_enc_out_t = tf.contrib.seq2seq.tile_batch(h_enc_out, params['beam_width'])
  q_enc_out_t = tf.contrib.seq2seq.tile_batch(q_enc_out, params['beam_width'])
  h_t_ = tf.contrib.seq2seq.tile_batch(h_, params['beam_width'])
  q_t_ = tf.contrib.seq2seq.tile_batch(q_, params['beam_width'])
  enc_state_t = tf.contrib.seq2seq.tile_batch(enc_state, params['beam_width'])
  h_ids, q_ids = encoder_ids
  h_ids_t = tf.contrib.seq2seq.tile_batch(h_ids, params['beam_width'])
  q_ids_t = tf.contrib.seq2seq.tile_batch(q_ids, params['beam_width'])
  _, q_enc_len = enc_len
  q_enc_len_t = tf.contrib.seq2seq.tile_batch(q_enc_len, params['beam_width'])
  
  # Output size is the vocabulary plus one extra id for unknown characters.
  output_proj = OutputProj(h_ids_t, q_ids_t, h_enc_out_t, q_enc_out_t, len(params['char2idx'])+1, is_beam_search=True)

  cell = dec_cell((h_enc_out_t, q_enc_out_t, h_t_, q_t_), q_enc_len_t)
  # The zero state must be sized for the beam-expanded batch.
  init_state = cell.zero_state(batch_sz*params['beam_width'], tf.float32).clone(
    cell_state=enc_state_t)
  
  # Ids 1 and 2 are the start and end tokens, matching the ids that
  # data_generator prepends/appends to the targets.
  # NOTE(review): BeamSearchDecoder comes from the project-local
  # modified_beam_search_decoder module; its differences from the
  # tf.contrib version are not visible here.
  decoder = BeamSearchDecoder(
    cell = cell,
    embedding = embedding,
    start_tokens = tf.tile(tf.constant([1], tf.int32), [batch_sz]),
    end_token = 2,
    initial_state = init_state,
    beam_width = params['beam_width'],
    output_layer = output_proj,)
  decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
    decoder = decoder,
    maximum_iterations = params['max_len'],)
  
  # Keep only beam index 0 (presumably the best-scoring beam — confirm
  # against the modified decoder).
  return decoder_output.predicted_ids[:, :, 0]

In [0]:
def masked_attention(x, align, mask, tile_len):
  """Attend over `x` using scores `align`, ignoring masked-out positions.

  Positions where `mask` equals 0 get a score of -inf before the softmax, so
  they receive zero attention weight.

  Args:
    x: values to be averaged.
    align: raw alignment scores; its last axis indexes positions of `x`.
    mask: mask over the positions of `x` (0 marks padding).
    tile_len: size of the middle axis of `align`; the mask is repeated that
      many times so it matches `align` element-wise.

  Returns:
    `softmax(masked align) @ x`.
    NOTE(review): if every position of a row is masked the softmax input is
    all -inf — presumably yielding NaN; confirm callers never hit that case.
  """
  tiled_mask = tf.tile(tf.expand_dims(mask, 1), [1, tile_len, 1])
  neg_inf = tf.fill(tf.shape(align), float('-inf'))
  masked_scores = tf.where(tf.equal(tiled_mask, 0), neg_inf, align)
  weights = tf.nn.softmax(masked_scores)
  return tf.matmul(weights, x)

  
def soft_align_attention(x1, x2, mask1, mask2):
  """Cross-attend two sequences to each other with dot-product scores.

  Args:
    x1, x2: the two sequences of vectors.
    mask1, mask2: masks over the positions of `x1` and `x2` (0 marks padding).

  Returns:
    `(x1_, x2_)`: for every position of `x1`, an attention-weighted summary of
    `x2`, and vice versa.
  """
  scores_1_over_2 = tf.matmul(x1, x2, transpose_b=True)
  # The reverse direction uses the same scores, transposed.
  scores_2_over_1 = tf.transpose(scores_1_over_2, [0, 2, 1])

  x1_aligned = masked_attention(x2, scores_1_over_2, mask2, tf.shape(x1)[1])
  x2_aligned = masked_attention(x1, scores_2_over_1, mask1, tf.shape(x2)[1])
  return x1_aligned, x2_aligned

In [0]:
def forward(features, labels, mode):
  """Build the encoder and run the decoder appropriate for `mode`.

  Args:
    features: dict with 'history' (indexed below as batch x turns x length)
      and 'query' (batch x length) id tensors; id 0 is padding.
    labels: `(dec_inputs, dec_outputs)`; only used for TRAIN/EVAL.
    mode: a `tf.estimator.ModeKeys` value.

  Returns:
    For TRAIN/EVAL the teacher-forced decoder outputs from `teach_forcing`;
    otherwise the beam-search predicted ids from `beam_search`.
  """
  history = features['history']
  query = features['query']

  is_training = (mode == tf.estimator.ModeKeys.TRAIN)
  batch_sz = tf.shape(query)[0]

  # Id 0 is padding: non-zero count gives the length, sign() gives a 0/1 mask.
  query_valid_len = tf.count_nonzero(query, 1, dtype=tf.int32)
  query_mask = tf.sign(query)

  # Fold the turn axis into the batch axis so each history turn is encoded as
  # its own sequence.
  num_history = tf.shape(history)[1]
  history_len = tf.shape(history)[2]
  history = tf.reshape(history, (num_history*batch_sz, history_len))
  history_mask = tf.sign(history)

  # Per example, all history turns laid end to end; these are the ids the
  # history pointer copies from.
  history_ = tf.reshape(history, (batch_sz, num_history*history_len))
  encoder_ids = (history_, query)
  
  
  with tf.variable_scope('Embedding'):
    # Embedding matrix initialised from a saved array and stored in a
    # tf.Variable.
    embedding = tf.Variable(np.load('../vocab/char.npy'),
                            dtype=tf.float32,
                            name='fasttext_vectors')
    def embed_fn(x):
      # Embedding lookup followed by dropout (active only when training).
      x = tf.nn.embedding_lookup(embedding, x)
      x = tf.layers.dropout(x, params['dropout_rate'], training=is_training)
      return x
    query = embed_fn(query)
    history = embed_fn(history)
  
  
  with tf.variable_scope('Encoder'):
    # One bidirectional GRU is shared between the query and the history turns.
    encoder = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(
      params['hidden_units'], return_state=True, return_sequences=True, zero_output_for_mask=True))

    query_out, query_state = bigru_encode(encoder, query, query_mask)
    history_out, history_state = bigru_encode(encoder, history, history_mask)
    # Undo the turn folding: concatenate each example's turns along time.
    history_out = tf.reshape(history_out, (batch_sz, num_history*history_len, 2*params['hidden_units']))

    query_out = tf.layers.dropout(query_out, params['dropout_rate'], training=is_training)
    history_out = tf.layers.dropout(history_out, params['dropout_rate'], training=is_training)

    # Project the bidirectional outputs (2*hidden_units) down to hidden_units.
    h = tf.layers.dense(history_out, params['hidden_units'], params['activation'], name='fc_encode_history')
    q = tf.layers.dense(query_out, params['hidden_units'], params['activation'], name='fc_encode_query')
    history_mask = tf.reshape(history_mask, (batch_sz, num_history*history_len))

    # Soft-aligned views: history summarised per query position and vice versa.
    h_ = tf.layers.dropout(h, params['dropout_rate'], training=is_training)
    q_ = tf.layers.dropout(q, params['dropout_rate'], training=is_training)
    h_, q_ = soft_align_attention(h_, q_, history_mask, query_mask)

    encoder_out = (h, q, h_, q_)
    # Only the query's final state seeds the decoder; history_state is unused.
    encoder_state = tf.layers.dense(query_state, params['hidden_units'], params['activation'], name='fc_encode_state')
    # No length is tracked for the concatenated history, hence None.
    enc_len = (None, query_valid_len)


  with tf.variable_scope('Decoder'):
    # TRAIN and EVAL both use teacher forcing; any other mode uses beam search.
    if is_training or (mode == tf.estimator.ModeKeys.EVAL):
        return teach_forcing(labels, embedding, encoder_out, enc_len, encoder_state, batch_sz, params, is_training, encoder_ids)
    else:
        return beam_search(embedding, encoder_out, enc_len, encoder_state, batch_sz, params, encoder_ids)

In [0]:
def clr(step,
        initial_learning_rate,
        maximal_learning_rate,
        step_size,
        scale_fn,
        scale_mode,):
  """Cyclical learning rate as a tensor function of the training step.

  The rate moves linearly from `initial_learning_rate` up to
  `maximal_learning_rate` and back over `2 * step_size` steps; the amplitude
  of each triangle is multiplied by `scale_fn`.

  Args:
    step: current global step.
    initial_learning_rate: lower bound of the cycle.
    maximal_learning_rate: upper bound of the (unscaled) cycle.
    step_size: number of steps in half a cycle.
    scale_fn: callable giving the amplitude multiplier.
    scale_mode: 'cycle' to feed `scale_fn` the 1-based cycle number; any other
      value feeds it the step.

  Returns:
    Scalar learning-rate tensor.
  """
  step_f = tf.cast(step, tf.float32)

  base_lr = tf.convert_to_tensor(
    initial_learning_rate, name='initial_learning_rate')
  lr_dtype = base_lr.dtype
  peak_lr = tf.cast(maximal_learning_rate, lr_dtype)
  half_cycle = tf.cast(step_size, lr_dtype)

  # 1-based index of the current cycle and the normalised distance from the
  # cycle's peak (0 at the peak, 1 at either end).
  cycle = tf.floor(1 + step_f / (2 * half_cycle))
  dist_from_peak = tf.abs(step_f / half_cycle - 2 * cycle + 1)

  if scale_mode == 'cycle':
    scale_arg = cycle
  else:
    scale_arg = step_f

  triangle = tf.maximum(tf.cast(0, lr_dtype), (1 - dist_from_peak))
  return base_lr + (peak_lr - base_lr) * triangle * scale_fn(scale_arg)


def cross_entropy_loss(logits, labels, vocab_size, smoothing):
  """Label-smoothed cross entropy averaged over non-padding tokens.

  Despite its name, `logits` is treated as probabilities: the log is taken
  directly (after adding 1e-6 and capping at 1), with no softmax applied here.

  Args:
    logits: per-token distributions over the vocabulary.
    labels: target ids; id 0 is padding and is excluded from the loss.
    vocab_size: number of classes.
    smoothing: label-smoothing mass spread uniformly over all classes.

  Returns:
    Scalar loss: summed token losses divided by the number of non-pad tokens.
  """
  one_hot = tf.one_hot(tf.cast(labels, tf.int32), depth=vocab_size)
  smoothed_targets = ((1-smoothing) * one_hot) + (smoothing / vocab_size)

  # Small epsilon keeps log() finite for zero-probability entries; the cap
  # keeps the shifted value a valid probability.
  probs = tf.minimum(1., logits + 1e-6)
  token_loss = - tf.reduce_sum(smoothed_targets * tf.log(probs), axis=-1)

  non_pad = tf.to_float(tf.not_equal(labels, 0))
  token_loss *= non_pad
  return tf.reduce_sum(token_loss) / tf.reduce_sum(non_pad)

In [0]:
def model_fn(features, labels, mode, params):
  """Estimator model function.

  PREDICT returns the beam-search ids; EVAL returns the loss; TRAIN returns
  the loss plus a train op using Adam, global-norm gradient clipping and a
  cyclical learning rate (logged every 100 iterations).

  Args:
    features: input feature dict passed to `forward`.
    labels: `(dec_inputs, dec_outputs)`; unused for PREDICT.
    mode: a `tf.estimator.ModeKeys` value.
    params: hyper-parameter dict.

  Returns:
    A `tf.estimator.EstimatorSpec` for the given mode.
  """
  model_out = forward(features, labels, mode)

  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(mode, predictions=model_out)

  _, dec_outputs = labels
  # Vocabulary plus the extra unknown-character id.
  num_classes = len(params['char2idx'])+1
  loss_op = cross_entropy_loss(
    model_out, dec_outputs, num_classes, params['label_smoothing'])

  if mode == tf.estimator.ModeKeys.EVAL:
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss_op)

  if mode == tf.estimator.ModeKeys.TRAIN:
    global_step = tf.train.get_or_create_global_step()

    # Half a cycle spans 8 * buffer_size // batch_size steps; the amplitude
    # halves with every completed cycle.
    decay_lr = clr(
      step=global_step,
      initial_learning_rate=1e-4,
      maximal_learning_rate=8e-4,
      step_size=8 * params['buffer_size'] // params['batch_size'],
      scale_fn=lambda x: 1 / (2.0 ** (x - 1)),
      scale_mode='cycle',)

    optimizer = tf.train.AdamOptimizer(decay_lr)
    train_op = optimizer.apply_gradients(
      clip_grads(loss_op), global_step=global_step)

    lr_logger = tf.train.LoggingTensorHook({'lr': decay_lr}, every_n_iter=100)

    return tf.estimator.EstimatorSpec(
      mode=mode, loss=loss_op, train_op=train_op, training_hooks=[lr_logger],)

In [0]:
def minimal_test(estimator):
  """Run one hard-coded example through the estimator and print the result.

  The example consists of two history turns and a query separated by '|'.
  It is encoded with the module-level `params['char2idx']`, and the prediction
  is decoded with `params['idx2char']`, dropping ids 0 and 2.

  Args:
    estimator: a trained `tf.estimator.Estimator`.
  """
  test_str = '成都房价是多少|不买就后悔了成都房价还有上涨空间|买不起'

  def encode(text):
    # Unknown characters map to the id just past the vocabulary.
    return [params['char2idx'].get(c, len(params['char2idx'])) for c in text]

  h1, h2, q = [encode(part) for part in test_str.split('|')]
  q = q + [2]
  align_pad([h1, h2])

  # Shape the single example as a batch of one.
  input_fn = tf.estimator.inputs.numpy_input_fn(
    x={'history': np.reshape([h1, h2], (1, 2, len(h1))),
       'query': np.reshape(q, (1, len(q)))},
    shuffle=False)
  first_prediction = list(estimator.predict(input_fn))[0]

  chars = [params['idx2char'].get(idx, '<unk>')
           for idx in first_prediction
           if not (idx == 0 or idx == 2)]
  predicted = ''.join(chars)

  rule = '-'*12
  print(rule)
  print('minimal test')
  print('Q:', test_str)
  print('A:', predicted)
  print(rule)

In [0]:
# Hyper-parameters and paths shared by the input pipeline, model and training loop.
params = {
  'model_dir': '../model/pointer_gru_clr_multi_attn',  # estimator checkpoint directory
  'log_path': '../log/pointer_gru_clr_multi_attn.txt',  # file the 'tensorflow' logger also writes to
  'export_dir': '../model/pointer_gru_attn_export',  # destination for exported SavedModels
  'train_path': ['../data/train_pos.txt', '../data/train_neg.txt'],  # files read for training
  'test_path': ['../data/test_pos.txt'],  # files read for evaluation
  'vocab_path': '../vocab/char.txt',  # one character per line; line number is the id
  'max_len': 30,  # maximum number of beam-search decoding steps
  'activation': tf.nn.elu,  # activation of the encoder projection layers
  'dropout_rate': .2,
  'hidden_units': 300,
  'dec_layers': 1,  # number of stacked decoder GRU layers
  'num_hops': 3,  # NOTE(review): not referenced by the code visible in this notebook
  'gating_fn': tf.sigmoid,  # NOTE(review): not referenced by the code visible in this notebook
  'beam_width': 10,
  'clip_norm': .1,  # global-norm threshold for gradient clipping
  'buffer_size': 18986 * 2,  # shuffle buffer; also drives LR cycle length and checkpoint interval
  'batch_size': 32,
  'num_patience': 10,  # evaluations without a new best EM before training stops
  'label_smoothing': .1,
}

In [0]:
# Load the character vocabulary and build the inverse (id -> character) map
# used to turn predictions and labels back into text.
params['char2idx'] = get_vocab(params['vocab_path'])
params['idx2char'] = {idx: char for char, idx in params['char2idx'].items()}

def serving_input_receiver_fn():
  """Declare the placeholders a served model receives.

  The same int32 placeholders act both as the tensors fed by the serving
  request and as the features passed to the model: 'query' is rank 2 and
  'history' is rank 3, with all dimensions left unspecified.

  Returns:
    A `tf.estimator.export.ServingInputReceiver`.
  """
  features = {
    'query': tf.placeholder(tf.int32, [None, None], 'query'),
    'history': tf.placeholder(tf.int32, [None, None, None], 'history'),
  }
  # Requests supply the feature tensors directly, so the receiver tensors are
  # the features themselves.
  return tf.estimator.export.ServingInputReceiver(features, features)

In [17]:
# Create directories if they do not exist. parents=True on both so that a
# missing intermediate directory does not raise FileNotFoundError.
Path(os.path.dirname(params['log_path'])).mkdir(exist_ok=True, parents=True)
Path(params['model_dir']).mkdir(exist_ok=True, parents=True)

# Logging: mirror TensorFlow's INFO log into the log file.
logger = logging.getLogger('tensorflow')
logger.setLevel(logging.INFO)
fh = logging.FileHandler(params['log_path'])
logger.addHandler(fh)

# Create an estimator that checkpoints roughly once per pass over the data
# (buffer_size // batch_size + 1 steps) and keeps the three latest checkpoints.
estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  model_dir=params['model_dir'],
  config=tf.estimator.RunConfig(
    save_checkpoints_steps=params['buffer_size']//params['batch_size']+1,
    keep_checkpoint_max=3),
  params=params)

best_em = 0.
count = 0
# Eager mode lets the evaluation code below iterate the test dataset directly
# in Python to collect the reference labels.
tf.enable_eager_execution()

while True:
  # The training dataset is not repeated, so each call trains for one pass.
  estimator.train(input_fn=lambda: dataset(is_training=True, params=params))

  minimal_test(estimator)

  # Reference rewrites: decoder-output ids from the test set, flattened from
  # batches to one array per example, then mapped to characters (padding id 0
  # and end id 2 dropped).
  labels = [label for _, (_, label) in dataset(is_training=False, params=params)]
  labels = [j for i in labels for j in i.numpy()]
  labels = [[params['idx2char'].get(idx, '<unk>') for idx in arr if (idx!=0 and idx!=2)] for arr in labels]

  # Model predictions on the same (unshuffled) test set.
  preds = list(estimator.predict(input_fn=lambda: dataset(is_training=False, params=params)))
  assert len(labels) == len(preds)
  preds = [[params['idx2char'].get(idx, '<unk>') for idx in arr if (idx!=0 and idx!=2)] for arr in preds]

  # Exact match: fraction of predictions identical to their reference.
  em = [np.array_equal(p, l) for p, l in zip(preds, labels)]
  em = np.asarray(em).mean()

  bleu, bleu_1, bleu_2, bleu_3, bleu_4 = tx.evals.corpus_bleu_moses(list_of_references=[[l] for l in labels], hypotheses=preds, return_all=True)
  # Label fixed from the misspelled "BELU-1"; BLEU-3 is computed but not logged.
  logger.info("BLEU: {:.3f}, BLEU-1: {:.3f}, BLEU-2: {:.3f}, BLEU-4: {:.3f}, EM: {:.3f}".format(bleu, bleu_1, bleu_2, bleu_4, em))

  if em > best_em:
    best_em = em
    count = 0
    # Only export models that reach the EM threshold.
    if em >= .58:
      estimator.export_saved_model(params['export_dir'], serving_input_receiver_fn)
  else:
    count += 1
  logger.info("Best EM: {:.3f}".format(best_em))

  # Early stopping after num_patience evaluations without a new best EM.
  if count == params['num_patience']:
    print(params['num_patience'], "times not improve the best result, therefore stop training")
    break

INFO:tensorflow:Using config: {'_model_dir': '../model/pointer_gru_clr_multi_attn', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1187, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 3, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f353fb44240>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for updating:
Use Variable.read_value. Variables in 2.X are i