In [0]:
# Colab-only setup: mount Google Drive so the project files stored there are reachable.
from google.colab import drive
drive.mount('/content/gdrive')
import os
# Every later cell uses paths relative to this directory (e.g. '../data/train.tsv',
# '../vocab/word.npy'), so the working directory must be set before anything else runs.
# NOTE(review): absolute, user-specific path; adjust when running outside this Drive layout.
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/semantic_parsing/tree_slu/main')

In [2]:
# Select the TensorFlow 2.x runtime on Colab and install TensorFlow Addons (imported below as `tfa`).
# NOTE(review): the install is unpinned; the run recorded in this notebook printed TensorFlow 2.2.0.
%tensorflow_version 2.x
!pip install tensorflow-addons



In [3]:
from tensorflow_addons.optimizers.cyclical_learning_rate import Triangular2CyclicalLearningRate

import tensorflow as tf
import tensorflow_addons as tfa

import numpy as np
import pprint
import logging
import time
import nltk

print("TensorFlow Version", tf.__version__)
# tf.test.is_gpu_available() is deprecated; TensorFlow's own deprecation notice
# points to tf.config.list_physical_devices('GPU'), which returns a (possibly empty) list.
print('GPU Enabled:', bool(tf.config.list_physical_devices('GPU')))

TensorFlow Version 2.2.0
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU Enabled: True


In [0]:
# stream data from text files
def data_generator(f_path, params):
  """Stream (source, target_in, target_out) id sequences from a TSV file.

  Every non-blank line must contain exactly three tab-separated fields: the
  raw text (ignored), the tokenized text, and the bracketed parse label.
  Both the tokenized text and the label are lower-cased, split on whitespace
  and mapped through the same vocabulary, ``params['tgt2idx']``. Each ``[`` in
  the label is first separated from the token that follows it so that it
  becomes a token of its own.

  Args:
    f_path: path of the TSV file to read (decoded as UTF-8).
    params: dict providing ``'tgt2idx'``, a token -> id mapping.

  Yields:
    Tuples ``(source, target_in, target_out)`` of lists of ints, where
    ``target_in`` is the label ids prefixed with 1 and ``target_out`` is the
    label ids followed by 2.

  Raises:
    ValueError: if a non-blank line does not have exactly three fields.
  """
  tgt2idx = params['tgt2idx']
  # Tokens missing from the vocabulary all map to the id one past the last entry.
  unk_idx = len(tgt2idx)
  with open(f_path, encoding='utf-8') as f:
    print('Reading', f_path)
    for line in f:
      # Blank lines (e.g. a trailing newline at end of file) carry no example.
      if not line.strip():
        continue
      _, text_tokenized, label = line.split('\t')
      source = [tgt2idx.get(w, unk_idx) for w in text_tokenized.lower().split()]
      target = [tgt2idx.get(w, unk_idx)
                for w in label.replace('[', '[ ').lower().split()]
      yield (source, [1] + target, target + [2])

In [0]:
def dataset(is_training, params):
  """Build the padded, batched, prefetched tf.data pipeline for one split.

  Args:
    is_training: when true, read ``params['train_path']``, shuffle with a
      buffer of ``params['buffer_size']`` and batch by
      ``params['train_batch_size']``; otherwise read ``params['test_path']``
      unshuffled and batch by ``params['eval_batch_size']``.
    params: configuration dict (also forwarded to ``data_generator``).

  Returns:
    A ``tf.data.Dataset`` of (source, target_in, target_out) int32 batches,
    each padded with 0 to the longest sequence in the batch.
  """
  seq_shapes = ([None], [None], [None])
  seq_types = (tf.int32, tf.int32, tf.int32)
  pad_values = (0, 0, 0)

  # Only the file, the batch size and the shuffle step differ between splits.
  if is_training:
    path_key, batch_key = 'train_path', 'train_batch_size'
  else:
    path_key, batch_key = 'test_path', 'eval_batch_size'

  ds = tf.data.Dataset.from_generator(
    lambda: data_generator(params[path_key], params),
    output_shapes=seq_shapes,
    output_types=seq_types)
  if is_training:
    ds = ds.shuffle(params['buffer_size'])
  ds = ds.padded_batch(params[batch_key], seq_shapes, pad_values)
  return ds.prefetch(tf.data.experimental.AUTOTUNE)

In [0]:
class Embed(tf.keras.Model):
  """Embedding lookup backed by the matrix stored in '../vocab/word.npy'."""

  def __init__(self):
    super().__init__()
    # The matrix is loaded once and wrapped in a float32 variable.
    pretrained = np.load('../vocab/word.npy')
    self.embedding = tf.Variable(
      pretrained, dtype=tf.float32, name='pretrained_embedding')

  def call(self, inputs):
    """Return the embedding rows selected by `inputs` (cast to int32 if needed)."""
    ids = inputs if inputs.dtype == tf.int32 else tf.cast(inputs, tf.int32)
    return tf.nn.embedding_lookup(self.embedding, ids)

In [0]:
class Encoder(tf.keras.Model):
  """Bidirectional GRU encoder with a dense layer that merges the final states."""

  def __init__(self, params):
    """Create the layers from `params` ('dropout_rate', 'rnn_units', 'activation')."""
    super().__init__()
    units = params['rnn_units']
    self.dropout = tf.keras.layers.Dropout(params['dropout_rate'])
    # Attribute is called `bilstm` but wraps a GRU; the name is kept as-is
    # because attribute names are part of the saved checkpoint structure.
    gru = tf.keras.layers.GRU(
      units, return_state=True, return_sequences=True, zero_output_for_mask=True)
    self.bilstm = tf.keras.layers.Bidirectional(gru)
    self.state_fc = tf.keras.layers.Dense(units, params['activation'], name='state_fc')

  def call(self, inputs, mask, training):
    """Encode `inputs` under `mask`.

    Args:
      inputs: embedded input sequence.
      mask: per-position mask; cast to bool when it has another dtype.
      training: forwarded to the dropout layer.

    Returns:
      (encoder_outputs, (merged_state,)) where merged_state is the dense
      projection of the concatenated forward and backward final states.
    """
    bool_mask = mask if mask.dtype == tf.bool else tf.cast(mask, tf.bool)
    dropped = self.dropout(inputs, training=training)

    outputs, fw_state, bw_state = self.bilstm(dropped, mask=bool_mask)
    merged_state = self.state_fc(tf.concat((fw_state, bw_state), -1))

    return outputs, (merged_state,)

In [0]:
class TiedDense(tf.keras.layers.Layer):
  """Output projection whose kernel is an existing embedding matrix.

  The layer owns only a bias vector; the projection weights are `tied_embed`
  (used transposed), so the caller keeps ownership of that variable.
  """

  def __init__(self, tied_embed, out_dim):
    """
    Args:
      tied_embed: matrix multiplied (transposed) with the inputs in `call`.
      out_dim: length of the bias vector and of the output's last dimension.
    """
    super().__init__()
    self.tied_embed = tied_embed
    self.out_dim = out_dim

  def build(self, input_shape):
    # No initializer is given, so add_weight's default is used.
    self.bias = self.add_weight(name='bias',
                                shape=[self.out_dim],
                                trainable=True)
    super().build(input_shape)

  def call(self, inputs):
    """Return `inputs @ tied_embed^T + bias`."""
    x = tf.matmul(inputs, self.tied_embed, transpose_b=True)
    x = tf.nn.bias_add(x, self.bias)
    return x

  def compute_output_shape(self, input_shape):
    """Return `input_shape` with its last dimension replaced by `out_dim`."""
    # Keras may pass a plain tuple/list here; normalise it so slicing and
    # concatenate work for every accepted shape representation.
    input_shape = tf.TensorShape(input_shape)
    return input_shape[:-1].concatenate([self.out_dim])

In [0]:
class Model(tf.keras.Model):
  """GRU encoder-decoder with Bahdanau attention and tied output embeddings.

  Training uses teacher forcing through a BasicDecoder; inference uses a
  BeamSearchDecoder. Both decoders share the same attention-wrapped cell and
  the same TiedDense projection.
  """

  def __init__(self, params):
    """
    Args:
      params: configuration dict; reads 'dropout_rate', 'rnn_units',
        'beam_width' and 'tgt2idx' here (Encoder reads further keys).
    """
    super().__init__()
    # Stored on the instance so call() does not depend on a module-level
    # `params` variable happening to exist in the notebook namespace.
    self.beam_width = params['beam_width']

    self.embed = Embed()

    self.encoder = Encoder(params)

    self.dropout = tf.keras.layers.Dropout(params['dropout_rate'])

    self.attn = tfa.seq2seq.BahdanauAttention(params['rnn_units'])

    self.decoder_cell = tfa.seq2seq.AttentionWrapper(
      tf.keras.layers.StackedRNNCells([tf.keras.layers.GRUCell(params['rnn_units'])]),
      self.attn,
      attention_layer_size=params['rnn_units'])

    # One extra output class beyond the vocabulary: the id used for unknown tokens.
    self.proj_layer = TiedDense(self.embed.embedding, len(params['tgt2idx'])+1)

    self.teach_forcing = tfa.seq2seq.BasicDecoder(
      self.decoder_cell,
      tfa.seq2seq.sampler.TrainingSampler(),
      output_layer = self.proj_layer)

    self.beam_search = tfa.seq2seq.BeamSearchDecoder(
      self.decoder_cell,
      beam_width = self.beam_width,
      embedding_fn = lambda x: self.embed(x),
      output_layer = self.proj_layer,
      maximum_iterations = 80,)


  def call(self, inputs, training=True):
    """Run teacher-forced decoding (training) or beam search (inference).

    Args:
      inputs: a `(source, target_in)` pair when `training` is true, otherwise
        just `source`. Id 0 is treated as padding when computing lengths.
      training: selects the decoding path and enables dropout.

    Returns:
      When training, the decoder's `rnn_output` (projected logits). Otherwise
      the predicted ids of beam index 0, i.e. `predicted_ids[:, :, 0]`.
    """
    if training:
      source, target_in = inputs
    else:
      source = inputs
    batch_sz = tf.shape(source)[0]

    encoder_o, encoder_s = self.encoder(self.embed(source), mask=tf.sign(source), training=training)

    if training:
      self.attn([encoder_o, tf.math.count_nonzero(source, 1)], setup_memory=True)
      attn_state = self.decoder_cell.get_initial_state(batch_size=batch_sz, dtype=tf.float32)
      attn_state = attn_state.clone(cell_state=encoder_s)

      decoder_o, _, _ = self.teach_forcing(
        inputs = self.dropout(self.embed(target_in), training=training),
        initial_state = attn_state,
        sequence_length = tf.math.count_nonzero(target_in, 1, dtype=tf.int32))

      logits_or_ids = decoder_o.rnn_output
    else:
      # Beam search works on batch_size * beam_width rows, so the encoder
      # outputs, lengths and state are tiled once per beam.
      encoder_o_t = tfa.seq2seq.tile_batch(encoder_o, self.beam_width)
      encoder_len_t = tfa.seq2seq.tile_batch(tf.math.count_nonzero(source, 1), self.beam_width)
      encoder_s_t = tfa.seq2seq.tile_batch(encoder_s, self.beam_width)

      self.attn([encoder_o_t, encoder_len_t], setup_memory=True)
      attn_state = self.decoder_cell.get_initial_state(batch_size=batch_sz*self.beam_width, dtype=tf.float32)
      attn_state = attn_state.clone(cell_state=encoder_s_t)

      # Decoding starts from id 1 and stops at id 2.
      decoder_o, _, _ = self.beam_search(
        None,
        start_tokens = tf.tile(tf.constant([1], tf.int32), [batch_sz]),
        end_token = 2,
        initial_state = attn_state,)

      logits_or_ids = decoder_o.predicted_ids[:, :, 0]

    return logits_or_ids

In [0]:
def get_vocab(f_path):
  """Load a vocabulary file into a token -> line-index mapping.

  Args:
    f_path: path of a text file with one token per line (decoded as UTF-8).

  Returns:
    dict mapping each line, with trailing whitespace removed, to its
    zero-based line number. If a token repeats, the last occurrence wins.
  """
  # Decode explicitly as UTF-8 so the ids do not depend on the platform locale.
  with open(f_path, encoding='utf-8') as f:
    return {line.rstrip(): i for i, line in enumerate(f)}

In [0]:
# Central configuration: file locations and hyperparameters read by the cells in this notebook.
params = {
    'train_path': '../data/train.tsv',       # read by dataset(is_training=True)
    'test_path': '../data/test.tsv',         # read by dataset(is_training=False)
    'vocab_src_path': '../vocab/source.txt', # NOTE(review): not referenced by the visible cells
    'vocab_tgt_path': '../vocab/target.txt', # vocabulary loaded into params['tgt2idx']
    'model_path': '../model/',               # checkpoint directory
    'dropout_rate': .2,                      # dropout in the encoder and on decoder inputs
    'rnn_units': 300,                        # GRU, attention and state-projection width
    'embed_dim': 300,                        # NOTE(review): not referenced by the visible cells
    'activation': tf.nn.elu,                 # activation of the encoder's state_fc layer
    'beam_width': 10,                        # beams kept during inference
    'init_lr': 1e-4,                         # lower bound of the cyclical learning rate
    'max_lr': 8e-4,                          # upper bound of the cyclical learning rate
    'clip_norm': .1,                         # global-norm gradient clipping threshold
    'buffer_size': 31279,                    # shuffle buffer; presumably the training-set size (TODO confirm)
    'train_batch_size': 32,
    'eval_batch_size': 128,
    'num_patience': 10,                      # evaluations without improvement before stopping
}

In [0]:
# Token -> id mapping; data_generator uses it for both utterance tokens and label tokens.
params['tgt2idx'] = get_vocab(params['vocab_tgt_path'])
# Inverse mapping (id -> token), used to turn predicted ids back into text.
params['idx2tgt'] = {idx: tgt for tgt, idx in params['tgt2idx'].items()}

In [13]:
model = Model(params)
# Build with two rank-2 inputs of unknown size, matching the (source, target_in)
# pair that Model.call expects when training.
model.build(input_shape=[[None, None], [None, None]])
# List every trainable variable with its shape as a quick sanity check.
pprint.pprint([(v.name, v.shape) for v in model.trainable_variables])

[('pretrained_embedding:0', TensorShape([8692, 300])),
 ('encoder/bidirectional/forward_gru/gru_cell_1/kernel:0',
  TensorShape([300, 900])),
 ('encoder/bidirectional/forward_gru/gru_cell_1/recurrent_kernel:0',
  TensorShape([300, 900])),
 ('encoder/bidirectional/forward_gru/gru_cell_1/bias:0', TensorShape([2, 900])),
 ('encoder/bidirectional/backward_gru/gru_cell_2/kernel:0',
  TensorShape([300, 900])),
 ('encoder/bidirectional/backward_gru/gru_cell_2/recurrent_kernel:0',
  TensorShape([300, 900])),
 ('encoder/bidirectional/backward_gru/gru_cell_2/bias:0',
  TensorShape([2, 900])),
 ('encoder/state_fc/kernel:0', TensorShape([600, 300])),
 ('encoder/state_fc/bias:0', TensorShape([300])),
 ('BahdanauAttention/attention_v:0', TensorShape([300])),
 ('attention_wrapper/BahdanauAttention/kernel:0', TensorShape([300, 300])),
 ('BahdanauAttention/kernel:0', TensorShape([600, 300])),
 ('attention_wrapper/attention_layer/kernel:0', TensorShape([900, 300])),
 ('attention_wrapper/stacked_rnn_cell

In [0]:
# Cyclical learning-rate schedule bounded by init_lr and max_lr.
# step_size is derived from buffer_size and the training batch size:
# 4 * 31279 // 32 = 3909 optimizer steps with the values above.
decay_lr = Triangular2CyclicalLearningRate(
  initial_learning_rate = params['init_lr'],
  maximal_learning_rate = params['max_lr'],
  step_size = 4*params['buffer_size']//params['train_batch_size'],)
# The optimizer starts at init_lr; the training loop overwrites optim.lr each
# step with decay_lr(global_step) rather than passing the schedule to Adam.
optim = tf.optimizers.Adam(params['init_lr'])
# Number of optimizer steps taken so far; incremented by the training loop.
global_step = 0

In [0]:
# Early-stopping state used by the training loop.
best_acc = .0  # best exact-match accuracy seen on the test set so far
count = 0      # consecutive evaluations that did not improve best_acc

In [0]:
# Reference time for the "Spent" figure logged during training; reset after each log line.
t0 = time.time()
logger = logging.getLogger('tensorflow')
# Do not pass records on to ancestor loggers' handlers.
logger.propagate = False
logger.setLevel(logging.INFO)

In [0]:
def minimal_test(model, params):
  """Parse one fixed utterance with `model` and print the result.

  The decoded tokens are printed as text and, when they form a well-bracketed
  tree, also rendered with nltk. This is a smoke test: a rendering failure is
  reported but never raised.

  Args:
    model: callable taking `inputs=` (a [1, length] id tensor) and
      `training=False`, returning a batch of predicted id sequences.
    params: dict providing 'tgt2idx' (token -> id) and 'idx2tgt' (id -> token).
  """
  test_str = ['what', 'times', 'are', 'the', 'nutcracker', 'show', 'playing', 'near', 'me']
  tgt2idx = params['tgt2idx']
  idx2tgt = params['idx2tgt']
  # Same out-of-vocabulary convention as data_generator: unknown words map to
  # the id one past the vocabulary instead of raising KeyError.
  unk_idx = len(tgt2idx)
  test_arr = tf.convert_to_tensor([[tgt2idx.get(w, unk_idx) for w in test_str]])
  generated = model(inputs=test_arr, training=False)

  print('-'*12)
  print('minimal test')
  print('utterance:', ' '.join(test_str))
  # Skip padding (0) and end (2) ids; the model can also emit the unknown id,
  # which has no entry in idx2tgt, so fall back to a placeholder for it.
  parsed = ' '.join([idx2tgt.get(idx, '<unk>')
                     for idx in generated[0].numpy() if (idx != 0 and idx != 2)])
  print('parsed:', parsed)
  print()
  try:
    nltk.tree.Tree.fromstring(parsed.replace('[ ', '(').replace(' ]', ')')).pretty_print()
  except Exception as e:
    # Early in training the output is often not a valid tree; keep going but
    # say why nothing was drawn instead of failing silently.
    print('could not render parse tree:', e)
  print('-'*12)

In [0]:
def parse_fn(x):
  """Drop padding (0) and end-of-sequence (2) ids so sequences can be compared."""
  return [e for e in x if (e != 0 and e != 2)]


# Alternate one pass over the training data with one evaluation pass until the
# exact-match accuracy has failed to improve params['num_patience'] times in a row.
while True:
  # TRAINING
  is_training = True
  for source, target_in, target_out in dataset(is_training=is_training, params=params):
    with tf.GradientTape() as tape:
      logits_or_ids = model((source, target_in), training=is_training)

      # Padding positions (id 0) get weight 0 and do not contribute to the loss.
      loss = tf.compat.v1.losses.softmax_cross_entropy(
        onehot_labels = tf.one_hot(target_out, len(params['tgt2idx'])+1),
        logits = logits_or_ids,
        weights = tf.cast(tf.sign(target_out), tf.float32),
        label_smoothing = .2)

    variables = model.trainable_variables
    # The cyclical schedule is applied by hand: set this step's learning rate
    # before the update.
    optim.lr.assign(decay_lr(global_step))
    grads = tape.gradient(loss, variables)
    grads, _ = tf.clip_by_global_norm(grads, params['clip_norm'])
    optim.apply_gradients(zip(grads, variables))

    if global_step % 50 == 0:
      logger.info("Step {} | Loss: {:.4f} | Spent: {:.1f} secs | LR: {:.6f}".format(
        global_step, loss.numpy().item(), time.time()-t0, optim.lr.numpy().item()))
      t0 = time.time()

    global_step += 1

  # EVALUATION
  is_training = False
  minimal_test(model, params)
  m = tf.keras.metrics.Mean()

  for source, _, target_out in dataset(is_training=is_training, params=params):
    generated = model(inputs=source, training=is_training)
    for pred, tgt in zip(generated.numpy(), target_out.numpy()):
      # Exact match: the whole predicted sequence must equal the reference.
      matched = np.array_equal(parse_fn(pred), parse_fn(tgt))
      m.update_state(int(matched))

  acc = m.result().numpy()
  logger.info("Evaluation: Testing EM: {:.3f}".format(acc))

  if acc > best_acc:
    best_acc = acc
    count = 0
    # Keep only the best weights, under the configured checkpoint directory.
    model.save_weights(os.path.join(params['model_path'], 'gru_seq2seq_clr'))
  else:
    count += 1
  logger.info("Best EM: {:.3f}".format(best_acc))

  # `>=` so the loop still terminates if `count` starts above the patience
  # (e.g. the cell is re-run with a smaller num_patience).
  if count >= params['num_patience']:
    print(params['num_patience'], "times not improve the best result, therefore stop training")
    break

Reading ../data/train.tsv
INFO:tensorflow:Step 0 | Loss: 10.0613 | Spent: 25.4 secs | LR: 0.000100
INFO:tensorflow:Step 50 | Loss: 6.1059 | Spent: 24.3 secs | LR: 0.000109
INFO:tensorflow:Step 100 | Loss: 5.1683 | Spent: 24.2 secs | LR: 0.000118
INFO:tensorflow:Step 150 | Loss: 5.0629 | Spent: 24.8 secs | LR: 0.000127
INFO:tensorflow:Step 200 | Loss: 4.5041 | Spent: 25.0 secs | LR: 0.000136
INFO:tensorflow:Step 250 | Loss: 4.4831 | Spent: 23.6 secs | LR: 0.000145
INFO:tensorflow:Step 300 | Loss: 4.2703 | Spent: 24.2 secs | LR: 0.000154
INFO:tensorflow:Step 350 | Loss: 4.2365 | Spent: 23.5 secs | LR: 0.000163
INFO:tensorflow:Step 400 | Loss: 3.8806 | Spent: 23.2 secs | LR: 0.000172
INFO:tensorflow:Step 450 | Loss: 3.7415 | Spent: 23.7 secs | LR: 0.000181
INFO:tensorflow:Step 500 | Loss: 3.6758 | Spent: 23.6 secs | LR: 0.000190
INFO:tensorflow:Step 550 | Loss: 3.5014 | Spent: 23.9 secs | LR: 0.000198
INFO:tensorflow:Step 600 | Loss: 3.4392 | Spent: 24.5 secs | LR: 0.000207
INFO:tensorflo