In [1]:
import os

from google.colab import drive

# Mount Google Drive inside the Colab VM, then make the project's `main`
# folder the working directory so the relative `../data` and `../vocab`
# paths used later in the notebook resolve.
drive.mount('/content/gdrive')
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/spoken_language_understanding/atis/main')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# Colab-only setup: select the TensorFlow 2.x runtime, then install
# TensorFlow Addons (the next cell imports its cyclical learning-rate schedule).
# NOTE(review): the package version is not pinned, so a fresh run may pull a
# release that does not match the selected TensorFlow version - confirm.
%tensorflow_version 2.x
!pip install tensorflow-addons



In [3]:
from sklearn.metrics import classification_report, f1_score, accuracy_score
from tensorflow_addons.optimizers.cyclical_learning_rate import Triangular2CyclicalLearningRate

import tensorflow as tf
import pprint
import logging
import time
import numpy as np

print("TensorFlow Version", tf.__version__)
# `tf.test.is_gpu_available()` is deprecated (its own warning points to
# `tf.config.list_physical_devices('GPU')`); a non-empty device list means
# at least one GPU is visible to TensorFlow.
print('GPU Enabled:', bool(tf.config.list_physical_devices('GPU')))

TensorFlow Version 2.2.0
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU Enabled: True


In [0]:
def get_vocab(vocab_path):
  """Build a token -> index lookup from a one-entry-per-line vocabulary file.

  Args:
    vocab_path: path of a text file holding one vocabulary entry per line.

  Returns:
    dict mapping every line (trailing whitespace removed) to its 0-based
    line number. If an entry appears more than once, the last line wins.
  """
  with open(vocab_path) as vocab_file:
    return {entry.rstrip(): idx for idx, entry in enumerate(vocab_file)}

In [0]:
def data_generator(f_path, params):
  """Yield index-encoded examples read from a tab-separated annotation file.

  Each non-blank line must have the form ``<text>\\t<labels>``:
    * ``<text>`` is whitespace-tokenised and its first and last tokens are
      dropped (presumably sentence-boundary markers - confirm against the data).
    * ``<labels>`` is whitespace-tokenised; the last token is the intent, and
      the tokens between the first and the last are the per-word slot tags.

  Args:
    f_path: path of the file to read.
    params: dict providing the lookups 'word2idx', 'intent2idx' and 'slot2idx'.

  Yields:
    ``(word_ids, (intent_id, slot_ids))``. A symbol missing from its lookup
    is encoded as ``len(lookup)``, i.e. one past the largest known index.

  Raises:
    AssertionError: if a line has a different number of words and slot tags.
    ValueError: if a non-blank line does not contain exactly one tab.
  """
  print('Reading', f_path)
  word2idx = params['word2idx']
  intent2idx = params['intent2idx']
  slot2idx = params['slot2idx']
  # Out-of-vocabulary ids are loop-invariant, so compute them once.
  unk_word, unk_intent, unk_slot = len(word2idx), len(intent2idx), len(slot2idx)

  with open(f_path) as f:
    for line in f:
      line = line.rstrip()
      if not line:
        # Blank (e.g. trailing) lines carry no example; skipping them avoids
        # an unpacking error on the tab split below.
        continue
      text, slot_intent = line.split('\t')
      words = text.split()[1:-1]
      slot_intent = slot_intent.split()
      slots, intent = slot_intent[1:-1], slot_intent[-1]
      assert len(words) == len(slots)

      word_ids = [word2idx.get(w, unk_word) for w in words]
      intent_id = intent2idx.get(intent, unk_intent)
      slot_ids = [slot2idx.get(s, unk_slot) for s in slots]

      yield (word_ids, (intent_id, slot_ids))

In [0]:
def dataset(is_training, params):
  """Build the batched, padded `tf.data` pipeline for one pass over a split.

  Args:
    is_training: when true, read `params['train_path']` and shuffle with a
      buffer of `params['num_samples']`; otherwise read `params['test_path']`
      in file order.
    params: dict with the paths above, 'batch_size', 'num_samples' and the
      lookups consumed by `data_generator`.

  Returns:
    A prefetching dataset of `(words, (intent, slots))` int32 batches in which
    `words` and `slots` are padded with 0 to the longest sequence of the batch.
  """
  shapes = ([None], ((), [None]))
  dtypes = (tf.int32, (tf.int32, tf.int32))
  # The intent component is a scalar, so only the two sequences get padded.
  pad_values = (0, (-1, 0))

  path_key = 'train_path' if is_training else 'test_path'
  ds = tf.data.Dataset.from_generator(
    lambda: data_generator(params[path_key], params),
    output_shapes=shapes,
    output_types=dtypes)

  if is_training:
    ds = ds.shuffle(params['num_samples'])

  ds = ds.padded_batch(params['batch_size'], shapes, pad_values)
  return ds.prefetch(tf.data.experimental.AUTOTUNE)

In [0]:
class LayerNorm(tf.keras.layers.Layer):
  """Layer normalisation over the last axis with a learned scale and bias.

  The feature width is fixed to `2 * params['rnn_units']`.
  """

  def __init__(self, params):
    super().__init__()
    self._epsilon = 1e-6
    self._units = 2 * params['rnn_units']

  def build(self, input_shape):
    """Create the per-feature `scale` (ones) and `bias` (zeros) weights."""
    feature_shape = [self._units]
    self.scale = self.add_weight(
        name='scale',
        shape=feature_shape,
        initializer=tf.ones_initializer(),
        trainable=True)
    self.bias = self.add_weight(
        name='bias',
        shape=feature_shape,
        initializer=tf.zeros_initializer(),
        trainable=True)
    super().build(input_shape)

  def call(self, inputs):
    """Normalise `inputs` along the last axis, then apply scale and bias."""
    mean, variance = tf.nn.moments(inputs, [-1], keepdims=True)
    centered = inputs - mean
    inv_std = tf.math.rsqrt(variance + self._epsilon)
    return centered * inv_std * self.scale + self.bias

  def compute_output_shape(self, input_shape):
    """The output has the same shape as the input."""
    return input_shape


class EncoderBlock(tf.keras.Model):
  """Pre-norm residual wrapper: `inputs + dropout(sub_model(layer_norm(inputs)))`.

  Args:
    SubModel: class instantiated as `SubModel(params)`; its instance is called
      with an `(x, masks)` tuple and a `training` flag.
    params: hyper-parameter dict; 'dropout_rate' is read here, and the dict is
      forwarded to `LayerNorm` and `SubModel`.
    name: Keras name of the block.
  """

  def __init__(self, SubModel, params, name):
    super().__init__(name=name)
    self.layer_norm = LayerNorm(params)
    self.sub_model = SubModel(params)
    self.dropout = tf.keras.layers.Dropout(params['dropout_rate'])

  def call(self, inputs, training=False):
    """Apply the wrapped sub-model to `(x, masks)` and add the residual."""
    residual, masks = inputs
    hidden = self.layer_norm(residual)
    hidden = self.sub_model((hidden, masks), training=training)
    hidden = self.dropout(hidden, training=training)
    return hidden + residual


class MultiheadSelfAttention(tf.keras.Model):
  """Scaled dot-product self-attention with `params['num_heads']` heads.

  Queries, keys and values come from one fused projection of width
  `3 * 2 * params['rnn_units']`; the output projection has width
  `2 * params['rnn_units']`.
  """

  def __init__(self, params):
    super().__init__()
    self.qkv_linear = tf.keras.layers.Dense(3 * 2 * params['rnn_units'], name='qkv_linear')
    self.dropout = tf.keras.layers.Dropout(params['dropout_rate'])
    self.out_linear = tf.keras.layers.Dense(2 * params['rnn_units'], name='out_linear')
    self._num_heads = params['num_heads']

  def call(self, inputs, training=False):
    """Attend over the time axis of `x`.

    Args:
      inputs: tuple `(x, masks)`. `x` is a rank-3 tensor whose axis 1 is time
        and axis 2 the features. `masks` is either None (no masking) or a
        rank-2 `(batch, time)` tensor in which 0 marks positions to ignore.
      training: enables dropout on the attention weights.

    Returns:
      Tensor produced by `out_linear` with the same leading axes as `x`.
    """
    x, masks = inputs
    timesteps = tf.shape(x)[1]

    q_k_v = self.qkv_linear(x)
    q, k, v = tf.split(q_k_v, 3, axis=-1)

    if self._num_heads > 1:
      # Fold the heads into the batch axis: (heads * batch, time, features / heads).
      q = tf.concat(tf.split(q, self._num_heads, axis=2), axis=0)
      k = tf.concat(tf.split(k, self._num_heads, axis=2), axis=0)
      v = tf.concat(tf.split(v, self._num_heads, axis=2), axis=0)

    align = tf.matmul(q, k, transpose_b=True)
    align *= tf.math.rsqrt(tf.cast(k.shape[-1], tf.float32))

    if masks is not None:
      # Use the head count stored on the layer (not a module-level `params`)
      # so the layer works wherever it is instantiated. Tiling along axis 0
      # matches the head-major batch layout produced by the concat above.
      head_masks = tf.tile(masks, [self._num_heads, 1])

      # Key masking: masked keys get -inf so softmax assigns them zero weight.
      key_masks = tf.tile(tf.expand_dims(head_masks, 1), [1, timesteps, 1])
      paddings = tf.fill(tf.shape(align), float('-inf'))
      align = tf.where(tf.equal(key_masks, 0), paddings, align)

    align = tf.nn.softmax(align)
    align = self.dropout(align, training=training)

    if masks is not None:
      # Query masking: zero the whole attention row of every masked query.
      query_masks = tf.tile(tf.expand_dims(head_masks, 2), [1, 1, timesteps])
      align *= tf.cast(query_masks, tf.float32)

    x = tf.matmul(align, v)
    if self._num_heads > 1:
      # Undo the head folding: back to (batch, time, features).
      x = tf.concat(tf.split(x, self._num_heads, axis=0), axis=2)
    x = self.out_linear(x)

    return x
  

class PointwiseFFN(tf.keras.Model):
  """Position-wise feed-forward network: Dense+ReLU -> dropout -> Dense.

  The hidden width is `2 * params['multiplier'] * params['rnn_units']` and the
  output width is `2 * params['rnn_units']`.
  """

  def __init__(self, params):
    super().__init__()
    hidden_units = 2 * params['multiplier'] * params['rnn_units']
    output_units = 2 * params['rnn_units']
    self.dense_1 = tf.keras.layers.Dense(hidden_units, tf.nn.relu, name='fc_relu')
    self.dropout = tf.keras.layers.Dropout(params['dropout_rate'])
    self.dense_2 = tf.keras.layers.Dense(output_units, name='linear')

  def call(self, inputs, training=False):
    """Transform `x` from the `(x, masks)` tuple; the masks are not used."""
    features, _ = inputs
    expanded = self.dense_1(features)
    expanded = self.dropout(expanded, training=training)
    return self.dense_2(expanded)

In [0]:
class Model(tf.keras.Model):
  """Joint intent classifier and slot tagger.

  Pipeline: embedding lookup -> bidirectional GRU -> self-attention block ->
  feed-forward block, followed by an intent head and a per-token slot head.
  """

  def __init__(self, params: dict):
    super().__init__()
    # The embedding matrix is loaded from the .npy file at params['vocab_path'].
    pretrained_vectors = np.load(params['vocab_path'])
    self.embedding = tf.Variable(
        pretrained_vectors, dtype=tf.float32, name='pretrained_embedding')
    self.input_dropout = tf.keras.layers.Dropout(params['dropout_rate'])

    gru = tf.keras.layers.GRU(
        params['rnn_units'],
        return_state=True,
        return_sequences=True,
        zero_output_for_mask=True)
    self.bidir_gru = tf.keras.layers.Bidirectional(gru)
    self.transformer = [
        EncoderBlock(MultiheadSelfAttention, params, name='self_attention'),
        EncoderBlock(PointwiseFFN, params, name='pointwise_ffn'),
    ]

    self.intent_dropout = tf.keras.layers.Dropout(params['dropout_rate'])
    self.fc_intent = tf.keras.layers.Dense(params['rnn_units'], tf.nn.elu, name='fc_intent')
    self.out_linear_intent = tf.keras.layers.Dense(params['intent_size'], name='output_intent')
    self.out_linear_slot = tf.keras.layers.Dense(params['slot_size'], name='output_slot')

  def call(self, inputs, training=False):
    """Compute intent and slot logits for a batch of token ids.

    Args:
      inputs: integer token ids; ids equal to 0 are treated as padding because
        the mask is `tf.sign(ids)`. Non-int32 input is cast to int32.
      training: enables the dropout layers.

    Returns:
      Tuple `(intent_logits, slot_logits)`; the slot logits keep the time axis.
    """
    token_ids = inputs if inputs.dtype == tf.int32 else tf.cast(inputs, tf.int32)
    mask = tf.sign(token_ids)
    rnn_mask = tf.cast(mask, tf.bool)

    embedded = tf.nn.embedding_lookup(self.embedding, token_ids)
    embedded = self.input_dropout(embedded, training=training)
    x, s_fw, s_bw = self.bidir_gru(embedded, mask=rnn_mask)
    for block in self.transformer:
      x = block((x, mask), training=training)

    # Intent features: max over time of the encoder output, concatenated with
    # the final forward and backward GRU states.
    pooled = tf.reduce_max(x, 1)
    x_intent = tf.concat([pooled, s_fw, s_bw], -1)
    x_intent = self.intent_dropout(x_intent, training=training)
    intent_logits = self.out_linear_intent(self.fc_intent(x_intent))
    slot_logits = self.out_linear_slot(x)
    return (intent_logits, slot_logits)

In [0]:
# Hyper-parameters and file locations shared by the data pipeline, the model
# and the training loop. Paths are relative to the working directory.
params = {
  # data and vocabulary files
  'train_path': '../data/atis.train.w-intent.iob',
  'test_path': '../data/atis.test.w-intent.iob',
  'word_path': '../vocab/word.txt',
  'vocab_path': '../vocab/word.npy',
  'intent_path': '../vocab/intent.txt',
  'slot_path': '../vocab/slot.txt',

  # input pipeline
  'batch_size': 16,
  'num_samples': 4978,  # used as the shuffle buffer size and in the LR cycle length

  # architecture
  'rnn_units': 300,
  'num_heads': 6,
  'multiplier': 2,

  # regularisation / optimisation
  'dropout_rate': 0.2,
  'clip_norm': 0.1,
}

In [0]:
# Load the three symbol -> index lookups from their vocabulary files.
params.update(
  word2idx=get_vocab(params['word_path']),
  intent2idx=get_vocab(params['intent_path']),
  slot2idx=get_vocab(params['slot_path']),
)

# Each size is one larger than its lookup: `data_generator` encodes unknown
# symbols as `len(lookup)`, so that extra index must be representable.
params.update(
  word_size=len(params['word2idx']) + 1,
  intent_size=len(params['intent2idx']) + 1,
  slot_size=len(params['slot2idx']) + 1,
)

In [11]:
# Instantiate the network, create its weights for (batch, time) integer input
# and list every trainable variable with its shape.
model = Model(params)
model.build(input_shape=(None, None))
pprint.pprint([(var.name, var.shape) for var in model.trainable_variables])

# Cyclical learning rate between 1e-4 and 8e-4. The optimizer starts at the
# lower bound and the training loop overwrites its rate before each update.
decay_lr = Triangular2CyclicalLearningRate(
  initial_learning_rate=1e-4,
  maximal_learning_rate=8e-4,
  step_size=(8 * params['num_samples']) // params['batch_size'],
)
optim = tf.optimizers.Adam(1e-4)
global_step = 0

# Best slot F1 seen so far and the intent accuracy of that same epoch.
slot_best_f1, intent_acc_with_that = .0, .0

t0 = time.time()
logger = logging.getLogger('tensorflow')
logger.setLevel(logging.INFO)

# Main loop: 64 epochs, each made of one full pass over the training split
# followed by an evaluation pass over the test split.
for n_epoch in range(1, 64+1):
  # TRAINING
  for (words, (intent, slots)) in dataset(is_training=True, params=params):
    with tf.GradientTape() as tape:
      y_intent, y_slots = model(words, training=True)
      # Intent loss: label-smoothed cross-entropy over len(intent2idx)+1 classes
      # (the extra class is the id used for unknown intents).
      loss_intent = tf.compat.v1.losses.softmax_cross_entropy(
        onehot_labels = tf.one_hot(intent, len(params['intent2idx'])+1),
        logits = y_intent,
        label_smoothing = .2)
      # weight of 'O' is set to be small
      # NOTE(review): the code down-weights every position whose slot id is 0
      # to 1e-2. Id 0 is also the value `dataset` pads slot sequences with, so
      # padded positions are included; whether id 0 is the 'O' tag depends on
      # the slot vocabulary file - confirm.
      weights = tf.cast(tf.sign(slots), tf.float32)
      padding = tf.constant(1e-2, tf.float32, weights.shape)
      weights = tf.where(tf.equal(weights, 0.), padding, weights)

      # Slot loss: per-token, weighted, label-smoothed cross-entropy.
      loss_slots = tf.compat.v1.losses.softmax_cross_entropy(
        onehot_labels = tf.one_hot(slots, len(params['slot2idx'])+1),
        logits = y_slots,
        weights = tf.cast(weights, tf.float32),
        label_smoothing = .2)
      # joint loss
      loss = loss_intent + loss_slots
    
    # Push the scheduled learning rate into the optimizer, then apply the
    # gradients after clipping them to a global norm of params['clip_norm'].
    optim.lr.assign(decay_lr(global_step))
    grads = tape.gradient(loss, model.trainable_variables)
    grads, _ = tf.clip_by_global_norm(grads, params['clip_norm'])
    optim.apply_gradients(zip(grads, model.trainable_variables))

    # Log every 50 steps; `t0` is reset so "Spent" is the time since the
    # previous log line.
    if global_step % 50 == 0:
      logger.info("Step {} | Loss: {:.4f} | Loss_intent: {:.4f} | Loss_slots: {:.4f} | Spent: {:.1f} secs | LR: {:.6f}".format(
          global_step, loss.numpy().item(), loss_intent.numpy().item(), loss_slots.numpy().item(), time.time()-t0, optim.lr.numpy().item()))
      t0 = time.time()
    global_step += 1
    
  # EVALUATION
  # Labels and predictions are flattened batch by batch into plain lists so
  # scikit-learn can score them.
  intent_true = []
  intent_pred = []
  slot_true = []
  slot_pred = []

  for (words, (intent, slots)) in dataset(is_training=False, params=params):
    y_intent, y_slots = model(words, training=False)
    y_intent = tf.argmax(y_intent, -1)
    y_slots = tf.argmax(y_slots, -1)
    
    intent_true += intent.numpy().flatten().tolist()
    intent_pred += y_intent.numpy().flatten().tolist()
    slot_true += slots.numpy().flatten().tolist()
    slot_pred += y_slots.numpy().flatten().tolist()
    
  # Micro-averaged slot F1. `np.sign(slot_true)` gives positions with slot
  # id 0 (which includes batch padding) a sample weight of 0.
  f1_slots = f1_score(y_true = slot_true,
                      y_pred = slot_pred,
                      labels = list(params['slot2idx'].values()),
                      sample_weight = np.sign(slot_true),
                      average='micro',)
  
  acc_intent = accuracy_score(intent_true, intent_pred)

  logger.info("Slot F1: {:.3f}, Intent Acc: {:.3f}".format(f1_slots, acc_intent))

  # Full per-class reports every 8th epoch (the `n_epoch != 1` test is
  # redundant because 1 % 8 is never 0).
  if n_epoch != 1 and n_epoch % 8 == 0:
    logger.info('\n'+classification_report(y_true = intent_true,
                                          y_pred = intent_pred,
                                          labels = list(params['intent2idx'].values()),
                                          target_names = list(params['intent2idx'].keys()),
                                          digits=3))
    logger.info('\n'+classification_report(y_true = slot_true,
                                          y_pred = slot_pred,
                                          labels = list(params['slot2idx'].values()),
                                          target_names = list(params['slot2idx'].keys()),
                                          sample_weight = np.sign(slot_true),
                                          digits=3))
  
  # Track the best slot F1 and the intent accuracy reached in that same epoch.
  if f1_slots > slot_best_f1:
    slot_best_f1 = f1_slots
    intent_acc_with_that = acc_intent
    # you can save model here
  logger.info("Best Slot F1: {:.3f}, Intent Acc: {:.3f}".format(slot_best_f1, intent_acc_with_that))

[('bidirectional/forward_gru/gru_cell_1/kernel:0', TensorShape([300, 900])),
 ('bidirectional/forward_gru/gru_cell_1/recurrent_kernel:0',
  TensorShape([300, 900])),
 ('bidirectional/forward_gru/gru_cell_1/bias:0', TensorShape([2, 900])),
 ('bidirectional/backward_gru/gru_cell_2/kernel:0', TensorShape([300, 900])),
 ('bidirectional/backward_gru/gru_cell_2/recurrent_kernel:0',
  TensorShape([300, 900])),
 ('bidirectional/backward_gru/gru_cell_2/bias:0', TensorShape([2, 900])),
 ('self_attention/layer_norm/scale:0', TensorShape([600])),
 ('self_attention/layer_norm/bias:0', TensorShape([600])),
 ('self_attention/multihead_self_attention/qkv_linear/kernel:0',
  TensorShape([600, 1800])),
 ('self_attention/multihead_self_attention/qkv_linear/bias:0',
  TensorShape([1800])),
 ('self_attention/multihead_self_attention/out_linear/kernel:0',
  TensorShape([600, 600])),
 ('self_attention/multihead_self_attention/out_linear/bias:0',
  TensorShape([600])),
 ('pointwise_ffn/layer_norm_1/scale:0', 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


INFO:tensorflow:Step 2500 | Loss: 1.4694 | Loss_intent: 1.0972 | Loss_slots: 0.3721 | Spent: 4.9 secs | LR: 0.000797
INFO:tensorflow:Step 2550 | Loss: 1.5717 | Loss_intent: 1.1036 | Loss_slots: 0.4681 | Spent: 2.6 secs | LR: 0.000783
INFO:tensorflow:Step 2600 | Loss: 1.3631 | Loss_intent: 1.0909 | Loss_slots: 0.2722 | Spent: 2.6 secs | LR: 0.000769
INFO:tensorflow:Step 2650 | Loss: 1.4330 | Loss_intent: 1.0861 | Loss_slots: 0.3469 | Spent: 2.6 secs | LR: 0.000755
INFO:tensorflow:Step 2700 | Loss: 1.4206 | Loss_intent: 1.1033 | Loss_slots: 0.3172 | Spent: 2.6 secs | LR: 0.000741
INFO:tensorflow:Step 2750 | Loss: 1.6004 | Loss_intent: 1.2087 | Loss_slots: 0.3917 | Spent: 2.7 secs | LR: 0.000727
INFO:tensorflow:Step 2800 | Loss: 1.4213 | Loss_intent: 1.0941 | Loss_slots: 0.3272 | Spent: 2.6 secs | LR: 0.000713
Reading ../data/atis.test.w-intent.iob
INFO:tensorflow:Slot F1: 0.951, Intent Acc: 0.966
INFO:tensorflow:Best Slot F1: 0.951, Intent Acc: 0.966
Reading ../data/atis.train.w-intent.i