In [0]:
# Colab-specific setup: mount Google Drive, then switch the working directory
# to the project folder so the relative paths used later ('../data/...',
# '../vocab/...') resolve against it.
from google.colab import drive
drive.mount('/content/gdrive')
import os
# NOTE(review): hard-coded Drive location; adjust when running outside this account/layout.
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/text_matching/chinese/main')

In [0]:
# Colab line magic requesting the TensorFlow 2.x runtime; it has to run before
# the cell that imports tensorflow.
%tensorflow_version 2.x

In [3]:
import csv
import tensorflow as tf
import numpy as np
import pprint
import logging
import time
import math

# Record the runtime in the cell output so results can be tied to a TF build.
print("TensorFlow Version", tf.__version__)
# tf.test.is_gpu_available() is deprecated (it emits the "Instructions for
# updating" warning); listing the physical GPU devices is the replacement.
print('GPU Enabled:', bool(tf.config.list_physical_devices('GPU')))

TensorFlow Version 2.2.0-rc2
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU Enabled: True


In [0]:
def get_vocab(f_path):
  """Build a token -> line-index lookup from a one-token-per-line text file.

  Args:
    f_path: path to the vocabulary file. It is decoded as UTF-8 explicitly
      instead of relying on the platform's default encoding, so non-ASCII
      tokens are read the same way on every machine.

  Returns:
    dict mapping each line (with trailing whitespace removed) to its 0-based
    line number. If the same line occurs more than once, the last occurrence
    wins.
  """
  with open(f_path, encoding='utf-8') as f:
    return {line.rstrip(): idx for idx, line in enumerate(f)}

In [0]:
# stream data from text files
def data_generator(f_path, char2idx):
  """Yield ((text1_ids, text2_ids), label) examples from a CSV file.

  The first CSV row is treated as a header and skipped. Every following
  non-empty row must have exactly three fields: text1, text2, label.

  Args:
    f_path: path to the CSV file (read as UTF-8).
    char2idx: dict mapping a single character to its integer id. Characters
      missing from the dict are mapped to len(char2idx).

  Yields:
    ((list[int], list[int]), int): per-character ids of both texts and the
    label converted with int().

  Raises:
    ValueError: if a non-empty row does not have three fields or the label is
      not an integer literal.
  """
  oov_id = len(char2idx)
  # newline='' is what the csv module expects so it can handle line endings
  # (and quoted embedded newlines) itself.
  with open(f_path, encoding='utf-8', newline='') as f:
    print('Reading', f_path)
    reader = csv.reader(f, delimiter=',')
    next(reader, None)  # skip the header row
    for row in reader:
      if not row:
        # csv.reader returns [] for blank lines (e.g. a trailing newline);
        # skipping them avoids a confusing unpack error mid-epoch.
        continue
      text1, text2, label = row
      ids1 = [char2idx.get(c, oov_id) for c in text1]
      ids2 = [char2idx.get(c, oov_id) for c in text2]
      yield ((ids1, ids2), int(label))


def dataset(is_training, params):
  """Build the padded, batched tf.data pipeline for training or evaluation.

  Args:
    is_training: when truthy, stream params['train_path'] and shuffle with a
      buffer of params['buffer_size']; otherwise stream params['test_path']
      in file order.
    params: config dict; reads 'train_path' or 'test_path', 'char2idx',
      'buffer_size' (training only) and 'batch_size'.

  Returns:
    A tf.data.Dataset of ((text1, text2), label) batches. Both id sequences
    are padded with 0 to the longest sequence in the batch.
  """
  path_key = 'train_path' if is_training else 'test_path'
  shapes = (([None], [None]), ())
  types = ((tf.int32, tf.int32), tf.int32)
  pad_values = ((0, 0), -1)

  # The lambda re-opens the file every time the dataset is iterated, so each
  # epoch streams the data from the start.
  examples = tf.data.Dataset.from_generator(
    lambda: data_generator(params[path_key], params['char2idx']),
    output_types=types,
    output_shapes=shapes)

  if is_training:
    examples = examples.shuffle(params['buffer_size'])

  return examples.padded_batch(params['batch_size'], shapes, pad_values)

In [0]:
def masked_attention(x, align, mask, tile_len):
  """Softmax-normalise alignment scores over unmasked positions and apply them to x.

  Shapes below are what the ops in this function require:

  Args:
    x: values to attend over, [batch, len, dim].
    align: raw alignment scores, [batch, tile_len, len].
    mask: [batch, len]; entries equal to 0 mark positions to ignore.
    tile_len: number of query positions, used to repeat the mask along axis 1.

  Returns:
    [batch, tile_len, dim] tensor: for each query position, the attention-
    weighted sum of x.
  """
  # Use the most negative finite value rather than -inf. Masked positions
  # still get zero weight after softmax, but a row whose positions are ALL
  # masked (e.g. an empty text padded to zeros) yields a uniform distribution
  # instead of NaN, which would otherwise poison the loss and gradients.
  pad = tf.fill(tf.shape(align), align.dtype.min)
  mask = tf.tile(tf.expand_dims(mask, 1), [1, tile_len, 1])
  align = tf.where(tf.equal(mask, 0), pad, align)
  align = tf.nn.softmax(align)
  return tf.matmul(align, x)


def soft_align_attention(x1, x2, mask1, mask2):
  """Cross-attend two sequences to each other using dot-product scores.

  Args:
    x1, x2: the two sequence tensors; they are multiplied as x1 @ x2^T, so
      they must share batch and feature dimensions.
    mask1, mask2: masks for x1 and x2 respectively (0 marks ignored positions).

  Returns:
    (x1_, x2_): x1_ is x2 summarised for every position of x1, and x2_ is x1
    summarised for every position of x2.
  """
  len1 = tf.shape(x1)[1]
  len2 = tf.shape(x2)[1]

  # One score matrix serves both directions; the reverse direction is its transpose.
  scores_1to2 = tf.matmul(x1, x2, transpose_b=True)
  scores_2to1 = tf.transpose(scores_1to2, [0, 2, 1])

  attended_1 = masked_attention(x2, scores_1to2, mask2, len1)
  attended_2 = masked_attention(x1, scores_2to1, mask1, len2)
  return attended_1, attended_2


class AttentivePooling(tf.keras.Model):
  """Pools a sequence into one vector using learned, masked attention weights."""

  def __init__(self, params):
    """Create the scoring layers.

    Args:
      params: config dict; accepted for a uniform constructor signature but
        not read by this class.
    """
    super().__init__()
    self.dropout = tf.keras.layers.Dropout(.2)
    # One tanh-squashed scalar score per timestep.
    self.kernel = tf.keras.layers.Dense(units=1,
                                        activation=tf.tanh,
                                        use_bias=False)


  def call(self, inputs, training=False):
    """Compute the attention-weighted sum over the time axis.

    Args:
      inputs: (x, masks) where x is [batch, time, dim] and masks is
        [batch, time] with 0 marking positions to ignore.
      training: enables dropout on the scoring input when True.

    Returns:
      [batch, dim] pooled representation.
    """
    x, masks = inputs
    # alignment
    align = tf.squeeze(self.kernel(self.dropout(x, training=training)), -1)
    # masking: most negative finite value instead of -inf, so a sequence that
    # is entirely padding gets uniform weights rather than NaN from softmax.
    paddings = tf.fill(tf.shape(align), align.dtype.min)
    align = tf.where(tf.equal(masks, 0), paddings, align)
    # probability
    align = tf.nn.softmax(align)
    align = tf.expand_dims(align, -1)
    # weighted sum: [batch, dim, time] @ [batch, time, 1] -> [batch, dim]
    return tf.squeeze(tf.matmul(x, align, transpose_a=True), -1)

In [0]:
class ESIM(tf.keras.Model):
  """Sentence-pair matching model: encode, soft-align, compose, pool, classify.

  Both inputs share every layer. Id 0 is treated as padding (the masks are
  sign(ids)). The model returns one logit per pair.
  """

  def __init__(self, params: dict):
    """Create the layers.

    Args:
      params: config dict; reads 'embedding_path' (a file loaded with np.load
        and used as the trainable embedding matrix) and 'rnn_units' (LSTM
        units per direction and width of the dense layers).
    """
    super().__init__()
    self.embedding = tf.Variable(np.load(params['embedding_path']), name='pretrained_embedding', dtype=tf.float32)

    self.input_dropout = tf.keras.layers.Dropout(.2)
    self.input_encoder = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(
        params['rnn_units'], return_sequences=True), name='input_encoder')

    self.feature_dropout = tf.keras.layers.Dropout(.5)
    self.feature_fc = tf.keras.layers.Dense(params['rnn_units'], tf.nn.elu, name='feature_fc')

    self.revise_dropout = tf.keras.layers.Dropout(.2)
    self.revise_encoder = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(
        params['rnn_units'], return_sequences=True), name='revise_encoder')

    self.attentive_pooling = AttentivePooling(params)

    self.fc1_dropout = tf.keras.layers.Dropout(.5)
    self.fc1 = tf.keras.layers.Dense(params['rnn_units'], tf.nn.elu, name='final_fc1')
    self.fc2_dropout = tf.keras.layers.Dropout(.2)
    self.fc2 = tf.keras.layers.Dense(params['rnn_units'], tf.nn.elu, name='final_fc2')

    self.out_linear = tf.keras.layers.Dense(1, name='out_linear')


  def call(self, inputs, training=False):
    """Score a batch of text pairs.

    Args:
      inputs: (x1, x2), each a [batch, time] tensor of character ids; values
        are cast to int32 if they arrive in another dtype.
      training: enables the dropout layers when True.

    Returns:
      [batch] tensor of unnormalised logits.
    """
    x1, x2 = inputs

    if x1.dtype != tf.int32:
      x1 = tf.cast(x1, tf.int32)
    if x2.dtype != tf.int32:
      x2 = tf.cast(x2, tf.int32)

    batch_sz = tf.shape(x1)[0]

    # 1 for real tokens, 0 for padding (id 0).
    mask1 = tf.sign(x1)
    mask2 = tf.sign(x2)

    x1 = tf.nn.embedding_lookup(self.embedding, x1)
    x2 = tf.nn.embedding_lookup(self.embedding, x2)

    # Dropout masks are shared across timesteps (noise shape [batch, 1, feat]).
    # The feature size comes from the tensor's static last dimension, so the
    # model depends neither on a hard-coded embedding width nor on a global
    # `params` dict being in scope when it is called.
    self.input_dropout.noise_shape = (batch_sz, 1, x1.shape[-1])
    x1 = self.input_dropout(x1, training=training)
    x2 = self.input_dropout(x2, training=training)

    # NOTE(review): the encoders run without a mask, so padded steps are encoded too.
    x1 = self.input_encoder(x1, mask=None)
    x2 = self.input_encoder(x2, mask=None)

    x1_, x2_ = soft_align_attention(x1, x2, mask1, mask2)
    # Compose each sequence with its aligned counterpart:
    # [x, x_, difference, element-wise product] along the feature axis.
    aggregate_fn = lambda x, x_: tf.concat((x,
                                            x_,
                                           (x - x_),
                                           (x * x_),), -1)
    x1 = aggregate_fn(x1, x1_)
    x2 = aggregate_fn(x2, x2_)

    self.feature_dropout.noise_shape = (batch_sz, 1, x1.shape[-1])
    x1 = self.feature_dropout(x1, training=training)
    x2 = self.feature_dropout(x2, training=training)

    x1 = self.feature_fc(x1)
    x2 = self.feature_fc(x2)

    self.revise_dropout.noise_shape = (batch_sz, 1, x1.shape[-1])
    x1 = self.revise_dropout(x1, training=training)
    x2 = self.revise_dropout(x2, training=training)

    x1 = self.revise_encoder(x1, mask=None)
    x2 = self.revise_encoder(x2, mask=None)

    # Pool each sequence two ways (max over time, attentive) and concatenate.
    features = []
    features.append(tf.reduce_max(x1, axis=1))
    features.append(tf.reduce_max(x2, axis=1))
    features.append(self.attentive_pooling((x1, mask1), training=training))
    features.append(self.attentive_pooling((x2, mask2), training=training))
    x = tf.concat(features, axis=-1)

    x = self.fc1_dropout(x, training=training)
    x = self.fc1(x)

    x = self.fc2_dropout(x, training=training)
    x = self.fc2(x)

    x = self.out_linear(x)
    x = tf.squeeze(x, 1)  # [batch, 1] -> [batch]

    return x

In [0]:
# Run configuration shared by the data pipeline, the model and the training loop.
params = {
  # CSV files streamed by dataset() / data_generator().
  'train_path': '../data/train.csv',
  'test_path': '../data/test.csv',
  # Character vocabulary read by get_vocab().
  'vocab_path': '../vocab/char.txt',
  # Array loaded with np.load as the initial embedding matrix in ESIM.
  'embedding_path': '../vocab/char.npy',
  # Batch size passed to padded_batch for both training and evaluation.
  'batch_size': 32,
  # Shuffle buffer size for the training dataset.
  'buffer_size': 100000,
  # NOTE(review): 'num_blocks' and 'dropout_rate' are not read by any code in
  # the visible cells (dropout rates are hard-coded in the layers).
  'num_blocks': 2,
  'dropout_rate': 0.2,
  # LSTM units per direction and width of the dense layers in ESIM.
  'rnn_units': 300,
  # Initial learning rate for the exponential-decay schedule and Adam.
  'lr': 4e-4,
  # Global-norm threshold for gradient clipping.
  'clip_norm': 5.,
  # Window length used by is_descending() for the early-stop check.
  'num_patience': 5,
}

In [0]:
def is_descending(history, num_patience=None):
  """Tell whether the most recent metric values are strictly decreasing.

  Only the last `num_patience + 1` entries of `history` are inspected.

  Args:
    history: sequence of metric values, oldest first.
    num_patience: number of consecutive drops to look for. Defaults to the
      notebook-level params['num_patience'], so existing one-argument calls
      behave as before; passing it explicitly removes the dependency on that
      global.

  Returns:
    False as soon as any inspected value is less than or equal to its
    successor; True otherwise. A window with fewer than two values therefore
    returns True, so callers should check the history length first.
  """
  if num_patience is None:
    num_patience = params['num_patience']
  window = history[-(num_patience + 1):]
  return not any(prev <= cur for prev, cur in zip(window, window[1:]))

In [10]:
# Build the vocabulary lookup. The +1 leaves room for the out-of-vocabulary id
# len(char2idx) that data_generator assigns to unknown characters.
# NOTE(review): 'vocab_size' is not read by any code in the visible cells.
params['char2idx'] = get_vocab(params['vocab_path'])
params['vocab_size'] = len(params['char2idx']) + 1

# Create the model and build it with unknown batch/time sizes so the variables
# exist and can be listed before training starts.
model = ESIM(params)
model.build([[None, None], [None, None]])
pprint.pprint([(v.name, v.shape) for v in model.trainable_variables])

# The schedule is evaluated by hand each step and written into the optimizer's
# learning rate (see optim.lr.assign below): decay by 0.99 every 1000 steps.
decay_lr = tf.optimizers.schedules.ExponentialDecay(params['lr'], 1000, 0.99)
optim = tf.optimizers.Adam(params['lr'])
global_step = 0

history_acc = []
best_acc = .0

t0 = time.time()
logger = logging.getLogger('tensorflow')
logger.setLevel(logging.INFO)

# Each pass of this loop is one epoch: a full pass over the training file
# followed by a full evaluation on the test file.
while True:
  # TRAINING
  for ((text1, text2), labels) in dataset(is_training=True, params=params):
    with tf.GradientTape() as tape:
      logits = model((text1, text2), training=True)
      # Binary cross-entropy on the raw logits, averaged over the batch.
      loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(labels, tf.float32), logits=logits)
      loss = tf.reduce_mean(loss)
      
    optim.lr.assign(decay_lr(global_step))
    grads = tape.gradient(loss, model.trainable_variables)
    grads, _ = tf.clip_by_global_norm(grads, params['clip_norm'])
    optim.apply_gradients(zip(grads, model.trainable_variables))
    
    if global_step % 100 == 0:
      logger.info("Step {} | Loss: {:.4f} | Spent: {:.1f} secs | LR: {:.6f}".format(
          global_step, loss.numpy().item(), time.time()-t0, optim.lr.numpy().item()))
      # Restart the timer so "Spent" covers the time since the previous log line.
      t0 = time.time()
    global_step += 1
  
  # EVALUATION
  m = tf.keras.metrics.Accuracy()

  for ((text1, text2), labels) in dataset(is_training=False, params=params):
    # Despite the name, `logits` holds sigmoid probabilities here; they are
    # thresholded at 0.5 to get the predicted class.
    logits = tf.sigmoid(model((text1, text2), training=False))
    y_pred = tf.cast(tf.math.greater_equal(logits, .5), tf.int32)
    m.update_state(y_true=labels, y_pred=y_pred)

  acc = m.result().numpy()
  logger.info("Evaluation: Testing Accuracy: {:.3f}".format(acc))
  history_acc.append(acc)

  if acc > best_acc:
    best_acc = acc
    # you can save model here
  logger.info("Best Accuracy: {:.3f}".format(best_acc))

  # NOTE(review): the loop stops only when accuracy has strictly decreased for
  # num_patience consecutive epochs. A plateau or an oscillation never
  # satisfies is_descending, so the loop can run indefinitely, and the message
  # below ("not improved") describes a weaker condition than the one tested.
  # Confirm which behaviour is intended.
  if len(history_acc) > params['num_patience'] and is_descending(history_acc):
    logger.info("Testing Accuracy not improved over {} epochs, Early Stop".format(params['num_patience']))
    break

[('input_encoder/forward_lstm/lstm_cell_1/kernel:0', TensorShape([300, 1200])),
 ('input_encoder/forward_lstm/lstm_cell_1/recurrent_kernel:0',
  TensorShape([300, 1200])),
 ('input_encoder/forward_lstm/lstm_cell_1/bias:0', TensorShape([1200])),
 ('input_encoder/backward_lstm/lstm_cell_2/kernel:0', TensorShape([300, 1200])),
 ('input_encoder/backward_lstm/lstm_cell_2/recurrent_kernel:0',
  TensorShape([300, 1200])),
 ('input_encoder/backward_lstm/lstm_cell_2/bias:0', TensorShape([1200])),
 ('feature_fc/kernel:0', TensorShape([2400, 300])),
 ('feature_fc/bias:0', TensorShape([300])),
 ('revise_encoder/forward_lstm_1/lstm_cell_4/kernel:0',
  TensorShape([300, 1200])),
 ('revise_encoder/forward_lstm_1/lstm_cell_4/recurrent_kernel:0',
  TensorShape([300, 1200])),
 ('revise_encoder/forward_lstm_1/lstm_cell_4/bias:0', TensorShape([1200])),
 ('revise_encoder/backward_lstm_1/lstm_cell_5/kernel:0',
  TensorShape([300, 1200])),
 ('revise_encoder/backward_lstm_1/lstm_cell_5/recurrent_kernel:0',
  

KeyboardInterrupt: ignored