In [None]:
# Mount Google Drive so the project files stored there are reachable from the Colab VM.
from google.colab import drive
drive.mount('/content/gdrive')
import os
# Work from the project's `main` directory: the relative paths used later in this
# notebook ('../data', '../vocab', '../model') are resolved against it.
os.chdir('/content/gdrive/My Drive/finch/tensorflow1/free_chat/chinese_lccc/main')

In [None]:
%tensorflow_version 1.x
!pip install texar

In [None]:
import tensorflow as tf
import texar.tf as tx

import numpy as np
import pprint
import logging
import copy

from pathlib import Path
from texar.tf.modules import TransformerEncoder

# Report the TensorFlow version in use and whether TensorFlow can see a GPU.
print("TensorFlow Version", tf.__version__)
print('GPU Enabled:', tf.test.is_gpu_available())

TensorFlow Version 1.15.2
GPU Enabled: True


In [None]:
# stream data from text files
def data_generator(f_path, params):
  """Stream `(source, (target_in, target_out))` id sequences from a text file.

  Every non-blank line must hold exactly one `source<SEP>target` pair. Both
  sides are split into characters, mapped to ids through `params['char2idx']`
  and truncated to `params['max_len']` ids.

  Args:
    f_path: path of the UTF-8 text file to read.
    params: dict providing 'char2idx' (char -> id mapping) and 'max_len'.

  Yields:
    `(source, (target_in, target_out))` where `target_in` is the target ids
    with id 1 prepended and `target_out` is the target ids with id 2 appended
    (the same ids the beam-search decoder in this notebook uses as start and
    end tokens).

  Raises:
    ValueError: if a non-blank line does not contain exactly one `<SEP>`.
  """
  char2idx = params['char2idx']
  max_len = params['max_len']
  # Characters missing from the vocabulary share one extra id past its end.
  unk_id = len(char2idx)
  # The corpus is Chinese text; decode it as UTF-8 explicitly instead of
  # relying on the platform's default encoding.
  with open(f_path, encoding='utf-8') as f:
    print('Reading', f_path)
    for line in f:
      line = line.rstrip()
      if not line:
        # A blank line (e.g. a trailing newline at the end of the file)
        # carries no pair; skip it rather than crash on the unpacking below.
        continue
      source, target = line.split('<SEP>')
      # Slicing is a no-op for sequences that are already short enough.
      source = [char2idx.get(c, unk_id) for c in source][:max_len]
      target = [char2idx.get(c, unk_id) for c in target][:max_len]
      target_in = [1] + target
      target_out = target + [2]
      yield (source, (target_in, target_out))

In [None]:
def dataset(is_training, params):
  """Build the padded, batched `tf.data` pipeline for training or evaluation.

  Args:
    is_training: when true, read `params['train_path']` and shuffle with a
      buffer of `params['buffer_size']`; otherwise read `params['test_path']`
      without shuffling.
    params: dict with the paths above plus 'batch_size' and whatever
      `data_generator` needs.

  Returns:
    A dataset of `(source, (target_in, target_out))` int32 batches, each
    sequence zero-padded to the longest one in its batch.
  """
  shapes = ([None], ([None], [None]))
  types = (tf.int32, (tf.int32, tf.int32))
  pad_values = (0, (0, 0))

  # Only the input file and the shuffle step differ between the two modes.
  path_key = 'train_path' if is_training else 'test_path'
  ds = tf.data.Dataset.from_generator(
    lambda: data_generator(params[path_key], params),
    output_shapes=shapes,
    output_types=types)
  if is_training:
    ds = ds.shuffle(params['buffer_size'])
  ds = ds.padded_batch(params['batch_size'], shapes, pad_values)
  return ds.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
def clip_grads(loss):
    """Return `(gradient, variable)` pairs with gradients clipped by global norm.

    Gradients of `loss` are taken with respect to every trainable variable
    (the variable list is pretty-printed as a side effect) and clipped
    jointly to `params['clip_norm']`.
    """
    train_vars = tf.trainable_variables()
    pprint.pprint(train_vars)
    raw_grads = tf.gradients(loss, train_vars)
    clipped, _ = tf.clip_by_global_norm(raw_grads, params['clip_norm'])
    return zip(clipped, train_vars)


def rnn_cell():
    """Build the decoder's recurrent cell from `params`.

    With a single layer this is one orthogonally-initialised LSTM cell. With
    more layers, every layer except the last is wrapped so that its output is
    the concatenation of its input and its LSTM output, and the layers are
    stacked in a `MultiRNNCell`.
    """
    def make_lstm():
        return tf.nn.rnn_cell.LSTMCell(
            params['rnn_units'],
            initializer=tf.orthogonal_initializer())

    def concat_residual(inp, out):
        # "Residual" here means concatenation of input and output, not addition.
        return tf.concat((inp, out), -1)

    num_layers = params['dec_layers']
    if num_layers <= 1:
        return make_lstm()

    stacked = [
        tf.nn.rnn_cell.ResidualWrapper(make_lstm(), residual_fn=concat_residual)
        for _ in range(num_layers - 1)
    ]
    # The top layer is left unwrapped.
    stacked.append(make_lstm())
    return tf.nn.rnn_cell.MultiRNNCell(stacked)

  
def dec_cell(enc_out, enc_seq_len):
    """Wrap the decoder RNN cell with Bahdanau attention over the encoder output.

    Args:
        enc_out: encoder outputs used as the attention memory.
        enc_seq_len: lengths used to mask the attention memory.

    Returns:
        An `AttentionWrapper` whose attention layer has `params['rnn_units']` units.
    """
    units = params['rnn_units']
    attention = tf.contrib.seq2seq.BahdanauAttention(
        num_units=units,
        memory=enc_out,
        memory_sequence_length=enc_seq_len)
    base_cell = rnn_cell()
    return tf.contrib.seq2seq.AttentionWrapper(
        cell=base_cell,
        attention_mechanism=attention,
        attention_layer_size=units)

In [None]:
class TiedDense(tf.layers.Layer):
  """Output layer that reuses the embedding matrix (transposed) as its kernel.

  Inputs are multiplied by `tied_embed` transposed and a trainable bias of
  size `out_dim` is added. When the decoder width `params['rnn_units']`
  differs from the embedding width `params['embed_dim']`, the inputs are
  first passed through a trainable projection to `embed_dim` followed by
  `params['activation']`.

  Args:
    tied_embed: embedding matrix shared with the input embedding.
    out_dim: size of the last output dimension (and of the bias).
  """
  def __init__(self, tied_embed, out_dim):
    super().__init__()
    self.tied_embed = tied_embed
    self.out_dim = out_dim

  @staticmethod
  def _needs_projection():
    """Whether inputs must be mapped to the embedding width before the tied matmul."""
    # Compare against the configured embedding width rather than a literal
    # 300, so the check stays correct if 'embed_dim' is changed; the
    # projection weights below are shaped [rnn_units, embed_dim].
    return params['rnn_units'] != params['embed_dim']

  def build(self, input_shape):
    self.bias = self.add_weight(name='bias',
                                shape=[self.out_dim],
                                trainable=True)
    if self._needs_projection():
      self.proj_W = self.add_weight(name='proj_W',
                                    shape=[params['rnn_units'], params['embed_dim']],
                                    trainable=True)
      self.proj_b = self.add_weight(name='proj_b',
                                    shape=[params['embed_dim']],
                                    trainable=True)
    super().build(input_shape)

  def call(self, inputs):
    if self._needs_projection():
      # NOTE(review): this branch reads params['activation'], which the
      # params dict visible in this notebook does not define; it must be
      # supplied before using rnn_units != embed_dim.
      inputs = params['activation'](tf.nn.bias_add(tf.matmul(inputs, self.proj_W), self.proj_b))
    # Tied weights: the embedding matrix, transposed, acts as the output kernel.
    x = tf.matmul(inputs, self.tied_embed, transpose_b=True)
    x = tf.nn.bias_add(x, self.bias)
    return x

  def compute_output_shape(self, input_shape):
    # Same leading dimensions as the input, with the last one replaced by out_dim.
    return input_shape[:-1].concatenate(self.out_dim)

In [None]:
def forward(words, labels, mode):
    """Build the Transformer-encoder / attentional-LSTM-decoder graph.

    Args:
        words: integer id tensor of the source sequences; entries equal to 0
            are not counted towards a sequence's length.
        labels: `(decoder_inputs, decoder_outputs)` id tensors; only read in
            TRAIN and EVAL modes.
        mode: a `tf.estimator.ModeKeys` value.

    Returns:
        In TRAIN/EVAL mode, the decoder's `rnn_output` produced with
        teacher forcing through the tied output layer. In any other mode,
        the beam-search `predicted_ids` restricted to the first
        `params['top_k']` beams.
    """
    # Source length = number of non-zero ids along axis 1.
    words_len = tf.count_nonzero(words, 1, dtype=tf.int32)
    
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    batch_sz = tf.shape(words)[0]
    
  
    with tf.variable_scope('Embedding'):
        # Embedding table initialised from the vectors stored in char.npy.
        embedding = tf.Variable(np.load('../vocab/char.npy'),
                                dtype=tf.float32,
                                name='fasttext_vectors')
        # Row 0 is replaced by a constant all-zero vector; the other rows stay trainable.
        embedding = tf.concat([tf.zeros(shape=[1, params['embed_dim']]), embedding[1:, :]], axis=0)
        x = tf.nn.embedding_lookup(embedding, words)
        pos_embedder = tx.modules.SinusoidsPositionEmbedder(
            position_size = params['max_len'] + 1,
            hparams = config_model.position_embedder_hparams)
        # Scale token embeddings by sqrt(hidden_dim), then add the position embeddings.
        x = (x * config_model.hidden_dim ** 0.5) + pos_embedder(sequence_length=words_len)


    with tf.variable_scope('Encoder'):
        encoder = TransformerEncoder(hparams=config_model.encoder)
        enc_out = encoder(inputs=x,
                          sequence_length=words_len,
                          mode=mode,)
        # Max over axis 1 of the encoder output summarises the source; the same
        # vector is used as both the c and the h of the decoder's initial LSTM state.
        enc_state = tf.reduce_max(enc_out, axis=1)
        enc_state = tf.nn.rnn_cell.LSTMStateTuple(c=enc_state, h=enc_state)
    
    
    with tf.variable_scope('Decoder'):
        # Output layer tied to the embedding table; one output per vocabulary
        # entry plus the extra out-of-vocabulary id.
        output_proj = TiedDense(embedding, len(params['char2idx'])+1)
        
        if is_training or (mode == tf.estimator.ModeKeys.EVAL):
            # Teacher forcing: feed the given decoder inputs step by step.
            dec_inputs, dec_outputs = labels
            dec_seq_len = tf.count_nonzero(dec_inputs, 1, dtype=tf.int32)
            dec_inputs = tf.nn.embedding_lookup(embedding, dec_inputs)
            # Dropout on the decoder input embeddings is active only while training.
            dec_inputs = tf.layers.dropout(dec_inputs, params['dropout_rate'], training=is_training)
            cell = dec_cell(enc_out, words_len)
            
            # Start from the attention wrapper's zero state, with the cell state
            # replaced by the encoder summary.
            init_state = cell.zero_state(batch_sz, tf.float32).clone(
                cell_state=enc_state)
            
            helper = tf.contrib.seq2seq.TrainingHelper(
                inputs = dec_inputs,
                sequence_length = dec_seq_len,)
            decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = cell,
                helper = helper,
                initial_state = init_state,
                output_layer = output_proj)
            decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = decoder,
                maximum_iterations = tf.reduce_max(dec_seq_len))
            
            return decoder_output.rnn_output
        else:
            # Beam search: encoder tensors are tiled beam_width times.
            enc_out_t = tf.contrib.seq2seq.tile_batch(enc_out, params['beam_width'])
            enc_state_t = tf.contrib.seq2seq.tile_batch(enc_state, params['beam_width'])
            enc_seq_len_t = tf.contrib.seq2seq.tile_batch(words_len, params['beam_width'])
            
            cell = dec_cell(enc_out_t, enc_seq_len_t)
            
            init_state = cell.zero_state(batch_sz*params['beam_width'], tf.float32).clone(
                cell_state=enc_state_t)
            
            # Decoding starts from id 1 and stops at id 2, matching the ids that
            # data_generator prepends/appends to the targets.
            decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                cell = cell,
                embedding = embedding,
                start_tokens = tf.tile(tf.constant([1], tf.int32), [batch_sz]),
                end_token = 2,
                initial_state = init_state,
                beam_width = params['beam_width'],
                output_layer = output_proj,
                length_penalty_weight = params['length_penalty'],)
            decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = decoder,
                maximum_iterations = params['max_len'],)
            
            # Keep only the first top_k entries of the last (beam) axis.
            return decoder_output.predicted_ids[:, :, :params['top_k']]

In [None]:
def clr(step,
        initial_learning_rate,
        maximal_learning_rate,
        step_size,
        scale_fn,
        scale_mode,):
  """Compute a cyclical (triangular) learning rate for the given step.

  The rate rises from `initial_learning_rate` towards `maximal_learning_rate`
  and back over `2 * step_size` steps; the amplitude of each triangle is
  multiplied by `scale_fn`, evaluated on the cycle number when
  `scale_mode == 'cycle'` and on the step otherwise.

  Returns:
    A scalar tensor holding the learning rate.
  """
  step = tf.cast(step, tf.float32)

  initial_learning_rate = tf.convert_to_tensor(
    initial_learning_rate, name='initial_learning_rate')
  lr_dtype = initial_learning_rate.dtype
  maximal_learning_rate = tf.cast(maximal_learning_rate, lr_dtype)
  step_size = tf.cast(step_size, lr_dtype)

  # 1-based index of the current cycle and distance from that cycle's peak.
  cycle = tf.floor(1 + step / (2 * step_size))
  distance = tf.abs(step / step_size - 2 * cycle + 1)

  if scale_mode == 'cycle':
    scale_arg = cycle
  else:
    scale_arg = step

  lr_range = maximal_learning_rate - initial_learning_rate
  triangle = tf.maximum(tf.cast(0, lr_dtype), (1 - distance))
  return initial_learning_rate + lr_range * triangle * scale_fn(scale_arg)

In [None]:
def model_fn(features, labels, mode, params):
    """Estimator `model_fn`: predictions, evaluation loss or training op.

    PREDICT returns the beam-search ids from `forward`. EVAL returns the
    masked sequence cross-entropy. TRAIN uses the label-smoothed softmax
    cross-entropy when `params['label_smoothing']` is positive (the plain
    sequence loss otherwise) and optimises it with Adam under a cyclical
    learning rate and global-norm gradient clipping.
    """
    modes = tf.estimator.ModeKeys
    logits_or_ids = forward(features, labels, mode)

    if mode == modes.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions=logits_or_ids)

    _, dec_outputs = labels
    # 1.0 for positive target ids, 0.0 where the target id is 0.
    target_mask = tf.to_float(tf.sign(dec_outputs))

    smoothing_off = params['label_smoothing'] <= .0
    if smoothing_off or (mode == modes.EVAL):
        loss_op = tf.contrib.seq2seq.sequence_loss(
            logits=logits_or_ids,
            targets=dec_outputs,
            weights=target_mask)
    else:
        num_classes = len(params['char2idx'])+1
        loss_op = tf.losses.softmax_cross_entropy(
            onehot_labels=tf.one_hot(dec_outputs, num_classes),
            logits=logits_or_ids,
            weights=target_mask,
            label_smoothing=params['label_smoothing'],)

    if mode == modes.EVAL:
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss_op)

    if mode == modes.TRAIN:
        global_step = tf.train.get_or_create_global_step()

        # Half a pass over the training data per half-cycle; the amplitude
        # halves with every completed cycle.
        half_cycle_steps = params['num_samples'] / params['batch_size'] // 2
        decay_lr = clr(
            step=global_step,
            initial_learning_rate=1e-4,
            maximal_learning_rate=8e-4,
            step_size=half_cycle_steps,
            scale_fn=lambda x: 1 / (2.0 ** (x - 1)),
            scale_mode='cycle',)

        optimizer = tf.train.AdamOptimizer(decay_lr)
        train_op = optimizer.apply_gradients(
            clip_grads(loss_op), global_step=global_step)

        lr_logger = tf.train.LoggingTensorHook({'lr': decay_lr}, every_n_iter=100)

        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss_op, train_op=train_op, training_hooks=[lr_logger],)

In [None]:
def get_vocab(f_path):
  """Load a vocabulary file into a `{token: line_index}` dict.

  Each line of the file is one token (only the trailing newline is removed,
  so a token made of spaces is preserved); its id is its 0-based line number.
  If a token appears more than once, the last occurrence wins.

  Args:
    f_path: path of the UTF-8 vocabulary file.

  Returns:
    Dict mapping each token to its line index.
  """
  # The vocabulary holds Chinese characters; decode as UTF-8 explicitly
  # instead of relying on the platform's default encoding.
  with open(f_path, encoding='utf-8') as f:
    return {line.rstrip('\n'): i for i, line in enumerate(f)}

In [None]:
def pad(test_strs):
  """Right-pad every list in `test_strs`, in place, to the longest length.

  Shorter lists are extended with '<pad>' entries; nothing is returned.
  """
  longest = max([len(chars) for chars in test_strs])
  for chars in test_strs:
    shortfall = longest - len(chars)
    if shortfall > 0:
      chars.extend(['<pad>'] * shortfall)


def unit_test(estimator):
  """Print the model's top replies to a fixed list of sample utterances.

  The utterances are split into characters, padded to a common length,
  converted to ids and run through `estimator.predict`; for each utterance
  the `params['top_k']` decoded replies are printed with ids 0 and 2 removed.
  """
  queries = [
    '你好',
    '早上好',
    '晚上好',
    '再见',
    '好久不见',
    '想死你了',
    '谢谢你',
    '爱你',
    '你好厉害啊',
    '你叫什么',
    '你几岁了',
    '现在几点',
    '今天天气怎么样',
    '你今天心情好吗',
    '我们现在在哪里',
    '讲个笑话',
    '你会几种语言呀',
    '你觉得我帅吗',
    '讨厌的周一',
    '好烦啊',
    '天气真好',
    '今天好冷',
    '今天好热',
    '下雨了',
    '风好大',
    '终于周五了',
    '我想去唱歌',
  ]
  char_rows = [list(query) for query in queries]
  pad(char_rows)

  # Characters outside the vocabulary (including '<pad>' if it is not listed)
  # fall back to the id just past the end of the vocabulary.
  id_rows = [
    [params['char2idx'].get(ch, len(params['char2idx'])) for ch in row]
    for row in char_rows
  ]
  input_fn = tf.estimator.inputs.numpy_input_fn(
    x = np.asarray(id_rows), shuffle = False)
  predicted = np.asarray(list(estimator.predict(input_fn)))

  divider = '-'*12
  print(divider)
  print('unit test')
  for row_idx, row in enumerate(char_rows):
    print('Q:', ' '.join([ch for ch in row if ch != '<pad>']))
    for rank in range(params['top_k']):
      reply_ids = predicted[row_idx, :, rank]
      reply = ' '.join([
        params['idx2char'].get(idx, '<unk>')
        for idx in reply_ids
        if (idx != 0 and idx != 2)
      ])
      print('A{}:'.format(rank+1), reply)
    print()
  print(divider)

In [None]:
class config_model:
    """Hyper-parameters for the Texar position embedder and Transformer encoder."""
    # Model width: used for the position embedder, the encoder and its
    # attention output, and (through params['embed_dim']) the embedding size.
    hidden_dim = 300
    num_heads = 8
    # One dropout rate shared by every dropout entry of the encoder config below.
    dropout_rate = .2
    num_blocks = 3

    # Passed as `hparams` to the sinusoidal position embedder.
    position_embedder_hparams = {
        'dim': hidden_dim
    }

    # Passed as `hparams` to TransformerEncoder.
    encoder = {
        'dim': hidden_dim,
        'embedding_dropout': dropout_rate,
        'residual_dropout': dropout_rate,
        'num_blocks': num_blocks,
        'initializer': {
            'type': 'variance_scaling_initializer',
            'kwargs': {
                'scale': 1.0,
                'mode': 'fan_avg',
                'distribution': 'uniform',
            },
        },
        'multihead_attention': {
            'dropout_rate': dropout_rate,
            'num_heads': num_heads,
            'output_dim': hidden_dim,
            'use_bias': True,
        },
        # Position-wise feed-forward stack:
        # Dense(4 * hidden_dim, gelu) -> Dropout -> Dense(hidden_dim).
        'poswise_feedforward': {
          'name': 'fnn',
          'layers': [
              {
                  'type': 'Dense',
                  'kwargs': {
                      'name': 'conv1',
                      'units': hidden_dim * 4,
                      'activation': 'gelu',
                      'use_bias': True,
                  },
              },
              {
                  'type': 'Dropout',
                  'kwargs': {
                      'rate': dropout_rate,
                  }
              },
              {
                  'type': 'Dense',
                  'kwargs': {
                      'name': 'conv2',
                      'units': hidden_dim,
                      'use_bias': True,
                  }
              }
          ],
        },
    }


# Global run configuration; read directly by the helper functions in this
# notebook and also handed to the Estimator as its `params`.
params = {
    'model_dir': '../model/transformer_rnn',
    'train_path': '../data/train.txt',
    'test_path': '../data/test.txt',
    'vocab_path': '../vocab/char.txt',
    # Truncation length for source/target ids and the beam-search step limit.
    'max_len': 30,
    'embed_dim': config_model.hidden_dim,
    'rnn_units': 300,
    # Number of stacked LSTM layers in the decoder cell.
    'dec_layers': 1,
    # Dropout applied to the decoder input embeddings while training.
    'dropout_rate': .2,
    'beam_width': 10,
    # Number of beams kept in the prediction output and printed by unit_test.
    'top_k': 3,
    'length_penalty': .6,
    # A positive value switches the training loss to label-smoothed cross-entropy.
    'label_smoothing': .2,
    # Global-norm threshold used when clipping gradients.
    'clip_norm': .1,
    # Used, together with batch_size, to derive the learning-rate cycle length
    # and the checkpoint interval.
    'num_samples': 5000000,
    # Shuffle buffer size of the training dataset.
    'buffer_size': 500000,
    'batch_size': 64,
    # Consecutive evaluations without a new best perplexity before stopping.
    'num_patience': 5,
}

In [None]:
# Load the vocabulary and build the reverse (id -> char) mapping used when
# printing decoded replies.
params['char2idx'] = get_vocab(params['vocab_path'])
params['idx2char'] = {idx: char for char, idx in params['char2idx'].items()}
print(len(params['char2idx']), 'Chars')

# Create directory if not exist
Path(params['model_dir']).mkdir(exist_ok=True, parents=True)

# Logging
logger = logging.getLogger('tensorflow')
logger.setLevel(logging.INFO)

# Create an estimator
# The checkpoint interval is num_samples // batch_size + 1 steps; at most two
# checkpoints are kept.
estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  model_dir=params['model_dir'],
  config=tf.estimator.RunConfig(
    save_checkpoints_steps = params['num_samples'] // params['batch_size'] + 1,
    keep_checkpoint_max = 2),
  params=params)

# Early-stopping state: best perplexity seen so far and the number of
# consecutive evaluations that failed to improve on it.
best_ppl = 10000.
count = 0
# NOTE(review): eager execution is switched on here, after the Estimator has
# been constructed; why it is needed is not evident from this notebook — confirm.
tf.enable_eager_execution()

# Each iteration trains on one full run of the training generator, prints the
# sample replies, then evaluates on the test file.
while True:
  estimator.train(input_fn=lambda: dataset(is_training=True, params=params))

  unit_test(estimator)
  
  # Perplexity is the exponential of the evaluation loss.
  loss = estimator.evaluate(input_fn=lambda: dataset(is_training=False, params=params))['loss']
  ppl = np.exp(loss)
  print("Perplexity: {:.3f}".format(ppl))

  if ppl < best_ppl:
    best_ppl = ppl
    count = 0
  else:
    count += 1
  print("Best Perplexity: {:.3f}".format(best_ppl))

  # Stop after num_patience consecutive evaluations without improvement.
  if count == params['num_patience']:
    print(params['num_patience'], "times not improve the best result, therefore stop training")
    break

3042 Chars
INFO:tensorflow:Using config: {'_model_dir': '../model/transformer_rnn', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 78126, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 2, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f6e9eacef60>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for updating:
Use Variable.read_value. Variables in 2.X are 

KeyboardInterrupt: ignored