In [1]:
import tensorflow as tf
import numpy as np
import time
import sys
import nvidia_smi

In [2]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


## Transformer XL Model 

In [3]:
INITIALIZER = tf.keras.initializers.RandomNormal(stddev=0.01)


def relative_mask(q_len, m_len):
    mask = tf.sequence_mask(tf.range(1, q_len + 1), q_len, dtype=tf.float32)
    mask = tf.pad(mask, [[0, 0], [m_len, 0]], constant_values=1)
    return mask


def positional_embedding(k_len, d_model):
    inv_freq = 1. / (10000 ** (tf.range(0, d_model, 2.0) / d_model))
    pos_seq = tf.range(k_len - 1, -1, -1.0)
    sinusoid_inp = tf.einsum('i,j->ij', pos_seq, inv_freq)
    pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1)
    return pos_emb[None, :, :]


def point_wise_feed_forward_network(d_model, d_ff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(d_ff, activation='relu',
                              kernel_initializer=INITIALIZER, name='ffn1'),
        tf.keras.layers.Dense(d_model, kernel_initializer=INITIALIZER, name='ffn2')
    ])



In [4]:
class RelMultiHeadAttention(tf.keras.layers.Layer):

    def __init__(self, d_model, num_heads, dropout_rate):
        """
        d_model: the number of features in the query, key and value vectors.
        num_head: the number of heads. 
        dropout_rate: dropout
        """
        super(RelMultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0

        self.num_heads = num_heads
        self.d_model = d_model
        self.d_depth = self.d_model // self.num_heads

        self.w_head = tf.keras.layers.Dense(
            3 * d_model, use_bias=False, kernel_initializer=INITIALIZER)
        self.r_head = tf.keras.layers.Dense(
            d_model, use_bias=False, kernel_initializer=INITIALIZER)

        self.dense = tf.keras.layers.Dense(
            d_model, use_bias=False, kernel_initializer=INITIALIZER)

        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    @staticmethod
    def relative_shift(x):
        x_size = tf.shape(x)
        x = tf.pad(x, [[0, 0], [0, 0], [0, 0], [1, 0]])
        x = tf.reshape(x, (x_size[0], x_size[1], x_size[3] + 1, x_size[2]))
        x = tf.slice(x, [0, 0, 1, 0], [-1, -1, -1, -1])
        x = tf.reshape(x, x_size)
        return x

    def call(self, inputs, pos_emb, r_w_bias, r_r_bias, mems, training, **kwargs):
        """
        inputs: shape=(batch_size, q_len, d_model)
        pos_emb: shape=(1, k_len, d_model)
        u: shape=(num_heads, d_depth)
        v: shape=(num_heads, d_depth)
        mems: shape=(batch_size, m_len, d_model)
        attn_mask: shape=(m_len + q_len, q_len)
        """
        batch_size = tf.shape(inputs)[0]
        q_len = tf.shape(inputs)[1]
        # splice cache
        if mems is None:
            cat = inputs
        else:
            cat = tf.concat((mems, inputs), axis=1)
        cat = self.dropout1(cat, training=training)
        # k_len = m_len + q_len
        k_len = tf.shape(cat)[1]
        m_len = k_len - q_len
        # shape=(1, k_len, d_model)
        pos_emb = pos_emb[:, -k_len:]
        pos_emb = self.dropout2(pos_emb, training=training)

        w_heads = tf.reshape(self.w_head(cat), (
            batch_size, k_len, 3 * self.num_heads, self.d_depth))
        w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, axis=2)
        # shape=(batch_size, q_len, num_heads, d_depth)
        w_head_q = w_head_q[:, -q_len:]

        # shape=(batch_size, num_heads, q_len, k_len)
        # a + c
        ac = tf.einsum('bqnd,bknd->bnqk', w_head_q + r_w_bias, w_head_k)
        r_head_k = tf.reshape(self.r_head(pos_emb), (k_len, self.num_heads, self.d_depth))
        # b + d
        bd = tf.einsum('bqnd,knd->bnqk', w_head_q + r_r_bias, r_head_k)
        bd = self.relative_shift(bd)

        attn_mask = relative_mask(q_len, m_len)
        # shape=(batch_size, num_heads, q_len, k_len)
        # Attention
        attn_score = (ac + bd) / (self.d_depth ** 0.5)
        attn_score = attn_score * attn_mask - 1e30 * (1. - attn_mask)
        attn_score = tf.nn.softmax(attn_score, axis=-1)

        attn_vec = tf.einsum('bnqk,bknd->bqnd', attn_score, w_head_v)
        attn_vec = tf.reshape(attn_vec, (batch_size, q_len, self.d_model))

        attn_out = self.dense(attn_vec)
        return attn_out


class TransformerLayer(tf.keras.layers.Layer):

    def __init__(self, d_model, d_ff, num_heads, dropout_rate):
        super(TransformerLayer, self).__init__()

        self.rel_multihead_attn = RelMultiHeadAttention(
            d_model=d_model, num_heads=num_heads, dropout_rate=dropout_rate)
        # feed forward network
        self.ffn = point_wise_feed_forward_network(d_model, d_ff)
        # layer normalization
        self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        # dropout
        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    def call(self, inputs, pos_emb, r_w_bias, r_r_bias, mems, training, **kwargs):
        #Attention
        attn_out = self.rel_multihead_attn(inputs=inputs, pos_emb=pos_emb,
                                           r_w_bias=r_w_bias, r_r_bias=r_r_bias,
                                           mems=mems, training=training)
        attn_out = self.dropout1(attn_out, training=training)
        #Add the attention results
        out1 = self.layer_norm1(inputs + attn_out)
        # Pass through the feed-forward network
        ffn_out = self.ffn(out1, training=training)
        ffn_out = self.dropout2(ffn_out, training=training)
        #Add the feed-forward results back
        out2 = self.layer_norm2(out1 + ffn_out) 
        return out2


class TransformerXL(tf.keras.Model):

    def __init__(self, n_vocab, d_embed, d_model, d_ff, q_len, m_len, num_heads,
                 n_layer, dropout_rate, untie_rel_bias):
        super(TransformerXL, self).__init__()
        self.d_embed = d_embed
        self.d_model = d_model

        self.q_len = q_len
        self.m_len = m_len
        self.n_layer = n_layer
        self.untie_rel_bias = untie_rel_bias

        # word embedding
        self.embedding = tf.Variable(INITIALIZER((n_vocab, d_embed)), name='embedding')
        # word embedding size to model size
        self.projection = tf.Variable(INITIALIZER((d_embed, d_model)), name='projection')
        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)

        self.pos_emb = positional_embedding(q_len + m_len, d_model)

        shape = (2, n_layer if untie_rel_bias else 1, num_heads, d_model // num_heads)
        self.rw_bias = tf.Variable(INITIALIZER(shape), name='rw_bias')
        self.logit_bias = tf.Variable(tf.zeros((n_vocab,)), name='logit_bias')

        self.multihead_layers = []
        for i in range(n_layer):
            layer = TransformerLayer(d_model=d_model, d_ff=d_ff, num_heads=num_heads,
                                     dropout_rate=dropout_rate)
            self.multihead_layers.append(layer)

    def cache_mems(self, cur_out, pre_mem):
        if self.m_len is None or self.m_len <= 0:
            return None
        if pre_mem is None:
            new_mem = cur_out
        else:
            new_mem = tf.concat((pre_mem, cur_out), axis=1)
        return tf.stop_gradient(new_mem[:, -self.m_len:]) # don't backpropagate to cache

    def call(self, inputs, mems=None, training=False, **kwargs):
        new_mems = []
        x = tf.nn.embedding_lookup(self.embedding, inputs)
        x = tf.matmul(x, self.projection)

        if mems is None:
            mems = [None] * self.n_layer

        for i in range(self.n_layer):
            new_mems.append(self.cache_mems(x, mems[i]))
            j = i if self.untie_rel_bias else 0
            x = self.multihead_layers[i](inputs=x,
                                         pos_emb=self.pos_emb,
                                         r_w_bias=self.rw_bias[0][j],
                                         r_r_bias=self.rw_bias[1][j],
                                         mems=mems[i],
                                         training=training)

        x = self.dropout1(x, training=training)
        # share embedding parameters with inputs
        # shape=(batch_size, seq_len, d_embed)
        # tf.einsum('bik,jk->bij', x, self.projection)
        x = tf.matmul(x, self.projection, transpose_b=True)
        # shape=(batch_size, seq_len, n_vocab)
        x = tf.matmul(x, self.embedding, transpose_b=True) + self.logit_bias

        return x, new_mems

## Training code

In [5]:
from google.colab import drive
drive.mount('/content/drive/')


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [6]:
import sys
sys.path.append('/content/drive/MyDrive/Study/NLP')

In [7]:
from data_utils_tf import Corpus, Vocab

In [8]:
vocab_file = '/content/drive/MyDrive/Study/NLP/Data/train.txt'
vocab_save_dir = '/content/drive/MyDrive/Study/NLP/Data/Data transformer'
vocab = Vocab(vocab_file, vocab_save_dir, min_freq=2, max_size=10000)

final vocab size 10000 from 28913 unique tokens.


In [9]:
VOCAB_FILE = '/content/drive/MyDrive/Study/NLP/Data/Data transformer/vocab.pkl'
# dataset path
DATA_PATH = '/content/drive/MyDrive/Study/NLP/Data/'
# path for training and valid output, such as save model
OUTPUT_PATH = '/content/drive/MyDrive/Study/NLP/output/'
# tensorboard summary
SUMMARY_PATH = '/content/drive/MyDrive/Study/NLP/summary/'
BATCH_SIZE = 64
# target length, or sequence length
SEQ_LEN = 50
# memory length
MEM_LEN = 16
# word embeeding size
EMBEDDING_SIZE = 128
# multihead attetion hidden size
HIDDEN_SIZE = 128
# feed forward network hidden size
FFN_SIZE = 1024
# number of heads of multiheads
NUM_HEADS = 4
# number of layers of multihead attention
N_LAYER = 6
DROPOUT_RATE = 0.1
# wheather the bias of each layer of relative multihead attention is different or not
UNTIE_REL_BIAS = True
# training steps
STEPS = 200000
# warmup steps in the begging of training
WARMUP_STEPS = 0
# initial learning rate
LEARNING_RATE = 0.0001
# minimal learning rate
MIN_LEARNING_RATE = 0.004
# clips values of multiple tensors by the ratio of the sum of their norms
CLIP_NORM = 0.25
# number of steps between show information during training
VERBOSE_STEP = 100
# number of steps between save model
SAVE_STEP = 2000
# number of steps between verify model
VALID_STEP = 500
EARLY_STOPPING_TIMES = 5


In [10]:
class CosineDecayWarmup(tf.keras.optimizers.schedules.LearningRateSchedule):

    def __init__(self, init_lr, steps, warmup_steps, min_lr):
        super(CosineDecayWarmup, self).__init__()

        self.init_lr = init_lr
        self.warmup_steps = warmup_steps
        self.cosine_decay = tf.keras.experimental.CosineDecay(
            init_lr, steps - warmup_steps, min_lr)

    def __call__(self, step):
        linear_increase = self.init_lr * tf.cast(step, tf.float32) / (
                tf.cast(self.warmup_steps, tf.float32) + 1e-5)
        cosine_decay = self.cosine_decay(step)
        return tf.cond(pred=step <= self.warmup_steps,
                       true_fn=lambda: linear_increase,
                       false_fn=lambda: cosine_decay)

    def get_config(self):
        return {
            'warmup_steps': self.warmup_steps,
            'init_lr': self.init_lr
        }


def model_fn():
    model = TransformerXL(n_vocab=corpus.vocab.size, d_embed=EMBEDDING_SIZE,
                          d_model=HIDDEN_SIZE, d_ff=FFN_SIZE, q_len=SEQ_LEN,
                          m_len=MEM_LEN,
                          num_heads=NUM_HEADS, n_layer=N_LAYER, dropout_rate=DROPOUT_RATE,
                          untie_rel_bias=UNTIE_REL_BIAS)

    return model


corpus = Corpus(path=DATA_PATH, vocab=Vocab(VOCAB_FILE))
model = model_fn()


def logits_to_symbols(logits):
    indices = np.argmax(logits, axis=-1)
    if np.ndim(indices) <= 1:
        indices = [indices]
    return corpus.vocab.get_symbols(indices, join=True)


def loss_function(labels, logits):
    """loss function"""
    loss = tf.keras.losses.sparse_categorical_crossentropy(
        labels, logits, from_logits=True)
    loss = tf.reduce_mean(loss)
    return loss


@tf.function
def train_step(inputs, labels, optimizer, mems):
    """train a batch"""
    with tf.GradientTape() as tape:
        logits, new_mems = model(inputs, mems=mems, training=True)
        loss = loss_function(labels, logits)

    gradients = tape.gradient(loss, model.trainable_variables)
    clipped, gnorm = tf.clip_by_global_norm(gradients, CLIP_NORM)
    optimizer.apply_gradients(zip(clipped, model.trainable_variables))

    return loss, logits, new_mems


def train():
    ckpt = tf.train.Checkpoint(model=model)
    ckpt_manager = tf.train.CheckpointManager(
        ckpt, OUTPUT_PATH, max_to_keep=3, checkpoint_name='xl-ckpt')
    writer = tf.summary.create_file_writer(SUMMARY_PATH)

    
    learning_rate = CosineDecayWarmup(LEARNING_RATE, STEPS, WARMUP_STEPS,
                                      MIN_LEARNING_RATE)
    optimizer = tf.keras.optimizers.Adam(learning_rate)
    # create corpus dataset
    train_dataset = corpus.get_dataset('train', batch_size=BATCH_SIZE, seq_len=SEQ_LEN)
    mems = None

    train_loss = tf.keras.metrics.Mean(name='train_loss')

    old_time = time.time()
    for step, batch in enumerate(train_dataset):
        loss, logits, mems = train_step(
            batch['inputs'], batch['labels'], optimizer=optimizer, mems=mems)
        train_loss(loss)


        if step % VERBOSE_STEP == 0:
            print('{} step: {} | loss: {:.4f} | lr: {} | {:.2f} step/s'.format(
                time.strftime("%Y-%m-%d %H:%M:%S"),
                step,
                train_loss.result(),
                learning_rate(step),
                VERBOSE_STEP / (time.time() - old_time)))
            old_time = time.time()

            with writer.as_default():
                tf.summary.scalar('train_loss', train_loss.result(), step=step)
            train_loss.reset_states()

            inps = corpus.vocab.get_symbols(batch['inputs'], join=True)[:3]
            outs = logits_to_symbols(logits)[:5]
            print(inps, '\n', outs, '\n', sep='')

        if step % SAVE_STEP == 0:
            print('saving checkpoint for epoch {} at {}'.format(
                step, ckpt_manager.save()))

        if step % VALID_STEP == 0:
            loss = evaluate()
            print(f'====\nvalidation average loss: {loss:.3f}\n====')
            with writer.as_default():
                tf.summary.scalar('valid_loss', loss, step=step)

        if step >= STEPS:
            print(f'reach max step of iteations {STEPS}, training completed.')
            break


def evaluate():

    mems = [None] * model.n_layer
    total_loss, total_cnt = 0., 0

    valid_dataset = corpus.get_dataset('valid', batch_size=8, seq_len=SEQ_LEN)
    for batch in valid_dataset:
        inputs, labels = batch['inputs'], batch['labels']
        logits, mems = model(inputs, mems=mems, training=False)
        loss = loss_function(labels, logits)
        # statistic total loss
        cnt = np.prod(np.shape(labels))
        total_cnt += cnt
        total_loss += loss * cnt

    avg_loss = total_loss / total_cnt
    return avg_loss

vocabulary loaded from `/content/drive/MyDrive/Study/NLP/Data/Data transformer/vocab.pkl`


In [12]:
train()

2022-01-14 11:18:40 step: 0 | loss: 3.9730 | lr: 0.0 | 16.39 step/s
['=valkyriachroniclesiii=<eos><unk>novalkyria3:<unk>chronicles(japanese:<unk>,lit.valkyriaofthebattlefield3),commonlyreferredtoasvalkyriachroniclesiiioutsidejapan,isatacticalrole@-@playingvideogamedevelopedby<unk>and<unk>', 'allthingsas<unk>ofhimself;andamun,accordingtothemythspromotedbyhis<unk>,precededandcreatedtheothercreatorgods.theseandotherversionsoftheeventsofcreationwerenotseenascontradictory.eachgivesadifferentperspectiveonthecomplex', "inthelastsectionsofthebook.<eos>===<unk>===<eos>thestorylineconcerningtheassassinationprimarilyfollowsthefourconspiratorswhodirectlyparticipateintrujillo'sdeath.antonio<unk><unk>isoneofthefewconspiratorswhosurvivestheviolent<unk>that"]

saving checkpoint for epoch 0 at /content/drive/MyDrive/Study/NLP/output/xl-ckpt-1
====
validation average loss: 4.233
====
2022-01-14 11:20:11 step: 100 | loss: 3.5322 | lr: 9.99999392661266e-05 | 1.10 step/s
['allowedsafepassageinanydirectionc

KeyboardInterrupt: ignored

### Get the trained model

In [11]:
checkpoint_path = tf.train.latest_checkpoint(OUTPUT_PATH)
print('restoring model from {}'.format(checkpoint_path))
tf.train.Checkpoint(model=model).restore(checkpoint_path)

restoring model from /content/drive/MyDrive/Study/NLP/output/xl-ckpt-1


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f73a0102b90>

### Perplexity 

In [13]:
los = evaluate()

In [15]:
import math
print(math.exp(los))

69.53471035280468


### Plot summary

In [59]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [61]:
!tensorboard dev upload \
  --logdir /content/drive/MyDrive/Study/NLP/summary \
  --name "(optional) My latest experiment" \
  --description "(optional) Simple comparison of several hyperparameters" \
  --one_shot


***** TensorBoard Uploader *****

This will upload your TensorBoard logs to https://tensorboard.dev/ from
the following directory:

/content/drive/MyDrive/Study/NLP/summary

This TensorBoard will be visible to everyone. Do not upload sensitive
data.

Your use of this service is subject to Google's Terms of Service
<https://policies.google.com/terms> and Privacy Policy
<https://policies.google.com/privacy>, and TensorBoard.dev's Terms of Service
<https://tensorboard.dev/policy/terms/>.

This notice will not be shown again while you are logged into the uploader.
To log out, run `tensorboard dev auth revoke`.

Continue? (yes/NO) yes

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=373649185512-8v619h5kft38l4456nm2dj4ubeqsrvh6.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email&state=LHVSccvoue1rOHstb2tfU1KKkwbRbs&prompt=