In [1]:
import tensorflow as tf
import pickle
import unicodedata
import re
import numpy as np
import os
import io
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

## Read data

In [2]:
# Directory prefix for the pickled tokenizer and the .npy training tensors;
# it is concatenated directly with file names, so it must end with a slash.
PATH = './data/'

In [3]:
# Hyper-parameters shared by the cells below.
BATCH_SIZE = 16          # examples per dataset batch; also the default batch_size of Train.train_step
ENC_HIDDEN_SIZE = 512    # units per direction of the bidirectional encoder LSTM
DEC_HIDDEN_SIZE = 512    # units of every decoder LSTM layer and of the attention projections
NUM_LAYER = 4            # intended decoder depth (NOTE(review): Decoder is built with a literal 4 below)
DROP_OUT = 0.2           # `dropout` argument of the decoder LSTM layers
embedding_dim = 512      # word-embedding width
speaker_dim = 128        # speaker/addressee embedding width
MAXLEN = 50              # sequence length assumed by bow_loss_function (it tiles MAXLEN-1 steps)
speakerNum = 14          # number of rows in the decoder's speaker embedding table
EPOCHS = 10              # NOTE(review): not read by any cell visible in this notebook
LATENT_SIZE = 200        # size of the latent variable z
# KEEP_PROB = 1.0        # only referenced by commented-out dropout code in Generation_Network

In [4]:
# SECURITY NOTE: pickle.load and np.load(..., allow_pickle=True) can execute
# arbitrary code while deserialising; only load files from a trusted source.
# os.path.join keeps the paths valid whether or not PATH ends with a separator.
with open(os.path.join(PATH, 'tokenizer.pickle'), 'rb') as handle:
    tokenizer = pickle.load(handle)
d_tensor_train = np.load(os.path.join(PATH, 'd_tensor_train.npy'), allow_pickle=True)
r_tensor_train = np.load(os.path.join(PATH, 'r_tensor_train.npy'), allow_pickle=True)

# Every row is read as a pair: element 0 is the token sequence and element 1 an id.
# NOTE(review): judging by the names, the dialogue rows carry the addressee id
# and the response rows the speaker id -- confirm against the preprocessing code.
dia_train = [row[0] for row in d_tensor_train]
aid_train = [row[1] for row in d_tensor_train]
res_train = [row[0] for row in r_tensor_train]
sid_train = [row[1] for row in r_tensor_train]

In [5]:
# First 32 examples of every training list (a small subset for quick experiments).
sample_dia_train, sample_aid_train, sample_res_train, sample_sid_train = (
    full_list[:32]
    for full_list in (dia_train, aid_train, res_train, sid_train)
)

In [77]:
# Shuffle buffer and epoch length are both sized from the first 32 training rows only.
BUFFER_SIZE = len(d_tensor_train[:32])
steps_per_epoch = int(np.ceil(BUFFER_SIZE / BATCH_SIZE))
# One extra id on top of the tokenizer vocabulary; the loss functions treat id 0 as padding.
vocab_size = 1 + len(tokenizer.word_index)

In [7]:
# create tf.dataset
dataset = tf.data.Dataset.from_tensor_slices((dia_train, res_train, sid_train, aid_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=False)
print("Create dataset done.")

Create dataset done.


## Word embedding

In [8]:
class Word_Embedding(tf.keras.Model):
    """Keras model that wraps a single token-embedding lookup table."""

    def __init__(self, vocab_size, embedding_dim):
        """Create an embedding table with ``vocab_size`` rows of width ``embedding_dim``."""
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    def call(self, x):
        """Return the embedding vectors for the token ids in ``x``."""
        embedded = self.embedding(x)
        return embedded

## Encoder (Bidirectional LSTM)

In [9]:
class Encoder(tf.keras.Model):
    """Bidirectional LSTM encoder that operates on already-embedded sequences.

    ``vocab_size`` is accepted but not used, and ``batch_size`` is only stored;
    both are kept so existing constructor calls continue to work.
    """

    def __init__(self, hidden_size, vocab_size, batch_size=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        # Options common to the forward and the backward LSTM.
        lstm_options = dict(return_sequences=True,
                            return_state=True,
                            recurrent_initializer='glorot_uniform')
        self.fw_layer = tf.keras.layers.LSTM(self.hidden_size, **lstm_options)
        self.bw_layer = tf.keras.layers.LSTM(self.hidden_size,
                                             go_backwards=True,
                                             **lstm_options)
        self.bi_rnn = tf.keras.layers.Bidirectional(self.fw_layer,
                                                    merge_mode='concat',
                                                    backward_layer=self.bw_layer)

    def call(self, d):
        """Encode ``d`` and return ``(output, hidden, c)``.

        ``hidden`` and ``c`` are the forward and backward final states
        concatenated along axis 1.
        """
        output, fw_h, fw_cell, bw_h, bw_cell = self.bi_rnn(d)
        hidden = tf.concat([fw_h, bw_h], axis=1)
        c = tf.concat([fw_cell, bw_cell], axis=1)
        return output, hidden, c

In [10]:
# Pull one (dialogue, response, speaker id, addressee id) batch to smoke-test the modules below.
example_input, example_target,example_sid, example_aid = next(iter(dataset))

In [11]:
# Inspect the shape of the sampled response batch.
example_target.shape

TensorShape([16, 50])

In [12]:
# Embedding model used for both the dialogue and the response tokens in the cells below.
word_embedding = Word_Embedding(vocab_size,embedding_dim)

In [13]:
# Smoke test: run the encoder on the embedded example dialogue and response.
encoder = Encoder(ENC_HIDDEN_SIZE, vocab_size, BATCH_SIZE)
# No explicit initial state is passed; the encoder is called with the embedded input only.
inp_emb = word_embedding(example_input)
enc_output,enc_hidden,enc_c = encoder(inp_emb)
print("encoder hidden state.shape:{}".format(enc_hidden.shape))
print("encoder cell state.shape:{}".format(enc_c.shape))
# The same encoder instance also encodes the response; only its hidden state is kept.
targ_emb = word_embedding(example_target)
_,targ_hidden,_ = encoder(targ_emb)

encoder hidden state.shape:(16, 1024)
encoder cell state.shape:(16, 1024)


In [83]:
# Shape of the embedded response at a single time step (index 1 on axis 1).
targ_emb[:,1,:].shape

TensorShape([16, 512])

## VAE

In [14]:
def sample_gaussian(mu, logvar):
    """Draw a reparameterised sample ``z = mu + exp(0.5 * logvar) * epsilon``.

    Args:
        mu: mean tensor of the Gaussian.
        logvar: log-variance tensor; the noise is drawn with its shape.

    Returns:
        A tensor with one sample per element, where ``epsilon`` is standard
        normal noise.
    """
    # Use the dynamic shape (and matching dtype) so sampling also works when
    # the static shape has unknown dimensions, e.g. inside a tf.function.
    epsilon = tf.random.normal(tf.shape(logvar), dtype=logvar.dtype)
    std = tf.exp(0.5 * logvar)
    return mu + tf.multiply(std, epsilon)

In [15]:
class Recognition_Network(tf.keras.Model):
    """Single dense layer that maps (context, response) encodings to a mean and log-variance."""

    def __init__(self, latent_size):
        super().__init__()
        # One projection emits mean and log-variance side by side.
        self.rec_nn = tf.keras.layers.Dense(latent_size * 2)

    def call(self, enc_hidden, targ_hidden):
        """Return ``(recog_mu, recog_logvar)`` for the concatenated encodings."""
        joint_encoding = tf.concat([enc_hidden, targ_hidden], axis=1)
        mu_and_logvar = self.rec_nn(joint_encoding)
        # First half of the features is the mean, second half the log-variance.
        recog_mu, recog_logvar = tf.split(mu_and_logvar, 2, axis=1)
        return recog_mu, recog_logvar

In [16]:
# Smoke test: recognition-network mean / log-variance for the example batch.
recognition_network = Recognition_Network(LATENT_SIZE)
recog_mu,recog_logvar = recognition_network(enc_hidden, targ_hidden)

In [17]:
class Prior_Network(tf.keras.Model):
    """Two stacked dense layers that map the context encoding to a mean and log-variance."""

    def __init__(self, latent_size):
        super().__init__()
        # Intermediate layer has at least 100 units.
        # NOTE(review): neither Dense layer specifies an activation, so the two
        # projections are applied back to back -- confirm this is intended.
        self.fc = tf.keras.layers.Dense(max(latent_size * 2, 100))
        self.pri_nn = tf.keras.layers.Dense(latent_size * 2)

    def call(self, enc_hidden):
        """Return ``(prior_mu, prior_logvar)`` computed from ``enc_hidden``."""
        mu_and_logvar = self.pri_nn(self.fc(enc_hidden))
        # First half of the features is the mean, second half the log-variance.
        prior_mu, prior_logvar = tf.split(mu_and_logvar, 2, axis=1)
        return prior_mu, prior_logvar

In [18]:
# Smoke test: prior-network mean / log-variance for the example batch.
prior_network = Prior_Network(LATENT_SIZE)
prior_mu,prior_logvar = prior_network(enc_hidden)

In [19]:
# Global switch, also read by Train.train_step: False samples z from the
# recognition network, True samples it from the prior network.
# NOTE(review): the original author suspected the prior should be used for the test set -- confirm.
use_prior = False
latent_sample = tf.cond(use_prior,lambda:sample_gaussian(prior_mu,prior_logvar),
                       lambda:sample_gaussian(recog_mu,recog_logvar))

In [20]:
# Inspect the shape of the sampled latent variable.
latent_sample.shape

TensorShape([16, 200])

In [21]:
class Generation_Network(tf.keras.Model):
    """Produces the decoder's initial state and bag-of-words logits from (context, z)."""

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.bow_fc = tf.keras.layers.Dense(400)
        self.bow_logits = tf.keras.layers.Dense(vocab_size)
        self.init_fc = tf.keras.layers.Dense(hidden_size)

    def call(self, enc_hidden, latent_sample):
        """Return ``([dec_init_state, dec_init_state], bow_logit)``.

        The decoder is an LSTM, so the same projected vector is handed back
        twice: once as its hidden state and once as its cell state.
        """
        gen_inputs = tf.concat([enc_hidden, latent_sample], axis=1)
        # Bag-of-words branch: two dense layers ending in vocabulary-sized logits.
        bow_logit = self.bow_logits(self.bow_fc(gen_inputs))
        # Decoder-initialisation branch.
        dec_init_state = self.init_fc(gen_inputs)
        return [dec_init_state, dec_init_state], bow_logit

In [22]:
# Smoke test: decoder initial state and bag-of-words logits for the example batch.
generation_network = Generation_Network(vocab_size,DEC_HIDDEN_SIZE)
dec_init_state, bow_logit = generation_network(enc_hidden,latent_sample)

## Decoder (old one)

In [23]:
class Attention_Feed(tf.keras.Model):
    """Additive attention: score = V(tanh(W1(values) + W2(query)))."""

    def __init__(self, hidden_size):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(hidden_size)
        self.W2 = tf.keras.layers.Dense(hidden_size)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Return ``(context_vector, attention_weights)``.

        The weights are a softmax over axis 1 of ``values``; the context
        vector is the weighted sum of ``values`` over that axis.
        """
        # Insert an axis at position 1 so the query broadcasts against `values`.
        query_with_time_axis = tf.expand_dims(query, 1)
        projected = self.W1(values) + self.W2(query_with_time_axis)
        score = self.V(tf.nn.tanh(projected))
        attention_weights = tf.nn.softmax(score, axis=1)

        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        return context_vector, attention_weights

In [24]:
# Smoke test: attend over the encoder outputs using the encoder hidden state as the query.
attention = Attention_Feed(DEC_HIDDEN_SIZE)
context_vector, attention_weights = attention(enc_hidden, enc_output)
print(context_vector.shape)

(16, 1024)


In [52]:
class Decoder(tf.keras.Model):
    """Attention-fed, persona-aware stacked-LSTM decoder.

    Each call concatenates an attention context over the encoder outputs, the
    embedded target input and a persona vector (the speaker embedding, or a
    speaker/addressee combination), runs the result through ``num_layers``
    LSTM layers and projects it to vocabulary log-probabilities.

    Reads the notebook-level globals ``speakerNum`` (rows of the speaker
    embedding table) and ``DROP_OUT`` (LSTM ``dropout`` argument).
    """

    def __init__(self, hidden_size, vocab_size, speaker_dim, num_layers=1):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # One embedding table serves both speaker ids and addressee ids.
        self.speaker_embedding = tf.keras.layers.Embedding(speakerNum, speaker_dim)
        self.output_size = vocab_size  # vocabulary size
        self.lstm_1 = tf.keras.layers.LSTM(self.hidden_size,
                                           return_sequences=True,
                                           return_state=True,
                                           dropout=DROP_OUT,
                                           recurrent_initializer='glorot_uniform')
        # Layers 2..num_layers, stacked on top of lstm_1.
        self.lstms = []
        for k in range(self.num_layers - 1):
            self.lstms.append(tf.keras.layers.LSTM(self.hidden_size,
                                                   return_sequences=True,
                                                   return_state=True,
                                                   dropout=DROP_OUT))
        self.fc = tf.keras.layers.Dense(self.output_size)
        # Projections used by combine_user_vector.
        self.W1 = tf.keras.layers.Dense(speaker_dim)
        self.W2 = tf.keras.layers.Dense(speaker_dim)
        # attention feed on context
        self.attention = Attention_Feed(self.hidden_size)

    def call(self, target, enc_output, init_state, speaker_id, addressee_id=None):
        """Run the decoder on ``target`` and return log-probabilities.

        Args:
            target: embedded decoder input (the notebook feeds one time step
                at a time, with an explicit axis 1 of length 1).
            enc_output: encoder outputs attended over.
            init_state: ``[hidden, cell]`` initial state for the first LSTM
                layer; ``hidden`` is also the attention query.
            speaker_id: ids looked up in the speaker embedding table.
            addressee_id: optional ids; when given, the persona vector is
                ``combine_user_vector(speaker, addressee)`` instead of the
                plain speaker embedding.

        Returns:
            ``(output, state, c, attention_weights)`` where ``output`` is the
            log-softmax over the vocabulary with the leading axes flattened,
            and ``state`` / ``c`` are the final hidden and cell state of the
            last LSTM layer.
        """
        hidden = init_state[0]
        context_vector, attention_weights = self.attention(hidden, enc_output)
        # personas
        speaker = self.speaker_embedding(speaker_id)

        if addressee_id is not None:
            addressee = self.speaker_embedding(addressee_id)
            persona = self.combine_user_vector(speaker, addressee)
        else:
            persona = speaker
        features = tf.concat([target, tf.expand_dims(persona, 1)], axis=-1)
        r = tf.concat([tf.expand_dims(context_vector, 1), features], axis=-1)

        # Pass the concatenated vector through the LSTM stack. Each layer is
        # initialised with the final (hidden, cell) state of the layer below.
        # The original mixed layer 1's hidden state with later cell states and
        # left `state` undefined when num_layers == 1.
        output, state, c = self.lstm_1(r, initial_state=init_state)
        for lstm_layer in self.lstms:
            output, state, c = lstm_layer(output, initial_state=[state, c])
        output = tf.reshape(output, (-1, output.shape[2]))
        # log_softmax used before
        output = tf.nn.log_softmax(self.fc(output), axis=1)
        return output, state, c, attention_weights

    def combine_user_vector(self,i_em, j_em):
        """Combine two persona embeddings as ``tanh(W1(i_em) + W2(j_em))``."""
        V_ij = tf.nn.tanh(self.W1(i_em) + self.W2(j_em))
        return V_ij

In [27]:
# Shape of the hidden part of the decoder's initial state.
dec_init_state[0].shape

TensorShape([16, 512])

In [29]:
# Smoke test: one decoding step from a batch of '<sos>' tokens, speaker persona only.
# The depth comes from NUM_LAYER rather than a repeated literal.
decoder = Decoder(DEC_HIDDEN_SIZE, vocab_size, speaker_dim, NUM_LAYER)
targ = word_embedding(tf.expand_dims([tokenizer.word_index['<sos>']] * BATCH_SIZE, 1))
output, state, _, _ = decoder(targ, enc_output, dec_init_state, example_sid)

In [30]:
# Inspect the shape of the decoder's log-probability output.
output.shape

TensorShape([16, 26028])

## Training

In [31]:
def count_real_word(real):
    """Count the non-zero (non-padding) token ids along axis 1 of ``real``.

    Returns a float32 tensor with one count per row.
    """
    is_real_token = tf.math.logical_not(tf.math.equal(real, 0))
    return tf.math.reduce_sum(tf.cast(is_real_token, dtype=tf.float32), 1)

In [32]:
def bow_loss_function(reals,bow_logit):
    """Bag-of-words loss: every target token is predicted from one shared logit vector.

    Args:
        reals: integer token ids; the first column is skipped and id 0 is
            treated as padding.
        bow_logit: one vocabulary-sized logit vector per example.

    Returns:
        Scalar: the per-example sum of masked cross-entropies, averaged over
        the batch.
    """
    # TODO(review): the original author asked whether this should be
    # normalised by sentence length; it is currently a plain per-example sum.
    labels = reals[:,1:]
    mask = tf.math.logical_not(tf.math.equal(labels, 0))

    # Repeat the logits once per label position. The count comes from the
    # labels themselves rather than MAXLEN-1, so any target length works.
    num_steps = tf.shape(labels)[1]
    bow_logit_tile = tf.tile(tf.expand_dims(bow_logit,1),[1,num_steps,1])
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                           logits=bow_logit_tile)
    mask = tf.cast(mask, dtype=loss_.dtype)
    bow_loss = tf.reduce_sum(loss_*mask,axis = 1)
    return tf.reduce_mean(bow_loss)

In [33]:
# Smoke test: bag-of-words loss for the example batch.
avg_bow_loss = bow_loss_function(example_target,bow_logit)
print(avg_bow_loss)

tf.Tensor(136.69319, shape=(), dtype=float32)


In [34]:
def rc_loss_function(real, pred,word_per_line):
    """Masked reconstruction loss for one decoding step.

    Args:
        real: target token ids for this step; id 0 is treated as padding.
        pred: decoder scores for this step, passed as ``logits``.
        word_per_line: number of real tokens per example, used to normalise
            each example's loss.

    Returns:
        Scalar: sum over the batch of the masked cross-entropy divided by
        ``word_per_line``.
    """
    # Zero out the loss wherever the target is padding.
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ = loss_ * mask
    # divide_no_nan returns 0 for a line with no real words instead of a
    # 0/0 NaN that would poison the summed loss.
    word_per_line = tf.cast(word_per_line, dtype=loss_.dtype)
    return tf.math.reduce_sum(tf.math.divide_no_nan(loss_, word_per_line))

In [35]:
# Smoke test: reconstruction loss of the single decoder step against target column 1.
word_per_line = count_real_word(example_target)
rc_loss = rc_loss_function(example_target[:,1], output,word_per_line)
print("rc_loss:{}".format(rc_loss.numpy()))

rc_loss:17.32868003845215


In [36]:
# NOTE(review): not read by any code visible in this notebook; loss_function
# always applies its KL weight regardless of this flag.
isAnnealing = True

In [37]:
def loss_function(avg_rc_loss,avg_bow_loss,recog_mu,recog_logvar,prior_mu,prior_logvar,global_iter,kl_full_step):
    """Combine reconstruction, KL and bag-of-words terms.

    The KL term between the recognition and the prior Gaussian is summed over
    axis 1 and averaged over the batch. Its weight grows as
    ``global_iter / kl_full_step`` and is capped at 1.

    Returns:
        ``(elbo, aug_elbo, avg_kld)`` where ``elbo = avg_rc_loss + kl_weight * avg_kld``
        and ``aug_elbo = avg_bow_loss + elbo``.
    """
    prior_var = tf.exp(prior_logvar)
    logvar_gap = recog_logvar - prior_logvar
    scaled_sq_mean_gap = tf.math.divide(tf.pow(prior_mu - recog_mu, 2), prior_var)
    var_ratio = tf.math.divide(tf.exp(recog_logvar), tf.exp(prior_logvar))
    per_dim = 1 + logvar_gap - scaled_sq_mean_gap - var_ratio

    kld = -1/2 * tf.reduce_sum(per_dim, axis=1)
    avg_kld = tf.reduce_mean(kld)

    # KL weight ramps with the iteration count and saturates at 1.
    annealing_progress = tf.cast(global_iter/kl_full_step, dtype=tf.float32)
    kl_weight = tf.minimum(annealing_progress, 1.0)

    elbo = avg_rc_loss + kl_weight * avg_kld
    aug_elbo = avg_bow_loss + elbo
    return elbo, aug_elbo, avg_kld

In [38]:
# Smoke test: combined loss at iteration 1 with the KL weight reaching 1 after 2000 iterations.
elbo,aug_elbo,avg_kld = loss_function(rc_loss,avg_bow_loss,recog_mu,recog_logvar,prior_mu,prior_logvar,1,2000)
print("ELBO:{}".format(elbo))
print("Augmented elbo:{}".format(aug_elbo))
print("Kl divergence:{}".format(avg_kld))

ELBO:17.328733444213867
Augmented elbo:154.0219268798828
Kl divergence:0.10809069126844406


In [85]:
class Train(object):
    """Bundles the sub-models with an optimizer and runs the training loops.

    Relies on the notebook-level helpers ``count_real_word``,
    ``sample_gaussian``, ``rc_loss_function``, ``bow_loss_function`` and
    ``loss_function``, and on the global flag ``use_prior``.
    """

    def __init__(self,word_embedding,encoder,decoder,recognition_network,prior_network,generation_network,optimizer,tokenizer):
        self.word_embedding = word_embedding
        self.encoder = encoder
        self.decoder = decoder
        self.recognition_network  = recognition_network
        self.prior_network = prior_network
        self.generation_network = generation_network
        self.tokenizer = tokenizer
        self.optimizer = optimizer

    def train_step(self,inp, targ,global_iter,kl_full_step,speaker_id,batch_size=BATCH_SIZE, addressee_id=None):
        """Run one optimisation step on a single batch.

        Args:
            inp: dialogue token ids.
            targ: response token ids; id 0 is treated as padding by the losses.
            global_iter: iteration counter used for the KL weight.
            kl_full_step: iteration count at which the KL weight reaches 1.
            speaker_id: speaker ids passed to the decoder.
            batch_size: kept for interface compatibility; not used.
            addressee_id: optional addressee ids passed to the decoder.

        Returns:
            ``(elbo, avg_bow_loss, avg_rc_loss, kl_loss)`` for this batch.
        """
        word_per_line = count_real_word(targ)
        with tf.GradientTape() as tape:
            inp_emb = self.word_embedding(inp)
            targ_emb = self.word_embedding(targ)
            enc_output,enc_hidden,enc_c = self.encoder(inp_emb)
            _,targ_hidden,_ = self.encoder(targ_emb)
            # VAE module
            prior_mu,prior_logvar = self.prior_network(enc_hidden)
            recog_mu,recog_logvar = self.recognition_network(enc_hidden,targ_hidden)

            latent_sample = tf.cond(use_prior,lambda:sample_gaussian(prior_mu,prior_logvar),
                       lambda:sample_gaussian(recog_mu,recog_logvar))
            dec_init_state,bow_logit = self.generation_network(enc_hidden,latent_sample)

            # The first decoder input is the embedded first target token.
            dec_input = tf.expand_dims(targ_emb[:,0,:],1)

            rc_loss = 0
            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                if addressee_id is not None:
                    predictions, dec_hidden,dec_c, _ = self.decoder(
                        dec_input,enc_output, dec_init_state,speaker_id,addressee_id)
                else:
                    predictions, dec_hidden, dec_c,_ = self.decoder(
                        dec_input,enc_output, dec_init_state,speaker_id)
                # The returned state seeds the next time step.
                dec_init_state = [dec_hidden,dec_c]
                rc_loss += rc_loss_function(targ[:, t], predictions, word_per_line)

                # using teacher forcing
                dec_input = tf.expand_dims(targ_emb[:,t,:],1)

            avg_rc_loss = (rc_loss / int(targ.shape[0]))
            avg_bow_loss = bow_loss_function(targ,bow_logit)
            elbo,aug_elbo,kl_loss = loss_function(avg_rc_loss,avg_bow_loss,recog_mu,recog_logvar,prior_mu,prior_logvar,global_iter,kl_full_step)

        # The word embedding runs inside the tape, so its variables must be
        # optimised too; the original list omitted them and left the
        # embeddings at their initial values.
        variables = (self.word_embedding.trainable_variables
                     + self.encoder.trainable_variables
                     + self.decoder.trainable_variables
                     + self.recognition_network.trainable_variables
                     + self.prior_network.trainable_variables
                     + self.generation_network.trainable_variables)
        # Gradients are taken after leaving the tape context so that the
        # backward pass itself is not recorded.
        gradients = tape.gradient(aug_elbo, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))
        return elbo,avg_bow_loss,avg_rc_loss,kl_loss

    def _run_epochs(self, epochs, isAddressee, steps_per_epoch, dataset,
                    log_every=1, show_batch_size=False,
                    checkpoint=None, checkpoint_prefix=None):
        """Shared epoch loop behind ``run_iter`` and ``run_iter_test``."""
        global_iter = 0
        # The KL weight reaches 1 halfway through the planned iterations.
        kl_full_step = steps_per_epoch * epochs / 2
        for e in range(epochs):
            start = time.time()
            elbo_losses = 0
            bow_losses = 0
            rc_losses = 0
            kl_losses = 0
            batches_seen = 0
            for (batch, (inp, targ,sid,aid)) in enumerate(dataset.take(steps_per_epoch)):
                batch_sz = targ.shape[0]
                if show_batch_size:
                    print("batch size:{}".format(batch_sz))
                # global iter to count iter
                elbo,avg_bow_loss,avg_rc_loss,kl_loss = self.train_step(
                    inp, targ, global_iter, kl_full_step, sid, batch_sz,
                    aid if isAddressee else None)
                elbo_losses += elbo
                bow_losses += avg_bow_loss
                rc_losses += avg_rc_loss
                kl_losses += kl_loss
                global_iter += 1
                batches_seen += 1
                if batch % log_every == 0:
                    print('Epoch {} Batch {} ELBO {:.4f} BOW LOSS {:.4f} RC LOSS {:.4f} KL LOSS {:.4f}'.format(e + 1,
                                                                 batch,
                                                                 elbo.numpy(),avg_bow_loss.numpy(),
                                                                 avg_rc_loss.numpy(),kl_loss.numpy()))

            # saving (checkpoint) the model every 2 epochs
            if checkpoint is not None and (e + 1) % 2 == 0:
                checkpoint.save(file_prefix = checkpoint_prefix)

            # Average over the batches actually processed, which can be fewer
            # than steps_per_epoch if the dataset runs out early.
            denom = max(batches_seen, 1)
            # The KL mean was previously passed to format() without a
            # placeholder and therefore never printed.
            print('Epoch {} ELBO {:.4f} BOW LOSS {:.4f} RC LOSS {:.4f} KL LOSS {:.4f}'.format(e + 1,
                                              elbo_losses / denom,
                                              bow_losses / denom,
                                              rc_losses / denom,
                                              kl_losses / denom))
            print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

    def run_iter(self,epochs,isAddressee,steps_per_epoch,dataset,checkpoint,checkpoint_prefix):
        """Train for ``epochs`` epochs, logging every 100th batch and checkpointing every 2 epochs."""
        self._run_epochs(epochs, isAddressee, steps_per_epoch, dataset,
                         log_every=100, show_batch_size=False,
                         checkpoint=checkpoint, checkpoint_prefix=checkpoint_prefix)

    def run_iter_test(self, epochs, isAddressee, steps_per_epoch, dataset):
        """Verbose variant for quick runs: logs every batch and never checkpoints."""
        self._run_epochs(epochs, isAddressee, steps_per_epoch, dataset,
                         log_every=1, show_batch_size=True)

In [59]:
# Fresh, untrained instances of every sub-model plus the optimizer used for training.
word_embedding = Word_Embedding(vocab_size, embedding_dim)
encoder = Encoder(ENC_HIDDEN_SIZE, vocab_size, BATCH_SIZE)
# Decoder depth comes from NUM_LAYER rather than a repeated literal.
decoder = Decoder(DEC_HIDDEN_SIZE, vocab_size, speaker_dim, NUM_LAYER)
recognition_network = Recognition_Network(LATENT_SIZE)
prior_network = Prior_Network(LATENT_SIZE)
generation_network = Generation_Network(vocab_size, DEC_HIDDEN_SIZE)
optimizer = tf.keras.optimizers.Adam()

In [86]:
# Wire the sub-models, optimizer and tokenizer into the training helper.
train_nn = Train(word_embedding,encoder,decoder,recognition_network,prior_network,generation_network,optimizer,tokenizer)

In [87]:
# Single optimisation step on the example batch (global_iter=1, kl_full_step=2000, no addressee ids).
elbo,avg_bow_loss,avg_rc_loss,kl_loss = train_nn.train_step(example_input, example_target,1,2000,example_sid)

In [88]:
# Short verbose run: 3 epochs, addressee ids disabled, steps_per_epoch batches per epoch.
train_nn.run_iter_test(3,False,steps_per_epoch,dataset)

batch size:16
Epoch 1 Batch 0 ELBO 4.0693 BOW LOSS 77.5264 RC LOSS 4.0693 KL LOSS 4.4133
batch size:16
Epoch 1 Batch 1 ELBO 6.4102 BOW LOSS 44.3939 RC LOSS 3.8987 KL LOSS 7.5344
Epoch 1 ELBO 5.2397 BOW LOSS 60.9601 RC LOSS 3.9840
Time taken for 1 epoch 22.433491945266724 sec

batch size:16
Epoch 2 Batch 0 ELBO 9.9989 BOW LOSS 46.5391 RC LOSS 3.8640 KL LOSS 9.2024
batch size:16
Epoch 2 Batch 1 ELBO 11.6055 BOW LOSS 83.9159 RC LOSS 4.2512 KL LOSS 7.3543
Epoch 2 ELBO 10.8022 BOW LOSS 65.2275 RC LOSS 4.0576
Time taken for 1 epoch 22.074671983718872 sec

batch size:16
Epoch 3 Batch 0 ELBO 8.5056 BOW LOSS 73.6567 RC LOSS 4.1440 KL LOSS 4.3617
batch size:16
Epoch 3 Batch 1 ELBO 7.1713 BOW LOSS 95.5927 RC LOSS 4.5393 KL LOSS 2.6320
Epoch 3 ELBO 7.8384 BOW LOSS 84.6247 RC LOSS 4.3416
Time taken for 1 epoch 22.195114135742188 sec

