In [1]:
import tensorflow as tf

import unicodedata
import re
import numpy as np
import os
import io
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [2]:
path = 'data/'
friends = 'friends.csv'
bigbang = 'bigbang.csv'

friends_df = pd.read_csv(path+friends)
bigbang_df = pd.read_csv(path+bigbang)

# need to drop sentence which are NA (because they represents some action of the characters)
na_index = bigbang_df[bigbang_df['dialogue'].isna()].index
bigbang_df.drop(index = na_index,inplace=True )

df = pd.concat([friends_df, bigbang_df], ignore_index=True, sort=False)
df.reset_index(drop=True, inplace=True)

In [3]:
main_c = ['joey','rachel','chandler','monica','ross','phoebe','leonard',
          'sheldon','penny','howard','raj','amy','bernadette','other']
speakers_ind=dict()
for ind, c in enumerate(main_c,1):
    speakers_ind[c] = ind
    
df['speaker_id'] = df['speakers'].apply(lambda x: speakers_ind[x] - 1)
speakerid_list = list(df['speaker_id'])
speakers = list(df['speakers'])
dialogues = list(df['dialogue'])
episodes = list(df['episodes'])

In [4]:
data_size = len(df)
MAXLEN = 100
print(data_size)

112700


In [5]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')


def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())

    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

    w = w.rstrip().strip()

    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<SOS> ' + w + ' <EOS>'
    return w

In [6]:
print(preprocess_sentence(dialogues[0]))
print(preprocess_sentence(dialogues[0]).encode('utf-8'))

<SOS> i m tellin ya that girl totally winked at me . <EOS>
b'<SOS> i m tellin ya that girl totally winked at me . <EOS>'


In [7]:
def max_length(tensor):
    return max(len(t[0]) for t in tensor)

In [8]:
def tokenize(sentence,num_samples):
    sent_list = list()
    for k in range(0,num_samples):
        sent_list.append(preprocess_sentence(dialogues[k]))
    sent_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        filters='')
    sent_tokenizer.fit_on_texts(sent_list)

    tensor = sent_tokenizer.texts_to_sequences(sent_list)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                    maxlen=MAXLEN,
                                                     padding='post',
                                                    truncating='post',
                                                     value=0)

    return tensor, sent_tokenizer,sent_list

In [9]:
# just to show if tokenizer work well
# count = 0
# for key,val in tokenizer.word_index.items():
#     count += 1
#     print ("{0} ----> {1}".format(key, val))
#     if count==3:
#         break

In [9]:
tensor, tokenizer,sent_list = tokenize(dialogues,data_size)
# build up a dictionary index:word
index2word = {v: k for k, v in tokenizer.word_index.items()}

In [11]:
# just to show if reverse work well
# count = 0
# for key,val in index2word.items():
#     count += 1
#     print ("{0} ----> {1}".format(key, val))
#     if count==3:
#         break

In [10]:
def create_dataset(tensor,num_samples):
    dialogues_list = list()
    response_list = list()
    for k in range(0,num_samples):
        if(k+1 >= num_samples):
            break
        if episodes[k]==episodes[k+1]:
            dialogue = tensor[k]
#             pdb.set_trace()
            response = tensor[k+1]
            addressee = tf.convert_to_tensor(speakerid_list[k])
            speaker = tf.convert_to_tensor(speakerid_list[k+1])
            dialogues_list.append([dialogue,addressee])
            response_list.append([response,speaker])
#     print(dialogues_list)
#     print(response_list)
    return dialogues_list,response_list    

In [11]:
d_tensor,r_tensor = create_dataset(tensor,data_size)

In [12]:
# Calculate max_length of the target tensors
max_length_targ, max_length_inp = max_length(r_tensor), max_length(d_tensor)
print(max_length_targ)
print(max_length_inp)

100
100


In [14]:
len(d_tensor)

112244

In [13]:
# shuffle
tensor = shuffle(d_tensor,r_tensor)
d_tensor = tensor[0]
r_tensor = tensor[1]

# Creating training and validation sets using an 80-20 split
d_tensor_train, d_tensor_val, r_tensor_train, r_tensor_val = train_test_split(d_tensor, r_tensor, test_size=0.2)

# Show length
print(len(d_tensor_train), len(r_tensor_train), len(d_tensor_val), len(r_tensor_val))

89795 89795 22449 22449


In [15]:
dia_train = [t[0] for t in d_tensor_train]
dia_val = [t[0] for t in d_tensor_val]
aid_train = [t[1] for t in d_tensor_train]
aid_val = [t[1] for t in d_tensor_val]
res_train = [t[0] for t in r_tensor_train]
res_val = [t[0] for t in r_tensor_val]
sid_train = [t[1] for t in r_tensor_train]
sid_val = [t[1] for t in r_tensor_val]

In [90]:
BUFFER_SIZE = len(d_tensor_train)
# remember to change BATCH_SIZE, 16 just for test
BATCH_SIZE = 16
steps_per_epoch = len(d_tensor_train)//BATCH_SIZE  + 1
HIDDEN_SIZE = 1000
NUM_LAYER = 4
DROP_OUT = 0.2
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 512
speakerNum = len(main_c)

In [91]:
steps_per_epoch

5613

In [17]:
# create tf.dataset
dataset = tf.data.Dataset.from_tensor_slices((dia_train,res_train, sid_train,aid_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=False)

In [18]:
example_input_batch, example_target_batch,example_sid_batch, example_aid_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([16, 100]), TensorShape([16, 100]))

   ## Encoder and Decoder

In [85]:
class Encoder(tf.keras.Model):
    def __init__(self, hidden_size, vocab_size,embedding_dim, num_layers=1, batch_size=1):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.input_size = embedding_dim            
        self.lstm_1 = tf.keras.layers.LSTM(self.hidden_size,
                                       return_sequences=True,
                                       return_state=True,
                                       dropout=DROP_OUT,
                                       recurrent_initializer='glorot_uniform')
        self.lstms = []
        for k in range(self.num_layers - 1):
            self.lstms.append(tf.keras.layers.LSTM(self.hidden_size,
                                       return_sequences=True,
                                       return_state=True,
                                       dropout=DROP_OUT))
    def call(self, d, init_state):
        d = self.embedding(d)
#         print ('Encoder input shape: {}'.format(d.shape))
        output, hidden,c = self.lstm_1(d, initial_state = init_state)
        init_state = [hidden, c]
        # four layer train, 4 lstm
        for k in range(self.num_layers - 1):
            output, hidden,c = self.lstms[k](output, initial_state = init_state)
            init_state = [hidden, c]
        return output, hidden,c

    def initialize_hidden_state(self,batch_size=0):
        if batch_size == 0: batch_size =self.batch_size
        init_hidden = tf.zeros((batch_size, self.hidden_size))
        init_c = tf.zeros((batch_size, self.hidden_size))
        return [init_hidden,init_c]

In [86]:
# test encoder
encoder = Encoder(HIDDEN_SIZE,vocab_size, embedding_dim, NUM_LAYER, BATCH_SIZE)

# sample input
sample_init_state = encoder.initialize_hidden_state()
sample_output, sample_hidden,sample_c = encoder(example_input_batch, sample_init_state)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (16, 100, 1000)
Encoder Hidden state shape: (batch size, units) (16, 1000)


In [21]:
class Attention_Feed(tf.keras.Model):
    def __init__(self, hidden_size):
        super(Attention_Feed, self).__init__()
        self.W1 = tf.keras.layers.Dense(hidden_size)
        self.W2 = tf.keras.layers.Dense(hidden_size)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        hidden_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))
#         print("score.size():{0}".format(score.shape))
        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
#         print("values.size():{0}".format(values.shape))
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
#         print("context_vector.size():{0}".format(context_vector.shape))
        return context_vector, attention_weights

In [22]:
# test attention_feed
attention_layer = Attention_Feed(HIDDEN_SIZE)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (16, 1000)
Attention weights shape: (batch_size, sequence_length, 1) (16, 100, 1)


In [23]:
def combine_user_vector(i_em,j_em):
    # size is equal to the number of 
    size = i_em.shape[-1]
    W1 = tf.keras.layers.Dense(size)
    W2 = tf.keras.layers.Dense(size)
    V_ij = tf.nn.tanh(W1(i_em) + W2(j_em))
    return V_ij

In [24]:
class Decoder(tf.keras.Model):
    def __init__(self, hidden_size, vocab_size,embedding_dim, num_layers=1):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.speaker_embedding = tf.keras.layers.Embedding(speakerNum, embedding_dim)
        self.input_size = embedding_dim
        self.output_size = vocab_size #vocabulary size           
        self.lstm_1 = tf.keras.layers.LSTM(self.hidden_size,
                                       return_sequences=True,
                                       return_state=True,
                                       dropout = DROP_OUT,
                                       recurrent_initializer='glorot_uniform')
        self.lstms = []
        for k in range(self.num_layers - 1):
            self.lstms.append(tf.keras.layers.LSTM(self.hidden_size,
                                       return_sequences=True,
                                       return_state=True,
                                       dropout=DROP_OUT))
        self.fc = tf.keras.layers.Dense(self.output_size)
        
        # attention feed on context
        self.attention = Attention_Feed(self.hidden_size)

    def call(self, x, enc_output,init_state,speaker_id,addressee_id=None):
#         batch_size = x.size()[1]
        hidden = init_state[0]
        context_vector, attention_weights = self.attention(hidden, enc_output)
        features = self.embedding(x)
        # personas
        speaker = self.speaker_embedding(speaker_id)
#         print("Speaker shape: {}".format(speaker.shape))
#         print("finish speaker embedding")
        if addressee_id is not None:
#             print("detect addressee")
            addressee = self.speaker_embedding(addressee_id)
            v_ij = combine_user_vector(speaker,addressee)
            features = tf.concat([features,tf.expand_dims(v_ij,1)], axis=-1)
        else:
            features = tf.concat([features, tf.expand_dims(speaker, 1)], axis=-1)
#         max_length = enc_output.size(0)  
        r = tf.concat([tf.expand_dims(context_vector, 1), features], axis=-1)
#         print("finish concatenate")
        
        # passing the concatenated vector to the 4-layer LSTM
        output, hidden,c = self.lstm_1(r,initial_state = init_state)
        init_state = [hidden, c]
        for k in range(self.num_layers - 1):
            output, state,c = self.lstms[k](output,initial_state = init_state)
            init_state = [hidden, c]
        
#         print("finish 4-layer LSTM")
        # Removes dimensions of size 1 from the shape of a tensor.
        # output shape: (batch_size, 1, hidden_size) --> (batch_size *1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))
#         output = output.squeeze(0)

        # output shape == （batch_size, hidden_size)
        output = tf.nn.log_softmax(self.fc(output),axis=1)
#         print("finish all")
        return output, state,c,attention_weights

In [87]:
# test decoder
decoder1 = Decoder(HIDDEN_SIZE,vocab_size, embedding_dim,NUM_LAYER)
init_state = [sample_hidden,sample_c]
sp = tf.convert_to_tensor([1]*BATCH_SIZE)

In [88]:
# speaker model
sample_decoder_output1, _, _,_ = decoder1(tf.random.uniform((BATCH_SIZE, 1)),
                                 sample_output,init_state,sp)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output1.shape))

Decoder output shape: (batch_size, vocab size) (16, 26028)


In [33]:
# speaker addressee model
decoder2 = Decoder(HIDDEN_SIZE,vocab_size, embedding_dim,NUM_LAYER)
add = tf.convert_to_tensor([2]*BATCH_SIZE)
sample_decoder_output2, _, _,_ = decoder2(tf.random.uniform((BATCH_SIZE, 1)),
                                 sample_output,init_state,sp,add)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output2.shape))

Decoder output shape: (batch_size, vocab size) (16, 26028)


In [34]:
def count_real_word(real):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    mask = tf.cast(mask, dtype=tf.float32)
#     pdb.set_trace()
    word_per_line = tf.math.reduce_sum(mask,1)
    return word_per_line

In [92]:
import pdb
class Train(object):
    def __init__(self,encoder,decoder,optimizer,tokenizer,num_layers=1):
        self.encoder = encoder
        self.decoder = decoder
        self.tokenizer = tokenizer
        self.optimizer = optimizer
        self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction='none')
#         self.index2word = index2word
#         self.num_layers = num_layers
        
    def loss_function(self,real, pred):
        # loss of every word
        # true word mask
        mask = tf.math.logical_not(tf.math.equal(real, 0))
        loss_ = self.loss_object(real, pred)
        
        mask = tf.cast(mask, dtype=loss_.dtype)
        loss_ = loss_ * mask
        return loss_
    
#     @tf.function
    def train_step(self,inp, targ, enc_hidden,speaker_id,batch_size=BATCH_SIZE, addressee_id=None):
        loss = 0
        word_per_line = count_real_word(targ)
        with tf.GradientTape() as tape:
            enc_output,enc_hidden,enc_c = self.encoder(inp,enc_hidden)
            dec_init_state = [enc_hidden,enc_c]
            dec_input = tf.expand_dims([self.tokenizer.word_index['<sos>']]*batch_size,1)
            
            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                if addressee_id is not None:
#                     print("detect addressee")
                    predictions, dec_hidden,dec_c, _ = self.decoder(
                        dec_input,enc_output, dec_init_state,speaker_id,addressee_id)
                else:
                    predictions, dec_hidden, dec_c,_ = self.decoder(
                        dec_input,enc_output, dec_init_state,speaker_id)
                dec_init_state = [dec_hidden,dec_c]
#                 loss += self.loss_function(targ[:,t], predictions)
                loss_ = self.loss_function(targ[:, t], predictions)
                loss  += tf.math.reduce_sum(loss_/word_per_line)

                # using teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)
            
            batch_loss = (loss / int(targ.shape[1]))
#             print("batch_loss: {}".format(batch_loss))
            variables = self.encoder.trainable_variables + self.decoder.trainable_variables

            gradients = tape.gradient(loss, variables)
#             print("Get gradient:{}".format(type(gradients)))
            self.optimizer.apply_gradients(zip(gradients, variables))
#             print("finish all.")
            return batch_loss

    def run_iter(self,epochs,isAddressee,steps_per_epoch,checkpoint,checkpoint_prefix):
        for e in range(epochs):
            start = time.time()

            enc_hidden = self.encoder.initialize_hidden_state()
            total_loss = 0

            
            for (batch, (inp, targ,sid,aid)) in enumerate(dataset.take(steps_per_epoch)):
                # drop_reminder = False
                batch_sz =targ.shape[0]
                # just to test the last batch
#                 if batch_sz == BATCH_SIZE:
#                     continue

#                 print("batch_size: {}".format(batch_sz))
                enc_hidden = self.encoder.initialize_hidden_state(batch_sz)
#                 print("enc_hidden.shape: {}".format(enc_hidden[0].shape))
                if isAddressee==True:
                    batch_loss = self.train_step(inp, targ, enc_hidden,sid,batch_sz,aid)
                else:
                    batch_loss = self.train_step(inp, targ, enc_hidden,sid,batch_sz)
                total_loss += batch_loss

#                 if batch % 100 == 0:
#                     print('Epoch {} Batch {} Loss {:.4f}'.format(e + 1,
#                                                                  batch,
#                                                                  batch_loss.numpy()))
                #just for test
                print('Epoch {} Batch {} Loss {:.4f}'.format(e + 1,
                                                                 batch,
                                                                 batch_loss.numpy()))
                # just for test
                if batch==3: break
            
            # saving (checkpoint) the model every 2 epochs
            if (e + 1) % 2 == 0:
                checkpoint.save(file_prefix = checkpoint_prefix)

#             print('Epoch {} Loss {:.4f}'.format(e + 1,
#                                               total_loss / steps_per_epoch))
            # just for test
            print('Epoch {} Loss {:.4f}'.format(e + 1,
                                              total_loss / 3))
            print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))  

In [36]:
optimizer = tf.keras.optimizers.Adam()

In [37]:
checkpoint_dir = './persona_training_checkpoint'
checkpoint_prefix = os.path.join(checkpoint_dir, "sam_test2")

In [39]:
EPOCHS = 3

In [102]:
# test train_step
train_nn = Train(encoder,decoder1,optimizer,tokenizer)
enc_hidden = encoder.initialize_hidden_state()

In [None]:
# example_input_batch
# example_target_batch
# example_sid_batch

In [49]:
# checkpoint
checkpoint = tf.train.Checkpoint(optimizer=train_nn.optimizer,
                                 encoder=train_nn.encoder,
                                 decoder=train_nn.decoder)

In [50]:
# speaker model
loss = train_nn.train_step(example_input_batch,example_target_batch,enc_hidden,example_sid_batch)
print(loss)

tf.Tensor(1.5076135, shape=(), dtype=float32)


In [104]:
train_nn = Train(encoder,decoder2,optimizer,tokenizer)
enc_hidden = encoder.initialize_hidden_state()

In [105]:
# spekaer-addressee model
loss = train_nn.train_step(example_input_batch,example_target_batch,enc_hidden,
                           example_sid_batch,example_aid_batch)
print(loss)

tf.Tensor(1.0233467, shape=(), dtype=float32)


In [89]:
# speaker model
train_nn = Train(encoder,decoder1,optimizer,tokenizer)
train_nn.run_iter(EPOCHS,False,steps_per_epoch,checkpoint,checkpoint_prefix)

batch_size: 3
enc_hidden.shape: (3, 1000)
Epoch 1 Batch 5612 Loss 0.2745
Epoch 1 Loss 0.0915
Time taken for 1 epoch 43.728718996047974 sec

batch_size: 3
enc_hidden.shape: (3, 1000)
Epoch 2 Batch 5612 Loss 0.2510
Epoch 2 Loss 0.0837
Time taken for 1 epoch 47.54292893409729 sec

batch_size: 3
enc_hidden.shape: (3, 1000)
Epoch 3 Batch 5612 Loss 0.2600
Epoch 3 Loss 0.0867
Time taken for 1 epoch 44.630903005599976 sec



In [93]:
# speaker-addressee model
train_nn = Train(encoder,decoder2,optimizer,tokenizer)
train_nn.run_iter(EPOCHS,True,steps_per_epoch,checkpoint,checkpoint_prefix)

batch_size: 16
enc_hidden.shape: (16, 1000)
Epoch 1 Batch 0 Loss 1.4706
batch_size: 16
enc_hidden.shape: (16, 1000)
Epoch 1 Batch 1 Loss 1.4594
batch_size: 16
enc_hidden.shape: (16, 1000)
Epoch 1 Batch 2 Loss 1.3419
batch_size: 16
enc_hidden.shape: (16, 1000)
Epoch 1 Batch 3 Loss 1.2221
Epoch 1 Loss 1.8313
Time taken for 1 epoch 205.0519299507141 sec

batch_size: 16
enc_hidden.shape: (16, 1000)
Epoch 2 Batch 0 Loss 1.0640
batch_size: 16
enc_hidden.shape: (16, 1000)
Epoch 2 Batch 1 Loss 1.0076
batch_size: 16
enc_hidden.shape: (16, 1000)
Epoch 2 Batch 2 Loss 0.8958
batch_size: 16
enc_hidden.shape: (16, 1000)
Epoch 2 Batch 3 Loss 0.9274
Epoch 2 Loss 1.2983
Time taken for 1 epoch 204.30272889137268 sec

batch_size: 16
enc_hidden.shape: (16, 1000)
Epoch 3 Batch 0 Loss 0.8793
batch_size: 16
enc_hidden.shape: (16, 1000)
Epoch 3 Batch 1 Loss 0.9056
batch_size: 16
enc_hidden.shape: (16, 1000)
Epoch 3 Batch 2 Loss 0.9431
batch_size: 16
enc_hidden.shape: (16, 1000)
Epoch 3 Batch 3 Loss 0.8042
Epo

In [100]:
def validation(train_nn,inp,targ,speaker_id,addressee_id=None,batch_size=BATCH_SIZE):
    loss = 0
    inp = np.asarray(inp)
    targ = np.asarray(targ)
    speaker_id = np.asarray(speaker_id)
    if addressee_id is not None:
        addressee_id = np.asarray(addressee_id)
#     print("test targ[:,1].shape: {}".format(targ[:,1].shape))
    val_size = targ.shape[0]
    print("val_size: {}".format(val_size))
    num_each_batch = (int)(np.floor(val_size/batch_size))
    remaining_num = val_size - num_each_batch * batch_size
    if remaining_num == 0:
        remaining_num = batch_size
    for k in range(0,val_size,batch_size):
        start = time.time()
        batch_loss = 0
        
        if (k+batch_size)>=val_size:
            print("k now +batch_size>=val_size: {}".format(k))
            inputs = inp[k:]
            targs = targ[k:]
            s_id = speaker_id[k:]
#             print("s_id.shape: {}".format(s_id.shape))
            if addressee_id is not None:
                a_id = addressee_id[k:]
            enc_hidden = [tf.zeros((remaining_num, HIDDEN_SIZE)),tf.zeros((remaining_num, HIDDEN_SIZE))]
            
            enc_out, enc_hidden,enc_c = train_nn.encoder(inputs, enc_hidden)
            dec_init_state = [enc_hidden,enc_c]
            dec_input = tf.expand_dims([train_nn.tokenizer.word_index['<sos>']]*remaining_num, 1)
        else:
            print("k now: {}".format(k))
            inputs = inp[k:k+batch_size]
            targs = targ[k:k+batch_size]
            s_id = speaker_id[k:k+batch_size]
            print("s_id.shape: {}".format(s_id.shape))
            if addressee_id is not None:
                a_id = addressee_id[k:k+batch_size]
            
            enc_hidden = [tf.zeros((batch_size, HIDDEN_SIZE)),tf.zeros((batch_size, HIDDEN_SIZE))]
            enc_out, enc_hidden,enc_c = train_nn.encoder(inputs, enc_hidden)
            dec_init_state = [enc_hidden,enc_c]
            dec_input = tf.expand_dims([train_nn.tokenizer.word_index['<sos>']]*batch_size, 1)
            
        word_per_line = count_real_word(targs)
        
        for t in range(targ.shape[1]):
            if addressee_id is not None:
                predictions, dec_hidden, dec_c,_ = train_nn.decoder(dec_input,enc_out, dec_init_state,s_id,a_id)
            else:
                predictions, dec_hidden, dec_c,_ = train_nn.decoder(dec_input,enc_out, dec_init_state,s_id)
            
            # use the max prob one in each sentence in the batch
            predicted_id = tf.argmax(predictions,axis=1)
            dec_init_state = [dec_hidden,dec_c]
#             batch_loss += train_nn.loss_function(targs[:,t], predictions)
            loss_ = train_nn.loss_function(targs[:, t], predictions)
            batch_loss  += tf.math.reduce_sum(loss_/word_per_line)
#             print("test predictions[0]: {}".format(predictions[0]))
#             predictions = np.asarray(predictions)
#             predicted = tf.constant(predicted_id)
            dec_input = tf.expand_dims(predicted_id, 1)
        
        if (k+batch_size)>=val_size:
            loss += batch_loss/batch_size
        else:
            loss += batch_loss/remaining_num
        
        print('batch {} Loss {:.4f}'.format(k/batch_size + 1,
                                              loss))
        print('Time taken {} sec\n'.format(time.time() - start))  
    return loss

In [95]:
batch_sz = 16
size = 2*batch_sz
sample_input_val = dia_val[:size]
sample_targ_val = res_val[:size]
sample_sid_val = sid_val[:size]
sample_aid_val = aid_val[:size]

In [101]:
# speaker model
validation(train_nn,sample_input_val,sample_targ_val,sample_sid_val,batch_size = batch_sz)

val_size: 32
k now: 0
s_id.shape: (16,)
batch 1.0 Loss 7.2168
Time taken 9.33592700958252 sec

k now +batch_size>=val_size: 16
batch 2.0 Loss 14.7253
Time taken 9.266100883483887 sec



<tf.Tensor: id=2420473, shape=(), dtype=float32, numpy=14.725277>

In [40]:
# speaker-addressee model
validation(train_nn,sample_input_val,sample_targ_val,sample_sid_val,sample_aid_val,batch_size = batch_sz)

val_size: 32
k now: 0
s_id.shape: (16,)
batch 1.0 Loss 7.5812
Time taken 37.32085299491882 sec

k now +batch_size>=val_size: 16
s_id.shape: (16,)
batch 2.0 Loss 14.3822
Time taken 38.703108072280884 sec



<tf.Tensor: id=1738526, shape=(), dtype=float32, numpy=14.382153>