In [77]:
# import libraries
import numpy as np
import re
import time

import tensorflow as tf
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [78]:
# import the datasets
lines = open('cornell movie-dialogs corpus/movie_lines.txt', encoding='utf-8', 
             errors = "ignore").read().split("\n")
conversations = open('cornell movie-dialogs corpus/movie_conversations.txt', 
                     encoding='utf-8', errors = "ignore").read().split("\n")

In [79]:
# Creating a dictionary that maps each line and its id
id2line = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        id2line[_line[0]] = _line[4]

In [80]:
# Creating a list of all of the conversations
conversations_ids = []
for conversation in conversations[:-1]:
    _conversation = conversation.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "")
    conversations_ids.append(_conversation.split(','))

In [81]:
# getting separately the questions and the answers
rawQuestions = []
rawAnswers = []

for conversation in conversations_ids:
    for i in range(len(conversation) - 1):
        rawQuestions.append(id2line[conversation[i]])
        rawAnswers.append(id2line[conversation[i+1]])

In [82]:
# clean the texts
def clean_text(text):
    
    contractions = {
    "ain't": "am not / are not",
    "aren't": "are not / am not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had / he would",
    "he'd've": "he would have",
    "he'll": "he shall / he will",
    "he'll've": "he shall have / he will have",
    "he's": "he has / he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how has / how is",
    "i'd": "I had / I would",
    "i'd've": "I would have",
    "i'll": "I shall / I will",
    "i'll've": "I shall have / I will have",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it'd": "it had / it would",
    "it'd've": "it would have",
    "it'll": "it shall / it will",
    "it'll've": "it shall have / it will have",
    "it's": "it has / it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had / she would",
    "she'd've": "she would have",
    "she'll": "she shall / she will",
    "she'll've": "she shall have / she will have",
    "she's": "she has / she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as / so is",
    "that'd": "that would / that had",
    "that'd've": "that would have",
    "that's": "that has / that is",
    "there'd": "there had / there would",
    "there'd've": "there would have",
    "there's": "there has / there is",
    "they'd": "they had / they would",
    "they'd've": "they would have",
    "they'll": "they shall / they will",
    "they'll've": "they shall have / they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had / we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what shall / what will",
    "what'll've": "what shall have / what will have",
    "what're": "what are",
    "what's": "what has / what is",
    "what've": "what have",
    "when's": "when has / when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where has / where is",
    "where've": "where have",
    "who'll": "who shall / who will",
    "who'll've": "who shall have / who will have",
    "who's": "who has / who is",
    "who've": "who have",
    "why's": "why has / why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had / you would",
    "you'd've": "you would have",
    "you'll": "you shall / you will",
    "you'll've": "you shall have / you will have",
    "you're": "you are",
    "you've": "you have"
    }
    
    text = text.lower()
    for word in text.split():
        if word in contractions:
            text = text.replace(word, contractions[word])
    
    return text

In [83]:
questions = []
answers = []

for question in rawQuestions:
    questions.append(clean_text(question))
    
for answer in rawAnswers:
    answers.append(clean_text(answer))

In [84]:
# Filtering out the questions and answers that are too short or too long
short_questions = []
short_answers = []
i = 0
for question in questions:
    if 2 <= len(question.split()) <= 25:
        short_questions.append(question)
        short_answers.append(answers[i])
    i += 1
questions = []
answers = []
i = 0
for answer in short_answers:
    if 2 <= len(answer.split()) <= 25:
        answers.append(answer)
        questions.append(short_questions[i])
    i += 1

In [85]:
def preprocess_sentence(w):

  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  w = re.sub(r"([?.!,¿])", r" \1 ", w)
  w = re.sub(r'[" "]+', " ", w)

  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

  w = w.strip()

  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  w = '<start> ' + w + ' <end>'
  return w

In [86]:
def tokenize(questions, answers):

  '''
  fit a tokenizer on the text and pad the sequences
  '''

  tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n', 
                                                    lower=False, split=" ", char_level=False, oov_token=None, 
                                                    document_count=0)
  text = questions.copy()
  text.extend(answers)
  tokenizer.fit_on_texts(text)

  Qtensor = tokenizer.texts_to_sequences(questions)
  Qtensor = tf.keras.preprocessing.sequence.pad_sequences(Qtensor, padding='post')

  Atensor = tokenizer.texts_to_sequences(answers)
  Atensor = tf.keras.preprocessing.sequence.pad_sequences(Atensor, padding='post')

  return Qtensor, Atensor, tokenizer

In [87]:
# sort the questions and answers by the length of the questions (speeds up the training by reducing padding)
sorted_questions = []
sorted_answers = []

for questLen in range(1, 25 + 1):
    for idx, quest in enumerate(questions):
        if len(quest) == questLen:
            sorted_questions.append(preprocess_sentence(questions[idx]))
            sorted_answers.append(preprocess_sentence(answers[idx]))

In [88]:
num_examples = 1000

input_tensor, target_tensor, tokenizer = tokenize(sorted_questions[:num_examples], sorted_answers[:num_examples])
max_input_length, max_target_length = input_tensor.shape[1], input_tensor.shape[1]

In [89]:
print(input_tensor.shape)
print(target_tensor.shape)

(1000, 4)
(1000, 32)


In [90]:
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

In [91]:
from tensorflow.keras.layers import Input, LSTMCell, Dense, Flatten, Dropout, StackedRNNCells, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD, Adam

In [92]:
# setting the hyperparameters
batch_size = 64
lstm_units = 512
num_layers = 3
encoder_embedding_size = 512
decoder_embedding_size = 512
steps_per_epoch = len(input_tensor_train)//batch_size

vocab_size = len(tokenizer.word_index)+1

# dropout rate of 50% for hidden units
keep_probability = 0.5

In [93]:
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(len(input_tensor_train))
dataset = dataset.batch(batch_size, drop_remainder=True)

dataset_val = tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val)).shuffle(len(input_tensor_val))
dataset_val = dataset_val.batch(batch_size, drop_remainder=True)

In [94]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 4]), TensorShape([64, 32]))

In [95]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, encoder_units, batch_size, keep_probability):
    super(Encoder, self).__init__()
    self.batch_size = batch_size
    self.encoder_units = encoder_units
    self.keep_prob = keep_probability
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.lstm = tf.keras.layers.LSTM(self.encoder_units,
                                     return_sequences=True,
                                    return_state=True, dropout=0.5,
                                     recurrent_initializer='glorot_uniform')
    self.gru = tf.keras.layers.GRU(self.encoder_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.lstmb = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(self.encoder_units,
                                                                        return_sequences=True,
                                                                        return_state=True,
                                                                        dropout=0.5, recurrent_initializer='glorot_uniform'))

  def call(self, x, hidden):
    # pass through embedding layer
    x = self.embedding(x)

    #output, forward_h, forward_c, backward_h, backward_c = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(encoder_embedding_size, dropout=0.5, return_state = True), merge_mode = 'concat')(x,  initial_state = hidden)
    #encoder_states = [forward_h, forward_c, backward_h, backward_c]

    output, forward_h, forward_c, backward_h, backward_c = self.lstmb(x,  initial_state = hidden)
    encoder_states = tf.keras.layers.Concatenate()([forward_h, backward_h])

    return output, encoder_states

  '''
  def initialize_hidden_state(self):
    return tf.zeros((self.batch_size, self.encoder_units))
  '''

  def initialize_hidden_state(self):
    init_state = [tf.zeros((self.batch_size, self.encoder_units)) for i in range(4)]
    return init_state

In [96]:
encoder = Encoder(vocab_size, encoder_embedding_size, lstm_units, batch_size, keep_probability)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 4, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [97]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    # query hidden state shape == (batch_size, hidden size)
    # query_with_time_axis shape == (batch_size, 1, hidden size)
    # values shape == (batch_size, max_len, hidden size)
    # we are doing this to broadcast addition along the time axis to calculate the score
    query_with_time_axis = tf.expand_dims(query, 1)

    # score shape == (batch_size, max_length, 1)
    # we get 1 at the last axis because we are applying score to self.V
    # the shape of the tensor before applying self.V is (batch_size, max_length, units)
    score = self.V(tf.nn.tanh(
        self.W1(query_with_time_axis) + self.W2(values)))

    # attention_weights shape == (batch_size, max_length, 1)
    attention_weights = tf.nn.softmax(score, axis=1)

    # context_vector shape after sum == (batch_size, hidden_size)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [98]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 4, 1)


In [99]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, decoder_units, batch_size):
    super(Decoder, self).__init__()
    self.batch_size = batch_size
    self.decoder_units = decoder_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.decoder_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.lstm = tf.keras.layers.LSTM(self.decoder_units,
                                     return_sequences=True,
                                    return_state=True, dropout=0.5,
                                     recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    self.attention = BahdanauAttention(self.decoder_units)

  def call(self, x, hidden, encoder_output):
    # enc_output shape == (batch_size, max_length, hidden_size)
    context_vector, attention_weights = self.attention(hidden, encoder_output)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # passing the concatenated vector to the LSTM
    output, state_h, state_c = self.lstm(x)
    state = tf.keras.layers.Concatenate()([state_h, state_c])

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state, attention_weights

In [100]:
decoder = Decoder(vocab_size, decoder_embedding_size, lstm_units, batch_size)

sample_decoder_output, _, _ = decoder(tf.random.uniform((batch_size, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 1848)


In [101]:
#import tensorflow_addons as tfa

input_shape = input_tensor.shape

In [102]:
initial_learning_rate = 0.1

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=100,
    decay_rate=0.96,
    staircase=True)

optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred, input_shape, sequence_length=25):

  #https://www.tensorflow.org/addons/api_docs/python/tfa/seq2seq/sequence_loss
  #return tfa.seq2seq.sequence_loss(pred, real, tf.ones([input_shape[0], sequence_length]))

  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

#gradients = optimizer.compute_gradients(loss_error)
#clipped_gradients = [(tf.clip_by_value(grad_tensor, -5., 5.), grad_variable) for grad_tensor, grad_variable in gradients if grad_tensor is not None]
#optimizer_gradient_clipping = optimizer.apply_gradients(clipped_gradients)

In [103]:
import os

checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [104]:
@tf.function
def train_step(inputs, targets, encoder_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    encoder_output, encoder_hidden = encoder(inputs, encoder_hidden)

    decoder_hidden = encoder_hidden

    decoder_input = tf.expand_dims([tokenizer.word_index['<start>']] * batch_size, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targets.shape[1]):
      # passing enc_output to the decoder
      predictions, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_output)

      #print(targets[:, t].shape, predictions.shape, input_shape)
      loss += loss_function(targets[:, t], predictions, input_shape)

      # using teacher forcing
      decoder_input = tf.expand_dims(targets[:, t], 1)

  batch_loss = (loss / int(targets.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)
  #gradients = optimizer.get_gradients(loss, variables)
  #clipped_gradients = [(tf.clip_by_value(grad_tensor, -5., 5.), grad_variable) for grad_tensor, grad_variable in gradients if grad_tensor is not None]
  #optimizer_gradient_clipping = optimizer.apply_gradients(clipped_gradients)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [105]:
def evaluate_validation(inputs, targets, encoder_hidden):
  
  loss = 0
  encoder_output, encoder_hidden = encoder(inputs, encoder_hidden)
  decoder_hidden = encoder_hidden
  
  decoder_input = tf.expand_dims([tokenizer.word_index['<start>']] * batch_size, 1)
  
  # Teacher forcing - feeding the target as the next input
  for t in range(1, targets.shape[1]):
    # passing enc_output to the decoder
    predictions, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_output)
    loss += loss_function(targets[:, t], predictions, input_shape)
    
    # using teacher forcing
    decoder_input = tf.expand_dims(targets[:, t], 1)

  batch_loss = (loss / int(targets.shape[1]))

  return batch_loss

In [106]:
EPOCHS = 0
batch_training_loss_check = 10
batch_validation_loss_check = steps_per_epoch // 2 - 1

# elements for early stopping
list_validation_loss_error = []
early_stopping_check = 0
early_stopping_stop = 100

for epoch in range(EPOCHS):

  enc_hidden = encoder.initialize_hidden_state()
  total_train_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    start = time.time()
    batch_train_loss = train_step(inp, targ, enc_hidden)
    total_train_loss += batch_train_loss
    end = time.time()
    batch_time = end - start

    if batch % batch_training_loss_check == 0:
      print('Epoch {}, Batch {}, Training Loss {:.3f}, Training Time on {} batches: {:.2f} s'.format(epoch + 1,
                                                   batch, batch_train_loss.numpy(), batch_training_loss_check, 
                                                   batch_time*batch_training_loss_check))
      
    if batch % batch_validation_loss_check == 0 and batch > 0:
      total_valid_loss = 0
      start_val_time = time.time()

      for (batch_val, (inp_val, targ_val)) in enumerate(dataset_val.take(len(input_tensor_val)//batch_size)):
        total_valid_loss += evaluate_validation(inp_val, targ_val, enc_hidden)

      ending_val_time = time.time()
      val_time = ending_val_time - start_val_time

      average_validation_loss = total_valid_loss / (len(input_tensor_val) / batch_size)
      print("Avg Validation Loss Error: {:>6.3f}, Validation Time: {:.2f} s".format(average_validation_loss, val_time))

      # early stopping
      list_validation_loss_error.append(average_validation_loss_error)
      
      if average_validation_loss_error <= min(list_validation_loss_error):
        early_stopping_check = 0
        checkpoint.save(file_prefix = checkpoint_prefix)
      else:
        early_stopping_check += 1
        if early_stopping_check == early_stopping_stop:
          break

  print('Epoch {}, Training Loss {:.3f}, Avg Validation Loss {:.3f}\n'.format(epoch + 1,
                                      total_loss / steps_per_epoch, average_validation_loss))
  
  if early_stopping_check == early_stopping_stop:
    break

In [107]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint('training_checkpoints/'))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x220e71f40a0>

In [108]:
import pickle

# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# loading
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [109]:
tokenizer

<keras_preprocessing.text.Tokenizer at 0x220e7071eb0>

In [110]:
def evaluate(sentence):
  attention_plot = np.zeros((max_target_length, max_input_length))

  sentence = preprocess_sentence(sentence)

  inputs = [tokenizer.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_input_length,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  hidden = [tf.zeros((1, lstm_units)) for i in range(4)] #[tf.zeros((1, lstm_units))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)

  for t in range(max_target_length):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)

    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()

    predicted_id = tf.argmax(predictions[0]).numpy()

    result += tokenizer.index_word[predicted_id] + ' '

    if tokenizer.index_word[predicted_id] == '<end>':
      return result, sentence, attention_plot

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence, attention_plot

In [111]:
# function for plotting the attention weights
def plot_attention(attention, sentence, predicted_sentence):
  fig = plt.figure(figsize=(10,10))
  ax = fig.add_subplot(1, 1, 1)
  ax.matshow(attention, cmap='viridis')

  fontdict = {'fontsize': 14}

  ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
  ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

  ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
  ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

  plt.show()

In [112]:
def answer(sentence):
  result, sentence, attention_plot = evaluate(sentence)

  print('Input: %s' % (sentence))
  print('Answer: {}'.format(result))

  attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
  plot_attention(attention_plot, sentence.split(' '), result.split(' '))