In [1]:
##---------------------------------------------------------------
## Summary : Implement encoder-decoder using greedy decoding
## Author  : Srinivas Venkata Vemparala
##----------------------------------------------------------------

# source https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
import numpy as np
import pandas as pd
from __future__ import absolute_import, division, print_function

# Import TensorFlow >= 1.10 and enable eager execution
import tensorflow as tf

# Switch TensorFlow to eager mode: later cells iterate over the tf.data
# dataset directly and call .numpy() on tensors, which rely on it.
tf.enable_eager_execution()

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import defaultdict

import unicodedata
import re
import numpy as np
import os
import time

# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)

import warnings
from collections import defaultdict
# NOTE(review): no category or module is given, so this silences every
# warning for the rest of the session, including deprecation notices.
warnings.filterwarnings('ignore')

  from ._conv import register_converters as _register_converters


1.10.0


In [2]:
# Strips accents from a unicode string (NFD-decompose, then drop combining marks)
def unicode_to_ascii(s):
    """Return `s` with all combining marks removed.

    The string is decomposed to Unicode NFD form, which separates base
    letters from their accents, and every character whose Unicode category
    is 'Mn' is then dropped. Characters that are not combining marks are
    kept as they are, so the result is not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)


def preprocess_sentence(w):
    """Normalise one raw sentence and wrap it in '<start>' / '<end>' markers.

    Steps, in order:
      1. lower-case, trim and strip accents (via unicode_to_ascii);
      2. put a space on both sides of each of ? . ! , ¿ so that punctuation
         becomes its own token, e.g. "he is a boy." => "he is a boy ."
         (see https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation);
      3. collapse runs of spaces / double quotes into a single space;
      4. replace every run of characters other than letters and ? . ! , ¿
         (digits, apostrophes, emoji, ...) with a single space;
      5. trim and add the start / end tokens so the model knows when to
         start and stop predicting.

    Returns the cleaned sentence as one space-separated string.
    """
    text = unicode_to_ascii(w.lower().strip())

    text = re.sub(r"([?.!,¿])", r" \1 ", text)
    text = re.sub(r'[" "]+', " ", text)
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)

    return '<start> ' + text.strip() + ' <end>'

In [3]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [context, utterance]
def ExtractData(path, num_examples=None):
    """Read a tab-separated dialogue file and return tokenised sentence pairs.

    Args:
        path: UTF-8 text file with one example per line; the fields of a line
            (context and utterance) are separated by a tab.
        num_examples: when not None, only the first `num_examples` lines are
            used; None keeps every line.

    Returns:
        A list with one entry per line. Each entry is a list holding, for
        every tab-separated field of that line, the token list produced by
        `preprocess_sentence(field).split(' ')`.
    """
    # The context manager closes the file handle even if reading fails;
    # the previous bare open(...).read() left it to the garbage collector.
    with open(path, encoding='UTF-8') as handle:
        lines = handle.read().strip().lower().split('\n')

    # Slicing with None keeps the whole list, so one expression covers both
    # the limited and the unlimited case.
    return [[preprocess_sentence(w).split(' ') for w in l.split('\t')]
            for l in lines[:num_examples]]

In [4]:
# class to create the language index. A language index is a mapping from words to integers
# and integers to words for all different words
class LanguageIndex:
    """Two-way mapping between words and integer ids.

    While the vocabulary is being built, `w2i` hands out the next free id to
    every word it has not seen before; '<pad>' and '<unk>' are registered
    first and therefore get ids 0 and 1. After `createReverseIndex()` the
    vocabulary is frozen: unknown words map to `UNK` and are NOT added.

    Attributes:
        w2i: word -> id mapping.
        i2w: id -> word mapping (available after createReverseIndex()).
        PAD, UNK: ids of the '<pad>' and '<unk>' tokens.
        vocabSize: number of distinct words (after createReverseIndex()).
    """

    class _FrozenVocab(dict):
        """dict that answers unknown keys with a fixed id without storing them."""

        def __init__(self, mapping, unk_id):
            super().__init__(mapping)
            self._unk_id = unk_id

        def __missing__(self, key):
            # Unlike defaultdict, do not insert `key`: a frozen vocabulary
            # must keep its size no matter how many unknown words are queried.
            return self._unk_id

    def __init__(self):
        # every unseen word receives the current size of the mapping as its id
        self.w2i = defaultdict(lambda: len(self.w2i))
        self.PAD = self.w2i['<pad>']
        self.UNK = self.w2i['<unk>']

    def createIndexMapping(self, listOfSentences):
        """Convert a list of token lists into a list of id lists.

        Before the vocabulary is frozen, new words are assigned fresh ids;
        afterwards they are mapped to `UNK`.
        """
        return [[self.w2i[word] for word in sent] for sent in listOfSentences]

    def createReverseIndex(self):
        """Freeze the vocabulary and build the id -> word lookup.

        Safe to call more than once: because unknown-word lookups no longer
        leak into `w2i`, `i2w` and `vocabSize` stay the same on later calls.
        """
        # freeze the mapping so that any new word is treated as UNK
        self.w2i = self._FrozenVocab(self.w2i, self.UNK)

        self.i2w = {v: k for k, v in self.w2i.items()}

        # compute the vocabulary size
        self.vocabSize = len(self.w2i)
        print('VocabularySize : ', self.vocabSize)

In [5]:
# now lets write a method to create dataset given path
def createDataSet(fileName,num_examples = 30000):
    """Load a dialogue file and turn it into padded integer matrices.

    Args:
        fileName: path handed to ExtractData.
        num_examples: number of leading lines to use (forwarded to ExtractData).

    Returns:
        A 5-tuple (inputPadded, outputPadded, languageIndex,
        MAX_LENGTH_INPUT, MAX_LENGTH_OUTPUT):
          * inputPadded  - contexts as ids, padded after the sentence and then
            reversed along the time axis, e.g. ['how','are','you'] at
            length 5 becomes ['<pad>','<pad>','you','are','how'];
          * outputPadded - utterances as ids, padded after the sentence and
            not reversed, e.g. ['i','am','fine','<pad>','<pad>'];
          * languageIndex - the frozen vocabulary shared by both sides;
          * the two lengths are those of the longest context / utterance.
    """
    pairs = ExtractData(fileName, num_examples)

    # split every [context, utterance] pair into its two sides
    contexts, replies = [], []
    for context, utterance in pairs:
        contexts.append(context)
        replies.append(utterance)

    # one vocabulary for both sides; contexts are indexed first, then replies
    vocab = LanguageIndex()
    contextIds = vocab.createIndexMapping(contexts)
    replyIds = vocab.createIndexMapping(replies)

    # build the reverse lookup and freeze: later unseen words become UNK
    vocab.createReverseIndex()

    longestContext = max(map(len, contexts))
    longestReply = max(map(len, replies))

    # padding gives every row the same length so rows can be batched
    pad = tf.keras.preprocessing.sequence.pad_sequences
    contextMatrix = pad(contextIds, maxlen=longestContext, padding='post')
    contextMatrix = np.flip(contextMatrix, 1)
    replyMatrix = pad(replyIds, maxlen=longestReply, padding='post')

    return contextMatrix, replyMatrix, vocab, longestContext, longestReply
    

# Build the padded id matrices, the shared vocabulary and the two maximum
# sequence lengths from the dialogue file (createDataSet reads the first
# 30000 lines by default).
inputPadded,outputPadded,languageIndex,MAX_LENGTH_INPUT,MAX_LENGTH_OUTPUT = createDataSet('./Twitter data/Dialogue_Question_Answer.txt')
    

VocabularySize :  68239


In [6]:
# Hold out 20% of the pairs for testing (80-20 split). The fixed seed pins the
# shuffle, so re-running the notebook reproduces the same partition instead of
# drawing a different one on every execution.
SPLIT_SEED = 42
inputPaddedTrain, inputPaddedTest, outputPaddedTrain, outputPaddedTest = train_test_split(
    inputPadded, outputPadded, test_size=0.2, random_state=SPLIT_SEED)

print('length of training Data : ',len(inputPaddedTrain))
print('length of testing Data : ',len(inputPaddedTest))

length of training Data :  24000
length of testing Data :  6000


In [7]:
# Model and batching hyper-parameters
BATCH_SIZE = 64
embedding_dim = 128
units = 512
vocabSize = languageIndex.vocabSize

# The shuffle buffer covers the whole training set; NUM_BATCHES counts only
# the full batches of an epoch.
BUFFER_SIZE = len(inputPaddedTrain)
NUM_BATCHES = BUFFER_SIZE//BATCH_SIZE

# tf.data pipeline: slice into (input, target) rows, shuffle, then batch.
# drop_remainder=True discards the final short batch, if there is one.
dataset = (
    tf.data.Dataset
    .from_tensor_slices((inputPaddedTrain, outputPaddedTrain))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [8]:
# Now lets define gru which we will be using for encoder and decoder
def gru(numUnits):
    """Build the recurrent layer shared by the encoder and the decoder.

    Returns a CuDNNGRU layer when TensorFlow reports an available GPU and a
    regular GRU layer (with sigmoid recurrent activation) otherwise. Either
    layer has `numUnits` units and returns both the full output sequence and
    the final state.

    The recurrent weights use the 'glorot_uniform' initializer, which samples
    from [-limit, limit] with limit = sqrt(6 / (fan_in + fan_out)).
    """
    shared = dict(return_sequences=True,
                  return_state=True,
                  recurrent_initializer='glorot_uniform')

    if not tf.test.is_gpu_available():
        return tf.keras.layers.GRU(numUnits,
                                   recurrent_activation='sigmoid',
                                   **shared)
    return tf.keras.layers.CuDNNGRU(numUnits, **shared)

In [9]:
# lets define an encoder class this extends the tf.Keras.Model
class Encoder(tf.keras.Model):
    """Embeds a batch of token ids and runs it through a GRU.

    Args:
        vocabSize: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        enc_units: number of GRU units.
        batchSize: batch size used when creating the zero initial state.
    """

    def __init__(self,vocabSize,embedding_dim,enc_units,batchSize):
        super().__init__()
        self.batch_sz = batchSize
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocabSize, embedding_dim)
        self.gru = gru(self.enc_units)

    def call(self,x,hidden):
        """Encode token ids `x` starting from state `hidden`.

        Returns the GRU output sequence and the final GRU state.
        """
        embedded = self.embedding(x)
        sequence, final_state = self.gru(embedded, initial_state = hidden)
        return sequence, final_state

    def initialize_hidden_state(self):
        """Return an all-zero state of shape (batch_sz, enc_units)."""
        state_shape = (self.batch_sz, self.enc_units)
        return tf.zeros(state_shape)
        
    

In [10]:
# Now lets write the Decoder class. This class also extends the tf.keras.Model
class Decoder(tf.keras.Model):
    """Embeds token ids, runs a GRU and projects onto the vocabulary.

    Args:
        vocabSize: number of rows in the embedding table and size of the
            output projection.
        embedding_dim: size of each embedding vector.
        dec_units: number of GRU units.
        batchSize: batch size used when creating the zero initial state.
    """

    def __init__(self, vocabSize, embedding_dim, dec_units, batchSize):
        super().__init__()
        self.batch_sz = batchSize
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocabSize, embedding_dim)
        self.gru = gru(self.dec_units)
        self.fc = tf.keras.layers.Dense(vocabSize)

    def call(self,x,hidden):
        """Decode token ids `x` starting from state `hidden`.

        Returns the vocabulary logits, one row per (example, time step), and
        the final GRU state.
        """
        embedded = self.embedding(x)
        sequence, final_state = self.gru(embedded, initial_state = hidden)

        # merge the batch and time axes: (batch, time, units) -> (batch * time, units)
        flattened = tf.reshape(sequence, (-1, sequence.shape[2]))

        logits = self.fc(flattened)
        return logits, final_state

    def initialize_hidden_state(self):
        """Return an all-zero state of shape (batch_sz, dec_units)."""
        state_shape = (self.batch_sz, self.dec_units)
        return tf.zeros(state_shape)

In [11]:
# Create the encoder and the decoder. Both are built with the same vocabulary
# size, embedding width, number of GRU units and batch size; each owns its
# own embedding table.
encoder = Encoder(vocabSize, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocabSize, embedding_dim, units, BATCH_SIZE)

In [12]:
# Define the optimizer: Adam with the library's default settings
# (no learning rate or other argument is passed).
optimizer = tf.train.AdamOptimizer()

# loss function
def loss_function(real, pred, pad_id=0):
    """Sparse softmax cross-entropy that ignores padding positions.

    Args:
        real: integer tensor of target token ids.
        pred: float tensor of unnormalised logits, one row per target.
        pad_id: id of the padding token; positions whose target equals it
            contribute zero loss. Defaults to 0, the value that was
            previously hard-coded.

    Returns:
        Scalar tensor: the mean of the masked per-position losses. The mean
        is taken over ALL positions, padded ones included (they count as 0).
    """
    # 1.0 where the target is a real token, 0.0 where it is padding. Built
    # with TensorFlow ops and cast to the logits' dtype, so it does not depend
    # on NumPy accepting a tensor or on an implicit int -> float conversion.
    mask = tf.cast(tf.not_equal(real, pad_id), pred.dtype)
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask
    return tf.reduce_mean(loss_)

In [None]:
# Checkpoint files are written under ./training_checkpoints with the file
# name prefix "ckpt".
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Track the optimizer and both models so that a single save covers all three.
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [None]:
# Number of full passes over the training dataset.
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()
    
    # the encoder is fed the same all-zero initial state for every batch
    hidden = encoder.initialize_hidden_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset):
        loss = 0
        
        # record the forward pass so gradients can be computed afterwards
        with tf.GradientTape() as tape:
            enc_output, enc_hidden = encoder(inp, hidden)
            
            # the hidden state of decoder is initialized to the encoder hidden state
            dec_hidden = enc_hidden
            
            # first decoder input: one '<start>' id per example, shape (BATCH_SIZE, 1)
            dec_input = tf.expand_dims([languageIndex.w2i['<start>']] * BATCH_SIZE, 1)       
            
            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # the decoder receives only the previous token and its hidden
                # state; enc_output is not passed to it
                predictions, dec_hidden = decoder(dec_input, dec_hidden)
                
                # targets for this step: column t of the target matrix
                realLabels = targ[:,t]
                
                loss += loss_function(realLabels, predictions)
                # using teacher forcing. Get the correct word from the training data
                dec_input = tf.expand_dims(targ[:, t], 1)
        
        # loss reported per batch: the summed step losses divided by the
        # target length (the loop above runs one step fewer than that length)
        batch_loss = (loss / int(targ.shape[1]))
        
        total_loss += batch_loss
        
        variables = encoder.variables + decoder.variables
        
        # gradients are taken of the summed loss, not of batch_loss
        gradients = tape.gradient(loss, variables)
        
        optimizer.apply_gradients(zip(gradients, variables))
        
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
      checkpoint.save(file_prefix = checkpoint_prefix)
    
    # epoch loss: the per-batch losses averaged over NUM_BATCHES
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / NUM_BATCHES))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 2.9017
Epoch 1 Batch 100 Loss 1.5327
Epoch 1 Batch 200 Loss 1.7658
Epoch 1 Batch 300 Loss 1.7106
Epoch 1 Loss 1.7568
Time taken for 1 epoch 12045.129220724106 sec

Epoch 2 Batch 0 Loss 1.5567
Epoch 2 Batch 100 Loss 1.5638
Epoch 2 Batch 200 Loss 1.7031
Epoch 2 Batch 300 Loss 1.6888
Epoch 2 Loss 1.5519
Time taken for 1 epoch 13097.710629701614 sec

Epoch 3 Batch 0 Loss 1.4765
Epoch 3 Batch 100 Loss 1.4417
Epoch 3 Batch 200 Loss 1.3797
Epoch 3 Batch 300 Loss 1.6113
Epoch 3 Loss 1.4444
Time taken for 1 epoch 7319.031521320343 sec

Epoch 4 Batch 0 Loss 1.3812
Epoch 4 Batch 100 Loss 1.3784
Epoch 4 Batch 200 Loss 1.4308
Epoch 4 Batch 300 Loss 1.5519
Epoch 4 Loss 1.3315
Time taken for 1 epoch 7279.965874671936 sec

Epoch 5 Batch 0 Loss 1.2284
Epoch 5 Batch 100 Loss 1.1166
Epoch 5 Batch 200 Loss 1.1341
Epoch 5 Batch 300 Loss 1.3431
Epoch 5 Loss 1.1994
Time taken for 1 epoch 7249.232221841812 sec

Epoch 6 Batch 0 Loss 1.0355
Epoch 6 Batch 100 Loss 0.9736
Epoch 6 Batch 200 Lo

In [27]:
# now lets write a method to get the response given the input utterances
def getResponse(sentence):
    """Print the model's greedy reply to `sentence`.

    The sentence is cleaned, converted to ids, padded to MAX_LENGTH_INPUT and
    reversed exactly like the training inputs, then encoded. The decoder
    starts from the encoder's final state and the '<start>' token and, at
    every step, emits the highest-scoring word and feeds it back in. Decoding
    stops after '<end>' or after MAX_LENGTH_OUTPUT words. The reply, '<end>'
    included, is printed as words each followed by a space; nothing is
    returned.
    """
    # clean and tokenise, then map the words to ids (a batch of one sentence)
    tokens = [preprocess_sentence(sentence).split(' ')]
    tokenIds = languageIndex.createIndexMapping(tokens)

    # pad after the sentence, then reverse along the time axis
    padded = tf.keras.preprocessing.sequence.pad_sequences(tokenIds,
                                                           maxlen=MAX_LENGTH_INPUT,
                                                           padding='post')
    reversedRow = np.flip(padded,1)[0]
    encoderInput = tf.expand_dims(tf.convert_to_tensor(reversedRow),0)

    # encode from an all-zero state for a batch of size one
    zeroState = [tf.zeros((1, units))]
    _, encoderHiddenState = encoder(encoderInput,zeroState)

    # the decoder continues from the encoder's final state
    decHidden = encoderHiddenState
    decInput = tf.expand_dims([languageIndex.w2i['<start>']] , 0)

    words = []
    for _ in range(MAX_LENGTH_OUTPUT):
        predictions, decHidden = decoder(decInput, decHidden)

        # greedy decoding: take the arg-max of the logits (to sample instead,
        # one could draw from tf.multinomial(predictions, num_samples=1))
        predictedWordIndex = tf.argmax(predictions[0]).numpy()
        predictedWord = languageIndex.i2w[predictedWordIndex]
        words.append(predictedWord)

        # stop once the end token has been produced
        if predictedWord == '<end>':
            break
        decInput = tf.expand_dims([predictedWordIndex], 0)

    print(''.join(word + ' ' for word in words))

        

In [28]:
# Try the decoder on a longer sample utterance; the reply is printed below.
getResponse('Learn, connect and explore the future of making things at')

brightyujin filosofiachaeng im so soft <end> 


In [29]:
# Try the decoder on a short sample utterance; the reply is printed below.
getResponse('I am happy.')

sageymoore i love you <end> 
