In [3]:
import os
import re
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import*
from nltk.translate.bleu_score import sentence_bleu

# Silence library warnings so the cell output stays readable.
warnings.filterwarnings('ignore')

# Token vocabulary for single words: a-z, then A-Z, then the two markers
# that open and close every sequence.
UNIGRAM_VOCAB = (
    [chr(code) for code in range(ord('a'), ord('z') + 1)]
    + [chr(code) for code in range(ord('A'), ord('Z') + 1)]
    + ['<SOW>', '<EOW>']
)

# Token vocabulary for multi-word inputs: the same letters plus a literal
# space token, followed by the same two markers.
NGRAM_VOCAB = UNIGRAM_VOCAB[:-2] + [' ', '<SOW>', '<EOW>']

# Sequence-length settings; the vectorizers further down use these values + 2.
unigram_maxlen, bigram_maxlen, trigram_maxlen = 20, 24, 32

# Load the train / test / validation frames for every n-gram size.
# The first CSV column becomes the DataFrame index.
unigram_train, unigram_test, unigram_val = (
    pd.read_csv(path, index_col=0)
    for path in ('unigram_train.csv', 'unigram_test.csv', 'unigram_val.csv')
)

bigram_train, bigram_test, bigram_val = (
    pd.read_csv(path, index_col=0)
    for path in ('bigram_train.csv', 'bigram_test.csv', 'bigram_val.csv')
)

trigram_train, trigram_test, trigram_val = (
    pd.read_csv(path, index_col=0)
    for path in ('trigram_train.csv', 'trigram_test.csv', 'trigram_val.csv')
)

# Build the model-ready columns for the bigram/trigram test frames:
# characters are joined with '*' and wrapped in the <SOW>/<EOW> markers.
#   enc_inp : <SOW>*i*n*p*u*t*<EOW>
#   dec_inp : <SOW>*o*u*t*p*u*t
#   dec_out : o*u*t*p*u*t*<EOW>
bigram_test = bigram_test.assign(
    enc_inp=lambda df: '<SOW>*' + df['input'].astype(str).map('*'.join) + '*<EOW>',
    dec_inp=lambda df: '<SOW>*' + df['output'].astype(str).map('*'.join),
    dec_out=lambda df: df['output'].astype(str).map('*'.join) + '*<EOW>',
)

trigram_test = trigram_test.assign(
    enc_inp=lambda df: '<SOW>*' + df['input'].astype(str).map('*'.join) + '*<EOW>',
    dec_inp=lambda df: '<SOW>*' + df['output'].astype(str).map('*'.join),
    dec_out=lambda df: df['output'].astype(str).map('*'.join) + '*<EOW>',
)

# Report the size of every split.
print('Shape of unigram train set :', unigram_train.shape)
print('Shape of unigram test set :', unigram_test.shape)
print('Shape of unigram validation set :', unigram_val.shape)

print('\nShape of bigram train set :', bigram_train.shape)
print('Shape of bigram test set :', bigram_test.shape)
print('Shape of bigram validation set :', bigram_val.shape)

print('\nShape of trigram train set :', trigram_train.shape)
print('Shape of trigram test set :', trigram_test.shape)
print('Shape of trigram validation set :', trigram_val.shape)

def split_on_star(input_data):
    """Split string tensors into tokens on the '*' separator.

    Passed as the ``split`` callable of the bigram and trigram
    ``TextVectorization`` layers, whose inputs use '*' between tokens
    because the space character is itself a token there.
    """
    tokens = tf.strings.split(input_data, sep='*')
    return tokens

# Number of sequences per batch. Several model classes below read this global.
batch_size = 128

# unigram_maxlen / bigram_maxlen / trigram_maxlen are defined once near the top
# of this cell; they are deliberately not re-declared here so the two copies
# cannot drift apart.

# One vectorizer per n-gram size. Custom standardisation is disabled so case
# and the <SOW>/<EOW> markers survive. Sequences are padded/truncated to
# maxlen + 2 (presumably room for the two markers), and max_tokens is the
# vocabulary size + 2 (presumably the padding and OOV slots - confirm).
unigram_vec = TextVectorization(
    output_sequence_length=unigram_maxlen + 2,
    standardize=None,
    split='whitespace',
    max_tokens=len(UNIGRAM_VOCAB) + 2,
    output_mode='int',
)
unigram_vec.adapt(UNIGRAM_VOCAB)

bigram_vec = TextVectorization(
    output_sequence_length=bigram_maxlen + 2,
    standardize=None,
    split=split_on_star,
    max_tokens=len(NGRAM_VOCAB) + 2,
    output_mode='int',
)
bigram_vec.adapt(NGRAM_VOCAB)

trigram_vec = TextVectorization(
    output_sequence_length=trigram_maxlen + 2,
    standardize=None,
    split=split_on_star,
    max_tokens=len(NGRAM_VOCAB) + 2,
    output_mode='int',
)
trigram_vec.adapt(NGRAM_VOCAB)

# Lookup tables between token ids and token strings. Each vocabulary is
# fetched once and the reverse table is derived from the forward one.
unigram_index_to_word = dict(enumerate(unigram_vec.get_vocabulary()))
unigram_word_to_index = {word: idx for idx, word in unigram_index_to_word.items()}

bigram_index_to_word = dict(enumerate(bigram_vec.get_vocabulary()))
bigram_word_to_index = {word: idx for idx, word in bigram_index_to_word.items()}

trigram_index_to_word = dict(enumerate(trigram_vec.get_vocabulary()))
trigram_word_to_index = {word: idx for idx, word in trigram_index_to_word.items()}

def pred_bigram_mapping(x):
    """Vectorise the encoder-input column of a batch of bigram rows.

    ``x`` is a batch of DataFrame rows; column index 2 is read from every row
    (presumably the 'enc_inp' column - confirm against the CSV schema) and
    converted to token ids by ``bigram_vec``.
    """
    return bigram_vec(x[:, 2])

def pred_trigram_mapping(x):
    """Vectorise the encoder-input column of a batch of trigram rows.

    ``x`` is a batch of DataFrame rows; column index 2 is read from every row
    (presumably the 'enc_inp' column - confirm against the CSV schema) and
    converted to token ids by ``trigram_vec``.
    """
    return trigram_vec(x[:, 2])

def _make_pred_dataset(frame, mapping_fn):
    """Turn a DataFrame into a batched, vectorised prediction dataset.

    The trailing partial batch is dropped because the prediction models in
    this notebook assume every batch holds exactly ``batch_size`` rows.
    ``drop_remainder=True`` replaces the former ``values[:-remainder]`` slice,
    which produced an empty array whenever the row count was an exact
    multiple of ``batch_size`` (``values[:-0]``).

    Args:
        frame: DataFrame whose rows are fed to ``mapping_fn`` in batches.
        mapping_fn: function that maps a batch of rows to encoder token ids.

    Returns:
        A ``tf.data.Dataset`` yielding vectorised batches, rows in frame order.
    """
    return (
        tf.data.Dataset.from_tensor_slices(frame.values)
        .batch(batch_size, drop_remainder=True)
        .map(mapping_fn)
        .prefetch(1)
    )


bigram_train_dataset = _make_pred_dataset(bigram_train, pred_bigram_mapping)
bigram_val_dataset = _make_pred_dataset(bigram_val, pred_bigram_mapping)
bigram_test_dataset = _make_pred_dataset(bigram_test, pred_bigram_mapping)

trigram_train_dataset = _make_pred_dataset(trigram_train, pred_trigram_mapping)
trigram_val_dataset = _make_pred_dataset(trigram_val, pred_trigram_mapping)
trigram_test_dataset = _make_pred_dataset(trigram_test, pred_trigram_mapping)

# Sanity check: show the first vectorised bigram training batch.
a = next(bigram_train_dataset.as_numpy_iterator())
a

Shape of unigram train set : (33101, 5)
Shape of unigram test set : (3150, 2)
Shape of unigram validation set : (3678, 5)

Shape of bigram train set : (884533, 5)
Shape of bigram test set : (26377, 5)
Shape of bigram validation set : (98282, 5)

Shape of trigram train set : (884550, 5)
Shape of trigram test set : (24586, 5)
Shape of trigram validation set : (98284, 5)


array([[54, 23,  4, ...,  0,  0,  0],
       [54,  7, 12, ...,  0,  0,  0],
       [54, 34, 20, ...,  0,  0,  0],
       ...,
       [54,  3, 23, ...,  0,  0,  0],
       [54,  9, 16, ...,  0,  0,  0],
       [54, 15, 27, ...,  0,  0,  0]], dtype=int64)

In [123]:
class Encoder(tf.keras.Model):
    """Encoder of the plain seq2seq model: token embedding followed by an LSTM."""

    def __init__(self, inp_vocab_size, embedding_size, lstm_size, input_length):
        """Create the embedding and LSTM layers.

        Args:
            inp_vocab_size: size of the input vocabulary (embedding ``input_dim``).
            embedding_size: width of each embedding vector.
            lstm_size: number of LSTM units; also the width of the state tensors.
            input_length: forwarded to ``Embedding(input_length=...)``.
        """
        super().__init__()
        self.lstm_size = lstm_size
        # Token ids -> dense vectors.
        self.enc_embed = Embedding(
            input_dim=inp_vocab_size,
            output_dim=embedding_size,
            input_length=input_length,
        )
        # Emits the per-step outputs as well as the final hidden and cell state.
        self.enc_lstm = LSTM(
            lstm_size,
            return_sequences=True,
            return_state=True,
            dropout=0.4,
        )

    def call(self, input_sequence, states):
        """Embed ``input_sequence`` and run the LSTM starting from ``states``.

        Returns:
            ``(sequence_output, final_hidden_state, final_cell_state)``.
        """
        embedded = self.enc_embed(input_sequence)
        sequence_output, hidden, cell = self.enc_lstm(embedded, initial_state=states)
        return sequence_output, hidden, cell

    def initialize_states(self, batch_size):
        """Return zero-filled ``[hidden, cell]`` states of shape (batch_size, lstm_size)."""
        state_shape = (batch_size, self.lstm_size)
        return [tf.zeros(state_shape), tf.zeros(state_shape)]

class Decoder(tf.keras.Model):
    """Decoder of the plain seq2seq model: token embedding followed by an LSTM."""

    def __init__(self, out_vocab_size, embedding_size, lstm_size, input_length):
        """Create the embedding and LSTM layers.

        Args:
            out_vocab_size: size of the output vocabulary (embedding ``input_dim``).
            embedding_size: width of each embedding vector.
            lstm_size: number of LSTM units.
            input_length: forwarded to ``Embedding(input_length=...)``.
        """
        super().__init__()
        # Token ids -> dense vectors.
        self.dec_embed = Embedding(
            input_dim=out_vocab_size,
            output_dim=embedding_size,
            input_length=input_length,
        )
        # Emits the per-step outputs as well as the final hidden and cell state.
        self.dec_lstm = LSTM(
            lstm_size,
            return_sequences=True,
            return_state=True,
            dropout=0.4,
        )

    def call(self, input_sequence, initial_states):
        """Embed ``input_sequence`` and run the LSTM from ``initial_states``.

        Returns:
            ``(sequence_output, final_hidden_state, final_cell_state)``.
        """
        embedded = self.dec_embed(input_sequence)
        sequence_output, hidden, cell = self.dec_lstm(embedded, initial_state=initial_states)
        return sequence_output, hidden, cell

class Encoder_decoder(tf.keras.Model):
    """Teacher-forced seq2seq model: Encoder -> Decoder -> dropout -> softmax.

    Positional constructor arguments (``*params``), in order:
        0. input vocabulary size
        1. output vocabulary size
        2. embedding size (shared by encoder and decoder)
        3. LSTM size (shared by encoder and decoder)
        4. encoder input length
        5. decoder input length
    """

    def __init__(self, *params):
        super().__init__()
        (inp_vocab_size, out_vocab_size, embedding_size,
         lstm_size, enc_input_length, dec_input_length) = params[:6]
        # Create encoder object
        self.encoder = Encoder(inp_vocab_size=inp_vocab_size, embedding_size=embedding_size,
                               lstm_size=lstm_size, input_length=enc_input_length)
        # Create decoder object
        self.decoder = Decoder(out_vocab_size=out_vocab_size, embedding_size=embedding_size,
                               lstm_size=lstm_size, input_length=dec_input_length)
        # Dense layer over the output vocabulary with softmax activation
        self.dense = Dense(out_vocab_size, activation='softmax')
        # Built once here instead of on every forward pass. Dropout has no
        # weights, so saved weight files are unaffected.
        self.dropout = Dropout(0.5)

    def call(self, params, training = True):
        """Run one teacher-forced forward pass.

        Args:
            params: pair ``(encoder_input_ids, decoder_input_ids)``.
            training: forwarded to the dropout layer.

        Returns:
            Per-step softmax distribution over the output vocabulary.
        """
        enc_inp, dec_inp = params[0], params[1]
        # Size the zero state from the actual input instead of the notebook's
        # global ``batch_size`` so any batch size works.
        initial_state = self.encoder.initialize_states(tf.shape(enc_inp)[0])
        _, enc_h, enc_c = self.encoder(enc_inp, initial_state)
        # The decoder starts from the encoder's final state.
        output, _, _ = self.decoder(dec_inp, [enc_h, enc_c])
        output = self.dropout(output, training=training)
        return self.dense(output)

class pred_Encoder_decoder(tf.keras.Model):
    """Greedy-decoding variant of the plain seq2seq model.

    It builds the same encoder / decoder / dense layers as ``Encoder_decoder``
    in the same order, so weights saved from that model can be loaded here.

    Positional constructor arguments (``*params``), in order:
        0. input vocabulary size
        1. output vocabulary size
        2. embedding size
        3. LSTM size
        4. encoder input length (also used as the number of decoding steps)
        5. decoder input length
        6. token -> id mapping; must contain '<SOW>'
    """

    def __init__(self, *params):
        super().__init__()
        # Create encoder object
        self.encoder = Encoder(inp_vocab_size=params[0], embedding_size=params[2],
                               lstm_size=params[3], input_length=params[4])
        # Create decoder object
        self.decoder = Decoder(out_vocab_size=params[1], embedding_size=params[2],
                               lstm_size=params[3], input_length=params[5])
        # Dense layer over the output vocabulary with softmax activation
        self.dense = Dense(params[1], activation='softmax')
        self.word_to_index = params[6]
        self.max_len = params[4]

    def call(self, params):
        """Greedily decode ``max_len`` tokens for every row of ``params``.

        Args:
            params: encoder input token ids, one row per sequence.

        Returns:
            int64 tensor of predicted token ids, shape (batch, max_len).
        """
        enc_inp = params
        # Take the batch size from the input rather than the notebook global,
        # so partial batches decode correctly too.
        n_rows = tf.shape(enc_inp)[0]
        initial_state = self.encoder.initialize_states(n_rows)
        _, dec_h, dec_c = self.encoder(enc_inp, initial_state)
        # Every sequence starts decoding from the <SOW> token.
        pred = tf.fill([n_rows, 1], self.word_to_index['<SOW>'])
        all_outputs = tf.TensorArray(dtype=tf.int64, size=self.max_len)
        for t in range(self.max_len):
            dec_out, dec_h, dec_c = self.decoder(pred, [dec_h, dec_c])
            # Greedy choice: the most probable token becomes the next input.
            pred = tf.argmax(self.dense(dec_out), axis=-1)
            all_outputs = all_outputs.write(t, pred)
        # Stacked shape is (max_len, batch, 1): drop the singleton axis and
        # put the batch axis first.
        stacked = tf.squeeze(all_outputs.stack(), axis=-1)
        return tf.transpose(stacked, (1, 0))
    
def idx_to_word(idx, index_to_word):
    """Decode a sequence of token ids into a string.

    Ids are looked up in ``index_to_word`` and concatenated in order.
    Decoding stops at the first '<EOW>' token, which is not included.

    Args:
        idx: iterable of token ids.
        index_to_word: mapping from token id to token string.

    Returns:
        The concatenated tokens that precede the first '<EOW>'.
    """
    pieces = []
    for token_id in idx:
        token = index_to_word[token_id]
        if token == '<EOW>':
            return ''.join(pieces)
        pieces.append(token)
    return ''.join(pieces)

def predict(seq, vectorizer, index_to_word, gram ='uni'):
    """Decode a single input string with the global ``pred_model``.

    Args:
        seq: raw input string; every character becomes one token.
        vectorizer: ``TextVectorization`` layer matching ``pred_model``.
        index_to_word: token id -> token string mapping for the output side.
        gram: 'uni' joins tokens with spaces (whitespace-split vectorizer);
            any other value joins them with '*' (star-split vectorizers).

    Returns:
        The decoded output string, cut at the first '<EOW>'.
    """
    if gram == 'uni':
        text = '<SOW> ' + ' '.join(seq) + ' <EOW>'
    else:
        text = '<SOW>*' + '*'.join(seq) + '*<EOW>'
    # Vectorising a one-element list yields a single row of token ids.
    encoded = vectorizer([text])
    # The prediction model needs a 2-D batch and may be hard-wired to
    # ``batch_size`` rows, so repeat the row to a full batch and read row 0.
    # The earlier version added a third axis and passed the whole 2-D
    # prediction to ``idx_to_word``; both fail.
    batch = tf.tile(encoded, [batch_size, 1])
    pred = pred_model.predict(batch, batch_size=batch_size)
    return idx_to_word(pred[0], index_to_word)

In [95]:
# Hyper-parameters of the saved plain seq2seq bigram model.
vocab_size = len(bigram_vec.get_vocabulary())
embedding_dim = 100
lstm_size = 256
# Same value as the vectorizer's output_sequence_length (bigram_maxlen + 2 = 26),
# derived instead of repeated as a literal.
max_len = bigram_maxlen + 2
shape = bigram_val.shape[0]  # not used below; kept so the cell still defines it

# Rebuild the inference model and restore the trained weights.
pred_model = pred_Encoder_decoder(vocab_size, vocab_size, embedding_dim, lstm_size, max_len, max_len, bigram_word_to_index)
pred_model.compile(optimizer = 'Adam', loss = 'sparse_categorical_crossentropy')
pred_model.build(input_shape=(batch_size, max_len))
pred_model.load_weights('seq2seq_bigram.h5')

# For each split: decode every batch, then add up sentence-level BLEU between
# the reference string and the decoded string. Both are plain strings, so
# n-grams are taken over characters. Predictions cover only the full batches,
# so zip() stops at the last predicted row; rows stay aligned with the frame.
train_pred = pred_model.predict(bigram_train_dataset)
train_bleu = sum(
    sentence_bleu([reference], idx_to_word(row, bigram_index_to_word))
    for row, reference in zip(tqdm(train_pred), bigram_train.output.values)
)

val_pred = pred_model.predict(bigram_val_dataset)
val_bleu = sum(
    sentence_bleu([reference], idx_to_word(row, bigram_index_to_word))
    for row, reference in zip(tqdm(val_pred), bigram_val.output.values)
)

test_pred = pred_model.predict(bigram_test_dataset)
test_bleu = sum(
    sentence_bleu([reference], idx_to_word(row, bigram_index_to_word))
    for row, reference in zip(tqdm(test_pred), bigram_test.output.values)
)

# Mean BLEU per evaluated row.
print('BLEU Score on train: ',train_bleu/train_pred.shape[0])
print('BLEU Score on val: ',val_bleu/val_pred.shape[0])
print('BLEU Score on test: ',test_bleu/test_pred.shape[0])

100%|████████████████████████████████████████████████████████████████████████| 884480/884480 [01:37<00:00, 9105.30it/s]
100%|██████████████████████████████████████████████████████████████████████████| 98176/98176 [00:10<00:00, 8988.81it/s]
100%|██████████████████████████████████████████████████████████████████████████| 26368/26368 [00:02<00:00, 9383.63it/s]

BLEU Score on train:  0.9654521214897293
BLEU Score on val:  0.9467498122935679
BLEU Score on test:  0.931299275417208





In [124]:
# Hyper-parameters of the saved plain seq2seq trigram model.
vocab_size = len(trigram_vec.get_vocabulary())
embedding_dim = 100
lstm_size = 256
# Same value as the vectorizer's output_sequence_length (trigram_maxlen + 2 = 34),
# derived instead of repeated as a literal.
max_len = trigram_maxlen + 2

# Rebuild the inference model and restore the trained weights.
pred_model = pred_Encoder_decoder(vocab_size, vocab_size, embedding_dim, lstm_size, max_len, max_len, trigram_word_to_index)
pred_model.compile(optimizer = 'Adam', loss = 'sparse_categorical_crossentropy')
pred_model.build(input_shape=(batch_size, max_len))
pred_model.load_weights('seq2seq_trigram.h5')

# For each split: decode every batch, then add up sentence-level BLEU between
# the reference string and the decoded string. Both are plain strings, so
# n-grams are taken over characters. Predictions cover only the full batches,
# so zip() stops at the last predicted row; rows stay aligned with the frame.
train_pred = pred_model.predict(trigram_train_dataset)
train_bleu = sum(
    sentence_bleu([reference], idx_to_word(row, trigram_index_to_word))
    for row, reference in zip(tqdm(train_pred), trigram_train.output.values)
)

val_pred = pred_model.predict(trigram_val_dataset)
val_bleu = sum(
    sentence_bleu([reference], idx_to_word(row, trigram_index_to_word))
    for row, reference in zip(tqdm(val_pred), trigram_val.output.values)
)

test_pred = pred_model.predict(trigram_test_dataset)
test_bleu = sum(
    sentence_bleu([reference], idx_to_word(row, trigram_index_to_word))
    for row, reference in zip(tqdm(test_pred), trigram_test.output.values)
)

# Mean BLEU per evaluated row.
print('BLEU Score on train: ',train_bleu/train_pred.shape[0])
print('BLEU Score on val: ',val_bleu/val_pred.shape[0])
print('BLEU Score on test: ',test_bleu/test_pred.shape[0])

100%|████████████████████████████████████████████████████████████████████████| 884480/884480 [02:05<00:00, 7070.92it/s]
100%|██████████████████████████████████████████████████████████████████████████| 98176/98176 [00:15<00:00, 6236.88it/s]
100%|██████████████████████████████████████████████████████████████████████████| 24576/24576 [00:03<00:00, 7938.08it/s]

BLEU Score on train:  0.9684256422311769
BLEU Score on val:  0.9575594555349892
BLEU Score on test:  0.949065764861953





In [167]:
class Encoder(tf.keras.layers.Layer):
    """Encoder for the attention model: token embedding followed by an LSTM."""

    def __init__(self, inp_vocab_size, embedding_size, lstm_size, input_length):
        """Create the embedding and LSTM layers.

        Args:
            inp_vocab_size: size of the input vocabulary (embedding ``input_dim``).
            embedding_size: width of each embedding vector.
            lstm_size: number of LSTM units; also the width of the state tensors.
            input_length: accepted for signature compatibility; not used here.
        """
        super().__init__()
        self.lstm_size = lstm_size
        # Token ids -> dense vectors.
        self.enc_embed = Embedding(input_dim=inp_vocab_size, output_dim=embedding_size)
        # Emits the per-step outputs (consumed by attention) plus the final states.
        self.enc_lstm = LSTM(
            lstm_size,
            return_sequences=True,
            return_state=True,
            dropout=0.4,
        )

    def call(self, input_sequence, states):
        """Embed ``input_sequence`` and run the LSTM starting from ``states``.

        Returns:
            ``(sequence_output, final_hidden_state, final_cell_state)``.
        """
        embedded = self.enc_embed(input_sequence)
        sequence_output, hidden, cell = self.enc_lstm(embedded, initial_state=states)
        return sequence_output, hidden, cell

    def initialize_states(self, batch_size):
        """Return zero-filled ``[hidden, cell]`` states of shape (batch_size, lstm_size)."""
        state_shape = (batch_size, self.lstm_size)
        return [tf.zeros(state_shape), tf.zeros(state_shape)]

class Attention(tf.keras.layers.Layer):
    """Attention over the encoder outputs with a selectable score function.

    Supported ``scoring_function`` values:
        'dot'     - dot product of decoder state and encoder output
        'general' - dot product of decoder state and a Dense projection
                    of the encoder output
        'concat'  - additive: V(tanh(W1(state) + W2(encoder_output)))
    """

    _SCORING_FUNCTIONS = ('dot', 'general', 'concat')

    def __init__(self,scoring_function, att_units):
        super(Attention, self).__init__()
        # Fail fast on an unknown score function. Previously this only showed
        # up inside call() as an unbound ``score`` variable.
        if scoring_function not in self._SCORING_FUNCTIONS:
            raise ValueError(
                "scoring_function must be one of %r, got %r"
                % (self._SCORING_FUNCTIONS, scoring_function))
        self.scoring_function = scoring_function
        if scoring_function == 'dot':
            self.dot = Dot(axes = (1, 2))
        elif scoring_function == 'general':
            # Variables needed for the General score function
            self.W = Dense(att_units)
            self.dot = Dot(axes = (1, 2))
        elif scoring_function == 'concat':
            # Variables needed for the Concat score function
            self.W1 = Dense(att_units)
            self.W2 = Dense(att_units)
            self.V = Dense(1)

    def call(self,decoder_hidden_state,encoder_output):
        """Compute the context vector for one decoding step.

        Args:
            decoder_hidden_state: current decoder hidden state; assumed to be
                (batch, units) since a time axis is inserted at position 1.
            encoder_output: encoder outputs for every input step; assumed to
                be (batch, steps, units).

        Returns:
            ``(context_vector, attention_weights)``: the weighted sum of the
            encoder outputs over the step axis, and the weights themselves.
        """
        decoder_hidden_state = tf.expand_dims(decoder_hidden_state, 1)

        if self.scoring_function == 'dot':
            # Dot score: state . encoder_output, brought back to (batch, steps, 1).
            score = tf.transpose(self.dot([tf.transpose(decoder_hidden_state, (0, 2, 1)), encoder_output]), (0, 2,1))

        elif self.scoring_function == 'general':
            # General score: state . W(encoder_output).
            mul = self.W(encoder_output)
            score = tf.transpose(self.dot([tf.transpose(decoder_hidden_state, (0, 2, 1)), mul]), (0, 2,1))

        elif self.scoring_function == 'concat':
            # Concat (additive) score: V(tanh(W1(state) + W2(encoder_output))).
            inter = self.W1(decoder_hidden_state) + self.W2(encoder_output)
            tan = tf.nn.tanh(inter)
            score = self.V(tan)
        # Normalise over the encoder step axis.
        attention_weights = tf.nn.softmax(score, axis =1)
        context_vector = attention_weights * encoder_output
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

class OneStepDecoder(tf.keras.layers.Layer):
    """Decodes a single time step: embed, attend, LSTM, then project to logits."""

    def __init__(self,tar_vocab_size, embedding_dim, input_length, dec_units ,score_fun ,att_units):
        """Create the sub-layers.

        Args:
            tar_vocab_size: output vocabulary size (embedding rows, logit width).
            embedding_dim: width of each embedding vector.
            input_length: accepted for signature compatibility; not used here.
            dec_units: number of decoder LSTM units.
            score_fun: attention score function name, passed to ``Attention``.
            att_units: attention projection width, passed to ``Attention``.
        """
        super(OneStepDecoder, self).__init__()
        self.embed_dec = Embedding(input_dim = tar_vocab_size, output_dim = embedding_dim)
        self.lstm = LSTM(dec_units, return_sequences = True, return_state = True, dropout = 0.4)
        self.attention = Attention(scoring_function = score_fun, att_units = att_units)
        self.fc = Dense(tar_vocab_size)
        # Built once here instead of on every call. Dropout has no weights,
        # so saved weight files are unaffected.
        self.dropout = Dropout(0.5)

    def call(self,input_to_decoder, encoder_output, state_h,state_c):
        """Run one decoding step.

        Args:
            input_to_decoder: token ids for this step, (batch, 1) or (batch,).
            encoder_output: encoder outputs for every input step.
            state_h: previous decoder hidden state.
            state_c: previous decoder cell state.

        Returns:
            ``(logits, new_h, new_c, attention_weights, context_vector)``.
        """
        embed = self.embed_dec(input_to_decoder)
        context_vect, attention_weights = self.attention(state_h, encoder_output)
        # Give both tensors an explicit single-step time axis. The sizes are
        # taken from the tensors themselves, not from the notebook's
        # ``batch_size`` or the literals 100 / 256.
        embed = tf.reshape(embed, (-1, 1, embed.shape[-1]))
        context_vect = tf.expand_dims(context_vect, 1)
        final_inp = tf.concat([context_vect, embed], axis = -1)
        out, dec_h, dec_c = self.lstm(final_inp, initial_state=[state_h, state_c])
        # Drop the single-step time axis before the projection.
        out = tf.reshape(out, (-1, out.shape[2]))
        output = self.fc(out)
        # NOTE(review): dropout is applied to the logits here, which matches
        # how this model was written. It is left in place so behaviour does
        # not change - confirm whether that placement is intended.
        output = self.dropout(output)
        return output, dec_h, dec_c, attention_weights, context_vect

class encoder_decoder(tf.keras.Model):
    """Teacher-forced attention model: Encoder plus a step-wise attention decoder."""

    def __init__(self, inp_vocab_size, out_vocab_size, embedding_dim, enc_units, dec_units, max_len_inp, max_len_out, score_fun, att_units, batch_size):
        """Create the encoder and the one-step decoder.

        Args:
            inp_vocab_size: input vocabulary size.
            out_vocab_size: output vocabulary size.
            embedding_dim: embedding width for both sides.
            enc_units: encoder LSTM units.
            dec_units: decoder LSTM units.
            max_len_inp: encoder input length (passed through to ``Encoder``).
            max_len_out: number of decoder time steps unrolled in ``call``.
            score_fun: attention score function name.
            att_units: attention projection width.
            batch_size: batch size used to build the encoder's zero state.
        """
        super(encoder_decoder, self).__init__()
        self.encoder = Encoder(inp_vocab_size, embedding_dim, enc_units, max_len_inp)
        self.one_step_decoder = OneStepDecoder(out_vocab_size, embedding_dim, max_len_out, dec_units ,score_fun ,att_units)
        self.batch_size = batch_size
        # Stored so call() no longer depends on whatever the notebook-level
        # ``max_len`` happens to be when the model runs.
        self.max_len_out = max_len_out

    def call(self, data):
        """Decode every step of ``dec_inp`` with teacher forcing.

        Args:
            data: pair ``(encoder_input_ids, decoder_input_ids)``.

        Returns:
            Decoder outputs for every step, arranged (batch, steps, vocab).
        """
        enc_inp, dec_inp = data[0], data[1]
        initial_state = self.encoder.initialize_states(self.batch_size)
        enc_output, enc_h, enc_c = self.encoder(enc_inp, initial_state)
        all_outputs = tf.TensorArray(dtype = tf.float32, size= self.max_len_out)

        # The decoder starts from the encoder's final state.
        dec_h = enc_h
        dec_c = enc_c
        for timestep in range(self.max_len_out):
            # Feed the ground-truth token of this step to the one-step decoder.
            output, dec_h, dec_c, _, _ = self.one_step_decoder(dec_inp[:, timestep:timestep+1],
                                                               enc_output,
                                                               dec_h,
                                                               dec_c)
            all_outputs = all_outputs.write(timestep, output)
        # Stacked as (steps, batch, vocab); put the batch axis first.
        all_outputs = tf.transpose(all_outputs.stack(), (1, 0, 2))
        return all_outputs

class pred_Encoder_decoder(tf.keras.Model):
    """Greedy-decoding variant of the attention model.

    It builds the same encoder / one-step-decoder pair as ``encoder_decoder``
    so weights saved from that model can be loaded here.
    """

    def __init__(self, inp_vocab_size, out_vocab_size, embedding_dim, enc_units, dec_units, max_len_ita, max_len_eng, score_fun, att_units, word_to_index):
        """Create the encoder and the one-step decoder.

        Args:
            inp_vocab_size: input vocabulary size.
            out_vocab_size: output vocabulary size.
            embedding_dim: embedding width for both sides.
            enc_units: encoder LSTM units.
            dec_units: decoder LSTM units.
            max_len_ita: encoder input length; also the number of decoding steps.
            max_len_eng: decoder length (passed through to ``OneStepDecoder``).
            score_fun: attention score function name.
            att_units: attention projection width.
            word_to_index: token -> id mapping; must contain '<SOW>'.
        """
        super(pred_Encoder_decoder, self).__init__()
        self.encoder = Encoder(inp_vocab_size, embedding_dim, enc_units, max_len_ita)
        self.one_step_decoder = OneStepDecoder(out_vocab_size, embedding_dim, max_len_eng, dec_units ,score_fun ,att_units)
        # Read from the notebook global (there is no such constructor argument);
        # kept only as an attribute, call() no longer uses it.
        self.batch_size = batch_size
        self.word_to_index = word_to_index
        self.max_len = max_len_ita

    def call(self, params):
        """Greedily decode ``max_len`` tokens for every row of ``params``.

        Args:
            params: encoder input token ids, one row per sequence.

        Returns:
            int64 tensor of predicted token ids, shape (batch, max_len).
        """
        enc_inp = params
        # Take the batch size from the input rather than the notebook global.
        n_rows = tf.shape(enc_inp)[0]
        initial_state = self.encoder.initialize_states(n_rows)
        enc_output, dec_h, dec_c = self.encoder(enc_inp, initial_state)
        # Every sequence starts decoding from the <SOW> token.
        pred = tf.fill([n_rows, 1], self.word_to_index['<SOW>'])
        all_outputs = tf.TensorArray(dtype = tf.int64, size= self.max_len)
        for t in range(self.max_len):
            output, dec_h, dec_c, _, _ = self.one_step_decoder(pred, enc_output, dec_h, dec_c)
            # Greedy choice: the highest-scoring token becomes the next input.
            pred = tf.argmax(output, axis = -1)
            all_outputs = all_outputs.write(t, pred)
        # Stacked as (steps, batch); put the batch axis first.
        all_outputs = tf.transpose(all_outputs.stack(), (1, 0))
        return all_outputs
    
# Per-position cross-entropy on raw logits; reduction is done in loss_function.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')


def loss_function(real, pred):
    """Cross-entropy that zeroes out positions whose target id is 0.

    Args:
        real: integer target ids; id 0 marks positions to ignore.
        pred: unnormalised logits for every position.

    Returns:
        Scalar loss. The mean is taken over all positions, including the
        zeroed ones.
    """
    per_position = loss_object(real, pred)
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_position.dtype)
    return tf.reduce_mean(per_position * keep)

def predict(seq, vectorizer, index_to_word, gram = 'uni'):
    """Decode a single input string with the global ``pred_model``.

    Args:
        seq: raw input string; every character becomes one token.
        vectorizer: ``TextVectorization`` layer matching ``pred_model``.
        index_to_word: token id -> token string mapping for the output side.
        gram: 'uni' joins tokens with spaces (whitespace-split vectorizer);
            any other value joins them with '*' (star-split vectorizers).

    Returns:
        The decoded output string, cut at the first '<EOW>'.
    """
    if gram =='uni':
        text = '<SOW> '+' '.join(seq)+' <EOW>'
    else:
        text = '<SOW>*'+'*'.join(seq)+'*<EOW>'
    # Vectorising a one-element list yields a single row of token ids.
    encoded = vectorizer([text])
    # The prediction model returns one array of token ids (no attention
    # weights) and may be hard-wired to ``batch_size`` rows, so repeat the
    # row to a full batch and read row 0. The earlier version added a third
    # axis and unpacked two return values; both fail.
    batch = tf.tile(encoded, [batch_size, 1])
    pred = pred_model.predict(batch, batch_size=batch_size)
    return idx_to_word(pred[0], index_to_word)

In [173]:
# Hyper-parameters of the saved concat-attention bigram model.
vocab_size = len(bigram_vec.get_vocabulary())
embedding_dim = 100
lstm_size = 256
att_units = 256
# Same value as the vectorizer's output_sequence_length (bigram_maxlen + 2 = 26),
# derived instead of repeated as a literal.
max_len = bigram_maxlen + 2

# Rebuild the inference model and restore the trained weights.
pred_model = pred_Encoder_decoder(vocab_size, vocab_size, embedding_dim, lstm_size, lstm_size, max_len, max_len, 'concat', att_units, bigram_word_to_index)
pred_model.compile(optimizer = 'Adam', loss = 'sparse_categorical_crossentropy')
pred_model.build(input_shape=(batch_size, max_len))
pred_model.load_weights('Attention_concat_lstm_bigram.h5')

# For each split: decode every batch, then add up sentence-level BLEU between
# the reference string and the decoded string. Both are plain strings, so
# n-grams are taken over characters. Predictions cover only the full batches,
# so zip() stops at the last predicted row; rows stay aligned with the frame.
train_pred = pred_model.predict(bigram_train_dataset)
train_bleu = sum(
    sentence_bleu([reference], idx_to_word(row, bigram_index_to_word))
    for row, reference in zip(tqdm(train_pred), bigram_train.output.values)
)

val_pred = pred_model.predict(bigram_val_dataset)
val_bleu = sum(
    sentence_bleu([reference], idx_to_word(row, bigram_index_to_word))
    for row, reference in zip(tqdm(val_pred), bigram_val.output.values)
)

test_pred = pred_model.predict(bigram_test_dataset)
test_bleu = sum(
    sentence_bleu([reference], idx_to_word(row, bigram_index_to_word))
    for row, reference in zip(tqdm(test_pred), bigram_test.output.values)
)

# Mean BLEU per evaluated row.
print('BLEU Score on train: ',train_bleu/train_pred.shape[0])
print('BLEU Score on val: ',val_bleu/val_pred.shape[0])
print('BLEU Score on test: ',test_bleu/test_pred.shape[0])

100%|████████████████████████████████████████████████████████████████████████| 884480/884480 [01:43<00:00, 8574.14it/s]
100%|██████████████████████████████████████████████████████████████████████████| 98176/98176 [00:11<00:00, 8424.60it/s]
100%|██████████████████████████████████████████████████████████████████████████| 26368/26368 [00:02<00:00, 9064.28it/s]

BLEU Score on train:  0.9719945427791346
BLEU Score on val:  0.9588801560719675
BLEU Score on test:  0.9460340413418259





In [175]:
# Hyper-parameters of the saved concat-attention trigram model.
vocab_size = len(trigram_vec.get_vocabulary())
embedding_dim = 100
lstm_size = 256
att_units = 256
# Same value as the vectorizer's output_sequence_length (trigram_maxlen + 2 = 34),
# derived instead of repeated as a literal.
max_len = trigram_maxlen + 2

# Rebuild the inference model and restore the trained weights.
pred_model = pred_Encoder_decoder(vocab_size, vocab_size, embedding_dim, lstm_size, lstm_size, max_len, max_len, 'concat', att_units, trigram_word_to_index)
pred_model.compile(optimizer = 'Adam', loss = 'sparse_categorical_crossentropy')
pred_model.build(input_shape=(batch_size, max_len))
pred_model.load_weights('Attention_concat_lstm_trigram.h5')

# For each split: decode every batch, then add up sentence-level BLEU between
# the reference string and the decoded string. Both are plain strings, so
# n-grams are taken over characters. Predictions cover only the full batches,
# so zip() stops at the last predicted row; rows stay aligned with the frame.
train_pred = pred_model.predict(trigram_train_dataset)
train_bleu = sum(
    sentence_bleu([reference], idx_to_word(row, trigram_index_to_word))
    for row, reference in zip(tqdm(train_pred), trigram_train.output.values)
)

val_pred = pred_model.predict(trigram_val_dataset)
val_bleu = sum(
    sentence_bleu([reference], idx_to_word(row, trigram_index_to_word))
    for row, reference in zip(tqdm(val_pred), trigram_val.output.values)
)

test_pred = pred_model.predict(trigram_test_dataset)
test_bleu = sum(
    sentence_bleu([reference], idx_to_word(row, trigram_index_to_word))
    for row, reference in zip(tqdm(test_pred), trigram_test.output.values)
)

# Mean BLEU per evaluated row.
print('BLEU Score on train: ',train_bleu/train_pred.shape[0])
print('BLEU Score on val: ',val_bleu/val_pred.shape[0])
print('BLEU Score on test: ',test_bleu/test_pred.shape[0])

100%|████████████████████████████████████████████████████████████████████████| 884480/884480 [02:07<00:00, 6952.47it/s]
100%|██████████████████████████████████████████████████████████████████████████| 98176/98176 [00:13<00:00, 7106.41it/s]
100%|██████████████████████████████████████████████████████████████████████████| 24576/24576 [00:03<00:00, 7524.74it/s]

BLEU Score on train:  0.9811416412867453
BLEU Score on val:  0.9743928120876085
BLEU Score on test:  0.96227561809527





In [14]:
class Encoder(tf.keras.layers.Layer):
    """Bidirectional encoder: token embedding followed by a Bidirectional LSTM."""

    def __init__(self, vocab_size, embedding_size, lstm_size, input_length):
        """Create the embedding and bidirectional LSTM layers.

        Args:
            vocab_size: vocabulary size (embedding ``input_dim``).
            embedding_size: width of each embedding vector.
            lstm_size: units per direction; also the width of each state tensor.
            input_length: accepted for signature compatibility; not used here.
        """
        super().__init__()
        self.lstm_size = lstm_size
        self.enc_embed = Embedding(input_dim=vocab_size, output_dim=embedding_size)
        # The wrapped LSTM returns per-step outputs and its final states, so
        # the bidirectional wrapper yields four state tensors in total.
        self.enc_lstm = Bidirectional(
            LSTM(lstm_size, return_sequences=True, return_state=True, dropout=0.4)
        )

    def call(self, input_sequence, states):
        """Embed ``input_sequence`` and run the bidirectional LSTM from ``states``.

        Returns:
            ``(sequence_output, forward_h, forward_c, backward_h, backward_c)``.
        """
        embedded = self.enc_embed(input_sequence)
        sequence_output, fwd_h, fwd_c, bwd_h, bwd_c = self.enc_lstm(embedded, initial_state=states)
        return sequence_output, fwd_h, fwd_c, bwd_h, bwd_c

    def initialize_states(self, batch_size):
        """Return four zero-filled state tensors of shape (batch_size, lstm_size).

        Order: forward hidden, forward cell, backward hidden, backward cell.
        """
        state_shape = (batch_size, self.lstm_size)
        return [tf.zeros(state_shape) for _ in range(4)]

class Attention(tf.keras.layers.Layer):
    """Attention for the bidirectional decoder with a selectable score function.

    Supported ``scoring_function`` values:
        'dot'     - dot product of the concatenated forward/backward decoder
                    state and the encoder output
        'general' - dot product of that state and a Dense projection of the
                    encoder output
        'concat'  - additive: V(tanh(W1(fwd) + W2(bwd) + W3(encoder_output)))
    """

    _SCORING_FUNCTIONS = ('dot', 'general', 'concat')

    def __init__(self,scoring_function, att_units):
        super(Attention, self).__init__()
        # Fail fast on an unknown score function instead of leaving ``score``
        # unbound inside call().
        if scoring_function not in self._SCORING_FUNCTIONS:
            raise ValueError(
                "scoring_function must be one of %r, got %r"
                % (self._SCORING_FUNCTIONS, scoring_function))
        self.scoring_function = scoring_function
        if scoring_function == 'dot':
            self.dot = Dot(axes = (1, 2))
        elif scoring_function == 'general':
            self.W = Dense(att_units)
            self.dot = Dot(axes = (1, 2))
        elif scoring_function == 'concat':
            self.W1 = Dense(att_units)
            self.W2 = Dense(att_units)
            self.W3 = Dense(att_units)
            self.V = Dense(1)

    def call(self, dec_frwd_state, dec_bkwd_state, encoder_output):
        """Compute the context vector for one decoding step.

        Args:
            dec_frwd_state: forward-direction decoder hidden state; assumed
                (batch, units) since a time axis is inserted at position 1.
            dec_bkwd_state: backward-direction decoder hidden state, same shape.
            encoder_output: bidirectional encoder outputs for every step;
                assumed (batch, steps, 2 * units).

        Returns:
            ``(context_vector, attention_weights)``.
        """
        dec_frwd_state = tf.expand_dims(dec_frwd_state, 1)
        dec_bkwd_state = tf.expand_dims(dec_bkwd_state, 1)

        if self.scoring_function in ('dot', 'general'):
            # These two branches used to read an undefined
            # ``decoder_hidden_state``. The forward and backward states are
            # concatenated so the width lines up with the bidirectional
            # encoder output.
            decoder_hidden_state = tf.concat([dec_frwd_state, dec_bkwd_state], axis=-1)

        if self.scoring_function == 'dot':
            score = tf.transpose(self.dot([tf.transpose(decoder_hidden_state, (0, 2, 1)), encoder_output]), (0, 2,1))
        elif self.scoring_function == 'general':
            # Requires att_units to equal the concatenated state width.
            mul = self.W(encoder_output)
            score = tf.transpose(self.dot([tf.transpose(decoder_hidden_state, (0, 2, 1)), mul]), (0, 2,1))
        elif self.scoring_function == 'concat':
            inter = self.W1(dec_frwd_state) + self.W2(dec_bkwd_state) + self.W3(encoder_output)
            tan = tf.nn.tanh(inter)
            score = self.V(tan)
        # Normalise over the encoder step axis.
        attention_weights = tf.nn.softmax(score, axis =1)
        context_vector = attention_weights * encoder_output
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

class OneStepDecoder(tf.keras.layers.Layer):
    """Decodes one time step with a bidirectional LSTM and attention."""

    def __init__(self, vocab_size, embedding_dim, input_length, dec_units ,score_fun ,att_units):
        """Create the sub-layers.

        Args:
            vocab_size: vocabulary size (embedding rows and logit width).
            embedding_dim: width of each embedding vector.
            input_length: accepted for signature compatibility; not used here.
            dec_units: decoder LSTM units per direction.
            score_fun: attention score function name, passed to ``Attention``.
            att_units: attention projection width, passed to ``Attention``.
        """
        super(OneStepDecoder, self).__init__()
        self.embed_dec = Embedding(input_dim = vocab_size, output_dim = embedding_dim)
        self.lstm = Bidirectional(LSTM(dec_units, return_sequences = True, return_state = True, dropout = 0.4))
        self.attention = Attention(scoring_function = score_fun, att_units = att_units)
        self.fc = Dense(vocab_size)
        # Built once here instead of on every call. Dropout has no weights,
        # so saved weight files are unaffected.
        self.dropout = Dropout(0.5)

    def call(self,input_to_decoder, encoder_output, state_frwd_h, state_frwd_c, state_bkwd_h, state_bkwd_c):
        """Run one decoding step.

        Args:
            input_to_decoder: token ids for this step, (batch, 1) or (batch,).
            encoder_output: bidirectional encoder outputs for every input step.
            state_frwd_h, state_frwd_c: previous forward hidden / cell state.
            state_bkwd_h, state_bkwd_c: previous backward hidden / cell state.

        Returns:
            ``(logits, fwd_h, fwd_c, bwd_h, bwd_c, attention_weights, context_vector)``.
        """
        embed = self.embed_dec(input_to_decoder)
        context_vect, attention_weights = self.attention(state_frwd_h, state_bkwd_h, encoder_output)
        # Give both tensors an explicit single-step time axis. The sizes are
        # taken from the tensors themselves, not from the notebook's
        # ``batch_size`` or the literals 100 / 512.
        embed = tf.reshape(embed, (-1, 1, embed.shape[-1]))
        context_vect = tf.expand_dims(context_vect, 1)
        final_inp = tf.concat([context_vect, embed], axis = -1)
        out, dec_frwd_h, dec_frwd_c, dec_bkwd_h, dec_bkwd_c = self.lstm(
            final_inp,
            initial_state=[state_frwd_h, state_frwd_c, state_bkwd_h, state_bkwd_c])
        # Drop the single-step time axis before dropout and the projection.
        out = tf.reshape(out, (-1, out.shape[2]))
        out = self.dropout(out)
        output = self.fc(out)
        return output, dec_frwd_h, dec_frwd_c, dec_bkwd_h, dec_bkwd_c, attention_weights, context_vect

class encoder_decoder(tf.keras.Model):
    """Teacher-forced bidirectional attention model."""

    def __init__(self, vocab_size, embedding_dim, enc_units, dec_units, max_len, score_fun, att_units, batch_size):
        """Create the encoder and the one-step decoder.

        Args:
            vocab_size: vocabulary size shared by input and output.
            embedding_dim: embedding width for both sides.
            enc_units: encoder LSTM units per direction.
            dec_units: decoder LSTM units per direction.
            max_len: sequence length; number of decoder steps unrolled in ``call``.
            score_fun: attention score function name.
            att_units: attention projection width.
            batch_size: batch size used to build the encoder's zero state.
        """
        super(encoder_decoder, self).__init__()
        self.encoder = Encoder(vocab_size, embedding_dim, enc_units, max_len)
        self.one_step_decoder = OneStepDecoder(vocab_size, embedding_dim, max_len, dec_units ,score_fun ,att_units)
        self.batch_size = batch_size
        # Stored so call() uses the constructor argument. Previously ``max_len``
        # inside call() resolved to the notebook-level global.
        self.max_len = max_len

    def call(self, data):
        """Decode every step of ``dec_inp`` with teacher forcing.

        Args:
            data: pair ``(encoder_input_ids, decoder_input_ids)``.

        Returns:
            Decoder outputs for every step, arranged (batch, steps, vocab).
        """
        enc_inp, dec_inp = data[0], data[1]
        initial_state = self.encoder.initialize_states(self.batch_size)
        enc_output, enc_frwd_h, enc_frwd_c, enc_bkwd_h, enc_bkwd_c = self.encoder(enc_inp, initial_state)
        all_outputs = tf.TensorArray(dtype = tf.float32, size= self.max_len)

        # The decoder starts from the encoder's final states.
        dec_frwd_h = enc_frwd_h
        dec_frwd_c = enc_frwd_c
        dec_bkwd_h = enc_bkwd_h
        dec_bkwd_c = enc_bkwd_c
        for timestep in range(self.max_len):
            # Feed the ground-truth token of this step to the one-step decoder.
            output, dec_frwd_h, dec_frwd_c, dec_bkwd_h, dec_bkwd_c, _, _ = self.one_step_decoder(dec_inp[:, timestep:timestep+1], enc_output, dec_frwd_h, dec_frwd_c, dec_bkwd_h, dec_bkwd_c)
            all_outputs = all_outputs.write(timestep, output)
        # Stacked as (steps, batch, vocab); put the batch axis first.
        all_outputs = tf.transpose(all_outputs.stack(), (1, 0, 2))
        return all_outputs
    
class pred_Encoder_decoder(tf.keras.Model):
    """Inference-time model: encodes the input, then decodes greedily.

    Decoding starts from the ``'<SOW>'`` index and, at every step, feeds the
    arg-max of the previous step's output back in as the next decoder input.

    Args:
        inp_vocab_size: forwarded to the Encoder.
        out_vocab_size: forwarded to the OneStepDecoder.
        embedding_dim: forwarded to both sub-models.
        enc_units: forwarded to the Encoder.
        dec_units: forwarded to the OneStepDecoder.
        max_len_ita: forwarded to the Encoder; also the number of decoding
            steps performed by ``call``.
        max_len_eng: forwarded to the OneStepDecoder.
        score_fun: attention scoring function name, forwarded to the decoder.
        att_units: attention size, forwarded to the decoder.
        word_to_index: mapping that must contain the key ``'<SOW>'``.
        batch_size: optional batch size for the initial states and the seed
            token. When omitted, the notebook-level ``batch_size`` is used,
            which is what this class always did before.
    """

    def __init__(self, inp_vocab_size, out_vocab_size, embedding_dim, enc_units, dec_units, max_len_ita, max_len_eng, score_fun, att_units, word_to_index, batch_size=None):
        super(pred_Encoder_decoder, self).__init__()
        self.encoder = Encoder(inp_vocab_size, embedding_dim, enc_units, max_len_ita)
        self.one_step_decoder = OneStepDecoder(out_vocab_size, embedding_dim, max_len_eng, dec_units, score_fun, att_units)
        self.word_to_index = word_to_index
        # NOTE(review): the decode length follows the encoder-side length;
        # max_len_eng may have been intended. Every visible caller passes the
        # same value for both, so this is left unchanged -- confirm.
        self.max_len = max_len_ita
        self.batch_size = batch_size

    def _effective_batch_size(self):
        """Return the explicit batch size, else the notebook-level ``batch_size``."""
        if self.batch_size is not None:
            return self.batch_size
        # NOTE(review): the decoder step appears to reshape with the same
        # notebook-level value, so an explicit override should match it -- confirm.
        return batch_size

    def call(self, params):
        """Greedy-decode ``self.max_len`` token ids for a batch of encoder inputs.

        Args:
            params: the encoder input batch.

        Returns:
            An int64 tensor of predicted ids, transposed with ``(1, 0)`` so
            the decoding-step axis comes second.
        """
        enc_inp = params
        n_rows = self._effective_batch_size()
        initial_state = self.encoder.initialize_states(n_rows)
        enc_output, enc_frwd_h, enc_frwd_c, enc_bkwd_h, enc_bkwd_c = self.encoder(enc_inp, initial_state)
        # Seed every sequence with the start-of-word id.
        pred = tf.fill((n_rows, 1), self.word_to_index['<SOW>'])
        all_outputs = tf.TensorArray(dtype=tf.int64, size=self.max_len)

        # The decoder starts from the encoder's final forward/backward states.
        dec_frwd_h, dec_frwd_c = enc_frwd_h, enc_frwd_c
        dec_bkwd_h, dec_bkwd_c = enc_bkwd_h, enc_bkwd_c
        for timestep in range(self.max_len):
            output, dec_frwd_h, dec_frwd_c, dec_bkwd_h, dec_bkwd_c, _, _ = self.one_step_decoder(
                pred, enc_output, dec_frwd_h, dec_frwd_c, dec_bkwd_h, dec_bkwd_c)
            # Greedy choice; this id is both recorded and fed to the next step.
            pred = tf.argmax(output, axis=-1)
            all_outputs = all_outputs.write(timestep, pred)
        # stack() puts the step axis first; move it behind the batch axis.
        return tf.transpose(all_outputs.stack(), (1, 0))

In [17]:
# Bigram model: rebuild the inference network, load the best checkpoint and
# report the mean sentence-level BLEU on the train / val / test splits.
vocab_size = len(bigram_vec.get_vocabulary())
embedding_dim = 100
lstm_size = 256
att_units = 256
# NOTE(review): 26 looks like bigram_maxlen (24) plus the <SOW>/<EOW> markers -- confirm.
max_len = 26

pred_model = pred_Encoder_decoder(vocab_size, vocab_size, embedding_dim, lstm_size, lstm_size, max_len, max_len, 'concat', att_units, bigram_word_to_index)
pred_model.compile(optimizer = 'Adam', loss = 'sparse_categorical_crossentropy')
# build() creates the variables so the saved weights have something to load into.
pred_model.build(input_shape=(batch_size, max_len))
pred_model.load_weights('concat_best_bigram.h5')

# One loop over the splits instead of three pasted copies of the same code.
bigram_splits = [
    ('train', bigram_train_dataset, bigram_train),
    ('val', bigram_val_dataset, bigram_val),
    ('test', bigram_test_dataset, bigram_test),
]
split_pred = {}
split_bleu = {}
for split_name, split_dataset, split_frame in bigram_splits:
    pred = pred_model.predict(split_dataset, verbose=1)
    # Coerce to str so a value pandas parsed as NaN cannot reach sentence_bleu as a float.
    references = split_frame['output'].astype(str).values
    bleu_total = 0
    # Prediction i is scored against row i of the frame, so the dataset is
    # assumed to preserve the frame's row order -- TODO confirm.
    for i in tqdm(range(pred.shape[0])):
        hypothesis = idx_to_word(pred[i], bigram_index_to_word)
        # NOTE(review): if idx_to_word returns a plain string, NLTK scores
        # character n-grams here rather than word n-grams -- confirm intent.
        bleu_total += sentence_bleu([references[i]], hypothesis)
    split_pred[split_name] = pred
    split_bleu[split_name] = bleu_total

# Keep the original result names available for any later cell.
train_pred, val_pred, test_pred = split_pred['train'], split_pred['val'], split_pred['test']
train_bleu, val_bleu, test_bleu = split_bleu['train'], split_bleu['val'], split_bleu['test']

print('BLEU Score on train: ',train_bleu/train_pred.shape[0])
print('BLEU Score on val: ',val_bleu/val_pred.shape[0])
print('BLEU Score on test: ',test_bleu/test_pred.shape[0])

100%|████████████████████████████████████████████████████████████████████████| 884480/884480 [01:40<00:00, 8810.67it/s]




100%|██████████████████████████████████████████████████████████████████████████| 98176/98176 [00:11<00:00, 8791.59it/s]

  1/206 [..............................] - ETA: 0s






100%|██████████████████████████████████████████████████████████████████████████| 26368/26368 [00:02<00:00, 9222.69it/s]

BLEU Score on train:  0.9808526374708109
BLEU Score on val:  0.9669617055111845
BLEU Score on test:  0.9539630640021209





In [18]:
# Trigram model: rebuild the inference network, load the best checkpoint and
# report the mean sentence-level BLEU on the train / val / test splits.
vocab_size = len(trigram_vec.get_vocabulary())
embedding_dim = 100
lstm_size = 256
att_units = 256
# NOTE(review): 34 looks like trigram_maxlen (32) plus the <SOW>/<EOW> markers -- confirm.
max_len = 34

pred_model = pred_Encoder_decoder(vocab_size, vocab_size, embedding_dim, lstm_size, lstm_size, max_len, max_len, 'concat', att_units, trigram_word_to_index)
pred_model.compile(optimizer = 'Adam', loss = 'sparse_categorical_crossentropy')
# build() creates the variables so the saved weights have something to load into.
pred_model.build(input_shape=(batch_size, max_len))
pred_model.load_weights('concat_best_trigram.h5')

# One loop over the splits instead of three pasted copies of the same code;
# every predict() call now uses verbose=1 (the train call previously omitted it).
trigram_splits = [
    ('train', trigram_train_dataset, trigram_train),
    ('val', trigram_val_dataset, trigram_val),
    ('test', trigram_test_dataset, trigram_test),
]
split_pred = {}
split_bleu = {}
for split_name, split_dataset, split_frame in trigram_splits:
    pred = pred_model.predict(split_dataset, verbose=1)
    # Coerce to str so a value pandas parsed as NaN cannot reach sentence_bleu as a float.
    references = split_frame['output'].astype(str).values
    bleu_total = 0
    # Prediction i is scored against row i of the frame, so the dataset is
    # assumed to preserve the frame's row order -- TODO confirm.
    for i in tqdm(range(pred.shape[0])):
        hypothesis = idx_to_word(pred[i], trigram_index_to_word)
        # NOTE(review): if idx_to_word returns a plain string, NLTK scores
        # character n-grams here rather than word n-grams -- confirm intent.
        bleu_total += sentence_bleu([references[i]], hypothesis)
    split_pred[split_name] = pred
    split_bleu[split_name] = bleu_total

# Keep the original result names available for any later cell.
train_pred, val_pred, test_pred = split_pred['train'], split_pred['val'], split_pred['test']
train_bleu, val_bleu, test_bleu = split_bleu['train'], split_bleu['val'], split_bleu['test']

print('BLEU Score on train: ',train_bleu/train_pred.shape[0])
print('BLEU Score on val: ',val_bleu/val_pred.shape[0])
print('BLEU Score on test: ',test_bleu/test_pred.shape[0])

100%|████████████████████████████████████████████████████████████████████████| 884480/884480 [02:15<00:00, 6530.46it/s]




100%|██████████████████████████████████████████████████████████████████████████| 98176/98176 [00:14<00:00, 6996.11it/s]




100%|██████████████████████████████████████████████████████████████████████████| 24576/24576 [00:03<00:00, 7288.16it/s]

BLEU Score on train:  0.9889611211686458
BLEU Score on val:  0.9813112757412255
BLEU Score on test:  0.9693013446155896



