In [98]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import unicodedata
import re
import time

In [99]:
# Load the training CSV and keep only the rows whose `language` column
# equals 'English'; the filtered frame keeps its original row index.
ds = pd.read_csv('train.csv').loc[lambda frame: frame['language'] == 'English']

In [138]:
# Preprocess data: clean each sentence (ASCII letters and ? . ! , ¿ only),
# then tokenize and pad.
def preprocessSentence(s):
    """Normalize one raw sentence into a space-separated token string.

    Steps, in order:
      1. Strip accents (NFD-decompose, drop combining marks) so that an
         accented letter keeps its base letter instead of being deleted by
         the ASCII-only filter in step 4.
      2. Lowercase and trim surrounding whitespace.
      3. Put a space on both sides of each of ``? . ! , ¿`` so every
         punctuation mark becomes its own token.
      4. Replace every run of any other non-letter character (digits,
         quotes, whitespace, symbols, ...) with a single space.
      5. Wrap the result in ``<start>`` / ``<end>`` markers.

    Args:
        s: The raw sentence (``str``).

    Returns:
        The cleaned sentence, e.g. ``"<start> hello world . <end>"``.
    """
    # `unicodedata` is already imported at the top of the notebook.
    decomposed = unicodedata.normalize('NFD', s)
    s = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    s = s.lower().strip()
    s = re.sub(r"([?.!,¿])", r" \1 ", s)
    # This single pass also collapses repeated spaces and quote characters,
    # so no separate whitespace-squeezing substitution is needed before it.
    s = re.sub(r"[^a-zA-Z?.!,¿]+", " ", s)
    s = s.strip()
    return '<start> ' + s + ' <end>'

def createTokenizer(data):
    """Create a Keras ``Tokenizer`` and fit its vocabulary on ``data``.

    The tokenizer is built with ``filters=''`` (an empty filter string), so
    it is not asked to strip any characters itself.

    Args:
        data: The texts passed to ``fit_on_texts``.

    Returns:
        The fitted tokenizer.
    """
    fitted = tf.keras.preprocessing.text.Tokenizer(filters='')
    fitted.fit_on_texts(data)
    return fitted

def tokenizeData(data,tokenizer):
    """Convert texts to integer sequences and pad them at the end.

    Args:
        data: The texts to convert.
        tokenizer: An object providing ``texts_to_sequences`` (in this
            notebook, the tokenizer returned by ``createTokenizer``).

    Returns:
        The result of ``pad_sequences(..., padding='post')`` applied to the
        converted sequences.
    """
    sequences = tokenizer.texts_to_sequences(data)
    return tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

# Clean both text columns with the same sentence preprocessing.
premises = [preprocessSentence(text) for text in ds['premise']]
hypotheses = [preprocessSentence(text) for text in ds['hypothesis']]

# One tokenizer is fitted on premises and hypotheses together, so both
# columns are encoded with the same vocabulary.
tokenizer = createTokenizer(premises + hypotheses)
premise_tensor = tokenizeData(premises, tokenizer)
hypothesis_tensor = tokenizeData(hypotheses, tokenizer)

# Put each padded hypothesis to the right of its padded premise (axis 1),
# giving one combined row per example.
ph_tensor = tf.concat([premise_tensor, hypothesis_tensor], axis=1)

In [139]:
# Create tf data pipeline: hyper-parameters and the train/validation split.
SEED = 42                       # fixed so the split below is the same on every run
BUFFER_SIZE = len(ph_tensor)    # number of examples before splitting
BATCH_SIZE = 32
embedding_dim = 200
units = 150
vocab_size = len(tokenizer.word_index)+1    # one more than the number of known words
split_size = premise_tensor.shape[1]        # column where the premise ends inside ph_tensor

# 70/30 split. `random_state` makes it reproducible across kernel restarts;
# without it every run trains and validates on different rows.
train_input, val_input, train_targ, val_targ = train_test_split(
    ph_tensor.numpy(), ds['label'].values, test_size=0.3, random_state=SEED)

# Batches per training epoch. Computed from the training split (not from
# BUFFER_SIZE, which also counts the validation rows).
steps_per_epoch = len(train_input)//BATCH_SIZE

In [140]:
# Training batches. Keras `fit` does not shuffle a tf.data.Dataset itself, so
# shuffle here (buffer = whole training set) to get a new batch order and
# batch composition on every epoch.
train_dataset = tf.data.Dataset.from_tensor_slices((train_input, train_targ))
train_dataset = train_dataset.shuffle(len(train_input))
train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder=True)

# Validation batches. Example order does not change the aggregated metrics,
# so no shuffle is applied.
val_dataset = tf.data.Dataset.from_tensor_slices((val_input,val_targ))
val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder=True)

In [143]:
# Define Encoder + Decoder architectures


class Encoder(tf.keras.Model):
    """Embeds token ids and runs them through one LSTM layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        enc_units: Number of LSTM units.
        batch_size: Only used by ``initialize_hidden_state`` to size the
            zero states.
    """

    def __init__(self,vocab_size,embedding_dim,enc_units,batch_size):
        super(Encoder,self).__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size,embedding_dim)
        self.lstm = tf.keras.layers.LSTM(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

    def call(self,x,hidden=None):
        """Encode a batch of token-id sequences.

        Args:
            x: Integer token ids; id 0 is treated as padding (the inputs in
                this notebook are zero-padded at the end by ``pad_sequences``).
            hidden: Optional LSTM initial state ``[h, c]``; ``None`` lets the
                LSTM use its default initial state.

        Returns:
            ``(output, state, carry)``: the per-timestep LSTM outputs, the
            final hidden state and the final cell state.
        """
        # Mask the padding so the trailing zeros do not keep updating the
        # LSTM state; otherwise the returned state/carry mostly encode the
        # run of padding rather than the sentence.
        # NOTE(review): `output` still has entries at padded timesteps;
        # attention applied downstream does not see this mask.
        mask = tf.not_equal(x, 0)
        x = self.embedding(x)
        output, state, carry = self.lstm(x,initial_state=hidden,mask=mask)
        return output, state, carry

    def initialize_hidden_state(self):
        """Return zero initial states ``[h, c]`` for the LSTM.

        An LSTM carries two state tensors, so a pair is returned; each has
        shape ``(batch_size, enc_units)`` and can be passed as ``hidden`` to
        ``call``.
        """
        state_shape = (self.batch_size,self.enc_units)
        return [tf.zeros(state_shape), tf.zeros(state_shape)]


class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention whose score also uses a previous context vector.

    Args:
        units: Output width of the three projection layers ``W1``, ``W2``
            and ``W3``; ``V`` maps the combined projection to one score.
    """

    def __init__(self,units):
        super(BahdanauAttention,self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.W3 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self,query,past_context,values):
        """Score ``values`` against ``query`` and ``past_context``.

        Assumes ``query`` and ``past_context`` are (batch, features) and
        ``values`` is (batch, time, features) — TODO confirm with callers.

        Returns:
            ``(context_vector, attention_weights)``: ``values`` weighted by
            the softmax over axis 1 and summed over that axis, plus the
            weights themselves.
        """
        # Give both vectors a length-1 axis at position 1 so their
        # projections broadcast against the projected `values`.
        projected_query = self.W1(tf.expand_dims(query, axis=1))
        projected_values = self.W2(values)
        projected_past = self.W3(tf.expand_dims(past_context, axis=1))

        combined = tf.nn.tanh(projected_query + projected_values + projected_past)
        attention_weights = tf.nn.softmax(self.V(combined), axis=1)

        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        return context_vector, attention_weights
    
    
class BahdanauAttentionLastWord(tf.keras.layers.Layer):
    """Additive attention driven by a single query vector.

    Args:
        units: Output width of the projection layers ``W1`` and ``W2``;
            ``V`` maps the combined projection to one score.
    """

    def __init__(self,units):
        super(BahdanauAttentionLastWord,self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self,query,values):
        """Score ``values`` against ``query``.

        Assumes ``query`` is (batch, features) and ``values`` is
        (batch, time, features) — TODO confirm with callers.

        Returns:
            ``(context_vector, attention_weights)``: ``values`` weighted by
            the softmax over axis 1 and summed over that axis, plus the
            weights themselves.
        """
        # A length-1 axis at position 1 lets the projected query broadcast
        # against the projected `values`.
        projected_query = self.W1(tf.expand_dims(query, axis=1))
        projected_values = self.W2(values)

        raw_scores = self.V(tf.nn.tanh(projected_query + projected_values))
        attention_weights = tf.nn.softmax(raw_scores, axis=1)

        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        return context_vector, attention_weights
    
    
class Decoder(tf.keras.Model):
    """Reads the hypothesis with an LSTM, attends over the encoder outputs
    and maps the result to 3 output values.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        dec_units: Number of LSTM units (also the attention projection width).
        batch_size: Stored on the instance; not used by ``call``.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.attention = BahdanauAttentionLastWord(self.dec_units)
        self.fc = tf.keras.layers.Dense(3)

    def call(self, x, hidden, enc_output):
        """Produce the 3 output values for each example.

        Args:
            x: Integer token ids of the hypothesis; id 0 is treated as
                padding (the inputs in this notebook are zero-padded at the
                end by ``pad_sequences``).
            hidden: LSTM initial state ``[h, c]``.
            enc_output: Per-timestep encoder outputs that the attention
                layer attends over.

        Returns:
            The output of the final ``Dense(3)`` layer, one row per example.
        """
        # Mask the padding so the final LSTM state is the state after the
        # last real token. That state is both the attention query and part
        # of the classifier input, so letting the trailing zeros overwrite
        # it would discard most of the hypothesis.
        mask = tf.not_equal(x, 0)
        x = self.embedding(x)
        _, state, _ = self.lstm(x, initial_state=hidden, mask=mask)

        # NOTE(review): `enc_output` is attended over in full, including any
        # padded premise positions — consider masking those scores as well.
        context_vector, _ = self.attention(state, enc_output)

        # Classify from the attention context joined with the final state.
        features = tf.concat([context_vector, state], axis=1)
        return self.fc(features)
    
    
class RTEAttention(tf.keras.Model):
    """Premise/hypothesis model: an ``Encoder`` over the premise feeding a
    ``Decoder`` over the hypothesis.

    Args:
        vocab_size: Passed to both encoder and decoder.
        embedding_dim: Passed to both encoder and decoder.
        enc_units: Number of encoder LSTM units.
        dec_units: Number of decoder LSTM units. The encoder's final states
            initialise the decoder LSTM, so this is expected to equal
            ``enc_units``.
        batch_size: Passed to both encoder and decoder.
        split_size: Number of leading columns of the input that belong to
            the premise; the remaining columns are the hypothesis.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self,vocab_size,embedding_dim,enc_units,dec_units,batch_size,split_size, **kwargs):
        super(RTEAttention, self).__init__(**kwargs)

        self.split_size = split_size
        self.encoder = Encoder(vocab_size,embedding_dim,enc_units,batch_size)
        self.decoder = Decoder(vocab_size, embedding_dim, dec_units, batch_size)

    def call(self,inp):
        """Run the model on a batch of concatenated premise+hypothesis ids.

        Args:
            inp: Token ids with the premise in the first ``split_size``
                columns and the hypothesis in the rest.

        Returns:
            The decoder's predictions for the batch.
        """
        premise, hypothesis = tf.split(inp,[self.split_size,-1],axis=1)
        # The encoder starts from the LSTM's default initial state; its final
        # hidden and cell states then seed the decoder LSTM.
        enc_output, enc_hidden, enc_cell = self.encoder(premise)
        return self.decoder(hypothesis, [enc_hidden, enc_cell], enc_output)

In [144]:
# Smoke-test the forward pass on one training batch. A throwaway, untrained
# instance is built here because the model that gets trained is only created
# in a later cell; referring to it at this point fails on a fresh kernel.
sample_batch = next(iter(train_dataset))
probe_model = RTEAttention(vocab_size,embedding_dim,units,units,BATCH_SIZE,split_size)
probe_model(sample_batch[0])

<tf.Tensor: shape=(32, 3), dtype=float32, numpy=
array([[ 0.09046682, -0.04434597,  0.00650291],
       [ 0.07976421, -0.04252859,  0.00456191],
       [ 0.0865448 , -0.04405873,  0.0060536 ],
       [ 0.09041405, -0.04534531,  0.00661955],
       [ 0.08427687, -0.04285638,  0.00555488],
       [ 0.09178941, -0.04525625,  0.00720036],
       [ 0.08985469, -0.04468172,  0.00675021],
       [ 0.08949722, -0.04464971,  0.00677291],
       [ 0.08559003, -0.04375186,  0.00570524],
       [ 0.08745772, -0.04428917,  0.00637429],
       [ 0.08377945, -0.04301215,  0.00656908],
       [ 0.08944805, -0.04442884,  0.00655987],
       [ 0.09086208, -0.04462938,  0.00673357],
       [ 0.08803997, -0.04393613,  0.00592776],
       [ 0.09256784, -0.04616264,  0.00721948],
       [ 0.0908728 , -0.04504931,  0.00654482],
       [ 0.08438527, -0.0431409 ,  0.0056338 ],
       [ 0.09238025, -0.04559826,  0.00696244],
       [ 0.08716755, -0.0438043 ,  0.00611146],
       [ 0.08612444, -0.04422703,  0.00

In [145]:
# Build the model (encoder and decoder both use `units` LSTM units) and
# configure training: Adam at 1e-4, sparse categorical cross-entropy computed
# on raw logits, and accuracy as the reported metric.
model = RTEAttention(
    vocab_size,
    embedding_dim,
    enc_units=units,
    dec_units=units,
    batch_size=BATCH_SIZE,
    split_size=split_size,
)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Train for 5 epochs, evaluating on the validation batches after each epoch.
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5,
    verbose=1,
)

Epoch 1/5
 19/150 [==>...........................] - ETA: 1:02 - loss: 1.0993 - accuracy: 0.3438