In [1]:
# Import numpy, pandas, tensorflow, and other packages
import numpy as np
import os
import re
import pandas as pd
import tensorflow as tf
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split

In [2]:
# Read the tab-separated SNLI training split from a directory given relative
# to the current one, keeping only the rows whose gold_label is not '-'.
path_to_training_data = '../snli_1.0/' 
ds = (
    pd.read_csv(path_to_training_data + 'snli_1.0_train.txt', sep='\t')
    .loc[lambda frame: frame['gold_label'] != '-']
)

In [3]:
# Normalise a raw sentence before tokenisation: lowercase it, space out
# punctuation and replace everything that is not a letter or ?.!,¿ by a space.
def preprocessSentence(s):
    """Return a cleaned, lowercased copy of the sentence ``s``.

    The substitutions run in order:
      1. surround each of ``? . ! , ¿`` with spaces,
      2. collapse runs of spaces and double quotes into one space,
      3. replace every run of other characters (digits included) by a space.

    Leading and trailing whitespace is stripped before and after.
    """
    substitutions = (
        (r"([?.!,¿])", r" \1 "),
        (r'[" "]+', " "),
        (r"[^a-zA-Z?.!,¿]+", " "),
    )
    cleaned = s.lower().strip()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()


# Build a Keras tokenizer and fit its word index on the given texts
def createTokenizer(data):
    """Fit a ``tf.keras`` ``Tokenizer`` on ``data`` and return it.

    An empty ``filters`` string is passed, so the tokenizer is not asked to
    strip any characters itself.
    """
    fitted_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    fitted_tokenizer.fit_on_texts(data)
    return fitted_tokenizer

def tokenizeData(data,tokenizer):
    """Turn texts into id sequences and pad them into one rectangular array.

    ``tokenizer`` must provide ``texts_to_sequences``; padding is appended
    after each sequence (``padding='post'``).
    """
    id_sequences = tokenizer.texts_to_sequences(data)
    return tf.keras.preprocessing.sequence.pad_sequences(id_sequences, padding='post')

# Clean every premise (sentence1) and hypothesis (sentence2); each column is
# cast to str before cleaning.
premises = [preprocessSentence(text) for text in ds['sentence1'].astype('str')]
hypotheses = [preprocessSentence(text) for text in ds['sentence2'].astype('str')]

# A single vocabulary is fitted over both sentence sets
tokenizer = createTokenizer(premises + hypotheses)

# Each set is padded on its own, to the longest sentence within that set
premise_tensor = tokenizeData(premises, tokenizer)
hypothesis_tensor = tokenizeData(hypotheses, tokenizer)

# Remember how many leading columns belong to the premise, then place the
# hypothesis columns after them so a pair travels as one row.
split_size = premise_tensor.shape[1]
ph_tensor = tf.concat([premise_tensor, hypothesis_tensor], 1)
ph_labels = (
    ds['gold_label']
    .map({'neutral':0,'contradiction':1,'entailment':2})
    .astype('int')
    .values
)

# Release the large intermediates that are no longer needed
del ds, premises, hypotheses, premise_tensor, hypothesis_tensor

In [4]:
# Define hyperparameters and other info
embedding_dim = 100  # must match the width of the GloVe file that is loaded (100d)
BUFFER_SIZE = len(ph_tensor)
BATCH_SIZE = 128
steps_per_epoch = BUFFER_SIZE//BATCH_SIZE
units = 100  # passed as both the encoder and the decoder LSTM size
vocab_size = len(tokenizer.word_index)+1  # one extra row for id 0, which the padded sequences use

# Fixed seed so the 90/10 train/validation partition is the same on every run;
# without it the split (and therefore every reported metric) changes per run.
SPLIT_SEED = 42
train_input, val_input, train_targ, val_targ = train_test_split(ph_tensor.numpy(),
                                                                ph_labels,
                                                                test_size=0.1,
                                                                random_state=SPLIT_SEED)


In [9]:
# Load the pre-trained GloVe vectors and build the embedding matrix for our vocabulary
GLOVE_DIR = '../glove/'
embeddings_index = {}
# Each line is "<word> <v1> <v2> ...". The file is read as UTF-8 explicitly so
# the result does not depend on the platform's default encoding, and `with`
# closes the handle even if a line fails to parse.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

word_index = tokenizer.word_index
# Rows start as zeros, so row 0 and any word without a GloVe vector stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Found 400000 word vectors.


In [10]:
# Build the tf.data input pipelines (no shuffling is applied at this stage).
# drop_remainder=True discards a final short batch, so every batch holds
# exactly BATCH_SIZE rows.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((train_input, train_targ))
    .batch(BATCH_SIZE, drop_remainder=True)
)

val_dataset = (
    tf.data.Dataset
    .from_tensor_slices((val_input, val_targ))
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [42]:
# Define Encoder + Decoder architectures


class Encoder(tf.keras.Model):
    """Premise encoder: a frozen pre-trained embedding followed by an LSTM.

    The embedding table is initialised from the module-level
    ``embedding_matrix`` and is not trained; id 0 is masked.
    """

    def __init__(self,vocab_size,embedding_dim,enc_units,batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units

        # For randomly initialised, trainable embeddings use instead:
        #   tf.keras.layers.Embedding(vocab_size, embedding_dim, mask_zero=True)
        embedding_options = dict(weights=[embedding_matrix],
                                 trainable=False,
                                 mask_zero=True)
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                                   **embedding_options)

        # The LSTM returns the per-step outputs plus its final hidden and cell states.
        lstm_options = dict(return_sequences=True,
                            return_state=True,
                            dropout=0.05)
        self.lstm = tf.keras.layers.LSTM(self.enc_units, **lstm_options)

    def call(self,x,hidden=None):
        """Embed the token ids ``x`` and run the LSTM over them.

        ``hidden`` is forwarded as the LSTM's ``initial_state``.
        Returns ``(sequence_outputs, final_hidden_state, final_cell_state)``.
        """
        embedded = self.embedding(x)
        sequence_outputs, final_hidden, final_cell = self.lstm(embedded, initial_state=hidden)
        return sequence_outputs, final_hidden, final_cell

    def initialize_hidden_state(self):
        """Return a zero tensor of shape ``(batch_size, enc_units)``."""
        state_shape = (self.batch_size, self.enc_units)
        return tf.zeros(state_shape)


class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention scored from a query, a previous context and the values.

    NOTE(review): the axis handling below suggests ``query`` and
    ``past_context`` are (batch, features) and ``values`` is
    (batch, time, features) — confirm against the caller.
    """

    def __init__(self,units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.W3 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self,query,past_context,values):
        """Return ``(context_vector, attention_weights)`` for ``values``."""
        # Insert an axis at position 1 so both vectors broadcast against `values`.
        query_projection = self.W1(tf.expand_dims(query, 1))
        values_projection = self.W2(values)
        past_projection = self.W3(tf.expand_dims(past_context, 1))

        score = self.V(tf.nn.tanh(query_projection + values_projection + past_projection))
        # Normalise the scores along axis 1, then take the weighted sum over that axis.
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        return context_vector, attention_weights
    

# Attention Layer
class BahdanauAttentionLastWord(tf.keras.layers.Layer):
    """Additive attention of a single query vector over a sequence of values.

    NOTE(review): the axis handling below suggests ``query`` is
    (batch, features) and ``values`` is (batch, time, features) — confirm
    against the caller.
    """

    def __init__(self,units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self,query,values):
        """Return ``(context_vector, attention_weights)`` for ``values``."""
        # Insert an axis at position 1 so the query broadcasts against `values`.
        query_projection = self.W1(tf.expand_dims(query, 1))
        values_projection = self.W2(values)

        score = self.V(tf.nn.tanh(query_projection + values_projection))
        # Normalise the scores along axis 1, then take the weighted sum over that axis.
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        return context_vector, attention_weights
    
    
class Decoder(tf.keras.Model):
    """Reads the hypothesis, attends over the encoder outputs and classifies.

    The hypothesis is embedded with the frozen pre-trained table (initialised
    from the module-level ``embedding_matrix``) and run through an LSTM. The
    LSTM's final hidden state queries the attention layer over ``enc_output``;
    the attended context and that state are projected, summed, passed through
    tanh and mapped to three class logits.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.dec_units = dec_units

        # For randomly initialised, trainable embeddings use instead:
        #   tf.keras.layers.Embedding(vocab_size, embedding_dim, mask_zero=True)
        self.embedding = tf.keras.layers.Embedding(vocab_size,embedding_dim,
                                                   weights=[embedding_matrix],
                                                   trainable=False,mask_zero=True)

        self.lstm = tf.keras.layers.LSTM(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       dropout=0.05)
        self.attention = BahdanauAttentionLastWord(self.dec_units)
        # Both projections are sized from the constructor argument. They used to
        # read the module-level `units` global, which silently ignored `dec_units`
        # and made the class depend on notebook state.
        self.Wp = tf.keras.layers.Dense(self.dec_units,activation='linear')
        self.Wx = tf.keras.layers.Dense(self.dec_units,activation='linear')
        # Three outputs, one per label id; returned without an activation.
        self.fc = tf.keras.layers.Dense(3,kernel_regularizer=regularizers.l2(0.001))

    def call(self, x, hidden, enc_output):
        """Return the class logits for a batch of hypotheses.

        Args:
            x: hypothesis token ids.
            hidden: forwarded as the LSTM's ``initial_state``.
            enc_output: encoder outputs that the attention layer attends over.
        """
        x = self.embedding(x)
        # Only the final hidden state is used downstream.
        _sequence, state, _carry = self.lstm(x,initial_state=hidden)

        context_vector, _ = self.attention(state,enc_output)
        context_vector = tf.nn.tanh(self.Wp(context_vector) + self.Wx(state))

        return self.fc(context_vector)

    def initialize_hidden_state(self):
        """Return a zero tensor of shape ``(batch_size, dec_units)``."""
        return tf.zeros((self.batch_size,self.dec_units))
    
    
class RTEAttention(tf.keras.Model):
    """End-to-end model: encode the premise, then decode and classify the hypothesis.

    Each input row holds the premise ids in its first ``split_size`` columns and
    the hypothesis ids in the remaining columns.
    """

    def __init__(self,vocab_size,embedding_dim,enc_units,dec_units,batch_size,split_size, **kwargs):
        super(RTEAttention, self).__init__(**kwargs)

        self.split_size = split_size
        self.encoder = Encoder(vocab_size,embedding_dim,enc_units,batch_size)
        self.decoder = Decoder(vocab_size, embedding_dim, dec_units, batch_size)

    def call(self,inp):
        """Return the decoder's class logits for a batch of concatenated pairs."""
        premise, hypothesis = tf.split(inp,[self.split_size,-1],axis=1)
        # The encoder's final hidden state is not used; only its per-step
        # outputs (for attention) and its final cell state are.
        enc_output, _, enc_cell = self.encoder(premise)

        # The decoder starts from a zero hidden state and the encoder's cell state.
        # The zeros are derived from `enc_cell` rather than from the constructor's
        # batch_size, so batches of any size work (e.g. a final short batch or
        # single-example inference). `enc_cell` is already used as the decoder's
        # cell state, so its width has to match the decoder's anyway.
        dec_hidden = tf.zeros_like(enc_cell)
        predictions = self.decoder(hypothesis, [dec_hidden, enc_cell], enc_output)

        return predictions

In [43]:
# Build the model; the same `units` value sizes both the encoder and the decoder
model = RTEAttention(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    enc_units=units,
    dec_units=units,
    batch_size=BATCH_SIZE,
    split_size=split_size,
)


In [46]:
# Compile with Adam and sparse categorical cross-entropy computed on logits
# (from_logits=True), tracking accuracy.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [51]:
# Train for one epoch on the training batches, evaluating on the validation batches
model.fit(
    x=train_dataset,
    validation_data=val_dataset,
    epochs=1,
    verbose=1,
)



<tensorflow.python.keras.callbacks.History at 0x139672f70>

In [52]:
# Persist the trained weights under the given checkpoint prefix
model.save_weights(
    filepath='./checkpoints/snli17102020_k100_glove_dropout05'
)