In [1]:
import tensorflow as tf
from pymagnitude import *
import os
import csv
import logging
import numpy as np
import regex
import random

# QNLI training and test data path
TRAINING_DATAFILE = "/Users/zxq001/QNLI/train.tsv"
DEV_DATAFILE = "//Users/zxq001//QNLI/dev.tsv"
# load pretrained embedding
vectors = Magnitude("/Users/zxq001/glove.840B.300d.magnitude")
MODEL_FILE = "/Users/zxq001/model.h5"

# prepare logger
logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.DEBUG)
tf.logging.set_verbosity(logging.ERROR)

# hyperparameters
MAX_SEQ_LENGTH = 100
BATCH_SIZE = 64
EPOCHS = 100

# build baseline model

In [2]:
# questions and answers embeddings
q_in = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH, vectors.dim))
a_in = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH, vectors.dim))
# shared bidirectional LSTM
Bidir_LSTM = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1500, activation='tanh', return_sequences=True), merge_mode='concat')
# question and answer bidirectional LSTM output
q_Bidir_LSTM = Bidir_LSTM(q_in)
a_Bidir_LSTM = Bidir_LSTM(a_in)
# expand questions and answers bidirectional LSTM output dimension for maxpooling
expanded_q_LSTM = tf.keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=-1))(q_Bidir_LSTM)
expanded_a_LSTM = tf.keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=-1))(a_Bidir_LSTM)
# maxpooling layers
q_maxpool = tf.keras.layers.MaxPooling2D(pool_size=(MAX_SEQ_LENGTH, 1))(expanded_q_LSTM)
a_maxpool = tf.keras.layers.MaxPooling2D(pool_size=(MAX_SEQ_LENGTH, 1))(expanded_a_LSTM)
# encoded u and v vector
u =  tf.keras.layers.Lambda(lambda x: tf.squeeze(x, axis=-1))(q_maxpool)
v =  tf.keras.layers.Lambda(lambda x: tf.squeeze(x, axis=-1))(a_maxpool)

# define the concatenation function for the Lambda layer
def concat_u_v(uv):
    u = uv[0]
    v = uv[1]
    return tf.concat([u, v, tf.math.abs(u-v), tf.math.multiply(u, v)], -1)

# concatenate (u, v, |u-v|, u*v) vector and feed it to the classifier
concat_output = tf.keras.layers.Lambda(concat_u_v)([u, v])
# MLP with a 512D hidden layer
hidden = tf.keras.layers.Dense(512)(concat_output)
# output = concatenation layer (u, v, |u-v|, u*v) -> 512D hidden layer -> output node
output = tf.keras.layers.Dense(1, activation="softmax")(hidden)
model = tf.keras.Model(inputs=[q_in, a_in], outputs=output)
model.summary()
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100, 300)     0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 100, 300)     0                                            
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 100, 3000)    21612000    input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 100, 3000, 1) 0           bidirectional[0][0]              
__________

# define tokenization function

In [14]:
# read QNLI dataset from GLUE benchmark and return tokenized questions and answers
def tokenize_QNLI_dataset(FILE_PATH):
    trainfile = open(FILE_PATH)
    trainfile = csv.reader(trainfile, delimiter='\t')
    questions_raw = []
    answers_raw = []
    labels = []
    # skip the header
    next(trainfile)
    for row in trainfile:
        questions_raw.append(row[1])
        answers_raw.append(row[2])
        if row[3] == "entailment":
            labels.append(1)
        else:
            labels.append(0)
    # seperate the questions and answers into words
    questions = []
    answers = []
    for question in questions_raw:
        questions.append(regex.findall(r"[^[:punct:] ]+|[[:punct:]]", question))
    for answer in answers_raw:
        answers.append(regex.findall(r"[^[:punct:] ]+|[[:punct:]]", answer))
    return questions, answers, labels

# batchify training and testing dataset

In [15]:
questions, answers, labels = tokenize_QNLI_dataset(TRAINING_DATAFILE)
q_vec = vectors.query(questions)
a_vec = vectors.query(answers)


KeyboardInterrupt: 