In [1]:
import tensorflow as tf
from pymagnitude import *
import os
import csv
import logging
import numpy as np
import regex

# QNLI training and dev data paths
TRAINING_DATAFILE = "/Users/zxq001/QNLI/train.tsv"
DEV_DATAFILE = "/Users/zxq001/QNLI/dev.tsv"
# load pretrained embedding
vectors = Magnitude("/Users/zxq001/glove.840B.300d.magnitude")
# destination used by train() when it saves the fitted model
MODEL_FILE = "/Users/zxq001/model.h5"

# prepare logger
logger = logging.getLogger(__name__)
# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise stack one more StreamHandler each time and every
# message would be printed once per handler.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.DEBUG)
tf.logging.set_verbosity(logging.ERROR)

# the maximum number of tokens kept per sequence; question and answer
# embeddings are both padded/truncated to this length
MAX_SEQ_LENGTH = 100
# number of examples per batch for the generator-based training cells
BATCH_SIZE = 32

# read QNLI dataset from GLUE benchmark
def read_QNLI_dataset(FILE_PATH):
    """Load a QNLI TSV file and split its questions and answers into tokens.

    The first line is treated as a header and skipped. For every following
    non-blank row, column 1 is taken as the question, column 2 as the answer
    sentence and column 3 as the label.

    Args:
        FILE_PATH: path of the tab-separated file to read.

    Returns:
        A tuple ``(questions, answers, labels)``. ``questions`` and
        ``answers`` are lists with one token list per row (runs of
        non-punctuation, non-space characters, or single punctuation
        characters). ``labels`` is a list of ints: 1 where the label column
        equals "entailment", 0 otherwise. All three are empty when the file
        has no data rows.

    Raises:
        IndexError: if a non-blank row has fewer than four columns.
    """
    questions_raw = []
    answers_raw = []
    labels = []
    # newline="" is what the csv module documents for files handed to a reader.
    # NOTE(review): assumes the GLUE files are UTF-8 -- confirm.
    with open(FILE_PATH, encoding="utf-8", newline="") as tsv_file:
        # QUOTE_NONE: treat '"' as an ordinary character. With the default
        # quoting, an unbalanced quote inside a sentence makes the reader glue
        # the following lines into one field and rows are silently lost.
        reader = csv.reader(tsv_file, delimiter='\t', quoting=csv.QUOTE_NONE)
        # skip the header (the default keeps an empty file from raising)
        next(reader, None)
        for row in reader:
            if not row:
                # blank line, e.g. trailing newline at the end of the file
                continue
            questions_raw.append(row[1])
            answers_raw.append(row[2])
            labels.append(1 if row[3] == "entailment" else 0)

    # separate the questions and answers into words
    token_pattern = r"[^[:punct:] ]+|[[:punct:]]"
    questions = [regex.findall(token_pattern, question) for question in questions_raw]
    answers = [regex.findall(token_pattern, answer) for answer in answers_raw]
    return questions, answers, labels

# trains and saves the model
def train():
    """Train the question/answer entailment classifier and save it.

    Reads TRAINING_DATAFILE with read_QNLI_dataset, embeds every token with
    ``vectors.query``, pads/truncates both inputs to MAX_SEQ_LENGTH, builds a
    two-branch bidirectional-LSTM model, fits it for 10 epochs with 10% of the
    data held out for validation, and writes the model to MODEL_FILE.

    The whole training set is embedded in one call per input, so all
    embeddings are held in memory at once.

    Returns:
        None.
    """
    logger.info("training")

    logger.debug("loading data")
    # load data
    questions, answers, labels = read_QNLI_dataset(TRAINING_DATAFILE)

    logger.debug("preprocessing data")
    # embedding transformation
    X1 = vectors.query(questions)
    X2 = vectors.query(answers)
    # hand Keras an array of targets rather than a Python list
    y = np.asarray(labels, dtype='float32')
    # pad the question and answer embeddings
    pad_options = dict(padding='post', maxlen=MAX_SEQ_LENGTH, truncating='post', value=0, dtype='float32')
    X1 = tf.keras.preprocessing.sequence.pad_sequences(X1, **pad_options)
    X2 = tf.keras.preprocessing.sequence.pad_sequences(X2, **pad_options)

    logger.debug("building model")
    # u = question sequence embedding (MAX_SEQ_LENGTH, 300) -> 1500D bidirectional LSTM -> maxpooling
    q_in = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH, vectors.dim))
    q_Bidir_LSTM = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1500, activation='tanh', return_sequences=True), merge_mode='concat')(q_in)
    # Max over the time axis. GlobalMaxPooling1D drops that axis, so u is
    # (batch, 3000) and the final output is (batch, 1), matching one label
    # per example.
    u = tf.keras.layers.GlobalMaxPooling1D()(q_Bidir_LSTM)

    # v = answer sequence embedding (MAX_SEQ_LENGTH, 300) -> 1500D bidirectional LSTM -> maxpooling
    a_in = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH, vectors.dim))
    a_Bidir_LSTM = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1500, activation='tanh', return_sequences=True), merge_mode='concat')(a_in)
    v = tf.keras.layers.GlobalMaxPooling1D()(a_Bidir_LSTM)

    # define the concatenation function for the Lambda layer
    def concat_u_v(uv):
        """Join u, v, |u - v| and u * v along the last axis."""
        u = uv[0]
        v = uv[1]
        return tf.concat([u, v, tf.math.abs(u-v), tf.math.multiply(u, v)], -1)

    # output = concatenation layer (u, v, |u-v|, u*v) -> 512D hidden layer -> output node
    concat_output = tf.keras.layers.Lambda(concat_u_v)([u, v])
    # no activation is passed, so this hidden layer is linear
    hidden = tf.keras.layers.Dense(512)(concat_output)
    # One output unit needs a sigmoid: softmax over a single unit is always
    # exactly 1.0, which makes binary cross-entropy uninformative.
    output = tf.keras.layers.Dense(1, activation="sigmoid")(hidden)
    model = tf.keras.Model(inputs=[q_in, a_in], outputs=output)
    model.summary()
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])

    logger.debug("fitting model")
    model.fit([X1, X2], y, validation_split=0.1, epochs=10, batch_size=64)

    logger.debug("saving model to %s", MODEL_FILE)
    model.save(MODEL_FILE)
    logger.debug("finished training")

In [2]:
train_questions, train_answers, train_labels = read_QNLI_dataset(TRAINING_DATAFILE)
test_questions, test_answers, test_labels = read_QNLI_dataset(DEV_DATAFILE)


def _cycle_batches(questions, answers, labels, batch_size):
    """Yield ``((question_batch, answer_batch), label_batch)`` forever.

    The three lists are sliced with the same start/stop indices, so every
    batch keeps questions, answers and labels aligned. After the last batch
    the iteration restarts from the beginning, which lets the generator feed
    any number of epochs.

    Raises:
        ValueError: on first iteration, if ``labels`` is empty (an endless
            loop that never yields would otherwise hang the caller).
    """
    if not labels:
        raise ValueError("cannot build batches from an empty dataset")
    while True:
        for start in range(0, len(labels), batch_size):
            stop = start + batch_size
            yield (questions[start:stop], answers[start:stop]), labels[start:stop]


# NOTE(review): MagnitudeUtils.batchify appears to slice its X argument along
# the first axis. Given the two-element list [questions, answers] it would then
# pair the entire dataset with BATCH_SIZE labels on every step. Batching is
# done here instead so the slicing is explicit -- confirm against pymagnitude.
training_batches = _cycle_batches(train_questions, train_answers, train_labels, BATCH_SIZE)
testing_batches = _cycle_batches(test_questions, test_answers, test_labels, BATCH_SIZE)

# ceiling division in integer arithmetic (no dependency on the math module,
# which this notebook never imports)
num_batches_per_epoch_train = (len(train_questions) + BATCH_SIZE - 1) // BATCH_SIZE
num_batches_per_epoch_test = (len(test_questions) + BATCH_SIZE - 1) // BATCH_SIZE

In [3]:
def _embed_and_pad(token_batch):
    """Embed a batch of token lists and pad/truncate it to MAX_SEQ_LENGTH.

    Tokens are looked up with ``vectors.query``; the result is zero-padded at
    the end (or cut at the end) so every sequence has MAX_SEQ_LENGTH steps,
    and returned as a float32 array.
    """
    return tf.keras.preprocessing.sequence.pad_sequences(
        vectors.query(token_batch),
        padding='post',
        maxlen=MAX_SEQ_LENGTH,
        truncating='post',
        value=0,
        dtype='float32',
    )


def _model_batches(batches):
    """Lazily turn ``((questions, answers), labels)`` batches into model input.

    Yields ``([question_array, answer_array], label_array)`` tuples, the
    (inputs, targets) form consumed by ``fit_generator``.
    """
    for (question_batch, answer_batch), label_batch in batches:
        yield (
            [_embed_and_pad(question_batch), _embed_and_pad(answer_batch)],
            # targets go to Keras as an array, not a Python list
            np.asarray(label_batch, dtype='float32'),
        )


# Generates batches of the transformed training data
train_batch_generator = _model_batches(training_batches)

# Generates batches of the transformed test data
test_batch_generator = _model_batches(testing_batches)

In [None]:
# u = question sequence embedding (MAX_SEQ_LENGTH, 300) -> 1500D bidirectional LSTM -> maxpooling
q_in = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH, vectors.dim))
q_Bidir_LSTM = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1500, activation='tanh', return_sequences=True), merge_mode='concat')(q_in)
# Max over the time axis. GlobalMaxPooling1D drops that axis, so u is
# (batch, 3000) and the final output is (batch, 1), matching one label per
# example.
u = tf.keras.layers.GlobalMaxPooling1D()(q_Bidir_LSTM)

# v = answer sequence embedding (MAX_SEQ_LENGTH, 300) -> 1500D bidirectional LSTM -> maxpooling
a_in = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH, vectors.dim))
a_Bidir_LSTM = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1500, activation='tanh', return_sequences=True), merge_mode='concat')(a_in)
v = tf.keras.layers.GlobalMaxPooling1D()(a_Bidir_LSTM)

# define the concatenation function for the Lambda layer
def concat_u_v(uv):
    """Join u, v, |u - v| and u * v along the last axis."""
    u = uv[0]
    v = uv[1]
    return tf.concat([u, v, tf.math.abs(u-v), tf.math.multiply(u, v)], -1)

# output = concatenation layer (u, v, |u-v|, u*v) -> 512D hidden layer -> output node
concat_output = tf.keras.layers.Lambda(concat_u_v)([u, v])
# no activation is passed, so this hidden layer is linear
hidden = tf.keras.layers.Dense(512)(concat_output)
# One output unit needs a sigmoid: softmax over a single unit is always
# exactly 1.0, which makes binary cross-entropy uninformative.
output = tf.keras.layers.Dense(1, activation="sigmoid")(hidden)
model = tf.keras.Model(inputs=[q_in, a_in], outputs=output)
model.summary()
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])

# train from the batch generators; the dev-set generator provides validation
model.fit_generator(
    generator = train_batch_generator,
    steps_per_epoch = num_batches_per_epoch_train,
    validation_data = test_batch_generator,
    validation_steps = num_batches_per_epoch_test,
    epochs = 100,
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100, 300)     0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 100, 300)     0                                            
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 100, 3000)    21612000    input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 100, 3000)    21612000    input_2[0][0]                    
__________________________________________________________________________________________________
lambda (La