In [16]:
import tensorflow as tf
import numpy as np
import regex
import csv
from pymagnitude import *
from tensorflow.keras.optimizers import SGD

TRAINING_FILE = "/Users/zetong/SST-2/train.tsv"
TESTING_FILE = "/Users/zetong/SST-2/test.tsv"
EMBEDDING = "/Users/zetong/glove.840B.300d.magnitude"
BATCH_SIZE = 16
EPOCHS = 100

vectors = Magnitude(EMBEDDING)

In [13]:
# read QNLI dataset from GLUE benchmark and return tokenized questions and answers
def tokenize_SST_dataset(FILE_PATH):
    trainfile = open(FILE_PATH)
    trainfile = csv.reader(trainfile, delimiter='\t')
    sentences_raw = []
    labels = []
    # skip the header
    next(trainfile)
    for row in trainfile:
        sentences_raw.append(row[0])
        labels.append(row[1])
    # seperate the questions and answers into words
    sentences = []
    for sentence in sentences_raw:
        sentences.append(regex.findall(r"[^[:punct:] ]+|[[:punct:]]", sentence))
    return sentences, list(map(int, labels))

In [3]:
s, l = tokenize_SST_dataset(TRAINING_FILE)

In [4]:
MAX_SEQ_LENGTH = len(max(s, key=len))

In [17]:
i = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH, vectors.dim))
Bidir_LSTM = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100, activation='tanh', return_sequences=True), merge_mode='concat')(i)
expanded_LSTM = tf.keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=-1))(Bidir_LSTM)
maxpool = tf.keras.layers.MaxPooling2D(pool_size=(MAX_SEQ_LENGTH, 1))(expanded_LSTM)
squeezed_1 =  tf.keras.layers.Lambda(lambda x: tf.squeeze(x, axis=-1))(maxpool)
squeezed_2 =  tf.keras.layers.Lambda(lambda x: tf.squeeze(x, axis=1))(squeezed_1)
hidden = tf.keras.layers.Dense(512)(squeezed_2)
output = tf.keras.layers.Dense(1, activation="softmax")(hidden)
model = tf.keras.Model(inputs=i, outputs=output)
model.summary()
opt = SGD(lr=0.01)
model.compile(loss="binary_crossentropy", optimizer=opt, metrics=["acc"])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 57, 300)           0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 57, 200)           320800    
_________________________________________________________________
lambda_9 (Lambda)            (None, 57, 200, 1)        0         
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 1, 200, 1)         0         
_________________________________________________________________
lambda_10 (Lambda)           (None, 1, 200)            0         
_________________________________________________________________
lambda_11 (Lambda)           (None, 200)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 512)               102912    
__________

In [9]:
x = vectors.query(s)
y = l
x = tf.keras.preprocessing.sequence.pad_sequences(x, maxlen=MAX_SEQ_LENGTH, dtype='float32', padding='post', truncating='post', value=0)

In [19]:
model.fit(x=x, y=y, batch_size = BATCH_SIZE, validation_split=0.1, epochs=EPOCHS)

Train on 60614 samples, validate on 6735 samples
Epoch 1/100
Epoch 2/100
 2224/60614 [>.............................] - ETA: 8:42 - loss: 7.0536 - acc: 0.5576

KeyboardInterrupt: 