### Read training, dev and unlabeled test data

In [2]:
# Mount Google Drive inside the Colab runtime; the data files read by this
# notebook are addressed through this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


The following starter code (Python 3) shows how to read the labeled training and dev sentence pairs, and the unlabeled test sentence pairs, into lists.

In [3]:
import csv

In [4]:
# Row containers for the three splits, filled by the CSV-reading cells.
train = []
dev = []
test = []

In [5]:
# Load the labeled training pairs. Each row is [sentence_1, sentence_2, label],
# where the label is read as the string '0' or '1'.
# The list is rebuilt from the file on every run (instead of appended to), so
# re-running this cell cannot silently duplicate the training rows.
# newline='' is the file mode the csv module asks for, so that quoted fields
# containing line breaks are parsed correctly.
with open('/content/drive/MyDrive/Colab Notebooks/data/pnli_train.csv',
          encoding='utf-8', newline='') as fp:
    train = list(csv.reader(fp))
print(len(train))
print(train[:3])

5983
[['Sometimes do exercise.', 'A person typically desire healthy life.', '1'], ['Who eats junk foods.', 'A person typically desire healthy life.', '0'], ['A person is sick.', 'A person typically desire healthy life.', '1']]


In [6]:
# Load the labeled dev pairs. Each row is [sentence_1, sentence_2, label],
# where the label is read as the string '0' or '1'.
# The list is rebuilt from the file on every run (instead of appended to), so
# re-running this cell cannot silently duplicate the dev rows.
# newline='' is the file mode the csv module asks for, so that quoted fields
# containing line breaks are parsed correctly.
with open('/content/drive/MyDrive/Colab Notebooks/data/pnli_dev.csv',
          encoding='utf-8', newline='') as fp:
    dev = list(csv.reader(fp))
print(len(dev))
print(dev[:3])

1055
[['A person is looking for accuracy.', 'A person typically desires accurate results.', '1'], ['A person does not care for accuracy.', 'A person typically desires accurate results.', '0'], ['The person double checks their data.', 'A person typically desires accurate results.', '1']]


In [7]:

# Load the unlabeled test pairs. Each row is [sentence_1, sentence_2]; there
# is no label column.
# The list is rebuilt from the file on every run (instead of appended to), so
# re-running this cell cannot silently duplicate the test rows (which would
# also change the number of predictions written out later).
# newline='' is the file mode the csv module asks for, so that quoted fields
# containing line breaks are parsed correctly.
with open('/content/drive/MyDrive/Colab Notebooks/data/pnli_test_unlabeled.csv',
          encoding='utf-8', newline='') as fp:
    test = list(csv.reader(fp))
print(len(test))
print(test[:3])

4850
[['The people want to have a romantic and pleasant feel.', 'People typically does desire to smell violets.'], ['The contract is to buy products from you.', 'Getting contract typically cause to make money or spend money.'], ['Train station is closed.', 'Line can typically be used to move train along tracks.']]


### Main Code Body

You may choose to experiment with different methods in your program. However, you need to embed the training and inference processes here. We will grade using your predictions on the unlabeled test data, and we will check this part to understand how your method produced those predictions.

In [8]:
!pip install -q -U "tensorflow-text==2.8.*"
!pip install -q tf-models-official==2.4.0
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
from official.modeling import tf_utils
from official import nlp
from official.nlp import bert
# Load the required submodules
import official.nlp.optimization
import official.nlp.bert.bert_models
import official.nlp.bert.configs
import official.nlp.bert.run_classifier
import official.nlp.bert.tokenization
import official.nlp.data.classifier_data_lib
import official.nlp.modeling.losses
import official.nlp.modeling.models
import official.nlp.modeling.networks
import os
import numpy as np
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

[K     |████████████████████████████████| 4.9 MB 14.5 MB/s 
[K     |████████████████████████████████| 462 kB 84.3 MB/s 
[K     |████████████████████████████████| 1.1 MB 14.7 MB/s 
[K     |████████████████████████████████| 596 kB 66.0 MB/s 
[K     |████████████████████████████████| 47.8 MB 101 kB/s 
[K     |████████████████████████████████| 1.1 MB 64.2 MB/s 
[K     |████████████████████████████████| 237 kB 75.7 MB/s 
[K     |████████████████████████████████| 352 kB 82.6 MB/s 
[K     |████████████████████████████████| 1.2 MB 53.1 MB/s 
[K     |████████████████████████████████| 99 kB 11.6 MB/s 
[K     |████████████████████████████████| 43 kB 2.2 MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [9]:
# TF Hub handles consumed by build_classifier_model(): the text preprocessing
# model and the encoder that is fine-tuned.
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
tfhub_handle_encoder = 'https://tfhub.dev/google/experts/bert/wiki_books/2'

# Disabled alternative pair (ALBERT base):
# tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/albert_en_base/2'
# tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/albert_en_preprocess/3'

# Disabled: loading the two hub layers as stand-alone objects.
# bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
# bert_model = hub.KerasLayer(tfhub_handle_encoder)
def build_classifier_model():
    """Assemble the sentence-pair classifier as a Keras functional model.

    Raw strings go through the TF Hub preprocessing layer loaded from
    ``tfhub_handle_preprocess`` and then through the trainable encoder loaded
    from ``tfhub_handle_encoder``. The encoder's ``pooled_output`` is passed
    through dropout (rate 0.1) and a single-unit sigmoid layer.

    Returns:
        A ``tf.keras.Model`` whose input is a string tensor named
        ``text_input`` and whose output is the sigmoid activation of the
        ``classifier`` layer (one value per input string).
    """
    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text_input')

    preprocess = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    bert = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')

    pooled = bert(preprocess(raw_text))['pooled_output']
    regularized = tf.keras.layers.Dropout(0.1)(pooled)
    probability = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(regularized)

    return tf.keras.Model(raw_text, probability)

In [10]:
def build_dataset(data):
  """Turn labeled rows into an unbatched ``tf.data.Dataset`` of (text, label).

  Args:
    data: iterable of rows; ``row[0]`` and ``row[1]`` are the two sentences
      and ``row[2]`` is the label, given as anything ``float()`` accepts.

  Returns:
    A dataset whose elements are ``(text, label)`` pairs, where ``text`` is
    "[CLS] <row[0]> [SEP] <row[1]> [SEP]" and ``label`` is the label as a float.
  """
  # NOTE(review): the [CLS]/[SEP] markers are inserted as plain text. Whether
  # the TF Hub preprocessing layer treats them as special tokens (rather than
  # tokenizing them like ordinary words) is not visible here -- TODO confirm.
  examples = [
      ("[CLS] " + row[0] + " [SEP] " + row[1] + " [SEP]", float(row[2]))
      for row in data
  ]
  texts = [text for text, _ in examples]
  labels = [label for _, label in examples]
  return tf.data.Dataset.zip((
      tf.data.Dataset.from_tensor_slices(texts),
      tf.data.Dataset.from_tensor_slices(labels),
  ))

# Training batches. The examples are shuffled before batching because Keras
# does not shuffle a tf.data.Dataset by itself (Model.fit ignores its `shuffle`
# argument for Dataset inputs), and the CSV rows come grouped by their second
# sentence. The buffer covers the whole training set so the shuffle is uniform;
# max(1, ...) keeps the buffer size valid even if `train` is empty.
SHUFFLE_SEED = 42  # fixed so that runs are repeatable
train_dataset = build_dataset(train)
train_dataset = train_dataset.shuffle(buffer_size=max(1, len(train)),
                                      seed=SHUFFLE_SEED,
                                      reshuffle_each_iteration=True)
train_dataset = train_dataset.batch(batch_size = 32,name = "train_dataset")

# Dev batches are only used for evaluation, so their order is left untouched.
dev_dataset = build_dataset(dev)
dev_dataset = dev_dataset.batch(batch_size = 32,name = "dev_dataset")

def build_test_dataset(data):
  """Turn unlabeled rows into an unbatched ``tf.data.Dataset`` of texts.

  Args:
    data: iterable of rows; ``row[0]`` and ``row[1]`` are the two sentences.

  Returns:
    A dataset of strings of the form "[CLS] <row[0]> [SEP] <row[1]> [SEP]",
    in the same order as ``data``.
  """
  texts = ["[CLS] " + row[0] + " [SEP] " + row[1] + " [SEP]" for row in data]
  return tf.data.Dataset.from_tensor_slices(texts)
# Test batches, kept in file order so predictions line up with the rows of `test`.
test_dataset = build_test_dataset(test).batch(batch_size=32, name="test_dataset")


In [19]:
# Instantiate the classifier (hub preprocessing + trainable encoder + sigmoid head).
classifier_model = build_classifier_model()

In [20]:
# The model's final Dense layer already applies a sigmoid, so its outputs are
# probabilities rather than logits; the loss must therefore be configured with
# from_logits=False.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
# Accuracy of the thresholded probabilities against the 0/1 labels.
metrics = tf.metrics.BinaryAccuracy()

In [21]:
# Smoke test before fine-tuning: run one forward pass on the first dev pair
# and display the resulting sigmoid output.
sample_text = f"[CLS] {dev[0][0]} [SEP] {dev[0][1]} [SEP]"
classifier_model(tf.constant([sample_text])).numpy()

array([[0.72539896]], dtype=float32)

In [22]:
# Training schedule hyper-parameters.
epochs = 4
init_lr = 3e-5

# train_dataset is already batched, so its cardinality is the number of
# batches (= optimizer steps) per epoch.
steps_per_epoch = tf.data.experimental.cardinality(train_dataset).numpy()
num_train_steps = epochs * steps_per_epoch
# Warm-up length is 10% of the total number of training steps.
num_warmup_steps = int(num_train_steps * 0.1)

optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

In [23]:
# Attach the optimizer, loss and metric defined in the previous cells.
classifier_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [24]:
# Fine-tune on the training batches, using the dev batches as validation data.
print(f'Training model with {tfhub_handle_encoder}')
history = classifier_model.fit(train_dataset, validation_data=dev_dataset, epochs=epochs)

Training model with https://tfhub.dev/google/experts/bert/wiki_books/2
Epoch 1/4


  return dispatch_target(*args, **kwargs)


Epoch 2/4
Epoch 3/4
Epoch 4/4


In [25]:
# Persist the fine-tuned weights to Google Drive so they outlive the Colab runtime.
classifier_model.save_weights('/content/drive/MyDrive/Colab Notebooks/hw4_weights/hw4_epoch4_wiki_bert.ckpt')

In [26]:
# Run inference on the batched, unlabeled test set; each row of the result
# holds the model's single sigmoid output for one test pair.
predict_result = classifier_model.predict(test_dataset)

In [27]:
# `results` must end up as a list with one 0/1 prediction per test pair.
# Each row of predict_result holds a single sigmoid output; threshold it at
# 0.5 explicitly (an output of exactly 0.5 maps to 0) and store plain Python
# ints rather than NumPy scalars.
results = [int(row[0] > 0.5) for row in predict_result]


In [None]:
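# Optional (disabled): rebuild the model and restore previously saved weights
# instead of retraining. Note that the path below ('hw4_1.ckpt') is not the
# checkpoint written by the save_weights cell ('hw4_epoch4_wiki_bert.ckpt').
# reload_model = build_classifier_model()
# reload_model.load_weights('/content/drive/MyDrive/Colab Notebooks/hw4_weights/hw4_1.ckpt')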

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fbd82e39ed0>

### Output Prediction Result File

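You will need to submit a prediction result file. It should have 4850 lines, one per test instance in the test set loaded above (the figure of 2028 in the original template does not match this data). Every line should be either 0 or 1, which is your model's prediction on the respective test set instance.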

In [28]:
# `results` must hold exactly one prediction per row of the unlabeled test set
# (pnli_test_unlabeled.csv, loaded into `test`), which has 4850 rows.
assert len(results) == len(test), (
    f"expected one prediction per test row ({len(test)}), got {len(results)}")
assert len(results) == 4850, f"expected 4850 predictions, got {len(results)}"

In [29]:
# Make sure the results are the integers 0 and 1, not floating-point numbers.
results = list(map(int, results))

In [30]:
# write your prediction results to 'upload_predictions.txt' and upload that later
with open('upload_predictions.txt', 'w', encoding = 'utf-8') as fp:
    for x in results:
        fp.write(str(x) + '\n')