# Test: fine-tuning BERT (`bert-base-cased`) on GLUE/MRPC with TensorFlow and `transformers`

In [1]:
import os

import tensorflow as tf
import tensorflow_datasets
from transformers import *

In [2]:
# Fetch the pretrained tokenizer and TF classification model from the same
# checkpoint name, then load the GLUE/MRPC dataset.
checkpoint = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = TFBertForSequenceClassification.from_pretrained(checkpoint)
data = tensorflow_datasets.load('glue/mrpc')

INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset glue (/Users/tarrade/tensorflow_datasets/glue/mrpc/1.0.0)
INFO:absl:Constructing tf.data.Dataset for split None, from /Users/tarrade/tensorflow_datasets/glue/mrpc/1.0.0


In [3]:
# Display the object returned by tensorflow_datasets.load('glue/mrpc') as the cell output.
data

{'test': <DatasetV1Adapter shapes: {idx: (), label: (), sentence1: (), sentence2: ()}, types: {idx: tf.int32, label: tf.int64, sentence1: tf.string, sentence2: tf.string}>,
 'train': <DatasetV1Adapter shapes: {idx: (), label: (), sentence1: (), sentence2: ()}, types: {idx: tf.int32, label: tf.int64, sentence1: tf.string, sentence2: tf.string}>,
 'validation': <DatasetV1Adapter shapes: {idx: (), label: (), sentence1: (), sentence2: ()}, types: {idx: tf.int32, label: tf.int64, sentence1: tf.string, sentence2: tf.string}>}

In [4]:
# List the keys of `data`, i.e. the names under which the loaded splits are stored.
data.keys()

dict_keys(['test', 'train', 'validation'])

In [5]:
# Display only the 'train' entry of `data`.
data['train']

<DatasetV1Adapter shapes: {idx: (), label: (), sentence1: (), sentence2: ()}, types: {idx: tf.int32, label: tf.int64, sentence1: tf.string, sentence2: tf.string}>

In [6]:
# Per-feature static shapes of the elements in the 'train' split.
# NOTE(review): `tensorflow.python...` is an internal import path; the public
# `tf.compat.v1.data.get_output_shapes` looks like the equivalent entry point —
# confirm for the installed TensorFlow version. Keep this import for as long as
# any other cell still references `dataset_ops`.
from tensorflow.python.data.ops import dataset_ops
dataset_ops.get_legacy_output_shapes(data['train'])

{'idx': TensorShape([]),
 'label': TensorShape([]),
 'sentence1': TensorShape([]),
 'sentence2': TensorShape([])}

In [7]:
# Per-feature dtypes of the elements in the 'train' split.
# Uses the public tf.compat.v1.data entry point so this cell depends neither on
# a private `tensorflow.python` module nor on an import made in another cell.
tf.compat.v1.data.get_output_types(data['train'])

{'idx': tf.int32,
 'label': tf.int64,
 'sentence1': tf.string,
 'sentence2': tf.string}

In [8]:
# Per-feature Python classes of the elements in the 'train' split.
# Uses the public tf.compat.v1.data entry point so this cell depends neither on
# a private `tensorflow.python` module nor on an import made in another cell.
tf.compat.v1.data.get_output_classes(data['train'])

{'idx': tensorflow.python.framework.ops.Tensor,
 'label': tensorflow.python.framework.ops.Tensor,
 'sentence1': tensorflow.python.framework.ops.Tensor,
 'sentence2': tensorflow.python.framework.ops.Tensor}

In [9]:
# Peek at the first training example only: its keys, the full record, and
# then a few individual fields.
for example in data['train'].take(1):
    print(example.keys())
    print(example)
    for field in ('idx', 'label', 'sentence1'):
        print(example[field])

dict_keys(['idx', 'label', 'sentence1', 'sentence2'])
{'idx': <tf.Tensor: shape=(), dtype=int32, numpy=1680>, 'label': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'sentence1': <tf.Tensor: shape=(), dtype=string, numpy=b'The identical rovers will act as robotic geologists , searching for evidence of past water .'>, 'sentence2': <tf.Tensor: shape=(), dtype=string, numpy=b'The rovers act as robotic geologists , moving on six wheels .'>}
tf.Tensor(1680, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(b'The identical rovers will act as robotic geologists , searching for evidence of past water .', shape=(), dtype=string)


In [10]:
# Same peek via as_numpy_iterator(): print just the first training element.
for element in data['train'].take(1).as_numpy_iterator():
    print(element)

{'idx': 1680, 'label': 0, 'sentence1': b'The identical rovers will act as robotic geologists , searching for evidence of past water .', 'sentence2': b'The rovers act as robotic geologists , moving on six wheels .'}


In [11]:
import numpy as np
# Pull every training element into a list, wrap it in an array and report its
# shape (this walks the whole split once).
np.asarray(list(data['train'].as_numpy_iterator())).shape

(3668,)

In [12]:
# Count the training elements by walking the split once.
sum(1 for _ in data['train'].as_numpy_iterator())

3668

In [13]:
# One-element list holding the first training example as plain NumPy/Python values.
[example for example in data['train'].take(1).as_numpy_iterator()]

[{'idx': 1680,
  'label': 0,
  'sentence1': b'The identical rovers will act as robotic geologists , searching for evidence of past water .',
  'sentence2': b'The rovers act as robotic geologists , moving on six wheels .'}]

In [14]:
# Convert the raw GLUE/MRPC splits into tf.data pipelines of model features
# (max_length=128), then shuffle/batch/repeat the training side and batch the
# validation side.
train_dataset = (
    glue_convert_examples_to_features(data['train'], tokenizer, max_length=128, task='mrpc')
    .shuffle(100)
    .batch(32)
    .repeat(2)
)
valid_dataset = glue_convert_examples_to_features(
    data['validation'], tokenizer, max_length=128, task='mrpc'
).batch(64)

In [15]:
# Names of the feature arrays in one training batch; index [0] selects the
# first component of the dataset element, which holds the features mapping.
next(iter(train_dataset.take(1).as_numpy_iterator()))[0].keys()

dict_keys(['input_ids', 'attention_mask', 'token_type_ids'])

In [16]:
# 'input_ids' array of one training batch.
# NOTE(review): the pipeline is shuffled, so each cell that re-iterates it may
# show a different batch — confirm if cross-cell comparison matters.
next(iter(train_dataset.take(1).as_numpy_iterator()))[0]['input_ids']

array([[  101,   138,  2877, ...,     0,     0,     0],
       [  101,  1109, 27772, ...,     0,     0,     0],
       [  101,  1109,  5626, ...,     0,     0,     0],
       ...,
       [  101,   146,  1458, ...,     0,     0,     0],
       [  101,  1109,  2221, ...,     0,     0,     0],
       [  101,   107,  1188, ...,     0,     0,     0]], dtype=int32)

In [17]:
# Shape of the 'input_ids' array of one training batch.
next(iter(train_dataset.take(1).as_numpy_iterator()))[0]['input_ids'].shape

(32, 128)

In [18]:
# 'attention_mask' array of one training batch.
next(iter(train_dataset.take(1).as_numpy_iterator()))[0]['attention_mask']

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)

In [19]:
# Shape of the 'attention_mask' array of one training batch.
next(iter(train_dataset.take(1).as_numpy_iterator()))[0]['attention_mask'].shape

(32, 128)

In [20]:
# 'token_type_ids' array of one training batch.
next(iter(train_dataset.take(1).as_numpy_iterator()))[0]['token_type_ids']

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [21]:
# Shape of the 'token_type_ids' array of one training batch.
next(iter(train_dataset.take(1).as_numpy_iterator()))[0]['token_type_ids'].shape

(32, 128)

In [22]:
# Encode a short sample sentence with the tokenizer and print the result.
sample_text = "This is a simple input to be tokenized"
encoding = tokenizer.encode(sample_text)
print("Encoded string: {}".format(encoding))

Encoded string: [101, 1188, 1110, 170, 3014, 7758, 1106, 1129, 22559, 2200, 102]


In [23]:
# 'sentence1' of the first training example, decoded from bytes to str.
next(iter(data['train'].take(1).as_numpy_iterator()))['sentence1'].decode("utf-8")

'The identical rovers will act as robotic geologists , searching for evidence of past water .'

In [24]:
# Token ids the tokenizer produces for 'sentence1' of the first training example.
tokenizer.encode(
    next(iter(data['train'].take(1).as_numpy_iterator()))['sentence1'].decode("utf-8")
)

[101,
 1109,
 6742,
 187,
 24985,
 1209,
 2496,
 1112,
 24628,
 25166,
 1116,
 117,
 6205,
 1111,
 2554,
 1104,
 1763,
 1447,
 119,
 102]

In [25]:
# Token ids the tokenizer produces for 'sentence2' of the first training example.
tokenizer.encode(
    next(iter(data['train'].take(1).as_numpy_iterator()))['sentence2'].decode("utf-8")
)

[101,
 1109,
 187,
 24985,
 2496,
 1112,
 24628,
 25166,
 1116,
 117,
 2232,
 1113,
 1565,
 8089,
 119,
 102]

In [26]:
# Prepare training: build the accuracy metric, the loss and an Adam optimizer
# with a fixed learning rate of 3e-5, then compile the tf.keras model with them.
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)

In [27]:
# Print the layer-by-layer summary of the compiled model, including parameter counts.
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108310272 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 108,311,810
Trainable params: 108,311,810
Non-trainable params: 0
_________________________________________________________________


In [28]:
# stop here
# The next line is intentionally not valid Python: the SyntaxError it raises
# halts a "Run All" at this cell.
# NOTE(review): presumably meant to keep the training/export cells below from
# running automatically — confirm, and consider a config flag instead.
this is an error

SyntaxError: invalid syntax (<ipython-input-28-e27c1f060d6c>, line 2)

In [None]:
# Train and evaluate with tf.keras.Model.fit(): 2 epochs of 115 training steps
# each, with 7 validation steps drawn from valid_dataset.
history = model.fit(
    x=train_dataset,
    epochs=2,
    steps_per_epoch=115,
    validation_data=valid_dataset,
    validation_steps=7,
)

In [None]:
# Load the TensorFlow model in PyTorch for inspection.
# Create the target directory first: some transformers releases require
# save_pretrained() to be given an existing directory and fail otherwise.
# exist_ok=True makes the cell safe to re-run.
os.makedirs('./save/', exist_ok=True)
model.save_pretrained('./save/')
# from_tf=True tells from_pretrained to read the TensorFlow weights just saved.
pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)

In [None]:
# Quick sanity check on the paraphrase task (MRPC): pair one reference sentence
# with a paraphrase and with a negated variant, and encode both pairs as
# PyTorch tensors.
sentence_0 = "This research was consistent with his findings."
sentence_1 = "His findings were compatible with this research."
sentence_2 = "His findings were not compatible with this research."
inputs_1, inputs_2 = (
    tokenizer.encode_plus(sentence_0, candidate, add_special_tokens=True, return_tensors='pt')
    for candidate in (sentence_1, sentence_2)
)

In [None]:
# Score both encoded pairs with the PyTorch model and take the argmax of the
# first element of its output as the predicted class index.
pred_1, pred_2 = (
    pytorch_model(encoded['input_ids'], token_type_ids=encoded['token_type_ids'])[0].argmax().item()
    for encoded in (inputs_1, inputs_2)
)

# A truthy prediction is reported as "a paraphrase", a falsy one as "not a paraphrase".
print("sentence_1 is", ("not a paraphrase", "a paraphrase")[bool(pred_1)], "of sentence_0")
print("sentence_2 is", ("not a paraphrase", "a paraphrase")[bool(pred_2)], "of sentence_0")