# Test

In [1]:
import tensorflow as tf
import tensorflow_datasets
from transformers import *
import math

In [2]:
# Load dataset, tokenizer, model from pretrained model/vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
data = tensorflow_datasets.load('glue/mrpc')

INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset glue (/root/tensorflow_datasets/glue/mrpc/1.0.0)
INFO:absl:Constructing tf.data.Dataset for split None, from /root/tensorflow_datasets/glue/mrpc/1.0.0


In [3]:
data

{'test': <DatasetV1Adapter shapes: {idx: (), label: (), sentence1: (), sentence2: ()}, types: {idx: tf.int32, label: tf.int64, sentence1: tf.string, sentence2: tf.string}>,
 'train': <DatasetV1Adapter shapes: {idx: (), label: (), sentence1: (), sentence2: ()}, types: {idx: tf.int32, label: tf.int64, sentence1: tf.string, sentence2: tf.string}>,
 'validation': <DatasetV1Adapter shapes: {idx: (), label: (), sentence1: (), sentence2: ()}, types: {idx: tf.int32, label: tf.int64, sentence1: tf.string, sentence2: tf.string}>}

In [4]:
data.keys()

dict_keys(['test', 'train', 'validation'])

In [5]:
data['train']

<DatasetV1Adapter shapes: {idx: (), label: (), sentence1: (), sentence2: ()}, types: {idx: tf.int32, label: tf.int64, sentence1: tf.string, sentence2: tf.string}>

In [6]:
from tensorflow.python.data.ops import dataset_ops
dataset_ops.get_legacy_output_shapes(data['train'])

{'idx': TensorShape([]),
 'label': TensorShape([]),
 'sentence1': TensorShape([]),
 'sentence2': TensorShape([])}

In [7]:
dataset_ops.get_legacy_output_types(data['train'])

{'idx': tf.int32,
 'label': tf.int64,
 'sentence1': tf.string,
 'sentence2': tf.string}

In [8]:
dataset_ops.get_legacy_output_classes(data['train'])

{'idx': tensorflow.python.framework.ops.Tensor,
 'label': tensorflow.python.framework.ops.Tensor,
 'sentence1': tensorflow.python.framework.ops.Tensor,
 'sentence2': tensorflow.python.framework.ops.Tensor}

In [9]:
for l in data['train']:
    print(l.keys())
    print(l)
    print(l['idx'])
    print(l['label'])
    print(l['sentence1'])
    break

dict_keys(['idx', 'label', 'sentence1', 'sentence2'])
{'idx': <tf.Tensor: shape=(), dtype=int32, numpy=1680>, 'label': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'sentence1': <tf.Tensor: shape=(), dtype=string, numpy=b'The identical rovers will act as robotic geologists , searching for evidence of past water .'>, 'sentence2': <tf.Tensor: shape=(), dtype=string, numpy=b'The rovers act as robotic geologists , moving on six wheels .'>}
tf.Tensor(1680, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(b'The identical rovers will act as robotic geologists , searching for evidence of past water .', shape=(), dtype=string)


In [10]:
# get numpy array
for element in data['train'].as_numpy_iterator(): 
    print(element) 
    break

{'idx': 1680, 'label': 0, 'sentence1': b'The identical rovers will act as robotic geologists , searching for evidence of past water .', 'sentence2': b'The rovers act as robotic geologists , moving on six wheels .'}


In [11]:
import numpy as np
np.shape(list(data['train'].as_numpy_iterator()))

(3668,)

In [12]:
len(list(data['train'].as_numpy_iterator()))

3668

In [13]:
list(data['train'].take(1).as_numpy_iterator())

[{'idx': 1680,
  'label': 0,
  'sentence1': b'The identical rovers will act as robotic geologists , searching for evidence of past water .',
  'sentence2': b'The rovers act as robotic geologists , moving on six wheels .'}]

In [14]:
# Prepare dataset for GLUE as a tf.data.Dataset instance
train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_length=128, task='mrpc')
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, max_length=128, task='mrpc')
train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
valid_dataset = valid_dataset.batch(64)

In [15]:
len(list(train_dataset.as_numpy_iterator()))

230

In [16]:
#sample size/batch size and repeat 2 times
math.ceil((3668/32)*2)

230

In [17]:
list(train_dataset.take(1).as_numpy_iterator())[0][0].keys()

dict_keys(['input_ids', 'attention_mask', 'token_type_ids'])

In [18]:
list(train_dataset.take(1).as_numpy_iterator())[0][0]['input_ids'][0]

array([  101,  1456,  1237,  5809,  1845,  3216,  1107,  2447,  2008,
        6157,  6356,  1112, 18155,  1265,  3471,  1106,  3345,  1105,
        2670, 24091,  1815,  1154,  1103, 24787,   119,   102,  1456,
        1237,  5809,  3182,  1346, 12535,  6356,  2106,   117,  1112,
       18155,  1265,  3471,  1106,  3345,  1105,  2670, 24091,  1321,
        1103, 24787,   119,   102,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0]

In [19]:
for i in list(train_dataset.take(1).as_numpy_iterator())[0][0]['input_ids'][0]:
    print('{:7d}    ---->    {}'.format(i, tokenizer.decode(int(i))))

    101    ---->    [ C L S ]
   1109    ---->    T h e
   3085    ---->    b a n k
   1145    ---->    a l s o
   1163    ---->    s a i d
   1157    ---->    i t s
   2906    ---->    o f f e r
   1108    ---->    w a s
   2548    ---->    s u b j e c t
   1106    ---->    t o
   1103    ---->    t h e
   3311    ---->    a g r e e m e n t
   1104    ---->    o f
   1987    ---->    D r
   7897    ---->    # # a x
    112    ---->    '
    188    ---->    s
   2682    ---->    s e n i o r
   5482    ---->    b a n k s
    117    ---->    ,
   2682    ---->    s e n i o r
   7069    ---->    b o n d
  14322    ---->    h o l d e r s
   1105    ---->    a n d
   1119    ---->    h e
  13556    ---->    # # d g i n g
   5482    ---->    b a n k s
   1118    ---->    b y
   1476    ---->    3 0
   1347    ---->    S e p t e m b e r
   1581    ---->    2 0 0 3
    119    ---->    .
    102    ---->    [ S E P ]
   1109    ---->    T h e
   2906    ---->    o f f e r
   1110    ---->    i 

In [20]:
list(train_dataset.take(1).as_numpy_iterator())[0][0]['input_ids']

array([[  101,   107,  1409, ...,     0,     0,     0],
       [  101,  1124,  1108, ...,     0,     0,     0],
       [  101,  1130,  1382, ...,     0,     0,     0],
       ...,
       [  101,  1109, 11451, ...,     0,     0,     0],
       [  101, 13719,  2105, ...,     0,     0,     0],
       [  101,  1188,  1214, ...,     0,     0,     0]], dtype=int32)

In [21]:
list(train_dataset.take(1).as_numpy_iterator())[0][0]['input_ids'].shape

(32, 128)

In [42]:
list(train_dataset.take(1).as_numpy_iterator())[0][0]

{'input_ids': array([[ 101, 4673, 1108, ...,    0,    0,    0],
        [ 101,  155, 4538, ...,    0,    0,    0],
        [ 101, 1124, 1163, ...,    0,    0,    0],
        ...,
        [ 101, 1456, 1237, ...,    0,    0,    0],
        [ 101,  146, 1458, ...,    0,    0,    0],
        [ 101, 1332,  170, ...,    0,    0,    0]], dtype=int32),
 'attention_mask': array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]], dtype=int32),
 'token_type_ids': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int32)}

In [22]:
list(train_dataset.take(1).as_numpy_iterator())[0][0]['attention_mask']

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)

In [23]:
list(train_dataset.take(1).as_numpy_iterator())[0][0]['attention_mask'].shape

(32, 128)

In [24]:
list(train_dataset.take(1).as_numpy_iterator())[0][0]['token_type_ids']

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [25]:
list(train_dataset.take(1).as_numpy_iterator())[0][0]['token_type_ids'].shape

(32, 128)

In [26]:
encoding = tokenizer.encode("This is a simple input to be tokenized")

print("Encoded string: {}".format(encoding))

Encoded string: [101, 1188, 1110, 170, 3014, 7758, 1106, 1129, 22559, 2200, 102]


In [36]:
text=list(data['train'].take(1).as_numpy_iterator())[0]['sentence1'].decode("utf-8") 

In [37]:
text

'The identical rovers will act as robotic geologists , searching for evidence of past water .'

In [41]:
for word in text.split(' ') :
    print('{:10}    ---->    {}'.format(word, tokenizer.encode(word)))

The           ---->    [101, 1109, 102]
identical     ---->    [101, 6742, 102]
rovers        ---->    [101, 187, 24985, 102]
will          ---->    [101, 1209, 102]
act           ---->    [101, 2496, 102]
as            ---->    [101, 1112, 102]
robotic       ---->    [101, 24628, 102]
geologists    ---->    [101, 25166, 1116, 102]
,             ---->    [101, 117, 102]
searching     ---->    [101, 6205, 102]
for           ---->    [101, 1111, 102]
evidence      ---->    [101, 2554, 102]
of            ---->    [101, 1104, 102]
past          ---->    [101, 1763, 102]
water         ---->    [101, 1447, 102]
.             ---->    [101, 119, 102]


In [29]:
for i in tokenizer.encode(list(data['train'].take(1).as_numpy_iterator())[0]['sentence1'].decode("utf-8")):
    print('{:7d}    ---->    {}'.format(i, tokenizer.decode(int(i))))

    101    ---->    [ C L S ]
   1109    ---->    T h e
   6742    ---->    i d e n t i c a l
    187    ---->    r
  24985    ---->    # # o v e r s
   1209    ---->    w i l l
   2496    ---->    a c t
   1112    ---->    a s
  24628    ---->    r o b o t i c
  25166    ---->    g e o l o g i s t
   1116    ---->    # # s
    117    ---->    ,
   6205    ---->    s e a r c h i n g
   1111    ---->    f o r
   2554    ---->    e v i d e n c e
   1104    ---->    o f
   1763    ---->    p a s t
   1447    ---->    w a t e r
    119    ---->    .
    102    ---->    [ S E P ]


In [30]:
len(tokenizer.encode(list(data['train'].take(1).as_numpy_iterator())[0]['sentence1'].decode("utf-8")))

20

In [31]:
for i in tokenizer.encode(list(data['train'].take(1).as_numpy_iterator())[0]['sentence2'].decode("utf-8")):
        print('{:7d}    ---->    {}'.format(i, tokenizer.decode(int(i))))

    101    ---->    [ C L S ]
   1109    ---->    T h e
    187    ---->    r
  24985    ---->    # # o v e r s
   2496    ---->    a c t
   1112    ---->    a s
  24628    ---->    r o b o t i c
  25166    ---->    g e o l o g i s t
   1116    ---->    # # s
    117    ---->    ,
   2232    ---->    m o v i n g
   1113    ---->    o n
   1565    ---->    s i x
   8089    ---->    w h e e l s
    119    ---->    .
    102    ---->    [ S E P ]


In [32]:
len(tokenizer.encode(list(data['train'].take(1).as_numpy_iterator())[0]['sentence2'].decode("utf-8")))

16

In [33]:
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [34]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108310272 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 108,311,810
Trainable params: 108,311,810
Non-trainable params: 0
_________________________________________________________________


In [35]:
# stop here
this is an error

SyntaxError: invalid syntax (<ipython-input-35-e27c1f060d6c>, line 2)

In [None]:
# Train and evaluate using tf.keras.Model.fit()
history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
                    validation_data=valid_dataset, validation_steps=7)

In [None]:
# Load the TensorFlow model in PyTorch for inspection
model.save_pretrained('./save/')
pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)

In [None]:
# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
sentence_0 = "This research was consistent with his findings."
sentence_1 = "His findings were compatible with this research."
sentence_2 = "His findings were not compatible with this research."
inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')

In [None]:
pred_1 = pytorch_model(inputs_1['input_ids'], token_type_ids=inputs_1['token_type_ids'])[0].argmax().item()
pred_2 = pytorch_model(inputs_2['input_ids'], token_type_ids=inputs_2['token_type_ids'])[0].argmax().item()

print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")