In [2]:
import transformers
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd

# 1. Load BERT Model

Pretrained multi-lingual from HuggingFace

https://huggingface.co/bert-base-uncased

In [19]:
from transformers import BertTokenizer, TFBertModel

# Multi-lingual
mbertTokeniser = BertTokenizer.from_pretrained('bert-base-uncased')
mbertModel = TFBertModel.from_pretrained(
    "bert-base-uncased")

mbertModel.trainable = False

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [20]:
mbertModel.summary()

Model: "tf_bert_model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
Total params: 109,482,240
Trainable params: 0
Non-trainable params: 109,482,240
_________________________________________________________________


## Test encoding text in various languages

In [21]:
sentences = [
    'What does the fox say',
    'A dog barks at the farmer',
    'The patient has a bad condition',
    'The doctor rang me last night',
    'Good one'
]

tensors = [mbertTokeniser(s, return_tensors='tf') for s in sentences]


tensors[0]

{'input_ids': <tf.Tensor: shape=(1, 7), dtype=int32, numpy=array([[ 101, 2054, 2515, 1996, 4419, 2360,  102]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1, 7), dtype=int32, numpy=array([[0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 7), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}

In [22]:
wouts = [mbertModel(t) for t in tensors]

In [36]:
[w for w in wouts[0]] # output structure of BERT

['last_hidden_state', 'pooler_output']

In [53]:
[w[0].numpy().shape for w in wouts] # dim of hidden states output

[(1, 7, 768), (1, 9, 768), (1, 8, 768), (1, 8, 768), (1, 4, 768)]

In [63]:
# Flatten hidden states of N words in each sentence
# by averaging
fseqs = [np.mean(w[0].numpy(), axis=1, keepdims=True) for w in wouts]
list(map(lambda f: f.shape, fseqs))

[(1, 1, 768), (1, 1, 768), (1, 1, 768), (1, 1, 768), (1, 1, 768)]

In [37]:
encoded = mbertTokeniser.encode('Do you know who i am') # str -> ids
mbertTokeniser.convert_ids_to_tokens(encoded) # ids -> tokens

['[CLS]', 'do', 'you', 'know', 'who', 'i', 'am', '[SEP]']

In [38]:
sentence_ids = list(map(mbertTokeniser.encode, sentences))

sentence_ids[0] # [CLS] first token

[101, 2054, 2515, 1996, 4419, 2360, 102]

Distance between each pair of sentences

In [67]:
from sklearn.metrics.pairwise import cosine_similarity
from heapq import heappush, heappop

zs = list(zip(sentences, fseqs))

closest = []
for i,(s1,v1) in enumerate(zs):
    for j,(s2,v2) in enumerate(zs[i+1:]):
        c = cosine_similarity(
            np.atleast_2d(v1.flatten()),
            np.atleast_2d(v2.flatten()))
        heappush(closest, (-c, (s1,s2)))

In [68]:
print('Top 5 closest pairs')
for i in range(5):
    c, (s1,s2) = heappop(closest)
    print('==========================')
    print(f'Rank #{i}, {-c}')
    print(s1)
    print(s2)

Top 5 closest pairs
Rank #0, [[0.63417834]]
What does the fox say
A dog barks at the farmer
Rank #1, [[0.5795742]]
What does the fox say
Good one
Rank #2, [[0.55851924]]
What does the fox say
The patient has a bad condition
Rank #3, [[0.5556514]]
A dog barks at the farmer
The patient has a bad condition
Rank #4, [[0.55180717]]
A dog barks at the farmer
The doctor rang me last night


These sentence similarities make no sense at all

## Build Classification based on BERT

By freezing pretrained layer of BERT, and add a new softmax layer

In [72]:
mbertModel.trainable = False

In [73]:
mbertModel

<transformers.models.bert.modeling_tf_bert.TFBertModel at 0x7fb8d3b0a290>

Add custom smoothing & classification layers

In [74]:
MAX_LEN = 16

In [75]:
# Train with distribution over GPUs
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
        
    # input layer (1) taking tokenised words
    input_ids = keras.layers.Input(
        shape=(MAX_LEN,), 
        dtype=tf.int32,
        name="input_ids")
    
    # input layer (2) taking attention masks (masking paddings)
    mask_ids = keras.layers.Input(
        shape=(MAX_LEN,),
        dtype=tf.int32,
        name="attention_mask_ids")

    # input layer (3) taking token types
    token_type_ids = keras.layers.Input(
        shape=(MAX_LEN,),
        dtype=tf.int32,
        name="token_type_ids")
    
    # BERT layer
    sequence_output, pooled_output = mbertModel(
        input_ids, 
        attention_mask=mask_ids,
        token_type_ids=token_type_ids)

    # smoothening layers
    pool1 = keras.layers.GlobalAveragePooling1D()(sequence_output)
    
    # Modeling
    model = keras.models.Model(
        inputs=[input_ids, mask_ids, token_type_ids],
        outputs=[])
    
    model.compile(
        loss='categorical_crossentropy',
        metric='accuracy')

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


TypeError: Inputs to a layer should be tensors. Got: last_hidden_state