In [1]:
import transformers
import numpy as np
import tensorflow as tf
import pandas as pd

# 1. Load BERT Model

Pretrained English (uncased) BERT checkpoint from Hugging Face

https://huggingface.co/bert-base-uncased

In [102]:
from transformers import BertTokenizer, TFBertModel

# Single source of truth for the checkpoint, so the tokenizer vocabulary and
# the model weights can never be loaded from two different checkpoints.
# NOTE: `bert-base-uncased` is the English uncased checkpoint, not a
# multilingual one. The `Mult` in the variable names presumably dates from an
# earlier multilingual experiment; the names are kept because later cells use them.
BERT_CHECKPOINT = 'bert-base-uncased'

bertMultTokeniser = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bertMultModel = TFBertModel.from_pretrained(
    BERT_CHECKPOINT,
    # Expose every layer's hidden states; later cells read `hidden_states[-1]`.
    output_hidden_states=True)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [103]:
# Print the model summary (layers, output shapes, parameter counts) as a sanity
# check that the checkpoint loaded.
bertMultModel.summary()

Model: "tf_bert_model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
Total params: 109,482,240
Trainable params: 109,482,240
Non-trainable params: 0
_________________________________________________________________


## Test encoding a few sample English sentences

In [122]:
# Sample sentences whose encodings are compared further down.
sentences = [
    'What does the fox say',
    'A dog barks at the farmer',
    'The patient has a bad condition',
    'The doctor rang me last night',
    'Good one',
]

# Tokenise each sentence on its own (one call per sentence, TensorFlow tensors
# as output), so every encoding keeps its own sequence length.
tensors = list(
    map(lambda text: bertMultTokeniser(text, return_tensors='tf'), sentences)
)

# Show the first encoding as the cell output.
tensors[0]

{'input_ids': <tf.Tensor: shape=(1, 7), dtype=int32, numpy=array([[ 101, 2054, 2515, 1996, 4419, 2360,  102]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1, 7), dtype=int32, numpy=array([[0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 7), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}

In [123]:
# Round-trip a sample sentence through the tokenizer to inspect the exact
# tokens the model receives, including the special tokens added by `encode`.
encoded = bertMultTokeniser.encode('Do you know who i am') # str -> ids
bertMultTokeniser.convert_ids_to_tokens(encoded) # ids -> tokens

['[CLS]', 'do', 'you', 'know', 'who', 'i', 'am', '[SEP]']

In [124]:
# Run the model once per tokenised sentence. Each output exposes
# `hidden_states` because the model was loaded with output_hidden_states=True.
wouts = [bertMultModel(t) for t in tensors]

In [137]:
# Shape of the last entry of `hidden_states` for each sentence, after dropping
# the leading batch axis of size 1: one row per token (special tokens included).
[w.hidden_states[-1].numpy()[0].shape for w in wouts] # hidden states

[(7, 768), (9, 768), (8, 768), (8, 768), (4, 768)]

Cosine similarity between each pair of sentences (higher means more similar)

In [139]:
from heapq import heappush, heappop  # heappop is used by the next cell
from itertools import combinations

from sklearn.metrics.pairwise import cosine_similarity

# Last hidden state of every token for each sentence (batch axis dropped).
# Row 0 of each array is the [CLS] token, used below as the sentence vector.
embeddings = [v.hidden_states[-1].numpy()[0] for v in wouts]

zs = list(zip(sentences, embeddings))

# heapq is a min-heap, so similarities are stored negated: the smallest key is
# the most similar pair.
closest = []
for (s1, v1), (s2, v2) in combinations(zs, 2):
    # cosine_similarity expects 2-D inputs and returns a (1, 1) array.
    c = cosine_similarity(v1[0].reshape(1, -1), v2[0].reshape(1, -1))
    # Store a plain float rather than the (1, 1) array, so heap ordering uses
    # ordinary scalar comparison and the score prints as a number.
    heappush(closest, (-float(c[0, 0]), (s1, s2)))

In [140]:
from heapq import nsmallest

# Number of best-scoring pairs to report.
top_n = 5

print(f'Top {top_n} closest pairs')
# `closest` holds (-similarity, (s1, s2)) entries, so the smallest keys are the
# most similar pairs. nsmallest reads the heap without popping from it, which
# keeps this cell re-runnable, and it returns fewer than `top_n` items instead
# of raising when fewer pairs exist.
for i, (c, (s1, s2)) in enumerate(nsmallest(top_n, closest)):
    print('==========================')
    print(f'Rank #{i}, {-c}')
    print(s1)
    print(s2)

Top 5 closest pairs
Rank #0, [[0.9283186]]
What does the fox say
Good one
Rank #1, [[0.9103328]]
What does the fox say
The doctor rang me last night
Rank #2, [[0.90115964]]
The doctor rang me last night
Good one
Rank #3, [[0.888417]]
The patient has a bad condition
Good one
Rank #4, [[0.8776811]]
What does the fox say
The patient has a bad condition


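These sentence similarities make no sense at all: unrelated pairs such as "What does the fox say" and "Good one" score highest, and every pair scores above 0.87. A likely cause is that the raw [CLS] vector of a pretrained BERT that has not been fine-tuned is not trained to act as a sentence embedding, so its cosine similarities are uniformly high and uninformative.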