In [1]:

# auto load the changes of referenced codes
%load_ext autoreload
%autoreload 2

# ebablbe auto-completion
%config Completer.use_jedi = False

In [2]:
# if new methods are created in referenced codes, run the following code.
%reload_ext autoreload

In [2]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

physical_devices = tf.config.list_physical_devices('GPU')
for device in physical_devices:
    tf.config.experimental.set_memory_growth(device, True)
    
def cosine_distance(tensor1, tensor2):
    # 求模长
    tensor1_norm = tf.sqrt(tf.reduce_sum(tf.square(tensor1)))
    tensor2_norm = tf.sqrt(tf.reduce_sum(tf.square(tensor2)))
    
    # 内积
    tensor1_tensor2 = tf.reduce_sum(tf.multiply(tensor1,tensor2))
    cosin = tensor1_tensor2/(tensor1_norm*tensor2_norm)
    
    return cosin

def euclidean_distance(tensor1, tensor2):
     return tf.sqrt(tf.reduce_sum(tf.square(tensor2-tensor1)))

btokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bmodel = TFBertModel.from_pretrained("bert-base-uncased")

sentence = "open a bank account. sit on a bank."
encoded = btokenizer.encode_plus(
    text=sentence,
    add_special_tokens=True,
    max_length=15,
    padding='max_length',
    return_attention_mask=True,
    return_tensors="tf"
)

inputs = encoded["input_ids"]
outputs = bmodel(inputs)
print(type(outputs), len(outputs))
print(outputs[0].shape)
print(outputs[1].shape)    

tokens = btokenizer.convert_ids_to_tokens(tf.squeeze(inputs))
bank1_vector, bank2_vector = [vector for token, vector in zip(tokens, outputs[0][0]) if token=='bank']
print(f'cosine_distance={cosine_distance(bank1_vector, bank2_vector)}')
print(f'euclidean_distance={euclidean_distance(bank1_vector, bank2_vector)}')

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


<class 'transformers.modeling_tf_outputs.TFBaseModelOutputWithPooling'> 2
(1, 15, 768)
(1, 768)
cosine_distance=0.7299358248710632
euclidean_distance=11.19631290435791


In [5]:
sentences = ["open a bank account.", "sit on a bank."]
encoded = btokenizer.batch_encode_plus(
        batch_text_or_text_pairs=sentences,
        add_special_tokens=True,
        max_length=10,
        padding='max_length',
        return_attention_mask=True,
        return_tensors="np" 
)

inputs = encoded["input_ids"]
outputs = bmodel(inputs)

tokens_list = [btokenizer.convert_ids_to_tokens(tf.squeeze(inputs_)) for inputs_ in inputs]
bank_vectors = []
for i, tokens in enumerate(tokens_list):
    for token, vector in zip(tokens, outputs[i][0]):
        if token=='bank':
            bank_vectors.append(vector)
        
bank1_vector, bank2_vector = bank_vectors
print(f'cosine_distance={cosine_distance(bank1_vector, bank2_vector)}')
print(f'euclidean_distance={euclidean_distance(bank1_vector, bank2_vector)}')

cosine_distance=-0.5568811297416687
euclidean_distance=26.774715423583984


In [6]:
inputs

array([[ 101, 2330, 1037, 2924, 4070, 1012,  102,    0,    0,    0],
       [ 101, 4133, 2006, 1037, 2924, 1012,  102,    0,    0,    0]])