In [29]:
!pip install tensorflow-text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [30]:
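# Import libraries. `tensorflow_text` is never referenced by name, but it must stay imported:
# per the TF Hub documentation, importing it registers the ops the preprocessing model needs.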
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# TensorFlow Hub handles (URLs) for the two stages of the pipeline:
#   url_preprocessing - uncased English BERT preprocessing model, version 3
#   url_bert_model    - uncased English BERT encoder, L-12_H-768_A-12 variant, version 4
url_preprocessing = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
url_bert_model = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

In [31]:
# Wrap the preprocessing model and the BERT encoder from TF Hub as Keras layers.
# Both are called like functions further down: raw strings go through
# bert_preprocess_model, and its output is fed to bert_model.
bert_preprocess_model = hub.KerasLayer(url_preprocessing)
bert_model = hub.KerasLayer(url_bert_model)

In [33]:
# Four test sentences. Sentences 1 and 3 both contain the word "fair" but in
# different senses; presumably chosen to probe contextual embeddings.
text_test = ["He didn't receive fair treatment", 
             "Tom deserves unbiased judgement", 
             "Fun fair in New York city this summer", 
             "Carnival was packed with fun activities"]

# Run the raw strings through the preprocessing model.
text_preprocessed = bert_preprocess_model(text_test)

# The preprocessing model returns a dictionary with 3 keys
# (descriptions follow the TF Hub model documentation - confirm there):
# input_word_ids: token ids of the tokenized sentence
# input_mask: 1 for real tokens, 0 for padding positions
# input_type_ids: segment id of each token (all 0 for single-sentence inputs)
text_preprocessed.keys()

dict_keys(['input_mask', 'input_word_ids', 'input_type_ids'])

In [34]:
# Feed the preprocessed inputs to the BERT encoder.
bert_results = bert_model(text_preprocessed)

# The BERT model returns a dictionary with 4 keys:
# pooled_output: embedding of the entire sentence; each sentence is represented by one vector of 768 elements
# sequence_output: per-token embeddings; each sentence is padded to a fixed length of 128 tokens and each token is represented by a vector of 768 elements
# default: NOTE(review): believed to be an alias of pooled_output per the TF Hub model page - confirm
# encoder_outputs: list of 12 entries; each entry holds, per sentence, a 128 x 768 array shaped like sequence_output
bert_results.keys()

dict_keys(['pooled_output', 'default', 'encoder_outputs', 'sequence_output'])

In [37]:
# One 768-element sentence vector for each of the 4 input sentences.
bert_results['pooled_output'].shape

TensorShape([4, 768])

In [49]:
# Per-token embeddings: 4 sentences x 128 token positions x 768 elements.
bert_results['sequence_output'].shape

TensorShape([4, 128, 768])

In [42]:
# Number of entries in encoder_outputs (12 here, matching the "L-12" in the model handle).
len(bert_results['encoder_outputs'])

12

In [46]:
# First encoder_outputs entry, first sentence: 128 token positions x 768 elements.
bert_results['encoder_outputs'][0][0].shape

TensorShape([128, 768])

In [None]:
# Pull out the pooled (sentence-level) embedding of each of the four test
# sentences. np.expand_dims(..., axis=0) adds a leading axis so every vector
# becomes a 1 x 768 array, the 2-D (n_samples, n_features) layout that
# sklearn's cosine_similarity expects.
sent_01, sent_02, sent_03, sent_04 = (
    np.expand_dims(bert_results['pooled_output'][row], axis=0)
    for row in range(4)
)

In [None]:
# Cosine similarity between sentence 3 ("Fun fair in New York city this summer")
# and sentence 4 ("Carnival was packed with fun activities").
cosine_similarity(sent_03, sent_04)

array([[0.88839364]], dtype=float32)