Bidirectional Encoder Representations from Transformers.

In [2]:
# pip install transformers

Collecting transformers
  Downloading transformers-4.40.1-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.22.2-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.3-cp310-cp310-macosx_11_0_arm64.whl.metadata (3.8 kB)
Downloading transformers-4.40.1-py3-none-any.whl (9.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hDownloading huggingface_hub-0.22.2-py3-none-any.whl (388 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[

In [3]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
import numpy as np

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')

# Input text
text = "Word embeddings using BERT."

# Tokenize input text
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

# Convert tokens to ids
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Token IDs:", token_ids)

# Padding
max_len = 10
padded_token_ids = token_ids + [0] * (max_len - len(token_ids))
print("Padded Token IDs:", padded_token_ids)

# Convert to tensor
input_ids = tf.constant([padded_token_ids])

# Pass through BERT model
outputs = model(input_ids)

# Extract embeddings from the last layer
last_hidden_states = outputs.last_hidden_state

# Perform pooling (mean pooling in this example)
word_embeddings = tf.reduce_mean(last_hidden_states, axis=1)
print("Word Embeddings:", word_embeddings.numpy())


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on,

Tokens: ['word', 'em', '##bed', '##ding', '##s', 'using', 'bert', '.']
Token IDs: [2773, 7861, 8270, 4667, 2015, 2478, 14324, 1012]
Padded Token IDs: [2773, 7861, 8270, 4667, 2015, 2478, 14324, 1012, 0, 0]
Word Embeddings: [[-7.23682204e-03  1.10996164e-01  1.52019158e-01  1.08070590e-01
   4.31602955e-01 -3.79892550e-02 -3.94995451e-01 -1.73615545e-01
  -8.43743235e-03 -7.29412854e-01 -7.39203840e-02  1.00656137e-01
  -3.63527805e-01 -3.33960682e-01  3.05299938e-01  8.01726222e-01
   1.11784190e-02 -1.24266848e-01 -2.93703347e-01 -2.75049537e-01
   6.13774657e-01  1.87817905e-02 -3.40012074e-01  5.20234942e-01
   8.05857927e-02 -6.64361492e-02  4.44877923e-01 -2.73937106e-01
  -5.56288898e-01 -3.79304349e-01  7.81603277e-01  1.47958308e-01
  -1.35932520e-01  8.39748010e-02 -1.02879003e-01 -2.19926551e-01
   3.84890884e-01 -2.43756846e-02  1.48159698e-01 -1.04576029e-01
  -4.64921325e-01 -2.28148818e-01 -2.65608784e-02 -1.29067451e-01
   3.33342671e-01 -1.68773368e-01  2.15550378e-01 -