### 1. Load the HuggingFace BERT reference model and its tokenizer for the tests, variant: bert-base-uncased

In [7]:
import os

import torch
from transformers import BertTokenizer, BertModel

In [8]:
# Load the pretrained HuggingFace reference model together with its matching tokenizer.
model_name = 'bert-base-uncased'
hf_tokenizer = BertTokenizer.from_pretrained(model_name)
hf_bert_model = BertModel.from_pretrained(model_name) # weights are now loaded!

# Count every scalar parameter of the reference model.
modelparams = sum(p.numel() for p in hf_bert_model.parameters())
print(f"Number of model parameters (original): {modelparams}")
# Enforce the expected size instead of relying on a visual check of the printed number.
assert modelparams == 109_482_240, f"unexpected parameter count: {modelparams}"

hf_bert_model.eval()   # set model to evaluation mode (dropouts deactivated for inference)

Number of model parameters (original): 109482240


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

### 2. Tokenization of the text corpus

In [9]:
# Three sentences of clearly different length, so that padding shows up in the batch.
texts = [
    "Hello, this is a sample sentence for BERT embedding generation.",
    "A short test sentence",
    "Available fruits: bananas, pineapples, oranges, apples, melons, cherries, strawberries",
]

# Tokenize all sentences as one batch of PyTorch tensors, padded to the longest
# sentence and truncated at 512 tokens.
tokendata = hf_tokenizer(
    texts,
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=512,
)

# Print every field returned by the tokenizer, one "name : tensor" entry per field.
print("tokendata:\n==========\n")
for k, v in tokendata.items():
    print(k, ":", v)

tokendata:

input_ids : tensor([[  101,  7592,  1010,  2023,  2003,  1037,  7099,  6251,  2005, 14324,
          7861,  8270,  4667,  4245,  1012,   102,     0,     0,     0,     0,
             0,     0,     0,     0],
        [  101,  1037,  2460,  3231,  6251,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0],
        [  101,  2800, 10962,  1024, 26191,  1010,  7222, 23804,  2015,  1010,
          4589,  2015,  1010, 18108,  1010, 11463,  5644,  1010, 24188,  5134,
          1010, 13137, 20968,   102]])
token_type_ids : tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask : tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 

### 3. Text inference and embedding generation,  transformer output structure

In [10]:
# Run the HuggingFace reference model on the tokenized batch.
# Gradient tracking is switched off because this is inference only.
with torch.no_grad():
    outputs = hf_bert_model(**tokendata)
# List the fields contained in the model output.
print(dict(outputs).keys())

dict_keys(['last_hidden_state', 'pooler_output'])


In [11]:
# The sequence embedding is the final hidden state of the CLS token, i.e. position 0
# of every sequence in the batch.
# This representation is trained during pretraining.
outputs.last_hidden_state[:, 0, :]

tensor([[-0.2303, -0.4416, -0.2611,  ..., -0.7063, -0.0347,  0.6870],
        [-0.1313, -0.1393, -0.3229,  ..., -0.0999,  0.0072,  0.2108],
        [-0.2609, -0.1264, -0.3669,  ..., -0.1132,  0.1674,  0.1579]])

In [12]:
# Shown for reference only; the pooled output is not needed here.
# Its main purpose is next sentence prediction.
outputs.pooler_output

tensor([[-0.8546, -0.5405, -0.8914,  ..., -0.8892, -0.6927,  0.8527],
        [-0.6780, -0.1119,  0.6431,  ...,  0.3919, -0.5026,  0.6840],
        [-0.8838, -0.5582, -0.9850,  ..., -0.9025, -0.7519,  0.8843]])

# Now instantiating the BERTKit model

### 4. Loading the BERTKit configuration, model, tokenizer and weight loader
In contrast to the HF BERT model, there is no `.from_pretrained` method that instantiates the model together with its correct weights. Instead, we instantiate the layer structure in exactly the same way, with the weights left uninitialized. Then we explicitly load the weights with a small tool from the model.bin file provided by HF. It is important that the naming of the layers and the function hierarchy remain unaltered for the moment, because the names of the stored layers follow exactly that scheme. Another degree of freedom in our version is the explicit injection of the overall model configuration. In HF, everything is done implicitly during the loading phase, whereas our procedure gives us much more control, although we strictly adhere to the architecture in order to be able to ingest the pretrained weights. When the BERT model is loaded from HF and the configuration printed we obtain:

In [13]:
from bertkit_modules.bert_config import BertConfig
from bertkit_modules.bert_model import BertModel
from bertkit_modules.weightloader import WeightLoader
from bertkit_tokenizer.bertkit_tokenizer import BertKitTokenizer

Because some features have been eliminated, a few variables in the config class are not needed:

In [14]:
# Explicit model configuration for the BERTKit model.
# The values mirror the bert-base-uncased architecture shown in the model printouts.
bert_config = {
  "attention_probs_dropout_prob": 0.1, # dropout rate; presumably applied to the attention probabilities - TODO confirm
  "chunk_size_feed_forward": 0, # presumably 0 means no feed-forward chunking - TODO confirm against BertConfig
  "classifier_dropout": None, # no classifier head appears in this notebook; TODO confirm whether BertConfig still reads it
  "hidden_act": "gelu", # name of the activation function
  "hidden_dropout_prob": 0.1, # dropout rate; matches Dropout(p=0.1) in the model printout
  "hidden_size": 768, # model width; matches the 768-dim embeddings and Linear layers in the printout
  "initializer_range": 0.02, # presumably the weight-init scale; pretrained weights are loaded afterwards - TODO confirm
  "intermediate_size": 3072, # presumably the inner width of the feed-forward block - TODO confirm
  "layer_norm_eps": 1e-12, # matches eps=1e-12 of the LayerNorm modules in the printout
  "max_position_embeddings": 512, # matches position_embeddings: Embedding(512, 768)
  "num_attention_heads": 12, # number of attention heads per layer
  "num_hidden_layers": 12, # matches "12 x BertLayer" in the printout
  "pad_token_id": 0, # matches padding_idx=0 of the word embeddings
  "type_vocab_size": 2, # matches token_type_embeddings: Embedding(2, 768)
  "output_attentions": True, # request the attention tensors in the model output
  "output_hidden_states": True, # request the hidden-state tensors in the model output
  "vocab_size": 30522 # matches word_embeddings: Embedding(30522, 768)
}
# Unpack the dict into keyword arguments of the BERTKit config class.
config = BertConfig(**bert_config)

In [15]:
# NOTE: `BertModel` here is the BERTKit class imported from bertkit_modules.bert_model.
# It shadows the HuggingFace class of the same name imported at the top of the notebook.
bk_bert_model = BertModel(config)   # bk = BERTKit
bk_tokenizer = BertKitTokenizer("./bertkit_tokenizer/vocab.txt")

# Count every scalar parameter of the BERTKit model.
modelparams = sum(p.numel() for p in bk_bert_model.parameters())
print(f"Number of model parameters (BERTKit): {modelparams}")
# The BERTKit model must have exactly the size of the HuggingFace reference model;
# enforce it instead of relying on a visual check of the printed number.
assert modelparams == 109_482_240, f"unexpected parameter count: {modelparams}"

bk_bert_model.eval()   # set model to evaluation mode (dropouts deactivated for inference)

----> len vocab: 30522
Number of model parameters (BERTKit): 109482240


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

### 5. Loading pretrained BERT weights from LFS repo into model. By using a weight loader we have full control over namings. Some layer name adjustments had to be done in order to load weights correctly.

In [16]:
device = "cpu"

# Location of the downloaded HuggingFace `pytorch_model.bin` checkpoint.
# Set the BERT_WEIGHTS_PATH environment variable to point at your own download;
# if it is unset, the path used when this notebook was last executed is the fallback.
weight_path = os.environ.get(
    "BERT_WEIGHTS_PATH",
    "/home/markus/data/bert_base_uncased/pytorch_model.bin",
)
# Fail early with an actionable message rather than somewhere inside the loader.
if not os.path.isfile(weight_path):
    raise FileNotFoundError(
        f"BERT weight file not found: {weight_path!r}. "
        "Set BERT_WEIGHTS_PATH to the location of your downloaded pytorch_model.bin."
    )

weight_loader = WeightLoader(strict=True, verbose=True)
state_dict = weight_loader.load_pytorch_bin(weight_path)

# NOTE(review): the two helpers below are underscore-prefixed (private) WeightLoader
# methods; confirm whether a public entry point exists.
# The checkpoint keys carry a "bert." prefix (visible in the loader log); strip it.
new_state_dict = weight_loader._apply_prefix_mapping(state_dict, prefix_mapping={"bert.":""})
# The checkpoint stores LayerNorm parameters as gamma/beta (visible in the loader log);
# presumably this renames them to the names the model expects - TODO confirm.
new_state_dict = weight_loader._map_layernorm_keys(new_state_dict)

weight_loader.layer_wise_loading(bk_bert_model, new_state_dict)


Loading weights from: /home/markus/data/bert_base_uncased/pytorch_model.bin
Found 207 parameters in weight file
  bert.embeddings.word_embeddings.weight: torch.Size([30522, 768]) (torch.float32)
  bert.embeddings.position_embeddings.weight: torch.Size([512, 768]) (torch.float32)
  bert.embeddings.token_type_embeddings.weight: torch.Size([2, 768]) (torch.float32)
  bert.embeddings.LayerNorm.gamma: torch.Size([768]) (torch.float32)
  bert.embeddings.LayerNorm.beta: torch.Size([768]) (torch.float32)
  bert.encoder.layer.0.attention.self.query.weight: torch.Size([768, 768]) (torch.float32)
  ... and 201 more parameters

Processing layer: embeddings.word_embeddings
  ✓ embeddings.word_embeddings.weight: torch.Size([30522, 768])
  Layer summary: 1/1 parameters loaded

Processing layer: embeddings.position_embeddings
  ✓ embeddings.position_embeddings.weight: torch.Size([512, 768])
  Layer summary: 1/1 parameters loaded

Processing layer: embeddings.token_type_embeddings
  ✓ embeddings.token_

### 6. Tokenizing with BERTKit tokenizer

In [17]:
# Tokenize the same `texts` with the BERTKit tokenizer.
# NOTE: this rebinds `tokendata`, replacing the HuggingFace tokenization created earlier.
tokendata = bk_tokenizer.tokenize(texts)

### 7. Text inference and embedding generation with the BERTKit model

In [18]:
# Run the BERTKit model on the tokenized batch, without gradient tracking (inference only).
# NOTE: this rebinds `outputs`, replacing the HuggingFace result computed earlier.
with torch.no_grad():
    outputs = bk_bert_model(**tokendata)   # here outputs is a pure dataclass
# List the output fields via the instance __dict__.
print(outputs.__dict__.keys())  # not directly addressable as a dict!

Position ids: tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23]])
shape toty embeddings: torch.Size([3, 24, 768])
shape embeddings: torch.Size([3, 24, 768])
shape word embeddings: torch.Size([3, 24, 768])
shape position embeddings: torch.Size([1, 24, 768])
embeddings shape: torch.Size([3, 24, 768])
type of encoder outputs:  <class 'tuple'> 3
dict_keys(['last_hidden_state', 'pooler_output', 'hidden_states', 'attentions'])


In [19]:
# Summarize the BERTKit output: tensor shapes of the two main outputs, and how many
# hidden-state and attention tensors were returned.
print(f"Last hidden states shape: {outputs.last_hidden_state.shape}")
print(f"Pooler output shape: {outputs.pooler_output.shape}")
# hidden_states and attentions are sequences of tensors, so report their length.
print(f"# of hidden states tensors: {len(outputs.hidden_states)}")
print(f"# of attention tensors: {len(outputs.attentions)}")

Last hidden states shape: torch.Size([3, 24, 768])
Pooler output shape: torch.Size([3, 768])
# of hidden states tensors: 13
# of attention tensors: 12


In [20]:
# The sequence embedding is the final hidden state of the CLS token, i.e. position 0
# of every sequence in the batch. This representation is trained during pretraining.
cls_embeddings = outputs.last_hidden_state[:, 0, :]

# Reference values produced by the HuggingFace model for the same three texts:
# the first and the last three components of each CLS embedding (printed to 4 decimals).
expected_cls_head = torch.tensor([[-0.2303, -0.4416, -0.2611],
                                  [-0.1313, -0.1393, -0.3229],
                                  [-0.2609, -0.1264, -0.3669]])
expected_cls_tail = torch.tensor([[-0.7063, -0.0347,  0.6870],
                                  [-0.0999,  0.0072,  0.2108],
                                  [-0.1132,  0.1674,  0.1579]])

# Check numerically instead of comparing the printout by eye. The tolerance is well
# above the rounding error of the 4-decimal reference values.
assert torch.allclose(cls_embeddings[:, :3], expected_cls_head, atol=1e-3), \
    "CLS embeddings deviate from the HuggingFace reference (leading components)"
assert torch.allclose(cls_embeddings[:, -3:], expected_cls_tail, atol=1e-3), \
    "CLS embeddings deviate from the HuggingFace reference (trailing components)"

cls_embeddings

tensor([[-0.2303, -0.4416, -0.2611,  ..., -0.7063, -0.0347,  0.6870],
        [-0.1313, -0.1393, -0.3229,  ..., -0.0999,  0.0072,  0.2108],
        [-0.2609, -0.1264, -0.3669,  ..., -0.1132,  0.1674,  0.1579]])

In [21]:
# Reference values produced by the HuggingFace model for the same three texts:
# the first and the last three components of each pooled output (printed to 4 decimals).
expected_pooler_head = torch.tensor([[-0.8546, -0.5405, -0.8914],
                                     [-0.6780, -0.1119,  0.6431],
                                     [-0.8838, -0.5582, -0.9850]])
expected_pooler_tail = torch.tensor([[-0.8892, -0.6927,  0.8527],
                                     [ 0.3919, -0.5026,  0.6840],
                                     [-0.9025, -0.7519,  0.8843]])

# Check numerically instead of comparing the printout by eye. The tolerance is well
# above the rounding error of the 4-decimal reference values.
assert torch.allclose(outputs.pooler_output[:, :3], expected_pooler_head, atol=1e-3), \
    "pooler output deviates from the HuggingFace reference (leading components)"
assert torch.allclose(outputs.pooler_output[:, -3:], expected_pooler_tail, atol=1e-3), \
    "pooler output deviates from the HuggingFace reference (trailing components)"

outputs.pooler_output

tensor([[-0.8546, -0.5405, -0.8914,  ..., -0.8892, -0.6927,  0.8527],
        [-0.6780, -0.1119,  0.6431,  ...,  0.3919, -0.5026,  0.6840],
        [-0.8838, -0.5582, -0.9850,  ..., -0.9025, -0.7519,  0.8843]])