In [1]:
import torch
from transformers import BertTokenizerFast, BertModel, AutoConfig

# BERT Model

In [9]:
# Checkpoint shared by every tokenizer and model created in this notebook.
model_name = "bert-base-multilingual-cased"

# The tokenizer is configured to pad on the left.
tokenizer = BertTokenizerFast.from_pretrained(model_name, padding_side='left')

# Load the pretrained encoder and put it in evaluation mode. `eval()` returns
# the module itself, so the trailing expression still displays the architecture.
model = BertModel.from_pretrained(model_name).eval()
model

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [3]:
# Inference only: run the forward pass without building the autograd graph,
# so the returned tensors carry no grad_fn and no activations are kept alive.
with torch.no_grad():
    y = model(**tokenizer(["probando probando","hola que tal"], return_tensors='pt', padding=True))
y

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.1423, -0.0459, -0.0771,  ...,  0.3477, -0.0751, -0.1170],
         [-0.3144,  0.0404, -0.3393,  ...,  0.6715, -0.4224, -0.4781],
         [ 0.2161, -0.2109,  0.4894,  ...,  0.7159,  0.0963, -0.4333],
         ...,
         [ 0.1972, -0.2422,  0.3471,  ...,  0.7266,  0.0117, -0.4138],
         [ 0.1139, -0.3894,  0.7626,  ...,  0.1696, -0.1764, -0.3322],
         [ 0.0406, -0.2014,  0.0605,  ...,  0.2703, -0.0686, -0.1638]],

        [[-0.0048,  0.0708,  0.1901,  ...,  0.3257,  0.0934,  0.0146],
         [-0.5124, -0.1598,  0.4778,  ...,  0.3348,  0.1937,  0.2799],
         [ 0.1374,  0.0236,  0.9351,  ..., -0.1258, -0.0433, -0.1082],
         ...,
         [-0.0373, -0.0464,  0.6379,  ...,  0.5038,  0.0580,  0.1065],
         [-0.0285, -0.0796,  0.0996,  ...,  0.3706,  0.0652,  0.1131],
         [-0.0240, -0.0737,  0.2475,  ...,  0.4629, -0.0723,  0.0723]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_ou

In [4]:
# List the fields present in the model output.
dict(y).keys()

dict_keys(['last_hidden_state', 'pooler_output'])

In [5]:
# Shape of the per-token hidden states: (batch, tokens, hidden).
y.last_hidden_state.shape

torch.Size([2, 8, 768])

In [6]:
# Shape of the pooled, one-vector-per-sentence output: (batch, hidden).
y.pooler_output.shape

torch.Size([2, 768])

# BERT Embeddings

In [7]:
# Load a second, independent copy of the checkpoint. Its encoder and pooler
# are replaced further down, so `model` itself stays an unmodified BERT.
model_lstm = BertModel.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Check that the tokenizer pads on the left:

In [None]:
# Expected to be 'left', as requested when the tokenizer was created.
tokenizer.padding_side

'left'

In [None]:
# Pad both sentences to the tokenizer's maximum length to inspect where the
# padding tokens are placed.
tokenizer(
    ["probando probando", "hola que tal"],
    return_tensors='pt',
    padding='max_length',
)

{'input_ids': tensor([[    0,     0,     0,  ..., 20791, 10133,   102],
        [    0,     0,     0,  ..., 10121, 13675,   102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]])}

Text (B=batch_size)-> Tokenize (B,T) -> Embeddings (B,T,H=emb_dim)

H equals...

In [10]:
# Embedding width H of the checkpoint.
model_lstm.config.hidden_size

768

T equals the maximum number of positions, because every input is padded to `max_length`:

In [11]:
# Number of position embeddings, used below as the fixed sequence length T.
model_lstm.config.max_position_embeddings

512

We are going to fix the sequence length T to the maximum number of position embeddings (by padding every input to `max_length`) so that a single linear layer can pool all the hidden states produced by the LSTM.

(B,T,H) -> lstm (B,T,D*hidden_size) (D is 2 if bidirectional, 1 otherwise)

In [12]:
# Hyperparameters of the LSTM that replaces the transformer encoder.
num_layers = 2          # number of stacked LSTM layers
dropout = 0             # dropout between stacked layers (disabled)
bidirectional = True    # process the sequence in both directions (D = 2)
hidden_size = 768       # LSTM hidden units per direction

# Adapter that lets an LSTM stand in for the encoder of a BertModel.
class BertLSTM(torch.nn.LSTM):
    """LSTM whose ``forward`` tolerates the extra arguments an encoder receives.

    Only the first positional argument is forwarded to ``torch.nn.LSTM``;
    everything else (for example an attention mask) is silently discarded,
    so padded positions are processed like any other time step.
    """

    def forward(self, x, *args, **kwargs):
        """Run the LSTM on ``x`` and ignore every other argument.

        Args:
            x: Input tensor handed to ``torch.nn.LSTM.forward``.
            *args: Ignored.
            **kwargs: Ignored.

        Returns:
            Whatever ``torch.nn.LSTM.forward`` returns for ``x``: the tuple
            ``(output, (h_n, c_n))``.
        """
        lstm_result = super().forward(x)
        return lstm_result


# Build the recurrent encoder. The input width is read from `model_lstm`,
# the model this LSTM is plugged into, rather than from the unrelated `model`.
lstm = BertLSTM(
    input_size=model_lstm.config.hidden_size,  # must match the embedding width H
    hidden_size=hidden_size,
    num_layers=num_layers,
    batch_first=True,  # inputs are laid out as (B, T, H)
    dropout=dropout,
    bidirectional=bidirectional,
)

(B,T,D*hidden_size) -> flatten (B,T\*D\*hidden_size) -> linear (B, out_features)

In [13]:
out_features = 768

# Flattening (B, T, D*hidden_size) yields T * D * hidden_size features per
# sample, with T fixed to the maximum number of positions.
num_directions = 2 if bidirectional else 1
pooler_in_features = model_lstm.config.max_position_embeddings * hidden_size * num_directions

# Flatten -> Linear -> Tanh, producing one vector of size out_features per sample.
pooler = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(pooler_in_features, out_features, bias=True),
    torch.nn.Tanh(),
)

In [14]:
# Replace the pooling head with the flatten + linear pooler ...
model_lstm.pooler = pooler
# ... and the transformer stack with the LSTM. The embeddings are kept as loaded.
model_lstm.encoder = lstm

In [15]:
# Display the modified architecture to confirm the encoder and pooler were replaced.
model_lstm

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertLSTM(768, 768, num_layers=2, batch_first=True, bidirectional=True)
  (pooler): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=786432, out_features=768, bias=True)
    (2): Tanh()
  )
)

Let's check it works

In [16]:
# Two inputs of different lengths for the smoke test.
text = [
    "Prueba de funcionamiento",
    "hola",
]

In [17]:
# Sanity check only, so skip autograd bookkeeping. Every input is padded to
# max_length because the pooler's linear layer needs a fixed sequence length.
# With return_dict=False the model returns a plain tuple; its first two items
# are inspected below.
with torch.no_grad():
    y = model_lstm(**tokenizer(text, return_tensors='pt', padding='max_length'), return_dict=False)
[k.shape for k in y[:2]]

[torch.Size([2, 512, 1536]), torch.Size([2, 768])]