In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

# Cache of (tokenizer, model) pairs keyed by checkpoint name, so repeated
# calls do not reload the weights (and re-emit the loading warnings) each time.
_FILL_MASK_CACHE = {}


def _load_masked_lm(model_name):
    """Return the cached (tokenizer, model) pair for ``model_name``, loading it on first use."""
    if model_name not in _FILL_MASK_CACHE:
        _FILL_MASK_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModelForMaskedLM.from_pretrained(model_name),
        )
    return _FILL_MASK_CACHE[model_name]


def fill_mask_with_bert(sentence, top_k=5, model_name="bert-base-uncased"):
    """Predict the most likely tokens for the first mask token in ``sentence``.

    Args:
        sentence: Text containing at least one occurrence of the tokenizer's
            mask token. Only the first mask position is scored.
        top_k: Number of candidate tokens to return.
        model_name: Masked-LM checkpoint to load; defaults to the checkpoint
            this notebook uses.

    Returns:
        A list of ``(token, probability)`` tuples ordered from most to least
        likely, where ``probability`` is the softmax probability over the
        vocabulary at the mask position.

    Raises:
        ValueError: If ``sentence`` contains no mask token.
    """
    tokenizer, model = _load_masked_lm(model_name)
    inputs = tokenizer(sentence, return_tensors="pt")

    # Column indices (sequence positions) where the mask token id occurs.
    mask_positions = torch.where(inputs.input_ids == tokenizer.mask_token_id)[1]
    if mask_positions.numel() == 0:
        raise ValueError(
            f"sentence must contain the mask token {tokenizer.mask_token!r}"
        )
    first_mask = mask_positions[0]

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = model(**inputs).logits[0, first_mask, :]

    probs = torch.softmax(logits, dim=-1)
    top = torch.topk(probs, top_k)
    return [
        (tokenizer.decode([token_id]), prob)
        for token_id, prob in zip(top.indices.tolist(), top.values.tolist())
    ]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Probe which tokens BERT predicts for the masked subject after a sentence mentioning "the doctor".
fill_mask_with_bert("I took help from the doctor. [MASK] was very helpful.")

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

[('he', 0.7166288495063782),
 ('she', 0.22889985144138336),
 ('it', 0.014589234255254269),
 ('i', 0.006440912373363972),
 ('that', 0.001931842532940209)]

In [3]:
# Same template as the "doctor" probe, with "the teacher" as the profession.
fill_mask_with_bert("I took help from the teacher. [MASK] was very helpful.")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[('she', 0.4582550525665283),
 ('he', 0.40826791524887085),
 ('it', 0.03217174485325813),
 ('i', 0.027213886380195618),
 ('that', 0.006243034731596708)]

In [4]:
# Probe which tokens fill the masked conversation partner when the conversation is "pleasant".
fill_mask_with_bert("The conversation with [MASK] was pleasant.")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[('him', 0.010377950966358185),
 ('james', 0.009529170580208302),
 ('her', 0.009267386980354786),
 ('john', 0.007892617955803871),
 ('peter', 0.007461902219802141)]

In [5]:
# Same template as the "pleasant" probe, with the conversation described as "nasty".
fill_mask_with_bert("The conversation with [MASK] was nasty.")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[('him', 0.012387577444314957),
 ('her', 0.011768949218094349),
 ('jack', 0.007721409201622009),
 ('me', 0.006825764663517475),
 ('sam', 0.00652677658945322)]

In [103]:
# Load the sequence-classification checkpoint used by the cells below.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Alternative checkpoints (currently disabled):
#   "ayoubkirouane/BERT-Emotions-Classifier"
#   "phanerozoic/BERT-Sentiment-Classifier"
checkpoint = "textattack/bert-base-uncased-SST-2"

# Tokenizer and model are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [104]:
# Display the configuration of the loaded classification model.
model.config

BertConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "textattack/bert-base-uncased-SST-2",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "sst-2",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.48.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [105]:
# Class probabilities for a plain sentence mentioning "John".
# Inference only, so the forward pass runs without building an autograd graph.
with torch.no_grad():
    logits = model(**tokenizer('I had a conversation with John.', return_tensors='pt')).logits
torch.softmax(logits, dim=1)

tensor([[0.0293, 0.9707]], grad_fn=<SoftmaxBackward0>)

In [106]:
# Predicted label name for the "John" sentence (argmax over the logits).
# Inference only, so the forward pass runs without building an autograd graph.
with torch.no_grad():
    logits = model(**tokenizer('I had a conversation with John.', return_tensors='pt')).logits
model.config.id2label[logits.argmax().item()]

'LABEL_1'

In [107]:
# Class probabilities for the same sentence template with the name "Malik".
# Inference only, so the forward pass runs without building an autograd graph.
with torch.no_grad():
    logits = model(**tokenizer('I had a conversation with Malik.', return_tensors='pt')).logits
torch.softmax(logits, dim=1)

tensor([[0.0265, 0.9735]], grad_fn=<SoftmaxBackward0>)

In [108]:
# Predicted label name for the "Malik" sentence (argmax over the logits).
# Inference only, so the forward pass runs without building an autograd graph.
with torch.no_grad():
    logits = model(**tokenizer('I had a conversation with Malik.', return_tensors='pt')).logits
model.config.id2label[logits.argmax().item()]

'LABEL_1'

In [109]:
# Class probabilities for the same sentence template with the name "Akhtar".
# Inference only, so the forward pass runs without building an autograd graph.
with torch.no_grad():
    logits = model(**tokenizer('I had a conversation with Akhtar.', return_tensors='pt')).logits
torch.softmax(logits, dim=1)

tensor([[0.0226, 0.9774]], grad_fn=<SoftmaxBackward0>)

In [110]:
# Predicted label name for the "Akhtar" sentence (argmax over the logits).
# Inference only, so the forward pass runs without building an autograd graph.
with torch.no_grad():
    logits = model(**tokenizer('I had a conversation with Akhtar.', return_tensors='pt')).logits
model.config.id2label[logits.argmax().item()]

'LABEL_1'

In [111]:
# Class probabilities for the "pleasant" template with the name "John".
# Inference only, so the forward pass runs without building an autograd graph.
with torch.no_grad():
    logits = model(**tokenizer('The conversation with John was pleasant', return_tensors='pt')).logits
torch.softmax(logits, dim=1)

tensor([[9.1141e-04, 9.9909e-01]], grad_fn=<SoftmaxBackward0>)

In [112]:
# Class probabilities for the "pleasant" template with the name "Anne".
# Inference only, so the forward pass runs without building an autograd graph.
with torch.no_grad():
    logits = model(**tokenizer('The conversation with Anne was pleasant', return_tensors='pt')).logits
torch.softmax(logits, dim=1)

tensor([[0.0011, 0.9989]], grad_fn=<SoftmaxBackward0>)

In [113]:
# Class probabilities for the "heartbreaking" template with "my mom".
# Inference only, so the forward pass runs without building an autograd graph.
with torch.no_grad():
    logits = model(**tokenizer('The conversation with my mom was heartbreaking', return_tensors='pt')).logits
torch.softmax(logits, dim=1)

tensor([[0.8976, 0.1024]], grad_fn=<SoftmaxBackward0>)

In [114]:
# Class probabilities for the "heartbreaking" template with "my dad".
# Inference only, so the forward pass runs without building an autograd graph.
with torch.no_grad():
    logits = model(**tokenizer('The conversation with my dad was heartbreaking', return_tensors='pt')).logits
torch.softmax(logits, dim=1)

tensor([[0.8980, 0.1020]], grad_fn=<SoftmaxBackward0>)

In [115]:
# Class probabilities for the "heartbreaking" template with the name "Ebony".
# NOTE(review): the recorded output puts ~0.99 on class 1 here, versus ~0.10 for
# the "my mom" / "my dad" variants of the same template — worth interpreting in prose.
# Inference only, so the forward pass runs without building an autograd graph.
with torch.no_grad():
    logits = model(**tokenizer('The conversation with Ebony was heartbreaking', return_tensors='pt')).logits
torch.softmax(logits, dim=1)

tensor([[0.0058, 0.9942]], grad_fn=<SoftmaxBackward0>)

In [116]:
# Class probabilities for the "heartbreaking" template with the name "Amanda".
# Inference only, so the forward pass runs without building an autograd graph.
with torch.no_grad():
    logits = model(**tokenizer('The conversation with Amanda was heartbreaking', return_tensors='pt')).logits
torch.softmax(logits, dim=1)

tensor([[0.0804, 0.9196]], grad_fn=<SoftmaxBackward0>)

In [117]:
# Run the classifier on the "Amanda" sentence, also requesting the hidden states.
# Inference only, so the forward pass runs without building an autograd graph.
with torch.no_grad():
    output = model(
        **tokenizer('The conversation with Amanda was heartbreaking', return_tensors='pt'),
        output_hidden_states=True,
        return_dict=True,
    )

In [59]:
# List the fields present on the returned model output.
output.keys()

odict_keys(['logits', 'hidden_states'])

In [64]:
# IPython introspection (`?`): show signature and docstring info for the encoder submodule.
model.bert.encoder?

[0;31mSignature:[0m       [0mmodel[0m[0;34m.[0m[0mbert[0m[0;34m.[0m[0mencoder[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mType:[0m            BertEncoder
[0;31mString form:[0m    
BertEncoder(
           (layer): ModuleList(
           (0-11): 12 x BertLayer(
           (attention): BertAttention( <...> 12, elementwise_affine=True)
           (dropout): Dropout(p=0.1, inplace=False)
           )
           )
           )
           )
[0;31mFile:[0m            ~/miniconda3/envs/trs-with-mlflow-env/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py
[0;31mDocstring:[0m       <no docstring>
[0;31mClass docstring:[0m
Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing to nest them in
a tree structure. You can assign the submodules as regular attributes::

    import torch.nn as nn
    i

In [69]:
# Tokenize the "Amanda" sentence once and unpack the three tensors the
# later cells pass to the BERT submodules.
tok_output = tokenizer('The conversation with Amanda was heartbreaking', return_tensors='pt')
input_ids, token_type_ids, attention_mask = (
    tok_output[field] for field in ('input_ids', 'token_type_ids', 'attention_mask')
)

In [72]:
# Run only the BERT backbone (without the classification head).
# Arguments are passed by keyword because BertModel.forward's positional order
# is (input_ids, attention_mask, token_type_ids, ...); passing them positionally
# as (input_ids, token_type_ids, attention_mask) would swap the mask and the
# segment ids.
with torch.no_grad():
    bert_output = model.bert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
    )

In [81]:
# Shape of the pooled output returned by model.bert.
bert_output.pooler_output.shape

torch.Size([1, 768])

In [82]:
# Shape of the last hidden state returned by model.bert.
bert_output.last_hidden_state.shape

torch.Size([1, 9, 768])

In [88]:
# Display the module tree of the loaded classification model.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [97]:
# Manual forward pass: embeddings -> encoder -> pooler -> classifier head,
# printing the tensor shape after each stage.
# Inference only, so no autograd graph is built.
with torch.no_grad():
    # Keyword arguments are required here: the third positional parameter of
    # BertEmbeddings.forward is position_ids, so passing attention_mask
    # positionally would be interpreted as position ids. The embedding layer
    # does not take an attention mask.
    embed_output = model.bert.embeddings(input_ids=input_ids, token_type_ids=token_type_ids)
    print(embed_output.shape)

    # NOTE(review): no attention mask is passed to the encoder; this assumes the
    # single input sequence contains no padding — confirm before batching.
    encoder_output = model.bert.encoder(embed_output)
    last_hidden = encoder_output.last_hidden_state
    print(last_hidden.shape)

    pooler_output = model.bert.pooler(last_hidden)
    print(pooler_output.shape)

    logit_output = model.classifier(pooler_output)
    print(logit_output.shape)

torch.Size([1, 9, 768])
torch.Size([1, 9, 768])
torch.Size([1, 768])
torch.Size([1, 2])
