In [1]:
# Instantiate a task-specific model using pipeline

from transformers import pipeline

camembert_fill_mask = pipeline("fill-mask", model="camembert-base")
results = camembert_fill_mask("Le camembert est <mask> :)")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
results

[{'score': 0.4909117817878723,
  'token': 7200,
  'token_str': 'délicieux',
  'sequence': 'Le camembert est délicieux :)'},
 {'score': 0.10556931048631668,
  'token': 2183,
  'token_str': 'excellent',
  'sequence': 'Le camembert est excellent :)'},
 {'score': 0.03453318774700165,
  'token': 26202,
  'token_str': 'succulent',
  'sequence': 'Le camembert est succulent :)'},
 {'score': 0.03303121030330658,
  'token': 528,
  'token_str': 'meilleur',
  'sequence': 'Le camembert est meilleur :)'},
 {'score': 0.03007640689611435,
  'token': 1654,
  'token_str': 'parfait',
  'sequence': 'Le camembert est parfait :)'}]

In [3]:
# instantiate a model directly from the model architecture
from transformers import CamembertTokenizer, CamembertForMaskedLM
import torch

tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertForMaskedLM.from_pretrained("camembert-base")

# use the model to predict a masked token
input_ids = tokenizer("Le camembert est <mask> :)", padding=True, return_tensors="pt")  # Batch size 1
output = model(**input_ids)
logits = output.logits

# get the predicted token
predicted_index = torch.argmax(logits[0, -1, :]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
# assert predicted_token == "fromage"
print(predicted_token)




</s>NOTUSED


In [4]:
# use the model to predict a masked token
input_ids = tokenizer("Le camembert est <mask> :)", padding=True, return_tensors="pt")  # Batch size 1
output = model(**input_ids)
logits = output.logits
logits

tensor([[[ 19.9792,  -4.5944,   7.1252,  ...,  -6.3529,  -3.5433,   1.3889],
         [  0.8480,  -3.5700,  10.2163,  ...,  -9.4812,  -1.8443,  -1.3015],
         [ -0.2291,  -9.3549,  -0.0379,  ..., -22.2491, -10.7558,  -2.2051],
         ...,
         [ -2.2079,  -3.1100,   0.8896,  ...,  -6.0595,  -2.9762,  -4.5024],
         [ -1.2716,  -3.9497,   8.1755,  ...,  -2.3597,  -9.8304,  -2.0070],
         [  5.3539,  -5.8456,  22.9419,  ...,  -8.5553,  -5.7814,   2.1215]]],
       grad_fn=<ViewBackward0>)

In [10]:
input_ids = tokenizer.encode("Le camembert est <mask> :)")
input_ids

[5, 54, 730, 25543, 110, 30, 32004, 4522, 6]

In [11]:
input_ids = tokenizer.encode("Le camembert est <mask> :)", add_special_tokens=True)
input_ids

[5, 54, 730, 25543, 110, 30, 32004, 4522, 6]

In [12]:
input_ids = tokenizer.encode("Le camembert est <mask> :)", return_tensors="pt")
input_ids

tensor([[    5,    54,   730, 25543,   110,    30, 32004,  4522,     6]])

In [22]:
input_ids = tokenizer("Le camembert est <mask> :)", return_tensors="pt")
input_ids

{'input_ids': tensor([[    5,    54,   730, 25543,   110,    30, 32004,  4522,     6]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [13]:
output = model(input_ids)
output

MaskedLMOutput(loss=None, logits=tensor([[[ 19.9792,  -4.5944,   7.1252,  ...,  -6.3529,  -3.5433,   1.3889],
         [  0.8480,  -3.5700,  10.2163,  ...,  -9.4812,  -1.8443,  -1.3015],
         [ -0.2291,  -9.3549,  -0.0379,  ..., -22.2491, -10.7558,  -2.2051],
         ...,
         [ -2.2079,  -3.1100,   0.8896,  ...,  -6.0595,  -2.9762,  -4.5024],
         [ -1.2716,  -3.9497,   8.1755,  ...,  -2.3597,  -9.8304,  -2.0070],
         [  5.3539,  -5.8456,  22.9419,  ...,  -8.5553,  -5.7814,   2.1215]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

In [16]:
input_ids = tokenizer("Le camembert est <mask> :)", return_tensors="pt")
input_ids

{'input_ids': tensor([[    5,    54,   730, 25543,   110,    30, 32004,  4522,     6]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [17]:
output = model(**input_ids)
output

MaskedLMOutput(loss=None, logits=tensor([[[ 19.9792,  -4.5944,   7.1252,  ...,  -6.3529,  -3.5433,   1.3889],
         [  0.8480,  -3.5700,  10.2163,  ...,  -9.4812,  -1.8443,  -1.3015],
         [ -0.2291,  -9.3549,  -0.0379,  ..., -22.2491, -10.7558,  -2.2051],
         ...,
         [ -2.2079,  -3.1100,   0.8896,  ...,  -6.0595,  -2.9762,  -4.5024],
         [ -1.2716,  -3.9497,   8.1755,  ...,  -2.3597,  -9.8304,  -2.0070],
         [  5.3539,  -5.8456,  22.9419,  ...,  -8.5553,  -5.7814,   2.1215]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

In [21]:
# get the predicted token
predicted_index = torch.argmax(output.logits[0, -1, :]).item()
probs = torch.softmax(output.logits[0, -1, :], dim=0)
print(f"probs: ", probs)
values, indices = torch.topk(probs, 5)
print(f"values: ", values)
print(f"indices: ", indices)

predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
print(f"predicted_index: {predicted_index}")
print(f"predicted_token: {predicted_token}")

probs:  tensor([2.2994e-08, 3.1457e-13, 9.9999e-01,  ..., 2.0938e-14, 3.3545e-13,
        9.0744e-10], grad_fn=<SoftmaxBackward0>)
values:  tensor([9.9999e-01, 1.3571e-06, 1.1593e-06, 5.1620e-07, 4.9516e-07],
       grad_fn=<TopkBackward0>)
indices:  tensor([ 2, 83,  8, 43, 38])
predicted_index: 2
predicted_token: </s>NOTUSED


In [20]:
inputs = tokenizer("Le camembert est <mask> :)", add_special_tokens=True, return_tensors="pt")
inputs

{'input_ids': tensor([[    5,    54,   730, 25543,   110,    30, 32004,  4522,     6]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [22]:
logits = model(**inputs)[0] # The last hidden-state is the first element of the output tuple
logits

tensor([[[ 19.9792,  -4.5944,   7.1252,  ...,  -6.3529,  -3.5433,   1.3889],
         [  0.8480,  -3.5700,  10.2163,  ...,  -9.4812,  -1.8443,  -1.3015],
         [ -0.2291,  -9.3549,  -0.0379,  ..., -22.2491, -10.7558,  -2.2051],
         ...,
         [ -2.2079,  -3.1100,   0.8896,  ...,  -6.0595,  -2.9762,  -4.5024],
         [ -1.2716,  -3.9497,   8.1755,  ...,  -2.3597,  -9.8304,  -2.0070],
         [  5.3539,  -5.8456,  22.9419,  ...,  -8.5553,  -5.7814,   2.1215]]],
       grad_fn=<ViewBackward0>)

In [9]:
logits.size()

torch.Size([1, 9, 32005])

In [32]:
output = model(**inputs)
output

MaskedLMOutput(loss=None, logits=tensor([[[ 19.9792,  -4.5944,   7.1252,  ...,  -6.3529,  -3.5433,   1.3889],
         [  0.8480,  -3.5700,  10.2163,  ...,  -9.4812,  -1.8443,  -1.3015],
         [ -0.2291,  -9.3549,  -0.0379,  ..., -22.2491, -10.7558,  -2.2051],
         ...,
         [ -2.2079,  -3.1100,   0.8896,  ...,  -6.0595,  -2.9762,  -4.5024],
         [ -1.2716,  -3.9497,   8.1755,  ...,  -2.3597,  -9.8304,  -2.0070],
         [  5.3539,  -5.8456,  22.9419,  ...,  -8.5553,  -5.7814,   2.1215]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

In [24]:
logits.size() # Model outputs a MLM object containing the loss and logits

AttributeError: 'MaskedLMOutput' object has no attribute 'size'

In [25]:
inputs.input_ids

tensor([[    5,    54,   730, 25543,   110,    30, 32004,  4522,     6]])

In [28]:
# get the logits out of the MLM object
logits = model(**input_ids)[0]
masked_index = (inputs.input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item() # squeeze() removes the batch dimension and converts to scalar
masked_index

6

In [42]:
# get the logits out of the MLM object
logits = output.logits
logits

tensor([[[ 19.9792,  -4.5944,   7.1252,  ...,  -6.3529,  -3.5433,   1.3889],
         [  0.8480,  -3.5700,  10.2163,  ...,  -9.4812,  -1.8443,  -1.3015],
         [ -0.2291,  -9.3549,  -0.0379,  ..., -22.2491, -10.7558,  -2.2051],
         ...,
         [ -2.2079,  -3.1100,   0.8896,  ...,  -6.0595,  -2.9762,  -4.5024],
         [ -1.2716,  -3.9497,   8.1755,  ...,  -2.3597,  -9.8304,  -2.0070],
         [  5.3539,  -5.8456,  22.9419,  ...,  -8.5553,  -5.7814,   2.1215]]],
       grad_fn=<ViewBackward0>)

In [43]:
# get the logits for the masked index
logits_mask_token = logits[0, masked_index, :] # logits for the masked index
logits_mask_token

tensor([-2.2079, -3.1100,  0.8896,  ..., -6.0595, -2.9762, -4.5024],
       grad_fn=<SliceBackward0>)

In [44]:
print(f"mask token logits size: ", logits_mask_token.size())
print(f"mask token logits shape: ", logits_mask_token.shape)

mask token logits size:  torch.Size([32005])
mask token logits shape:  torch.Size([32005])


In [52]:
# get the probabilities for the masked token logits
prob = logits_mask_token.softmax(dim=0)
prob

tensor([2.3250e-08, 9.4327e-09, 5.1481e-07,  ..., 4.9397e-10, 1.0783e-08,
        2.3438e-09], grad_fn=<SoftmaxBackward0>)

In [54]:
# get the top 5 probabilities and indices
values, indices = prob.topk(5)
values, indices

(tensor([0.4909, 0.1056, 0.0345, 0.0330, 0.0301], grad_fn=<TopkBackward0>),
 tensor([ 7200,  2183, 26202,   528,  1654]))

In [59]:
top_pred_token = tokenizer.convert_ids_to_tokens(indices[0].item())
top_pred_token

'▁délicieux'

In [57]:
# get the top 5 predicted tokens
top_pred_tokens = tokenizer.convert_ids_to_tokens(indices)
top_pred_tokens

['▁délicieux', '▁excellent', '▁succulent', '▁meilleur', '▁parfait']

In [30]:
import torch

from transformers import CamembertForMaskedLM
from transformers import CamembertTokenizer


def fill_mask(masked_input, model, tokenizer, topk=5):
    # Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py
    assert masked_input.count("<mask>") == 1
    input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    logits = model(input_ids)[0]  # The last hidden-state is the first element of the output tuple
    masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item()
    print (f"masked_index: {masked_index}")
    logits = logits[0, masked_index, :]
    prob = logits.softmax(dim=0)
    values, indices = prob.topk(k=topk, dim=0)
    topk_predicted_token_bpe = " ".join(
        [tokenizer.convert_ids_to_tokens(indices[i].item()) for i in range(len(indices))]
    )
    masked_token = tokenizer.mask_token
    topk_filled_outputs = []
    for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(" ")):
        predicted_token = predicted_token_bpe.replace("\u2581", " ")
        if " {0}".format(masked_token) in masked_input:
            topk_filled_outputs.append(
                (
                    masked_input.replace(" {0}".format(masked_token), predicted_token),
                    values[index].item(),
                    predicted_token,
                )
            )
        else:
            topk_filled_outputs.append(
                (masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,)
            )
    return topk_filled_outputs


tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertForMaskedLM.from_pretrained("camembert-base")
model.eval()

masked_input = "Le camembert est <mask> :)"
print(fill_mask(masked_input, model, tokenizer, topk=3))

masked_index: 6
[('Le camembert est délicieux :)', 0.4909117817878723, ' délicieux'), ('Le camembert est excellent :)', 0.10556931048631668, ' excellent'), ('Le camembert est succulent :)', 0.03453318774700165, ' succulent')]
