In [1]:
from transformers import AutoTokenizer, XLMRobertaForMaskedLM
import torch

# Checkpoint shared by the tokenizer and the masked-LM head.
MODEL_NAME = "xlm-roberta-base"

# Prefer the first GPU, but fall back to CPU so the notebook still runs on
# machines without CUDA instead of failing while moving the model.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = XLMRobertaForMaskedLM.from_pretrained(MODEL_NAME).to(DEVICE)

In [None]:
# Echo the loaded model so the notebook renders its repr as the cell output.
model

In [None]:
# Two masked prompts of different lengths; padding=True pads them into one batch.
sentences = [
    'William Henry Gates created the company <mask> .',
    '周恩来出生于<mask>']

# Put the batch on whatever device the model currently lives on rather than
# hard-coding "cuda:0", so inputs and weights can never end up on different devices.
encoded_inputs = tokenizer(sentences, padding=True, return_tensors="pt").to(model.device)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    output = model(**encoded_inputs)
logits = output.logits

# Display the full model output as the cell result.
output

In [None]:
# test for xlmr.get_mask_tokens function - with padding
sentences = [
    'William Henry Gates created the company <mask>.',
    '周恩来出生于<mask>, 生育三子二女, 无疾而终']

# Keep the batch on the same device as the model instead of hard-coding "cuda:0".
encoded_inputs = tokenizer(sentences, padding=True, return_tensors="pt").to(model.device)
with torch.no_grad():
    logits = model(**encoded_inputs).logits

# Number of consecutive positions (starting at the first mask token) to predict,
# and how many candidate tokens to keep per position.
NUM_MASK_POSITIONS = 1
TOP_K = 1

# preds[i] holds one list of top-k token strings per predicted position of sentence i.
preds = []
for i in range(len(sentences)):
    tokens = tokenizer.convert_ids_to_tokens(encoded_inputs["input_ids"][i].tolist())
    # Look the mask up via the tokenizer's own mask token rather than a literal
    # string; list.index raises ValueError if the sentence has no mask.
    mask_ind = tokens.index(tokenizer.mask_token)
    masked_tokens = [
        tokenizer.convert_ids_to_tokens(
            torch.topk(logits[i][mask_ind + j], TOP_K).indices.tolist()
        )
        for j in range(NUM_MASK_POSITIONS)
    ]
    preds.append(masked_tokens)
print(preds)

In [None]:
# Tokenize a single sentence and place it on the model's device so it can be
# fed to the model directly.
# NOTE(review): this sentence contains no `<mask>` token, so the mask lookup in a
# later cell will come back empty — confirm which sentence is intended here.
inputs = tokenizer(
    "John Vincent Atanasoff works in the field of mathematics",
    return_tensors="pt",
).to(model.device)
print(tokenizer.convert_ids_to_tokens(inputs.input_ids[0].tolist()))

# Run the forward pass here (it was previously commented out) so that the
# `output` / `logits` read by later cells belong to *this* `inputs` rather than
# to a batch left over from an earlier cell.
with torch.no_grad():
    output = model(**inputs)
logits = output.logits

In [5]:
# Show the current `output` object (a MaskedLMOutput in the saved result below).
# NOTE(review): `output` is assigned in an earlier cell, so what is shown here
# depends on which cell ran last — confirm under Restart & Run All.
output

MaskedLMOutput(loss=None, logits=tensor([[[60.8371, -0.1726, 45.7986,  ..., 28.6289, 16.6448, 23.7296],
         [29.2734, -1.4505, 72.7367,  ..., 46.4081, 14.4296, 35.9122],
         [20.4163, -0.9104, 49.2314,  ..., 33.0455, 14.6394, 24.9723],
         ...,
         [27.0913, -1.6071, 67.9122,  ..., 54.2438, 18.9646, 38.2035],
         [17.8382, -0.8148, 50.2230,  ..., 32.0012, 14.5591, 24.8543],
         [37.7306, -0.3880, 55.0519,  ..., 34.0709, 18.2874, 27.1745]]]), hidden_states=None, attentions=None)

In [20]:
# Positions in the first sequence of `inputs` whose id is the tokenizer's mask id.
is_mask = inputs["input_ids"][0] == tokenizer.mask_token_id
mask_token_index = torch.nonzero(is_mask, as_tuple=True)[0]

# Highest-scoring vocabulary id at each of those positions, decoded back to text.
mask_logits = logits[0, mask_token_index]
predicted_token_id = mask_logits.argmax(dim=-1)
tokenizer.decode(predicted_token_id)

'...'

In [22]:
# Token ids of the fully spelled-out sentence, used as the labels.
label_encoding = tokenizer("The capital of France is Paris", return_tensors="pt")
labels = label_encoding["input_ids"]

# Print the current `inputs` tokens, the raw label ids, and the label tokens
# so the two sequences can be compared by eye.
input_row = inputs.input_ids[0].tolist()
label_row = labels[0].tolist()
print(tokenizer.convert_ids_to_tokens(input_row))
print(labels)
print(tokenizer.convert_ids_to_tokens(label_row))

['<s>', '▁The', '▁capital', '▁of', '▁France', '▁is', '<mask>', '</s>']
tensor([[    0,   581, 10323,   111,  9942,    83,  7270,     2]])
['<s>', '▁The', '▁capital', '▁of', '▁France', '▁is', '▁Paris', '</s>']


In [23]:
# mask labels of non-<mask> tokens
# Keep the label id only where the input holds the mask token; every other
# position is set to -100. `labels` is first moved next to `input_ids` so
# torch.where never mixes devices.
input_ids = inputs["input_ids"]
labels = torch.where(
    input_ids == tokenizer.mask_token_id,
    labels.to(input_ids.device),
    -100,
)

# Move everything onto the model's device before the forward pass; `inputs`
# itself is left untouched so re-running earlier cells is unaffected.
model_inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

# Only the scalar loss is read, so no autograd graph is needed.
with torch.no_grad():
    outputs = model(**model_inputs, labels=labels.to(model.device))
round(outputs.loss.item(), 2)

1.86