In [1]:
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

In [2]:
# One checkpoint name feeds both loaders so the tokenizer and the model cannot drift apart.
CHECKPOINT = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForMaskedLM.from_pretrained(CHECKPOINT)

# Put the model in evaluation mode. eval() returns the module itself, so as the
# last expression of the cell it is echoed as the cell output.
model.eval()

DistilBertForMaskedLM(
  (activation): GELUActivation()
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inp

In [5]:
import pandas as pd
import numpy as np

In [7]:
# Path is relative to the notebook's working directory; the "text" column of this
# file is what gets fed to the mask predictor.
MASKED_CSV_PATH = "data/masked_s_gold_BUG.csv"
df = pd.read_csv(MASKED_CSV_PATH)

In [10]:
def predict_masked_sent(text, top_k=5):
    """Print and return the most probable fillers for the first [MASK] in ``text``.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        text: Sentence containing a ``[MASK]`` placeholder. It is wrapped in
            ``[CLS]`` / ``[SEP]`` here, so pass the bare sentence.
        top_k: Number of candidates to report. Values larger than the size of
            the model's output distribution are clamped instead of making
            ``torch.topk`` raise.

    Returns:
        A list of ``(token, probability)`` tuples ordered from most to least
        probable. The same candidates are printed, one per line, in the
        format ``[MASK]: '<token>'  | weights: <probability>``.

    Raises:
        ValueError: If no ``[MASK]`` token is present after tokenization
            (e.g. ``text`` is a missing value rather than a sentence).
    """
    mask_token = "[MASK]"

    # Tokenize input. The one-element tuple keeps %-formatting safe even if
    # ``text`` itself happens to be a tuple.
    wrapped = "[CLS] %s [SEP]" % (text,)
    tokenized_text = tokenizer.tokenize(wrapped)
    if mask_token not in tokenized_text:
        raise ValueError(
            "No %s token found in input: %r" % (mask_token, text)
        )
    masked_index = tokenized_text.index(mask_token)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Build the batch on whatever device the model lives on, so the function
    # works unchanged whether or not the model has been moved to a GPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        tokens_tensor = torch.tensor([indexed_tokens], device=first_param.device)
    else:
        tokens_tensor = torch.tensor([indexed_tokens])

    # Predict all tokens; the first output element holds the per-position logits.
    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]

    probs = torch.nn.functional.softmax(predictions[0, masked_index], dim=-1)
    k = min(top_k, probs.shape[-1])
    top_k_weights, top_k_indices = torch.topk(probs, k, sorted=True)

    results = []
    for token_weight, pred_idx in zip(top_k_weights.tolist(), top_k_indices.tolist()):
        predicted_token = tokenizer.convert_ids_to_tokens([pred_idx])[0]
        print("[MASK]: '%s'"%predicted_token, " | weights:", token_weight)
        results.append((predicted_token, token_weight))
    return results

# Echo each of the first 20 sentences, followed by its top-5 mask candidates.
for sentence in df["text"].head(20).tolist():
    print(sentence)
    predict_masked_sent(sentence, top_k=5)

Among them was the president [MASK] .
[MASK]: 'himself'  | weights: 0.048793725669384
[MASK]: 'marcos'  | weights: 0.012270461767911911
[MASK]: 'vargas'  | weights: 0.010119077749550343
[MASK]: 'diaz'  | weights: 0.009520509280264378
[MASK]: 'fernandez'  | weights: 0.00824071280658245
Results In the pre - COVID era , an average aesthetic surgeon was finely balancing [MASK] profession , personal lifestyle , learning , and recreation .
[MASK]: 'in'  | weights: 0.13020457327365875
[MASK]: 'aesthetic'  | weights: 0.08425050973892212
[MASK]: 'for'  | weights: 0.07207341492176056
[MASK]: ':'  | weights: 0.06453636288642883
[MASK]: 'through'  | weights: 0.06102780997753143
Peter ( B23 ) is a musician and music editor who has been using [MASK] MacBook Pro and a specialist application ( MainStage ) to emulate " the sounds of pro keyboards like Roland RD pianos and synths when playing live " .
[MASK]: 'apple'  | weights: 0.29093122482299805
[MASK]: 'a'  | weights: 0.18409289419651031
[MASK]: 'th