## BERT Masked Word Prediction
Source: https://huggingface.co/transformers/quickstart.html

In [74]:
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel, BertForMaskedLM

In [54]:
# Name of the pretrained checkpoint used for both the tokenizer and the models below
pre_trained_weights = 'bert-base-chinese'
# Tokenizer matching the checkpoint above
tokenizer = BertTokenizer.from_pretrained(pre_trained_weights)

In [121]:
# Two-segment input ("[CLS] A [SEP] B [SEP]") with two [MASK] placeholders in the second segment
text = "[CLS] 你 会 说 西 班 牙 语 吗 ？ [SEP] 对 ， 我 是 在 [MASK] [MASK] 的 首 都 利 马 长 大 的  。 [SEP]"

## Tokenize input

In [122]:
# Split the raw string into tokens, then look up each token's vocabulary id
tokenized_text = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Positions of every [MASK] token in the sequence
masked_index = [pos for pos, token in enumerate(tokenized_text) if token == "[MASK]"]

# Segment ids: 0 for everything up to and including the first [SEP]
# (sentence A), 1 for everything after it (sentence B)
next_sent_start_ix = tokenized_text.index('[SEP]')
segments_ids = (
    [0] * (next_sent_start_ix + 1)
    + [1] * (len(tokenized_text) - next_sent_start_ix - 1)
)

# Wrap both id lists in a batch dimension of size 1
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [123]:
# Display the positions of the [MASK] tokens
masked_index

[16, 17]

## Use BertModel to encode our inputs in hidden-states:

In [342]:
# Optional demo: the masked-word prediction cell that follows does not depend on it.
model = BertModel.from_pretrained(pre_trained_weights)

# Evaluation mode switches off dropout, which is required for reproducible outputs.
model.eval()

# To run on a GPU, move the inputs and the model there first:
# tokens_tensor = tokens_tensor.to('cuda')
# segments_tensors = segments_tensors.to('cuda')
# model.to('cuda')

# Forward pass without gradient tracking (see the model docstrings for the inputs).
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)

# Transformers models return tuples; here the first element is the hidden
# state of the last layer of the Bert model.
encoded_layers = outputs[0]

# The encoding is a FloatTensor of shape (batch size, sequence length, model hidden dimension).
expected_shape = (1, len(indexed_tokens), model.config.hidden_size)
assert tuple(encoded_layers.shape) == expected_shape

In [343]:
# Standalone masked-word prediction; the BertModel cell above is not needed for this part.
top_k = 10

model = BertForMaskedLM.from_pretrained(pre_trained_weights)
model.eval()  # deactivate dropout so the predictions are reproducible
# If you have a GPU, put everything on cuda here, see above cell.

# Score every position, keep only the masked ones, and normalise each row
# into a probability distribution.
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]
    predictions_of_mask = predictions[0, masked_index]
    probs_for_mask = F.softmax(predictions_of_mask, dim=1)

# One top-k pass yields both the token ids and their probabilities.
top_predictions = torch.topk(probs_for_mask, k=top_k)
predicted_indices = top_predictions.indices
predicted_probs = top_predictions.values.numpy()
predicted_tokens = [tokenizer.convert_ids_to_tokens(tok) for tok in predicted_indices]

# One {token: probability} dict per [MASK] position, in sequence order.
[dict(zip(tokens, probs)) for tokens, probs in zip(predicted_tokens, predicted_probs)]

# Using BERT for Chinese Word Prediction

In [277]:
# Same checkpoint name and tokenizer setup as at the top of the notebook, repeated here
pre_trained_weights = 'bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(pre_trained_weights)

In [344]:
def tokenize_text(text, masklen=1):
    """Tokenize ``text`` and expand its first ``[MASK]`` into ``masklen`` masks.

    Args:
        text: Input string containing at least one ``[MASK]`` token.
        masklen: Number of ``[MASK]`` tokens that should stand in for the
            first ``[MASK]`` found in the tokenized text.

    Returns:
        A new list of tokens in which the first ``[MASK]`` has been replaced
        by ``masklen`` consecutive ``[MASK]`` tokens.

    Raises:
        ValueError: If the tokenized text contains no ``[MASK]`` token.
    """
    tokens = tokenizer.tokenize(text)
    mask_ix = tokens.index('[MASK]')
    # Replace the placeholder itself (a slice of width 1). Inserting in front
    # of it would leave the original mask in place and yield masklen + 1 masks.
    return tokens[:mask_ix] + ['[MASK]'] * masklen + tokens[mask_ix + 1:]


# Loaded masked-LM models keyed by weights name, so that repeated calls
# (one per beam candidate) do not reload the checkpoint every time.
_MASKED_LM_CACHE = {}


def _get_masked_lm(weights_name):
    """Return an eval-mode BertForMaskedLM for ``weights_name``, loading it at most once."""
    if weights_name not in _MASKED_LM_CACHE:
        masked_lm = BertForMaskedLM.from_pretrained(weights_name)
        masked_lm.eval()  # deactivate dropout for reproducible predictions
        _MASKED_LM_CACHE[weights_name] = masked_lm
    return _MASKED_LM_CACHE[weights_name]


def evaluate_one_word(tokenized_text, topk=4):
    """Predict the ``topk`` most likely tokens for the first ``[MASK]``.

    Args:
        tokenized_text: List of tokens containing at least one ``[MASK]`` and
            at least one ``[SEP]``. Tokens after the first ``[SEP]`` are
            assigned segment id 1, all others segment id 0.
        topk: Number of candidate tokens to return.

    Returns:
        A dict mapping the *negated* probability of each candidate to its
        token, for the first ``[MASK]`` only. Probabilities are negated so
        that an ascending sort puts the most likely candidate first. Because
        probabilities are the keys, candidates with exactly equal probability
        collapse into a single entry.

    Raises:
        ValueError: If ``tokenized_text`` has no ``[MASK]`` or no ``[SEP]``.
    """
    print(f"Number of [MASK]: {tokenized_text.count('[MASK]')}")
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    masked_index = [ix for ix, x in enumerate(tokenized_text) if x == "[MASK]"]
    if not masked_index:
        # Fail with a clear message instead of an IndexError further down.
        raise ValueError("tokenized_text must contain at least one '[MASK]' token")

    next_sent_start_ix = tokenized_text.index('[SEP]')
    segments_ids = (
        [0] * (next_sent_start_ix + 1)
        + [1] * (len(tokenized_text) - next_sent_start_ix - 1)
    )

    # Convert inputs to PyTorch tensors (batch dimension of size 1)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    model = _get_masked_lm(pre_trained_weights)
    with torch.no_grad():
        outputs = model(tokens_tensor, token_type_ids=segments_tensors)
        predictions = outputs[0][0, masked_index]
        probs_for_mask = F.softmax(predictions, dim=1)

    # A single top-k pass gives both the token ids and their probabilities.
    top = torch.topk(probs_for_mask, k=topk)
    predicted_probs = -1 * top.values.numpy()
    predicted_tokens = [tokenizer.convert_ids_to_tokens(tok) for tok in top.indices]
    # Row 0 corresponds to the first [MASK] in the sequence.
    return dict(zip(predicted_probs[0], predicted_tokens[0]))

def get_num_masks(tokenized_text):
    """Count how many ``[MASK]`` tokens appear in ``tokenized_text``."""
    mask_token = '[MASK]'
    n_masks = tokenized_text.count(mask_token)
    return n_masks


def update_probs(prb, new_prb):
    """Combine two scores by multiplying them and flipping the sign.

    With both inputs stored as negated probabilities, the result is again
    the negated product of the underlying probabilities.
    """
    flipped = -1 * prb
    return flipped * new_prb


def stepwise_beam_search(tokenized_text, beam_size=20, verbose=True):
    """Fill a run of consecutive ``[MASK]`` tokens left to right with beam search.

    At step ``length`` every surviving partial sequence of ``length - 1``
    tokens is written over the leading masks, the next mask is scored with
    ``evaluate_one_word``, and the ``beam_size`` best extensions are kept.

    Args:
        tokenized_text: List of tokens; the masks to fill are assumed to be
            consecutive, starting at the first ``[MASK]``. Not modified.
        beam_size: Number of candidates kept per step, also used as the
            number of candidates requested for each mask.
        verbose: When true, print the progress/debug output for each step.

    Returns:
        A dict mapping sequence length to a list of ``(score, tokens)``
        tuples sorted best first. Scores are negated probabilities (more
        negative is better); key ``0`` holds the empty seed ``(-1, [])``.
    """
    num_masks = get_num_masks(tokenized_text)
    eval_count = beam_size
    # Seed score of -1 is the negated probability 1, so the first product is neutral.
    best_of_len = {0: [(-1, [])]}
    # The first mask position is the same for every candidate; look it up once.
    mask_ix_start = tokenized_text.index('[MASK]') if num_masks else None
    for length in range(1, num_masks + 1):
        if verbose:
            print("For Loop no: ", length)
        for prb0, str0 in best_of_len[length - 1]:
            if verbose:
                print('best_of_len: ', best_of_len)
            tokenized_text_cp = tokenized_text.copy()
            # Overwrite the first length-1 masks with the partial sequence,
            # leaving the next mask as the one to be predicted.
            tokenized_text_cp[mask_ix_start:mask_ix_start + length - 1] = str0
            if verbose:
                print("Text before processing ", tokenized_text_cp)
            res = evaluate_one_word(tokenized_text_cp, topk=eval_count)
            updated_res = [(update_probs(prb, prb0), str0 + [char]) for prb, char in res.items()]
            best_of_len.setdefault(length, []).extend(updated_res)
            # Ascending sort on negated probabilities puts the most likely first.
            best_of_len[length] = sorted(best_of_len[length], key=lambda x: x[0])
        best_of_len[length] = best_of_len[length][:beam_size]
    return best_of_len


This code takes an input text of the form `"[CLS] <text1> [SEP] <text2> [SEP]"`, where `<text1>` and `<text2>` are space-separated Chinese characters and exactly one of those characters is `[MASK]`.
We replace that `[MASK]` with `mask_len` `[MASK]` tokens and then run a beam search to find the most likely `mask_len`-character sequence to fill the blank.

In [340]:
# Number of [MASK] tokens the single placeholder in `text` is expanded into
mask_len = 3
text = "[CLS] 把 台 上 几 个 原  本 羞 却 [MASK] 的 男 孩 们 炒 成 了 热 门 的 幕 间 演 出 乐 队 。 [SEP] 他 们 就 这 样 学 会 了 如 何 抓 住 持 续 增 长 的 听 众 。 [SEP]"
tokenized_text = tokenize_text(text, masklen=mask_len)
# Beam search over the expanded masks; the result maps each length to its ranked candidates
result = stepwise_beam_search(tokenized_text)
print(result)

In [111]:
# Code to Predict Just one
# predicted_index = torch.argmax(predictions[0, masked_index]).item()
# predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
