In [4]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
from transformers import BertTokenizer, BertForMaskedLM

from quora_bert_mask_predict_dataset import QuoraBertMaskPredictDataset

In [5]:
# Build the dataset used throughout this notebook from the Quora training text.
# NOTE(review): the meaning of the positional arguments 1000 and 100 is defined by
# QuoraBertMaskPredictDataset and is not visible here -- confirm before changing them.
#
# Other configurations tried during experimentation:
#   QuoraBertMaskPredictDataset("train", 1000, 100, text_path='../../data/quora_train.txt')
#   QuoraBertMaskPredictDataset("train", 124000, 100, text_path='../../data/quora_train.txt', bow_strategy='indiv_neighbors')
#   QuoraBertMaskPredictDataset("train", 1000, 100, text_path='../../data/quora_train.txt', bow_strategy='indiv_topk', only_bow=True, use_origin=True)
dataset = QuoraBertMaskPredictDataset(
    "train",
    1000,
    100,
    text_path='../../data/quora_train.txt',
    bow_strategy='indiv_topk',
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
def create_mini_batch(samples):
    """Collate dataset samples into two zero-padded batch tensors.

    Args:
        samples: sequence of samples; element 0 and element 1 of each sample
            are the tensors to batch. Any further elements (e.g. a
            bag-of-words tensor at index 2) are ignored.

    Returns:
        A tuple ``(seq1_batch, seq2_batch)``. Each is built with
        ``pad_sequence(..., batch_first=True)``, so shorter sequences are
        right-padded with zeros up to the longest sequence in that column.
    """
    def _pad_column(column):
        # Gather one field from every sample and pad it to a common length.
        return pad_sequence([sample[column] for sample in samples],
                            batch_first=True)

    return _pad_column(0), _pad_column(1)


# Predicting the masked tokens is slow.
# Possible improvement: run the prediction on the GPU, outside the dataset.
data_loader = DataLoader(
    dataset,
    batch_size=64,
    collate_fn=create_mini_batch,
)

In [19]:
def get_replaced_sentence(seq1, pred, strategy="sample", threshold=0.5):
    """Rebuild a sentence, replacing each inner token with a masked-LM prediction.

    Row ``i`` of ``pred`` is read at position ``i + 1``: it is expected to hold
    the vocabulary scores obtained when token ``i + 1`` of the sentence was
    masked. The first and last ids of ``seq1`` (the special begin/end tokens)
    are copied through unchanged.

    Args:
        seq1: 1-D tensor of token ids of the original sentence, including the
            leading and trailing special tokens.
        pred: tensor indexed as ``pred[i][i + 1]`` to obtain a 1-D vector of
            unnormalised vocabulary scores (presumably shaped
            ``(n_masked, seq_len, vocab)`` -- confirm against the model).
        strategy: how the replacement token is chosen at each position:
            ``"sample"`` (default) draws from the softmax distribution;
            ``"top1"`` always takes the most probable token;
            ``"top1_threshold"`` takes the most probable token only when its
            probability exceeds ``threshold`` and otherwise keeps the
            original token.
        threshold: probability cut-off used by ``"top1_threshold"``.

    Returns:
        1-D ``torch.long`` tensor: first id of ``seq1``, one chosen id per row
        of ``pred``, last id of ``seq1``.

    Raises:
        ValueError: if ``strategy`` is not one of the supported names.
    """
    strategies = ("sample", "top1", "top1_threshold")
    if strategy not in strategies:
        raise ValueError(
            "unknown strategy %r, expected one of %s" % (strategy, ", ".join(strategies))
        )

    # Keep the leading special token as-is.
    pred_ws = [seq1[0].item()]
    for i in range(pred.shape[0]):
        # Row i corresponds to the input whose token i + 1 was masked.
        probs = torch.softmax(pred[i][i + 1], dim=0)

        if strategy == "sample":
            w = torch.multinomial(probs, 1)[0].item()
        else:
            prob, idx = torch.topk(probs, 1)
            if strategy == "top1" or prob.item() > threshold:
                w = idx.item()
            else:
                # Not confident enough: fall back to the original token.
                w = seq1[i + 1].item()

        pred_ws.append(w)

    # Keep the trailing special token as-is.
    pred_ws.append(seq1[-1].item())
    return torch.tensor(pred_ws, dtype=torch.long)

In [20]:
# Pick one example to inspect. Each entry of dataset.sentences is split on tabs
# and only the first field is used as the sentence.
si = 61
raw_line = dataset.sentences[si]
seq = raw_line.split('\t')[0]
print(seq)

# Tokenize with the dataset's tokenizer and wrap the result in the dataset's
# start/end special tokens.
tokenizer = dataset.tokenizer
tokens = [dataset.SOS_token, *tokenizer.tokenize(seq), dataset.EOS_token]

# Token ids as a 1-D long tensor.
ids = torch.tensor(tokenizer.convert_tokens_to_ids(tokens), dtype=torch.long)

What if I hired two private eyes and ordered them to follow each other?


In [21]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [22]:
# One copy of the sentence per inner token; the first and last positions
# (the special tokens) are never masked.
inner_positions = range(1, len(ids) - 1)
mask_sentences = [ids.detach().clone() for _position in inner_positions]

# In copy k, overwrite exactly one token with the mask id.
for i, mask_seq in zip(inner_positions, mask_sentences):
    mask_seq[i] = dataset.MASK_token_id

# Stack the copies into a single batch and move it to the selected device.
mask_stack = torch.stack(mask_sentences).to(device)


In [23]:
# Score every single-mask variant of the sentence with the dataset's masked-LM.
mask_model = dataset.mask_predict_model
mask_model.eval()  # inference mode (e.g. disables dropout)

# Nothing in this notebook moves the model to `device`, so feed the batch to
# whichever device the model's weights actually live on; this avoids a
# device-mismatch error when `device` and the model disagree.
mask_model_device = next(mask_model.parameters()).device

with torch.no_grad():
    # [0] selects the first model output, which is used below as the
    # per-position vocabulary scores.
    pred = mask_model(mask_stack.to(mask_model_device))[0]
    # No attention_mask is passed: every row is a clone of the same sentence,
    # so the batch contains no padding. Variant with an explicit mask:
    #     pred = dataset.mask_predict_model(mask_stack, attention_mask=attention_mask)[0]

# Bring the scores back to the CPU for the sampling cells below.
pred = pred.cpu()



In [24]:
# Generate one rewritten sentence and show it under the original.
res = get_replaced_sentence(ids, pred)
res_tokens = tokenizer.convert_ids_to_tokens(res)

print(seq)
print(' '.join(res_tokens))

What if I hired two private eyes and ordered them to follow each other?
[CLS] What if he sent their wandering investigators and taught them to stalk each other ? [SEP]


In [16]:
# Softmax over dim 0, shared by the sampling cells below. They apply it to a
# single slice pred[i][j] (presumably a 1-D vocabulary score vector -- confirm)
# to obtain the probabilities passed to torch.multinomial.
softmax = torch.nn.Softmax(dim=0)

In [17]:
# Draw 10 independently resampled versions of the sentence and print each one
# as a list of tokens (special tokens are not included).
#
# The probabilities at each masked position do not change between draws, so
# they are computed once here instead of once per draw.
position_probs = [softmax(pred[i][i + 1]) for i in range(pred.shape[0])]

for _ in range(10):
    pred_ws = []
    for probs in position_probs:
        # One token id sampled from this position's distribution.
        sample = torch.multinomial(probs, 1)
        w = tokenizer.convert_ids_to_tokens(sample[0].item())
        pred_ws.append(w)
    print(pred_ws)

['What', 'if', 'he', 'held', 'two', 'steady', 'detectives', 'and', 'thought', 'them', 'to', 'see', 'each', 'other', '?']
['What', 'if', 'he', 'held', 'two', 'dogs', 'detectives', 'and', 'set', 'them', 'to', 'find', 'each', 'other', '?']
['What', 'if', 'I', 'called', 'two', 'calculating', 'investigators', 'and', 'wanted', 'them', 'to', 'see', 'each', 'other', '?']
['What', 'if', 'he', 'hid', 'their', 'little', 'detectives', 'and', 'had', 'them', 'to', 'copy', 'each', 'other', '?']
['What', 'if', 'he', 'opened', 'her', 'grey', 'investigators', 'and', 'asked', 'them', 'to', 'kill', 'each', 'other', '?']
['What', 'if', 'he', 'locked', 'his', 'card', 'investigators', 'and', 'told', 'them', 'to', 'get', 'each', 'other', '?']
['What', 'if', 'he', 'met', 'two', 'human', 'investigators', 'and', 'allowed', 'them', 'to', 'kill', 'each', 'other', '?']
['What', 'if', 'she', 'had', 'two', 'new', 'killers', 'and', 'told', 'them', 'to', 'kill', 'each', 'other', '?']
['What', 'if', 'he', 'removed', 'so

In [18]:
# Sample 20 replacement candidates for one masked slot: row 5 of `pred`, read
# at position 6 (judging by the printed sentence this is the slot of
# "private" -- confirm if the example sentence changes).
# The distribution is the same for every draw, so compute it once.
slot_probs = softmax(pred[5][6])

for i in range(20):
    sample = torch.multinomial(slot_probs, 1)
    out = tokenizer.convert_ids_to_tokens(sample[0].item())
    print(out)

evil
deep
dark
dark
evil
evil
private
blind
vampire
sort
blue
green
evil
exotic
cat
wandering
yellow
green
green
vampire
