In [1]:
import torch
import argparse
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import yaml
from argparse import ArgumentParser

from transformer.Models import Transformer
from transformer.Translator import Translator

In [2]:
##### Read Arguments from Config File #####
# Loads the YAML experiment config and unpacks it into module-level names
# used by the following cells. Exactly one config_path should be active.

# config_path = '../configs/dpng_transformer_bert_tokenizer.yaml'
# config_path = '../configs/dpng_transformer_bert_tokenizer_bow.yaml'
config_path = '../configs/dpng_transformer_bert_tokenizer_bow_indivtopk.yaml'
# config_path = '../configs/dpng_transformer_bert_tokenizer_bow_indivtopk_onlybow.yaml'

with open(config_path) as f:
    config = yaml.load(f, Loader=yaml.FullLoader)
    print(config)

# Paths and dataset selection.
save_model_path = config['save_model_path']
output_file = config['test_output_file']
use_dataset = config['dataset']

# Fixed to 1 here rather than read from the config.
batch_size = 1

# Model dimensions.
d_model = config['d_model']
d_inner_hid = config['d_inner_hid']
d_k = config['d_k']
d_v = config['d_v']

n_head = config['n_head']
n_layers = config['n_layers']
n_warmup_steps = config['n_warmup_steps']

dropout = config['dropout']
embs_share_weight = config['embs_share_weight']
proj_share_weight = config['proj_share_weight']
label_smoothing = config['label_smoothing']

# Split sizes handed to the dataset constructor.
train_size = config['train_size']
val_size = config['val_size']
test_size = config['test_size']

# Decoding settings; hard-coded in this notebook, not part of the config.
beam_size = 3
max_seq_len = 30

# Bag-of-words options. Only the 'is_bow' and 'only_bow' keys are optional.
# When BoW is enabled, its sub-keys are required: a missing key now raises
# KeyError instead of silently turning BoW off.
is_bow = config.get('is_bow', False)

# Placeholders so the names always exist, even when BoW is disabled.
bow_strategy = None
topk = None
indiv_topk = None

if is_bow:
    bow_strategy = config['bow_strategy']
    topk = config['topk']
    if bow_strategy != 'simple_sum':
        indiv_topk = config['indiv_topk']
    else:
        # not used but use default value for simplicity
        indiv_topk = 50

only_bow = config.get('only_bow', False)
# ###################

{'save_model_path': '../models/DNPG_base_transformer_bert_tokenizer_bert_bow_indivtopk.pth', 'log_file': '../logs/DNPG_base_transformer_bert_tokenizer_training_bert_bow_indiv_topk.txt', 'test_output_file': '../outputs/test_DNPG_transformer_bert_tokenizer_bow_indivtopk_out.txt', 'val_output_file': '../outputs/val_DNPG_transformer_bert_tokenizer_bow_indivtopk_out.txt', 'dataset': 'quora_bert_mask_predict_dataset', 'num_epochs': 50, 'batch_size': 100, 'd_model': 450, 'd_inner_hid': 512, 'd_k': 50, 'd_v': 50, 'n_head': 9, 'n_layers': 3, 'n_warmup_steps': 8000, 'dropout': 0.1, 'embs_share_weight': True, 'proj_share_weight': True, 'label_smoothing': False, 'train_size': 100000, 'val_size': 4000, 'test_size': 20000, 'is_bow': True, 'bow_strategy': 'indiv_topk', 'indiv_topk': 10, 'topk': 50, 'lr': '5e-4'}


In [3]:
# Choose which dataset class is bound to the name `Dataset`.
# With `preprocessed` set, the preprocessed-dataset class is always used;
# otherwise the class is picked from the config's `dataset` value.
preprocessed = True

if preprocessed:
    from datasets.quora_preprocessed_dataset import QuoraPreprocessedDataset as Dataset
elif use_dataset == 'quora_dataset':
    from datasets.quora_dataset import QuoraDataset as Dataset
elif use_dataset == 'quora_bert_dataset':
    from datasets.quora_bert_dataset import QuoraBertDataset as Dataset
elif use_dataset == 'quora_bert_mask_predict_dataset':
    from datasets.quora_bert_mask_predict_dataset import QuoraBertMaskPredictDataset as Dataset
elif use_dataset == 'quora_word_mask_prediction_dataset':
    from datasets.quora_word_mask_prediction_dataset import QuoraWordMaskPredictDataset as Dataset
else:
    # Unknown dataset name in the config.
    raise NotImplementedError("Dataset is not defined or not implemented")
        



In [4]:
def create_mini_batch(samples):
    """Collate a list of samples into two zero-padded batch tensors.

    Args:
        samples: sequence of samples; element 0 and element 1 of each sample
            are the tensors to batch (any further elements are ignored).

    Returns:
        A tuple ``(seq1_batch, seq2_batch)``; each is the result of
        ``pad_sequence(..., batch_first=True)`` over the corresponding
        element of every sample, padded with zeros to the longest one.
    """
    first_seqs = [sample[0] for sample in samples]
    second_seqs = [sample[1] for sample in samples]

    # pad_sequence pads with 0 by default; batch_first puts the batch on dim 0.
    return (
        pad_sequence(first_seqs, batch_first=True),
        pad_sequence(second_seqs, batch_first=True),
    )


# Build the test split and wrap it in a DataLoader.
if preprocessed:
    # The preprocessed file is named after the config file. Strip the last
    # extension instead of slicing a fixed 5 characters, so suffixes other
    # than '.yaml' (e.g. '.yml') also yield the right name.
    model_name = config_path.split('/')[-1].rsplit('.', 1)[0]
    preprocessed_file = '../data/preprocess_all_{}.npy'.format(model_name)
    dataset = Dataset("test", train_size, val_size, test_size, preprocessed_file=preprocessed_file)
elif is_bow:
    dataset = Dataset("test", train_size, val_size, test_size, bow_strategy=bow_strategy, topk=topk, indiv_topk=indiv_topk, only_bow=only_bow)
else:
    dataset = Dataset("test", train_size, val_size, test_size)

# shuffle=False keeps the iteration order fixed across runs.
data_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=create_mini_batch, shuffle=False)

In [5]:
# Build the Transformer from the config values and load the trained weights.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

transformer = Transformer(
    dataset.n_words,
    dataset.n_words,
    src_pad_idx=dataset.PAD_token_id,
    trg_pad_idx=dataset.PAD_token_id,
    trg_emb_prj_weight_sharing=proj_share_weight,
    emb_src_trg_weight_sharing=embs_share_weight,
    d_k=d_k,
    d_v=d_v,
    d_model=d_model,
    d_word_vec=d_model,
    d_inner=d_inner_hid,
    n_layers=n_layers,
    n_head=n_head,
    dropout=dropout
)

model = transformer.to(device)

# This notebook only runs inference: switch to eval mode so dropout is
# disabled and the forward passes below are deterministic.
model.eval()

# map_location lets a checkpoint saved on GPU load on a CPU-only machine.
# Kept as the last expression so the cell still shows the key-matching result.
model.load_state_dict(
    torch.load(save_model_path, map_location=device))

<All keys matched successfully>

In [6]:
# Special-token ids taken from the dataset; source and target use the same
# padding id.
src_pad_idx, trg_pad_idx = dataset.PAD_token_id, dataset.PAD_token_id
trg_bos_idx, trg_eos_idx = dataset.SOS_token_id, dataset.EOS_token_id
unk_idx = dataset.UNK_token_id

In [8]:
# Run one forward pass on the first batch of the test loader.
seq1, seq2 = next(iter(data_loader))
src_seq = seq1.to(device)
# Drop the last position along dim 1 of the target batch
# (presumably the decoder input for teacher forcing — TODO confirm).
trg_seq = seq2[:, :-1].to(device)

# Inference only: skip autograd bookkeeping to save memory.
with torch.no_grad():
    pred = model(src_seq, trg_seq)

In [10]:
# Show only the shape; the recorded output below is a torch.Size, and
# displaying the full tensor would dump the whole array.
pred.shape

torch.Size([10, 28996])

In [None]:
# NOTE(review): BertModel, BertTokenizer and head_view are not imported in the
# visible part of this notebook — presumably they come from `transformers`
# and `bertviz`; add the imports to the imports cell before running.

def show_head_view(model, tokenizer, sentence_a, sentence_b=None):
    """Encode one or two sentences, run the model and pass the result to head_view.

    Args:
        model: callable accepting ``input_ids`` (and optionally
            ``token_type_ids``); the last element of its output is forwarded
            to ``head_view`` (presumably the attention weights, given
            ``output_attentions=True`` at load time — TODO confirm).
        tokenizer: object providing ``encode_plus`` and ``convert_ids_to_tokens``.
        sentence_a: first sentence.
        sentence_b: optional second sentence; when falsy, only ``sentence_a``
            is fed to the model.
    """
    inputs = tokenizer.encode_plus(sentence_a, sentence_b, return_tensors='pt', add_special_tokens=True)
    input_ids = inputs['input_ids']
    if sentence_b:
        token_type_ids = inputs['token_type_ids']
        attention = model(input_ids, token_type_ids=token_type_ids)[-1]
        # Position of the first token whose type id is 1 in the first row.
        sentence_b_start = token_type_ids[0].tolist().index(1)
    else:
        attention = model(input_ids)[-1]
        sentence_b_start = None
    input_id_list = input_ids[0].tolist() # Batch index 0
    tokens = tokenizer.convert_ids_to_tokens(input_id_list)
    head_view(attention, tokens, sentence_b_start)


# The function is defined above its call so the cell runs on a fresh kernel.
model_version = 'bert-base-uncased'
do_lower_case = True
# NOTE(review): this rebinds `model`, which earlier cells use for the
# Transformer; re-run the model-loading cell before reusing those cells.
model = BertModel.from_pretrained(model_version, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)
sentence_a = "The cat sat on the mat"
sentence_b = "The cat lay on the rug"
show_head_view(model, tokenizer, sentence_a, sentence_b)