In [1]:
# import tensorflow as tf
import torch
import numpy as np

import collections
import random

import pytorch_viz.tokenization as tokenization
import pytorch_viz.modeling as modeling
# from create_pretraining_data import TrainingInstance

In [2]:
class TrainingInstance(object):
  """Container for one pre-training example (sentence pair).

  Holds the token sequence, its per-token segment ids, the positions and
  labels of the masked tokens, and the "is random next" flag.
  """

  def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels,
               is_random_next):
    # Sequence-level data.
    self.tokens = tokens
    self.segment_ids = segment_ids
    # Masked-LM targets.
    self.masked_lm_positions = masked_lm_positions
    self.masked_lm_labels = masked_lm_labels
    # Sentence-pair flag.
    self.is_random_next = is_random_next

  def __str__(self):
    """Returns a multi-line dump: one "name: values" line per field, then a blank line."""
    lines = [
        "tokens: %s\n" % (" ".join(
            tokenization.printable_text(tok) for tok in self.tokens)),
        "segment_ids: %s\n" % (" ".join(str(seg) for seg in self.segment_ids)),
        "is_random_next: %s\n" % self.is_random_next,
        "masked_lm_positions: %s\n" % (" ".join(
            str(pos) for pos in self.masked_lm_positions)),
        "masked_lm_labels: %s\n" % (" ".join(
            tokenization.printable_text(lbl) for lbl in self.masked_lm_labels)),
        "\n",
    ]
    return "".join(lines)

  def __repr__(self):
    # The repr is deliberately the same text as str().
    return self.__str__()

    
# functions to parse data
def create_masked_lm_predictions_based_given(tokens, max_predictions_per_seq, segment_ids):
  """Creates masked-LM inputs from a token list that carries explicit mask markers.

  A token equal to '01' is a marker, not content: it is dropped from the
  output, and the token immediately following it is replaced by "[MASK]".
  The replaced token becomes the prediction label for that position. Every
  other token is copied through unchanged.

  Args:
    tokens: sequence of token strings, optionally containing '01' markers.
    max_predictions_per_seq: if more masks than this are produced, a warning
      is printed; the outputs are NOT truncated.
    segment_ids: sequence indexed in parallel with `tokens`.

  Returns:
    A tuple (output_tokens, masked_lm_positions, masked_lm_labels,
    segment_ids_new). Positions index into output_tokens (markers removed);
    segment_ids_new has one entry per output token, taken from the segment id
    of the source token (for a mask: the token that was replaced).

  Raises:
    IndexError: if a '01' marker is the last token, so there is nothing left
      to mask, or if `segment_ids` is shorter than `tokens`.
  """
  mask_marker = u'01'
  tokens_len = len(tokens)

  output_tokens = []
  masked_lm_positions = []
  masked_lm_labels = []
  segment_ids_new = []

  i = 0
  while i < tokens_len:
    tok = tokens[i]
    if tok == mask_marker:
      # Skip the marker itself; the token after it is the one being masked.
      i += 1
      if i >= tokens_len:
        raise IndexError(
            "mask marker '01' at position %d is the last token; "
            "there is no following token to mask" % (i - 1))
      # The mask lands at the next free slot of the output sequence.
      masked_lm_positions.append(len(output_tokens))
      output_tokens.append("[MASK]")
      masked_lm_labels.append(tokens[i])
    else:
      output_tokens.append(tok)
    # Both branches keep the segment id of the source token at index i.
    segment_ids_new.append(segment_ids[i])
    i += 1

  if len(masked_lm_positions) > max_predictions_per_seq:
    print ('too many masks')

  return (output_tokens, masked_lm_positions, masked_lm_labels, segment_ids_new)


def generate_example_given_instance(instance, tokenizer, max_seq_length,
                                    max_predictions_per_seq):
  """Converts a training instance into fixed-length model features.

  Args:
    instance: object exposing tokens, segment_ids, masked_lm_positions,
      masked_lm_labels and is_random_next.
    tokenizer: object exposing convert_tokens_to_ids(list_of_tokens).
    max_seq_length: length every per-token feature is zero-padded to.
    max_predictions_per_seq: length the masked-LM features are padded to.

  Returns:
    An OrderedDict with the keys input_ids, input_mask, segment_ids,
    masked_lm_positions, masked_lm_ids, masked_lm_weights,
    next_sentence_labels and masked_lm_labels. The last one is an array as
    long as input_ids holding -1 everywhere except at the masked positions,
    where it holds the label id.
  """
  input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
  input_mask = [1] * len(input_ids)
  segment_ids = list(instance.segment_ids)
  assert len(input_ids) <= max_seq_length

  # Zero-pad the three per-token lists by the same amount.
  seq_padding = [0] * (max_seq_length - len(input_ids))
  input_ids.extend(seq_padding)
  input_mask.extend(seq_padding)
  segment_ids.extend(seq_padding)

  assert len(input_ids) == max_seq_length
  assert len(input_mask) == max_seq_length
  assert len(segment_ids) == max_seq_length, "%d != %d"%(len(segment_ids),max_seq_length) 

  masked_lm_positions = list(instance.masked_lm_positions)
  masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
  masked_lm_weights = [1.0] * len(masked_lm_ids)

  # Pad the prediction slots; padded slots point at position 0 with weight 0.
  missing = max(0, max_predictions_per_seq - len(masked_lm_positions))
  masked_lm_positions.extend([0] * missing)
  masked_lm_ids.extend([0] * missing)
  masked_lm_weights.extend([0.0] * missing)

  # Per-position labels: -1 everywhere except at the masked positions.
  position_labels = -1 * np.ones_like(input_ids)
  position_labels[masked_lm_positions] = masked_lm_ids
  # Padded prediction slots wrote id 0 at position 0; reset that position.
  position_labels[0] = -1

  return collections.OrderedDict([
      ("input_ids", input_ids),
      ("input_mask", input_mask),
      ("segment_ids", segment_ids),
      ("masked_lm_positions", masked_lm_positions),
      ("masked_lm_ids", masked_lm_ids),
      ("masked_lm_weights", masked_lm_weights),
      ("next_sentence_labels", [1 if instance.is_random_next else 0]),
      ('masked_lm_labels', position_labels),
  ])

In [3]:
# ---- Input sentences ----
SENT_A = "i will omit the benadryl because he gets extremely fatigued."
SENT_B =  ""
IS_RANDOM_NEXT = True

# ---- Paths and run settings ----
VOCAB_FILE = '../uncased_L-12_H-768_A-12/model/model07/vocab.txt'
DO_LOWER_CASE = True
MAX_PREDICTIONS_PER_SEQ = 20
MAX_SEQ_LENGTH = 128
RECORD_FILE = '../tmp/pred.tfrecord'
BERT_CONFIG_FILE = '../uncased_L-12_H-768_A-12/model/model07/bert_config.json'
OUTPUT_DIR = '../tmp/pretraining_output'
INIT_CHECKPOINT_PT = '../uncased_L-12_H-768_A-12/model/model07/trained_model.bin'
INIT_CHECKPOINT_CLS = '../uncased_L-12_H-768_A-12/model/model07/cls.bin'
LEARNING_RATE = 2e-5
NUM_TRAIN_STEPS = 1
NUM_WARMUP_STEPS = 10
USE_TPU = False
BATCH_SIZE = 1

tokenizer = tokenization.BertTokenizer(
    vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)

# Tokenize both sentences (unicode-normalised and stripped first).
line = tokenization.convert_to_unicode(SENT_A).strip()
tokens_a = tokenizer.tokenize(line)
line = tokenization.convert_to_unicode(SENT_B).strip()
tokens_b = tokenizer.tokenize(line)
print (tokens_a)
print (tokens_b)

# Single-segment input: [CLS] + sentence A + [SEP], every token in segment 0.
# tokens_b is only printed above; it is not appended to the sequence.
tokens = ["[CLS]"] + list(tokens_a) + ["[SEP]"]
segment_ids = [0] * len(tokens)

# Turn any mask markers in the token list into [MASK] positions and labels.
tokens_all, masked_lm_positions, masked_lm_labels, segment_ids = (
    create_masked_lm_predictions_based_given(
        tokens, MAX_PREDICTIONS_PER_SEQ, segment_ids))

# Wrap everything in a training instance.
instance = TrainingInstance(
    tokens=tokens_all,
    segment_ids=segment_ids,
    masked_lm_positions=masked_lm_positions,
    masked_lm_labels=masked_lm_labels,
    is_random_next=IS_RANDOM_NEXT)

# Convert the instance into padded, fixed-length features.
features = generate_example_given_instance(
    instance, tokenizer, MAX_SEQ_LENGTH, MAX_PREDICTIONS_PER_SEQ)

print (features)

[u'i', u'will', u'om', u'##it', u'the', u'benadryl', u'because', u'he', u'gets', u'extremely', u'fatigue', u'##d', u'.']
[]
OrderedDict([('input_ids', [101, 1045, 2097, 18168, 4183, 1996, 223, 2138, 2002, 4152, 5186, 16342, 2094, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), ('input_mask', [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), ('segment_ids', 

In [4]:
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

# Everything in this notebook runs on the CPU.
device = torch.device("cpu")

# Pre-training model: build from the JSON config, then restore saved weights.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
bert_config = modeling.BertConfig(BERT_CONFIG_FILE)
model = modeling.BertForPreTraining(bert_config)
model.load_state_dict(torch.load(INIT_CHECKPOINT_PT, map_location='cpu'))
# model.bert.from_pretrained(INIT_DIRECTORY)
model.to(device)

# Separate 768 -> 2 linear head, restored from its own checkpoint.
cls = torch.nn.Linear(768, 2)
cls.load_state_dict(torch.load(INIT_CHECKPOINT_CLS, map_location='cpu'))
cls.to(device)

print ('loaded model')

#TODO resolve features
# Wrap each per-token feature list in a batch dimension of size 1.
all_input_ids, all_input_mask, all_input_type_ids = (
    torch.tensor([features[key]], dtype=torch.long)
    for key in ('input_ids', 'input_mask', 'segment_ids'))
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

# Sequential, one-example-per-batch loader over the single example.
eval_data = TensorDataset(all_input_ids, all_input_mask, all_input_type_ids, all_example_index)
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)

# Switch to eval mode; as the last expression this also displays the model.
model.eval()

loaded model


BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate): BertIntermediate(
    

In [6]:
# Inference only: disable autograd so no graph is built for the forward pass.
with torch.no_grad():
    for input_ids, input_mask, input_type_ids, example_indices in eval_dataloader:
        # Move every model input to the target device (token type ids included,
        # so the cell keeps working if `device` is changed away from the CPU).
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        input_type_ids = input_type_ids.to(device)

        # `tmp` is the third model output: one tensor per encoder layer.
        masked_lm_logits_scores, pooled_output, tmp = model(input_ids, token_type_ids=input_type_ids, attention_mask=input_mask, output_all_encoded_layers=True)
        print (len(tmp))
        # Raw outputs of the linear head (no softmax is applied here).
        predicts_prob = cls(pooled_output)
print(predicts_prob)

# Stack the per-layer tensors of the (only) example into one array.
# .cpu() makes the conversion valid for tensors living on any device.
enc_atts = np.array([item.detach().cpu().numpy()[0] for item in tmp])
print (enc_atts.shape)

12
tensor([[-2.9278,  2.7973]], grad_fn=<ThAddmmBackward>)
(12, 12, 128, 128)


In [7]:
# Number of leading positions kept from each attention matrix by resize():
# the length of the token sequence before it was padded.
SIZE = len(tokens_all)
# Alternative fixed crop size, kept for quick experiments:
# SIZE = 10

# def encode_eval(input_str, output_str):
#   inputs = tf.reshape(encoders["inputs"].encode(input_str) + [1], [1, -1, 1, 1])  # Make it 3D.
#   outputs = tf.reshape(encoders["inputs"].encode(output_str) + [1], [1, -1, 1, 1])  # Make it 3D.
#   return {"inputs": inputs, "targets": outputs}

# def get_att_mats():
#   enc_atts = []
#   dec_atts = []
#   encdec_atts = []

#   for i in range(hparams.num_hidden_layers):
#     enc_att = translate_model.attention_weights[
#       "transformer/body/encoder/layer_%i/self_attention/multihead_attention/dot_product_attention" % i][0]
#     dec_att = translate_model.attention_weights[
#       "transformer/body/decoder/layer_%i/self_attention/multihead_attention/dot_product_attention" % i][0]
#     encdec_att = translate_model.attention_weights[
#       "transformer/body/decoder/layer_%i/encdec_attention/multihead_attention/dot_product_attention" % i][0]
#     enc_atts.append(resize(enc_att))
#     dec_atts.append(resize(dec_att))
#     encdec_atts.append(resize(encdec_att))
#   return enc_atts, dec_atts, encdec_atts

def resize(np_mat, size=None):
  """Crops one layer's attention and normalizes it across axis 0.

  Args:
    np_mat: array indexed as [head, row, col].
    size: number of leading rows/columns to keep. Defaults to the
      module-level SIZE when omitted, which is the original behaviour.

  Returns:
    Array of shape (1, heads, size, size) in which, for every (row, col)
    cell, the values summed over the head axis equal 1.
  """
  if size is None:
    size = SIZE
  # Keep only the top-left size x size corner of every head's matrix.
  np_mat = np_mat[:, :size, :size]
  # Sum across heads (axis 0), giving one total per (row, col) cell.
  head_sums = np.sum(np_mat, axis=0)
  # Normalize each cell by its across-head total.
  layer_mat = np_mat / head_sums[np.newaxis, :]
  # Add a leading dim for the viz code to work.
  return layer_mat[np.newaxis, ...]

def call_html():
  """Emits the HTML snippet that sets up require.js for the visualization.

  The snippet loads require.js from the notebook server's static files and
  registers the module paths for `base`, `d3` and `jquery`.
  """
  # Import both names from the public IPython.display module rather than
  # relying on the `display` builtin injected by the interactive shell.
  from IPython.display import HTML, display
  display(HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.8/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''))

In [8]:
import attention

# Crop and normalize every layer's attention (iterating the array walks its first axis).
enc_atts_list = [resize(layer_att) for layer_att in enc_atts]

# Set up require.js, then render the tokens against themselves.
call_html()
attention.show(tokens_all[:SIZE], tokens_all[:SIZE], enc_atts_list)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>