In [1]:
!pip install transformers
import transformers as trf
import torch as pt
import numpy as np
import pandas as pd
import random as rnd




Intel(R) Data Analytics Acceleration Library (Intel(R) DAAL) solvers for sklearn enabled: https://intelpython.github.io/daal4py/sklearn.html


In [2]:
# Load the tokenizer and the encoder from the same 'bert-base-uncased'
# checkpoint so token ids line up with rows of the embedding matrix.
tokenizer = trf.BertTokenizer.from_pretrained('bert-base-uncased')
bert = trf.BertModel.from_pretrained('bert-base-uncased')

In [3]:
def norm_of(word):
  """Return the norm of `word`'s input embedding as a Python float.

  `word` is looked up as a single vocabulary token via the global `tokenizer`;
  the matching row of the global `bert` word-embedding matrix is measured with
  the default `Tensor.norm()`.
  """
  token_id = tokenizer.convert_tokens_to_ids([word])[0]
  embedding_matrix = bert.embeddings.word_embeddings.weight
  # No gradient bookkeeping is needed for a read-only measurement.
  with pt.no_grad():
    row = embedding_matrix[token_id]
    return row.norm().item()

In [4]:
def index_of(word):
  """Return the vocabulary id the global `tokenizer` assigns to the token `word`."""
  # A one-element list goes in, so exactly one id comes back.
  (token_id,) = tokenizer.convert_tokens_to_ids([word])
  return token_id

In [6]:
def reconstruct(tk_ids, model=bert, tokenizer=tokenizer):
  """Run one round trip through the model and return the predicted token ids.

  The ids are fed through `model`; every position's final hidden state is
  scored against each row of the model's input word-embedding matrix (dot
  product) and the best-scoring vocabulary id becomes that position's
  prediction. The input and the prediction are printed along the way.

  Args:
    tk_ids: token ids of a single sentence, either a list of ints (e.g. from
      `tokenizer.encode`) or a 1-D tensor (e.g. a previous return value).
    model: model supplying the hidden states and the embedding matrix.
    tokenizer: tokenizer used to render ids as tokens / text.

  Returns:
    1-D tensor holding one predicted token id per input position.
  """
  print(f'input tokens: {tk_ids}')
  print(f'input sentence: {tokenizer.decode(tk_ids)}')
  # as_tensor accepts lists and tensors alike; pt.tensor(<tensor>) copies and
  # raises a UserWarning each time a previous prediction is fed back in.
  input_ids = pt.as_tensor(tk_ids).unsqueeze(0)
  with pt.no_grad():
    # Index [0] rather than tuple-unpack: valid for a plain tuple return and
    # for a ModelOutput (iterating a ModelOutput yields its keys, not tensors).
    outs = model(input_ids)[0]
    dots = pt.matmul(outs, model.embeddings.word_embeddings.weight.T)
    # softmax is monotonic, so argmax over the raw scores selects the same id.
    # squeeze(0) drops only the batch axis, keeping a 1-D result even for a
    # single-token input.
    predictions = dots.argmax(dim=-1).squeeze(0)
  print(f'predicted tokens: {tokenizer.convert_ids_to_tokens(predictions)}')
  decoded = tokenizer.decode(predictions)
  print(f'decoded sentence: {decoded}')
  return predictions


In [7]:
# Baseline run (embeddings as loaded, assuming cells are executed top to
# bottom): encode the sentence, then repeatedly feed the predicted ids back in.
sentence = 'london is the capital of great britain.'
tks = tokenizer.encode(sentence)
for _ in range(10):
  # Each pass replaces tks with the ids reconstruct() predicted for it.
  tks = reconstruct(tks)

input tokens: [101, 2414, 2003, 1996, 3007, 1997, 2307, 3725, 1012, 102]
input sentence: [CLS] london is the capital of great britain. [SEP]
predicted tokens: ['[CLS]', '[CLS]', '[CLS]', '[CLS]', 'capital', '[CLS]', 'ned', '[CLS]', '##ann', '##vis']
decoded sentence: [CLS] [CLS] [CLS] [CLS] capital [CLS] ned [CLS]annvis
input tokens: tensor([  101,   101,   101,   101,  3007,   101, 12311,   101, 11639, 11365])
input sentence: [CLS] [CLS] [CLS] [CLS] capital [CLS] ned [CLS]annvis
predicted tokens: ['[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]']
decoded sentence: [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS]
input tokens: tensor([101, 101, 101, 101, 101, 101, 101, 101, 101, 101])
input sentence: [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS]


  """


predicted tokens: ['[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]']
decoded sentence: [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS]
input tokens: tensor([101, 101, 101, 101, 101, 101, 101, 101, 101, 101])
input sentence: [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS]
predicted tokens: ['[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]']
decoded sentence: [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS]
input tokens: tensor([101, 101, 101, 101, 101, 101, 101, 101, 101, 101])
input sentence: [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS]
predicted tokens: ['[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]']
decoded sentence: [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS]
input tokens: tensor([101, 101, 101, 101, 101, 101, 101, 101, 101, 101])
input sentence: [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS] [CLS

In [0]:
# Ratio norm('[CLS]') / norm('the'): scaling the 'the' embedding by this value
# gives it the norm '[CLS]' has right now. norm_of reads the live weights, so
# this must be evaluated before the embeddings are rescaled.
the_factor = norm_of('[CLS]') / norm_of('the')

In [0]:
# Reciprocal of the_factor, i.e. norm('the') / norm('[CLS]'); used below to
# scale the '[CLS]' embedding.
cls_factor = 1 / the_factor

In [7]:
# Look up the vocabulary id of the '[MASK]' token.
tokenizer.convert_tokens_to_ids('[MASK]')

103

In [0]:
# Swap the norms of the 'the' and '[CLS]' input embeddings by rescaling each
# row in place. The weight is a leaf Parameter that requires grad, so the
# in-place update has to happen under no_grad or current PyTorch raises
# "a view of a leaf Variable that requires grad is being used in an in-place
# operation".
# NOTE: not idempotent -- re-running this cell applies the factors again.
with pt.no_grad():
  bert.embeddings.word_embeddings.weight[index_of('the')] *= the_factor
  bert.embeddings.word_embeddings.weight[index_of('[CLS]')] *= cls_factor

In [9]:
# Check the norm of the 'the' embedding after the rescaling above.
norm_of('the')

2.030848979949951

In [10]:
# Check the norm of the '[CLS]' embedding after the rescaling above.
norm_of('[CLS]')

0.8216485381126404

In [0]:
def reconstruct(tk_ids, model=bert, tokenizer=tokenizer):
  """Run one round trip through the model and return the predicted token ids.

  NOTE(review): this cell re-defines `reconstruct` from earlier in the
  notebook; the later definition is the one in effect for the cells below.

  The ids are fed through `model`; every position's final hidden state is
  scored against each row of the model's input word-embedding matrix (dot
  product) and the best-scoring vocabulary id becomes that position's
  prediction. The input and the prediction are printed along the way.

  Args:
    tk_ids: token ids of a single sentence, either a list of ints (e.g. from
      `tokenizer.encode`) or a 1-D tensor (e.g. a previous return value).
    model: model supplying the hidden states and the embedding matrix.
    tokenizer: tokenizer used to render ids as tokens / text.

  Returns:
    1-D tensor holding one predicted token id per input position.
  """
  print(f'input tokens: {tk_ids}')
  print(f'input sentence: {tokenizer.decode(tk_ids)}')
  # as_tensor accepts lists and tensors alike; pt.tensor(<tensor>) copies and
  # raises a UserWarning each time a previous prediction is fed back in.
  input_ids = pt.as_tensor(tk_ids).unsqueeze(0)
  with pt.no_grad():
    # Index [0] rather than tuple-unpack: valid for a plain tuple return and
    # for a ModelOutput (iterating a ModelOutput yields its keys, not tensors).
    outs = model(input_ids)[0]
    dots = pt.matmul(outs, model.embeddings.word_embeddings.weight.T)
    # softmax is monotonic, so argmax over the raw scores selects the same id.
    # squeeze(0) drops only the batch axis, keeping a 1-D result even for a
    # single-token input.
    predictions = dots.argmax(dim=-1).squeeze(0)
  print(f'predicted tokens: {tokenizer.convert_ids_to_tokens(predictions)}')
  decoded = tokenizer.decode(predictions)
  print(f'decoded sentence: {decoded}')
  return predictions


In [12]:
# Same sentence as the baseline run, repeated after the 'the' / '[CLS]'
# embedding rescale (assuming cells are executed top to bottom).
sentence = 'london is the capital of great britain.'
tks = tokenizer.encode(sentence)
for _ in range(10):
  # Each pass replaces tks with the ids reconstruct() predicted for it.
  tks = reconstruct(tks)

input tokens: [101, 2414, 2003, 1996, 3007, 1997, 2307, 3725, 1012, 102]
input sentence: [CLS] london is the capital of great britain. [SEP]
predicted tokens: ['##city', 'london', 'the', 'the', 'capital', 'the', 'ned', 'britain', '##ann', '##vis']
decoded sentence: ##city london the the capital the ned britainannvis
input tokens: tensor([12972,  2414,  1996,  1996,  3007,  1996, 12311,  3725, 11639, 11365])
input sentence: ##city london the the capital the ned britainannvis


  """


predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the the
input tokens: tensor([1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996])
input sentence: the the the the the the the the the the
predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the the
input tokens: tensor([1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996])
input sentence: the the the the the the the the the the
predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the the
input tokens: tensor([1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996])
input sentence: the the the the the the the the the the
predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the

In [13]:
# Repeat the iterated reconstruction on a second sentence.
sentence = 'water consists of hydrogen and oxygen.'
tks = tokenizer.encode(sentence)
for _ in range(10):
  # Each pass replaces tks with the ids reconstruct() predicted for it.
  tks = reconstruct(tks)

input tokens: [101, 2300, 3774, 1997, 9732, 1998, 7722, 1012, 102]
input sentence: [CLS] water consists of hydrogen and oxygen. [SEP]
predicted tokens: ['##lus', 'the', 'the', 'the', 'hydrogen', 'the', 'oxygen', '##ther', '##ther']
decoded sentence: ##lus the the the hydrogen the oxygentherther
input tokens: tensor([ 7393,  1996,  1996,  1996,  9732,  1996,  7722, 12399, 12399])
input sentence: ##lus the the the hydrogen the oxygentherther


  """


predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the
input tokens: tensor([1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996])
input sentence: the the the the the the the the the
predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the
input tokens: tensor([1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996])
input sentence: the the the the the the the the the
predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the
input tokens: tensor([1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996])
input sentence: the the the the the the the the the
predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the
input tokens: tensor([1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996,

In [14]:
# Repeat the iterated reconstruction on a third sentence.
sentence = 'washington is the capital of united states.'
tks = tokenizer.encode(sentence)
for _ in range(10):
  # Each pass replaces tks with the ids reconstruct() predicted for it.
  tks = reconstruct(tks)

input tokens: [101, 2899, 2003, 1996, 3007, 1997, 2142, 2163, 1012, 102]
input sentence: [CLS] washington is the capital of united states. [SEP]
predicted tokens: ['##cas', 'washington', 'the', 'the', 'capitals', 'the', 'usa', 'puget', '##vey', '##vna']
decoded sentence: ##cas washington the the capitals the usa pugetveyvna
input tokens: tensor([15671,  2899,  1996,  1996, 15433,  1996,  3915, 27879, 12417, 29207])
input sentence: ##cas washington the the capitals the usa pugetveyvna
predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the the
input tokens: tensor([1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996])
input sentence: the the the the the the the the the the
predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the the
input tokens: tensor([1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 19

  """


predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the the
input tokens: tensor([1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996])
input sentence: the the the the the the the the the the
predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the the
input tokens: tensor([1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996])
input sentence: the the the the the the the the the the
predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the the
input tokens: tensor([1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996])
input sentence: the the the the the the the the the the
predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the

In [15]:
# Repeat the iterated reconstruction on a sentence that itself contains 'the'.
sentence = 'the currency of canada is the canadian dollar.'
tks = tokenizer.encode(sentence)
for _ in range(10):
  # Each pass replaces tks with the ids reconstruct() predicted for it.
  tks = reconstruct(tks)

input tokens: [101, 1996, 9598, 1997, 2710, 2003, 1996, 3010, 7922, 1012, 102]
input sentence: [CLS] the currency of canada is the canadian dollar. [SEP]
predicted tokens: ['##lus', '##chet', 'currency', 'the', 'canada', 'the', '##nated', 'canadian', 'dollar', '##ann', '##ust']
decoded sentence: ##luschet currency the canada thenated canadian dollarannust
input tokens: tensor([ 7393, 20318,  9598,  1996,  2710,  1996, 23854,  3010,  7922, 11639,
        19966])
input sentence: ##luschet currency the canada thenated canadian dollarannust
predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the the the
input tokens: tensor([1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996])
input sentence: the the the the the the the the the the the


  """


predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the the the
input tokens: tensor([1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996])
input sentence: the the the the the the the the the the the
predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the the the
input tokens: tensor([1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996])
input sentence: the the the the the the the the the the the
predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the the the
input tokens: tensor([1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996])
input sentence: the the the the the the the the the the the
predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the

In [0]:
# Halve the '[MASK]' input embedding (vocabulary id 103 for this tokenizer).
# The weight is a leaf Parameter that requires grad, so the in-place update
# must run under no_grad or current PyTorch raises a RuntimeError.
# NOTE: not idempotent -- re-running this cell halves the row again.
with pt.no_grad():
  bert.embeddings.word_embeddings.weight[index_of('[MASK]')] /= 2


In [17]:
# Re-run the same sentence after the preceding cell halved the embedding row
# at id 103, to compare against the previous output.
sentence = 'the currency of canada is the canadian dollar.'
tks = tokenizer.encode(sentence)
for _ in range(10):
  # Each pass replaces tks with the ids reconstruct() predicted for it.
  tks = reconstruct(tks)

input tokens: [101, 1996, 9598, 1997, 2710, 2003, 1996, 3010, 7922, 1012, 102]
input sentence: [CLS] the currency of canada is the canadian dollar. [SEP]
predicted tokens: ['##lus', '##chet', 'currency', 'the', 'canada', 'the', '##nated', 'canadian', 'dollar', '##ann', '##ust']
decoded sentence: ##luschet currency the canada thenated canadian dollarannust
input tokens: tensor([ 7393, 20318,  9598,  1996,  2710,  1996, 23854,  3010,  7922, 11639,
        19966])
input sentence: ##luschet currency the canada thenated canadian dollarannust
predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the the the
input tokens: tensor([1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996])
input sentence: the the the the the the the the the the the
predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the 

  """


input sentence: the the the the the the the the the the the
predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the the the
input tokens: tensor([1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996])
input sentence: the the the the the the the the the the the
predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the the the
input tokens: tensor([1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996])
input sentence: the the the the the the the the the the the
predicted tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
decoded sentence: the the the the the the the the the the the
input tokens: tensor([1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996])
input sentence: the the the the the the the the the the the
predicted tokens: [