In [1]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

In [2]:
# Fetch the tokenizer (and its vocabulary) that belongs to the pre-trained checkpoint.
PRETRAINED_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [3]:
# Sample sentence used to see how WordPiece tokenization splits words into pieces.
example_text = 'These days word embeddings are important.'

In [4]:
# Wrap the sentence in the [CLS] / [SEP] marker tokens by hand
# (no whitespace is inserted between the markers and the sentence).
input_text = f"[CLS]{example_text}[SEP]"

In [5]:
# Run the tokenizer over the marked-up sentence to get its word pieces.
tokenized_text = tokenizer.tokenize(input_text)

# Show the resulting list of tokens.
print(tokenized_text)

['[CLS]', 'these', 'days', 'word', 'em', '##bed', '##ding', '##s', 'are', 'important', '.', '[SEP]']


In [6]:
# Look up the vocabulary id of every token.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Print each token next to its id, one pair per line.
for token, index in zip(tokenized_text, indexed_tokens):
    print(f'Token: {token},  Index: {index}')

Token: [CLS],  Index: 101
Token: these,  Index: 2122
Token: days,  Index: 2420
Token: word,  Index: 2773
Token: em,  Index: 7861
Token: ##bed,  Index: 8270
Token: ##ding,  Index: 4667
Token: ##s,  Index: 2015
Token: are,  Index: 2024
Token: important,  Index: 2590
Token: .,  Index: 1012
Token: [SEP],  Index: 102


In [7]:
# Peek at ten vocabulary entries (positions 2000-2009); iterating the vocab mapping yields its keys.
print(list(tokenizer.vocab)[2000:2010])

['to', 'was', 'he', 'is', 'as', 'for', 'on', 'with', 'that', 'it']


In [8]:
# Number of entries in the tokenizer's vocabulary.
len(tokenizer.vocab)

30522

In [9]:
# A single sentence in which the word "bank" appears three times,
# twice in a financial context and once next to "river".
text = (
    "After stealing money from the bank vault, the bank robber was seen "
    "fishing on the Mississippi river bank."
)

In [10]:
# Calling the tokenizer directly does the whole encoding automatically:
# ids as NumPy arrays, padded/truncated to a fixed length of 50.
inputs = tokenizer(
    text,
    return_tensors="np",
    max_length=50,
    padding='max_length',
    truncation=True,
)

In [11]:
# Display everything the tokenizer returned; the recorded output contains
# 'input_ids', 'token_type_ids' and 'attention_mask'.
inputs

{'input_ids': array([[  101,  2044, 11065,  2769,  2013,  1996,  2924, 11632,  1010,
         1996,  2924, 27307,  2001,  2464,  5645,  2006,  1996,  5900,
         2314,  2924,  1012,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]]), 'token_type_ids': array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]]), 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]])}

In [12]:
# Look at the token ids only; in the recorded output the sentence's ids are
# followed by 0s filling the row out to length 50.
inputs['input_ids']

array([[  101,  2044, 11065,  2769,  2013,  1996,  2924, 11632,  1010,
         1996,  2924, 27307,  2001,  2464,  5645,  2006,  1996,  5900,
         2314,  2924,  1012,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]])

In [13]:
# Load the pre-trained encoder weights for the same checkpoint as the tokenizer.
model = TFBertModel.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=True,  # have the model return all hidden states as well
)

Downloading tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [14]:
# Run the encoded sentence through the model; the result is inspected in the cells below.
outputs = model(inputs)

In [15]:
# List the fields available on the model output.
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])

In [16]:
# Display the whole output object.
# NOTE(review): this repr prints the full tensors and is very long; inspecting
# individual fields (as the following cells do) is easier to read.
outputs

TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<tf.Tensor: shape=(1, 50, 768), dtype=float32, numpy=
array([[[-0.49644348, -0.18308331, -0.5231442 , ..., -0.19021118,
          0.37379786,  0.3964439 ],
        [-0.13227051, -0.2762247 , -0.34953567, ..., -0.45665833,
          0.3786474 , -0.10961407],
        [-0.36261478, -0.40016505,  0.06757405, ..., -0.32070866,
         -0.27089787, -0.3004259 ],
        ...,
        [-0.3983887 , -0.5654813 , -0.06805924, ...,  0.34025484,
          0.17032716, -0.21694681],
        [ 0.39903018, -0.21066266,  0.14042473, ..., -0.08417968,
         -0.13027403, -0.12785591],
        [ 0.53758544, -0.13355471,  0.20943275, ..., -0.12839381,
         -0.08842888, -0.08094992]]], dtype=float32)>, pooler_output=<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
array([[-0.6030822 , -0.33421525, -0.7174114 ,  0.33471996,  0.5144567 ,
        -0.1721656 ,  0.45023912,  0.27680638, -0.3769242 , -0.99984133,
        -0.3657234 ,  0.753

In [17]:
# Shape of the last hidden state; the recorded run shows (1, 50, 768),
# where 50 matches the max_length used when encoding.
outputs.last_hidden_state.shape

TensorShape([1, 50, 768])

In [18]:
# Keep a handle on the last hidden state and display the vector at
# batch index 0, token position 0 (the [CLS] position of the encoded sentence).
last_hidden_state = outputs.last_hidden_state
last_hidden_state[0, 0]

<tf.Tensor: shape=(768,), dtype=float32, numpy=
array([-4.96443480e-01, -1.83083311e-01, -5.23144186e-01,  5.25867224e-01,
        5.07566810e-01,  1.63770676e-01,  2.03401476e-01,  3.17120075e-01,
       -5.31787351e-02, -1.73956960e-01,  1.56624377e-01, -2.91167259e-01,
       -4.70112771e-01,  6.43651187e-01,  1.01685300e-01,  4.04034853e-02,
       -2.23800063e-01,  4.66817319e-01,  7.84237802e-01, -2.29058176e-01,
       -1.18187755e-01, -1.04613125e-01,  2.05791920e-01,  1.56229824e-01,
       -3.35293673e-02, -1.64334714e-01, -3.00018638e-01, -9.57755372e-02,
       -9.04613957e-02,  3.83675486e-01,  5.08686453e-02,  5.76396137e-02,
       -1.03359997e-01, -8.35441887e-01,  1.45372719e-01, -3.95007849e-01,
        4.80393246e-02, -1.43011436e-01,  4.68776897e-02,  3.04918170e-01,
       -3.79328638e-01,  8.92914236e-02, -2.46659905e-01,  4.72980104e-02,
        2.10816026e-01, -6.77735269e-01, -3.22645259e+00, -7.80532062e-02,
       -2.20757216e-01, -2.99442977e-01,  8.09336454

In [19]:
# Shape of the pooled output; the recorded run shows (1, 768).
outputs.pooler_output.shape

TensorShape([1, 768])

In [20]:
# Keep the per-layer hidden states (returned because output_hidden_states=True) and count them.
# NOTE(review): the recorded count is 13 — presumably the embedding output plus
# 12 encoder layers; confirm against the model configuration.
hidden_states = outputs.hidden_states
len(hidden_states)

13

In [21]:
# Find every position of the token "bank" in the encoded sentence instead of
# hard-coding the indices, so the vectors stay correct if `text`, the padding
# or the tokenization changes.
bank_token_id = tokenizer.convert_tokens_to_ids('bank')
bank_positions = (inputs['input_ids'][0] == bank_token_id).nonzero()[0]

# The rest of the notebook compares exactly three occurrences; fail loudly otherwise.
assert len(bank_positions) == 3, (
    f"expected 3 occurrences of 'bank' in the encoded text, found {len(bank_positions)}"
)

# Last-layer vector of each occurrence, in sentence order (1st, 2nd, 3rd "bank").
bank1_vector, bank2_vector, bank3_vector = (
    outputs.last_hidden_state[0][int(position)] for position in bank_positions
)

In [22]:
# Shape of a single token's vector; the recorded run shows (768,).
bank1_vector.shape

TensorShape([768])

In [23]:
import numpy as np

In [24]:
# Cosine similarity between the first and the second "bank":
# dot product divided by the product of the two vector norms.
dot_12 = np.dot(bank1_vector, bank2_vector)
norms_12 = np.linalg.norm(bank1_vector) * np.linalg.norm(bank2_vector)
dot_12 / norms_12

0.952733

In [25]:
# Cosine similarity between the first and the third "bank":
# dot product divided by the product of the two vector norms.
dot_13 = np.dot(bank1_vector, bank3_vector)
norms_13 = np.linalg.norm(bank1_vector) * np.linalg.norm(bank3_vector)
dot_13 / norms_13

0.6988579

In [26]:
# Cosine similarity between the second and the third "bank":
# dot product divided by the product of the two vector norms.
dot_32 = np.dot(bank3_vector, bank2_vector)
norms_32 = np.linalg.norm(bank3_vector) * np.linalg.norm(bank2_vector)
dot_32 / norms_32

0.69788194