In [1]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

In [2]:
# Fetch the tokenizer (and its vocabulary) that belongs to the
# 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)

In [6]:
# Sample sentence used to see how WordPiece tokenization splits words.
example_text = (
    "These days word embeddings are important."
)

In [8]:
# Wrap the sentence in BERT's special start/end markers by hand
# (no spaces are inserted around the markers).
input_text = f"[CLS]{example_text}[SEP]"

In [7]:
# Split the marked-up sentence into WordPiece tokens.
tokenized_text = tokenizer.tokenize(input_text)

# Show the resulting list of tokens.
print(tokenized_text)

['[CLS]', 'these', 'days', 'word', 'em', '##bed', '##ding', '##s', 'are', 'important', '.', '[SEP]']


In [9]:
# Look up the vocabulary ID of every token.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Print each token next to its ID, one pair per line.
for tok, tok_id in zip(tokenized_text, indexed_tokens):
    print(f'Token: {tok},  Index: {tok_id}')

Token: [CLS],  Index: 101
Token: these,  Index: 2122
Token: days,  Index: 2420
Token: word,  Index: 2773
Token: em,  Index: 7861
Token: ##bed,  Index: 8270
Token: ##ding,  Index: 4667
Token: ##s,  Index: 2015
Token: are,  Index: 2024
Token: important,  Index: 2590
Token: .,  Index: 1012
Token: [SEP],  Index: 102


In [7]:
# Peek at ten consecutive vocabulary entries (positions 2000-2009).
print([*tokenizer.vocab][2000:2010])

['to', 'was', 'he', 'is', 'as', 'for', 'on', 'with', 'that', 'it']


In [8]:
# Number of entries in the tokenizer's vocabulary.
len(tokenizer.vocab)

30522

In [45]:
# One sentence that uses the word "bank" three times; the vectors of these
# three occurrences are compared further down in the notebook.
text = (
    "After stealing money from the bank vault, the bank robber was seen "
    "fishing on the Mississippi river bank."
)

In [76]:
# Calling the tokenizer directly does the whole encoding in one step:
# the result is truncated/padded to exactly 50 positions and returned as
# NumPy arrays.
# NOTE(review): the recorded output of `inputs` shows additional token ids
# after the first 102 ([SEP]), which this single-sentence `text` would not
# produce. The saved output looks stale; re-run the notebook to refresh it.
inputs = tokenizer(
    text,
    truncation=True,
    padding='max_length',
    max_length=50,
    return_tensors="np",
)

In [77]:
# Display the full encoding returned by the tokenizer call
# (the recorded output has input_ids, token_type_ids and attention_mask).
inputs

{'input_ids': array([[  101,  2044, 11065,  2769,  2013,  1996,  2924, 11632,  1010,
         1996,  2924, 27307,  2001,  2464,  5645,  2006,  1996,  5900,
         2314,  2924,  1012,   102,  1045,  3866,  2009,  1012,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]]), 'token_type_ids': array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]]), 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]])}

In [78]:
# Show only the token IDs. In the recorded output the trailing zeros line up
# with attention_mask == 0, i.e. they are padding up to max_length.
inputs['input_ids']

array([[  101,  2044, 11065,  2769,  2013,  1996,  2924, 11632,  1010,
         1996,  2924, 27307,  2001,  2464,  5645,  2006,  1996,  5900,
         2314,  2924,  1012,   102,  1045,  3866,  2009,  1012,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]])

In [79]:
# Load the pre-trained 'bert-base-uncased' weights into a TF BERT encoder.
# output_hidden_states=True asks the model to also return all hidden states,
# not just the final one.
model = TFBertModel.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=True,
)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [80]:
# Forward pass: feed the encoded sentence through the model and keep the
# returned output object for inspection in the following cells.
outputs = model(inputs)

In [81]:
# List the fields available on the model output.
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])

In [82]:
# Dump the entire output object.
# NOTE(review): this prints very large tensors; inspecting individual fields
# or their shapes (as the cells that follow do) is usually easier to read.
outputs

TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<tf.Tensor: shape=(1, 50, 768), dtype=float32, numpy=
array([[[ 0.18630259, -0.08853763, -0.31647214, ..., -0.36787772,
          0.2583288 ,  0.37412503],
        [ 0.23645057, -0.48485497, -0.10432266, ..., -0.46558157,
          0.32942188, -0.09824116],
        [-0.08680285, -0.48331517, -0.02683613, ..., -0.5769142 ,
         -0.19851959, -0.47474113],
        ...,
        [ 0.1077374 , -0.17673402,  0.10932198, ...,  0.05099047,
          0.12281671, -0.0564242 ],
        [ 0.46033293,  0.14346765,  0.14582571, ...,  0.01768432,
          0.18873154, -0.0517134 ],
        [ 0.5430294 , -0.09196662,  0.16534087, ..., -0.16804206,
         -0.03168319,  0.01910625]]], dtype=float32)>, pooler_output=<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
array([[-0.8156238 , -0.3731762 , -0.8378582 ,  0.42739952,  0.6788647 ,
        -0.08003396,  0.6671873 ,  0.18694296, -0.6864796 , -0.9999335 ,
        -0.35925588,  0.926

In [83]:
# Shape of the final hidden states. The recorded result is (1, 50, 768);
# the 50 matches the max_length used when tokenizing.
outputs.last_hidden_state.shape

TensorShape([1, 50, 768])

In [84]:
# Keep a handle on the final hidden states, then show the vector for the
# first sequence at position 0 (token id 101, i.e. [CLS], in the recorded
# input_ids).
last_hidden_state = outputs.last_hidden_state
last_hidden_state[0, 0]

<tf.Tensor: shape=(768,), dtype=float32, numpy=
array([ 1.86302587e-01, -8.85376334e-02, -3.16472143e-01,  2.16291517e-01,
        4.41600159e-02, -3.11819822e-01, -1.97564185e-01,  9.13671851e-01,
        1.19621672e-01, -3.86951983e-01,  3.09093237e-01, -1.04595676e-01,
        2.22184002e-01,  3.67674768e-01,  3.13024044e-01,  2.39848435e-01,
        1.26845583e-01,  2.78158128e-01,  6.46569550e-01, -2.51294523e-01,
       -1.85707048e-01, -3.29144388e-01,  7.30341598e-02, -9.53947194e-03,
       -1.05522789e-01, -6.49560690e-02, -2.98829794e-01, -2.80056506e-01,
        9.19551924e-02,  4.07719500e-02, -3.12344246e-02,  3.66604209e-01,
       -3.52290809e-01, -3.01766753e-01,  3.83294761e-01, -2.13039517e-01,
        9.04702693e-02, -3.51489037e-01,  3.57642919e-02,  1.85888469e-01,
       -6.83337897e-02,  1.75581098e-01,  2.30882302e-01,  1.83661357e-01,
        2.46510342e-01, -4.01863456e-01, -3.42457414e+00,  1.22301280e-01,
       -2.22957790e-01, -3.28175962e-01,  2.00240165

In [85]:
# Shape of the pooled output; the recorded result is (1, 768), i.e. one
# vector for the single input sequence.
outputs.pooler_output.shape

TensorShape([1, 768])

In [86]:
# hidden_states is present because the model was loaded with
# output_hidden_states=True. The recorded length is 13, presumably the
# embedding output plus 12 encoder layers (confirm against the transformers
# documentation).
hidden_states = outputs.hidden_states
len(hidden_states)

13

In [87]:
# Locate every occurrence of the token "bank" in the encoded sentence instead
# of hard-coding its positions, so the lookup stays correct if `text` or the
# tokenization settings (padding, truncation, special tokens) change.
bank_token_id = tokenizer.convert_tokens_to_ids("bank")
bank_positions = [
    position
    for position, token_id in enumerate(inputs["input_ids"][0])
    if int(token_id) == bank_token_id
]

# The comparisons that follow need exactly three occurrences; fail loudly
# rather than silently comparing the wrong tokens.
if len(bank_positions) != 3:
    raise ValueError(
        f'expected 3 occurrences of "bank" in the input, '
        f"found {len(bank_positions)} at positions {bank_positions}"
    )

# Final-layer vector of each "bank", in order of appearance in the sentence
# (1st, 2nd and 3rd occurrence respectively).
bank1_vector, bank2_vector, bank3_vector = (
    outputs.last_hidden_state[0][position] for position in bank_positions
)

In [88]:
# Sanity check: a single token's vector is one-dimensional
# (768 values in the recorded output).
bank1_vector.shape

TensorShape([768])

In [89]:
import numpy as np

In [90]:
def cosine_similarity(u, v):
    """Return the cosine similarity of two 1-D vectors.

    Computed as dot(u, v) divided by the product of the vectors' L2 norms.
    """
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))


# Cosine similarity between the first and the second "bank".
cosine_similarity(bank1_vector, bank2_vector)

0.9517975

In [91]:
# Cosine similarity between the first and the third "bank":
# dot product divided by the product of the two L2 norms.
dot_13 = np.dot(bank1_vector, bank3_vector)
norm_product_13 = np.linalg.norm(bank1_vector) * np.linalg.norm(bank3_vector)
dot_13 / norm_product_13

0.69389445

In [75]:
# Cosine similarity between the second and the third "bank".
l2_norm = np.linalg.norm
np.dot(bank3_vector, bank2_vector) / (l2_norm(bank3_vector) * l2_norm(bank2_vector))

0.7164332