<a href="https://colab.research.google.com/github/the-Quert/bert/blob/master/Implementation/iNLPfun.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
pip install transformers tqdm boto3 requests regex -q

[K     |████████████████████████████████| 317kB 3.5MB/s 
[K     |████████████████████████████████| 645kB 59.0MB/s 
[K     |████████████████████████████████| 860kB 50.4MB/s 
[K     |████████████████████████████████| 1.0MB 53.4MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [0]:
%tensorflow_version 2.x

TensorFlow 2.x selected.


In [0]:
import torch, random, glob
from transformers import BertTokenizer
from transformers import BertForMaskedLM
from IPython.display import clear_output


In [0]:
PRETRAINED_MODEL_NAME = "bert-base-chinese"  

In [0]:
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

100%|██████████| 109540/109540 [00:00<00:00, 1863401.74B/s]


In [0]:
text = "[CLS] 等到潮水 [MASK] 了，就知道誰沒穿褲子。"
tokens = tokenizer.tokenize(text)
ids = tokenizer.convert_tokens_to_ids(tokens)

print(text)
print(tokens[:10], '...')
print(ids[:10], '...')

[CLS] 等到潮水 [MASK] 了，就知道誰沒穿褲子。
['[CLS]', '等', '到', '潮', '水', '[MASK]', '了', '，', '就', '知'] ...
[101, 5023, 1168, 4060, 3717, 103, 749, 8024, 2218, 4761] ...


In [0]:
# 除了 tokens 以外還需要辨別句子的 segment ids
tokens_tensor = torch.tensor([ids])  # (1, seq_len)
segments_tensors = torch.zeros_like(tokens_tensor)  # (1, seq_len)
maskedLM_model = BertForMaskedLM.from_pretrained(PRETRAINED_MODEL_NAME)
clear_output()


In [0]:
# 使用 masked LM 估計 [MASK] 位置所代表的實際 token 
maskedLM_model.eval()
with torch.no_grad():
    outputs = maskedLM_model(tokens_tensor, segments_tensors)
    predictions = outputs[0]
    # (1, seq_len, num_hidden_units)
del maskedLM_model


In [0]:
# 將 [MASK] 位置的機率分佈取 top k 最有可能的 tokens 出來
masked_index = 5
k = 3
probs, indices = torch.topk(torch.softmax(predictions[0, masked_index], -1), k)
predicted_tokens = tokenizer.convert_ids_to_tokens(indices.tolist())


In [0]:
# 顯示 top k 可能的字。取 top 1 當作預測值
print("輸入 tokens ：", tokens[:10], '...')
print('-' * 50)
for i, (t, p) in enumerate(zip(predicted_tokens, probs), 1):
    tokens[masked_index] = t
    print("Top {} ({:2}%)：{}".format(i, int(p.item() * 100), tokens[:10]), '...')

輸入 tokens ： ['[CLS]', '等', '到', '潮', '水', '過', '了', '，', '就', '知']
--------------------------------------------------
Top 1 (67%)：['[CLS]', '等', '到', '潮', '水', '來', '了', '，', '就', '知'] ...
Top 2 (25%)：['[CLS]', '等', '到', '潮', '水', '濕', '了', '，', '就', '知'] ...
Top 3 ( 2%)：['[CLS]', '等', '到', '潮', '水', '過', '了', '，', '就', '知'] ...


In [0]:
# 安裝 BertViz
import sys
!test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
if not 'bertviz_repo' in sys.path:
  sys.path += ['bertviz_repo']


Cloning into 'bertviz_repo'...
remote: Enumerating objects: 56, done.[K
remote: Counting objects: 100% (56/56), done.[K
remote: Compressing objects: 100% (43/43), done.[K
remote: Total 899 (delta 25), reused 39 (delta 13), pack-reused 843[K
Receiving objects: 100% (899/899), 78.48 MiB | 26.42 MiB/s, done.
Resolving deltas: 100% (568/568), done.


In [0]:
# import packages
from bertviz.pytorch_transformers_attn import BertModel, BertTokenizer
from bertviz.head_view import show

# 在 jupyter notebook 裡頭顯示 visualzation 的 helper
def call_html():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.8/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''))

clear_output()

In [0]:
model_type = 'bert'
bert_version = 'bert-base-chinese'

bertviz_model = BertModel.from_pretrained(bert_version)
bertviz_tokenizer = BertTokenizer.from_pretrained(bert_version)

# 情境 1 
sentence_a = "胖虎叫大雄買漫畫，"
sentence_b = "回來慢了就打他。"
call_html()
show(bertviz_model, model_type, bertviz_tokenizer, sentence_a, sentence_b)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [0]:
import glob
glob.glob('*.csv.zip')

[]