In [1]:
!pip install transformers==4.5.0 fugashi==1.1.0 ipadic==1.0.0

Collecting transformers==4.5.0
  Downloading transformers-4.5.0-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 5.5 MB/s 
[?25hCollecting fugashi==1.1.0
  Downloading fugashi-1.1.0-cp37-cp37m-manylinux1_x86_64.whl (486 kB)
[K     |████████████████████████████████| 486 kB 35.8 MB/s 
[?25hCollecting ipadic==1.0.0
  Downloading ipadic-1.0.0.tar.gz (13.4 MB)
[K     |████████████████████████████████| 13.4 MB 20.1 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 33.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 38.5 MB/s 
Building wheels for collected packages: ipadic
  Building wheel for ipadic (setup.py) ... [?25l[?25hdone
  Created wheel for ipadic: filename=ipadic-1.0.0-py3-none-an

In [2]:
import torch
from transformers import BertJapaneseTokenizer, BertModel

# トークナイザ
文章をトークンに分割し、BERTに入力できる形に変換する。

In [3]:
# Hugging Face checkpoint name used to load the Japanese BERT tokenizer.
model_name = 'cl-tohoku/bert-base-japanese-whole-word-masking'
# Load the tokenizer that belongs to this checkpoint.
tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/258k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/110 [00:00<?, ?B/s]

In [4]:
# Number of entries in the tokenizer's vocabulary.
len(tokenizer.vocab)

32000

In [5]:
# Try the tokenizer: split a sentence into tokens.
tokenizer.tokenize('明日は自然言語処理の勉強をしよう')

['明日', 'は', '自然', '言語', '処理', 'の', '勉強', 'を', 'しよ', 'う']

In [6]:
# A word that is not a single vocabulary entry is split into several pieces;
# the continuation pieces appear with a '##' prefix in the output.
tokenizer.tokenize('マシンラーニングの勉強をしよう')

['マシン', '##ラー', '##ニング', 'の', '勉強', 'を', 'しよ', 'う']

In [7]:
# A word the tokenizer cannot represent comes back as the [UNK] token.
tokenizer.tokenize('魑魅魍魎だな')

['[UNK]', 'だ', 'な']

In [8]:
# Encode the sentence into token IDs instead of token strings.
input_ids = tokenizer.encode('明日は自然言語処理の勉強をしよう')
print(input_ids)

[2, 11475, 9, 1757, 1882, 2762, 5, 8192, 11, 2132, 205, 3]


トークン数が2つ増えているのは、先頭に[CLS]、末尾に[SEP]という特殊トークンが追加されているから。

In [9]:
# Map the IDs back to token strings to see which tokens encode() added.
tokenizer.convert_ids_to_tokens(input_ids)

['[CLS]', '明日', 'は', '自然', '言語', '処理', 'の', '勉強', 'を', 'しよ', 'う', '[SEP]']

In [10]:
# Encode a single sentence to a fixed length of 12 tokens
# (padding up to max_length, truncating anything longer).
text = '明日の天気は晴れだ'
encoding = tokenizer(
    text,
    max_length=12,
    padding='max_length',
    truncation=True,
)
print('# encoding:', encoding, sep='\n')

# Turn the IDs back into token strings to inspect the result.
tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'])
print('# tokens', tokens, sep='\n')

# encoding:
{'input_ids': [2, 11475, 5, 11385, 9, 16577, 75, 3, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]}
# tokens
['[CLS]', '明日', 'の', '天気', 'は', '晴れ', 'だ', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


tokenizerの出力は辞書形式で返ってくる。今回はmax_lengthに12を指定しているので、足りない分は[PAD]でpaddingされ、paddingしたところはattention_maskが0になる。

In [11]:
# Same sentence with max_length set to 6, to see what truncation does.
text = '明日の天気は晴れだ'
encoding = tokenizer(
    text,
    max_length=6,
    padding='max_length',
    truncation=True,
)

# Turn the IDs back into token strings to inspect the result.
tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'])
print('# tokens', tokens, sep='\n')

# tokens
['[CLS]', '明日', 'の', '天気', 'は', '[SEP]']


In [12]:
# Several sentences can be processed together in one call.
text_list = [
    '明日の天気は晴れだ。',
    'パソコンが急に動かなくなった。',
]
tokenizer(
    text_list,
    max_length=10,
    padding='max_length',
    truncation=True,
)

{'input_ids': [[2, 11475, 5, 11385, 9, 16577, 75, 8, 3, 0], [2, 6311, 14, 1132, 7, 16084, 332, 58, 10, 3]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [13]:
# Pad every sequence to the length of the longest one in the batch.
tokenizer(text_list, padding = 'longest')

{'input_ids': [[2, 11475, 5, 11385, 9, 16577, 75, 8, 3, 0, 0], [2, 6311, 14, 1132, 7, 16084, 332, 58, 10, 8, 3]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [14]:
# Ask for torch tensors as output instead of Python lists.
tokenizer(
    text_list,
    max_length=10,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

{'input_ids': tensor([[    2, 11475,     5, 11385,     9, 16577,    75,     8,     3,     0],
        [    2,  6311,    14,  1132,     7, 16084,   332,    58,    10,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [15]:
# Load the pretrained BERT model from the same checkpoint name used for the tokenizer.
model_name = 'cl-tohoku/bert-base-japanese-whole-word-masking'
bert = BertModel.from_pretrained(model_name)

# Move the model to the GPU when one is available; otherwise keep it on the
# CPU so this cell does not crash on machines without CUDA.
bert = bert.to('cuda' if torch.cuda.is_available() else 'cpu')

Downloading:   0%|          | 0.00/479 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

In [16]:
# Overview of the model: print its configuration.
print(bert.config)

BertConfig {
  "_name_or_path": "cl-tohoku/bert-base-japanese-whole-word-masking",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertJapaneseTokenizer",
  "transformers_version": "4.5.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}



- レイヤー数は12
- BERTの出力は768次元
- 最大トークン長は512

In [19]:
text_list = [
    '明日は自然言語処理の勉強をしよう。',
    '明日はマシーンラーニングの学習をしよう。',
]

# Encode the sentences into fixed-length (max_length=32) torch tensors.
encoding = tokenizer(text_list, max_length=32, padding='max_length', truncation=True, return_tensors='pt')

# Move every input tensor to the device the model is on (GPU or CPU),
# instead of assuming CUDA is always present.
encoding = {k: v.to(bert.device) for k, v in encoding.items()}

# Run BERT. Each input is a 2-D torch.tensor.
# Unpacking with ** is equivalent to writing input_ids=encoding['input_ids'], etc.
output = bert(**encoding)
last_hidden_state = output.last_hidden_state

In [20]:
# Inspect the encoded inputs that were passed to BERT.
encoding

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'),
 'input_ids': tensor([[    2, 11475,     9,  1757,  1882,  2762,     5,  8192,    11,  2132,
            205,     8,     3,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0],
         [    2, 11475,     9,    96, 13866,   422,  1581,     5,  4293,    11,
           2132,   205,     8,     3,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0]], device='cuda:0'),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0

In [22]:
# Dimensions are: number of sentences, max_length, hidden_size.
print(last_hidden_state.size())

torch.Size([2, 32, 768])


In [23]:
# Run the same forward pass inside torch.no_grad(), i.e. with gradient
# calculation disabled, since only the outputs are needed here.
with torch.no_grad():
    output = bert(**encoding)
    last_hidden_state = output.last_hidden_state