In [14]:
import torch
from transformers import BertModel, BertTokenizer
import numpy as np

In [15]:
# GPUが利用可能な場合はGPUを、利用できない場合はCPUを使用する
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# BERTの事前学習済みモデルを読み込む
model = BertModel.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

# モデルをデバイスに転送する
model.to(device)

# 以下でモデルを使用するコードを記述する

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(32000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [16]:
from transformers import BertJapaneseTokenizer

# テキストデータをトークナイズする
# tokenizer = BertTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')
tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')
text_list = ["代わりに自分は原則としてヨドバシしか利用していない。", "普段からヨドバシを利用する。", "僕は数年前からAmazonしか利用していない", "文章の類似度計算してみた。"]
inputs = tokenizer(text_list, return_tensors="pt", padding=True).to(device)
inputs

{'input_ids': tensor([[    2,  2713,     7,  1040,     9,  3747,    50,   801, 28493, 28531,
         28494,   278,   666,    15,    16,    21,    80,     8,     3],
        [    2,  9406,    40,   801, 28493, 28531, 28494,    11,   666,    34,
             8,     3,     0,     0,     0,     0,     0,     0,     0],
        [    2,  6259,     9,   276,    19,   174,    40, 17609,   278,   666,
            15,    16,    21,    80,     3,     0,     0,     0,     0],
        [    2,  7204,     5,  4629,   559,  3181,    15,    16,   546,    10,
             8,     3,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1

In [17]:
tokenizer.decode(inputs['input_ids'][0])

'[CLS] 代わり に 自分 は 原則 として ヨドバシ しか 利用 し て い ない 。 [SEP]'

In [18]:
# モデルに入力を与えて、テキストデータをエンコードする
outputs = model(**inputs)
    
print(outputs.keys())
print(outputs.pooler_output.shape)
print(outputs.last_hidden_state.shape)

odict_keys(['last_hidden_state', 'pooler_output'])
torch.Size([4, 768])
torch.Size([4, 19, 768])


In [20]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 文章の分散表現
a = outputs.pooler_output[0].to('cpu').detach().numpy().copy()
b = outputs.pooler_output[1].to('cpu').detach().numpy().copy()
similarity = cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))
print(similarity)

a = outputs.pooler_output[0].to('cpu').detach().numpy().copy()
b = outputs.pooler_output[2].to('cpu').detach().numpy().copy()
similarity = cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))
print(similarity)

a = outputs.pooler_output[0].to('cpu').detach().numpy().copy()
b = outputs.pooler_output[3].to('cpu').detach().numpy().copy()
similarity = cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))
print(similarity[0][0])


# 単語の分散表現
# a = outputs.last_hidden_state[0].to('cpu').detach().numpy().copy()
# b = outputs.last_hidden_state[1].to('cpu').detach().numpy().copy()

# similarity = cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))
# print(similarity)

[[0.82439315]]
[[0.86359704]]
[[0.62414646]]
