In [1]:
import pandas as pd
import numpy as np
import time
from MyNLP import Divider

In [2]:
import torch
from transformers import BertModel, BertTokenizer

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained Japanese BERT encoder (whole-word-masking checkpoint).
model = BertModel.from_pretrained(
    'cl-tohoku/bert-base-japanese-whole-word-masking'
)

# Move the model to the selected device.
# As the last expression of the cell, this also displays the module structure.
model.to(device)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(32000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [97]:
# Tokenize the text data.
from transformers import BertJapaneseTokenizer

# BertJapaneseTokenizer is used here; the generic BertTokenizer was tried
# earlier and replaced.
tokenizer = BertJapaneseTokenizer.from_pretrained(
    'cl-tohoku/bert-base-japanese-whole-word-masking'
)

# Short sample phrases to compare against each other.
comment_sentence_list = [
    "鞍替えしています",
    '利用していない',
    'したが為に',
]

# Encode the whole batch as PyTorch tensors, padding shorter sentences to the
# longest one, then move the tensors to the same device as the model.
inputs = tokenizer(comment_sentence_list, padding=True, return_tensors="pt")
inputs = inputs.to(device)

In [98]:
# Token id at position 1 of the second sentence (index 1 in the batch).
inputs['input_ids'][1, 1]

tensor(666, device='cuda:0')

In [99]:
# Token-id matrix for the whole batch: one row per sentence, padded to equal length.
inputs['input_ids']

tensor([[   2, 9247, 2331,   15,   16,   21, 2610,    3],
        [   2,  666,   15,   16,   21,   80,    3,    0],
        [   2,   15,   10,   14, 1863,    7,    3,    0]], device='cuda:0')

In [100]:
# Decode every row of token ids back to text to inspect the tokenization,
# including the special tokens.
for token_ids in inputs['input_ids']:
    print(tokenizer.decode(token_ids))

[CLS] 鞍替え し て い ます [SEP]
[CLS] 利用 し て い ない [SEP] [PAD]
[CLS] し た が 為 に [SEP] [PAD]


In [101]:
# Feed the tokenized batch to the model to encode the text.
# This is inference only, so autograd is disabled: no gradient graph is built,
# which saves memory and leaves the outputs without grad history.
with torch.no_grad():
    outputs = model(**inputs)

# Show which outputs the model returned and the shape of the hidden states.
print(outputs.keys())
print(outputs.last_hidden_state.shape)

odict_keys(['last_hidden_state', 'pooler_output'])
torch.Size([3, 8, 768])


In [102]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Sentence-level vectors: one pooler_output row per input sentence, moved to
# host memory and copied into a NumPy array for scikit-learn.
# (An earlier experiment compared last_hidden_state[i][1] vectors instead.)
sentence_vectors = outputs.pooler_output.to('cpu').detach().numpy().copy()

# Cosine similarity between the first sentence and each of the others.
# Looping over the rows replaces the copy-pasted per-index blocks, so the cell
# keeps working when sentences are added to or removed from the batch.
a = sentence_vectors[0]
for b in sentence_vectors[1:]:
    similarity = cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))
    print(similarity)

[[0.846479]]
[[0.78406954]]


"\na = outputs.pooler_output[0].to('cpu').detach().numpy().copy()\nb = outputs.pooler_output[3].to('cpu').detach().numpy().copy()\nsimilarity = cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))\nprint(similarity)\n\na = outputs.pooler_output[0].to('cpu').detach().numpy().copy()\nb = outputs.pooler_output[4].to('cpu').detach().numpy().copy()\nsimilarity = cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))\nprint(similarity)\n"