In [10]:
import torch

print(torch.cuda.is_available())

True


In [11]:
import torch
from transformers import BertModel, BertTokenizer

# GPUが利用可能な場合はGPUを、利用できない場合はCPUを使用する
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# BERTの事前学習済みモデルを読み込む
model = BertModel.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

# モデルをデバイスに転送する
model.to(device)

# 以下でモデルを使用するコードを記述する



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [93]:
# テキストデータをトークナイズする
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer(["私はご飯を食べるのが好きです。", "ゲームは宿題の後です。"], return_tensors="pt", padding=True).to(device)
inputs

{'input_ids': tensor([[  101,   100,  1672, 30181,   100,  1690,  1978,  1675, 30213, 30197,
         30177,   100,  1652, 30191, 30184,  1636,   102],
        [  101,  1703, 30265, 30251, 30198,  1826,   100,  1671,  1846,  1665,
         30184,  1636,   102,     0,     0,     0,     0]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]], device='cuda:0')}

In [94]:
# モデルに入力を与えて、テキストデータをエンコードする
outputs = model(**inputs)
    
print(outputs.keys())
print(outputs.pooler_output.shape)
print(outputs.last_hidden_state.shape)

odict_keys(['last_hidden_state', 'pooler_output'])
torch.Size([2, 768])
torch.Size([2, 17, 768])


In [95]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 文章の分散表現
a = outputs.pooler_output[0].to('cpu').detach().numpy().copy()
b = outputs.pooler_output[1].to('cpu').detach().numpy().copy()
similarity = cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))
print(similarity)


# 単語の分散表現
a = outputs.last_hidden_state[0].to('cpu').detach().numpy().copy()
b = outputs.last_hidden_state[1].to('cpu').detach().numpy().copy()

similarity = cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))
print(similarity)

[[0.9456686]]
[[0.5186821]]


In [9]:
print(a.reshape(1, -1).shape)

(1, 13056)
