In [1]:
import pandas as pd
import numpy as np
import time
from MyNLP import Divider

In [2]:
import torch
from transformers import BertModel, BertTokenizer

# Pick the compute device: the GPU when CUDA is available, the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pre-trained Japanese BERT model.
model = BertModel.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

# Move the model's parameters onto the selected device.
# This is the cell's last expression, so the notebook displays the module structure.
model.to(device)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(32000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [8]:
# Tokenize the text data.
from transformers import BertJapaneseTokenizer

tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')
comment_sentence_list = ["代わりに自分は原則としてヨドバシしか利用していない。", "日本人"]

# padding=True pads every sentence to the longest one in the batch so the ids
# form a rectangular tensor.
# truncation/max_length cap each sequence at 512 tokens: the model's position
# embedding table has 512 entries, so a longer sequence would fail in the
# forward pass instead of being encoded.
inputs = tokenizer(
    comment_sentence_list,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
).to(device)

In [9]:
# Feed the tokenized inputs to the model in mini-batches and collect the
# encoded hidden states as NumPy arrays.

input_size = 10  # number of sentences sent through the model per forward pass
outputs = []

# Make sure dropout is inactive so the encodings are deterministic.
model.eval()

# Inference only: disabling autograd avoids building a computation graph for
# every batch, which would otherwise hold activations in memory for no benefit.
with torch.no_grad():
    for i in range(0, len(comment_sentence_list), input_size):
        input_dict = {
            'input_ids': inputs['input_ids'][i:i+input_size],
            'token_type_ids': inputs['token_type_ids'][i:i+input_size],
            'attention_mask': inputs['attention_mask'][i:i+input_size],
        }
        encoded_outputs = model(**input_dict)
        # extend() iterates over the first (batch) axis, so every sentence
        # contributes one array of per-token vectors to `outputs`.
        outputs.extend(encoded_outputs.last_hidden_state.cpu().numpy())

In [26]:
# Flag every token position: 1 for a word token, 0 for a special token.
# Token ids listed here are treated as special tokens and excluded from the
# word vectors (presumably [PAD], [CLS] and [SEP] in this vocabulary -- TODO
# confirm against the tokenizer's pad/cls/sep token ids).
SPECIAL_TOKEN_IDS = [0, 2, 3]

# Copy the ids to host memory once, rather than once per token, and build the
# whole mask with a single vectorized membership test.
input_ids_array = inputs.input_ids.cpu().numpy()
word_token_mask = ~np.isin(input_ids_array, SPECIAL_TOKEN_IDS)

# Same structure as before: one list of 0/1 flags per sentence.
index_list = word_token_mask.astype(int).tolist()

# From the BERT outputs keep only the vectors that belong to word tokens.
# Each entry of `word_only_inputs` is a list of per-token vectors for one sentence.
word_only_inputs = [
    list(output_by_sentence[sentence_mask])
    for output_by_sentence, sentence_mask in zip(outputs, word_token_mask)
]

In [29]:
# Collapse each sentence's list of word vectors into one sentence vector by
# averaging the word vectors element-wise.
sentence_vectors = [
    np.mean(vectors_of_sentence, axis=0)
    for vectors_of_sentence in word_only_inputs
]

(2, 768)

In [None]:
# Compute the cosine similarity for every pair of comments (brute force).

from sklearn.metrics.pairwise import cosine_similarity

# Each entry: [id1, id2, comment text of id1, comment text of id2, similarity],
# with id1 < id2 so every unordered pair appears exactly once.
pair_list = []
num_comments = len(sentence_vectors)

# Compute all similarities in one call instead of one call per ordered pair.
# np.vstack needs at least one row, so skip it when there are no comments.
if num_comments > 0:
    similarity_matrix = cosine_similarity(np.vstack(sentence_vectors))

for id1 in range(num_comments):
    # Only look at id2 > id1: this skips self-pairs and mirrored pairs without
    # having to search pair_list for an already-recorded reverse entry.
    for id2 in range(id1 + 1, num_comments):
        pair_list.append([
            id1,
            id2,
            comment_sentence_list[id1],
            comment_sentence_list[id2],
            similarity_matrix[id1, id2],
        ])
    # Progress indicator: one line per processed comment.
    print(id1)