# BERTで文をベクトル化して類似度を求めるコード
ブログ：https://techblog.gmo-ap.jp/2022/12/21/bert_calc_sentence_similarity/

# 初期設定

## 必要なライブラリのダウンロード

In [1]:
!pip install transformers
!pip install sentence_transformers
!pip install fugashi[unidic-lite]
!pip install ipadic

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m77.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m76.2 MB/s[0m eta [36m0:00:00[0m
Col

## ライブラリのimport

In [2]:
from transformers import BertJapaneseTokenizer, BertModel

from sentence_transformers import SentenceTransformer
from sentence_transformers import models

import torch
import numpy as np

## よく使う変数の定義

In [3]:
# Name of the pretrained BERT checkpoint used in this notebook
MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'

# Load the tokenizer and the encoder for that checkpoint. These are the
# module-level `tokenizer` / `model` objects that the later cells pass to
# sentence_to_vector and that calc_similarity reads as globals.
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

Downloading (…)okenizer_config.json:   0%|          | 0.00/110 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/258k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/479 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

# 文のベクトル化

In [8]:
def sentence_to_vector(model, tokenizer, sentence):
  """Embed a sentence as the mean of the model's last-layer token vectors.

  Args:
    model: Encoder called as ``model(input_ids)``. Its output must expose
      ``last_hidden_state`` with a leading batch dimension. If the model has
      ``config.max_position_embeddings``, that value caps the input length.
    tokenizer: Callable that maps a single sentence to a mapping containing
      an ``'input_ids'`` list of token ids.
    sentence: The text to embed.

  Returns:
    A 1-D tensor: the average, over every token position the tokenizer
    produced, of the last hidden state.
  """
  # A sequence longer than the model's position-embedding table makes the
  # forward pass fail, so truncate to that size whenever the model reports it.
  config = getattr(model, 'config', None)
  max_length = getattr(config, 'max_position_embeddings', None)
  if max_length is None:
    encoded = tokenizer(sentence)
  else:
    encoded = tokenizer(sentence, truncation=True, max_length=max_length)

  # The model expects a batch dimension: shape (1, number_of_tokens).
  input_ids = torch.tensor(encoded['input_ids']).reshape(1, -1)

  # Inference only, so skip gradient tracking. Only the final layer is needed,
  # so the per-layer hidden states are not requested.
  with torch.no_grad():
    outputs = model(input_ids)
    last_hidden_state = outputs.last_hidden_state[0]

  # Mean-pool over the token axis to get one fixed-size vector.
  return last_hidden_state.mean(dim=0)

In [11]:
# Vectorize one example sentence with the model and tokenizer loaded above
sentence = "我輩は猫である。"
sentence_vector = sentence_to_vector(model, tokenizer, sentence)

# 文の類似度計算

In [12]:
def calc_similarity(sentence1, sentence2):
  """Print two sentences followed by the cosine similarity of their vectors.

  Both sentences are embedded with ``sentence_to_vector`` using the
  notebook-level ``model`` and ``tokenizer``. Nothing is returned; the
  result is only printed.
  """
  # Echo the pair being compared, one sentence per line
  print(f"{sentence1}\n{sentence2}")

  # Embed the first sentence, then the second
  vec_a, vec_b = (
      sentence_to_vector(model, tokenizer, text)
      for text in (sentence1, sentence2)
  )

  # Cosine similarity along the only axis of the two 1-D vectors
  cosine = torch.nn.functional.cosine_similarity(vec_a, vec_b, dim=0)
  score = cosine.detach().numpy().copy()
  print("類似度：", score)

In [14]:
# First pair to compare
sentence1 = "吾輩は猫である"

sentence2 = "私は猫です"

# Prints both sentences and their cosine similarity
calc_similarity(sentence1, sentence2)

吾輩は猫である
私は猫です
類似度： 0.85347897


In [15]:
# Second comparison: reuses sentence1 from the earlier cell, so that cell must run first
sentence3 = "お昼ご飯にカレーが食べたい"

calc_similarity(sentence1, sentence3)

吾輩は猫である
お昼ご飯にカレーが食べたい
類似度： 0.76310676
