In [2]:
from glob import glob
from typing import List, Tuple

from transformers import BertJapaneseTokenizer, BertModel

In [1]:
pre_train_path = 'cl-tohoku/bert-base-japanese-whole-word-masking'

In [3]:
tokenizer = BertJapaneseTokenizer.from_pretrained(pre_train_path)
model = BertModel.from_pretrained(pre_train_path)

In [31]:
def load_texts(paths: List[str]) -> List[str]:
    texts = []
    for p in paths:
        with open(p, 'rt') as f:
            text = f.read()
            if text.find('\n') != -1:
                text = text[text.find('\n'):]
            if text.find('\n') != -1:
                text = text[text.find('\n'):]
            texts.append(text)
    return texts

In [32]:
it_paths = glob('text/it-life-hack/*.txt')
it_texts = load_texts(it_paths)

In [33]:
kaden_paths = glob('text/kaden-channel/*.txt')
kaden_texts = load_texts(kaden_paths)

In [34]:
dokujo_paths = glob('text/dokujo-tsushin/*.txt')
dokujo_texts = load_texts(dokujo_paths)

In [35]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [70]:
def vectorize(text: str) -> np.ndarray:
    inputs = tokenizer(text, return_tensors='pt', padding=True)
    for key in inputs.keys():
        inputs[key] = inputs[key][:, :512]
    outputs = model(**inputs)
    output = outputs.last_hidden_state
    sentence_vector = output[0].mean(axis=0).detach().numpy().reshape(1, -1)
    return sentence_vector

In [73]:
cosine_similarity(vectorize(it_texts[0]), vectorize(it_texts[1]))[0][0]

0.94478273

In [74]:
cosine_similarity(vectorize(it_texts[0]), vectorize(kaden_texts[0]))[0][0]

0.9339549

In [75]:
cosine_similarity(vectorize(it_texts[0]), vectorize(dokujo_texts[0]))[0][0]

0.9248747