In [32]:
from glob import glob
from typing import List, Tuple

from transformers import AutoTokenizer, AutoModel

In [33]:
tokenizer_path = "cl-tohoku/bert-base-japanese-whole-word-masking"
model_path = "bandainamco-mirai/distilbert-base-japanese"

In [35]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
model = AutoModel.from_pretrained(model_path)

In [25]:
def load_texts(paths: List[str]) -> List[str]:
    texts = []
    for p in paths:
        with open(p, 'rt') as f:
            text = f.read()
            if text.find('\n') != -1:
                text = text[text.find('\n'):]
            if text.find('\n') != -1:
                text = text[text.find('\n'):]
            texts.append(text)
    return texts

In [26]:
it_paths = glob('text/it-life-hack/*.txt')
it_texts = load_texts(it_paths)

In [27]:
kaden_paths = glob('text/kaden-channel/*.txt')
kaden_texts = load_texts(kaden_paths)

In [20]:
dokujo_paths = glob('text/dokujo-tsushin/*.txt')
dokujo_texts = load_texts(dokujo_paths)

In [21]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [51]:
def vectorize(text: str) -> np.ndarray:
    inputs = tokenizer(text, return_tensors='pt', padding=True)
    for key in inputs.keys():
        inputs[key] = inputs[key][:, :512]
    outputs = model(inputs.input_ids, inputs.attention_mask)
    output = outputs.last_hidden_state
    sentence_vector = output[0].mean(axis=0).detach().numpy().reshape(1, -1)
    return sentence_vector

In [52]:
cosine_similarity(vectorize(it_texts[0]), vectorize(it_texts[1]))[0][0]

0.9965112

In [53]:
cosine_similarity(vectorize(it_texts[0]), vectorize(kaden_texts[0]))[0][0]

0.9947234

In [55]:
cosine_similarity(vectorize(it_texts[0]), vectorize(dokujo_texts[0]))[0][0]

0.9955975