In [2]:
from transformers import BertTokenizer, BertForMaskedLM
from transformers import BertModel
import torch
import torch.nn.functional as F
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity

# モデルとトークナイザーの読み込み
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model_mlm = BertForMaskedLM.from_pretrained(model_name)
model_bert = BertModel.from_pretrained(model_name)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at bert

８０．トークン化

In [3]:
text = "the movie was full of incomprehensibllities."
tokens = tokenizer.tokenize(text)
print(tokens)

['the', 'movie', 'was', 'full', 'of', 'inc', '##omp', '##re', '##hen', '##si', '##bl', '##lit', '##ies', '.']


８１．マスクの予測

In [4]:
text = "The movie was full of [MASK]"
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    outputs = model_mlm(**inputs)
logits = outputs.logits
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
predicted_token_id = logits[0, mask_token_index].argmax(dim=-1)
predicted_token = tokenizer.decode(predicted_token_id)
print(predicted_token)

.


８２．マスクのtop-k予測

In [6]:
top_k = 10
mask_logits = logits[0, mask_token_index, :]
probs = F.softmax(mask_logits, dim=-1)
topk_probs, topk_indices = torch.topk(probs, top_k, dim=-1)

for i in range(top_k):
    token = tokenizer.decode(topk_indices[0, i])
    prob = topk_probs[0, i].item()
    print(f"{i+1}: {token} ({prob:.4f})")

1: . (0.9260)
2: ; (0.0389)
3: ! (0.0300)
4: ? (0.0035)
5: ... (0.0005)
6: | (0.0002)
7: - (0.0001)
8: s t u f f (0.0000)
9: t h i n g s (0.0000)
10: , (0.0000)


８３．CLSトークンによる文ベクトル

In [8]:
sentences = [
    "The movie was full of fun.",
    "The movie was full of excitement.",
    "The movie was full of crap.",
    "The movie was full of rubbish."
]

def get_cls_embedding(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model_bert(**inputs)
    return outputs.last_hidden_state[0, 0]

cls_vectors = [get_cls_embedding(sent) for sent in sentences]
similarities = cosine_similarity(torch.stack(cls_vectors).numpy())

print("Cosine Similarity Matrix (CLS):")
print(similarities)

Cosine Similarity Matrix (CLS):
[[0.9999998  0.9880608  0.95576596 0.9475324 ]
 [0.9880608  0.99999994 0.9541275  0.94866353]
 [0.95576596 0.9541275  0.99999976 0.9806931 ]
 [0.9475324  0.94866353 0.9806931  1.0000002 ]]


８４．平均による文ベクトル

In [10]:
def get_avg_embedding(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model_bert(**inputs)
    token_embeddings = outputs.last_hidden_state[0]
    return token_embeddings.mean(dim=-1)

avg_vectors = [get_avg_embedding(sent) for sent in sentences]
similarities_avg = cosine_similarity(torch.stack(avg_vectors).numpy())

print("Cosine Similarity Matrix (Avg):")
print(similarities_avg)

Cosine Similarity Matrix (Avg):
[[0.99999994 0.9981602  0.9991019  0.9980094 ]
 [0.9981602  1.         0.9963008  0.9946869 ]
 [0.9991019  0.9963008  0.9999999  0.9988854 ]
 [0.9980094  0.9946869  0.9988854  0.99999994]]
