In [1]:
from transformers import BertTokenizer, BertForMaskedLM
from transformers import BertModel
import torch
import torch.nn.functional as F
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity

# モデルとトークナイザーの読み込み
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model_mlm = BertForMaskedLM.from_pretrained(model_name)
model_bert = BertModel.from_pretrained(model_name)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


８０．トークン化

In [3]:
text = "the movie was full of incomprehensibllities."
tokens = tokenizer.tokenize(text)
print(tokens)

['the', 'movie', 'was', 'full', 'of', 'inc', '##omp', '##re', '##hen', '##si', '##bl', '##lit', '##ies', '.']


８１．マスクの予測

In [4]:
text = "The movie was full of [MASK]"
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    outputs = model_mlm(**inputs)
logits = outputs.logits
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
predicted_token_id = logits[0, mask_token_index].argmax(dim=-1)
predicted_token = tokenizer.decode(predicted_token_id)
print(predicted_token)

.


８２．マスクのtop-k予測

In [6]:
top_k = 10
mask_logits = logits[0, mask_token_index, :]
probs = F.softmax(mask_logits, dim=-1)
topk_probs, topk_indices = torch.topk(probs, top_k, dim=-1)

for i in range(top_k):
    token = tokenizer.decode(topk_indices[0, i])
    prob = topk_probs[0, i].item()
    print(f"{i+1}: {token} ({prob:.4f})")

1: . (0.9260)
2: ; (0.0389)
3: ! (0.0300)
4: ? (0.0035)
5: ... (0.0005)
6: | (0.0002)
7: - (0.0001)
8: s t u f f (0.0000)
9: t h i n g s (0.0000)
10: , (0.0000)


８３．CLSトークンによる文ベクトル

In [8]:
sentences = [
    "The movie was full of fun.",
    "The movie was full of excitement.",
    "The movie was full of crap.",
    "The movie was full of rubbish."
]

def get_cls_embedding(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model_bert(**inputs)
    return outputs.last_hidden_state[0, 0]

cls_vectors = [get_cls_embedding(sent) for sent in sentences]
similarities = cosine_similarity(torch.stack(cls_vectors).numpy())

print("Cosine Similarity Matrix (CLS):")
print(similarities)

Cosine Similarity Matrix (CLS):
[[0.9999998  0.9880608  0.95576596 0.9475324 ]
 [0.9880608  0.99999994 0.9541275  0.94866353]
 [0.95576596 0.9541275  0.99999976 0.9806931 ]
 [0.9475324  0.94866353 0.9806931  1.0000002 ]]


８４．平均による文ベクトル

In [10]:
def get_avg_embedding(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model_bert(**inputs)
    token_embeddings = outputs.last_hidden_state[0]
    return token_embeddings.mean(dim=-1)

avg_vectors = [get_avg_embedding(sent) for sent in sentences]
similarities_avg = cosine_similarity(torch.stack(avg_vectors).numpy())

print("Cosine Similarity Matrix (Avg):")
print(similarities_avg)

Cosine Similarity Matrix (Avg):
[[0.99999994 0.9981602  0.9991019  0.9980094 ]
 [0.9981602  1.         0.9963008  0.9946869 ]
 [0.9991019  0.9963008  0.9999999  0.9988854 ]
 [0.9980094  0.9946869  0.9988854  0.99999994]]


８５．データセットの準備

In [2]:
import pandas as pd
from transformers import BertTokenizer

# SST-2用のBERTトークナイザ（任意のモデルに変更可能）
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# データ読み込み
train_df = pd.read_csv("data/SST-2/train.tsv", sep="\t")
dev_df = pd.read_csv("data/SST-2/dev.tsv", sep="\t")

# 文とラベルを取得
train_texts = train_df["sentence"].tolist()
train_labels = train_df["label"].tolist()

dev_texts = dev_df["sentence"].tolist()
dev_labels = dev_df["label"].tolist()

# トークン化（トークン列へ）
train_tokens = [tokenizer.tokenize(text) for text in train_texts]
dev_tokens = [tokenizer.tokenize(text) for text in dev_texts]

# 確認
print("Train 文の例:", train_texts[0])
print("Train トークン列の例:", train_tokens[0])
print("Train ラベルの例:", train_labels[0])


  from .autonotebook import tqdm as notebook_tqdm


Train 文の例: hide new secretions from the parental units 
Train トークン列の例: ['hide', 'new', 'secret', '##ions', 'from', 'the', 'parental', 'units']
Train ラベルの例: 0


８６．ミニバッチの作成

In [4]:
from transformers import BertTokenizer
import torch

# トークナイザのロード
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# 例として先頭の4文を使用
sample_texts = train_texts[:4]
sample_labels = train_labels[:4]

# トークナイズ + ID化 + パディング（max_length に自動で揃う）
encoding = tokenizer(
    sample_texts,
    padding=True,            # 最長にパディング
    truncation=True,         # 長すぎる場合は切り詰め
    return_tensors="pt"      # PyTorchテンソルで返す
)

# ミニバッチの中身（input_ids, attention_mask, labels）
input_ids = encoding["input_ids"]           # トークンID列
attention_mask = encoding["attention_mask"] # マスク
labels = torch.tensor(sample_labels)        # ラベルをテンソル化

# 結果表示
print("input_ids:\n", input_ids)
print("attention_mask:\n", attention_mask)
print("labels:\n", labels)


input_ids:
 tensor([[  101,  5342,  2047,  3595,  8496,  2013,  1996, 18643,  3197,   102,
             0,     0,     0,     0,     0],
        [  101,  3397,  2053, 15966,  1010,  2069,  4450,  2098, 18201,  2015,
           102,     0,     0,     0,     0],
        [  101,  2008,  7459,  2049,  3494,  1998, 10639,  2015,  2242,  2738,
          3376,  2055,  2529,  3267,   102],
        [  101,  3464, 12580,  8510,  2000,  3961,  1996,  2168,  2802,   102,
             0,     0,     0,     0,     0]])
attention_mask:
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])
labels:
 tensor([0, 0, 1, 0])


８７．ファインチューニング

In [6]:
from transformers import TrainingArguments, Trainer, BertTokenizer, BertForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset
import torch
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # GPU 無効化 → CPU 使用


# モデルをロード（2クラス分類）
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# HuggingFaceのDataset形式に変換
train_dataset = Dataset.from_dict({"sentence": train_texts, "label": train_labels})
dev_dataset = Dataset.from_dict({"sentence": dev_texts, "label": dev_labels})

# トークナイザでトークナイズ（パディングや切り詰めを含めて）
def tokenize_function(examples):
    return tokenizer(
        examples["sentence"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_dev = dev_dataset.map(tokenize_function, batched=True)

# DataCollatorを用意（パディング自動処理）
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# Trainer引数
training_args = TrainingArguments(
    output_dir="./results",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=100,
    save_steps=500,
)

# Trainerを作成
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# 学習
trainer.train()

# 評価
eval_result = trainer.evaluate()
print(f"Validation Accuracy: {eval_result.get('eval_accuracy', 'N/A')}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 67349/67349 [00:16<00:00, 4139.42 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 2780.21 examples/s]
  trainer = Trainer(


Step,Training Loss
100,0.4509
200,0.3133
300,0.2911
400,0.3135
500,0.2924
600,0.2925
700,0.2575
800,0.2694
900,0.2755
1000,0.2533


Validation Accuracy: N/A


８８．極性分析

In [7]:
import os
import torch

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

sentences = [
    "The movie was full of incomprehensibilities.",
    "The movie was full of fun.",
    "The movie was full of excitement.",
    "The movie was full of crap.",
    "The movie was full of rubbish.",
]

# トークナイズ
inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)

# モデルをGPUに移動
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 入力もGPUに移動
inputs = {k: v.to(device) for k, v in inputs.items()}

# 推論
outputs = model(**inputs)
predictions = outputs.logits.argmax(dim=1)

for s, p in zip(sentences, predictions):
    label = "Positive" if p == 1 else "Negative"
    print(f"{s} → {label}")


The movie was full of incomprehensibilities. → Negative
The movie was full of fun. → Positive
The movie was full of excitement. → Positive
The movie was full of crap. → Negative
The movie was full of rubbish. → Negative


８９．アーキテクチャの変更

In [10]:
import torch.nn as nn
from transformers import BertModel
from sklearn.metrics import accuracy_score
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

class BertMaxPool(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.classifier = nn.Linear(self.bert.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state.max(dim=1).values
        logits = self.classifier(pooled)
        log_probs = nn.functional.log_softmax(logits, dim=-1)

        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(log_probs, labels)
            return {"loss": loss, "logits": log_probs}

        return {"logits": log_probs}



def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

model = BertMaxPool()

# Trainerを作成
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics, 
)

# 学習
trainer.train()

# 評価
eval_result = trainer.evaluate()
print(f"Validation Accuracy: {eval_result.get('eval_accuracy', 'N/A')}")


  trainer = Trainer(


Step,Training Loss
100,0.4655
200,0.3354
300,0.2869
400,0.2867
500,0.2866
600,0.3025
700,0.2665
800,0.2752
900,0.2909
1000,0.241


Validation Accuracy: 0.9220183486238532
