<a href="https://colab.research.google.com/github/tkoba1216/Transformer_scratch/blob/main/transformer_revised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch as T
import torch.nn as nn
#from torchtext.legacy import data, datasets
import torch.optim as optim
import time

In [2]:
# 必要なライブラリインストール
!pip install transformers datasets torch

# データロード＆前処理
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

# Initialize the tokenizer from the pretrained "bert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Preprocessing function (automatic padding and truncation)
def preprocess(examples):
    """Tokenize the ``"text"`` entry of a batch of examples.

    Every sequence is padded to, and truncated at, 512 tokens, and the
    tokenizer is asked to return PyTorch tensors.

    Args:
        examples: Mapping that provides the raw reviews under ``"text"``.

    Returns:
        The result of calling the module-level ``tokenizer`` with these
        settings.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
        "return_tensors": "pt",
    }
    texts = examples["text"]
    return tokenizer(texts, **tokenizer_options)

# Load the IMDb dataset and split it
def load_and_split_data():
    """Load IMDb and carve its ``"train"`` split into train/valid/test.

    The ``"train"`` split is divided 80/20 (stratified on ``"label"``), and
    the 20% hold-out is then halved (again stratified) into validation and
    test parts. Both splits use ``seed=42``. Only ``dataset["train"]`` is
    read; the other splits returned by ``load_dataset`` are not used here.

    Returns:
        A ``DatasetDict`` with the keys ``"train"``, ``"valid"`` and
        ``"test"``.
    """
    # Options shared by both stratified splits
    split_options = {"stratify_by_column": "label", "seed": 42}

    imdb = load_dataset("imdb")

    # 80% for training, 20% held out for validation + test
    outer_split = imdb["train"].train_test_split(test_size=0.2, **split_options)
    training_part = outer_split["train"]
    held_out_part = outer_split["test"]

    # Halve the hold-out into validation and test
    inner_split = held_out_part.train_test_split(test_size=0.5, **split_options)

    return DatasetDict({
        "train": training_part,
        "valid": inner_split["train"],
        "test": inner_split["test"],
    })

# Dataset preparation: split, tokenize in batches of 32, expose the tensor columns
dataset = load_and_split_data().map(preprocess, batched=True, batch_size=32)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Data loaders (pin_memory=True for faster transfers to GPU memory)
BATCH_SIZE = 4
train_loader = DataLoader(dataset["train"], shuffle=True, batch_size=BATCH_SIZE, pin_memory=True)
valid_loader = DataLoader(dataset["valid"], batch_size=BATCH_SIZE, pin_memory=True)
test_loader = DataLoader(dataset["test"], batch_size=BATCH_SIZE, pin_memory=True)

# Sanity check: print the tensor shapes of one training batch
sample_batch = next(iter(train_loader))
print(f"Input IDs shape: {sample_batch['input_ids'].shape}")
print(f"Attention Mask shape: {sample_batch['attention_mask'].shape}")
print(f"Labels shape: {sample_batch['label'].shape}")


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupt

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Input IDs shape: torch.Size([4, 512])
Attention Mask shape: torch.Size([4, 512])
Labels shape: torch.Size([4])


In [3]:
import torch as T
import torch.nn as nn
import torch.optim as optim
import time
from torch.nn.utils.rnn import pad_sequence

# Device selection: use CUDA when it is available, otherwise fall back to the CPU
device = T.device('cuda' if T.cuda.is_available() else 'cpu')

# Model architecture --------------------------------------------------
class MHSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Queries, keys and values are bias-free linear projections of the same
    input; the concatenated heads go through a final linear layer.

    Args:
        embed_dim: Size of the last dimension of the input and the output.
        num_heads: Number of attention heads. Must be positive and divide
            ``embed_dim`` evenly.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``embed_dim``.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        # Fail early with a readable message instead of an opaque ``view``
        # shape error during the first forward pass.
        if num_heads <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.w_queries = nn.Linear(embed_dim, embed_dim, bias=False)
        self.w_keys = nn.Linear(embed_dim, embed_dim, bias=False)
        self.w_values = nn.Linear(embed_dim, embed_dim, bias=False)
        self.fc_out = nn.Linear(embed_dim, embed_dim, bias=True)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, embed_dim)``.
            mask: Optional tensor broadcastable to
                ``(batch_size, num_heads, seq_len, seq_len)``. Entries equal
                to 0 mark positions that must not be attended to.

        Returns:
            Tensor of shape ``(batch_size, seq_len, embed_dim)``.
        """
        batch_size, seq_len, _ = x.shape

        def split_heads(projected):
            # (batch, seq, embed) -> (batch, heads, seq, head_dim)
            return projected.view(
                batch_size, seq_len, self.num_heads, self.head_dim
            ).transpose(1, 2)

        Q = split_heads(self.w_queries(x))
        K = split_heads(self.w_keys(x))
        V = split_heads(self.w_values(x))

        # Scaled dot-product scores. A Python float is used for the scale so
        # no tensor is allocated per call and the dtype of ``scores`` is kept.
        scores = T.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)

        if mask is not None:
            # Use the most negative value the score dtype can represent;
            # a fixed -1e9 does not fit into float16.
            scores = scores.masked_fill(mask == 0, T.finfo(scores.dtype).min)

        attention = T.softmax(scores, dim=-1)
        out = T.matmul(attention, V)

        # Merge the heads back into one feature dimension and project
        out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, self.embed_dim)
        return self.fc_out(out)

class TransformerEncoderLayer(nn.Module):
    """One encoder block: self-attention and a feed-forward net, each followed
    by dropout, a residual connection and layer normalization.

    Args:
        embed_dim: Feature size of the input and the output.
        num_heads: Number of attention heads passed to ``MHSelfAttention``.
        forward_expansion: The hidden width of the feed-forward network is
            ``embed_dim * forward_expansion``.
        dropout: Dropout probability applied to both sub-layer outputs.
            Defaults to 0.1, the value that used to be hard-coded.
    """

    def __init__(self, embed_dim, num_heads, forward_expansion, dropout=0.1):
        super().__init__()
        self.attention = MHSelfAttention(embed_dim, num_heads)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, embed_dim * forward_expansion),
            nn.ReLU(),
            nn.Linear(embed_dim * forward_expansion, embed_dim)
        )
        self.norm2 = nn.LayerNorm(embed_dim)
        # A single dropout module is shared by both sub-layers
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is forwarded to the attention module.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Attention sub-layer: residual add, then normalize
        attn_output = self.attention(x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        # Feed-forward sub-layer: residual add, then normalize
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

class TransformerEncoder(nn.Module):
    """A stack of ``num_layers`` identically configured encoder layers.

    Args:
        embed_dim: Feature size handed to every layer.
        num_heads: Number of attention heads handed to every layer.
        forward_expansion: Feed-forward expansion factor handed to every layer.
        num_layers: How many layers to stack (default 3).
    """

    def __init__(self, embed_dim, num_heads, forward_expansion, num_layers=3):
        super().__init__()
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(
                TransformerEncoderLayer(embed_dim, num_heads, forward_expansion)
            )

    def forward(self, x, mask=None):
        """Feed ``x`` through the layers in order, passing ``mask`` to each."""
        hidden = x
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, mask)
        return hidden

class Embedding(nn.Module):
    """Sum of learned token and learned position embeddings, followed by dropout.

    Args:
        vocab_size: Number of rows in the token embedding table.
        max_length: Number of rows in the position embedding table, i.e. the
            longest sequence that can be embedded.
        embed_dim: Size of each embedding vector.
        dropout: Dropout probability applied to the summed embeddings.
            Defaults to 0.1, the value that used to be hard-coded.
    """

    def __init__(self, vocab_size, max_length, embed_dim, dropout=0.1):
        super().__init__()
        self.word_embed = nn.Embedding(vocab_size, embed_dim)
        self.pos_embed = nn.Embedding(max_length, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Embed a batch of token ids.

        Args:
            x: Integer tensor of shape ``(batch_size, seq_len)``.

        Returns:
            Tensor of shape ``(batch_size, seq_len, embed_dim)``.

        Raises:
            ValueError: If ``seq_len`` exceeds the size of the position table.
        """
        seq_len = x.size(1)
        max_length = self.pos_embed.num_embeddings
        # Check before the lookup: an out-of-range position index otherwise
        # surfaces as an opaque indexing error (a device-side assert on CUDA).
        if seq_len > max_length:
            raise ValueError(
                f"sequence length {seq_len} exceeds the maximum supported "
                f"length {max_length}"
            )
        # Position ids 0..seq_len-1, repeated for every row of the batch
        positions = T.arange(0, seq_len, device=x.device).expand(x.size(0), seq_len)
        embeddings = self.word_embed(x) + self.pos_embed(positions)
        return self.dropout(embeddings)

class Classifier(nn.Module):
    """Binary sequence classifier: embedding -> Transformer encoder ->
    max-pooling over the sequence -> single-logit linear layer.

    Args:
        vocab_size: Vocabulary size passed to ``Embedding``.
        max_length: Maximum sequence length passed to ``Embedding``; also
            stored as ``self.max_length``.
        embed_dim: Feature size used throughout the model.
        num_heads: Number of attention heads in each encoder layer.
        forward_expansion: Feed-forward expansion factor in each encoder layer.
    """

    def __init__(self, vocab_size, max_length, embed_dim, num_heads, forward_expansion):
        super().__init__()
        self.embedder = Embedding(vocab_size, max_length, embed_dim)
        self.encoder = TransformerEncoder(embed_dim, num_heads, forward_expansion)
        self.fc = nn.Linear(embed_dim, 1)
        self.max_length = max_length

    def forward(self, input_ids, attention_mask=None):
        """Compute one logit per sequence.

        Args:
            input_ids: Token ids of shape ``(batch_size, seq_len)``.
            attention_mask: Optional ``(batch_size, seq_len)`` tensor in which
                0 marks positions to ignore. When omitted, token id 0 is
                assumed to be padding.

        Returns:
            Tensor of shape ``(batch_size,)`` holding raw logits.
        """
        # Input shape: (batch_size, seq_len)
        x = input_ids.long()

        # Build the mask when none is given (assumes id 0 is padding)
        if attention_mask is None:
            attention_mask = (x != 0).float()

        # Embedding layer -> (batch_size, seq_len, embed_dim)
        embedding = self.embedder(x)

        # Transformer encoder; the mask is reshaped to (batch, 1, 1, seq_len)
        # so it broadcasts over heads and query positions.
        encoder_output = self.encoder(embedding, attention_mask.unsqueeze(1).unsqueeze(2))

        # Pooling: element-wise max over the sequence dimension.
        # NOTE(review): masked positions still take part in this max, because
        # the mask is only handed to the encoder.
        compact_encoding, _ = encoder_output.max(dim=1)

        # Output layer -> (batch_size, 1)
        out = self.fc(compact_encoding)
        # Drop only the trailing feature dimension. A bare squeeze() would
        # also drop the batch dimension for a batch of one and return a
        # 0-dim tensor that no longer matches labels of shape (1,).
        return out.squeeze(-1)

# Training utilities --------------------------------------------------
def binary_accuracy(preds, y):
    """Return the fraction of predictions that agree with the targets.

    ``preds`` are treated as raw logits: they are passed through a sigmoid
    and rounded before being compared element-wise with ``y``.

    Returns:
        A 0-dim float tensor holding the mean of the matches.
    """
    probabilities = T.sigmoid(preds)
    hard_labels = probabilities.round()
    matches = hard_labels.eq(y)
    return matches.float().mean()

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes
    and the remaining whole seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints.
    """
    total_seconds = end_time - start_time
    minutes = int(total_seconds / 60)
    seconds = int(total_seconds % 60)
    return minutes, seconds

def train(model, iterator, optimizer, criterion):
    """Train ``model`` for one pass over ``iterator``.

    Args:
        model: Module called as ``model(inputs, mask)``; must expose a
            ``max_length`` attribute.
        iterator: Any iterable of batches (``len()`` is not required). Each
            batch is a mapping with ``'input_ids'`` and ``'label'`` tensors.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(predictions, labels)``. The
            epoch loss is weighted by batch size, which assumes the criterion
            returns a per-batch mean.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)`` of Python floats, averaged over
        samples so that a shorter final batch is not over-weighted.

    Raises:
        ValueError: If ``iterator`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    acc_sum = 0.0
    n_samples = 0

    for batch in iterator:
        optimizer.zero_grad()

        inputs = batch['input_ids'].to(device)
        labels = batch['label'].float().to(device)

        # Enforce the model's maximum sequence length
        if inputs.size(1) > model.max_length:
            inputs = inputs[:, :model.max_length]

        # Padding mask: token id 0 is treated as padding
        mask = (inputs != 0).float()

        predictions = model(inputs, mask)
        loss = criterion(predictions, labels)
        acc = binary_accuracy(predictions, labels)

        loss.backward()
        # Clip the gradient norm to 1.0 before the update
        T.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        batch_samples = labels.size(0)
        loss_sum += loss.item() * batch_samples
        acc_sum += acc.item() * batch_samples
        n_samples += batch_samples

    if n_samples == 0:
        raise ValueError("train() received an iterator that yielded no samples")

    return loss_sum / n_samples, acc_sum / n_samples

def evaluate(model, iterator, criterion):
    """Evaluate ``model`` on ``iterator`` without computing gradients.

    Args:
        model: Module called as ``model(inputs, mask)``; must expose a
            ``max_length`` attribute.
        iterator: Any iterable of batches (``len()`` is not required). Each
            batch is a mapping with ``'input_ids'`` and ``'label'`` tensors.
        criterion: Loss called as ``criterion(predictions, labels)``. The
            loss is weighted by batch size, which assumes the criterion
            returns a per-batch mean.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)`` of Python floats, averaged over
        samples so that a shorter final batch is not over-weighted.

    Raises:
        ValueError: If ``iterator`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    acc_sum = 0.0
    n_samples = 0

    with T.no_grad():
        for batch in iterator:
            inputs = batch['input_ids'].to(device)
            labels = batch['label'].float().to(device)

            # Enforce the model's maximum sequence length
            if inputs.size(1) > model.max_length:
                inputs = inputs[:, :model.max_length]

            # Padding mask: token id 0 is treated as padding
            mask = (inputs != 0).float()

            predictions = model(inputs, mask)
            loss = criterion(predictions, labels)
            acc = binary_accuracy(predictions, labels)

            batch_samples = labels.size(0)
            loss_sum += loss.item() * batch_samples
            acc_sum += acc.item() * batch_samples
            n_samples += batch_samples

    if n_samples == 0:
        raise ValueError("evaluate() received an iterator that yielded no samples")

    return loss_sum / n_samples, acc_sum / n_samples



データ用意

In [5]:
# Data loaders (pin_memory=True for faster transfers to GPU memory)
BATCH_SIZE = 32
train_loader = DataLoader(dataset["train"], shuffle=True, batch_size=BATCH_SIZE, pin_memory=True)
valid_loader = DataLoader(dataset["valid"], batch_size=BATCH_SIZE, pin_memory=True)
test_loader = DataLoader(dataset["test"], batch_size=BATCH_SIZE, pin_memory=True)

# Sanity check: print the tensor shapes of one training batch
sample_batch = next(iter(train_loader))
print(f"Input IDs shape: {sample_batch['input_ids'].shape}")
print(f"Attention Mask shape: {sample_batch['attention_mask'].shape}")
print(f"Labels shape: {sample_batch['label'].shape}")

Input IDs shape: torch.Size([32, 512])
Attention Mask shape: torch.Size([32, 512])
Labels shape: torch.Size([32])


In [6]:
# Configuration --------------------------------------------------
EMBED_DIM = 200
NUM_HEADS = 8
FORWARD_EXPANSION = 3
MAX_LENGTH = 512
VOCAB_SIZE = 30522  # vocabulary size of the BERT-base tokenizer

# Build the model and move it to the selected device
classifier = Classifier(
    VOCAB_SIZE, MAX_LENGTH, EMBED_DIM, NUM_HEADS, FORWARD_EXPANSION
).to(device)

optimizer = optim.AdamW(classifier.parameters(), lr=1e-4)
criterion = nn.BCEWithLogitsLoss().to(device)

# Training loop --------------------------------------------------
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    # Progress markers printed before and after the training pass
    print("epoch")
    print(epoch)

    train_loss, train_acc = train(classifier, train_loader, optimizer, criterion)
    print("train fin")
    valid_loss, valid_acc = evaluate(classifier, valid_loader, criterion)

    # The measured time covers both the training and the validation pass
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint the weights whenever the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        T.save(classifier.state_dict(), 'best-model.pt')

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')


epoch
0
train fin
Epoch: 01 | Time: 1m 41s
	Train Loss: 0.631 | Train Acc: 63.59%
	 Val. Loss: 0.525 |  Val. Acc: 74.29%
epoch
1
train fin
Epoch: 02 | Time: 1m 41s
	Train Loss: 0.495 | Train Acc: 76.06%
	 Val. Loss: 0.493 |  Val. Acc: 76.74%
epoch
2
train fin
Epoch: 03 | Time: 1m 40s
	Train Loss: 0.431 | Train Acc: 79.98%
	 Val. Loss: 0.461 |  Val. Acc: 78.05%
epoch
3
train fin
Epoch: 04 | Time: 1m 40s
	Train Loss: 0.387 | Train Acc: 82.81%
	 Val. Loss: 0.414 |  Val. Acc: 80.58%
epoch
4
train fin
Epoch: 05 | Time: 1m 40s
	Train Loss: 0.347 | Train Acc: 84.90%
	 Val. Loss: 0.398 |  Val. Acc: 82.04%
epoch
5
train fin
Epoch: 06 | Time: 1m 40s
	Train Loss: 0.316 | Train Acc: 86.53%
	 Val. Loss: 0.378 |  Val. Acc: 83.11%
epoch
6
train fin
Epoch: 07 | Time: 1m 40s
	Train Loss: 0.287 | Train Acc: 88.09%
	 Val. Loss: 0.378 |  Val. Acc: 83.70%
epoch
7
train fin
Epoch: 08 | Time: 1m 40s
	Train Loss: 0.263 | Train Acc: 89.24%
	 Val. Loss: 0.386 |  Val. Acc: 84.06%
epoch
8
train fin
Epoch: 09 | Ti

T4 　BATCH_SIZE = 4　epoch
0
train fin
Epoch: 01 | Time: 2m 17s
	Train Loss: 0.646 | Train Acc: 61.55%
	 Val. Loss: 0.573 |  Val. Acc: 70.96%
epoch
1

In [5]:
# Display the dataset object wrapped by the training loader (features and row count)
train_loader.dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 20000
})

動作確認 >> 動いていることを確認できたので、実際にGPUで試す。

In [9]:
# Configuration --------------------------------------------------
EMBED_DIM = 200
NUM_HEADS = 8
FORWARD_EXPANSION = 3
MAX_LENGTH = 512
VOCAB_SIZE = 30522
# Fresh model, optimizer and loss for a short smoke test
classifier = Classifier(
    VOCAB_SIZE, MAX_LENGTH, EMBED_DIM, NUM_HEADS, FORWARD_EXPANSION
).to(device)
optimizer = optim.AdamW(classifier.parameters(), lr=2e-5)
criterion = nn.BCEWithLogitsLoss().to(device)

# Create an iterator from which only 8 batches are consumed
limited_iterator = iter(train_loader)

for _ in range(8):
    batch = next(limited_iterator)
    # Run one training step; the batch is wrapped in a list so that
    # train() sees a one-element iterable
    train(classifier, [batch], optimizer, criterion)