<a href="https://colab.research.google.com/github/twyeh/AI-in-education/blob/main/Transformer_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 單純 transformer

In [2]:
import torch
import torch.nn as nn
import math

In [3]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(0)]

In [4]:
class SimpleTransformer(nn.Module):
    def __init__(self, input_dim, d_model, nhead, num_layers, dim_feedforward):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, input_dim)

    def forward(self, src):
        src = self.embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output = self.decoder(output)
        return output

In [14]:
# Example usage
input_dim = 10  # Vocabulary size (0-9)
d_model = 32    # Embedding dimension
nhead = 2       # Number of attention heads
num_layers = 2  # Number of transformer layers
dim_feedforward = 128

model = SimpleTransformer(input_dim, d_model, nhead, num_layers, dim_feedforward)

# Sample input sequence
input_seq = torch.tensor([[1, 2, 3, 4, 5]])  # Batch size of 1, sequence length of 5

# Make a prediction
with torch.no_grad():
    output = model(input_seq)
    prediction = output.argmax(dim=-1)
    print(f"Input sequence: {input_seq}")
    print(f"Predicted next number: {prediction[0, -1].item()}")

Input sequence: tensor([[1, 2, 3, 4, 5]])
Predicted next number: 1


Trained Model + Transormer

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

# Define the RNN model
class RNNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.rnn = nn.RNN(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = torch.zeros(1, x.size(0), self.hidden_dim).to(x.device)

        # Forward pass through RNN
        out, _ = self.rnn(x, h0)

        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out

# Example usage
input_dim = 10  # Vocabulary size (0-9)
hidden_dim = 64  # Hidden dimension of RNN
output_dim = 10  # Output dimension (same as input_dim for prediction)

# Create the RNN model
model = RNNModel(input_dim, hidden_dim, output_dim)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    # Sample input sequence (replace with your actual training data)
    input_seq = torch.randint(0, input_dim, (64, 5))  # Batch size of 64, sequence length of 5

    # Generate target sequence (shifted by one position)
    target_seq = torch.cat([input_seq[:, 1:], input_seq[:, -1:]], dim=1)

    # Zero the gradients
    optimizer.zero_grad()

    # Forward pass
    outputs = model(input_seq)

    # Calculate loss
    loss = criterion(outputs, target_seq.view(-1))

    # Backward pass and optimize
    loss.backward()
    optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

RuntimeError: For unbatched 2-D input, hx should also be 2-D but got 3-D tensor

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

# Define the RNN model
class RNNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.rnn = nn.RNN(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    def forward(self, x):
        # Initialize hidden state with zeros, reshape to 3D to match RNN input
        # The hidden state shape should be (num_layers, batch_size, hidden_dim)
        # Since num_layers is 1 by default for nn.RNN, the shape is (1, batch_size, hidden_dim)
        # With batch_first=True, the shape should be: (batch_size, num_layers, hidden_dim) -> (x.size(0), 1, self.hidden_dim)
        h0 = torch.zeros(x.size(0), 1, self.hidden_dim).to(x.device)

        # Forward pass through RNN
        out, _ = self.rnn(x, h0)  # Pass the reshaped hidden state

        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out

# Example usage
input_dim = 10  # Vocabulary size (0-9)
hidden_dim = 64  # Hidden dimension of RNN
output_dim = 10  # Output dimension (same as input_dim for prediction)

# Create the RNN model
model = RNNModel(input_dim, hidden_dim, output_dim)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    # Sample input sequence (replace with your actual training data)
    input_seq = torch.randint(0, input_dim, (64, 5))  # Batch size of 64, sequence length of 5

    # Generate target sequence (shifted by one position)
    target_seq = torch.cat([input_seq[:, 1:], input_seq[:, -1:]], dim=1)

    # Zero the gradients
    optimizer.zero_grad()

    # Forward pass
    outputs = model(input_seq)

    # Calculate loss
    loss = criterion(outputs, target_seq.view(-1))

    # Backward pass and optimize
    loss.backward()
    optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

RuntimeError: For unbatched 2-D input, hx should also be 2-D but got 3-D tensor

In [17]:
import tensorflow as tf
from tensorflow import keras
import math

# 定義 RNN 模型
class RNNModel(keras.Model):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.rnn = keras.layers.RNN(keras.layers.SimpleRNNCell(hidden_dim), return_sequences=True)  # 使用 SimpleRNNCell 和 return_sequences=True
        self.fc = keras.layers.Dense(output_dim)

    def call(self, x):
        # 初始化隱藏狀態為零
        batch_size = tf.shape(x)[0]
        h0 = tf.zeros((batch_size, self.hidden_dim))

        # 通過 RNN 的正向傳遞
        out = self.rnn(x, initial_state=h0)

        # 解碼最後一個時間步長的隱藏狀態
        out = self.fc(out[:, -1, :])
        return out

# 範例用法
input_dim = 10  # 詞彙大小 (0-9)
hidden_dim = 64  # RNN 的隱藏維度
output_dim = 10  # 輸出維度 (與預測的 input_dim 相同)

# 創建 RNN 模型
model = RNNModel(input_dim, hidden_dim, output_dim)

# 定義損失函數和優化器
criterion = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = keras.optimizers.Adam(learning_rate=0.001)

# 訓練迴圈
num_epochs = 10
batch_size = 64
sequence_length = 5

for epoch in range(num_epochs):
    # 範例輸入序列 (用您的實際訓練數據替換)
    input_seq = tf.random.uniform((batch_size, sequence_length), minval=0, maxval=input_dim, dtype=tf.int32)

    # 生成目標序列 (向右移一位)
    target_seq = tf.concat([input_seq[:, 1:], input_seq[:, -1:]], axis=1)

    with tf.GradientTape() as tape:
        # 正向傳遞
        outputs = model(tf.one_hot(input_seq, depth=input_dim))  # 使用 tf.one_hot 對輸入進行編碼

        # 計算損失
        loss = criterion(target_seq, outputs)

    # 反向傳遞和優化
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.numpy():.4f}")

ValueError: Argument `output` must have rank (ndim) `target.ndim - 1`. Received: target.shape=(64, 5), output.shape=(64, 10)