In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import string
import random
import time
import math
from torch.utils.data import Dataset, DataLoader

# Hyperparameters
batch_size = 32
embedding_dim = 128      # model width (d_model); nn.Transformer requires it to be divisible by num_heads
num_heads = 8
num_encoder_layers = 6
num_decoder_layers = 6
dim_feedforward = 512
dropout = 0.1
learning_rate = 0.0005
num_epochs = 10
seq_length = 30          # number of characters in each training window

# Reproducibility: weight initialisation and DataLoader shuffling both draw from
# torch's global RNG, so seed it (and the other RNG libraries this notebook imports)
# before any model or loader is created.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Load data
# The vocabulary is every character in string.printable; the corpus is a single
# sentence repeated 1000 times (a toy dataset).
all_characters = string.printable
n_characters = len(all_characters)
text = "This is an example text for next character prediction using transformers. " * 1000

# Dataset preparation
class TextDataset(Dataset):
    """Sliding-window character dataset for next-character prediction.

    The text is encoded once into a 1-D ``torch.long`` tensor of indices into
    ``string.printable``. Item ``i`` is the pair ``(data[i:i+L], data[i+1:i+L+1])``
    with ``L = seq_length``, i.e. an input window and the same window shifted
    one character to the right.

    Args:
        text: Corpus to encode. Every character must occur in ``string.printable``.
        seq_length: Number of characters in each window.

    Raises:
        KeyError: If ``text`` contains a character outside ``string.printable``.
    """

    def __init__(self, text, seq_length):
        self.text = text
        self.seq_length = seq_length
        self.all_characters = string.printable
        # Forward and reverse lookup tables between characters and vocabulary indices.
        self.char_to_idx = {char: idx for idx, char in enumerate(self.all_characters)}
        self.idx_to_char = {idx: char for idx, char in enumerate(self.all_characters)}
        self.data = self.text_to_tensor(text)

    def text_to_tensor(self, text):
        """Encode ``text`` as a 1-D ``torch.long`` tensor of vocabulary indices."""
        # Build the index list in Python and convert once; writing into a tensor
        # element by element is far slower for long texts.
        return torch.tensor([self.char_to_idx[char] for char in text], dtype=torch.long)

    def __len__(self):
        """Return the number of full (input, target) windows; 0 if the text is too short."""
        # Clamp at zero: len() raises ValueError when __len__ returns a negative number.
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` window pair starting at position ``idx``.

        Negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is out of range. This also lets plain
                ``for item in dataset`` iteration terminate.
        """
        n_windows = len(self)
        if idx < 0:
            idx += n_windows
        if not 0 <= idx < n_windows:
            raise IndexError("TextDataset index out of range")
        return (
            self.data[idx:idx + self.seq_length],
            self.data[idx + 1:idx + self.seq_length + 1]
        )

# Wrap the corpus in the sliding-window dataset, then serve it in shuffled mini-batches.
dataset = TextDataset(text=text, seq_length=seq_length)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=True,
)

# Transformer Model
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence-first tensor, then apply dropout.

    Even feature columns hold ``sin(pos * w_k)`` and odd columns hold
    ``cos(pos * w_k)`` with ``w_k = 10000 ** (-2k / embedding_dim)``.

    Args:
        embedding_dim: Size of the feature dimension (even or odd).
        dropout: Dropout probability applied after the encoding is added.
        max_len: Largest sequence length for which encodings are precomputed.
    """

    def __init__(self, embedding_dim, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))
        angles = position * div_term  # (max_len, ceil(embedding_dim / 2))

        pe = torch.zeros(max_len, embedding_dim)
        pe[:, 0::2] = torch.sin(angles)
        # For an odd embedding_dim there is one fewer cosine column than sine
        # column, so trim the angles to fit instead of raising a shape error.
        pe[:, 1::2] = torch.cos(angles[:, :embedding_dim // 2])
        # Stored as (max_len, 1, embedding_dim) so it broadcasts over the batch axis.
        # A buffer follows .to(device) and is saved in state_dict but is not trained.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(0)`` positions, after dropout.

        Args:
            x: Tensor of shape (seq_len, batch, embedding_dim); dim 0 is used as
                the position axis.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    """Character-level encoder-decoder transformer producing next-character logits.

    Pipeline: shared token embedding -> positional encoding -> ``nn.Transformer``
    -> linear projection onto the vocabulary.

    ``nn.Transformer`` is built with its default ``batch_first=False``, so every
    sequence tensor passed to ``forward`` must be sequence-first: (seq_len, batch).

    Args:
        n_characters: Vocabulary size (embedding rows and output logits).
        embedding_dim: Model width (``d_model``).
        num_heads: Number of attention heads.
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        dim_feedforward: Hidden size of each layer's feed-forward sub-network.
        dropout: Dropout probability used by the positional encoding and the transformer.
    """

    def __init__(self, n_characters, embedding_dim, num_heads, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(n_characters, embedding_dim)
        self.pos_encoder = PositionalEncoding(embedding_dim, dropout)
        self.transformer = nn.Transformer(
            d_model=embedding_dim,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout
        )
        self.fc_out = nn.Linear(embedding_dim, n_characters)

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Compute vocabulary logits for every target position.

        Args:
            src: Long tensor of source character indices, shape (src_len, batch).
            tgt: Long tensor of decoder-input character indices, shape (tgt_len, batch).
            src_mask: Additive attention mask for the encoder, (src_len, src_len), or None.
            tgt_mask: Additive attention mask for the decoder, (tgt_len, tgt_len), or None.

        Returns:
            Logits of shape (tgt_len, batch, n_characters).
        """
        # Scale by sqrt(d_model) taken from the embedding layer itself, so the
        # model does not depend on a module-level `embedding_dim` global.
        scale = math.sqrt(self.embedding.embedding_dim)
        src = self.embedding(src) * scale
        tgt = self.embedding(tgt) * scale
        src = self.pos_encoder(src)
        tgt = self.pos_encoder(tgt)
        output = self.transformer(src, tgt, src_mask, tgt_mask)
        output = self.fc_out(output)
        return output

    def generate_square_subsequent_mask(self, sz):
        """Return an (sz, sz) float causal mask.

        Entry (i, j) is 0.0 when j <= i (position i may attend to j) and -inf
        when j > i (future positions are blocked).
        """
        # triu(..., diagonal=1) keeps -inf strictly above the diagonal and zeros the rest.
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

# Define the device for computation
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Instantiate the model
# Built from the hyperparameters defined above, then moved onto the chosen device.
model = TransformerModel(
    n_characters=n_characters,
    embedding_dim=embedding_dim,
    num_heads=num_heads,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    dim_feedforward=dim_feedforward,
    dropout=dropout,
)
model = model.to(device)



In [3]:
# Optimizer and Loss Function
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Training Loop
# NOTE(review): src already contains most of the characters in tgt_output and no
# memory mask is passed to the decoder's cross-attention, so the reported loss may
# be optimistic - confirm this is the intended setup.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for src, tgt in dataloader:
        # The DataLoader yields (batch, seq) tensors, but the model is
        # sequence-first (nn.Transformer default batch_first=False). Transpose to
        # (seq, batch) so the slicing below drops a time step, not a sample.
        src = src.to(device).transpose(0, 1)
        tgt = tgt.to(device).transpose(0, 1)
        tgt_input = tgt[:-1, :]   # decoder input: every step except the last
        tgt_output = tgt[1:, :]   # labels: the same steps shifted one to the right

        optimizer.zero_grad()
        src_mask = model.generate_square_subsequent_mask(src.size(0)).to(device)
        tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(0)).to(device)
        output = model(src, tgt_input, src_mask, tgt_mask)

        # reshape (not view): the transposed label slice is non-contiguous.
        loss = criterion(output.reshape(-1, n_characters), tgt_output.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(dataloader)}')


KeyboardInterrupt: 