In [1]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Tokenizer

# Import Hand-Made Custom-Transformer
from transformer import Config, DecoderOnlyTransformer

In [2]:
!pip install torchsummary
from torchsummary import summary
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

config = Config()
model = DecoderOnlyTransformer(config)
model = model.to(device)

summary(model, input_size=(128,), batch_size=8, device=device.type)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
         Embedding-1              [8, 128, 640]      32,164,480
         Embedding-2              [8, 128, 640]       1,310,720
           Dropout-3              [8, 128, 640]               0
         LayerNorm-4              [8, 128, 640]           1,280
            Linear-5             [8, 128, 1920]       1,230,720
           Dropout-6           [8, 8, 128, 128]               0
            Linear-7              [8, 128, 640]         410,240
           Dropout-8              [8, 128, 640]               0
MultiHeadAttention-9              [8, 128, 640]               0
        LayerNorm-10              [8, 128, 640]           1,280
           Linear-11             [8, 128, 2560]       1,640,960
           Linear-12              [8, 128, 640]       1,639,040
          Dropout-13              [8, 128, 640]               0
      FeedForward-14              [8, 1

In [3]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, file_path, tokenizer, seq_len, vocab_size):
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.vocab_size = vocab_size
        
        with open(file_path, "r") as f:
            self.text = f.read()

        # Tokenize the text and ensure valid token IDs
        self.tokenized_text = [
            tid if tid < self.vocab_size else self.tokenizer.unk_token_id
            for tid in self.tokenizer.encode(self.text)
        ]

    def __len__(self):
        return len(self.tokenized_text) // self.seq_len

    def __getitem__(self, idx):
        # Get a chunk of the text
        start = idx * self.seq_len
        end = (idx + 1) * self.seq_len
        input_ids = self.tokenized_text[start:end]
        target_ids = self.tokenized_text[start + 1:end + 1]

        # Pad sequences to fixed length
        pad_token = self.tokenizer.pad_token_id
        input_ids = input_ids + [pad_token] * (self.seq_len - len(input_ids))
        target_ids = target_ids + [pad_token] * (self.seq_len - len(target_ids))

        # Convert to tensors and clamp to valid range
        input_ids = torch.tensor(input_ids).clamp(max=self.vocab_size-1)
        target_ids = torch.tensor(target_ids).clamp(max=self.vocab_size-1)

        return input_ids, target_ids

In [4]:
# Training loop
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch_idx, (input_ids, target_ids) in enumerate(dataloader):
        input_ids, target_ids = input_ids.to(device), target_ids.to(device)

        # Forward pass
        optimizer.zero_grad()
        logits = model(input_ids)  # Shape: [B, T, vocab_size]

        # Slice logits and targets to align predictions
        logits = logits[:, :-1, :]  # Remove last token (no target)
        target_ids = target_ids[:, :-1]  # Align with logits (targets are already shifted)

        # Compute loss using .reshape() for non-contiguous tensors
        loss = criterion(
            logits.reshape(-1, logits.size(-1)),  # Reshape to [B*(T-1), vocab_size]
            target_ids.reshape(-1)                # Reshape to [B*(T-1)]
        )

        total_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Add this line
        optimizer.step()

        if (batch_idx + 1) % 100 == 0:
            print(f"Batch {batch_idx + 1}/{len(dataloader)}: Loss = {loss.item():.4f}")

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch finished: Average Loss = {avg_loss:.4f}")

In [5]:
# Configuration
config = Config()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Ensure padding token is defined

# Dataset and DataLoader
file_path = "input.txt"  # Replace with your input text file path
seq_len = 128
dataset = TextDataset(file_path, tokenizer, seq_len, config.vocab_size)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Model and optimizer
model = DecoderOnlyTransformer(config).to(device)
optimizer = optim.AdamW(model.parameters(), lr=5e-4)

# Loss function
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). Running this sequence through the model will result in indexing errors


In [6]:
EPOCHS = 1

print("Training started...")
for epoch in range(EPOCHS):
    print("EPOCH:", epoch)
    train(model, dataloader, optimizer, criterion, device)
print("Training completed!")

Training started...
EPOCH: 0
Batch 100/330: Loss = 6.1635
Batch 200/330: Loss = 6.0040
Batch 300/330: Loss = 5.4045
Epoch finished: Average Loss = 6.0485
Training completed!


In [7]:
EPOCHS = 15

print("Training started...")
for epoch in range(EPOCHS):
    print("EPOCH:", epoch)
    train(model, dataloader, optimizer, criterion, device)
print("Training completed!")

Training started...
EPOCH: 0
Batch 100/330: Loss = 5.6672
Batch 200/330: Loss = 5.2406
Batch 300/330: Loss = 4.7562
Epoch finished: Average Loss = 5.1180
EPOCH: 1
Batch 100/330: Loss = 4.5388
Batch 200/330: Loss = 4.7845
Batch 300/330: Loss = 4.3562
Epoch finished: Average Loss = 4.7542
EPOCH: 2
Batch 100/330: Loss = 4.4194
Batch 200/330: Loss = 4.6380
Batch 300/330: Loss = 3.2539
Epoch finished: Average Loss = 4.1899
EPOCH: 3
Batch 100/330: Loss = 1.7323
Batch 200/330: Loss = 1.5065
Batch 300/330: Loss = 1.4184
Epoch finished: Average Loss = 1.7545
EPOCH: 4
Batch 100/330: Loss = 1.0313
Batch 200/330: Loss = 1.0939
Batch 300/330: Loss = 0.9371
Epoch finished: Average Loss = 1.0321
EPOCH: 5
Batch 100/330: Loss = 0.6886
Batch 200/330: Loss = 0.7724
Batch 300/330: Loss = 0.7125
Epoch finished: Average Loss = 0.7499
EPOCH: 6
Batch 100/330: Loss = 0.5846
Batch 200/330: Loss = 0.4257
Batch 300/330: Loss = 0.3266
Epoch finished: Average Loss = 0.4490
EPOCH: 7
Batch 100/330: Loss = 0.1869
Batc

In [8]:
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.json',
 'tokenizer/merges.txt',
 'tokenizer/added_tokens.json')

In [9]:
# Save model
torch.save({
    'model_state_dict': model.state_dict(),
    'config': config  # Your Config object
}, "decoder_transformer.pth")
