In [25]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from tokenizers import Tokenizer
from tokenizers.trainers import BpeTrainer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import BertPreTokenizer
import torch.nn.functional as F
import requests
import re
import matplotlib.pyplot as plt

In [26]:
def clean_gutenberg_text(raw_text):
    """Cleans Project Gutenberg headers/footers."""
    # Use non-greedy '.*?' to match the shortest possible text for the title
    start_marker = r"\*\*\* START OF (THIS|THE) PROJECT GUTENBERG EBOOK .*? \*\*\*"
    end_marker = r"\*\*\* END OF (THIS|THE) PROJECT GUTENBERG EBOOK .*? \*\*\*"

    start_match = re.search(start_marker, raw_text, re.IGNORECASE | re.DOTALL)
    end_match = re.search(end_marker, raw_text, re.IGNORECASE | re.DOTALL)

    if not start_match:
        print("  [Warning] Could not find START marker.")
        if end_match:
            return raw_text[:end_match.start()].strip()
        return raw_text.strip()

    if not end_match:
        print("  [Warning] Could not find END marker.")
        return raw_text[start_match.end():].strip()

    text_start = start_match.end()
    text_end = end_match.start()

    cleaned_text = raw_text[text_start:text_end].strip()
    return cleaned_text

BOOKS_TO_DOWNLOAD = {
    "Frankenstein": "https://www.gutenberg.org/ebooks/84.txt.utf-8",
    "The_Time_Machine": "https://www.gutenberg.org/ebooks/35.txt.utf-8",
    "The_War_of_the_Worlds": "https://www.gutenberg.org/ebooks/36.txt.utf-8",
    "20000_Leagues_Under_the_Sea": "https://www.gutenberg.org/ebooks/164.txt.utf-8",
    "A_Princess_of_Mars": "https://www.gutenberg.org/ebooks/62.txt.utf-8"
}

OUTPUT_FILENAME = "scifi_corpus.txt"
all_cleaned_texts = []

print("Starting download process...")

for title, url in BOOKS_TO_DOWNLOAD.items():
    print(f"Fetching '{title}'...", end="")
    try:
        response = requests.get(url)
        response.raise_for_status()
        response.encoding = 'utf-8'
        raw_text = response.text

        cleaned_text = clean_gutenberg_text(raw_text)
        all_cleaned_texts.append(cleaned_text)
        print(" Done.")

    except requests.exceptions.RequestException as e:
        print(f"\n  [ERROR] Failed to download {title}: {e}")

if all_cleaned_texts:
    print(f"\nCombining {len(all_cleaned_texts)} books into '{OUTPUT_FILENAME}'...")

    with open(OUTPUT_FILENAME, 'w', encoding='utf-8') as f:
        full_corpus = "\n\n\n".join(all_cleaned_texts)
        f.write(full_corpus)

    print(f"Successfully created '{OUTPUT_FILENAME}' with a total of {len(full_corpus)} characters.")
else:
    print("No books were downloaded. Output file was not created.")

Starting download process...
Fetching 'Frankenstein'... Done.
Fetching 'The_Time_Machine'... Done.
Fetching 'The_War_of_the_Worlds'... Done.
Fetching '20000_Leagues_Under_the_Sea'... Done.
Fetching 'A_Princess_of_Mars'... Done.

Combining 5 books into 'scifi_corpus.txt'...
Successfully created 'scifi_corpus.txt' with a total of 1941792 characters.


In [27]:
class MultiHeadAttention(nn.Module):
  def __init__(self, d=256, p=32, num_heads=2, dropout=0.1):
    super().__init__()
    self.num_heads = num_heads
    self.p = p
    self.d = d
    self.w_k = nn.ModuleList([nn.Linear(d, p, bias = False) for _ in range(num_heads)])
    self.w_q = nn.ModuleList([nn.Linear(d, p, bias = False) for _ in range(num_heads)])
    self.w_v = nn.ModuleList([nn.Linear(d, p, bias = False) for _ in range(num_heads)])

    self.w_o = nn.Linear(num_heads * p, d, bias=False)
    self.dropout = nn.Dropout(dropout)
    self.scale = self.p ** -0.5

  def forward(self, h):
    _, t, d = h.shape
    head_outputs = []

    mask = torch.triu(torch.ones(t, t), diagonal=1).bool().to(h.device)

    for m in range(self.num_heads):
        k = self.w_k[m](h)
        q = self.w_q[m](h)
        v = self.w_v[m](h)

        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        scores = scores.masked_fill(mask, float('-inf'))
        attn_weights = F.softmax(scores, dim=-1)
        attn_weights = self.dropout(attn_weights)

        head_out = torch.matmul(attn_weights, v)
        head_outputs.append(head_out)

    h_prime = torch.cat(head_outputs, dim=-1)
    output = self.w_o(h_prime)
    return output


In [28]:
class PositionWiseFFN(nn.Module):
  def __init__(self, d=256):
    super().__init__()
    self.fc1 = nn.Linear(d, 2*d, bias=False)
    self.fc2 = nn.Linear(2*d, d, bias=False)
    self.relu = nn.ReLU()

  def forward(self, h):
    return self.fc2(self.relu(self.fc1(h)))

In [29]:
class TransformerBlock(nn.Module):
  def __init__(self, d=256, p=32, num_heads=2, dropout=0.1):
    super().__init__()
    self.attention = MultiHeadAttention(d, p, num_heads, dropout)
    self.norm1 = nn.LayerNorm(d)
    self.mlp = PositionWiseFFN(d)
    self.norm2 = nn.LayerNorm(d)

  def forward(self, h):
    h = self.norm1(h + self.attention(h))
    h = self.norm2(h + self.mlp(h))

    return h

In [30]:
class PositionalEncoding(nn.Module):
    def __init__(self, d=256, max_len=128):
        super().__init__()

        # Create positional encoding matrix
        pe = torch.zeros(max_len, d)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d, 2).float() * (-np.log(10000.0) / d))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        self.register_buffer('pe', pe)

    def forward(self, x):
        T = x.size(1)
        return x + self.pe[:T, :].unsqueeze(0)

In [31]:
class TransformerLM(nn.Module):
    def __init__(self, vocab_size, d=256, p=32, num_heads=2, num_blocks=3,
                 max_len=128, dropout=0.1):
        super().__init__()

        self.d = d
        self.max_len = max_len

        self.token_embedding = nn.Embedding(vocab_size, d)
        self.pos_encoding = PositionalEncoding(d, max_len)
        self.blocks = nn.ModuleList([
            TransformerBlock(d, p, num_heads, dropout)
            for _ in range(num_blocks)
        ])

        self.output = nn.Linear(d, vocab_size, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        h = self.token_embedding(x)
        h = self.pos_encoding(h)
        h = self.dropout(h)

        for block in self.blocks:
            h = block(h)

        logits = self.output(h)
        return logits

In [32]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, tokens, context_length=128):
        self.tokens = tokens
        self.context_length = context_length

        self.sequences = []
        for i in range(0, len(tokens) - context_length, context_length // 2):
            seq = tokens[i:i + context_length + 1]
            if len(seq) == context_length + 1:
                self.sequences.append(seq)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]
        x = torch.tensor(seq[:-1], dtype=torch.long)
        y = torch.tensor(seq[1:], dtype=torch.long)
        return x, y

In [33]:
def load_and_tokenize_data(data_path, vocab_size=2048):
    with open('scifi_corpus.txt', 'r', encoding='utf-8') as f:
        text = f.read()

    tokenizer = Tokenizer(BPE(unk_token='[UNK]'))
    tokenizer.pre_tokenizer = BertPreTokenizer()

    trainer = BpeTrainer(special_tokens=['[UNK]', '[CLS]', '[PAD]', '[SEP]', '[MASK]'])

    tokenizer.train_from_iterator([text], trainer)

    tokens = tokenizer.encode(text).ids

    print(f"Corpus length: {len(text)} characters")
    print(f"Number of tokens: {len(tokens)}")
    print(f"Vocabulary size: {tokenizer.get_vocab_size()}")

    return tokens, tokenizer

In [34]:
def train_model(model, train_loader, val_loader, device, epochs=10, lr=1e-4):
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    train_losses = []
    val_losses = []
    iterations = []
    iteration = 0

    for epoch in range(epochs):
      model.train()
      epoch_loss = 0

      for batch_idx, (x, y) in enumerate(train_loader):
        x, y, = x.to(device), y.to(device)
        optimizer.zero_grad()
        logits = model(x)

        B, T, V = logits.shape
        logits = logits.view(B * T, V)
        y = y.view(B * T)

        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        train_losses.append(loss.item())
        iterations.append(iteration)
        iteration += 1

        if batch_idx % 100 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Batch {batch_idx}/{len(train_loader)}, "
                  f"Loss: {loss.item():.4f}")

        # Validation every 1000 iterations
        if iteration % 1000 == 0:
            model.eval()
            val_loss = 0
            with torch.no_grad():
                for x_val, y_val in val_loader:
                    x_val, y_val = x_val.to(device), y_val.to(device)
                    logits_val = model(x_val)
                    B, T, V = logits_val.shape
                    logits_val = logits_val.view(B * T, V)
                    y_val = y_val.view(B * T)
                    val_loss += criterion(logits_val, y_val).item()

            val_loss /= len(val_loader)
            val_losses.append(val_loss)
            print(f"Validation Loss at iteration {iteration}: {val_loss:.4f}")
            model.train()

      avg_epoch_loss = epoch_loss / len(train_loader)
      print(f"Epoch {epoch+1} completed. Average Loss: {avg_epoch_loss:.4f}")

    return train_losses, val_losses, iterations

In [35]:
def plot_losses(train_losses, val_losses, iterations, save_path='training_loss.pdf'):
    plt.figure(figsize=(12, 6))

    window = 50
    if len(train_losses) > window:
        train_smooth = np.convolve(train_losses, np.ones(window)/window, mode='valid')
        iter_smooth = iterations[window-1:]
    else:
        train_smooth = train_losses
        iter_smooth = iterations

    plt.plot(iter_smooth, train_smooth, label='Training Loss (smoothed)', alpha=0.8)

    val_iterations = [i for i in iterations if i % 1000 == 0][:len(val_losses)]
    if val_losses:
        plt.scatter(val_iterations, val_losses, color='red', label='Validation Loss',
                   s=50, zorder=5)

    plt.xlabel('Iterations')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()
    print(f"Plot saved to {save_path}")

In [38]:
VOCAB_SIZE = 2048
CONTEXT_LENGTH = 128
D_MODEL = 256
P = 32
NUM_HEADS = 2
NUM_BLOCKS = 5
BATCH_SIZE = 32
EPOCHS = 10000
LR = 3e-4
DROPOUT = 0.1

In [None]:
# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load and tokenize data
print("Loading and tokenizing data...")
tokens, tokenizer = load_and_tokenize_data('scifi_corpus.txt', vocab_size=VOCAB_SIZE)

# Split into train and validation
split_idx = int(0.9 * len(tokens))
train_tokens = tokens[:split_idx]
val_tokens = tokens[split_idx:]

print(f"Train tokens: {len(train_tokens)}")
print(f"Validation tokens: {len(val_tokens)}")

# Create datasets
train_dataset = TextDataset(train_tokens, CONTEXT_LENGTH)
val_dataset = TextDataset(val_tokens, CONTEXT_LENGTH)

print(f"Train sequences: {len(train_dataset)}")
print(f"Validation sequences: {len(val_dataset)}")

# Create dataloaders
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True
)
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=BATCH_SIZE, shuffle=False
)

# Create model
print("\nCreating model...")
model = TransformerLM(
    vocab_size=tokenizer.get_vocab_size(),
    d=D_MODEL,
    p=P,
    num_heads=NUM_HEADS,
    num_blocks=NUM_BLOCKS,
    max_len=CONTEXT_LENGTH,
    dropout=DROPOUT
)

total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params:,}")

# Train model
print("\nTraining model...")
train_losses, val_losses, iterations = train_model(
    model, train_loader, val_loader, device, epochs=EPOCHS, lr=LR
)

# Plot losses
plot_losses(train_losses, val_losses, iterations)

# Save model and tokenizer
torch.save(model.state_dict(), 'transformer_model.pt')
tokenizer.save('tokenizer.json')
print("\nModel and tokenizer saved!")

Using device: cuda
Loading and tokenizing data...
Corpus length: 1905366 characters
Number of tokens: 399830
Vocabulary size: 25902
Train tokens: 359847
Validation tokens: 39983
Train sequences: 5621
Validation sequences: 623

Creating model...
Total parameters: 14,905,344

Training model...
Epoch 1/10000, Batch 0/176, Loss: 10.3389
Epoch 1/10000, Batch 100/176, Loss: 6.3054
Epoch 1 completed. Average Loss: 6.6589
Epoch 2/10000, Batch 0/176, Loss: 5.9248
Epoch 2/10000, Batch 100/176, Loss: 5.5534
Epoch 2 completed. Average Loss: 5.6396
Epoch 3/10000, Batch 0/176, Loss: 5.4801
Epoch 3/10000, Batch 100/176, Loss: 5.3754
Epoch 3 completed. Average Loss: 5.3319
Epoch 4/10000, Batch 0/176, Loss: 5.1781
Epoch 4/10000, Batch 100/176, Loss: 5.1402
Epoch 4 completed. Average Loss: 5.1121
Epoch 5/10000, Batch 0/176, Loss: 4.9765
Epoch 5/10000, Batch 100/176, Loss: 4.9201
Epoch 5 completed. Average Loss: 4.9324
Epoch 6/10000, Batch 0/176, Loss: 4.8449
Epoch 6/10000, Batch 100/176, Loss: 4.7098
Va