In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from torch.utils.tensorboard import SummaryWriter
import random
import matplotlib.pyplot as plt
import time
from PyPDF2 import PdfReader
import re
import os
import math
from typing import List


In [2]:
# dataset loader
class CharDataset(Dataset):
    """
    Serves each word as an (input, target) pair of fixed-length integer tensors.

    Index 0 is reserved for the special '.' token: it is the <START> marker at
    the front of the input and the <STOP> marker right after the last character
    of the target. The characters in `chars` are numbered from 1 in the order given.
    """

    def __init__(self, words, chars, max_word_length):
        self.words = words
        self.chars = chars
        self.max_word_length = max_word_length
        # character -> integer lookup, slot 0 is kept for the special token
        char_to_index = {ch: pos for pos, ch in enumerate(chars, start=1)}
        char_to_index['.'] = 0
        self.stoi = char_to_index
        # integer -> character lookup (inverse mapping), 0 always renders as '.'
        index_to_char = {pos: ch for ch, pos in char_to_index.items()}
        index_to_char[0] = '.'
        self.itos = index_to_char

    def __len__(self):
        """Number of words held by the dataset."""
        return len(self.words)

    def contains(self, word):
        """True when `word` is one of the stored words."""
        return word in self.words

    def get_vocab_size(self):
        """Every known character plus the special 0 token."""
        return 1 + len(self.chars)

    def get_output_length(self):
        """Length of the tensors returned by __getitem__: <START> token followed by the word."""
        return 1 + self.max_word_length

    def encode(self, word):
        """Turn a string into a 1-D LongTensor of character indices."""
        indices = [self.stoi[ch] for ch in word]
        return torch.tensor(indices, dtype=torch.long)

    def decode(self, ix):
        """Turn an iterable of integer indices back into a string."""
        pieces = [self.itos[i] for i in ix]
        return ''.join(pieces)

    def __getitem__(self, idx):
        encoded = self.encode(self.words[idx])
        n_chars = len(encoded)
        seq_len = self.max_word_length + 1
        x = torch.zeros(seq_len, dtype=torch.long)
        y = torch.zeros(seq_len, dtype=torch.long)
        # input: <START> (0) followed by the characters, zero padded on the right
        x[1:1 + n_chars] = encoded
        # target: the characters, then the <STOP> 0 left in place at position n_chars
        y[:n_chars] = encoded
        # everything after <STOP> is set to -1 so the loss ignores those positions
        y[n_chars + 1:] = -1
        return x, y

def create_datasets(input_file):
    """
    Read a PDF, split its text into lower-case words and wrap them in datasets.

    Every non-word character and every underscore is replaced by a space before
    splitting. The words are shuffled with torch.randperm and the last 10%
    (capped at 1000 words) become the test set.

    Args:
        input_file: path (or file object) handed to PyPDF2's PdfReader.

    Returns:
        (train_dataset, test_dataset), two CharDataset objects that share the
        same character vocabulary and maximum word length.

    Raises:
        ValueError: if no word can be extracted from the file.
    """

    # preprocessing of the input file: gather the text of every page in one join;
    # `or ''` guards against a page for which no text comes back
    reader = PdfReader(input_file)
    text = ''.join((page.extract_text() or '') for page in reader.pages)
    clean_text = re.sub(r'[^\w]', ' ', text).lower()
    clean_text = re.sub(r'_', ' ', clean_text)
    words = clean_text.split()
    if not words:
        raise ValueError(f"no words could be extracted from {input_file!r}")
    # all the possible characters; note that this is taken from the cleaned text,
    # so the separating space is part of the vocabulary as well
    chars = sorted(set(clean_text))
    number_unique_char = len(chars)
    max_word_length = max(len(w) for w in words)
    print(f"number of examples in the dataset: {len(words)}")
    print(f"max word length: {max_word_length}")
    print(f"number of unique characters in the vocabulary: {number_unique_char}")
    print("vocabulary:")
    print(''.join(chars))

    # partition the input data into a training and the test set
    test_set_size = min(1000, int(len(words) * 0.1)) # 10% of the training set, or up to 1000 examples
    rp = torch.randperm(len(words)).tolist()
    # slice from the front: a negative slice with test_set_size == 0 would hand
    # every word to the test set and leave the training set empty
    n_train = len(words) - test_set_size
    train_words = [words[i] for i in rp[:n_train]]
    test_words = [words[i] for i in rp[n_train:]]
    print(f"split up the dataset into {len(train_words)} training examples and {len(test_words)} test examples")

    # wrap in dataset objects
    train_dataset = CharDataset(train_words, chars, max_word_length)
    test_dataset = CharDataset(test_words, chars, max_word_length)

    return train_dataset, test_dataset

class InfiniteDataLoader:
    """
    Endless source of batches built on top of a regular DataLoader.

    A RandomSampler drawing with replacement is asked for 1e10 samples, which
    makes the underlying loader effectively never run out. Extra keyword
    arguments are forwarded to the DataLoader unchanged.
    """

    def __init__(self, dataset, **kwargs):
        endless_sampler = torch.utils.data.RandomSampler(
            dataset,
            replacement=True,
            num_samples=int(1e10),
        )
        self.train_loader = DataLoader(dataset, sampler=endless_sampler, **kwargs)
        self.data_iter = iter(self.train_loader)

    def next(self):
        """Return the next batch, restarting the iterator if it is ever exhausted."""
        try:
            return next(self.data_iter)
        except StopIteration:
            # only reachable once all 1e10 samples have been consumed
            self.data_iter = iter(self.train_loader)
            return next(self.data_iter)

In [3]:
# Build the train/test datasets from 'ISDA.pdf' (a relative path, so it is resolved
# against the current working directory). Both names are read by later cells.
train_dataset, test_dataset = create_datasets('ISDA.pdf')

number of examples in the dataset: 6922
max word length: 17
number of unique characters in the vocabulary: 37
vocabulary:
 0123456789abcdefghijklmnopqrstuvwxyz
split up the dataset into 6230 training examples and 692 test examples


In [4]:
@torch.no_grad()
def generate(model, idx, max_new_tokens, temperature=1.0, do_sample=False, top_k=None, block_size=None):
    """
    Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
    the sequence max_new_tokens times, feeding the predictions back into the model each time.
    Most likely you'll want to make sure to be in model.eval() mode of operation for this.

    Args:
        model: callable returning (logits, loss) for a (b,t) LongTensor.
        idx: starting indices, LongTensor of shape (b,t).
        max_new_tokens: number of tokens appended to every row.
        temperature: divisor applied to the final-step logits.
        do_sample: sample from the distribution if True, otherwise take the argmax.
        top_k: if given, only the k most likely tokens stay eligible.
        block_size: longest context fed to the model. When None it is read from
            model.get_block_size() or model.block_size; 8 is used only if the
            model exposes neither.

    Returns:
        LongTensor of shape (b, t + max_new_tokens).
    """
    if block_size is None:
        # use the context length of the model itself: cropping to a shorter window
        # than the model supports drops the leading <START> token from its view
        if hasattr(model, 'get_block_size'):
            block_size = model.get_block_size()
        else:
            block_size = getattr(model, 'block_size', 8)
    for _ in range(max_new_tokens):
        # if the sequence context is growing too long we must crop it at block_size
        idx_cond = idx if idx.size(1) <= block_size else idx[:, -block_size:]
        # forward the model to get the logits for the index in the sequence
        logits, _ = model(idx_cond)
        # pluck the logits at the final step and scale by desired temperature
        logits = logits[:, -1, :] / temperature
        # optionally crop the logits to only the top k options
        if top_k is not None:
            # torch.topk raises when k exceeds the vocabulary size, so clamp it
            k = min(top_k, logits.size(-1))
            v, _ = torch.topk(logits, k)
            logits[logits < v[:, [-1]]] = -float('Inf')
        # apply softmax to convert logits to (normalized) probabilities
        probs = F.softmax(logits, dim=-1)
        # either sample from the distribution or take the most likely element
        if do_sample:
            idx_next = torch.multinomial(probs, num_samples=1)
        else:
            _, idx_next = torch.topk(probs, k=1, dim=-1)
        # append sampled index to the running sequence and continue
        idx = torch.cat((idx, idx_next), dim=1)

    return idx

def print_samples(model, dataset, device, num=10):
    """
    samples from the model and pretty prints the decoded samples

    Args:
        model: model handed to generate().
        dataset: dataset whose output length sets the number of sampled steps
            and whose decode() turns the sampled indices back into text.
        device: device on which the initial <START> tokens are created.
        num: number of words to sample.

    Note: the "in train" / "in test" classification still looks the words up in
    the notebook-level train_dataset and test_dataset, so both must exist.
    """
    X_init = torch.zeros(num, 1, dtype=torch.long, device=device)
    top_k = 10
    steps = dataset.get_output_length() - 1 # -1 because we already start with <START> token (index 0)
    X_samp = generate(model, X_init, steps, top_k=top_k, do_sample=True).to(device)
    train_samples, test_samples, new_samples = [], [], []
    for i in range(X_samp.size(0)):
        # get the i'th row of sampled integers, as python list
        row = X_samp[i, 1:].tolist() # note: we need to crop out the first <START> token
        # token 0 is the <STOP> token, so we crop the output sequence at that point
        crop_index = row.index(0) if 0 in row else len(row)
        row = row[:crop_index]
        # decode with the dataset that was passed in, not with a global, so the
        # index -> character mapping always matches the requested dataset
        word_samp = dataset.decode(row)
        # separately track samples that we have and have not seen before
        if train_dataset.contains(word_samp):
            train_samples.append(word_samp)
        elif test_dataset.contains(word_samp):
            test_samples.append(word_samp)
        else:
            new_samples.append(word_samp)
    print('-'*80)
    for lst, desc in [(train_samples, 'in train'), (test_samples, 'in test'), (new_samples, 'new')]:
        print(f"{len(lst)} samples that are {desc}:")
        for word in lst:
            print(word)
    print('-'*80)

@torch.inference_mode()
def evaluate(model, dataset, device, batch_size=50, max_batches=None):
    """
    Mean loss of `model` over shuffled batches of `dataset`.

    Args:
        model: nn.Module returning (logits, loss) when called with (X, Y).
        dataset: dataset yielding (x, y) pairs.
        device: device the batches are moved to.
        batch_size: batch size of the evaluation loader.
        max_batches: evaluate at most this many batches; None means the whole dataset.

    Returns:
        The mean of the per-batch losses as a float, or NaN if no batch was evaluated.
    """
    was_training = model.training
    model.eval()
    loader = DataLoader(dataset, shuffle=True, batch_size=batch_size, num_workers=0)
    losses = []
    try:
        for i, batch in enumerate(loader):
            # check the budget before doing the work, so exactly max_batches batches run
            if max_batches is not None and i >= max_batches:
                break
            X, Y = [t.to(device) for t in batch]
            _, loss = model(X, Y)
            losses.append(loss.item())
    finally:
        # put the model back into whatever mode it was in, even if a batch failed
        model.train(was_training)
    if not losses:
        return float('nan')
    return torch.tensor(losses).mean().item()


In [5]:
from dataclasses import dataclass
@dataclass
class ModelConfig:
    """Hyper-parameters shared by the model classes defined in this notebook."""
    block_size: int = None # length of the input sequences of integers
    vocab_size: int = None # the input integers are in range [0 .. vocab_size -1]
    # parameters below control the sizes of each model slightly differently
    # number of Block modules stacked by Transformer
    n_layer: int = 4
    # width of the token and position embeddings
    n_embd: int = 64
    # not read by any of the models visible in this notebook
    n_embd2: int = 64
    # number of attention heads per attention layer
    n_head: int = 4
    # per-head width read by Transformer_simple; Block derives its own as n_embd // n_head
    head_size: int = 16

In [6]:
# init model
# NOTE(review): get_vocab_size() already counts the special 0 token, so the
# vocab_size+1 below reserves one extra embedding row that looks unused - confirm.
vocab_size = train_dataset.get_vocab_size()
# NOTE(review): block_size is computed here, but the config below passes a fixed 128.
block_size = train_dataset.get_output_length()
config = ModelConfig(vocab_size=vocab_size+1, block_size=128,
                       n_layer=4, n_head=4,
                       n_embd=64, head_size=16, n_embd2=64)

class Head(nn.Module):
    """ a single causal self-attention head """

    def __init__(self, n_embd, head_size, block_size):
        super().__init__()
        # bias-free projections of the input into key / query / value space
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular matrix of ones: position i may look at positions <= i
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer("tril", causal_mask)
        self.head_size = head_size

    def forward(self, x):
        _, seq_len, _ = x.shape
        queries = self.query(x)  # (B,T,head_size)
        keys = self.key(x)       # (B,T,head_size)
        values = self.value(x)   # (B,T,head_size)
        # scaled dot-product affinities between every pair of positions -> (B,T,T)
        scale = 1.0 / math.sqrt(self.head_size)
        scores = (queries @ keys.transpose(-2, -1)) * scale
        # hide the future: masked entries become -inf and vanish in the softmax
        blocked = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(blocked, float('-inf'))
        weights = F.softmax(scores, dim=-1)  # (B,T,T)
        # weighted sum of the values -> (B,T,head_size)
        return weights @ values

class MultiHeadAttention(nn.Module):
    """ runs several self-attention heads side by side and concatenates their outputs """

    def __init__(self, num_heads, head_size, n_embd, block_size):
        super().__init__()
        head_list = []
        for _ in range(num_heads):
            head_list.append(Head(n_embd, head_size, block_size))
        self.heads = nn.ModuleList(head_list)

    def forward(self, x):
        per_head = []
        for head in self.heads:
            per_head.append(head(x))
        # join along the feature axis -> (B, T, num_heads * head_size)
        return torch.cat(per_head, dim=-1)
class FeedFoward(nn.Module):
    """ a single linear layer (n_embd -> n_embd) followed by a ReLU """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, n_embd),
            nn.ReLU()
        )

    def forward(self, x):
        # nn.Module dispatches calls to `forward`; under the old misspelled name
        # the module could not be called at all
        return self.net(x)

    # keep the original (misspelled) method name available for existing callers
    foward = forward

class Transformer_simple(nn.Module):
    """
    Minimal language model: token + position embeddings, one multi-head
    self-attention layer, one feed-forward layer and a linear output head.
    """

    def __init__(self, config):
        super().__init__()
        # remember the context length so generate() does not depend on a global config
        self.block_size = config.block_size
        self.token_embedding = nn.Embedding(config.vocab_size, config.n_embd)
        self.position_embedding = nn.Embedding(config.block_size, config.n_embd)
        self.sa_heads = MultiHeadAttention(config.n_head, config.head_size, config.n_embd, config.block_size)
        self.ffwd = FeedFoward(config.n_embd)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)

    def get_block_size(self):
        """Longest sequence the position embedding can handle."""
        return self.block_size

    def forward(self, idx, targets=None):
        """
        Args:
            idx: LongTensor of token indices, shape (B,T).
            targets: optional LongTensor of shape (B,T); entries equal to -1 are ignored.

        Returns:
            (logits, loss): logits of shape (B,T,vocab_size); loss is None when
            no targets are given.
        """
        B, T = idx.shape
        device = idx.device
        tok_emb = self.token_embedding(idx) # (B,T,n_embd)
        pos_emb = self.position_embedding(torch.arange(T, device=device)) # (T,n_embd)
        x = tok_emb + pos_emb # (B,T,n_embd)
        x = self.sa_heads(x) # (B,T,n_head*head_size)
        x = self.ffwd(x)
        logits = self.lm_head(x) # (B,T,vocab_size)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Append max_new_tokens sampled tokens to idx (B,T) and return the result."""
        for _ in range(max_new_tokens):
            # crop to the context length stored on this instance
            idx_cond = idx[:, -self.block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :] # (B,C)
            probs = F.softmax(logits, dim=-1) # (B,C)
            idx_next = torch.multinomial(probs, num_samples=1) # (B,1)
            idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)
        return idx

In [8]:
# init model
# NOTE(review): this cell rebinds vocab_size, block_size and config from the earlier
# "init model" cell; models built afterwards use the values below.
# NOTE(review): get_vocab_size() already counts the special 0 token, so the
# vocab_size+1 below reserves one extra embedding row that looks unused - confirm.
vocab_size = train_dataset.get_vocab_size()
block_size = train_dataset.get_output_length()
# head_size=16 is passed here, but Block computes its own head size as n_embd // n_head
config = ModelConfig(vocab_size=vocab_size+1, block_size=block_size,
                       n_layer=6, n_head=6,
                       n_embd=384, head_size=16, n_embd2=64)
class Head(nn.Module):
    """ a single causal self-attention head """

    def __init__(self, n_embd, head_size, block_size):
        super().__init__()
        # bias-free projections of the input into key / query / value space
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular matrix of ones: position i may look at positions <= i
        lower_triangle = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer("tril", lower_triangle)
        self.head_size = head_size

    def forward(self, x):
        _, n_tokens, _ = x.shape
        q = self.query(x)  # (B,T,head_size)
        k = self.key(x)    # (B,T,head_size)
        v = self.value(x)  # (B,T,head_size)
        # scaled dot-product affinities between every pair of positions -> (B,T,T)
        inv_sqrt_dim = 1.0 / math.sqrt(self.head_size)
        affinity = (q @ k.transpose(-2, -1)) * inv_sqrt_dim
        # hide the future: masked entries become -inf and vanish in the softmax
        is_future = self.tril[:n_tokens, :n_tokens] == 0
        affinity = affinity.masked_fill(is_future, float('-inf'))
        attention = F.softmax(affinity, dim=-1)  # (B,T,T)
        # weighted sum of the values -> (B,T,head_size)
        return attention @ v
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel, followed by an output projection """

    def __init__(self, num_heads, head_size, n_embd, block_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(n_embd, head_size, block_size) for _ in range(num_heads)])
        # the concatenated heads are num_heads * head_size wide; that only equals
        # n_embd when n_embd is divisible by num_heads, so size the input explicitly
        self.proj = nn.Linear(num_heads * head_size, n_embd)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)  # (B, T, num_heads * head_size)
        out = self.proj(out)  # (B, T, n_embd)
        return out
    
class FeedFoward(nn.Module):
    """ position-wise MLP: expand to 4x the embedding width, ReLU, project back """

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: layer-normed self-attention, then a layer-normed MLP, each added residually """

    def __init__(self, config):
        super().__init__()
        n_embd = config.n_embd
        num_heads = config.n_head
        block_size = config.block_size
        # each head gets an equal share of the embedding width
        head_size = n_embd // num_heads
        self.sa = MultiHeadAttention(num_heads, head_size, n_embd, block_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        # normalise before the MLP as well; ln2 was created but never applied,
        # leaving the feed-forward branch to run on the raw residual stream
        x = x + self.ffwd(self.ln2(x))
        return x

class Transformer(nn.Module):
    """
    Stack of config.n_layer Blocks between token/position embeddings and a
    bias-free linear head that produces the logits over the vocabulary.
    """

    def __init__(self, config):
        super().__init__()
        self.block_size = config.block_size

        # sub-modules are created in the same order in which they are registered
        token_table = nn.Embedding(config.vocab_size, config.n_embd)
        position_table = nn.Embedding(config.block_size, config.n_embd)
        blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
        final_norm = nn.LayerNorm(config.n_embd)
        self.transformer = nn.ModuleDict(dict(
            wte=token_table,
            wpe=position_table,
            h=blocks,
            ln_f=final_norm,
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # report the size of the transformer body (the lm_head is not counted)
        n_params = 0
        for param in self.transformer.parameters():
            n_params += param.numel()
        print("number of parameters: %.2fM" % (n_params/1e6,))

    def get_block_size(self):
        """Longest sequence length the model accepts."""
        return self.block_size

    def forward(self, idx, targets=None):
        """
        Run the model on idx (b, t) and return (logits, loss).

        loss is the cross entropy against targets, ignoring entries equal to -1,
        or None when no targets are supplied.
        """
        b, t = idx.size()
        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=idx.device).unsqueeze(0) # shape (1, t)

        # token embeddings (b, t, n_embd) plus position embeddings (1, t, n_embd)
        x = self.transformer.wte(idx) + self.transformer.wpe(pos)
        for layer in self.transformer.h:
            x = layer(x)
        logits = self.lm_head(self.transformer.ln_f(x))

        # without targets there is nothing to score
        if targets is None:
            return logits, None

        flat_logits = logits.view(-1, logits.size(-1))
        loss = F.cross_entropy(flat_logits, targets.view(-1), ignore_index=-1)
        return logits, loss


In [10]:
# Determine device (MPS for Apple Silicon, CUDA for NVIDIA, CPU as fallback)
if torch.backends.mps.is_available():
    device = torch.device('mps')
    print("Using MPS (Metal Performance Shaders)")
elif torch.cuda.is_available():
    device = torch.device('cuda')
    print("Using CUDA")
else:
    device = torch.device('cpu')
    print("Using CPU")

# build the model from the notebook-level config and move it to the chosen device
transformer = Transformer(config)
transformer = transformer.to(device)
# TensorBoard event files and the best checkpoint both go to ./training_dir
writer = SummaryWriter(log_dir='./training_dir')
if __name__ == '__main__':
    # init optimizer
    optimizer = torch.optim.AdamW(transformer.parameters(), lr=5e-4, weight_decay=0.01, betas=(0.9, 0.99), eps=1e-8)

    # init dataloader
    batch_loader = InfiniteDataLoader(train_dataset, batch_size=64, pin_memory=False, num_workers=0)

    # training loop: runs until it is interrupted (there is no step limit)
    best_loss = None
    step = 0
    try:
        while True:

            t0 = time.time()

            # get the next batch, ship to device, and unpack it to input and target
            batch = batch_loader.next()
            batch = [t.to(device) for t in batch]
            X, Y = batch

            # feed into the model
            logits, loss = transformer(X, Y)

            # calculate the gradient, update the weights
            transformer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()

            t1 = time.time()

            # logging
            if step % 10 == 0:
                print(f"step {step} | loss {loss.item():.4f} | step time {(t1-t0)*1000:.2f}ms")
            # evaluate the model
            if step > 0 and step % 500 == 0:
                train_loss = evaluate(transformer, train_dataset, device, batch_size=100, max_batches=10)
                test_loss  = evaluate(transformer, test_dataset, device, batch_size=100, max_batches=10)
                writer.add_scalar("Loss/train", train_loss, step)
                writer.add_scalar("Loss/test", test_loss, step)
                writer.flush()
                print(f"step {step} train loss: {train_loss} test loss: {test_loss}")
                # save the model to disk if it has improved
                if best_loss is None or test_loss < best_loss:
                    out_path = os.path.join('./training_dir', "transformer.pt")
                    print(f"test loss {test_loss} is the best so far, saving model to {out_path}")
                    torch.save(transformer.state_dict(), out_path)
                    best_loss = test_loss
            # sample from the model
            if step > 0 and step % 200 == 0:
                print_samples(transformer, train_dataset, device=device, num=10)
            step += 1
    finally:
        # the loop only ends through an interrupt or an error; close the event
        # file either way (the exception itself still propagates)
        writer.close()


Using MPS (Metal Performance Shaders)
number of parameters: 10.66M
step 0 | loss 3.8030 | step time 123.10ms
step 10 | loss 2.9027 | step time 119.19ms
step 20 | loss 2.4808 | step time 118.70ms
step 30 | loss 2.6420 | step time 118.17ms
step 40 | loss 2.6384 | step time 117.74ms
step 50 | loss 2.2506 | step time 129.20ms
step 60 | loss 2.5379 | step time 122.11ms
step 70 | loss 2.5961 | step time 129.16ms
step 80 | loss 2.7412 | step time 119.41ms
step 90 | loss 2.1155 | step time 119.80ms
step 100 | loss 1.8900 | step time 121.86ms
step 110 | loss 2.5255 | step time 118.10ms
step 120 | loss 2.0532 | step time 121.66ms
step 130 | loss 2.1888 | step time 121.65ms
step 140 | loss 2.9278 | step time 119.66ms
step 150 | loss 1.6883 | step time 122.70ms
step 160 | loss 2.8465 | step time 120.21ms
step 170 | loss 2.1643 | step time 118.95ms
step 180 | loss 2.1699 | step time 117.80ms
step 190 | loss 2.1306 | step time 116.95ms
step 200 | loss 2.0626 | step time 117.39ms
--------------------

KeyboardInterrupt: 

In [13]:
@torch.no_grad()
def generate_infinite(model, dataset, device='cpu', max_new_tokens_per_word=100):
    """
    Sample words from the model forever, printing each one as soon as it ends.

    A word starts from the START token (index 0) and ends when the model emits
    index 0 again or after max_new_tokens_per_word tokens. Empty words are
    skipped. The loop runs until a KeyboardInterrupt, which prints a summary.

    Args:
        model: Trained transformer model (moved to `device` and put in eval mode)
        dataset: CharDataset whose `itos` mapping decodes the sampled indices
        device: Device to run on ('cpu', 'cuda', 'mps', or torch.device)
        max_new_tokens_per_word: Maximum tokens to generate per word
    """
    # A device given by name falls back to the CPU when its backend is missing
    if isinstance(device, str):
        mps_missing = device == 'mps' and not torch.backends.mps.is_available()
        cuda_missing = device == 'cuda' and not torch.cuda.is_available()
        if mps_missing:
            print("MPS not available, falling back to CPU")
            device = 'cpu'
        elif cuda_missing:
            print("CUDA not available, falling back to CPU")
            device = 'cpu'
        device = torch.device(device)

    model = model.to(device)
    model.eval()
    block_size = model.block_size
    word_count = 0

    print(f"Generating on device: {device}")

    try:
        while True:
            # every word starts from a lone START token (index 0)
            idx = torch.zeros((1, 1), dtype=torch.long, device=device)
            letters = []

            for _step in range(max_new_tokens_per_word):
                # keep only the most recent block_size tokens as context
                context = idx[:, -block_size:] if idx.size(1) > block_size else idx

                # distribution over the next token, read from the last position
                logits, _ = model(context)
                probs = F.softmax(logits[:, -1, :], dim=-1)

                # draw one token and append it to the running sequence
                sampled = torch.multinomial(probs, num_samples=1)  # (1, 1)
                idx = torch.cat((idx, sampled), dim=1)  # (1, T+1)

                token = int(sampled.item())
                # index 0 doubles as the stop token
                if token == 0:
                    break
                letters.append(dataset.itos[token])

            word = ''.join(letters)
            if not word:
                # nothing was produced before the stop token; do not count it
                continue
            word_count += 1
            print(f"{word_count}. {word}")

    except KeyboardInterrupt:
        print(f"\n\nGeneration stopped. Generated {word_count} words.")

# Pick the backend by name: MPS (Apple Silicon) first, then CUDA (NVIDIA), else CPU
device = 'cpu'
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
print({
    'mps': "Using MPS (Metal Performance Shaders)",
    'cuda': "Using CUDA",
    'cpu': "Using CPU",
}[device])

# Endless sampling from the trained model (press Ctrl+C to stop)
generate_infinite(transformer, train_dataset, device=device)


Using MPS (Metal Performance Shaders)
Generating on device: mps
1. deliverr
2. for
3. with
4. and
5. for
6. takeng
7. a
8. party
9. in
10. of
11. each
12. ofdgcer
13. in
14. a
15. and
16. a
17. solely
18. upon
19. in
20. respect
21. a
22. to
23. for
24. b
25. of
26. year
27. the
28. molth
29. pursuant
30. a
31. 30
32. and
33. deendule
34. in
35. by
36. period
37. for
38. other
39. thres
40. in
41. or
42. behalf
43. capacy
44. no
45. provided
46. its
47. such
48. undngs
49. counterparty
50. support
51. a
52. unsa
53. transaction
54. of
55. imfeenay
56. each
57. a
58. apply
59. payment
60. as
61. sameshold
62. the
63. party
64. to
65. w
66. prevident
67. a
68. condinions
69. 4
70. substitute
71. other
72. will
73. assent
74. potter
75. deternal
76. relied
77. specified
78. a
79. a
80. in
81. or
82. its
83. facsimile
84. of
85. and
86. waxng
87. party
88. the
89. under
90. not
91. discsion
92. bld
93. will
94. v3
95. in
96. be
97. not
98. and
99. of
100. inrevupt
101. by
102. for
103. or
