In [1]:
# Load the whole training corpus into memory as a single string
with open("data/input.txt", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

In [2]:
# Report how many characters the corpus contains
print(
    "Length of dataset in characters: ",
    len(text),
)

Length of dataset in characters:  1115394


In [3]:
# Preview the opening 1000 characters of the corpus
print(text[0:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [4]:
# Build the vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print("".join(chars))
print("Number of unique characters: ", vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Number of unique characters:  65


In [5]:
# Tokenize
# We build a character-level model, so every character is mapped to a unique integer id.

# Lookup tables between characters and integer ids
stoi = {ch: i for i, ch in enumerate(chars)}  # stoi: string to integer
itos = {i: ch for i, ch in enumerate(chars)}  # itos: integer to string


def encode(s):
    """Encode a string as a list of integer token ids, one id per character.

    Raises:
        KeyError: if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decode an iterable of integer token ids back into a string.

    Raises:
        KeyError: if an id is not a key of `itos`.
    """
    return ''.join(itos[i] for i in l)


# Round-trip test of the encoder and decoder
print(encode('hii there'))
print(decode(encode('hii there')))

# There are many tokenization schemes. Production language models usually use
# subword tokenizers (e.g. byte-pair encoding), which trade a larger vocabulary
# for shorter sequences.
# We keep it simple here and use character-level tokenization for easier understanding and implementation

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [6]:
# Now, encode the entire text
import torch

# A tensor generalizes a matrix to any number of dimensions; this one is 1-D, one id per character
data = torch.tensor(
    encode(text),
    dtype=torch.long,
)
print(data, data.dtype)
print(data[0:1000])  # the first 1000 characters from before, as the GPT model will see them

tensor([18, 47, 56,  ..., 45,  8,  0]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 3

In [7]:
# Train on the first 90% of the tokens and hold out the remaining 10% for validation
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [8]:
# Usually we do not train on the entire dataset at once but on small chunks of it;
# block_size (a.k.a. context size) is the maximum length of such a chunk.
block_size = 8
train_data[0:block_size + 1]  # first 9 characters

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [9]:
# A chunk of block_size + 1 = 9 characters packs 8 training examples (printed below).
# Example t uses the first t + 1 characters as context and the following character as target,
# so the transformer gets to see every context size from 1 up to block_size.
# The transformer never receives more than block_size tokens of context at once.
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t, target in enumerate(y):
    context = x[:t+1]
    print(f"When input is {context} then target is {target}")

When input is tensor([18]) then target is 47
When input is tensor([18, 47]) then target is 56
When input is tensor([18, 47, 56]) then target is 57
When input is tensor([18, 47, 56, 57]) then target is 58
When input is tensor([18, 47, 56, 57, 58]) then target is 1
When input is tensor([18, 47, 56, 57, 58,  1]) then target is 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]) then target is 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) then target is 58


In [10]:
# GPUs work in parallel, so we feed them batches of independent, equally sized chunks
batch_size = 4  # number of independent sequences processed in parallel
block_size = 8  # maximum context length for a prediction
torch.manual_seed(1337)  # seed PyTorch's RNG before any batch is sampled

def get_batch(split):
    """Sample a random batch of input sequences and their next-token targets.

    Reads the module-level `train_data`, `val_data`, `batch_size` and `block_size`.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A pair (x, y) of stacked tensors with one row per sampled sequence;
        y holds the same windows as x shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # random start offsets, bounded so the shifted target window stays inside `source`
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start+block_size])
        targets.append(source[start+1:start+block_size+1])

    return torch.stack(inputs), torch.stack(targets)

# batch_size * block_size = 4 * 8 = 32 (context, target) examples in every batch

xb, yb = get_batch('train')
print('Inputs:', xb.shape, xb, sep='\n')
print('Targets:', yb.shape, yb, sep='\n')

print('----')

# Spell out every training example contained in the batch
for b in range(batch_size):  # batch dimension
    for t in range(block_size):  # time dimension
        context, target = xb[b, :t+1], yb[b, t]
        print(f"When input is {context.tolist()} then target is {target}")

Inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
Targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
When input is [24] then target is 43
When input is [24, 43] then target is 58
When input is [24, 43, 58] then target is 5
When input is [24, 43, 58, 5] then target is 57
When input is [24, 43, 58, 5, 57] then target is 1
When input is [24, 43, 58, 5, 57, 1] then target is 46
When input is [24, 43, 58, 5, 57, 1, 46] then target is 43
When input is [24, 43, 58, 5, 57, 1, 46, 43] then target is 39
When input is [44] then target is 53
When input is [44, 53] then target is 56
When input is [44, 53, 56] then target is 1
When input is [44, 53, 56, 1] then target is 58
When input is [44, 53, 56, 1, 58] 

In [11]:
# xb is the batch of inputs that we feed to the transformer
print(xb)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


What Does the Bigram Language Model Do?

- The Bigram Language Model predicts the next token in a sequence based only on the current token. For each token in the input sequence, the model looks up one row of the embedding table: a vector of length `vocab_size` whose entries are the unnormalized scores (logits) for each possible next token. Applying a softmax to that vector turns the scores into probabilities.
- In essence, it's learning bigram statistics—the probability of one token following another. It doesn't use sophisticated context like more advanced models, such as GPT, which consider the entire sequence of previous tokens.

This simple model serves as a foundation for more complex language models. It directly uses the current token to predict the next token without considering any longer context (like trigrams or beyond).

In [12]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Seed PyTorch's random number generator before the model is created and sampled from
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits are looked up from the current token alone.

    The only parameter is a (vocab_size, vocab_size) embedding table; row i holds
    the logits over the next token given that the current token is i.
    """

    def __init__(self, vocab_size):
        """Create the lookup table for a vocabulary of `vocab_size` tokens."""
        super().__init__()

        # Each token directly reads off the logits for the next token from a lookup table:
        # the layer outputs a vector of size vocab_size for every token in the input.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor holding the next token for every
                position of `idx`.

        Returns:
            A pair (logits, loss). Without targets, logits has shape (B, T, C) with
            C = vocab_size and loss is None. With targets, logits is returned
            flattened to (B*T, C) and loss is the cross-entropy over all B*T positions.
        """
        # One row of logits per input token: (B, T) -> (B, T, C)
        logits = self.token_embedding_table(idx)

        if targets is None:
            return logits, None

        # Intuitively, we score the prediction for each position against the token that
        # actually comes next; cross-entropy measures how far the predicted distribution
        # is from that target.
        #
        # F.cross_entropy expects the class dimension C in second position, i.e. (N, C)
        # or (B, C, T). Instead of permuting to (B, C, T) we merge the batch and time
        # dimensions so that every position becomes its own row: (B*T, C) vs. (B*T,).
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    # Feeding the whole running sequence back in is more than a bigram model needs (it only
    # looks at the last token), but the same loop can be reused unchanged with more complex
    # models that use the full context.
    @torch.no_grad()  # sampling never needs gradients, so do not build an autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: (B, T) integer tensor of token ids forming the current context.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            (B, T + max_new_tokens) integer tensor: `idx` followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # predictions for every position; the loss is None here and is not needed
            logits = self(idx)[0]

            # keep only the last time step: (B, T, C) -> (B, C)
            logits = logits[:, -1, :]

            # convert logits to probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)

            # sample the next token from that distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)

            # append the sampled token to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

        return idx
    
    
    
# Instantiate the model and run one forward pass on the sample batch
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape, loss, sep='\n')

# For a uniform guess over the vocabulary we would expect a loss of
# -ln(1/vocab_size) = -ln(1/65) = 4.17

# Generate some new text from the model, starting from a 1 x 1 tensor
# holding a zero (the new line character)
idx = torch.zeros((1, 1), dtype=torch.long)

# take the first (and only) sequence of the generated batch and decode it to text
print(
    decode(
        m.generate(idx, max_new_tokens=100)[0].tolist()
    )
)

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [13]:
# Create a PyTorch optimizer.
# The model is so simple that we can get away with a fairly high learning rate.
optimizer = torch.optim.AdamW(
    m.parameters(),
    lr=1e-3,
)

In [20]:
batch_size = 32  # get_batch reads this module-level value on every call

for steps in range(10000):
    # draw a fresh random batch of training data
    xb, yb = get_batch('train')

    # clear the old gradients, then forward pass, backward pass and parameter update
    optimizer.zero_grad(set_to_none=True)
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()

# loss of the last training batch only
print(f'Loss: {loss.item()}')


Loss: 2.390815258026123


In [21]:
# Sample 300 new tokens from the optimized bigram model and print them as text
print(
    decode(
        m.generate(idx, max_new_tokens=300)[0].tolist()  # first (and only) sequence of the batch
    )
)




Tod on resheen doffousatheonisuckiceyoul me ee
T:
PShintherd fu h ta w

COMende pos pitl tot s maru loofrlomson En; by; me t?
manchure.
CERoof Ye, INCEShe h hatharvecowitofochand tee oucough s; he als'd:
Alens iostist, t med thesthoa us, m s an;
An ase

heruerdeve-n o'sk;
GAnouilerrlince hane s s 
