In [28]:
import torch
import torch.nn as nn
from torch.nn import functional as F
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"GPU Count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

PyTorch version: 2.10.0+cpu
CUDA Available: False
GPU Count: 0


In [4]:
# Choosing a text file to train the bigram model on.
with open('Alice_In_Wonderland.txt', 'r', encoding='utf-8') as file:
    text = file.read()
# Printing some stuff about the text, like its length.
print(len(text))

145414


In [30]:
# Now we are going to make a \"characters\" variable to create a vocabulary list.
characters = sorted(list(set(text)))
print(characters)
# We can even see how many unique characters there are in the text.
print(len(characters))
# As you can see, there are 84 unique characters in the text.
vocab_size = len(characters) # Time to give this number a name.

['\n', ' ', '!', '#', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '5', '6', '7', '8', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ù', '—', '‘', '’', '“', '”', '\ufeff']
84


In [6]:
# Now we are going to make a tokenizer that maps characters to integers.
# The code below achieves this by creating two dictionaries: one for mapping characters to integers and another for mapping integers back to characters. 
# It also defines two functionsfor encoding strings to lists of integers and the reverse process.
string_to_int = {ch: i for i, ch in enumerate(characters)}
int_to_string = {i: ch for i, ch in enumerate(characters)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

# If we work with, maybe, a word-level model, we would want to tokenize by words instead of characters.
# With a word-level tokenizer, we would have, for example, every single word in the English language as a unique token.
# That's a lot of tokens, and for a multiple language model, we would have millions, billions, or trillions of tokens.
# But in that case, we would have a much smaller set to work with, resulting in a large vocabulary size and a very small amount to encode and decode.
# If you have a subword tokenizer, you would have a smaller vocabulary size than a word-level tokenizer, but a larger vocabulary size than a character-level tokenizer.
# In the context, of language models, it's really important that we are efficient with our data and that we have a good tokenizer that can capture the structure of the language while also being efficient in terms of the number of tokens it produces.
# It should also be noted that we will be using a machine learning framework called PyTorch to build this bigram model.
# We're going to add our data in now.
data = torch.tensor(encode(text), dtype=torch.long)
# Let's print the data to see what it looks like.
print (data [:100])


tensor([83, 41, 58, 55,  1, 37, 68, 65, 60, 55, 53, 70,  1, 28, 71, 70, 55, 64,
        52, 55, 68, 57,  1, 55, 23, 65, 65, 61,  1, 65, 56,  1, 22, 62, 59, 53,
        55,  4, 69,  1, 22, 54, 72, 55, 64, 70, 71, 68, 55, 69,  1, 59, 64,  1,
        44, 65, 64, 54, 55, 68, 62, 51, 64, 54,  0,  1,  1,  1,  1,  0, 41, 58,
        59, 69,  1, 55, 52, 65, 65, 61,  1, 59, 69,  1, 56, 65, 68,  1, 70, 58,
        55,  1, 71, 69, 55,  1, 65, 56,  1, 51])


In [31]:
# Now we are going to split the data into a training set and a validation set.
# n is the index at which we will split the data. We will use 80% of the data for training and 20% for validation.
# The general rule for splitting data is to use 80% for training and 20% for validation, based on Pareto principle.
# However, the optimal split can depend on the size of the dataset and the specific task at hand.
# In this case, using an 80/20 split should work well for training this bigram model.
n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:]
print(f"Training data length: {len(train_data)}")
print(f"Validation data length: {len(val_data)}")

Training data length: 116331
Validation data length: 29083


In [18]:
# Now we are going to make a function that will generate batches of data for training the model.
# The function will take in the batch size and the block size as arguments.
# We will take a set of predictions and offset the targets by one character.
# This way, the model will learn to predict the next character in the sequence.
block_size = 8  # The block size is the number of characters the model will look at to make a prediction.

x = train_data[:block_size]  # The input data will be the first block of characters.
y = train_data[1:block_size+1]  # The target data will be the next block of characters, offset by one.

for i in range(block_size):
    input = x[:i+1]  # The input will be the characters up to the current index. This is also called the "context".
    target = y[i]  # The target will be the character at the current index.
    print(f'Input: {input}, Target: {target}')

Input: tensor([83]), Target: 41
Input: tensor([83, 41]), Target: 58
Input: tensor([83, 41, 58]), Target: 55
Input: tensor([83, 41, 58, 55]), Target: 1
Input: tensor([83, 41, 58, 55,  1]), Target: 37
Input: tensor([83, 41, 58, 55,  1, 37]), Target: 68
Input: tensor([83, 41, 58, 55,  1, 37, 68]), Target: 65
Input: tensor([83, 41, 58, 55,  1, 37, 68, 65]), Target: 60


In [58]:
# Now we can take these blocks, stack them, and push them to the GPU to scale up the training process.
# We can represent this stack of blocks as a new hyperparameter called "batch size".
# First, let's check if we have a GPU available and set the device accordingly.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
# Looks like we need to find an available GPU.
# How to find an available GPU? 
# You can use the command "nvidia-smi" in your terminal to check the status of your GPUs. 
# It will show you the memory usage and the processes running on each GPU. 
# You can choose a GPU that has low memory usage and is not being heavily used by other processes.
# Since GPU is not available, so we will be using the CPU for training.
batch_size = 4
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device) # If we had a GPU, this would move the data to the GPU for parallel processing.
    return x, y
x, y = get_batch('train')
print("Inputs: ")
print(x.shape)  # This will show the shape of the input batch, which should be (batch_size, block_size).
print(x)
print("Targets: ")
print(y.shape)  # This will show the shape of the target batch, which should also be (batch_size, block_size).
print(y)

Using device: cpu
Inputs: 
torch.Size([4, 8])
tensor([[82,  1, 69, 51, 59, 54,  1, 70],
        [ 1, 59, 70,  0, 56, 55, 62, 70],
        [55,  1, 66, 59, 55, 53, 55, 69],
        [58, 59, 64, 57,  1, 73, 65, 68]])
Targets: 
torch.Size([4, 8])
tensor([[ 1, 69, 51, 59, 54,  1, 70, 58],
        [59, 70,  0, 56, 55, 62, 70,  1],
        [ 1, 66, 59, 55, 53, 55, 69,  1],
        [59, 64, 57,  1, 73, 65, 68, 70]])


In [71]:
@torch.no_grad()  # This tells PyTorch not to compute gradients, which is useful for inference and evaluation.
def estimate_loss():
    out = {}
    model.eval()  # This sets the model to evaluation mode, which is important for certain layers like dropout and batch normalization.
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()  # This sets the model back to training mode.
    return out

In [77]:
# Now we are going to use gradient descent to train the model.
# PyTorch has a variety of optimizers that we can use for this purpose.
# We are going to use the AdamW optimization algorithm.
# Adam is an optimization algorithm that uses a moving average of the gradient and its square value to adapt the learning rate of each parameter.
# AdamW is a variant of the Adam optimizer that decouples weight decay from the gradient update, which can lead to better performance in some cases.

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)  # This creates an embedding table that maps each token to a vector of the same size as the vocabulary.
    
    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)  # Logits are a bunch of floating point numbers that are normalized in a probability distribution. They represent the model's predictions for the next character in the sequence.
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape # B is the batch size, T is the time dimension, and C is the channel dimension. The time dimension is named as such because there are some tokens that we do know and some that we do not know at a given time step. Channels are just how many channels we are going to have, which would be our vocab size.
            logits = logits.view(B*T, C) # We are paying attention to the channel dimension, so we can blend the batch and time dimensions together.
            targets = targets.view(B*T) # The targets will have the same batch and time dimensions, which should be alright.
            loss = F.cross_entropy(logits, targets) # Cross entropy loss is a common loss function used for classification problems. It measures the difference between the predicted probability distribution and the true distribution.
            # We essentially reshaped this data because PyTorch expects this shape to be a (N, C) shape, where N is the number of samples and C is the number of classes.
            # When passing the logits into a functional, it is important to understand what shape PyTorch expects the data to be in, and how to reshape the data to fit that shape.
        
        return logits, loss

    def generate(self, index, max_new_tokens):
        # Index is (B, T) shape, where B is the batch size and T is the time dimension. This represents the current context of tokens that we have generated so far.
        for _ in range(max_new_tokens):
            # Get the predictions
            logits, loss = self.forward(index)
            # Focus only on the last time step
            logits = logits[:, -1, :]  # Specifying the last time step creates (B, C) shape.
            # Apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # Softmax is a function that converts logits into probabilities. It takes the exponent of each logit and normalizes them so that they sum to 1. (B, C) shape.
            # Sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1)  # Multinomial sampling is a way to sample from a probability distribution. It takes the probabilities and samples an index based on those probabilities. (B, 1) shape.
            # Append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1)  # (B, T+1) shape.
        return index

model = BigramLanguageModel(vocab_size)
m = model.to(device) # This will move the model to the GPU if it is available, otherwise it will stay on the CPU.

context = torch.zeros((1, 1), dtype=torch.long, device=device) # This is the initial context for generation. It starts with a single token (the start token) and will be updated as we generate new tokens.
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist()) # This will generate a sequence of 500 new tokens based on the initial context and decode it back into characters.
print(generated_chars)


EmDEeR*.:jQU3ai?wZ-﻿’8(us-‘e1mo’Brkf(WP*F#8BM2’uLGh8Em6.Fw_c﻿'*)wObQ”Pz3gvT ”Br‘ts_3'Dx)vMd!G]‘7“0q!Q5(ùY16'pdaKvYw_5dJy)“v5-—Yv16fUO(l!ML)6yqDCru,
Zic(UM#mz*U32’]“Nu﻿g8X
'PO6b!nMjI#7‘3L!MzCu*QJTb
j;sLzofaZeZK’R8xKHt;'Q*(yHKS6c)r—kKHFp]t0E—Uf[U“!0ùo]FrSg[fsn.h_Es2w
)“l].?7k8(o]D.x w
s[F8fU)!onK
ES0kt[”YKS‘Nw_)rc18”8;)f_e”Tù Yk1z;X﻿BiHAs:02GGYUYG“;5AUZg‘UeW]Dy”5 R—.3-m3,UY‘JaC]ùe1nU86RLgu1hfs5ioesf
K]rLrq’3So’aErJ,MWP.ù2Tt_WOKm’rs5ZaGùIdnYSKe-*U).-﻿xJ*WPpMeZ#.F#x(s[
)Zb,N#8’[D”’uJ#KvfO6cu:“Fgyj;ù


In [81]:
# Create the optimization algorithm and the learning rate.
learning_rate = 3e-4 # Picking a learning rate. A common learning rate is 3e-4, but you can experiment with different learning rates to test the performance of the model.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate) # As discussed, we will use the AdamW optimization algorithm.

max_iters = 10000  # Define number of training iterations. 10000 iterations is generally good for this case, but you can experiment with different values to test the performance of the model.

# We can even add loss reporting with /"eval_iters/" to see how the model is improving during training.
eval_iters = 250

for iter in range(max_iters):

    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f'Step: {iter} Train loss: {losses["train"]:.3f}, Val loss: {losses["val"]:.3f}')
    # Sample a batch of data
    x, y = get_batch('train')

    # Evaluate the loss
    logits, loss = model.forward(x, y)
    optimizer.zero_grad(set_to_none=True) # This will reset the gradients of the model parameters to zero before we perform backpropagation. This is important because PyTorch accumulates gradients by default, so we need to clear them out before each optimization step.
    loss.backward() # This will perform backpropagation and compute the gradients of the loss with respect to the model parameters.
    optimizer.step() # This will update the model parameters based on the computed gradients and the optimization algorithm.
print(loss.item()) # This will print the final loss after training the model. The .item() method is used to get the scalar value of the loss tensor.

Step: 0 Train loss: 2.676, Val loss: 2.710
Step: 250 Train loss: 2.655, Val loss: 2.683
Step: 500 Train loss: 2.670, Val loss: 2.683
Step: 750 Train loss: 2.655, Val loss: 2.689
Step: 1000 Train loss: 2.669, Val loss: 2.664
Step: 1250 Train loss: 2.614, Val loss: 2.638
Step: 1500 Train loss: 2.639, Val loss: 2.658
Step: 1750 Train loss: 2.620, Val loss: 2.635
Step: 2000 Train loss: 2.621, Val loss: 2.620
Step: 2250 Train loss: 2.608, Val loss: 2.645
Step: 2500 Train loss: 2.622, Val loss: 2.621
Step: 2750 Train loss: 2.627, Val loss: 2.615
Step: 3000 Train loss: 2.580, Val loss: 2.615
Step: 3250 Train loss: 2.596, Val loss: 2.609
Step: 3500 Train loss: 2.608, Val loss: 2.606
Step: 3750 Train loss: 2.589, Val loss: 2.615
Step: 4000 Train loss: 2.583, Val loss: 2.599
Step: 4250 Train loss: 2.606, Val loss: 2.575
Step: 4500 Train loss: 2.567, Val loss: 2.579
Step: 4750 Train loss: 2.573, Val loss: 2.566
Step: 5000 Train loss: 2.553, Val loss: 2.569
Step: 5250 Train loss: 2.561, Val loss: 

In [82]:
# We run the same generation code as before to see how the model has improved after training.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


So hed_ lind y fup,” Qn blitotrd.” t caud
nggalopenr me ase; Aliliserded—_m’se howe welale

QJmith, she d s, f  lof smarelit f)“If as?Nke whede as
ntheno is y!﻿7in “R*jUN!, y cte, d dlderend ve




f hash, s tomeeirt pus t dy het ar—“On’ru?Cr s k’lbans “orrue won shind, wnog toupuzJ[D2GTh,
‘3f seticend imette d—”'f she tab0OJI o targooaypphelit C)fo?_[Mitit, ajityandeB—” erevllldngoowithere t Athit _y ssplllped heed ain he herovend  ifenothelony d ry, esed fuff-‘-﻿—YK0#5Nbsassheaien wnt, ouitlor
