# Chapter 5 Exercises

In [1]:
from importlib.metadata import version

# Packages whose installed versions are reported for reproducibility
pkgs = [
    "numpy",
    "tiktoken",
    "torch",
    "tensorflow",  # For OpenAI's pretrained weights
]
for p in pkgs:
    print(p, "version:", version(p))

numpy version: 1.24.3
tiktoken version: 0.7.0
torch version: 2.4.1
tensorflow version: 2.11.0


# Exercise 5.1: Temperature-scaled softmax scores and sampling probabilities

In [2]:
import torch

# Toy vocabulary: token string -> token id
vocab = {
    "closer": 0,
    "every": 1,
    "effort": 2,
    "forward": 3,
    "inches": 4,
    "moves": 5,
    "pizza": 6,
    "toward": 7,
    "you": 8,
}
# Reverse mapping: token id -> token string
inverse_vocab = {v: k for k, v in vocab.items()}

# One logit per vocabulary entry, indexed by token id
next_token_logits = torch.tensor(
    [4.51, 0.89, -1.90, 6.75, 1.63, -1.62, -1.89, 6.28, 1.79]
)


def print_sampled_tokens(probas):
    """Draw 1,000 token ids from `probas` and print how often each token occurred.

    Args:
        probas: 1D tensor of sampling probabilities, one entry per token id
            in `inverse_vocab`.

    The random seed is reset on every call, so repeated calls with the same
    probabilities print the same counts.
    """
    torch.manual_seed(123)
    sample = [torch.multinomial(probas, num_samples=1).item() for _ in range(1_000)]
    # Without `minlength`, bincount stops at the largest sampled id, so
    # never-sampled tokens at the end of the vocabulary would be left out
    # of the report instead of being listed with a count of 0.
    sampled_ids = torch.bincount(torch.tensor(sample), minlength=len(probas))
    for i, freq in enumerate(sampled_ids):
        print(f"{freq} x {inverse_vocab[i]}")


def softmax_with_temperature(logits, temperature):
    """Return the softmax (over dimension 0) of `logits` divided by `temperature`."""
    return torch.softmax(logits / temperature, dim=0)


temperatures = [1, 0.1, 5]  # Original, lower, and higher temperature
# One probability vector per entry of `temperatures`, in the same order
scaled_probas = [softmax_with_temperature(next_token_logits, T) for T in temperatures]

In [3]:
# Report the sampled token counts for each temperature setting
for temperature, probas in zip(temperatures, scaled_probas):
    print("\n\nTemperature:", temperature)
    print_sampled_tokens(probas)



Temperature: 1
71 x closer
2 x every
0 x effort
544 x forward
2 x inches
1 x moves
0 x pizza
376 x toward
4 x you


Temperature: 0.1
0 x closer
0 x every
0 x effort
992 x forward
0 x inches
0 x moves
0 x pizza
8 x toward


Temperature: 5
153 x closer
68 x every
55 x effort
223 x forward
102 x inches
50 x moves
43 x pizza
218 x toward
88 x you


In [4]:
# Read the probability of "pizza" at temperature 5 directly from the rescaled
# distribution instead of estimating it from the sampled counts
temp5_idx = temperatures.index(5)
pizza_idx = vocab["pizza"]

scaled_probas[temp5_idx][pizza_idx]

tensor(0.0430)

# Exercise 5.2: Different temperature and top-k settings

In [72]:
def generate(model, tokenizer, prompt, max_length=50, temperature=1.0, top_k=50):
    """Generate a continuation of `prompt` by sampling one token at a time.

    Args:
        model: Module called as `model(input_ids)`. Its output may be shaped
            (batch, seq_len, vocab_size) or (batch, vocab_size).
        tokenizer: Object providing `encode(str) -> list[int]` and
            `decode(list[int]) -> str`.
        prompt: Text to continue.
        max_length: Number of new tokens appended to the prompt.
        temperature: Divisor applied to the logits before sampling. Values
            <= 0 select the highest-scoring token (greedy decoding) instead of
            dividing by zero.
        top_k: Sample only among the `top_k` highest-scoring tokens. `None` or
            a value <= 0 disables the filter; values larger than the
            vocabulary are clamped to the vocabulary size.

    Returns:
        The prompt plus the generated tokens, decoded to a single string.

    Note:
        Puts `model` into eval mode. The whole sequence is passed to the model
        at every step and is never cropped here.
    """
    model.eval()
    device = next(model.parameters()).device
    input_ids = torch.tensor([tokenizer.encode(prompt)], dtype=torch.long).to(device)

    use_top_k = top_k is not None and top_k > 0

    with torch.no_grad():
        for _ in range(max_length):
            logits = model(input_ids)
            # Keep only the next-token scores of the first sequence (3D and 2D outputs)
            logits = logits[0, -1] if logits.dim() == 3 else logits[0]

            if temperature <= 0:
                # Greedy decoding: no scaling, no sampling
                next_token = torch.argmax(logits, dim=-1, keepdim=True)
            else:
                logits = logits / temperature
                if use_top_k:
                    # torch.topk raises if k exceeds the number of entries
                    k = min(top_k, logits.size(-1))
                    top_k_logits, top_k_indices = torch.topk(logits, k)
                    probs = torch.softmax(top_k_logits, dim=-1)
                    # Map the sampled position back to its vocabulary id
                    next_token = top_k_indices[torch.multinomial(probs, num_samples=1)]
                else:
                    probs = torch.softmax(logits, dim=-1)
                    next_token = torch.multinomial(probs, num_samples=1)

            input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1)

    return tokenizer.decode(input_ids[0].tolist())


In [73]:
# NOTE(review): `model` and `tokenizer` are first assigned in the Exercise 5.3
# cells further down, so this cell fails on a fresh top-to-bottom run until
# that setup is moved above it. TODO: confirm the intended model and move it.
prompt = "The future of AI is"

# Compare every temperature / top-k combination on the same prompt
for temp in [0.7, 1.0, 1.3]:
    for k in [10, 50, 100]:
        # Re-seed before each run so the sampled text is reproducible and
        # differences come from the settings rather than from the RNG state
        torch.manual_seed(123)
        print(f"\n=== Temp: {temp}, Top-k: {k} ===")
        print(generate(model, tokenizer, prompt, temperature=temp, top_k=k))



=== Temp: 0.7, Top-k: 10 ===
The future of AI is to work on--forming, as it were, so inevitably the background of her own picture--had lent herself in an unusual degree to the display of this false virtuosity. The picture was one of Jack's "strongest," as his own

=== Temp: 0.7, Top-k: 50 ===
The future of AI is thought of his pictures with a cheap genius--though a good She wanted him vindicated--and by me!"
" resolve had been taken. It might be that he had married her--the quality of Jack's "strongest," as his own

=== Temp: 0.7, Top-k: 100 ===
The future of AI is the inevitable garlanded frame. The mere outline of the frame called up all Gisburn's past!
"Oh, you like."
" (I didn't seen him to have him done by a fashionable painter--that I found her

=== Temp: 1.0, Top-k: 10 ===
The future of AI is my diagnosis go a little wild--I must that lifted the frame called up at him down an exquisitely appointed luncheon-table, when, fitting that they should mourn him. It had longed to say: "

Lower temperature and top-k values yield more coherent and focused output, which is useful for applications like legal document drafting or summarization. Higher temperature and top-k values increase diversity, which is better suited for creative tasks like poetry, storytelling, or brainstorming.

# Exercise 5.3: Deterministic behavior in the decoding functions

In [11]:
import os
# Show the working directory; the relative paths used in later cells
# (e.g. "01_main-chapter-code") are resolved against it
print(os.getcwd())


/mnt/c/Users/tarek/OneDrive - Trinity College/Documents/Trinity College/Senior/CPSC-352/LLMs-from-scratch/ch05


In [13]:
import tiktoken
import torch
import sys
sys.path.append("01_main-chapter-code")

from previous_chapters import GPTModel


GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 256,  # Shortened context length (orig: 1024)
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-key-value bias
}


torch.manual_seed(123)

tokenizer = tiktoken.get_encoding("gpt2")
model = GPTModel(GPT_CONFIG_124M)
model_path = "01_main-chapter-code/model.pth"
# map_location keeps the load working on CPU-only machines even when the
# checkpoint was written from a GPU; the model is kept on the CPU in this cell
model.load_state_dict(torch.load(model_path, map_location="cpu", weights_only=True))
# Inference mode for the decoding experiments below
model.eval();

In [15]:
from gpt_generate import generate, text_to_token_ids, token_ids_to_text
from previous_chapters import generate_text_simple

In [16]:
# Baseline: generate_text_simple is the deterministic decoding function
# (it picks tokens with torch.argmax), so its output serves as the reference

start_context = "Every effort moves you"

token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=25,
    context_size=GPT_CONFIG_124M["context_length"]
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you?"
"I that my hostess was "interesting": on that point I could have given Miss Croft the fact,


In [17]:
# Deterministic behavior: top-k filtering is turned off (top_k=None) and
# temperature scaling is turned off (temperature=0.0)

token_ids = generate(
    model=model,
    idx=text_to_token_ids("Every effort moves you", tokenizer),
    max_new_tokens=25,
    context_size=GPT_CONFIG_124M["context_length"],
    top_k=None,
    temperature=0.0
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you?"
"I that my hostess was "interesting": on that point I could have given Miss Croft the fact,


In [18]:
# Same settings as the previous cell (top_k=None, temperature=0.0), executed
# again to check that the generated text does not change between runs

token_ids = generate(
    model=model,
    idx=text_to_token_ids("Every effort moves you", tokenizer),
    max_new_tokens=25,
    context_size=GPT_CONFIG_124M["context_length"],
    top_k=None,
    temperature=0.0
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you?"
"I that my hostess was "interesting": on that point I could have given Miss Croft the fact,


# Exercise 5.4: Continued pretraining

In [20]:
import tiktoken
import torch
from previous_chapters import GPTModel


GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = tiktoken.get_encoding("gpt2")

# Restore the saved model and optimizer state. map_location places the
# checkpoint tensors on the active device, so a checkpoint written on a GPU
# machine still loads on a CPU-only one.
checkpoint = torch.load(
    "01_main-chapter-code/model_and_optimizer.pth",
    map_location=device,
    weights_only=True,
)
model = GPTModel(GPT_CONFIG_124M)
model.load_state_dict(checkpoint["model_state_dict"])
model.to(device)

# Build the optimizer on the already-moved parameters, then restore its state
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
# Training mode for the continued pretraining in the following cells
model.train();

In [21]:
import os
import urllib.request
from previous_chapters import create_dataloader_v1


file_path = "the-verdict.txt"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

# Reuse the local copy when it exists; otherwise download the text once and
# store it next to the notebook for later runs
if os.path.exists(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        text_data = file.read()
else:
    with urllib.request.urlopen(url) as response:
        text_data = response.read().decode('utf-8')
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(text_data)


# Train/validation ratio: the first 90% of the characters go to training,
# the remaining 10% to validation
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]


# Seed before building the loaders because the training loader shuffles
torch.manual_seed(123)

# max_length and stride are both set to the model's context length
# (presumably giving non-overlapping windows; confirm in create_dataloader_v1)
train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

# Validation loader: same windowing, but unshuffled and keeping the last batch
val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [22]:
from gpt_train import train_model_simple

# Continue pretraining the restored model (with its restored optimizer state)
# for one additional epoch
num_epochs = 1
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context="Every effort moves you", tokenizer=tokenizer
)

Ep 1 (Step 000000): Train loss 0.438, Val loss 6.438
Ep 1 (Step 000005): Train loss 0.302, Val loss 6.531
Every effort moves you?" "Oh, pushed one of the deep arm-chairs forward. "There: make yourself comfortable--and here are the cigars you like." "Oh, and he had dropped his painting, had been the man of the hour. The


# Exercise 5.5: Training and validation set losses of the pretrained model

In [23]:
import tiktoken
import torch
from previous_chapters import GPTModel


# Base configuration; later cells copy it and override the context length and
# qkv_bias settings before loading the OpenAI weights
GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}


torch.manual_seed(123)

tokenizer = tiktoken.get_encoding("gpt2")

In [24]:
from gpt_download import download_and_load_gpt2

# Fetch the GPT-2 124M files into the local "gpt2" directory (see the progress
# output below) and keep the returned settings and parameters
settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")

checkpoint: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77.0/77.0 [00:00<00:00, 12.5kiB/s]
encoder.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.04M/1.04M [00:00<00:00, 2.35MiB/s]
hparams.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90.0/90.0 [00:00<00:00, 12.0kiB/s]
model.ckpt.data-00000-of-00001: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 498M/498M [02:06<00:00, 3.94MiB/s]
model.ckpt.index: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5.21k/5.21k [00:00<00:00, 551kiB/s]
model.ckpt.meta: 100

In [51]:
# NOTE(review): same call and arguments as the previous cell; it only
# reassigns `settings` and `params` and looks redundant. Confirm before removing.
settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")


checkpoint: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77.0/77.0 [00:00<00:00, 11.2kiB/s]
encoder.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.04M/1.04M [-1:59:58<00:00, -476kiB/s]
hparams.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90.0/90.0 [00:00<00:00, 11.9kiB/s]
model.ckpt.data-00000-of-00001: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 498M/498M [02:09<00:00, 3.84MiB/s]
model.ckpt.index: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5.21k/5.21k [00:00<00:00, 574kiB/s]
model.ckpt.meta: 100

In [53]:
# Architecture settings that differ between the GPT-2 model sizes
model_configs = {
    "gpt2-small (124M)": dict(emb_dim=768, n_layers=12, n_heads=12),
    "gpt2-medium (355M)": dict(emb_dim=1024, n_layers=24, n_heads=16),
    "gpt2-large (774M)": dict(emb_dim=1280, n_layers=36, n_heads=20),
    "gpt2-xl (1558M)": dict(emb_dim=1600, n_layers=48, n_heads=25),
}

# Start from the base configuration, apply the size-specific settings, then
# override the context length and QKV bias
model_name = "gpt2-small (124M)"  # Example model name
NEW_CONFIG = {
    **GPT_CONFIG_124M,
    **model_configs[model_name],
    "context_length": 1024,
    "qkv_bias": True,
}

gpt = GPTModel(NEW_CONFIG)
gpt.eval();

In [54]:
from gpt_generate import load_weights_into_gpt


# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the downloaded parameters into the model, then move the model to the device
load_weights_into_gpt(gpt, params)
gpt.to(device);

In [55]:
import os
import urllib.request
from previous_chapters import create_dataloader_v1


file_path = "the-verdict.txt"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

# Download the text once and cache it locally; later runs read the cached file
if not os.path.exists(file_path):
    with urllib.request.urlopen(url) as response:
        text_data = response.read().decode('utf-8')
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(text_data)
else:
    with open(file_path, "r", encoding="utf-8") as file:
        text_data = file.read()


# Train/validation ratio: the first 90% of the characters go to training,
# the remaining 10% to validation
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]


# Seed before building the loaders because the training loader shuffles
torch.manual_seed(123)

# NOTE: the windows use the 256-token context length from GPT_CONFIG_124M,
# not the 1024 tokens set in NEW_CONFIG for the pretrained model
train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

# Validation loader: same windowing, but unshuffled and keeping the last batch
val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [56]:
from gpt_train import calc_loss_loader

torch.manual_seed(123) # For reproducibility due to the shuffling in the data loader
# Loss of the pretrained 124M model on "the-verdict.txt", without any further training
train_loss = calc_loss_loader(train_loader, gpt, device)
val_loss = calc_loss_loader(val_loader, gpt, device)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 3.754763205846151
Validation loss: 3.559633255004883


In [57]:
# Repeat the loss evaluation with the largest GPT-2 model (1558M)
settings, params = download_and_load_gpt2(model_size="1558M", models_dir="gpt2")

# Same configuration recipe as for the small model, with the XL settings
model_name = "gpt2-xl (1558M)"
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_configs[model_name])
NEW_CONFIG.update({"context_length": 1024, "qkv_bias": True})

# Rebinding `gpt` replaces the 124M model evaluated in the previous cell
gpt = GPTModel(NEW_CONFIG)
gpt.eval()

load_weights_into_gpt(gpt, params)
gpt.to(device)

# Same seed and loaders as for the 124M evaluation above
torch.manual_seed(123)
train_loss = calc_loss_loader(train_loader, gpt, device)
val_loss = calc_loss_loader(val_loader, gpt, device)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

checkpoint: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77.0/77.0 [00:00<00:00, 11.0kiB/s]
encoder.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.04M/1.04M [00:00<00:00, 2.10MiB/s]
hparams.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 91.0/91.0 [00:00<00:00, 13.6kiB/s]
model.ckpt.data-00000-of-00001: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.23G/6.23G [26:05<00:00, 3.98MiB/s]
model.ckpt.index: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20.7k/20.7k [00:00<00:00, 488kiB/s]
model.ckpt.meta: 100

Training loss: 3.304650730556912
Validation loss: 3.1195342540740967


# Exercise 5.6: Trying larger models

In [62]:
import tiktoken
import torch
from previous_chapters import GPTModel


# Base configuration; the next cell copies it and applies the XL model settings
GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}


tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
from gpt_download import download_and_load_gpt2
from gpt_generate import load_weights_into_gpt


# Architecture settings that differ between the GPT-2 model sizes
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Start from the base configuration, apply the XL settings, then override the
# context length and QKV bias
model_name = "gpt2-xl (1558M)"
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_configs[model_name])
NEW_CONFIG.update({"context_length": 1024, "qkv_bias": True})

gpt = GPTModel(NEW_CONFIG)
gpt.eval()

# The model is not moved to another device in this cell
settings, params = download_and_load_gpt2(model_size="1558M", models_dir="gpt2")
load_weights_into_gpt(gpt, params)

In [64]:
from gpt_generate import generate, text_to_token_ids, token_ids_to_text

In [65]:
# Fixed seed so the sampled continuation is reproducible
torch.manual_seed(123)

# Sample 25 new tokens from the XL model with top-k filtering and a temperature of 1.5
token_ids = generate(
    model=gpt,
    idx=text_to_token_ids("Every effort moves you", tokenizer),
    max_new_tokens=25,
    context_size=NEW_CONFIG["context_length"],
    top_k=50,
    temperature=1.5
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you closer… You are all here for us… Please take us to God!" she cried.
But they knew to hold the
