EVALS

In [11]:
import matplotlib
import numpy
import tiktoken
import torch
# import tensorflow

In [3]:
# Hyperparameters passed to GPTModel; emb_dim / n_heads / n_layers match the
# "gpt2-small (124M)" entry of model_configs defined near the end of the notebook.
GPT_CONFIG_124M = {
    "vocab_size": 50257,  # presumably the size of the tiktoken "gpt2" vocabulary - TODO confirm
    "context_length": 256,  # overridden to 1024 later when the pretrained weights are loaded
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False  # overridden to True later when the pretrained weights are loaded
}

In [None]:
from llm_from_scratch import GPTModel

In [5]:
# Seed first so the randomly initialized weights are the same on every run.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# Inference mode for the untrained-model experiments below; the trailing ";"
# hides the module repr in the cell output.
model.eval();

In [None]:
import tiktoken
from llm_from_scratch import generate_simple_text

def text_to_token_ids(text, tokenizer):
    """Encode ``text`` into a batch of token IDs.

    Args:
        text: String to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)``.

    Returns:
        A tensor of token IDs with a leading batch dimension of size 1.
    """
    token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    # unsqueeze(0) prepends the batch dimension: (num_tokens,) -> (1, num_tokens)
    return torch.tensor(token_ids).unsqueeze(0)

In [7]:
# Prompt and tokenizer reused by the generation cells below.
start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

# Sanity check: display the encoded prompt (batch dimension included).
text_to_token_ids(start_context, tokenizer)

tensor([[6109, 3626, 6100,  345]])

In [24]:
def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token IDs back into a string.

    Args:
        token_ids: Tensor of token IDs; a leading dimension of size 1
            (the batch dimension) is removed before decoding.
        tokenizer: Object exposing ``decode(list_of_ids)``.

    Returns:
        The decoded text.
    """
    ids = token_ids.squeeze(0).tolist()
    return tokenizer.decode(ids)

In [9]:
# Generate 10 new tokens from the still-untrained model, so the continuation
# is expected to be gibberish.
out = generate_simple_text(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
)


In [10]:
# Decode prompt + generated tokens back to text.
token_ids_to_text(out, tokenizer)

'Every effort moves you rentingetic wasnم refres RexAngel infieldcigans'

CROSS ENTROPY

In [11]:
# Two toy sequences of three token IDs each.
inputs = torch.tensor([
    [16833, 3626, 6100],
    [40, 1107, 588],
])

# Targets are the inputs shifted one position to the left (next-token prediction).
targets = torch.tensor([
    [3626, 6100, 345],
    [1107, 588, 11311],
])

In [12]:
# Forward pass only; no_grad avoids building the autograd graph.
with torch.no_grad():
    logits = model(inputs)

In [13]:
# One score per vocabulary entry for each of the 2 x 3 input positions.
logits.shape

torch.Size([2, 3, 50257])

In [14]:
# Normalize the scores over the last (vocabulary) axis into probabilities.
probas = torch.softmax(logits, dim=-1)
probas.shape

torch.Size([2, 3, 50257])

In [15]:
# Most probable token per position; keepdim=True keeps a trailing axis of size 1.
token_ids = torch.argmax(probas, dim=-1, keepdim=True)
print(f"Token IDs: {token_ids}")

Token IDs: tensor([[[16657],
         [  339],
         [42826]],

        [[49906],
         [29669],
         [41751]]])


In [16]:
# Compare expected and predicted text for the first sequence of the batch only.
print(f"Target batch: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Output batch: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")


Target batch:  effort moves you
Output batch:  Armed heNetflix


In [17]:
# cross_entropy wants one row of scores per target: merge the batch and token
# axes of the logits and flatten the targets to a single axis.
logits_flat = logits.flatten(0, 1)
targets_flat = targets.flatten()

In [18]:
# Cross-entropy of the untrained model over the six target tokens.
torch.nn.functional.cross_entropy(logits_flat, targets_flat)


tensor(10.8009)

TRAIN AND VALIDATION

In [19]:
import os
import requests

file_path = "data/the_verdict.txt"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

# Download the text once and cache it on disk; later runs read the cached copy.
if not os.path.exists(file_path):
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    text_data = response.text
    # Create the target directory first: on a fresh checkout "data/" does not
    # exist and open(..., "w") would raise FileNotFoundError.
    os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(text_data)
else:
    with open(file_path, "r", encoding="utf-8") as file:
        text_data = file.read()

In [20]:
# Peek at the first characters to confirm the text loaded correctly.
text_data[:55]

'I HAD always thought Jack Gisburn rather a cheap genius'

In [21]:
# Corpus size in characters and in tokens of the tokenizer defined above.
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))

total_characters, total_tokens

(20479, 5145)

In [None]:
from llm_from_scratch import create_dataloader_v1, generate_simple_text

# 90/10 train/validation split. The split index is computed on the raw string,
# so the cut is made by characters, not by tokens.
train_ratio = 0.9
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

In [23]:
# Seeded presumably so the shuffled training order is repeatable - TODO confirm
# against create_dataloader_v1.
torch.manual_seed(123)

# stride == max_length in both loaders; presumably this yields non-overlapping
# windows - verify against create_dataloader_v1.
train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

# Validation keeps its order (shuffle=False) and its last partial batch (drop_last=False).
val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [24]:
# Sanity check: print the shape of every (input, target) batch of the training loader.
print("Train loader")
for x, y in train_loader:
    # pass
    print(x.shape, y.shape)
    

Train loader
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])


In [25]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Compute the cross-entropy loss of ``model`` on a single batch.

    Args:
        input_batch: Tensor of input token IDs.
        target_batch: Tensor of target token IDs.
        model: Callable that maps ``input_batch`` to logits.
        device: Device both batches are moved to before the forward pass.

    Returns:
        The loss tensor returned by ``torch.nn.functional.cross_entropy``.
    """
    inputs = input_batch.to(device)
    labels = target_batch.to(device)
    scores = model(inputs)
    # Merge the first two axes of the logits and flatten the targets so there
    # is exactly one row of scores per target token.
    return torch.nn.functional.cross_entropy(
        scores.flatten(0, 1), labels.flatten()
    )


def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average ``calc_loss_batch`` over the first batches of a data loader.

    Args:
        data_loader: Sized iterable of ``(input_batch, target_batch)`` pairs.
        model: Model forwarded to ``calc_loss_batch``.
        device: Device forwarded to ``calc_loss_batch``.
        num_batches: How many batches to evaluate. ``None`` means all of them;
            larger values are capped at ``len(data_loader)``.

    Returns:
        The mean per-batch loss as a float, or ``nan`` for an empty loader.
    """
    available = len(data_loader)
    if available == 0:
        return float("nan")

    # Default to every batch; never ask for more batches than the loader has.
    num_batches = available if num_batches is None else min(num_batches, available)

    running_loss = 0.
    for batch_idx, (input_batch, target_batch) in enumerate(data_loader):
        if batch_idx >= num_batches:
            break
        running_loss += calc_loss_batch(input_batch, target_batch, model, device).item()
    return running_loss / num_batches

In [31]:
# The notebook is pinned to the CPU; the commented-out lines are the
# alternative that selects Apple's MPS backend when it is available.
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
device = "cpu"
# model.to(device);

In [32]:
# Baseline losses of the untrained model over the full train and validation loaders.
torch.manual_seed(123)

# No gradients are needed for evaluation.
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)

print(f"Training loss: {train_loss}")
print(f"Validation loss: {val_loss}")

Training loss: 10.988949563768175
Validation loss: 10.97446346282959


TRAINING

In [37]:
def train_model_simple(model, train_loader,
    val_loader, optimizer, device, num_epochs,
    eval_freq, eval_iter, start_context, tokenizer):
    """Run a basic training loop with periodic evaluation and text samples.

    Args:
        model: Model to train.
        train_loader: Iterable of ``(input_batch, target_batch)`` training pairs.
        val_loader: Loader used for the validation loss.
        optimizer: Optimizer stepped once per training batch.
        device: Device forwarded to the loss and sampling helpers.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Evaluate whenever the global step is a multiple of this value.
        eval_iter: Number of batches used per evaluation.
        start_context: Prompt for the sample printed after every epoch.
        tokenizer: Tokenizer used to encode/decode the sample.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)``: three lists with one
        entry per evaluation.
    """
    train_losses = []
    val_losses = []
    track_tokens_seen = []
    tokens_seen = 0
    # Starts at -1 so the first batch is step 0 and is evaluated immediately.
    global_step = -1

    for epoch in range(num_epochs):
        model.train()

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()  # drop gradients left over from the previous batch
            batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
            batch_loss.backward()
            optimizer.step()

            tokens_seen += input_batch.numel()
            global_step += 1

            # Only evaluate every eval_freq steps.
            if global_step % eval_freq != 0:
                continue

            train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter)
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            track_tokens_seen.append(tokens_seen)
            print(f"Ep {epoch+1} (Step {global_step:06d}): "
                  f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Show what the model generates at the end of each epoch.
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )

    return train_losses, val_losses, track_tokens_seen


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Compute train and validation loss without disturbing the model's mode.

    The model is switched to eval mode for the measurement and afterwards put
    back into whichever mode (train or eval) it was in when the function was
    called, even if the loss computation raises.

    Args:
        model: ``torch.nn.Module`` to evaluate.
        train_loader: Loader used for the training loss.
        val_loader: Loader used for the validation loss.
        device: Device forwarded to ``calc_loss_loader``.
        eval_iter: Maximum number of batches evaluated per loader.

    Returns:
        ``(train_loss, val_loss)`` as returned by ``calc_loss_loader``.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
            val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    finally:
        # Restore the caller's mode instead of unconditionally forcing train mode.
        model.train(was_training)
    return train_loss, val_loss


def generate_and_print_sample(model, tokenizer, device, start_context, max_new_tokens=50):
    """Generate a continuation of ``start_context`` and print it on one line.

    The model is switched to eval mode while generating and afterwards put
    back into whichever mode it was in when the function was called.

    Args:
        model: Model exposing ``pos_emb.weight``; its first dimension is used
            as the context size.
        tokenizer: Tokenizer used to encode the prompt and decode the result.
        device: Device the encoded prompt is moved to.
        start_context: Prompt text.
        max_new_tokens: Number of tokens to generate.
    """
    was_training = model.training
    model.eval()
    try:
        context_size = model.pos_emb.weight.shape[0]
        encoded = text_to_token_ids(start_context, tokenizer).to(device)
        with torch.no_grad():
            token_ids = generate_simple_text(
                model=model, idx=encoded,
                max_new_tokens=max_new_tokens, context_size=context_size
            )
        decoded_text = token_ids_to_text(token_ids, tokenizer)
        print(decoded_text.replace("\n", " "))  # Compact print format
    finally:
        # Restore the caller's mode instead of unconditionally forcing train mode.
        model.train(was_training)

In [34]:
# Fresh, reproducibly initialized model for the training run.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# Use the shared `device` variable rather than a hard-coded "cpu": every batch
# is moved to `device`, so the model must live on the same device.
model.to(device);
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

In [35]:
# Train for num_epochs, evaluating every 5 steps on 5 batches per loader.
# NOTE(review): the recorded output below stops at epoch 5 although
# num_epochs is 10 - it looks like the run was interrupted; confirm.
num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context="Every effort moves you", tokenizer=tokenizer
)

Ep 1 (Step 000000): Train loss 9.764, Val loss 9.903
Ep 1 (Step 000005): Train loss 8.115, Val loss 8.337
Every effort moves you,,,,,,,,,,,,.                                     
Ep 2 (Step 000010): Train loss 6.668, Val loss 7.049
Ep 2 (Step 000015): Train loss 5.982, Val loss 6.624
Every effort moves you, and, and, and, and, and, and, and, and,, and,,,,, and, and,, and, and, and,, and, and,, and, and,, and,, and
Ep 3 (Step 000020): Train loss 5.689, Val loss 6.482
Ep 3 (Step 000025): Train loss 5.687, Val loss 6.481
Every effort moves you, and, and the the of the the the the to the the.                                   
Ep 4 (Step 000030): Train loss 5.380, Val loss 6.435
Ep 4 (Step 000035): Train loss 4.965, Val loss 6.374
Every effort moves you his ""I and I had been the picture the picture and I had the picture, and I had been the                           
Ep 5 (Step 000040): Train loss 4.693, Val loss 6.249
Every effort moves you, and I had been a, and I was a a a his.         

/\ Overfitting: the validation loss plateaus while the training loss keeps going down.

In [45]:
# Short continuation from the trained model for a different prompt.
generate_and_print_sample(
    model, tokenizer, device, "He was the only",
    max_new_tokens=5
)

He was the only course I meant to do


The generation function always picks the token with the highest probability (greedy decoding), so with a model that has overfit this small training set it essentially reproduces memorized passages of the training text.

DECODING STRATEGIES TO CONTROL RANDOMNESS

In [None]:
# Generation with generate_simple_text, as the baseline for the sampling
# strategies below. Requires the trained `model` from the training section.
model.to("cpu")
model.eval()

tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_simple_text(
    model=model,
    idx=text_to_token_ids("Every effort moves you", tokenizer),
    max_new_tokens=25,
    context_size=GPT_CONFIG_124M["context_length"]
)
print(token_ids_to_text(token_ids=token_ids, tokenizer=tokenizer))


NameError: name 'model' is not defined

TEMPERATURE SCALING

Instead of always "brute forcing" the most probable token, use multinomial sampling: draw the next token at random in proportion to each token's probability.

In [52]:
def softmax_with_temperature(logits, temperature):
    """Turn logits into probabilities after dividing them by ``temperature``.

    The softmax is taken over the last axis, so both a single 1-D logit vector
    and a batch of logit rows are normalized per vocabulary distribution.

    Args:
        logits: Tensor of unnormalized scores.
        temperature: Positive scaling factor. Values above 1 flatten the
            distribution, values below 1 sharpen it.

    Returns:
        Tensor of probabilities with the same shape as ``logits``.

    Raises:
        ValueError: If ``temperature`` is a plain number that is not positive
            (dividing by it would yield inf/nan probabilities).
    """
    if isinstance(temperature, (int, float)) and temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")
    scaled_logits = logits / temperature
    # dim=-1 equals the previous dim=0 for 1-D input and is also correct for batches.
    return torch.softmax(scaled_logits, dim=-1)

In [60]:
# Compare the same logits at a neutral (1), a low (0.1) and a high (5) temperature.
temperatures = [1, 0.1, 5]
# NOTE(review): the third logit is -11.90 here but -1.90 in the top-k example
# further down - confirm which value is intended.
test_logits = torch.tensor(
    [4.51, 0.89, -11.90, 6.75, 1.63, -1.62, -1.89, 6.28, 1.79]
)


scaled_probas = [softmax_with_temperature(test_logits, T) for T in temperatures]

In [56]:
# One probability vector per temperature, in the order of `temperatures`.
scaled_probas

[tensor([6.0913e-02, 1.6314e-03, 4.5492e-09, 5.7218e-01, 3.4193e-03, 1.3258e-04,
         1.0121e-04, 3.5761e-01, 4.0126e-03]),
 tensor([1.8530e-10, 3.5189e-26, 0.0000e+00, 9.9099e-01, 5.7569e-23, 4.4220e-37,
         2.9718e-38, 9.0133e-03, 2.8514e-22]),
 tensor([0.1606, 0.0779, 0.0060, 0.2514, 0.0903, 0.0471, 0.0447, 0.2288, 0.0932])]

Increasing the temperature flattens the distribution: less likely tokens are scaled up, which adds "creativity" to the model output. Lowering the temperature sharpens it, doubling down on the most likely tokens.

TOP-K SAMPLING

Reduces the probability of generating garbage at the cost of some creativity: every logit outside the top k is masked with -inf, so its probability becomes 0 after softmax.

In [63]:
# Example logits for the top-k demonstration.
test_logits = torch.tensor(
    [4.51, 0.89, -1.90, 6.75, 1.63, -1.62, -1.89, 6.28, 1.79]
)
test_logits

tensor([ 4.5100,  0.8900, -1.9000,  6.7500,  1.6300, -1.6200, -1.8900,  6.2800,
         1.7900])

In [65]:
# torch.topk returns the k largest values in descending order plus their positions.
top_k = 3
top_logits, top_pos = torch.topk(test_logits, top_k)
top_logits, top_pos

(tensor([6.7500, 6.2800, 4.5100]), tensor([3, 7, 0]))

In [70]:
# Mask every logit below the k-th largest value with -inf.
new_logits = torch.where(
    condition=test_logits < top_logits[-1], # top_logits[-1] is the smallest of the top-k values
    input=torch.tensor(float("-inf")),
    other=test_logits
)

In [76]:
# The -inf entries get probability 0; only the top-k tokens share the probability mass.
torch.softmax(new_logits, dim=0)

tensor([0.0615, 0.0000, 0.0000, 0.5775, 0.0000, 0.0000, 0.0000, 0.3610, 0.0000])

MODIFYING TEXT GENERATION

In [48]:
def generate_text(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    """Autoregressively extend ``idx`` with optional top-k and temperature sampling.

    Args:
        model: Callable mapping a ``(batch, tokens)`` tensor of token IDs to
            logits; the logits of the last position are used.
        idx: ``(batch, tokens)`` tensor holding the prompt token IDs.
        max_new_tokens: Maximum number of tokens to append.
        context_size: Only the last ``context_size`` tokens are fed to the model.
        temperature: ``> 0`` samples from the temperature-scaled distribution;
            otherwise the highest-scoring token is chosen (greedy).
        top_k: If given, only the ``top_k`` highest logits of each row stay
            eligible. Values larger than the vocabulary are capped.
        eos_id: If given, generation stops as soon as every sequence in the
            batch produces this token; the token itself is not appended.

    Returns:
        ``idx`` with the generated token IDs appended along dimension 1.
    """
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]  # keep only the last time step: (batch_size, vocab_size)

        if top_k is not None:
            # Cap k so torch.topk does not fail when top_k exceeds the vocabulary.
            k = min(top_k, logits.size(-1))
            top_logits, _ = torch.topk(logits, k)
            # Keep the last axis -> (batch_size, 1) so each row is compared with
            # its own threshold; a (batch_size,) vector mis-broadcasts for batch > 1.
            min_val = top_logits[:, -1:]
            logits = torch.where(
                logits < min_val,
                torch.tensor(float("-inf")).to(logits.device),
                logits,
            )

        if temperature > 0.0:
            logits = logits / temperature

            # Numerical stability (helps on the mps device): subtract the
            # row-wise maximum before the softmax.
            logits = logits - logits.max(dim=-1, keepdim=True).values

            probs = torch.softmax(logits, dim=-1)  # (batch_size, vocab_size)
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)
        else:
            # Greedy decoding: pick the highest-scoring vocabulary entry.
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        # Reduce with .all() so the check is a single boolean for any batch
        # size, and skip it entirely when no eos_id was requested.
        if eos_id is not None and bool((idx_next == eos_id).all()):
            break

        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens + 1)

    return idx

In [83]:
# Seeded because generate_text samples with torch.multinomial when temperature > 0.
torch.manual_seed(123)

token_ids = generate_text(
    model=model,
    idx=text_to_token_ids("Every effort moves you", tokenizer),
    max_new_tokens=15,
    context_size=GPT_CONFIG_124M["context_length"],
    top_k=25,
    temperature=1.4
)
print(token_ids_to_text(token_ids, tokenizer))

Every effort moves you in," was to surprise, a little to have been the end of my


LOADING AND SAVING MODEL WEIGHTS

In [87]:
# Save only the model's state_dict to model.pth, not the module object itself.
torch.save(model.state_dict(), "model.pth")


In [91]:
# Load: rebuild the architecture, then restore the saved state_dict into it.
loaded_model = GPTModel(GPT_CONFIG_124M)

device = torch.device("cpu")
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while being loaded.
loaded_model.load_state_dict(torch.load("model.pth", map_location=device, weights_only=True))

<All keys matched successfully>

LOADING PRETRAINED WEIGHTS FROM OPENAI

In [60]:
from gpt_download import download_and_load_gpt2

In [1]:
# Fetch the OpenAI GPT-2 checkpoint; `params` is consumed by
# load_weights_into_gpt below. (`settings` was misspelled as `seetings`.)
settings, params = download_and_load_gpt2(
    model_size="124M",
    models_dir="data/models/gpt2"
)

NameError: name 'download_and_load_gpt2' is not defined

In [None]:
from llm_from_scratch import GPTModel

# Base configuration (same values as at the top of the notebook), repeated so
# this section can be run on its own.
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

# Architecture settings for each published GPT-2 size.
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Copy the base configuration and update with specific model settings.
# The model must have the same size as the checkpoint fetched by
# download_and_load_gpt2 (model_size="124M"); with a different size the shape
# check in assign() fails while the weights are being loaded.
model_name = "gpt2-small (124M)"
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_configs[model_name])
# Settings overridden for the pretrained weights: 1024-token context and
# bias terms in the query/key/value projections.
NEW_CONFIG.update({"context_length": 1024, "qkv_bias": True})

gpt = GPTModel(NEW_CONFIG)
gpt.eval();



In [43]:
def assign(left, right):
    """Build a new ``torch.nn.Parameter`` from ``right`` after a shape check.

    Args:
        left: Existing parameter (anything with ``.shape``); used only to
            validate the shape.
        right: Replacement values as an array-like (e.g. a NumPy array) or a
            tensor with the same shape as ``left``.

    Returns:
        A trainable ``torch.nn.Parameter`` holding a copy of ``right``.

    Raises:
        ValueError: If the two shapes differ; the message names both shapes.
    """
    if left.shape != right.shape:
        raise ValueError(
            f"Shape mismatch. Left: {tuple(left.shape)}, Right: {tuple(right.shape)}"
        )
    if isinstance(right, torch.Tensor):
        # torch.tensor(tensor) emits a copy-construct warning; detach().clone()
        # is the equivalent way to copy an existing tensor.
        values = right.detach().clone()
    else:
        values = torch.tensor(right)
    return torch.nn.Parameter(values)

In [44]:
import numpy as np

def load_weights_into_gpt(gpt, params):
    """Copy the arrays in ``params`` into the parameters of ``gpt``.

    Every target parameter is replaced via ``assign``, which raises if the
    shapes differ. Weight matrices of the linear layers are transposed before
    assignment (presumably because the checkpoint stores them in the opposite
    orientation to the model's linear layers - TODO confirm). The fused
    ``c_attn`` weight and bias are split into three equal parts along the last
    axis for query, key and value. The output head receives the token
    embedding matrix ``params["wte"]``.

    Args:
        gpt: Model exposing ``pos_emb``, ``tok_emb``, ``trf_blocks``,
            ``final_norm`` and ``out_head``.
        params: Dict with keys ``"wpe"``, ``"wte"``, ``"blocks"``, ``"g"`` and
            ``"b"``.
    """
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for b in range(len(params["blocks"])):
        src = params["blocks"][b]

        # Attention: split the fused QKV weight into its three projections.
        q_w, k_w, v_w = np.split(src["attn"]["c_attn"]["w"], 3, axis=-1)
        block = gpt.trf_blocks[b]
        att = block.att
        att.W_query.weight = assign(att.W_query.weight, q_w.T)
        att.W_key.weight = assign(att.W_key.weight, k_w.T)
        att.W_value.weight = assign(att.W_value.weight, v_w.T)

        # Same split for the fused QKV bias.
        q_b, k_b, v_b = np.split(src["attn"]["c_attn"]["b"], 3, axis=-1)
        att.W_query.bias = assign(att.W_query.bias, q_b)
        att.W_key.bias = assign(att.W_key.bias, k_b)
        att.W_value.bias = assign(att.W_value.bias, v_b)

        # Attention output projection.
        att.out_proj.weight = assign(att.out_proj.weight, src["attn"]["c_proj"]["w"].T)
        att.out_proj.bias = assign(att.out_proj.bias, src["attn"]["c_proj"]["b"])

        # Feed-forward: layers[0] <- c_fc, layers[2] <- c_proj.
        ff_layers = block.ff.layers
        ff_layers[0].weight = assign(ff_layers[0].weight, src["mlp"]["c_fc"]["w"].T)
        ff_layers[0].bias = assign(ff_layers[0].bias, src["mlp"]["c_fc"]["b"])
        ff_layers[2].weight = assign(ff_layers[2].weight, src["mlp"]["c_proj"]["w"].T)
        ff_layers[2].bias = assign(ff_layers[2].bias, src["mlp"]["c_proj"]["b"])

        # Layer norms: "g" -> scale, "b" -> shift.
        block.norm1.scale = assign(block.norm1.scale, src["ln_1"]["g"])
        block.norm1.shift = assign(block.norm1.shift, src["ln_1"]["b"])
        block.norm2.scale = assign(block.norm2.scale, src["ln_2"]["g"])
        block.norm2.shift = assign(block.norm2.shift, src["ln_2"]["b"])

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    # The output head is initialized from the token embedding matrix.
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])
    
    
# Copy the downloaded checkpoint into `gpt`; fails with "Shape mismatch" if
# the model size and the downloaded checkpoint size differ.
load_weights_into_gpt(gpt, params)
gpt.to("cpu");

In [59]:
# The seed is left commented out, so every run samples a different continuation;
# uncomment it to make the output reproducible.
# torch.manual_seed(11)
tokenizer = tiktoken.get_encoding("gpt2")


token_ids = generate_text(
    model=gpt,
    idx=text_to_token_ids("I would only", tokenizer).to("cpu"),
    max_new_tokens=25,
    context_size=NEW_CONFIG["context_length"],
    top_k=50,
    temperature=1.4
)
print(token_ids_to_text(token_ids, tokenizer))

I would only call for his presence to do the world and our country together. But my answer to a second term on the campaign trail will
