In [8]:
# download instruct-dataset

import json
import os
import urllib

def download_and_load_file(file_path, url):
    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding = "utf-8") as file:
            file.write(text_data)
    else:
        with open(file_path, "r", encoding = "utf-8") as file:
            text_data = file.read()
    with open(file_path, "r") as file:
        data = json.load(file)

    return data

file_path = "instruction/instruction-data.json"
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)

data = download_and_load_file(file_path, url)
print("Number of entries:", len(data))


Number of entries: 1100


In [9]:
print("Example entry:\n", data[50])

Example entry:
 {'instruction': 'Identify the correct spelling of the following word.', 'input': 'Ocassion', 'output': "The correct spelling is 'Occasion.'"}


In [5]:
# implementing the prompt formatting function
def format_input(entry):
    instruction_text = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriatelt completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )
    # skips the optional Input section if it is empty
    input_text = (
        f"\n\n### Input:\n{entry['input']}" if entry["input"] else ""
    )
    return instruction_text + input_text

In [10]:
# check format
model_input = format_input(data[999])
desired_response = f"\n\n### Response: \n{data[50]['output']}"
print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriatelt completes the request.

### Instruction:
What is an antonym of 'complicated'?

### Response: 
The correct spelling is 'Occasion.'


In [11]:
# train-test-val split
train_portion = int(len(data) * 0.85)
test_portion = int(len(data) * 0.1)
val_portion = len(data) - train_portion - test_portion

train_data = data[:train_portion]
test_data = data[train_portion:train_portion + test_portion]
val_data = data[train_portion + test_portion:]

print("Training set length:", len(train_data))
print("Validation set length:", len(val_data))
print("Test set length:", len(test_data))

Training set length: 935
Validation set length: 55
Test set length: 110


In [12]:
import torch
from torch.utils.data import Dataset

class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.encoded_texts = []
        for entry in data:
            instruction_plus_input = format_input(entry)
            response_text = f"\n\n### Response:\n{entry['output']}"
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(
                tokenizer.encode(full_text)
            )

    def __getitem__(self, index):
        return self.encoded_texts[index]
    
    def __len__(self):
        return len(self.data)

In [13]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


In [14]:
# implement the padding process with a custom collate function
'''
This custom collate function pads the training examples in each batch to the same length while allowing different batches to have different lengths. This approach minimizes unnecessary padding by only extending
sequences to match the longest one in each batch, not the whole dataset.
'''
def custom_collate_draft_1(
        batch,
        pad_token_id = 50256,
        device = "cpu"
):
    batch_max_length = max(len(item)+1 for item in batch)
    inputs_lst = []
    for item in batch:
        new_item = item.copy()

        new_item += [pad_token_id]
        # print(new_item)
        padded = (
            new_item + [pad_token_id] * (batch_max_length - len(new_item))
        )
        # print(padded)
        inputs = torch.tensor(padded[:-1])
        inputs_lst.append(inputs)

    inputs_tensor = torch.stack(inputs_lst).to(device)
    return inputs_tensor

In [15]:
# example of padding with customized collate function
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]
batch = (
    inputs_1,
    inputs_2,
    inputs_3)
print(custom_collate_draft_1(batch))

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])


In [16]:
def custom_collate_draft_2(
        batch,
        pad_token_id = 50256,
        device = "cpu"
):
    batch_max_length = max(len(item)+1 for item in batch)
    inputs_lst, targets_lst = [], []
    
    for item in batch:
        new_item = item.copy()
        new_item += [pad_token_id]

        padded = (
            new_item + [pad_token_id] * (batch_max_length - len(new_item))
        )
        inputs = torch.tensor(padded[:-1])
        targets = torch.tensor(padded[1:])
        inputs_lst.append(inputs)
        targets_lst.append(targets)

    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)
    return inputs_tensor, targets_tensor

inputs, targets = custom_collate_draft_2(batch)
print("inputs:\n", inputs)
print("targets:\n", targets)

inputs:
 tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
targets:
 tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256, 50256, 50256, 50256],
        [    8,     9, 50256, 50256, 50256]])


### ***what is special about token ID `-100` ?***
The default setting of the cross entropy function in PyTorch is `cross_entropy(..., ignore_index=-100)`. 

**This means that it ignores targets labeled with `-100`**

We take advantage of this ignore_index to ignore the additional end-of-text (padding) tokens that we used to pad the training examples to have the same length in each batch.

In [17]:

def custom_collate_fn(
        batch,
        pad_token_id = 50256,
        ignore_index = -100,
        allowed_max_length = None,
        device = "cpu"
):
    batch_max_length = max(len(item)+1 for item in batch)
    inputs_lst, targets_lst = [], []
    
    for item in batch:
        new_item = item.copy()
        new_item += [pad_token_id]

        padded = (
            new_item + [pad_token_id] * (batch_max_length - len(new_item))
        )
        inputs = torch.tensor(padded[:-1])
        targets = torch.tensor(padded[1:])

        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)
    return inputs_tensor, targets_tensor

In [18]:
# test
inputs, targets = custom_collate_fn(batch)
print(inputs)
print(targets)

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    8,     9, 50256,  -100,  -100]])


### ***cross entropy loss should only be computed for the generated response target IDs***

Thus, **it is common that** the model
is trained to focus on generating accurate responses rather than memorizing instruc-
tions, which can help reduce overfitting.

![](mdfig\2025-04-09-16-55-45.png)

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.backends.mps.is_available():
    device = torch.device("mps")
print("Device:", device)

Device: cpu


In [20]:
from functools import partial

customized_collate_fn = partial(
    custom_collate_fn,
    device = device,
    allowed_max_length = 1024
)

In [52]:
# initialize data loader
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8

torch.manual_seed(123)

train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers
)

val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)


print("Train loader:")
for inputs, targets in train_loader:
    print(inputs.shape, targets.shape)
    # print(tokenizer.decode(inputs[0].tolist()))
    # print('-'*50)
    # print(tokenizer.decode(targets[0].tolist(),))

Train loader:
torch.Size([8, 63]) torch.Size([8, 63])
torch.Size([8, 78]) torch.Size([8, 78])
torch.Size([8, 75]) torch.Size([8, 75])
torch.Size([8, 70]) torch.Size([8, 70])
torch.Size([8, 67]) torch.Size([8, 67])
torch.Size([8, 74]) torch.Size([8, 74])
torch.Size([8, 82]) torch.Size([8, 82])
torch.Size([8, 69]) torch.Size([8, 69])
torch.Size([8, 64]) torch.Size([8, 64])
torch.Size([8, 77]) torch.Size([8, 77])
torch.Size([8, 64]) torch.Size([8, 64])
torch.Size([8, 70]) torch.Size([8, 70])
torch.Size([8, 69]) torch.Size([8, 69])
torch.Size([8, 79]) torch.Size([8, 79])
torch.Size([8, 71]) torch.Size([8, 71])
torch.Size([8, 81]) torch.Size([8, 81])
torch.Size([8, 73]) torch.Size([8, 73])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 85]) torch.Size([8, 85])
torch.Size([8, 70]) torch.Size([8, 70])
torch.Size([8, 82]) torch.Size([8, 82])
torch.Size([8, 73]) torch.Size([8, 73])
torch.Size([8, 71]) torch.Size([8, 71])
torch.Size([8, 67]) torch.Size([8, 67])
torch.Size([8, 70]) torch.

### Fine-Tune

In [32]:
# load thee pretained llm
from gpt_download import download_and_load_gpt2
from BuildingBlocks import GPTModel, load_weights_into_gpt
BASE_CONFIG = {
    "vocab_size": 50257, # Vocabulary size
    "context_length": 1024, # Context length
    "drop_rate": 0.0, # Dropout rate
    "qkv_bias": True # Query-key-value bias
}
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

CHOOSE_MODEL = "gpt2-small (124M)"#"gpt2-medium (355M)"
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")

settings, params = download_and_load_gpt2(
    model_size=model_size,
    models_dir="gpt2"
)


File already exists and is up-to-date: gpt2\124M\checkpoint
File already exists and is up-to-date: gpt2\124M\encoder.json
File already exists and is up-to-date: gpt2\124M\hparams.json
File already exists and is up-to-date: gpt2\124M\model.ckpt.data-00000-of-00001
Primary URL (https://openaipublic.blob.core.windows.net/gpt-2/models\124M\model.ckpt.index) failed. Attempting backup URL: https://f001.backblazeb2.com/file/LLMs-from-scratch/gpt2\124M\model.ckpt.index
Failed to download from both primary URL (https://openaipublic.blob.core.windows.net/gpt-2/models\124M\model.ckpt.index) and backup URL (https://f001.backblazeb2.com/file/LLMs-from-scratch/gpt2\124M\model.ckpt.index).
Check your internet connection or the file availability.
For help, visit: https://github.com/rasbt/LLMs-from-scratch/discussions/273
File already exists and is up-to-date: gpt2\124M\model.ckpt.meta
File already exists and is up-to-date: gpt2\124M\vocab.bpe


In [33]:
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [43]:
import torch
torch.manual_seed(123)
input_text = format_input(val_data[0])
print(input_text)

Below is an instruction that describes a task. Write a response that appropriatelt completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'


In [48]:
tmp = torch.tensor(train_dataset.encoded_texts[0])
tokenizer.decode(tmp.tolist())

'Below is an instruction that describes a task. Write a response that appropriatelt completes the request.\n\n### Instruction:\nEvaluate the following phrase by transforming it into the spelling given.\n\n### Input:\nfreind --> friend\n\n### Response: \nThe spelling of the given phrase "freind" is incorrect, the correct spelling is "friend".'

In [53]:
from BuildingBlocks import generate, text_to_token_ids, token_ids_to_text

token_ids = generate(
    model=model,
    idx=text_to_token_ids(input_text, tokenizer),
    max_new_tokens=35,
    context_size=BASE_CONFIG["context_length"],
    eos_id=50256,
)
generated_text = token_ids_to_text(token_ids, tokenizer)

In [54]:

response_text = generated_text[len(input_text):].strip()
print(f"full: {generated_text}")
print(f"response: {response_text}")

full: Below is an instruction that describes a task. Write a response that appropriatelt completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response: 
The chef cooks the meal every day.
response: ### Response: 
The chef cooks the meal every day.


In [55]:
import importlib
import BuildingBlocks
importlib.reload(BuildingBlocks)
from BuildingBlocks import calc_loss_loader, train_model_simple

model.to(device)
torch.manual_seed(123)

with torch.no_grad():
    train_loss = calc_loss_loader(
        train_loader, model, device, num_batches=5
    )
    val_loss = calc_loss_loader(
        val_loader, model, device, num_batches=5
    )

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 0.4315296471118927
Validation loss: 0.7182615876197815


In [38]:
# training loop

import time

start_time = time.time()
torch.manual_seed(123)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)

num_epochs = 2
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context=format_input(val_data[0]), tokenizer=tokenizer
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 3.719, Val loss 3.630
Ep 1 (Step 000005): Train loss 2.085, Val loss 1.996
Ep 1 (Step 000010): Train loss 1.147, Val loss 1.224
Ep 1 (Step 000015): Train loss 1.054, Val loss 1.084
Ep 1 (Step 000020): Train loss 0.969, Val loss 1.036
Ep 1 (Step 000025): Train loss 0.912, Val loss 0.992
Ep 1 (Step 000030): Train loss 0.944, Val loss 0.960
Ep 1 (Step 000035): Train loss 0.851, Val loss 0.924
Ep 1 (Step 000040): Train loss 0.824, Val loss 0.911
Ep 1 (Step 000045): Train loss 0.752, Val loss 0.899
Ep 1 (Step 000050): Train loss 0.848, Val loss 0.890
Ep 1 (Step 000055): Train loss 0.904, Val loss 0.873
Ep 1 (Step 000060): Train loss 0.853, Val loss 0.858
Ep 1 (Step 000065): Train loss 0.775, Val loss 0.848
Ep 1 (Step 000070): Train loss 0.672, Val loss 0.835
Ep 1 (Step 000075): Train loss 0.687, Val loss 0.830
Ep 1 (Step 000080): Train loss 0.730, Val loss 0.821
Ep 1 (Step 000085): Train loss 0.666, Val loss 0.809
Ep 1 (Step 000090): Train loss 0.714, Val loss

In [27]:
# save model together with optimizer
# 1.81 GB
# torch.save({
#     "model_instruct_state_dict": model.state_dict(),
#     "optimizer_instruct_state_dict": optimizer.state_dict(),
#     },
#     "instruction/model_and_optimizer_instruct.pth")

checkpoint = torch.load("instruction/model_and_optimizer_instruct.pth", map_location = device)

optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)
optimizer.load_state_dict(checkpoint["optimizer_instruct_state_dict"])

In [None]:
import importlib
import BuildingBlocks
importlib.reload(BuildingBlocks)

from BuildingBlocks import plot_losses

# epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
# plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)

In [57]:
torch.manual_seed(123)

for entry in test_data[:3]:
    input_text = format_input(entry)
    token_ids = generate(
        model = model,
        idx = text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size = BASE_CONFIG["context_length"],
        eos_id=50256
    )
    generated_text = token_ids_to_text(token_ids, tokenizer)
    response_text = (
        generated_text[len(input_text):].replace("### Response:", "").strip()
    )
    print(input_text)
    print(f"\nCorrect response:\n>> {entry['output']}")
    print(f"\nModel response:\n>> {response_text.strip()}")
    print("-------------------------------------")

Below is an instruction that describes a task. Write a response that appropriatelt completes the request.

### Instruction:
Rewrite the sentence using a simile.

### Input:
The car is very fast.

Correct response:
>> The car is as fast as lightning.

Model response:
>> The car is as fast as a horse.
-------------------------------------
Below is an instruction that describes a task. Write a response that appropriatelt completes the request.

### Instruction:
What type of cloud is typically associated with thunderstorms?

Correct response:
>> The type of cloud typically associated with thunderstorms is cumulonimbus.

Model response:
>> The type of cloud typically associated with thunderstorms is a tropical storm.
-------------------------------------
Below is an instruction that describes a task. Write a response that appropriatelt completes the request.

### Instruction:
Name the author of 'Pride and Prejudice'.

Correct response:
>> Jane Austen.

Model response:
>> The author of 'Prid

In [59]:
for entry in test_data[50:55]:
    input_text = format_input(entry)
    token_ids = generate(
        model = model,
        idx = text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size = BASE_CONFIG["context_length"],
        eos_id=50256
    )

    generated_text = token_ids_to_text(token_ids, tokenizer)
    response_text = (
        generated_text[len(input_text):].replace("### Response:", "").strip()
    )
    print(input_text)
    print("---- input len:", len(input_text))
    print("---- output len:", len(generated_text))
    print(f"\nCorrect response:\n>> {entry['output']}")
    print(f"\nModel response:\n>> {response_text.strip()}")
    print("-------------------------------------")

Below is an instruction that describes a task. Write a response that appropriatelt completes the request.

### Instruction:
Edit the given text to ensure all plural nouns are spelled correctly.

### Input:
The birds sings beautiful songs.
---- input len: 238
---- output len: 287

Correct response:
>> The birds sing beautiful songs.

Model response:
>> The birds sings beautiful songs.
-------------------------------------
Below is an instruction that describes a task. Write a response that appropriatelt completes the request.

### Instruction:
Transform the following sentence into a question using "could."

### Input:
You can help me tomorrow.
---- input len: 225
---- output len: 269

Correct response:
>> Could you help me tomorrow?

Model response:
>> Could you help me tomorrow?
-------------------------------------
Below is an instruction that describes a task. Write a response that appropriatelt completes the request.

### Instruction:
Classify the following items: bicycle, rose, tig