In [1]:
import os
import urllib.request

import tiktoken
import torch
import torch.nn

from dataloader import create_dataloader_v1
from gpt2 import GPTModel
from utils import generate_text_simple, text_to_token_ids, token_ids_to_text, calc_loss_loader, train_model_simple

In [2]:
# Hyperparameters handed to GPTModel for every model built in this notebook.
GPT_CONFIG_124M = dict(
    vocab_size=50257,  # number of entries in the token embedding table
    ctx_len=256,       # context length, shortened from the original 1024
    d_model=768,       # embedding dimension
    d_ff=4 * 768,      # feed-forward width (four times d_model)
    n_heads=12,        # attention heads
    n_layers=12,       # transformer layers
    qkv_bias=False,    # no bias on the query/key/value projections
)

# Seed first so the randomly initialised weights are repeatable.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# Switch to evaluation mode; the returned module is echoed as the cell output.
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (mha): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_features=768, bias=False)
        (W_k): Linear(in_features=768, out_features=768, bias=False)
        (W_v): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (ff): FeedForward(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
    )
    (1): TransformerBlock(
      (mha): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_features=768, bias=False)
        (W_k): Linear(in_features=768, out_features=768, bias=False)
        (W_v): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=7

In [3]:
# Encode a short prompt and let the model (not trained at this point) extend it
# by ten tokens.
tokenizer = tiktoken.get_encoding("gpt2")
start_context = "Every effort moves you"

token_ids = generate_text_simple(
    model,
    text_to_token_ids(start_context, tokenizer),
    ctx_len=GPT_CONFIG_124M["ctx_len"],
    max_new_tokens=10,
)

In [4]:
# Turn the generated token IDs back into a string; as the cell's last
# expression, the result is shown as the cell output.
token_ids_to_text(token_ids, tokenizer)

'Every effort moves you rentingetic chief refusing holidays Shannon GamergateHay men methamphetamine'

In [5]:
# Two toy examples, three token IDs each.
inputs = torch.tensor(
    [
        [16833, 3626, 6100],  # "every effort moves"
        [40, 1107, 588],      # "I really like"
    ]
)

# Each target row is the matching input row shifted one token to the left,
# with the next token appended.
targets = torch.tensor(
    [
        [3626, 6100, 345],    # " effort moves you"
        [1107, 588, 11311],   # " really like chocolate"
    ]
)

In [6]:
# Forward pass only: gradient tracking is switched off for this call.
with torch.no_grad():
    logits = model(inputs)

# Normalise the scores along the last axis so each position holds a
# probability distribution.
probas = logits.softmax(dim=-1)
probas.shape  # (batch_size, seq_len, vocab_size)

torch.Size([2, 3, 50257])

In [7]:
# Most probable token ID at every position; keepdim=True keeps a trailing
# axis of size 1. Note that this rebinds `token_ids`.
token_ids = torch.argmax(probas, dim=-1, keepdim=True)
token_ids

tensor([[[50153],
         [13866],
         [42826]],

        [[49906],
         [29669],
         [41751]]])

In [8]:
# Show the target text for the first example next to the text decoded from the
# model's argmax predictions; flatten() collapses the predictions to 1-D first.
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

Targets batch 1:  effort moves you
Outputs batch 1:  PRESIDENTbageNetflix


In [9]:
# logits: (batch_size, seq_len, vocab_size)
# targets: (batch_size, seq_len)

# Merge the batch and sequence dimensions before computing the loss:
# logits -> (batch_size * seq_len, vocab_size)
# targets -> (batch_size * seq_len,)

# loss = torch.nn.functional.cross_entropy(logits_flat, targets_flat)
## cross_entropy applies the softmax and the log-probability computation internally
# perplexity = torch.exp(loss)

In [10]:
# Load the training text, downloading it once and reusing the local copy on
# later runs. `os` and `urllib.request` are imported in the notebook's import
# cell rather than here, so all dependencies are declared in one place.
file_path = "the-verdict.txt"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

if os.path.exists(file_path):
    # Cached copy available: no network access needed.
    with open(file_path, "r", encoding="utf-8") as file:
        text_data = file.read()
else:
    # The timeout makes a stalled connection raise instead of blocking the
    # kernel indefinitely.
    with urllib.request.urlopen(url, timeout=30) as response:
        text_data = response.read().decode('utf-8')
    # Persist the download so subsequent runs take the cached branch.
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(text_data)

In [11]:
# Preview the first 99 characters of the loaded text.
print(text_data[:99])

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [12]:
# Character-level split: the first 90% of the text is used for training,
# the remainder for validation.
train_ratio = 0.90
split_idx = int(len(text_data) * train_ratio)
train_data, val_data = text_data[:split_idx], text_data[split_idx:]

# Seed before the loaders are built.
torch.manual_seed(123)

# Both loaders use a stride equal to the context length.
train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["ctx_len"],
    stride=GPT_CONFIG_124M["ctx_len"],
    shuffle=True,
    drop_last=True,
    num_workers=0,
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["ctx_len"],
    stride=GPT_CONFIG_124M["ctx_len"],
    shuffle=False,
    drop_last=False,
    num_workers=0,
)

In [13]:
# Sanity check: print the (input, target) batch shapes produced by each loader.
for header, loader in (
    ("Train loader:", train_loader),
    ("\nValidation loader:", val_loader),
):
    print(header)
    for x, y in loader:
        print(x.shape, y.shape)

Train loader:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])

Validation loader:
torch.Size([2, 256]) torch.Size([2, 256])


In [14]:
# Loss of the model on both splits before any training has been done.
device = "cpu"

# Evaluation only, so gradient tracking is disabled; the train split is
# evaluated first, then the validation split.
with torch.no_grad():
    train_loss, val_loss = [
        calc_loss_loader(split_loader, model, device)
        for split_loader in (train_loader, val_loader)
    ]

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 10.989978472391764
Validation loss: 10.986329078674316


In [15]:
# Re-seed before building the model so the initial weights (and anything else
# that draws from torch's global RNG during training) do not depend on how much
# random state earlier cells consumed. Without this, re-running the cell on its
# own starts from different weights each time.
torch.manual_seed(123)

# Start from a fresh model rather than the one evaluated above.
model = GPTModel(GPT_CONFIG_124M)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)
num_epochs = 10
start_context = "Every effort moves you"

# Positional arguments are kept in their original order; only the layout of
# the call has changed.
train_losses, val_losses, tokens_seen = train_model_simple(
    model,
    train_loader,
    val_loader,
    optimizer,
    device,
    num_epochs,
    eval_freq=5,
    eval_iter=5,
    start_context=start_context,
    tokenizer=tokenizer,
)


Ep 1 (Step 000000): Train loss 9.798, Val loss 10.001
Ep 1 (Step 000005): Train loss 8.192, Val loss 8.323
Every effort moves you,,,,,,,,,,,,,,,,,,,,,,,,,,,,, the,,,,,,,,,,, the,, the,,,,,
Ep 2 (Step 000010): Train loss 6.610, Val loss 7.004
Ep 2 (Step 000015): Train loss 5.953, Val loss 6.548
Every effort moves you, and, and, and." to the, and, and, and, and, and, and, and, and, and the, and the.".", and the, and the, and, and,
Ep 3 (Step 000020): Train loss 5.343, Val loss 6.499
Ep 3 (Step 000025): Train loss 4.900, Val loss 6.298
Every effort moves you?"
Ep 4 (Step 000030): Train loss 4.647, Val loss 6.486
Ep 4 (Step 000035): Train loss 4.245, Val loss 6.238
Every effort moves you?""I a little of his the to the donkey."--as of his eyes, with the donkey." to the donkey, in the donkey to me."I of the donkey."I."
Ep 5 (Step 000040): Train loss 3.357, Val loss 6.164
Every effort moves you know and was not that my hostess was "interesting": on the fact of the picture--and by me!""Oh, I h