# Pretraining GPT-2 Medium

## Loading the Training and Validation Data Loaders

In [10]:
# Obtain the training and validation loaders through the project's helper
# functions; each helper is given the path of a `.dl` file under data/fineweb-3b/.
# NOTE(review): presumably these deserialise loaders that were built and saved
# ahead of time — confirm against scripts/preload_dataloaders.
from scripts.preload_dataloaders import load_train_dataloader, load_val_dataloader

train_loader = load_train_dataloader("data/fineweb-3b/train_loader.dl")
print("Loaded train_loader.")

val_loader = load_val_dataloader("data/fineweb-3b/val_loader.dl")
print("Loaded val_loader")

Loaded train_loader.
Loaded val_loader


In [7]:
# Peek at a single batch from the training loader. In the recorded output the
# batch is a list of two tensors, and the second tensor is the first one shifted
# left by one token (i.e. it looks like inputs paired with next-token targets).
batch = next(iter(train_loader))
batch

[tensor([[5613, 1139,   11,  ..., 4803, 6853,  508],
         [ 262, 1366, 4237,  ..., 1265,  262, 4639]]),
 tensor([[1139,   11,  475,  ..., 6853,  508, 1444],
         [1366, 4237,   11,  ...,  262, 4639,  290]])]

In [11]:
# Sanity check: find the largest token ID that appears in the training inputs so
# it can be compared with the model's vocabulary size (IDs are expected to be
# strictly smaller than `vocab_size`).
# NOTE(review): only the input tensors are scanned, not the targets — confirm
# that is sufficient for this dataset.
#
# A generator expression is used instead of a top-level loop so that no loop
# temporaries are left behind in the notebook namespace. In particular the
# top-level name `_` is not rebound: IPython uses `_` for the previous cell
# output, and rebinding it would also keep the last targets tensor alive.
max_token = max(
    (input_batch.max().item() for input_batch, _ in train_loader),
    default=float('-inf'),  # same sentinel as before if the loader yields nothing
)
print(f"Maximum token ID: {max_token}")

Maximum token ID: 50256


In [12]:
from scripts.gpt2_model import GPTModel

# Hyperparameters for the 355M-parameter model (GPT-2 Medium) that is passed to
# GPTModel. Key order matches the original literal.
GPT_CONFIG_355M = dict(
    vocab_size=50257,     # vocabulary size
    context_length=1024,  # context length
    emb_dim=1024,         # embedding dimension (larger than the 124M config)
    n_heads=16,           # attention heads (larger than the 124M config)
    n_layers=24,          # number of layers (larger than the 124M config)
    drop_rate=0.0,        # dropout rate
    qkv_bias=False,       # query-key-value bias
)

# Build the model from the 355M configuration defined above.
model = GPTModel(GPT_CONFIG_355M)

In [13]:
import torch
from scripts.train import calc_loss_loader

torch.manual_seed(123)

# Baseline loss of the model before any training has been run in this notebook.
# This is evaluation only: no backward pass follows, so gradient tracking is
# switched off to avoid building autograd graphs and holding activations.
# NOTE(review): as called here the helper is handed the complete loaders, and the
# recorded run of this cell was interrupted by hand after "Processing batch: 0".
# If calc_loss_loader offers a way to cap the number of batches, use it here —
# confirm against scripts/train.
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model)
    val_loss = calc_loss_loader(val_loader, model)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Processing batch: 0


KeyboardInterrupt: 

Now it is time to train our 355M model. Here we go!

In [None]:
from scripts.perf_timer import PerfTimer

# NOTE(review): `train_model_simple` and `tokenizer` are used below but are not
# imported or defined in any cell visible in this notebook. Unless they are
# provided somewhere outside the visible cells, this cell fails with NameError
# on a fresh kernel (Restart & Run All) — confirm and add the missing
# import/definition.

# Re-seed and build a new model instance so training does not start from the
# instance used in the earlier cells.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_355M)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

# We have lots of data, so we can just train for a single epoch.
num_epochs = 1

timer = PerfTimer()

# The timer brackets the entire train_model_simple call, so the reported time
# covers everything that function does.
# NOTE(review): presumably eval_freq/eval_iter control how often and on how many
# batches evaluation runs, and start_context is a prompt for sample generation —
# verify against the train_model_simple implementation.
timer.start()
train_losses, val_losses = train_model_simple(
    model, train_loader, val_loader, optimizer,
    num_epochs=num_epochs, eval_freq=50, eval_iter=50, # eval less frequently
    start_context="Every effort moves you", tokenizer=tokenizer
)
timer.stop()

print(f"Took this long to train: {timer.elapsed_ms()} ms")


## Save the model 

In [None]:
from pathlib import Path

# Persist only the model's state_dict (parameters/buffers), not the whole object.
CHECKPOINT_PATH = "models/gpt2-355M-model.pth"

# torch.save does not create missing parent directories; make sure "models/"
# exists first so a completed training run is not lost to a failed save.
Path(CHECKPOINT_PATH).parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), CHECKPOINT_PATH)

## Reload the model 

In [None]:
import torch
from scripts.gpt2_model import GPTModel

# Rebuild the architecture from the same configuration, then restore the saved
# weights into it.
model = GPTModel(GPT_CONFIG_355M)

# Put the model in evaluation mode for the inference that follows, so layers
# such as dropout behave deterministically even if drop_rate is later made
# non-zero. Called before load_state_dict so that the cell's displayed output
# remains the result of load_state_dict (eval() would otherwise print the
# whole module repr).
model.eval()

# map_location="cpu" lets a checkpoint written from a GPU run be read on a
# machine without that device; load_state_dict copies the values into the
# model's existing parameters. weights_only=True restricts unpickling to tensor
# data rather than arbitrary objects.
model.load_state_dict(
  torch.load("models/gpt2-355M-model.pth", map_location="cpu", weights_only=True)
)

## Testing with Inference

In [None]:
from scripts.perf_timer import PerfTimer
from scripts.generate import generate_text_simple

# NOTE(review): `text_to_token_ids`, `token_ids_to_text` and `tokenizer` are used
# below but are not imported or defined in any cell visible in this notebook.
# Unless they are provided somewhere outside the visible cells, this cell fails
# with NameError on a fresh kernel — confirm and add the missing
# imports/definition.

perf_timer = PerfTimer()

# Time a single generation call: encode the prompt, ask for 50 new tokens, and
# pass the model's configured context length as the context size.
perf_timer.start()
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids("Every effort moves you", tokenizer),
    max_new_tokens=50,
    context_size=GPT_CONFIG_355M["context_length"]
)
perf_timer.stop()

# Report the elapsed time and the decoded continuation.
print("Generated tokens in", perf_timer.elapsed_ms(), "ms")
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))