# LLMFS Load gpt2 weights into pytorch model
Extracts from LLMFS to load weights into pytorch implementation of GPT2. Test different layers with simple values to compare with Tensorflow model.

In [2]:
!pip -q install tiktoken

[0m

In [3]:
from importlib.metadata import version

pkgs = ["matplotlib", 
        "numpy", 
        "tiktoken", 
        "torch",
        "tensorflow" # For OpenAI's pretrained weights
       ]
for p in pkgs:
    print(f"{p} version: {version(p)}")

matplotlib version: 3.7.3
numpy version: 1.26.3
tiktoken version: 0.12.0
torch version: 2.1.1+cu121
tensorflow version: 2.15.0


### 5.1.1 Using GPT to generate text

In [4]:
import torch
from previous_chapters import GPTModel
# If the `previous_chapters.py` file is not available locally,
# you can import it from the `llms-from-scratch` PyPI package.
# For details, see: https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg
# E.g.,
# from llms_from_scratch.ch04 import GPTModel

GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval();  # Disable dropout during inference

In [5]:
import tiktoken
from previous_chapters import generate_text_simple

# Alternatively:
# from llms_from_scratch.ch04 import generate_text_simple

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())

start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you rentingetic wasnم refres RexMeCHicular stren


### 5.3.3 Modifying the text generation function

In [59]:
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):

    # For-loop is the same as before: Get logits, and only focus on last time step
    for i in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        print("i=", i, "idx_cond=", idx_cond)
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]
        # print("  logits=", logits.shape)

        # New: Filter logits with top_k sampling
        if top_k is not None:
            # Keep only top_k values
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)

        # New: Apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature

            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)

        # Otherwise same as before: get idx of the vocab entry with the highest logits value
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        if idx_next == eos_id:  # Stop generating early if end-of-sequence token is encountered and eos_id is specified
            break
        # print("  idx_next=", idx_next)
        # Same as before: append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx

In [60]:
torch.manual_seed(123)

token_ids = generate(
    model=model,
    idx=text_to_token_ids("Every effort moves you", tokenizer),
    max_new_tokens=15,
    context_size=GPT_CONFIG_124M["context_length"],
    top_k=25,
    temperature=1.4
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

i= 0 idx_cond= tensor([[6109, 3626, 6100,  345]])
i= 1 idx_cond= tensor([[ 6109,  3626,  6100,   345, 41240]])
i= 2 idx_cond= tensor([[ 6109,  3626,  6100,   345, 41240, 11686]])
i= 3 idx_cond= tensor([[ 6109,  3626,  6100,   345, 41240, 11686, 28611]])
i= 4 idx_cond= tensor([[ 6109,  3626,  6100,   345, 41240, 11686, 28611, 38876]])
i= 5 idx_cond= tensor([[ 6109,  3626,  6100,   345, 41240, 11686, 28611, 38876, 48215]])
i= 6 idx_cond= tensor([[ 6109,  3626,  6100,   345, 41240, 11686, 28611, 38876, 48215, 21196]])
i= 7 idx_cond= tensor([[ 6109,  3626,  6100,   345, 41240, 11686, 28611, 38876, 48215, 21196,
           994]])
i= 8 idx_cond= tensor([[ 6109,  3626,  6100,   345, 41240, 11686, 28611, 38876, 48215, 21196,
           994, 19513]])
i= 9 idx_cond= tensor([[ 6109,  3626,  6100,   345, 41240, 11686, 28611, 38876, 48215, 21196,
           994, 19513, 31728]])
i= 10 idx_cond= tensor([[ 6109,  3626,  6100,   345, 41240, 11686, 28611, 38876, 48215, 21196,
           994, 19513, 3172

## 5.5 Loading pretrained weights from OpenAI

In [8]:
print("TensorFlow version:", version("tensorflow"))
print("tqdm version:", version("tqdm"))

TensorFlow version: 2.15.0
tqdm version: 4.66.1


In [9]:
from gpt_download import download_and_load_gpt2

2025-10-26 14:28:46.039379: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-10-26 14:28:46.039466: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-26 14:28:46.041159: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-10-26 14:28:46.051397: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")

File already exists and is up-to-date: gpt2/124M/checkpoint
File already exists and is up-to-date: gpt2/124M/encoder.json
File already exists and is up-to-date: gpt2/124M/hparams.json
File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/124M/model.ckpt.index
File already exists and is up-to-date: gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: gpt2/124M/vocab.bpe


In [11]:
print("Settings:", settings)

Settings: {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}


In [12]:
print("Parameter dictionary keys:", params.keys())

Parameter dictionary keys: dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])


In [13]:
print(params["wte"])
print("Token embedding weight tensor dimensions:", params["wte"].shape)

[[-0.11010301 -0.03926672  0.03310751 ... -0.1363697   0.01506208
   0.04531523]
 [ 0.04034033 -0.04861503  0.04624869 ...  0.08605453  0.00253983
   0.04318958]
 [-0.12746179  0.04793796  0.18410145 ...  0.08991534 -0.12972379
  -0.08785918]
 ...
 [-0.04453601 -0.05483596  0.01225674 ...  0.10435229  0.09783269
  -0.06952604]
 [ 0.1860082   0.01665728  0.04611587 ... -0.09625227  0.07847701
  -0.02245961]
 [ 0.05135201 -0.02768905  0.0499369  ...  0.00704835  0.15519823
   0.12067825]]
Token embedding weight tensor dimensions: (50257, 768)


In [14]:
# Define model configurations in a dictionary for compactness
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Copy the base configuration and update with specific model settings
model_name = "gpt2-small (124M)"  # Example model name
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_configs[model_name])
NEW_CONFIG.update({"context_length": 1024, "qkv_bias": True})

gpt = GPTModel(NEW_CONFIG)
gpt.eval();

In [15]:
def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))

In [16]:
import numpy as np

def load_weights_into_gpt(gpt, params):
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])
    
    for b in range(len(params["blocks"])):
        q_w, k_w, v_w = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.weight = assign(
            gpt.trf_blocks[b].att.W_query.weight, q_w.T)
        gpt.trf_blocks[b].att.W_key.weight = assign(
            gpt.trf_blocks[b].att.W_key.weight, k_w.T)
        gpt.trf_blocks[b].att.W_value.weight = assign(
            gpt.trf_blocks[b].att.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.bias = assign(
            gpt.trf_blocks[b].att.W_query.bias, q_b)
        gpt.trf_blocks[b].att.W_key.bias = assign(
            gpt.trf_blocks[b].att.W_key.bias, k_b)
        gpt.trf_blocks[b].att.W_value.bias = assign(
            gpt.trf_blocks[b].att.W_value.bias, v_b)

        gpt.trf_blocks[b].att.out_proj.weight = assign(
            gpt.trf_blocks[b].att.out_proj.weight, 
            params["blocks"][b]["attn"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].att.out_proj.bias = assign(
            gpt.trf_blocks[b].att.out_proj.bias, 
            params["blocks"][b]["attn"]["c_proj"]["b"])

        gpt.trf_blocks[b].ff.layers[0].weight = assign(
            gpt.trf_blocks[b].ff.layers[0].weight, 
            params["blocks"][b]["mlp"]["c_fc"]["w"].T)
        gpt.trf_blocks[b].ff.layers[0].bias = assign(
            gpt.trf_blocks[b].ff.layers[0].bias, 
            params["blocks"][b]["mlp"]["c_fc"]["b"])
        gpt.trf_blocks[b].ff.layers[2].weight = assign(
            gpt.trf_blocks[b].ff.layers[2].weight, 
            params["blocks"][b]["mlp"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].ff.layers[2].bias = assign(
            gpt.trf_blocks[b].ff.layers[2].bias, 
            params["blocks"][b]["mlp"]["c_proj"]["b"])

        gpt.trf_blocks[b].norm1.scale = assign(
            gpt.trf_blocks[b].norm1.scale, 
            params["blocks"][b]["ln_1"]["g"])
        gpt.trf_blocks[b].norm1.shift = assign(
            gpt.trf_blocks[b].norm1.shift, 
            params["blocks"][b]["ln_1"]["b"])
        gpt.trf_blocks[b].norm2.scale = assign(
            gpt.trf_blocks[b].norm2.scale, 
            params["blocks"][b]["ln_2"]["g"])
        gpt.trf_blocks[b].norm2.shift = assign(
            gpt.trf_blocks[b].norm2.shift, 
            params["blocks"][b]["ln_2"]["b"])

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])
    

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
load_weights_into_gpt(gpt, params)
gpt.to(device);

In [18]:
torch.manual_seed(123)

token_ids = generate(
    model=gpt,
    idx=text_to_token_ids("Every effort moves you", tokenizer).to(device),
    max_new_tokens=25,
    context_size=NEW_CONFIG["context_length"],
    top_k=50,
    temperature=1.5
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))


Output text:
 Every effort moves you as far as the hand can go until the end of your turn unless something happens

This would remove you from a battle


In [19]:
idx=text_to_token_ids("Every effort moves you", tokenizer)
idx.shape

torch.Size([1, 4])

In [20]:
idx

tensor([[6109, 3626, 6100,  345]])

In [21]:
tok_emb = gpt.tok_emb
pos_emb = gpt.pos_emb

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
gpt.to("cpu");
x = torch.tensor([[1]])
batch_size, seq_len = x.shape
tok_embeds = tok_emb(x)
pos_embeds = pos_emb(torch.arange(seq_len, device=x.device))
x_embedded = tok_embeds + pos_embeds
# x_embedded # [ 2.1520e-02, -2.4603e-01,  5.0275e-02


cuda


In [23]:
# dir(gpt.trf_blocks)
# gpt.trf_blocks
tb0_norm1 = gpt.trf_blocks.get_submodule("0.norm1")
# dir(tb0_norm1)
x = torch.tensor(np.ones((1, 768) , dtype=np.float32))
# tb0_norm1(x) # [-3.6773e-03,  2.7197e-02, -6.4041e-02

tb11_norm1 = gpt.trf_blocks.get_submodule("11.norm1")
#tb11_norm1(x) # [ 5.0957e-02,  5.3063e-03,  7.1952e-02

In [24]:
# test query layer, key layer, value layer
x = torch.tensor(np.ones((1, 768) , dtype=np.float32))

# block 0
att0 = gpt.trf_blocks.get_submodule("0.att")
query0 = att0.get_submodule("W_query")
# query0(x) # [-1.3708e+01,  1.3385e+01,  1.4323e+01
key0 = att0.get_submodule("W_key")
# key0(x) # [ 1.8049e-01, -1.4381e-01,  6.2964e-01
value0 = att0.get_submodule("W_value")
# value0(x)
proj0 = att0.get_submodule("out_proj")
# proj0(x)
# proj0(value0(key0(query0(x)))) # [-2.3273e+01, -7.9272e+02,  5.6245e+02

# block 11
att11 = gpt.trf_blocks.get_submodule("11.att")
query11 = att11.get_submodule("W_query")
# query11(x) # [-5.4209e+00,  4.6236e+00,  4.5401e+00
key11 = att11.get_submodule("W_key")
# key11(x) # [ 5.8911e+00, -3.3184e-01,  6.3656e-01
value11 = att11.get_submodule("W_value")
# value11(x) # [-1.2480e+00, -3.0783e+00,  5.9679e+00

proj11 = att11.get_submodule("out_proj")
# proj11(x) # [-4.1535e-01,  2.1763e+00,  4.7958e-01

# proj11(value11(key11(query11(x)))) # [ 3.4414e+02,  4.9568e+02,  3.8639e+02


In [25]:

x = torch.tensor(np.ones((1, 768) , dtype=np.float32))
tb0_norm2 = gpt.trf_blocks.get_submodule("0.norm2")
# tb0_norm2(x)

tb0_ff = gpt.trf_blocks.get_submodule("0.ff")
#tb0_ff(x) # [-1.6735e+01, -6.9883e+00,  4.1138e+00

# tb11_norm2 = gpt.trf_blocks.get_submodule("11.norm2")
# tb11_norm2(x)
tb11_ff = gpt.trf_blocks.get_submodule("11.ff")
#tb11_ff(x) # [ 1.3675e+01,  2.2839e+01, -1.7306e+01

In [41]:
x_trivial = torch.tensor([[1, 2, 3]])
gpt(x_trivial)

tensor([[[-32.9011, -31.2024, -34.6623,  ..., -39.4868, -39.8732, -32.2387],
         [-55.5208, -53.4286, -56.4767,  ..., -68.1539, -66.7709, -58.6006],
         [-61.7969, -60.5386, -59.5503,  ..., -75.3206, -72.7731, -65.5706]]],
       grad_fn=<UnsafeViewBackward0>)

In [49]:
context_size=GPT_CONFIG_124M["context_length"]
idx = text_to_token_ids("Every effort moves you", tokenizer)

# x # tensor([[6109, 3626, 6100,  345]])
idx_cond = idx[:, -context_size:]
idx_cond # tensor([[6109, 3626, 6100,  345]])
logits = gpt(idx)
print("logits:\n", logits)
logits = logits[:, -1, :]
print("logits:\n", logits)
print("logits[-1]:\n", logits[-1])
probas = torch.softmax(logits, dim=-1)
print("probas:\n", probas)
idx_next = torch.argmax(probas, dim=-1)
print("idx_next:\n", idx_next)

logits:
 tensor([[[ -35.5820,  -34.9804,  -38.4522,  ...,  -42.0959,  -41.8533,
           -35.5966],
         [ -76.9601,  -76.6970,  -81.9309,  ...,  -88.7984,  -86.7631,
           -78.9627],
         [-125.3487, -126.2704, -135.0948,  ..., -132.3173, -135.2544,
          -127.6511],
         [-136.6002, -137.3804, -146.5556,  ..., -148.2978, -147.2155,
          -139.5677]]], grad_fn=<UnsafeViewBackward0>)
logits:
 tensor([[-136.6002, -137.3804, -146.5556,  ..., -148.2978, -147.2155,
         -139.5677]], grad_fn=<SliceBackward0>)
logits[-1]:
 tensor([-136.6002, -137.3804, -146.5556,  ..., -148.2978, -147.2155,
        -139.5677], grad_fn=<SelectBackward0>)
probas:
 tensor([[1.6013e-03, 7.3391e-04, 7.6013e-08,  ..., 1.3313e-08, 3.9292e-08,
         8.2357e-05]], grad_fn=<SoftmaxBackward0>)
idx_next:
 tensor([2651])


In [54]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

gpt.to(device);
token_ids = generate(
    model=gpt,
    idx=text_to_token_ids("Every effort moves you", tokenizer).to(device),
    max_new_tokens=5,
    context_size=NEW_CONFIG["context_length"],
)
print(token_ids)

i= 0 idx_cond= tensor([[6109, 3626, 6100,  345]], device='cuda:0')
  logits= torch.Size([1, 50257])
  idx_next= tensor([[2651]], device='cuda:0')
i= 1 idx_cond= tensor([[6109, 3626, 6100,  345, 2651]], device='cuda:0')
  logits= torch.Size([1, 50257])
  idx_next= tensor([[13]], device='cuda:0')
i= 2 idx_cond= tensor([[6109, 3626, 6100,  345, 2651,   13]], device='cuda:0')
  logits= torch.Size([1, 50257])
  idx_next= tensor([[198]], device='cuda:0')
i= 3 idx_cond= tensor([[6109, 3626, 6100,  345, 2651,   13,  198]], device='cuda:0')
  logits= torch.Size([1, 50257])
  idx_next= tensor([[198]], device='cuda:0')
i= 4 idx_cond= tensor([[6109, 3626, 6100,  345, 2651,   13,  198,  198]], device='cuda:0')
  logits= torch.Size([1, 50257])
  idx_next= tensor([[464]], device='cuda:0')
tensor([[6109, 3626, 6100,  345, 2651,   13,  198,  198,  464]],
       device='cuda:0')


In [56]:
gpt.to("cpu")
start_context = "Every effort moves you"
token_ids = generate_text_simple(
    model=gpt,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=256
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you forward.

The first step is to understand
