In [4]:
import urllib.request
# Location of the gpt_download.py helper script, assembled from the raw-content
# repository root, the chapter directory and the file name.
url = "/".join([
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main",
    "ch05/01_main-chapter-code",
    "gpt_download.py",
])

In [5]:
import os

# Store the script under ch05/, named after the last path component of the URL.
filename = "ch05/" + url.split("/")[-1]
# urlretrieve opens the target path directly and does not create missing parent
# directories, so make sure ch05/ exists before downloading.
os.makedirs(os.path.dirname(filename), exist_ok=True)
# NOTE(review): the URL tracks the `main` branch and the downloaded file is
# imported (i.e. executed) in a later cell; consider pinning to a commit hash.
urllib.request.urlretrieve(url, filename)

('ch05/gpt_download.py', <http.client.HTTPMessage at 0x178d57410>)

In [6]:
from ch05.gpt_download import download_and_load_gpt2

# Load the 124M GPT-2 checkpoint stored under ch05/gpt2. `settings` and `params`
# are both used by later cells (printed, then passed to the weight loader).
settings, params = download_and_load_gpt2(
    models_dir="ch05/gpt2",
    model_size="124M",
)

File already exists and is up-to-date: ch05/gpt2/124M/checkpoint
File already exists and is up-to-date: ch05/gpt2/124M/encoder.json
File already exists and is up-to-date: ch05/gpt2/124M/hparams.json
File already exists and is up-to-date: ch05/gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: ch05/gpt2/124M/model.ckpt.index
File already exists and is up-to-date: ch05/gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: ch05/gpt2/124M/vocab.bpe


In [7]:
# Show the checkpoint hyperparameters and the top-level parameter groups,
# one per line.
print(settings, params.keys(), sep="\n")

{'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}
dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])


In [8]:
# Inspect the "wte" array (later loaded into gpt.tok_emb) and its shape.
token_embeddings = params["wte"]
print(token_embeddings, token_embeddings.shape, sep="\n")

[[-0.11010301 -0.03926672  0.03310751 ... -0.1363697   0.01506208
   0.04531523]
 [ 0.04034033 -0.04861503  0.04624869 ...  0.08605453  0.00253983
   0.04318958]
 [-0.12746179  0.04793796  0.18410145 ...  0.08991534 -0.12972379
  -0.08785918]
 ...
 [-0.04453601 -0.05483596  0.01225674 ...  0.10435229  0.09783269
  -0.06952604]
 [ 0.1860082   0.01665728  0.04611587 ... -0.09625227  0.07847701
  -0.02245961]
 [ 0.05135201 -0.02768905  0.0499369  ...  0.00704835  0.15519823
   0.12067825]]
(50257, 768)


In [9]:
# Architecture overrides for each GPT-2 size. Every entry uses the same three
# keys so that it can be merged into a base config with dict.update(); a
# misspelled key would silently leave the base value in place.
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

In [10]:
from codes.configs import GPT_CONFIG_124M

model_name = "gpt2-small (124M)"

# Work on a copy so the imported base config is left untouched, then overlay the
# size-specific entries followed by the context length (1024 matches the n_ctx
# value printed from the checkpoint settings).
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_configs[model_name], context_length=1024)

In [11]:
# The weight loader assigns biases to the query/key/value projections;
# presumably this flag makes the model create them - confirm in GPTModel.
NEW_CONFIG["qkv_bias"] = True

In [17]:
from codes.gpt_model import GPTModel
# Build the model from the adjusted config; its parameters are replaced in a
# later cell by load_weights_into_gpt.
gpt = GPTModel(NEW_CONFIG)
# gpt.eval() is not called here; it is called after the weights have been loaded.

In [18]:
import torch

def assign(left, right):
    """Return a new parameter holding ``right`` after checking it against ``left``.

    Args:
        left: The existing parameter/tensor that is being replaced. It supplies
            the expected shape and the dtype and device of the result.
        right: Array-like with the new values (a NumPy array in this notebook).

    Returns:
        torch.nn.Parameter: A copy of ``right`` converted to ``left``'s dtype
        and placed on ``left``'s device.

    Raises:
        AssertionError: If ``left`` and ``right`` do not have the same shape.
    """
    if left.shape != right.shape:
        # Raised explicitly instead of via `assert` so the check is not
        # stripped when Python runs with -O; the exception type is unchanged.
        raise AssertionError(
            f"Shape mismatch. Left: {left.shape}, Right: {right.shape}"
        )
    # Match the destination's dtype/device so that, for example, a float64 array
    # does not produce a float64 parameter inside a float32 model.
    return torch.nn.Parameter(
        torch.tensor(right, dtype=left.dtype, device=left.device)
    )

In [19]:
import numpy as np

def load_weights_into_gpt(gpt, params):
    """Replace the parameters of ``gpt`` with the arrays stored in ``params``.

    Every destination parameter is replaced with the result of ``assign``,
    which checks that the source array has the same shape as the parameter.

    Args:
        gpt: Model exposing ``pos_emb``, ``tok_emb``, ``trans_blocks``,
            ``final_norm`` and ``out_head``.
        params: Mapping with the keys ``"wpe"``, ``"wte"``, ``"blocks"``,
            ``"g"`` and ``"b"``. ``params["blocks"][i]`` holds the ``"attn"``,
            ``"mlp"``, ``"ln_1"`` and ``"ln_2"`` entries for block ``i``.

    Returns:
        None. ``gpt`` is modified in place.
    """
    # Attribute names of the three projections, in the order in which their
    # slices appear in the fused "c_attn" arrays.
    qkv_names = ("W_query", "W_key", "W_value")

    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])

    for idx in range(len(params["blocks"])):
        src = params["blocks"][idx]
        block = gpt.trans_blocks[idx]
        attn = block.attn

        # "c_attn" holds query, key and value fused along the last axis; cut it
        # into three equal parts.
        c_attn = src["attn"]["c_attn"]
        # Weight matrices are transposed before assignment - presumably the
        # checkpoint stores them as (in, out); TODO confirm.
        for name, weight in zip(qkv_names, np.split(c_attn["w"], 3, axis=-1)):
            proj = getattr(attn, name)
            proj.weight = assign(proj.weight, weight.T)
        for name, bias in zip(qkv_names, np.split(c_attn["b"], 3, axis=-1)):
            proj = getattr(attn, name)
            proj.bias = assign(proj.bias, bias)

        # Attention output projection.
        attn_proj = src["attn"]["c_proj"]
        attn.out_proj.weight = assign(attn.out_proj.weight, attn_proj["w"].T)
        attn.out_proj.bias = assign(attn.out_proj.bias, attn_proj["b"])

        # Feed-forward network: layers[0] takes "c_fc" and layers[2] takes
        # "c_proj".
        mlp = src["mlp"]
        for layer_idx, key in ((0, "c_fc"), (2, "c_proj")):
            linear = block.ffn.layers[layer_idx]
            linear.weight = assign(linear.weight, mlp[key]["w"].T)
            linear.bias = assign(linear.bias, mlp[key]["b"])

        # Layer norms: "g" goes to scale, "b" goes to shift.
        for norm, key in ((block.norm1, "ln_1"), (block.norm2, "ln_2")):
            norm.scale = assign(norm.scale, src[key]["g"])
            norm.shift = assign(norm.shift, src[key]["b"])

    final_norm = gpt.final_norm
    final_norm.scale = assign(final_norm.scale, params["g"])
    final_norm.shift = assign(final_norm.shift, params["b"])

    # The output head is filled from the same "wte" array as the token
    # embedding; assign() creates a separate parameter for it.
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])

In [20]:
# Replace the parameters of `gpt` with the arrays in `params` loaded from the
# checkpoint.
load_weights_into_gpt(gpt, params)

In [21]:
# Switch to evaluation mode before generating text (the model contains Dropout
# layers, see the repr below); the cell output is the model's repr.
gpt.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trans_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiheadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ffn): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiheadAttention(
        (W_query): Linear(in_featu

In [22]:
import tiktoken
from codes.utils import text_to_token_ids, token_ids_to_text, generate

# Tokenizer for the "gpt2" encoding; used below to turn the prompt into token
# ids and the generated ids back into text.
tokenizer = tiktoken.get_encoding("gpt2")

In [24]:
# Seed the torch RNG before generating so the run starts from a fixed state.
torch.manual_seed(123)

prompt_ids = text_to_token_ids("Every effort moves you", tokenizer=tokenizer)
token_ids = generate(
    model=gpt,
    idx=prompt_ids.to("cpu"),
    max_new_tokens=25,
    context_size=NEW_CONFIG["context_length"],
    top_k=50,
    temperature=1.5,
)

# Decode the generated ids back to text for display.
generated_text = token_ids_to_text(token_ids, tokenizer=tokenizer)
print(generated_text)

Every effort moves you toward finding an ideal new way to practice something!

What makes us want to be on top of that?




In [26]:
# Persist the model's state dict (now holding the loaded GPT-2 weights) so it
# can be restored later.
checkpoint_path = "./ch05/gpt2-small-124m-pretrained.pth"
torch.save(gpt.state_dict(), checkpoint_path)