# Saving and loading the GPT2 model

After training, we saved the model's weights (its `state_dict`) with:

```python
torch.save(model.state_dict(), 'model.pth')
```

Now we learn how to load these weights back into a model, and how to save and restore the optimizer state alongside them.

In [11]:
import torch
import tiktoken
import urllib.request
import myllm.gpt as gpt
import myllm.util

## Loading model to GPTModel

In [13]:
# Pick the compute device. In this cell it is only used as the map_location
# target when the checkpoint tensors are deserialised.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = tiktoken.get_encoding("gpt2")

# Set up the config. Work on a copy so that shrinking the context length does
# not mutate the shared gpt.GPT_CONFIG_124M object in place.
# NOTE(review): assumes the config is a flat mapping, so a shallow copy is
# sufficient - confirm against myllm.gpt.
gpt_config = dict(gpt.GPT_CONFIG_124M)
gpt_config["context_length"] = 256  # reduced for faster training
model = gpt.GPTModel(gpt_config)

# Load the trained weights. weights_only=True restricts unpickling to tensors
# and plain containers, which is all a state_dict holds, so a tampered file
# cannot execute arbitrary code on load.
model.load_state_dict(
    torch.load("model.pth", map_location=device, weights_only=True)
)

# AdamW optimizer over the model parameters; its state is saved together with
# the model weights in the checkpoint cell further down.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.0004, weight_decay=0.1
)
# Evaluation mode: turns the dropout layers off for inference
model.eval()


GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

In [8]:
# Save the optimizer state together with the model weights in one checkpoint
# file, so both can be restored later.
# The model is around 700 MB.
checkpoint_state = {
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
}
torch.save(checkpoint_state, "model_and_optimizer.pth")



In [14]:
# Validate the checkpoint: rebuild the model and the optimizer from scratch
# and restore both from the saved file.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all the two state dicts contain.
checkpoint = torch.load(
    "model_and_optimizer.pth", map_location=device, weights_only=True
)
model = gpt.GPTModel(gpt_config)
model.load_state_dict(checkpoint["model_state_dict"])
# Same hyperparameters as the optimizer that was saved. load_state_dict
# restores the saved param-group settings (lr, weight_decay) anyway, so a
# different value here would be silently overridden.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
# Back to training mode (dropout active) so that training could be resumed
model.train()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

### Does it run?


In [17]:
# Seed the RNG so a fresh "Restart & Run All" reproduces the same text.
# NOTE(review): model.generate presumably samples stochastically when a
# temperature is given - confirm in myllm.gpt.
torch.manual_seed(123)

# Evaluation mode: dropout must be off while generating
model.eval()
start_context = myllm.util.text_to_token_ids('Every effort moves you', tokenizer)

# Generation is pure inference, so skip autograd bookkeeping
with torch.no_grad():
    token_ids = model.generate(
        idx=start_context,
        max_new_tokens=10,
        context_size=gpt_config['context_length'],
        temperature=1.4,
    )

# Decode the generated token ids back to text: the restored model runs
print(myllm.util.token_ids_to_text(token_ids, tokenizer))

Every effort moves youfx window sw Lor nearIVESed by a oak


## Download OpenAI GPT2 weights

In [2]:

# Fetch the helper script that downloads and loads the OpenAI GPT-2 weights.
# NOTE(review): the URL points at the moving `main` branch and the file is
# imported (executed) in the next cell - consider pinning a commit hash.
url = (
    "https://raw.githubusercontent.com/rasbt/"
    "LLMs-from-scratch/main/ch05/"
    "01_main-chapter-code/gpt_download.py"
)
# Local file name = last path component of the URL
filename = url.rsplit('/', 1)[-1]
urllib.request.urlretrieve(url, filename)


('gpt_download.py', <http.client.HTTPMessage at 0x105f4f590>)

In [3]:
# Imported here rather than in the top import cell because gpt_download.py
# only exists on disk after the download cell above has run.
from gpt_download import download_and_load_gpt2

# Download the 124M GPT-2 checkpoint files into ./gpt2 and load them;
# `settings` and `params` are inspected in the following cells.
settings, params = download_and_load_gpt2(models_dir="gpt2", model_size="124M")

checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 37.8kiB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 1.91MiB/s]
hparams.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 65.4kiB/s]
model.ckpt.data-00000-of-00001: 100%|██████████| 498M/498M [06:00<00:00, 1.38MiB/s] 
model.ckpt.index: 100%|██████████| 5.21k/5.21k [00:00<00:00, 2.12MiB/s]
model.ckpt.meta: 100%|██████████| 471k/471k [00:00<00:00, 1.14MiB/s]
vocab.bpe: 100%|██████████| 456k/456k [00:00<00:00, 1.50MiB/s]


In [4]:
# Show the loaded hyperparameters and the top-level keys of the weight dict
print(
    f"Settings {settings}",
    f"Parameter dictionary keys: {params.keys()}",
    sep="\n",
)

Settings {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}
Parameter dictionary keys: dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])


In [5]:
# Inspect the token embedding matrix stored under "wte" and its dimensions
token_emb_weights = params["wte"]
print(token_emb_weights)
print("Token embedding weight tensor dimensions:", token_emb_weights.shape)

[[-0.11010301 -0.03926672  0.03310751 ... -0.1363697   0.01506208
   0.04531523]
 [ 0.04034033 -0.04861503  0.04624869 ...  0.08605453  0.00253983
   0.04318958]
 [-0.12746179  0.04793796  0.18410145 ...  0.08991534 -0.12972379
  -0.08785918]
 ...
 [-0.04453601 -0.05483596  0.01225674 ...  0.10435229  0.09783269
  -0.06952604]
 [ 0.1860082   0.01665728  0.04611587 ... -0.09625227  0.07847701
  -0.02245961]
 [ 0.05135201 -0.02768905  0.0499369  ...  0.00704835  0.15519823
   0.12067825]]
Token embedding weight tensor dimensions: (50257, 768)
