In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import numpy as np
!pip install -q tiktoken
from utils import load_gpt2_params_from_tf_ckpt
from utils import tldr
from utils import text_to_token_ids
from utils import token_ids_to_text
from utils import generate_text_simple
tf.random.set_seed(1234)

[0m

In [2]:
from gpt2u import GPT2u

In [3]:
# Token ids of the prompt "Every effort moves you" (batch of one sequence).
x_effort = tf.constant([[6109, 3626, 6100, 345]])

# Hyperparameters handed to the GPT2u constructor for the 124M configuration.
config124M = {
    'n_embd': 768,
    'n_vocab': 50257,
    'n_ctx': 1024,
    'n_layer': 12,
    'n_head': 12,
}
gpt2u = GPT2u(config124M)

### First test with random weights

In [19]:
# Forward pass before any pretrained weights are loaded. The reference numbers
# presumably depend on the seed set in the first cell — re-check if it changes.
y = gpt2u(x_effort)
assert y.shape == [1, 4, 50257]
assert np.allclose(
    y[0, 0, :3],
    np.array([-44.049656, -20.970324, -23.276432]),
    rtol=1e-3,
    atol=1e-3,
)

# Generate 10 new tokens from the text prompt and print the decoded result.
start_context = "Every effort moves you"
token_ids = generate_text_simple(
    model=gpt2u,
    idx=text_to_token_ids(start_context),
    max_new_tokens=10,
    context_size=256,
)

print("Output text:\n", token_ids_to_text(token_ids))

.. final logits: float32 (1, 4, 50257) [-44.049656 -20.970324 -23.276432]
.. final logits: float32 (1, 4, 50257) [-44.049656 -20.970324 -23.276432]
.. final logits: float32 (1, 5, 50257) [-44.049656 -20.970324 -23.276432]
.. final logits: float32 (1, 6, 50257) [-44.049652 -20.970324 -23.276432]
.. final logits: float32 (1, 7, 50257) [-44.049652 -20.970324 -23.276432]
.. final logits: float32 (1, 8, 50257) [-44.04966  -20.970287 -23.276419]
.. final logits: float32 (1, 9, 50257) [-44.04966  -20.970287 -23.276419]
.. final logits: float32 (1, 10, 50257) [-44.04965  -20.97031  -23.276417]
.. final logits: float32 (1, 11, 50257) [-44.04965  -20.97031  -23.276417]
.. final logits: float32 (1, 12, 50257) [-44.04965  -20.97031  -23.276417]
.. final logits: float32 (1, 13, 50257) [-44.04965  -20.97031  -23.276417]
Output text:
 Every effort moves younc Brune pioneerSw Summon MB 40 directory Hay comply


In [20]:
# Locate the most recent TensorFlow checkpoint in the pretrained-weights folder.
model_dir = "openai_gpt2_weights/124M"
tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
# latest_checkpoint() returns None when the directory holds no checkpoint;
# stop here with a clear message instead of passing None to the loader.
if tf_ckpt_path is None:
    raise FileNotFoundError(f"No TensorFlow checkpoint found in {model_dir!r}")
# Read the checkpoint into the nested `params` structure used by the loader cell.
params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, config124M)

In [22]:
def _set_variable(owner, attr, value, name):
    """Replace ``owner.<attr>`` with a new ``tf.Variable`` built from ``value``."""
    setattr(owner, attr, tf.Variable(value, name=name))


def _load_linear(layer, weights, name):
    """Set ``layer.w`` and ``layer.b`` from ``weights['w']`` and ``weights['b']``."""
    _set_variable(layer, "w", weights["w"], f"{name}.w")
    _set_variable(layer, "b", weights["b"], f"{name}.b")


def _load_layer_norm(layer_norm, weights, name):
    """Set ``layer_norm.beta`` / ``.gamma`` from ``weights['b']`` / ``weights['g']``."""
    _set_variable(layer_norm, "beta", weights["b"], f"{name}.beta")
    _set_variable(layer_norm, "gamma", weights["g"], f"{name}.gamma")


def load_weights_into_gpt2u(model, params):
    """Load GPT-2 parameters from ``params`` into ``model``, in place.

    Every target attribute is *replaced* by a newly created ``tf.Variable``
    (the existing attribute is not assigned to), so references to the old
    variable objects held elsewhere are not updated.

    Args:
        model: Object exposing ``embedding``, ``transformer.blocks[i]`` (each
            with ``attention`` and ``mlp`` sub-objects) and
            ``transformer.layer_norm``, laid out as accessed below.
        params: Nested mapping with keys ``'wte'``, ``'wpe'``, ``'blocks'``,
            ``'b'`` and ``'g'``. Each ``params['blocks'][i]`` provides
            ``'ln_1'``, ``'attn'`` (``'c_attn'``, ``'c_proj'``), ``'ln_2'`` and
            ``'mlp'`` (``'c_fc'``, ``'c_proj'``).

    Returns:
        None. Prints one progress line per block.

    Raises:
        IndexError: If ``model.transformer.blocks`` is a sequence with fewer
            entries than ``params['blocks']``.
    """
    _set_variable(model.embedding, "word_embedding", params['wte'], "word_embedding")
    _set_variable(model.embedding, "position_embedding", params['wpe'], "position_embedding")

    # Take the block count from the parameters rather than hard-coding 12, so
    # configurations with a different number of layers load unchanged.
    for b in range(len(params["blocks"])):
        print(f"loading block {b}")
        block = model.transformer.blocks[b]
        block_params = params["blocks"][b]
        prefix = f"transformer.blocks-{b}"

        # Attention layer_norm
        attention = block.attention
        _load_layer_norm(attention.layer_norm, block_params["ln_1"], f"{prefix}.attention.layer_norm")

        # c_attn is split into three equal parts along its last axis, taken in
        # the order query, key, value.
        c_attn = block_params["attn"]["c_attn"]
        q_w, k_w, v_w = np.split(c_attn["w"], 3, axis=-1)
        q_b, k_b, v_b = np.split(c_attn["b"], 3, axis=-1)
        qkv = (
            ("query_layer", q_w, q_b),
            ("key_layer", k_w, k_b),
            ("value_layer", v_w, v_b),
        )
        for layer_name, weight, bias in qkv:
            _load_linear(
                getattr(attention.self_attention, layer_name),
                {"w": weight, "b": bias},
                f"{prefix}.attention.self_attention.{layer_name}",
            )
        _load_linear(attention.projection, block_params["attn"]["c_proj"], f"{prefix}.attention.projection")

        # Feed-forward part of the block
        mlp = block.mlp
        _load_layer_norm(mlp.layer_norm, block_params["ln_2"], f"{prefix}.mlp.layer_norm")
        _load_linear(mlp.perceptron, block_params["mlp"]["c_fc"], f"{prefix}.mlp.perceptron")
        _load_linear(mlp.projection, block_params["mlp"]["c_proj"], f"{prefix}.mlp.projection")

    # Final layer norm: its parameters sit at the top level of `params`.
    _load_layer_norm(model.transformer.layer_norm, params, "transformer.layer_norm")

In [23]:
# Replace the weights of gpt2u with the parameters read from the checkpoint.
load_weights_into_gpt2u(gpt2u, params)

loading block 0
loading block 1
loading block 2
loading block 3
loading block 4
loading block 5
loading block 6
loading block 7
loading block 8
loading block 9
loading block 10
loading block 11


### Validate embedding layer

In [24]:
# Embed the prompt and spot-check the first three features of the first token.
x_effort = tf.constant([[6109, 3626, 6100, 345]])
x_effort_emb = gpt2u.embedding(x_effort)
assert x_effort_emb.shape == [1, 4, 768]
assert np.allclose(
    x_effort_emb[0, 0, :3],
    np.array([0.07927368, -0.2979193, 0.08817437]),
    rtol=1e-3,
    atol=1e-3,
)

### Validate attention.layer_norm

In [39]:
# Block 0: attention layer norm applied directly to the prompt embeddings.
y = gpt2u.transformer.blocks[0].attention.layer_norm(x_effort_emb)
assert y.shape == [1, 4, 768]
assert np.allclose(
    y[0, 0, :3],
    np.array([0.047223, -0.11664161, -0.02536647]),
    rtol=1e-3,
    atol=1e-3,
)

### Validate attention.self_attention

In [40]:
# Block 0: self-attention sub-layer fed with the raw prompt embeddings
# (not the layer-normed tensor from the previous cell).
y = gpt2u.transformer.blocks[0].attention.self_attention(x_effort_emb)
assert y.shape == [1, 4, 768]
assert np.allclose(
    y[0, 0, :3],
    np.array([0.28434375, -0.00881347, 0.34210888]),
    rtol=1e-3,
    atol=1e-3,
)

### Validate attention.projection

In [41]:
# Block 0: attention output projection, again fed with the raw prompt embeddings.
y = gpt2u.transformer.blocks[0].attention.projection(x_effort_emb)
assert y.shape == [1, 4, 768]
assert np.allclose(
    y[0, 0, :3],
    np.array([2.5602593, 0.34704542, 0.3729586]),
    rtol=1e-3,
    atol=1e-3,
)

### Validate attention

In [33]:
# Block 0: the complete attention module.
y = gpt2u.transformer.blocks[0].attention(x_effort_emb)
# tldr() is not the last expression of the cell, so its result is not displayed here.
tldr(y)
assert y.shape == [1, 4, 768]
# Tolerances loosened to 1e-2 here; the sub-layer checks above use 1e-3.
assert np.allclose(
    y[0, 0, :3],
    np.array([5.4214954e-01, -1.1554953e-01, 2.5736535e-01]),
    rtol=1e-2,
    atol=1e-2,
)

### Validate mlp

In [37]:
# Block 0: the MLP module, fed with the raw prompt embeddings.
y = gpt2u.transformer.blocks[0].mlp(x_effort_emb)
assert y.shape == [1, 4, 768]
# Tolerances loosened to 1e-2 here as well; the sub-layer checks use 1e-3.
assert np.allclose(
    y[0, 0, :3],
    np.array([3.2065992, 2.262373, 1.4517794]),
    rtol=1e-2,
    atol=1e-2,
)

In [31]:
#
#
#
#
#
#

In [6]:
# Re-create the prompt token ids (same values as the earlier definitions of x_effort).
x_effort = tf.constant([[6109, 3626, 6100, 345]])

In [None]:
### Validate the whole model

In [42]:
# End-to-end forward pass through gpt2u; checks the first three logits of the first position.
y = gpt2u(x_effort)
assert y.shape == [1, 4, 50257]
assert np.allclose(
    y[0, 0, :3],
    np.array([-35.521214, -34.924126, -38.39469]),
    rtol=1e-2,
    atol=1e-2,
)
# Last expression of the cell: the summary string returned by tldr() is displayed.
tldr(y)


.. final logits: float32 (1, 4, 50257) [-35.521214 -34.924126 -38.39469 ]


'float32 (1, 4, 50257) [-35.521214 -34.924126 -38.39469 ]'

### Write out to checkpoint

In [45]:
# Save the variables tracked under gpt2u to a TensorFlow checkpoint.
# `chkp_path` is reused by the reload cell below.
chkp_path = "my_checkpoint"
checkpoint = tf.train.Checkpoint(model=gpt2u)
# write() is the last expression, so its return value (the path) is shown as cell output.
checkpoint.write(chkp_path)

'my_checkpoint'

### Reload checkpoint weights into new model gpt2z

In [47]:
# Build a second, independent model and load the checkpoint stored at `chkp_path` into it.
config124M = {'n_embd': 768, 'n_vocab': 50257, 'n_ctx': 1024, 'n_layer': 12, 'n_head': 12}
gpt2z = GPT2u(config124M)
checkpoint = tf.train.Checkpoint(model=gpt2z)
# The file was produced with Checkpoint.write(), whose documented counterpart is
# Checkpoint.read(); restore() is intended for checkpoints created by save().
restore_status = checkpoint.read(chkp_path)
# Fail loudly if any variable that exists in gpt2z found no value in the checkpoint,
# instead of silently keeping its random initialisation.
restore_status.assert_existing_objects_matched()
assert len(gpt2z.variables) == len(gpt2u.variables)

### Validate gpt2z works just like gpt2u

In [49]:
# The reloaded model must reproduce the logits that gpt2u was checked against.
y = gpt2z(x_effort)
assert y.shape == [1, 4, 50257]
assert np.allclose(
    y[0, 0, :3],
    np.array([-35.521214, -34.924126, -38.39469]),
    rtol=1e-2,
    atol=1e-2,
)

.. final logits: float32 (1, 4, 50257) [-35.521214 -34.924126 -38.39469 ]


In [50]:
# Generate 5 new tokens with the reloaded model, starting from the prompt token ids.
token_ids = generate_text_simple(gpt2z, idx=x_effort, max_new_tokens=5, context_size=256)

.. final logits: float32 (1, 4, 50257) [-35.521214 -34.924126 -38.39469 ]
.. final logits: float32 (1, 5, 50257) [-35.521214 -34.924126 -38.39469 ]
.. final logits: float32 (1, 6, 50257) [-35.521214 -34.924126 -38.3947  ]
.. final logits: float32 (1, 7, 50257) [-35.521214 -34.924126 -38.3947  ]
.. final logits: float32 (1, 8, 50257) [-35.521263 -34.924175 -38.394753]


In [19]:
# Expected result: the four prompt ids followed by five generated ids.
assert np.array_equal(token_ids.numpy(), np.array([[6109, 3626, 6100,  345, 2651,   13,  198,  198,  464]]))

In [51]:
# Generate 30 new tokens from the text prompt with the reloaded model and print them decoded.
start_context = "Every effort moves you"
token_ids = generate_text_simple(
    model=gpt2z,
    idx=text_to_token_ids(start_context),
    max_new_tokens=30,
    context_size=256,
)

print("Output text:\n", token_ids_to_text(token_ids))

.. final logits: float32 (1, 4, 50257) [-35.521214 -34.924126 -38.39469 ]
.. final logits: float32 (1, 5, 50257) [-35.521214 -34.924126 -38.39469 ]
.. final logits: float32 (1, 6, 50257) [-35.521214 -34.924126 -38.3947  ]
.. final logits: float32 (1, 7, 50257) [-35.521214 -34.924126 -38.3947  ]
.. final logits: float32 (1, 8, 50257) [-35.521263 -34.924175 -38.394753]
.. final logits: float32 (1, 9, 50257) [-35.521263 -34.924175 -38.394753]
.. final logits: float32 (1, 10, 50257) [-35.521236 -34.92415  -38.39473 ]
.. final logits: float32 (1, 11, 50257) [-35.521236 -34.92415  -38.39473 ]
.. final logits: float32 (1, 12, 50257) [-35.521236 -34.92415  -38.39473 ]
.. final logits: float32 (1, 13, 50257) [-35.521236 -34.92415  -38.39473 ]
.. final logits: float32 (1, 14, 50257) [-35.521236 -34.92415  -38.39473 ]
.. final logits: float32 (1, 15, 50257) [-35.521236 -34.92415  -38.39473 ]
.. final logits: float32 (1, 16, 50257) [-35.521236 -34.92415  -38.39473 ]
.. final logits: float32 (1, 17

In [52]:
# Re-create the prompt ids, then display the token ids of a shorter text prompt.
# NOTE(review): the recorded output starts with id 16833, not the 6109 used in
# x_effort — confirm whether text_to_token_ids changes the case of its input.
x_effort = tf.constant([[6109, 3626, 6100, 345]])
text_to_token_ids("Every effort moves")

<tf.Tensor: shape=(1, 3), dtype=int32, numpy=array([[16833,  3626,  6100]], dtype=int32)>

In [54]:
# Decode the ids returned by the previous cell (recorded output: 'every effort moves').
token_ids_to_text([[16833,  3626,  6100]])

'every effort moves'

In [55]:
# Decode the ids stored in x_effort (recorded output: 'Every effort moves you').
token_ids_to_text([[6109, 3626, 6100, 345]])

'Every effort moves you'

In [56]:
# Display the token ids of another sample prompt.
text_to_token_ids("I really like")

<tf.Tensor: shape=(1, 3), dtype=int32, numpy=array([[  40, 1107,  588]], dtype=int32)>