In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf

In [2]:
import numpy as np
from typing import Any

In [3]:
def color_print(text, condition):
    """Print ``text`` wrapped in an ANSI colour code chosen by ``condition``.

    Args:
        text: Value to print; it is interpolated into an f-string.
        condition: Truthy selects green, falsy selects red.
    """
    # ANSI escape sequences: bright green, bright red, and attribute reset.
    ansi_green, ansi_red, ansi_reset = '\033[92m', '\033[91m', '\033[0m'
    chosen = ansi_green if condition else ansi_red
    print(f"{chosen}{text}{ansi_reset}")

In [4]:
# load weights into a "params" dict
def load_gpt2_params_from_tf_ckpt(ckpt_path, settings) -> dict[str, Any]:
    """Load every variable of a TensorFlow checkpoint into a nested dict.

    The first path component of each variable name (e.g. ``model/``) is
    dropped. Variables whose next component is ``h<N>`` (``N`` a decimal
    layer index) are stored under ``params["blocks"][N]``; all others are
    stored at the top level. For both kinds the first remaining component is
    used only for routing: intermediate components after it become nested
    dicts and the final component is the key of the array. A name such as
    ``model/ln_f/b`` therefore ends up as ``params["b"]``.

    Args:
        ckpt_path: Checkpoint path passed to ``tf.train.list_variables`` and
            ``tf.train.load_variable``.
        settings: Mapping providing ``"n_layer"``, the number of per-layer
            dicts to pre-allocate in ``params["blocks"]``.

    Returns:
        A dict with a ``"blocks"`` list (one dict per layer) plus the
        top-level arrays. Every array has had singleton dimensions removed.

    Raises:
        IndexError: If a variable refers to a layer index >= ``n_layer`` or
            its name has no component after the prefix.
    """
    # Initialize parameters dictionary with empty blocks for each layer
    params = {"blocks": [{} for _ in range(settings["n_layer"])]}

    # Iterate over each variable in the checkpoint
    for name, _ in tf.train.list_variables(ckpt_path):
        # Load the variable and remove singleton dimensions
        variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name))

        # Process the variable name to extract relevant parts
        variable_name_parts = name.split("/")[1:]  # Skip the 'model/' prefix
        head = variable_name_parts[0]

        # Identify the target dictionary for the variable. Only "h<digits>"
        # denotes a layer; requiring digits keeps other names that merely
        # start with "h" from being fed to int() and crashing.
        target_dict = params
        if head.startswith("h") and head[1:].isdecimal():
            target_dict = params["blocks"][int(head[1:])]

        # Walk (creating as needed) the nested dictionaries
        for key in variable_name_parts[1:-1]:
            target_dict = target_dict.setdefault(key, {})

        # Assign the variable array to the last key
        target_dict[variable_name_parts[-1]] = variable_array

    return params

In [5]:
# "n_layer" is the only setting load_gpt2_params_from_tf_ckpt reads; it sizes
# the per-layer "blocks" list.
settings = {"n_layer": 12}

# NOTE(review): relative path - it resolves against the kernel's working directory.
model_dir="ch05/01_main-chapter-code/gpt2/124M"
tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
# `params` is a notebook-level global that later cells (and GPT21) read directly.
params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)
# params.keys() # dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])

In [6]:
# Toy embedding table (4 rows of width 4, default random initialisation) used
# to probe lookup behaviour in the next cell.
emb = tf.keras.layers.Embedding(input_dim=4, output_dim=4)


In [7]:
# `emb` was created with input_dim=4, so ids 4, 5 and 6 are out of range.
# The recorded output shows all-zero rows for those three ids instead of an
# error. NOTE(review): this may be device-dependent - confirm before relying on it.
x = tf.constant([1,2,3,4,5,6])
emb(x)

<tf.Tensor: shape=(6, 4), dtype=float32, numpy=
array([[ 0.02903571,  0.0076947 , -0.03703507, -0.02101977],
       [-0.01560724, -0.03536775,  0.0268792 , -0.02861235],
       [-0.00908197, -0.02716467,  0.01001495,  0.01660894],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ]],
      dtype=float32)>

In [8]:
# Model dimensions, presumably for the 124M checkpoint loaded above (the name
# and model_dir suggest so - confirm). The same dict is defined again before
# the model is constructed.
config124M = {'n_embd': 768, 'n_vocab': 50257, 'n_ctx': 1024, 'n_layer': 12, 'n_head': 12}

In [9]:
# https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding

In [10]:
# Token and position embedding layers initialised from the checkpoint arrays
# params['wte'] and params['wpe']. The literal sizes repeat the n_vocab, n_ctx
# and n_embd values of config124M.
# w = np.array(params['wte'])
word_emb = tf.keras.layers.Embedding(input_dim=50257, output_dim=768, weights=[np.array(params['wte'])])
pos_emb = tf.keras.layers.Embedding(input_dim=1024, output_dim=768, weights=[np.array(params['wpe'])])


In [11]:
# First attempt at combining token and position embeddings, for one token id.
X1 = tf.constant([[1]])
x_trivial = tf.constant([[1, 2, 3]])  # defined here but not used in this cell
we = word_emb(X1)
print(we.shape)
# Looks up the embeddings of all 1024 positions at once.
pe = pos_emb(tf.range(1024))
print(pe.shape) # (1024, 768) per the recorded output
# NOTE(review): per the printed shapes `we` is (1, 1, 768) and `pe` is (1024, 768),
# so this add presumably broadcasts the one token embedding over every position.
e = we + pe # [ 2.1520e-02, -2.4603e-01,  5.0275e-02

# Only the first three features at position 0 are compared with the reference values.
test_values = e[0][0][:3].numpy()
expected_values = np.array([ 2.1520e-02, -2.4603e-01,  5.0275e-02])
test_result = np.allclose(test_values, expected_values, rtol=1e-3, atol=1e-3)
color_print(f"test_result: {test_result}", test_result)

(1, 1, 768)
(1024, 768)
[92mtest_result: True[0m


In [12]:
# Same check for a three-token sequence, with the position embeddings cut
# down to the sequence length before the add.
x_trivial = tf.constant([[1, 2, 3]])
we = word_emb(x_trivial) # TensorShape([1, 3, 768])
pe = pos_emb(tf.range(1024)) # .shape # TensorShape([1024, 768])
# Keep only the first 3 positions so the shapes line up with `we`.
pe_corrected = pe[:3, :]
x = we + pe_corrected # [ 0.02151961, -0.24603364,  0.05027542
# `we` stays a notebook global; the later manual-embedding cell reads it.
test_values = x[0][0][:3].numpy()
expected_values = np.array([ 0.02151961, -0.24603364,  0.05027542])
test_result = np.allclose(test_values, expected_values, rtol=1e-3, atol=1e-3)
color_print(f"test_result: {test_result}", test_result)

[92mtest_result: True[0m


In [13]:
we.shape

TensorShape([1, 3, 768])

In [14]:

# Layer-normalisation layers for block `b`. Shift (beta) and scale (gamma)
# come from the checkpoint through constant initialisers, so they are applied
# when each layer builds its variables.
b=0
norm1_beta = tf.keras.initializers.Constant(params["blocks"][b]["ln_1"]["b"])
norm1_gamma = tf.keras.initializers.Constant(params["blocks"][b]["ln_1"]["g"])
norm1 = tf.keras.layers.LayerNormalization(beta_initializer=norm1_beta, gamma_initializer=norm1_gamma, name=f"norm1-{b}")

norm2_beta = tf.keras.initializers.Constant(params["blocks"][b]["ln_2"]["b"])
norm2_gamma = tf.keras.initializers.Constant(params["blocks"][b]["ln_2"]["g"])
norm2 = tf.keras.layers.LayerNormalization(beta_initializer=norm2_beta, gamma_initializer=norm2_gamma, name=f"norm2-{b}")

# Top-level "b" / "g" entries of `params`; presumably the final layer norm of
# the checkpoint (the loader drops the first component after the prefix) - confirm.
final_norm_beta = tf.keras.initializers.Constant(params["b"])
final_norm_gamma = tf.keras.initializers.Constant(params["g"])
final_norm = tf.keras.layers.LayerNormalization(beta_initializer=final_norm_beta, gamma_initializer=final_norm_gamma, name=f"final-norm")


In [15]:
# Feed an all-ones vector through each layer norm and compare the first three
# outputs with reference values (their source is not shown in this notebook).
x = np.ones((1, 768) , dtype=np.float32)

# NOTE(review): the norm1 and norm2 checks use np.allclose's default
# tolerances, while the final_norm check passes rtol=atol=1e-3.
test_values = norm1(x)[0][:3].numpy()
expected_values = np.array([-3.6773e-03,  2.7197e-02, -6.4041e-02])
test_result = np.allclose(test_values, expected_values)
color_print(f"norm1 test_result: {test_result}", test_result)

test_values = norm2(x)[0][:3].numpy()
expected_values = np.array([ 4.2478e-02,  3.2627e-02,  4.4881e-03])
test_result = np.allclose(test_values, expected_values)
color_print(f"norm2 test_result: {test_result}", test_result)

test_values = final_norm(x)[0][:3].numpy()
expected_values = np.array([ 1.0872e-03,  3.6529e-02, -6.7296e-02])
test_result = np.allclose(test_values, expected_values, rtol=1e-3, atol=1e-3)
color_print(f"final_norm test_result: {test_result}", test_result)


[92mnorm1 test_result: True[0m
[92mnorm2 test_result: True[0m
[92mfinal_norm test_result: True[0m


In [16]:
params['wte'].shape

(50257, 768)

In [17]:
params["blocks"][b]["mlp"]["c_proj"]["w"].shape

(3072, 768)

In [18]:
# All-ones probe vector reused by the layer checks in the following cells.
x = np.ones((1, 768) , dtype=np.float32)
# Output head: a bias-free Dense layer whose kernel is the transposed token
# embedding matrix.
out_head_layer = tf.keras.layers.Dense(units=50257, activation=None, use_bias=False, name="out-head")
# Dense.build only uses the last dimension of the input shape, so give an
# unknown batch dimension instead of the misleading (50257, 768).
out_head_layer.build((None, 768))
# params['wte'] is (50257, 768); the Dense kernel must be (768, 50257).
out_head_layer.set_weights([params['wte'].T])
# out_head_layer.weights
# out_head_layer.get_config()


In [19]:
b=0
# c_attn holds one combined weight matrix and bias; split each into three
# equal parts along the last axis and use them as query / key / value.
q_w, k_w, v_w = np.split((params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1)
q_b, k_b, v_b = np.split((params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1)

# Each layer is built explicitly so its variables exist before set_weights.
query_layer = tf.keras.layers.Dense(units=768, activation=None, name=f"query-{b}")
query_layer.build((None, 768))
key_layer = tf.keras.layers.Dense(units=768, activation=None, name=f"key-{b}")
key_layer.build((None, 768))
value_layer = tf.keras.layers.Dense(units=768, activation=None, name=f"value-{b}")
value_layer.build((None, 768))
proj_layer = tf.keras.layers.Dense(units=768, activation=None, name=f"proj-{b}")
proj_layer.build((None, 768))

# TODO: WORK ON THESE!
perceptron_layer = tf.keras.layers.Dense(units=3072, activation=tf.keras.activations.gelu, name=f"mlp-perceptron-{b}")
# Input width is 768; only the last dimension matters to Dense.build, so use
# the same (None, features) form as the sibling layers.
perceptron_layer.build((None, 768))
mlp_proj_layer = tf.keras.layers.Dense(units=768, activation=None, name=f"mlp-proj-{b}")
mlp_proj_layer.build((None, 3072))

# Copy the checkpoint weights in; each list is [kernel, bias].
query_layer.set_weights([q_w, q_b])
key_layer.set_weights([k_w, k_b])
value_layer.set_weights([v_w, v_b])
proj_layer.set_weights([params["blocks"][b]["attn"]["c_proj"]["w"], params["blocks"][b]["attn"]["c_proj"]["b"]])
perceptron_layer.set_weights([params["blocks"][b]["mlp"]["c_fc"]["w"], params["blocks"][b]["mlp"]["c_fc"]["b"]])
mlp_proj_layer.set_weights([params["blocks"][b]["mlp"]["c_proj"]["w"], params["blocks"][b]["mlp"]["c_proj"]["b"]])




In [20]:
# Compare the first three outputs of query_layer with reference values.
# Assumes `x` is still the all-ones (1, 768) array from the output-head cell.
# query_layer(x) # [-1.3708e+01,  1.3385e+01,  1.4323e+01
test_values = query_layer(x)[0][:3].numpy()
expected_values = np.array([-1.3708e+01,  1.3385e+01,  1.4323e+01])
test_result = np.allclose(test_values, expected_values, rtol=1e-3, atol=1e-3)
color_print(f"test_result: {test_result}", test_result)

[92mtest_result: True[0m


In [21]:
# Same check for key_layer, using the notebook-level `x`.
test_values = key_layer(x)[0][:3].numpy()
expected_values = np.array([ 1.8049e-01, -1.4381e-01,  6.2964e-01])
test_result = np.allclose(test_values, expected_values, rtol=1e-3, atol=1e-3)
color_print(f"test_result: {test_result}", test_result)

[92mtest_result: True[0m


In [22]:
# Same check for value_layer, using the notebook-level `x`.
test_values = value_layer(x)[0][:3].numpy()
expected_values = np.array([-6.1687e-02, -1.3786e-01, -3.0145e-01])
test_result = np.allclose(test_values, expected_values, rtol=1e-3, atol=1e-3)
color_print(f"test_result: {test_result}", test_result)

[92mtest_result: True[0m


In [23]:
# Same check for the attention output projection, using the notebook-level `x`.
test_values = proj_layer(x)[0][:3].numpy()
expected_values = np.array([-9.7561e+00, -1.7296e+01, -6.7800e-01])
test_result = np.allclose(test_values, expected_values, rtol=1e-3, atol=1e-3)
color_print(f"test_result: {test_result}", test_result)

[92mtest_result: True[0m


In [24]:
# Compose proj_layer, value_layer, key_layer, query_layer
# NOTE(review): this feeds each Dense layer's output into the next
# (query -> key -> value -> proj). It checks that the weights were loaded,
# not an attention computation.
test_values = proj_layer(value_layer(key_layer(query_layer(x))))[0][:3].numpy() # [-2.3273e+01, -7.9272e+02,  5.6245e+02
expected_values = np.array([-2.3273e+01, -7.9272e+02,  5.6245e+02])
test_result = np.allclose(test_values, expected_values, rtol=1e-3, atol=1e-3)
color_print(f"test_result: {test_result}", test_result)

[92mtest_result: True[0m


In [25]:
# Check the GELU-activated first MLP layer on the notebook-level `x`.
test_values =  perceptron_layer(x)[0][:3].numpy()  # [ 3.5592, -0.1381, -0.1655
expected_values = np.array([ 3.5592, -0.1381, -0.1655])
test_result = np.allclose(test_values, expected_values, rtol=1e-3, atol=1e-3)
color_print(f"test_result: {test_result}", test_result)


[92mtest_result: True[0m


In [26]:
# test_values =  mlp_proj_layer(x)[0][:3].numpy()  # How to test?


In [27]:
# Compose perceptron_layer, mlp_proj_layer
# mlp_proj_layer expects 3072 input features, so it is checked on the
# perceptron output rather than directly on `x`.
test_values =  mlp_proj_layer(perceptron_layer(x))[0][:3].numpy()  # [-1.6735e+01, -6.9883e+00,  4.1138e+00
expected_values = np.array([-1.6735e+01, -6.9883e+00,  4.1138e+00])
test_result = np.allclose(test_values, expected_values, rtol=1e-3, atol=1e-3)
color_print(f"test_result: {test_result}", test_result)

[92mtest_result: True[0m


In [28]:

# Check the first three outputs of the output head on the notebook-level `x`.
test_values = out_head_layer(x)[0][:3].numpy()
expected_values = np.array([ 0.3766,  3.4404,  2.0287])
test_result = np.allclose(test_values, expected_values, rtol=1e-3, atol=1e-3)
color_print(f"test_result: {test_result}", test_result)

[92mtest_result: True[0m


In [29]:

class GPT21(tf.keras.Model):
    """GPT-2 style model skeleton whose layers are loaded from checkpoint weights.

    NOTE(review): each block is a plain ``Sequential`` chain
    (norm1 -> query -> key -> value -> proj -> norm2 -> MLP). It contains no
    scaled dot-product attention, causal mask or residual connections, so its
    output is not expected to match a real GPT-2 forward pass yet.
    """

    def __init__(self, config, name=None, trainable=True, dtype=None,
                 weights_dict=None, verbose=False):
        """Create all layers and copy the checkpoint weights into them.

        Args:
            config: Mapping with ``'n_embd'``, ``'n_vocab'``, ``'n_ctx'`` and
                ``'n_layer'``.
            name: Optional Keras model name.
            trainable: Stored on the model as ``self.trainable``.
            dtype: Forwarded to the Keras base class.
            weights_dict: Nested weight dict as produced by
                ``load_gpt2_params_from_tf_ckpt``. When omitted, the
                module-level ``params`` variable is used.
            verbose: When true, ``call`` prints shapes and the first three
                values of each intermediate tensor (eager execution only).
        """
        super().__init__(name=name, dtype=dtype)
        self.trainable = trainable
        self.verbose = verbose
        self.embedding_size = config['n_embd']
        self.vocab_size = config['n_vocab']
        self.max_position_length = config['n_ctx']
        self.blocks_num = config["n_layer"]

        # Keep working against the notebook global unless weights are passed in.
        gpt_params = params if weights_dict is None else weights_dict

        self.word_emb = tf.keras.layers.Embedding(
            input_dim=self.vocab_size, output_dim=self.embedding_size,
            weights=[np.array(gpt_params['wte'])], name="word_emb")
        self.pos_emb = tf.keras.layers.Embedding(
            input_dim=self.max_position_length, output_dim=self.embedding_size,
            weights=[np.array(gpt_params['wpe'])], name="pos_emb")

        self.blocks = [
            self._gpt_block(b, gpt_params["blocks"][b])
            for b in range(self.blocks_num)
        ]

        self.final_norm = self._gpt_layer_norm("final-norm", gpt_params["b"], gpt_params["g"])
        # The output head reuses the token embedding matrix, transposed so it
        # maps embedding features to vocabulary entries.
        self.out_head_layer = self._gpt_dense("out-head", gpt_params['wte'].T)

    def _gpt_layer_norm(self, name, beta, gamma):
        """Return a LayerNormalization layer with constant beta/gamma initialisers."""
        return tf.keras.layers.LayerNormalization(
            beta_initializer=tf.keras.initializers.Constant(beta),
            gamma_initializer=tf.keras.initializers.Constant(gamma),
            name=name)

    def _gpt_dense(self, name, kernel, bias=None, activation=None):
        """Return a built Dense layer sized from, and loaded with, ``kernel``/``bias``."""
        layer = tf.keras.layers.Dense(
            units=kernel.shape[-1], activation=activation,
            use_bias=bias is not None, name=name)
        # Only the last dimension matters for Dense.build.
        layer.build((None, kernel.shape[0]))
        layer.set_weights([kernel] if bias is None else [kernel, bias])
        return layer

    def _gpt_block(self, b, block_params):
        """Build the Sequential container ``block-<b>`` from one layer's weights."""
        attn, mlp = block_params["attn"], block_params["mlp"]
        # c_attn holds one combined matrix/bias; split into three equal parts
        # along the last axis for query, key and value.
        q_w, k_w, v_w = np.split(attn["c_attn"]["w"], 3, axis=-1)
        q_b, k_b, v_b = np.split(attn["c_attn"]["b"], 3, axis=-1)
        return tf.keras.Sequential([
            self._gpt_layer_norm(f"norm1-{b}", block_params["ln_1"]["b"], block_params["ln_1"]["g"]),
            self._gpt_dense(f"query-{b}", q_w, q_b),
            self._gpt_dense(f"key-{b}", k_w, k_b),
            self._gpt_dense(f"value-{b}", v_w, v_b),
            self._gpt_dense(f"proj-{b}", attn["c_proj"]["w"], attn["c_proj"]["b"]),
            self._gpt_layer_norm(f"norm2-{b}", block_params["ln_2"]["b"], block_params["ln_2"]["g"]),
            self._gpt_dense(f"mlp-perceptron-{b}", mlp["c_fc"]["w"], mlp["c_fc"]["b"],
                            activation=tf.keras.activations.gelu),
            self._gpt_dense(f"mlp-proj-{b}", mlp["c_proj"]["w"], mlp["c_proj"]["b"]),
            ],
            name=f"block-{b}")

    def _gpt_trace(self, label, x):
        """Print the shape and first three values of ``x`` when ``verbose`` is set."""
        if self.verbose:
            print(f"{label}.shape:", x.shape)
            print(f"{label}:", x[0][0][:3].numpy())

    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: Integer token ids; indexed as (batch, sequence).

        Returns:
            The output-head activations for every position.
        """
        if self.verbose:
            print("inputs.shape:", inputs.shape)
        # Use the model's own embedding layers, not same-named notebook globals.
        we = self.word_emb(inputs)
        seq_len = inputs.shape[1]
        if seq_len is None:
            seq_len = tf.shape(inputs)[1]
        # Look up only the positions that are present in the input.
        pe = self.pos_emb(tf.range(seq_len))
        if self.verbose:
            print("we.shape:", we.shape)
            print("pe.shape:", pe.shape)
        x = we + pe
        self._gpt_trace("1 x", x)
        for block in self.blocks:
            x = block(x)
            self._gpt_trace("  b x", x)
        self._gpt_trace("2 x", x)
        x = self.final_norm(x)
        self._gpt_trace("3 x", x)
        x = self.out_head_layer(x)
        self._gpt_trace("4 x", x)
        return x
                     
                     

In [32]:
# Same dimensions as the earlier config124M cell, repeated here before the
# model is built.
config124M = {'n_embd': 768, 'n_vocab': 50257, 'n_ctx': 1024, 'n_layer': 12, 'n_head': 12}
# GPT21 reads its weights from the notebook-level `params`, so the
# checkpoint-loading cell must have run first.
model=GPT21(config124M)

x_trivial = tf.constant([[1, 2, 3]])
# Full forward pass is currently disabled; the trailing values look like
# previously observed output - confirm.
#e = model(x_trivial) # [-32.901043, -31.202375, -34.662212


In [34]:
# Redo the embedding step by hand with the model's own layers.
# NOTE: this rebinds the notebook globals `word_emb` / `pos_emb`.
word_emb = model.word_emb
pos_emb = model.pos_emb
x = word_emb(x_trivial)
pe = pos_emb(tf.range(1024))
# Slice and add using the tensor computed in this cell, not the stale `we`
# left behind by an earlier cell.
pe_corrected = pe[:x.shape[1], :]
x_embedded = x + pe_corrected
x_embedded

<tf.Tensor: shape=(1, 3, 768), dtype=float32, numpy=
array([[[ 0.02151961, -0.24603364,  0.05027542, ...,  0.04301079,
          0.03080702,  0.09767969],
        [-0.10350236, -0.00585408,  0.0892228 , ...,  0.12408535,
         -0.11955193, -0.08801492],
        [-0.08849797, -0.39009592,  0.26571876, ...,  0.19665493,
          0.055693  , -0.2011495 ]]], dtype=float32)>

In [46]:
# Step through block 0 by hand: fetch its first layer norm by name and apply
# it to the embedded input from the previous cell.
block0 = model.get_layer('block-0')
norm10 = block0.get_layer('norm1-0')
x01 = norm10(x_embedded)
print('x01:', x01[0][0][:3].numpy())

x01: [ 0.00979763 -0.09247549 -0.04293772]


In [50]:
# Fetch block 0's query / key / value Dense layers by their layer names.
query0 = block0.get_layer('query-0')
key0 = block0.get_layer('key-0')
value0 = block0.get_layer('value-0')

In [51]:
# NOTE(review): the three Dense layers are chained one after another
# (query -> key -> value), the order they have inside the Sequential block.
# This is not an attention computation.
x02 = value0(key0(query0(x01)))
print('x02:', x02[0][0][:3].numpy())

x02: [-9.707672 14.597338 -9.301317]
