In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import numpy as np
from utils import tldr
from utils import load_gpt2_params_from_tf_ckpt

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [224]:
class Embedding(tf.keras.layers.Layer):
    def __init__(self, embedding_size, vocab_size, max_position_length, dtype=tf.float32):
        super().__init__(name="embedding", dtype=dtype)
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.max_position_length = max_position_length
        self.word_embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size, output_dim=self.embedding_size, name="word_embedding")
        # self.word_embedding.build((None, self.embedding_size))
        self.position_embedding = tf.keras.layers.Embedding(input_dim=self.max_position_length, output_dim=self.embedding_size, name="position_embedding")
        # self.position_embedding.build((None, self.embedding_size))
        self.debug = False

    def build(self, input_shape):
        # input_shape[-1] is the number of features from the previous layer
        if self.debug:       
            print("..Embedding input_shape=", input_shape)
        super().build(input_shape)
        
    def call(self, inputs):
        we = self.word_embedding(inputs)
        pe = self.position_embedding(tf.range(self.max_position_length))
        pe_corrected = pe[:we.shape[1], :]
        x = we + pe_corrected
        if self.debug:
            print(f".. Embedding output: {tldr(x)}")
        return x

In [225]:

class SelfAttention(tf.keras.layers.Layer):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, b, use_outproj=False): # , use_outproj=False):
        super().__init__(name="self")
        self.debug = False
        assert (d_out % num_heads == 0), "d_out must be divisible by num_heads"
        qkv_bias=True # ?
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        # self.query_layer = tf.keras.layers.Dense(units=self.head_dim, activation=None, name=f"query-{b}", use_bias=qkv_bias)
        self.query_layer = tf.keras.layers.Dense(units=self.d_out, activation=None, name=f"query-{b}", use_bias=qkv_bias)
        # self.query_layer.build((None, d_in))
        self.key_layer = tf.keras.layers.Dense(units=self.d_out, activation=None, name=f"key-{b}", use_bias=qkv_bias)
        # self.key_layer.build((None, d_in))
        self.value_layer = tf.keras.layers.Dense(units=self.d_out, activation=None, name=f"value-{b}", use_bias=qkv_bias)
        # self.value_layer.build((None, d_in))
        self.use_outproj = use_outproj

        self.out_proj = tf.keras.layers.Dense(units=d_out, activation=None, name=f"proj-{b}", use_bias=True)
        # self.out_proj.build((None, d_out))

        mask = tf.ones((context_length, context_length), dtype=tf.bool) # square matrix of True
        causal_mask = tf.linalg.band_part(mask, num_lower=-1, num_upper=0) # upper right becomes False
        additive_mask = 1.0 - tf.cast(causal_mask, dtype=tf.float32) # upper right becomes 1.0
        self.additive_mask_applied = additive_mask * -1e9   # upper right is large negative value

    def build(self, input_shape):
        # input_shape[-1] is the number of features from the previous layer
        if self.debug:         
            print("..SelfAttention input_shape=", input_shape)
        super().build(input_shape) 
        
    def call(self, inputs):
        if self.debug:
            print(f".... input to    SelfAttention: {tldr(inputs)}")
        batch_size, num_tokens, d_in = inputs.shape

        keys = self.key_layer(inputs)      
        queries = self.query_layer(inputs)
        values = self.value_layer(inputs)
        
        keys = tf.reshape(keys, [batch_size, num_tokens, self.num_heads, self.head_dim ]) # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim): 2, 6, 2 -> 2, 6, 2, 1        
        queries = tf.reshape(queries, [batch_size, num_tokens, self.num_heads, self.head_dim ]) # 2, 6, 2, 1
        values = tf.reshape(values, [batch_size, num_tokens, self.num_heads, self.head_dim ]) # 2, 6, 2, 1

        keys = tf.transpose(keys, perm=[0, 2, 1, 3])      # [2,6,2,1] ->  [2, 2, 6, 1]        
        values = tf.transpose(values, perm=[0, 2, 1, 3])      # [2,6,2,1] ->  [2, 2, 6, 1]
        queries = tf.transpose(queries, perm=[0, 2, 1, 3])      # [2,6,2,1] ->  [2, 2, 6, 1]
        
        attn_scores = tf.matmul(queries, tf.transpose(keys, perm=[0, 1, 3, 2]))
        trimmed_additive_mask_applied = self.additive_mask_applied[:num_tokens, :num_tokens]
        attn_scores = attn_scores + trimmed_additive_mask_applied
        attn_weights = tf.nn.softmax(attn_scores / keys.shape[-1]**0.5, axis=-1)

        context_vec = tf.matmul(attn_weights, values)        
        context_vec = tf.transpose(context_vec, perm=[0, 2, 1, 3])
        context_vec = tf.reshape(context_vec, [batch_size, num_tokens, self.d_out]) # (b, num_tokens, self.d_out)

        if self.use_outproj: # llmfs uses this!
            context_vec = self.out_proj(context_vec)
        if self.debug:
            print(f".... output from SelfAttention: {tldr(context_vec)}")
        return context_vec

In [226]:

class AttentionLayer(tf.keras.layers.Layer):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, b):
        super().__init__(name="attention")
        self.layer_norm = tf.keras.layers.LayerNormalization(name="layer_norm") 
        self.self_attention = SelfAttention(d_in, d_out, context_length, dropout, num_heads, b)
        self.projection = tf.keras.layers.Dense(units=d_out, activation=None, name=f"projection")
        self.debug = False

    def build(self, input_shape):
        # input_shape[-1] is the number of features from the previous layer
        if self.debug:         
            print("..AttentionLayer input_shape=", input_shape)
        super().build(input_shape)  
        
    def call(self, inputs):
        if self.debug:
            print(f".... input to    layer_norm: {tldr(inputs)}")
        x = self.layer_norm(inputs)
        if self.debug:
            print(f".... output from layer_norm: {tldr(x)}")
        x = self.self_attention(x)
        if self.debug:
            print(f".... input to    projection: {tldr(x)}")
        x = self.projection(x)
        if self.debug:
            print(f".... output from projection: {tldr(x)}")
        return x

In [227]:
class MultiLayerPerceptron(tf.keras.layers.Layer):
    def __init__(self, d_out, b):
        super().__init__(name="mlp")
        self.layer_norm = tf.keras.layers.LayerNormalization(name=f"layer_norm")
        self.perceptron = tf.keras.layers.Dense(units=d_out * 4, activation=tf.keras.activations.gelu, name=f"perceptron")
        self.projection = tf.keras.layers.Dense(units=d_out, name=f"projection")
        self.debug = False

    def call(self, inputs):
        if self.debug:
            print(f".... input to    mlp: {tldr(inputs)}")
        x = self.layer_norm(inputs)
        x = self.perceptron(x)
        x = self.projection(x)
        if self.debug:
            print(f".... output from mlp: {tldr(x)}")
        return x

In [228]:
class Block(tf.keras.layers.Layer):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, b, qkv_bias=False):
        super().__init__(name=f'block-{b}')
        self.b = b       

        self.attention = AttentionLayer(d_in, d_out, context_length, dropout, num_heads, b)
        self.mlp = MultiLayerPerceptron(d_out, b)
        self.debug = False

    def build(self, input_shape):
        # input_shape[-1] is the number of features from the previous layer
        if self.debug:         
            print("..Block input_shape=", input_shape)
        super().build(input_shape)          

    def call(self, inputs):
        if self.debug:
            print()
            print(f".. input to    block {self.b}: {tldr(inputs)}")
        x = inputs
        a = self.attention(x)
        x = x + a
        m = self.mlp(x)
        x = x + m
        if self.debug:
            print(f".. output from block {self.b}: {tldr(x)}")
        return x

In [234]:
class Transformer(tf.keras.layers.Layer):
    def __init__(self, blocks_num, d_in, d_out, context_length, dropout, num_heads):
        super().__init__(name="transformer")
        self.blocks_num = blocks_num
        self.blocks = []
        self.debug = False
        for b in range(blocks_num):
            block = Block(d_in, d_out, context_length, dropout, num_heads, b)
            self.blocks.append(block)
        self.layer_norm = tf.keras.layers.LayerNormalization(name=f"layer_norm")

    def build(self, input_shape):
        # input_shape[-1] is the number of features from the previous layer
        if self.debug:        
            print("..Transformer input_shape=", input_shape)
        super().build(input_shape)        
        
    def call(self, inputs):
        x = inputs
        for b in range(self.blocks_num):
            x = self.blocks[b](x)

        x = self.layer_norm(x)
        return x

In [258]:
@tf.keras.saving.register_keras_serializable()
class GPT2k(tf.keras.Model):
    def __init__(self, config, name=None, trainable=True, dtype=None):
        super().__init__(name=name)
        self.trainable = trainable
        self.embedding_size=config['n_embd']
        self.vocab_size=config['n_vocab']
        self.max_position_length=config['n_ctx']
        self.blocks_num = config["n_layer"]
        d_in=config['n_embd']
        d_out=config['n_embd']
        context_length = config['n_ctx']
        num_heads = config['n_head']
        self.embedding = Embedding(embedding_size=self.embedding_size, vocab_size=self.vocab_size, max_position_length=self.max_position_length)
        self.transformer = Transformer(self.blocks_num, d_in, d_out, context_length, dropout=None, num_heads=num_heads)
        self.debug = False
        
    def call(self, inputs):
        x = self.embedding(inputs)
        # return x
        x = self.transformer(x)

        if self.debug:
            print(f".. input to final matmul: {tldr(x)}")
        logits = tf.matmul(x, self.embedding.word_embedding.weights[0], transpose_b=True)
        if self.debug:
            print(f".. final logits: {tldr(logits)}")
        return logits


In [259]:
config124M = {'n_embd': 768, 'n_vocab': 50257, 'n_ctx': 1024, 'n_layer': 12, 'n_head': 12}
gpt2k=GPT2k(config124M)

In [260]:
len(gpt2k.variables)

0

In [261]:
# gpt2k.embedding.build((None, 768))
gpt2k.build((2, 768))


In [262]:
gpt2k.summary()

Model: "gpt2k_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  39383808  
                                                                 
 transformer (Transformer)   multiple                  85056000  
                                                                 
Total params: 124439808 (474.70 MB)
Trainable params: 124439808 (474.70 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [263]:
gpt2k.get_config()

{'name': 'gpt2k_25',
 'trainable': True,
 'dtype': 'float32',
 'config': {'n_embd': 768,
  'n_vocab': 50257,
  'n_ctx': 1024,
  'n_layer': 12,
  'n_head': 12}}

In [264]:
gpt2k.embedding.word_embedding.built

True

In [265]:
b=0
gpt2k.transformer.blocks[b].attention.self_attention.query_layer.built


True

In [266]:
model_dir="openai_gpt2_weights/124M"
tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, config124M)

In [267]:

def load_weights_into_gpt2k(model, params):
    assert model.built
    model.embedding.word_embedding.set_weights([np.array(params['wte'])])
    model.embedding.position_embedding.set_weights([np.array(params['wpe'])])
    for b in range(len(params["blocks"])):
        assert model.transformer.blocks[b].attention.layer_norm.built
        model.transformer.blocks[b].attention.layer_norm.beta = tf.Variable(params["blocks"][b]["ln_1"]["b"]) 
        model.transformer.blocks[b].attention.layer_norm.gamma = tf.Variable(params["blocks"][b]["ln_1"]["g"]) 


        # gpt.transformer.blocks[b].attention.layer_norm.beta_initializer = tf.keras.initializers.Constant(params["blocks"][b]["ln_1"]["b"]) 
        # gpt.transformer.blocks[b].attention.layer_norm.gamma_initializer = tf.keras.initializers.Constant(params["blocks"][b]["ln_1"]["g"]) 
        # gpt.transformer.blocks[b].attention.layer_norm.build((None, None, n_embd))


        q_w, k_w, v_w = np.split((params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1)
        q_b, k_b, v_b = np.split((params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1)
        
        # assert gpt.transformer.blocks[b].attention.self_attention.query_layer.built
        model.transformer.blocks[b].attention.self_attention.query_layer.set_weights([q_w, q_b])
        # assert gpt.transformer.blocks[b].attention.self_attention.key_layer.built
        model.transformer.blocks[b].attention.self_attention.key_layer.set_weights([k_w, k_b])
        # assert gpt.transformer.blocks[b].attention.self_attention.value_layer.built
        model.transformer.blocks[b].attention.self_attention.value_layer.set_weights([v_w, v_b])
    
        # AttentionLayer projection
        # model.transformer.blocks[b].attention.projection.build((None, n_embd))
        # assert gpt.transformer.blocks[b].attention.projection.built
        model.transformer.blocks[b].attention.projection.set_weights([params["blocks"][b]["attn"]["c_proj"]["w"], params["blocks"][b]["attn"]["c_proj"]["b"]])      
    
        # MultiLayerPerceptron layer_norm
        model.transformer.blocks[b].mlp.layer_norm.beta = tf.Variable(params["blocks"][b]["ln_2"]["b"]) 
        model.transformer.blocks[b].mlp.layer_norm.gamma = tf.Variable(params["blocks"][b]["ln_2"]["g"])
        # model.transformer.blocks[b].mlp.layer_norm.build((None, None, n_embd))
        # assert gpt.transformer.blocks[b].mlp.layer_norm.built
    
        # MultiLayerPerceptron perceptron
        # model.transformer.blocks[b].mlp.perceptron.build((None, n_embd))
        # assert gpt.transformer.blocks[b].mlp.perceptron.built
        model.transformer.blocks[b].mlp.perceptron.set_weights([params["blocks"][b]["mlp"]["c_fc"]["w"], params["blocks"][b]["mlp"]["c_fc"]["b"]])
        # MultiLayerPerceptron projection
        mlp_proj_embd = params["blocks"][b]["mlp"]["c_proj"]["w"].shape[0]
        # gpt.transformer.blocks[b].mlp.projection.build((None, mlp_proj_embd))
        # assert gpt.transformer.blocks[b].mlp.projection.built
        model.transformer.blocks[b].mlp.projection.set_weights([params["blocks"][b]["mlp"]["c_proj"]["w"], params["blocks"][b]["mlp"]["c_proj"]["b"]])           
        
        
        
    model.transformer.layer_norm.beta = tf.Variable(params["b"])
    model.transformer.layer_norm.gamma = tf.Variable(params["g"])

In [268]:
load_weights_into_gpt2k(gpt2k, params)

### Validate embedding layer

In [269]:
x_effort = tf.constant([[6109, 3626, 6100, 345]])
x_effort_emb = gpt2k.embedding(x_effort)
assert x_effort_emb.shape == [1, 4, 768]
assert np.allclose(x_effort_emb[0][0][:3], np.array([0.07927368, -0.2979193 ,  0.08817437]), rtol=1e-3, atol=1e-3)

### Validate attention.layer_norm

In [270]:
y = gpt2k.transformer.blocks[0].attention.layer_norm(x_effort_emb)
assert y.shape == [1, 4, 768]
assert np.allclose(y[0][0][:3], np.array([0.047223  , -0.11664161, -0.02536647]), rtol=1e-3, atol=1e-3)

### Validate self_attention

In [271]:
y = gpt2k.transformer.blocks[0].attention.self_attention(x_effort_emb)
assert y.shape == [1, 4, 768]
assert np.allclose(y[0][0][:3], np.array([0.28434375, -0.00881347,  0.34210888]), rtol=1e-3, atol=1e-3)

### Validate the entire model

In [272]:
y = gpt2k(x_effort)
assert y.shape == [1, 4, 50257]
assert np.allclose(y[0][0][:3], np.array([-35.521214, -34.924126, -38.39469]), rtol=1e-2, atol=1e-2)
tldr(y)

'float32 (1, 4, 50257) [-35.526115 -34.928413 -38.399166]'

### Write to checkpoint

In [250]:
chkp_path = "my_checkpoint"
checkpoint = tf.train.Checkpoint(model=gpt2k)
checkpoint.write(chkp_path)

'my_checkpoint'

In [251]:
len(gpt2k.variables)

196

### Reload checkpoint into new model gpt2z

In [252]:
config124M = {'n_embd': 768, 'n_vocab': 50257, 'n_ctx': 1024, 'n_layer': 12, 'n_head': 12}
gpt2z=GPT2k(config124M)
checkpoint = tf.train.Checkpoint(model=gpt2z)
checkpoint.restore(chkp_path)
# gpt2z.build((2, 768))
# assert len(gpt2z.variables) == len(gpt2k.variables)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7f5df8392110>

### Revalidate

In [253]:
x_effort = tf.constant([[6109, 3626, 6100, 345]])
x_effort_emb = gpt2z.embedding(x_effort)
assert x_effort_emb.shape == [1, 4, 768]
assert np.allclose(x_effort_emb[0][0][:3], np.array([0.07927368, -0.2979193 ,  0.08817437]), rtol=1e-3, atol=1e-3)

In [254]:
y = gpt2z(x_effort)
assert y.shape == [1, 4, 50257]
assert np.allclose(y[0][0][:3], np.array([-35.521214, -34.924126, -38.39469]), rtol=1e-2, atol=1e-2)
tldr(y)

'float32 (1, 4, 50257) [-35.526115 -34.928413 -38.399166]'

In [273]:
gpt2k.save('gpt2k.keras')  # The file needs to end with the .keras extension

In [274]:
model = tf.keras.models.load_model('gpt2k.keras')

In [275]:
y = model(x_effort)

In [276]:
assert y.shape == [1, 4, 50257]
assert np.allclose(y[0][0][:3], np.array([-35.521214, -34.924126, -38.39469]), rtol=1e-2, atol=1e-2)
tldr(y)

'float32 (1, 4, 50257) [-35.526115 -34.928413 -38.399166]'

In [277]:
len(model.variables)

196