In [None]:
from tidygrad.tensor import Tensor
from tidygrad.functional import Embedding, embedding
import numpy as np
from lovely_numpy import Lo

from transformers import GPT2Tokenizer

In [None]:
from safetensors import safe_open

In [None]:
# The tokenizer and prompt are prepared here, but the cells below run on fixed
# placeholder ids rather than on `tokenizer.encode(text)`.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
text = "In a hole in the ground there lived a"

# Placeholder token ids 1..10 as a plain list of ints.
tokens = list(range(1, 11))

In [None]:
# Show the token ids that are fed to the embedding lookup further down.
tokens

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [None]:
# Open the checkpoint file; tensors are later read one at a time via `model.get_tensor(name)`.
# NOTE(review): the handle is never closed in this notebook — consider a `with` block
# if the file should be released deterministically.
model = safe_open("model.safetensors", framework="np")

In [None]:
# Peek at the last ten tensor names stored in the checkpoint.
model.keys()[-10:]

['h.9.ln_2.bias',
 'h.9.ln_2.weight',
 'h.9.mlp.c_fc.bias',
 'h.9.mlp.c_fc.weight',
 'h.9.mlp.c_proj.bias',
 'h.9.mlp.c_proj.weight',
 'ln_f.bias',
 'ln_f.weight',
 'wpe.weight',
 'wte.weight']

In [None]:
# Token-embedding (wte) and position-embedding (wpe) tables, wrapped as Tensors.
wte, wpe = (
    Tensor(model.get_tensor(key))
    for key in ("wte.weight", "wpe.weight")
)

# Print the position table first, then the token table.
print(wpe, wte, sep="\n")

Tensor[1024, 768](name="?" op=Load):
    v=array[1024, 768] f32 n=786432 (3Mb) x∈[-4.538, 4.065] μ=-0.001 σ=0.123
    
Tensor[50257, 768](name="?" op=Load):
    v=array[50257, 768] f32 n=38597376 (0.1Gb) x∈[-1.270, 1.785] μ=0.000 σ=0.144
    


In [None]:
import tidygrad

In [None]:
# One position index per token: 0 .. len(tokens) - 1.
positions = np.arange(len(tokens))

# Row lookups into the two embedding tables.
token_embeddings = embedding(wte, tokens)
position_embeddings = embedding(wpe, positions)

# The element-wise sum is the input to the first transformer block.
embeddings = token_embeddings + position_embeddings
Lo(embeddings)

Tensor[10, 768](name="(embedding(?)+embedding(?))" op=Add):
    v=array[10, 768] f32 n=7680 (30Kb) x∈[-4.511, 3.938] μ=-9.411e-05 σ=0.219
    

In [None]:
# Scale and shift parameters of block 0's first layer norm (raw arrays, not Tensors).
ln_1_w, ln_1_b = (
    model.get_tensor(key)
    for key in ("h.0.ln_1.weight", "h.0.ln_1.bias")
)

In [None]:
def layer_norm(x, w, b, eps=1e-5):
    """Normalize `x` over its last axis, then apply a per-feature scale and shift.

    Computes ``(x - mean) / sqrt(var + eps) * w + b`` with the biased
    (divide-by-N) variance. `eps` is added to the variance inside the square
    root, which is the standard LayerNorm definition; adding it to the standard
    deviation instead gives visibly different results on low-variance rows.

    Args:
        x: Tensor-like object exposing `.mean(axis=, keepdims=)`, `.pow(...)`
            and the usual arithmetic operators.
        w: Scale, broadcast against `x`.
        b: Shift, broadcast against `x`.
        eps: Small constant that keeps the denominator away from zero.

    Returns:
        The normalized, scaled and shifted result, with the same shape as `x`.
    """
    mu = x.mean(axis=-1, keepdims=True)
    centered = x - mu
    # Biased variance, computed from the already-centered values.
    var = centered.pow(2).mean(axis=-1, keepdims=True)
    return centered / (var + eps).pow(0.5) * w + b

In [None]:
# Layer-normalize the summed embeddings with block 0's ln_1 parameters;
# the result is the input to the q/k/v projections below.
ln_1 = layer_norm(embeddings, ln_1_w, ln_1_b)
ln_1

#  tensor[10, 768] n=7680 (30Kb) x∈[-0.788, 0.579] μ=-0.005 σ=0.106

Tensor[10, 768](name="(((((embedding(?)+embedding(?))-(sum((embedding(?)+embedding(?)))/?))/(pow((sum(var)/?),0.5)+?))*?)+?)" op=Add):
    v=array[10, 768] f32 n=7680 (30Kb) x∈[-0.788, 0.579] μ=-0.005 σ=0.106
    

In [None]:
# Block 0's fused attention projection parameters (query, key and value in one array).
attn_w_qkv, attn_b_qkv = (
    model.get_tensor(key)
    for key in ("h.0.attn.c_attn.weight", "h.0.attn.c_attn.bias")
)

# Cut each fused array into three equal parts along its last axis: q, k, v in that order.
attn_w_q, attn_w_k, attn_w_v = np.split(attn_w_qkv, 3, axis=-1)
attn_b_q, attn_b_k, attn_b_v = np.split(attn_b_qkv, 3, axis=-1)

In [None]:
# Query, key and value projections of the normalized input, evaluated in that order.
q, k, v = (
    ln_1.mmul(weight) + bias
    for weight, bias in (
        (attn_w_q, attn_b_q),
        (attn_w_k, attn_b_k),
        (attn_w_v, attn_b_v),
    )
)

# Split the feature axis into 12 heads and stack them on a new leading axis.
# This operates on the raw `.data` arrays, so everything from here on is plain NumPy.
q_chunked_np = np.stack(np.array_split(q.data, 12, axis=-1), axis=0)
k_chunked_np = np.stack(np.array_split(k.data, 12, axis=-1), axis=0)
v_chunked_np = np.stack(np.array_split(v.data, 12, axis=-1), axis=0)

# Summaries of the per-head queries, keys and transposed keys, one per line.
print(
    Lo(q_chunked_np),
    Lo(k_chunked_np),
    Lo(k_chunked_np.swapaxes(-1, -2)),
    sep="\n",
)

# Scaled dot-product scores per head: (q @ k^T) / sqrt(64).
attention = (q_chunked_np @ k_chunked_np.swapaxes(-1, -2)) / np.sqrt(64)
Lo(attention)

array[12, 10, 64] f32 n=7680 (30Kb) x∈[-4.234, 4.473] μ=-0.064 σ=0.971
array[12, 10, 64] f32 n=7680 (30Kb) x∈[-6.097, 6.787] μ=0.034 σ=1.350
array[12, 64, 10] f32 n=7680 (30Kb) x∈[-6.097, 6.787] μ=0.034 σ=1.350


array[12, 10, 10] f32 n=1200 (4.7Kb) x∈[-7.848, 11.893] μ=-0.591 σ=2.526

In [None]:
# Causal mask: ones on and below the diagonal of each head's score matrix.
mask = np.tril(np.ones(attention.shape))
# Zeroing the exponentials of masked positions removes them from the softmax sum.
ee = np.exp(attention) * mask

# Normalize each row over the positions that survived the mask.
softmaxed = ee / np.sum(ee, axis=-1, keepdims=True)

# Weighted sum of the value vectors, per head.
attention_output = softmaxed @ v_chunked_np

# Merge the heads back: concatenating along the last axis joins the per-head outputs.
attention_chunks = attention_output[:]
Lo(attention_chunks[0])
attention_reshaped_np = np.concatenate(attention_chunks, axis=-1)
Lo(attention_reshaped_np)

array[10, 768] n=7680 (60Kb) x∈[-1.057, 1.432] μ=0.003 σ=0.166

In [None]:
# Output projection of the attention sub-layer (block 0).
cproj_w_np, cproj_b_np = (
    model.get_tensor(key)
    for key in ("h.0.attn.c_proj.weight", "h.0.attn.c_proj.bias")
)
cproj_w, cproj_b = Tensor(cproj_w_np), Tensor(cproj_b_np)

# Re-wrap the NumPy attention result so the remaining steps run on Tensors again.
attention_reshaped = Tensor(attention_reshaped_np)

crosstalk = attention_reshaped.mmul(cproj_w) + cproj_b
print(crosstalk)

# First residual connection: add the block input back in.
after_residual = crosstalk + embeddings
print(after_residual)

# Second layer norm, applied before the MLP.
ln2_w, ln2_b = (
    Tensor(model.get_tensor(key), name=label)
    for key, label in (
        ("h.0.ln_2.weight", "ln2_w"),
        ("h.0.ln_2.bias", "ln2_b"),
    )
)
after_ln2 = layer_norm(after_residual, ln2_w, ln2_b)

print(after_ln2)

Tensor[10, 768](name="((?@?)+?)" op=Add):
    v=array[10, 768] n=7680 (60Kb) x∈[-14.188, 14.257] μ=0.011 σ=1.083
    
Tensor[10, 768](name="(((?@?)+?)+(embedding(?)+embedding(?)))" op=Add):
    v=array[10, 768] n=7680 (60Kb) x∈[-14.241, 14.485] μ=0.011 σ=1.123
    
Tensor[10, 768](name="(((((((?@?)+?)+(embedding(?)+embedding(?)))-(sum((((?@?)+?)+(embedding(?)+embedding(?))))/?))/(pow((sum(var)/?),0.5)+?))*ln2_w)+ln2_b)" op=Add):
    v=array[10, 768] n=7680 (60Kb) x∈[-2.793, 1.674] μ=0.005 σ=0.160
    


In [None]:
# Up-projection of the MLP (block 0).
mlp_c_fc_w, mlp_c_fc_b = (
    Tensor(model.get_tensor(key), name=label)
    for key, label in (
        ("h.0.mlp.c_fc.weight", "fc_w"),
        ("h.0.mlp.c_fc.bias", "fc_b"),
    )
)

# Pre-activation values; the GELU is applied in a later cell.
after_up = after_ln2.mmul(mlp_c_fc_w) + mlp_c_fc_b

print(after_up)

Tensor[10, 3072](name="(((((((((?@?)+?)+(embedding(?)+embedding(?)))-(sum((((?@?)+?)+(embedding(?)+embedding(?))))/?))/(pow((sum(var)/?),0.5)+?))*ln2_w)+ln2_b)@fc_w)+fc_b)" op=Add):
    v=array[10, 3072] n=30720 (0.2Mb) x∈[-6.346, 10.617] μ=-1.086 σ=0.855
    


In [None]:
from tidygrad.functional import sigmoid, tanh
import math

In [None]:
def gelu(x: Tensor):
    """Sigmoid-based GELU approximation: ``x * sigmoid(1.702 * x)``."""
    gate = sigmoid(1.702 * x)
    return x * gate

def new_gelu(input):
    """Tanh-based GELU approximation.

    Evaluates ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    where ``x`` is `input`.
    """
    cubic_term = input + 0.044715 * input.pow(3)
    squashed = tanh(math.sqrt(2.0 / math.pi) * cubic_term)
    return 0.5 * input * (1.0 + squashed)

In [None]:
# Activation on the up-projected values.
after_up_a = new_gelu(after_up)

# Down-projection of the MLP (block 0).
mlp_c_proj_w, mlp_c_proj_b = (
    Tensor(model.get_tensor(key), name=label)
    for key, label in (
        ("h.0.mlp.c_proj.weight", "proj_w"),
        ("h.0.mlp.c_proj.bias", "proj_b"),
    )
)

after_down = after_up_a.mmul(mlp_c_proj_w) + mlp_c_proj_b

# Second residual connection. This is the output of the whole block.
# NOTE(review): this rebinds `attention_output`, which an earlier cell set to the
# NumPy per-head attention result; a distinct name would avoid stale-state confusion.
attention_output = after_down + after_residual
attention_output

Tensor[10, 768](name="((((((((((((((?@?)+?)+(embedding(?)+embedding(?)))-(sum((((?@?)+?)+(embedding(?)+embedding(?))))/?))/(pow((sum(var)/?),0.5)+?))*ln2_w)+ln2_b)@fc_w)+fc_b)*?)*(tanh((((((((((((?@?)+?)+(embedding(?)+embedding(?)))-(sum((((?@?)+?)+(embedding(?)+embedding(?))))/?))/(pow((sum(var)/?),0.5)+?))*ln2_w)+ln2_b)@fc_w)+fc_b)+(pow((((((((((?@?)+?)+(embedding(?)+embedding(?)))-(sum((((?@?)+?)+(embedding(?)+embedding(?))))/?))/(pow((sum(var)/?),0.5)+?))*ln2_w)+ln2_b)@fc_w)+fc_b),3)*?))*?))+?))@proj_w)+proj_b)+(((?@?)+?)+(embedding(?)+embedding(?))))" op=Add):
    v=array[10, 768] n=7680 (60Kb) x∈[-67.477, 97.448] μ=0.023 σ=2.375
    

In [None]:
def transformer_block(model_weigts, i, inputs, n_heads=12):
    """Run transformer block `i` on `inputs` and return the block output.

    The block applies layer norm -> causal multi-head self-attention -> output
    projection -> residual add, followed by layer norm -> MLP (up-projection,
    tanh GELU, down-projection) -> residual add.

    Both the pre-attention layer norm and the first residual use `inputs`, and
    all parameters are read from `model_weigts`, so blocks can be chained and
    the function does not depend on notebook globals for its data.

    Args:
        model_weigts: Object exposing `get_tensor(name)` for the checkpoint
            tensors (the parameter name's spelling is kept for compatibility).
        i: Index of the block; selects the `h.{i}.*` tensors.
        inputs: Tensor holding the block input.
        n_heads: Number of attention heads the feature axis is split into.

    Returns:
        Tensor with the block output.

    NOTE(review): attention is computed on the raw `.data` arrays and re-wrapped
    in a fresh Tensor, so gradients presumably do not flow back through the
    attention sub-layer — confirm against tidygrad before training with this.
    """
    print(f" === Block {i} ===")

    ln_1_w = model_weigts.get_tensor(f"h.{i}.ln_1.weight")
    ln_1_b = model_weigts.get_tensor(f"h.{i}.ln_1.bias")

    # Normalize the block input (not the original embeddings).
    ln_1 = layer_norm(inputs, ln_1_w, ln_1_b)
    print("ln_1", ln_1)

    # Fused q/k/v parameters, cut into three equal parts along the last axis.
    attn_w_qkv = model_weigts.get_tensor(f"h.{i}.attn.c_attn.weight")
    attn_b_qkv = model_weigts.get_tensor(f"h.{i}.attn.c_attn.bias")

    attn_w_q, attn_w_k, attn_w_v = np.split(attn_w_qkv, 3, axis=-1)
    attn_b_q, attn_b_k, attn_b_v = np.split(attn_b_qkv, 3, axis=-1)

    q = ln_1.mmul(attn_w_q) + attn_b_q
    k = ln_1.mmul(attn_w_k) + attn_b_k
    v = ln_1.mmul(attn_w_v) + attn_b_v

    # Split the feature axis into heads and stack them on a new leading axis.
    q_chunked_np = np.stack(np.array_split(q.data, n_heads, axis=-1), axis=0)
    k_chunked_np = np.stack(np.array_split(k.data, n_heads, axis=-1), axis=0)
    v_chunked_np = np.stack(np.array_split(v.data, n_heads, axis=-1), axis=0)

    # Scale by sqrt(per-head width); this is sqrt(64) for 768 features and 12 heads.
    head_dim = q_chunked_np.shape[-1]
    attention = np.matmul(q_chunked_np, k_chunked_np.swapaxes(-1, -2)) / np.sqrt(head_dim)

    # Causal softmax: zero the exponentials above the diagonal, then normalize rows.
    # NOTE(review): no max-subtraction is done before `exp`, so very large scores
    # could overflow.
    mask = np.tril(np.ones(attention.shape), k=0)
    ee = np.exp(attention) * mask

    softmaxed = ee / ee.sum(axis=-1, keepdims=True)

    # Weighted values per head, then merge heads back along the feature axis.
    attention_output = np.matmul(softmaxed, v_chunked_np)
    attention_chunks = attention_output[:]
    attention_reshaped_np = np.concatenate(attention_chunks, axis=-1)

    cproj_w = Tensor(model_weigts.get_tensor(f"h.{i}.attn.c_proj.weight"))
    cproj_b = Tensor(model_weigts.get_tensor(f"h.{i}.attn.c_proj.bias"))

    attention_reshaped = Tensor(attention_reshaped_np)

    crosstalk = attention_reshaped.mmul(cproj_w) + cproj_b

    # First residual: add the block input back in.
    after_residual = crosstalk + inputs

    ln2_w = Tensor(model_weigts.get_tensor(f"h.{i}.ln_2.weight"), name="ln2_w")
    ln2_b = Tensor(model_weigts.get_tensor(f"h.{i}.ln_2.bias"), name="ln2_b")

    after_ln2 = layer_norm(after_residual, ln2_w, ln2_b)

    mlp_c_fc_w = Tensor(model_weigts.get_tensor(f"h.{i}.mlp.c_fc.weight"), name="fc_w")
    mlp_c_fc_b = Tensor(model_weigts.get_tensor(f"h.{i}.mlp.c_fc.bias"), name="fc_b")

    after_up = after_ln2.mmul(mlp_c_fc_w) + mlp_c_fc_b

    after_up_a = new_gelu(after_up)

    mlp_c_proj_w = Tensor(model_weigts.get_tensor(f"h.{i}.mlp.c_proj.weight"), name="proj_w")
    mlp_c_proj_b = Tensor(model_weigts.get_tensor(f"h.{i}.mlp.c_proj.bias"), name="proj_b")

    after_down = after_up_a.mmul(mlp_c_proj_w) + mlp_c_proj_b

    # Second residual: MLP output plus the post-attention stream.
    output = after_down + after_residual
    return output


# Smoke-test: run block 0 on the embeddings computed earlier in the notebook.
res = transformer_block(model, 0, embeddings)

 === Block 0 ===
ln_1 Tensor[10, 768](name="(((((embedding(?)+embedding(?))-(sum((embedding(?)+embedding(?)))/?))/(pow((sum(var)/?),0.5)+?))*?)+?)" op=Add):
    v=array[10, 768] f32 n=7680 (30Kb) x∈[-0.788, 0.579] μ=-0.005 σ=0.106
    


In [None]:
def transformer(model, token_ids, n_layers=2):
    """Embed `token_ids`, run them through the first `n_layers` blocks, and apply ln_f.

    Args:
        model: Object exposing `get_tensor(name)` for the checkpoint tensors.
        token_ids: Sequence of integer token ids; its length also determines
            the position indices 0 .. len(token_ids) - 1.
        n_layers: Number of transformer blocks to apply, starting at block 0.
            The default of 2 keeps the previously hard-coded loop length.
            NOTE(review): the checkpoint appears to hold more blocks than
            that; pass the full count to run the complete model.

    Returns:
        Tensor with the final layer-normalized hidden states.
    """
    wte = Tensor(model.get_tensor("wte.weight"))
    wpe = Tensor(model.get_tensor("wpe.weight"))

    # Use the function argument, not the notebook-global `tokens`.
    token_embeddings = embedding(wte, token_ids)

    positions = np.arange(len(token_ids))
    position_embeddings = embedding(wpe, positions)

    hidden = token_embeddings + position_embeddings

    # Each block consumes the previous block's output.
    for i in range(n_layers):
        hidden = transformer_block(model, i, hidden)
        print("Embedding out:", hidden)

    ln_f_w = Tensor(model.get_tensor("ln_f.weight"))
    ln_f_b = Tensor(model.get_tensor("ln_f.bias"))

    # Final layer norm over the last block's output.
    return layer_norm(hidden, ln_f_w, ln_f_b)

# Set tidygrad's module-level `_grad` flag before running the model.
# NOTE(review): presumably this switches on gradient/graph tracking — confirm against tidygrad.
tidygrad.tensor._grad = True

# Run the (truncated) transformer on the placeholder token ids.
res = transformer(model, tokens)

# Disabled sketch of the language-model head: project the final hidden states
# onto the transposed token-embedding table to obtain logits.
# def gpt2_language_model(model, token_ids):
#     res = transformer(model, token_ids)

#     wte = Tensor(model.get_tensor("wte.weight").swapaxes(-1, -2))
#     logits = res.mmul(wte)
#     return logits

# res = gpt2_language_model(model, tokens)
res


 === Block 0 ===
ln_1 Tensor[10, 768](name="(((((embedding(?)+embedding(?))-(sum((embedding(?)+embedding(?)))/?))/(pow((sum(var)/?),0.5)+?))*?)+?)" op=Add):
    v=array[10, 768] f32 n=7680 (30Kb) x∈[-0.788, 0.579] μ=-0.005 σ=0.106
    
Embedding out: Tensor[10, 768](name="((((((((((((((?@?)+?)+(embedding(?)+embedding(?)))-(sum((((?@?)+?)+(embedding(?)+embedding(?))))/?))/(pow((sum(var)/?),0.5)+?))*ln2_w)+ln2_b)@fc_w)+fc_b)*?)*(tanh((((((((((((?@?)+?)+(embedding(?)+embedding(?)))-(sum((((?@?)+?)+(embedding(?)+embedding(?))))/?))/(pow((sum(var)/?),0.5)+?))*ln2_w)+ln2_b)@fc_w)+fc_b)+(pow((((((((((?@?)+?)+(embedding(?)+embedding(?)))-(sum((((?@?)+?)+(embedding(?)+embedding(?))))/?))/(pow((sum(var)/?),0.5)+?))*ln2_w)+ln2_b)@fc_w)+fc_b),3)*?))*?))+?))@proj_w)+proj_b)+(((?@?)+?)+(embedding(?)+embedding(?))))" op=Add):
    v=array[10, 768] n=7680 (60Kb) x∈[-67.477, 97.448] μ=0.023 σ=2.375
    
 === Block 1 ===
ln_1 Tensor[10, 768](name="(((((embedding(?)+embedding(?))-(sum((embedding(?)+embedd

Tensor[10, 768](name="((((((((((((((((((?@?)+?)+(embedding(?)+embedding(?)))-(sum((((?@?)+?)+(embedding(?)+embedding(?))))/?))/(pow((sum(var)/?),0.5)+?))*ln2_w)+ln2_b)@fc_w)+fc_b)*?)*(tanh((((((((((((?@?)+?)+(embedding(?)+embedding(?)))-(sum((((?@?)+?)+(embedding(?)+embedding(?))))/?))/(pow((sum(var)/?),0.5)+?))*ln2_w)+ln2_b)@fc_w)+fc_b)+(pow((((((((((?@?)+?)+(embedding(?)+embedding(?)))-(sum((((?@?)+?)+(embedding(?)+embedding(?))))/?))/(pow((sum(var)/?),0.5)+?))*ln2_w)+ln2_b)@fc_w)+fc_b),3)*?))*?))+?))@proj_w)+proj_b)+(((?@?)+?)+(embedding(?)+embedding(?))))-(sum(((((((((((((((?@?)+?)+(embedding(?)+embedding(?)))-(sum((((?@?)+?)+(embedding(?)+embedding(?))))/?))/(pow((sum(var)/?),0.5)+?))*ln2_w)+ln2_b)@fc_w)+fc_b)*?)*(tanh((((((((((((?@?)+?)+(embedding(?)+embedding(?)))-(sum((((?@?)+?)+(embedding(?)+embedding(?))))/?))/(pow((sum(var)/?),0.5)+?))*ln2_w)+ln2_b)@fc_w)+fc_b)+(pow((((((((((?@?)+?)+(embedding(?)+embedding(?)))-(sum((((?@?)+?)+(embedding(?)+embedding(?))))/?))/(pow((sum(var)