In [None]:
import tidygrad
from tidygrad.tensor import Tensor
from tidygrad.functional import Embedding, embedding
import numpy as np
from lovely_numpy import Lo

from transformers import GPT2Tokenizer

In [None]:
from safetensors import safe_open

In [None]:
# Prompt text and the pretrained GPT-2 tokenizer (loaded by the name "gpt2").
text = "In a hole in the ground there lived a"
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Real encoding of `text` is disabled for now; see the placeholder ids below.
# tokens = tokenizer.encode(text)  # returns a list of integers
# tokens = Tensor(tokens)

# Placeholder token ids (1..10) used instead of the encoded prompt.
tokens = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [None]:
# Open the weights file "model.safetensors" (path is relative to the working
# directory). Later cells read individual tensors with `model.get_tensor(name)`.
# NOTE(review): the handle is not closed anywhere in the cells shown here.
model = safe_open("model.safetensors", framework="np")

In [None]:
def layer_norm(x, w, b, eps=1e-5):
    """Layer normalization over the last axis, followed by an affine transform.

    Computes ``(x - mean) / sqrt(var + eps) * w + b`` where the mean and the
    (biased, i.e. divided by N) variance are taken over the last axis of `x`.

    Args:
        x: input to normalize; must support ``mean(axis=, keepdims=)``,
            ``pow`` and the arithmetic operators.
        w: scale applied after normalization.
        b: offset added after scaling.
        eps: small constant added to the variance for numerical stability.

    Returns:
        The normalized, scaled and shifted result.
    """
    mu = x.mean(axis=-1, keepdims=True)
    centered = x - mu

    # Biased variance over the last axis.
    var = (centered * centered).mean(axis=-1, keepdims=True)

    # eps goes inside the square root (the standard LayerNorm definition),
    # not onto the standard deviation.
    return (centered / (var + eps).pow(0.5)) * w + b

In [None]:
from tidygrad.functional import sigmoid, tanh
import math

In [None]:
def gelu(x: Tensor):
    """Sigmoid-based GELU approximation: ``x * sigmoid(1.702 * x)``."""
    gate = sigmoid(1.702 * x)
    return x * gate




def new_gelu(input):
    """Tanh-based GELU approximation.

    Returns ``0.5 * input * (1 + tanh(sqrt(2 / pi) * (input + 0.044715 * input**3)))``.
    `input` must provide a ``pow`` method and the arithmetic operators.
    """
    cubic_term = input + 0.044715 * input.pow(3)
    squashed = tanh(math.sqrt(2.0 / math.pi) * cubic_term)
    return 0.5 * input * (1.0 + squashed)

In [None]:
def split_tensor(t: Tensor, axis: int, n: int):
    """Split `t` into `n` equally sized chunks along `axis`.

    Args:
        t: object with a ``shape`` attribute that supports indexing with a
            tuple of slices.
        axis: axis to split along; negative values count from the end.
        n: number of chunks.

    Returns:
        A list of `n` slices of `t`, in order along `axis`.

    Raises:
        AssertionError: if the size of `axis` is not divisible by `n`.
    """
    size = t.shape[axis]
    step = size // n
    assert step * n == size, "Can't split tensor evenly"

    # Normalize a negative axis so it can be used to build the index tuple.
    axis = axis % len(t.shape)
    leading = (slice(None),) * axis

    # Slice along the requested axis (previously the last axis was always used).
    return [t[leading + (slice(k * step, (k + 1) * step),)] for k in range(n)]

def stack_tensors(tensors: list, axis=0):
    """Combine `tensors` into one Tensor by interleaving them along axis 0.

    The result has ``tensors[0].shape[0] * len(tensors)`` rows; rows
    ``k, k + len(tensors), k + 2 * len(tensors), ...`` are filled from the
    k-th input. Only ``axis=0`` is accepted.

    NOTE(review): despite the name, no new axis is created; confirm that
    interleaving is the intended layout.
    """
    assert axis == 0, "Only axis=0 is supported for now"

    count = len(tensors)
    first_shape = tensors[0].shape
    out = Tensor(np.zeros((first_shape[0] * count, *first_shape[1:])))

    for offset, t in enumerate(tensors):
        # Only the trailing dimensions are compared against the first input.
        assert t.shape[1:] == tensors[0].shape[1:], "All tensors must have the same shape"
        # Strided assignment: every `count`-th row starting at `offset`.
        out[offset::count] = t
    return out


def _causal_self_attention(q, k, v, n_heads):
    """Multi-head causal self-attention on plain NumPy arrays.

    The last axis of `q`, `k` and `v` is split evenly into `n_heads` heads and
    the second-to-last axis is treated as the sequence position. Each position
    attends only to itself and earlier positions. The per-head outputs are
    concatenated back along the last axis.
    """
    head_dim = q.shape[-1] // n_heads

    # Move the heads to a new leading axis: [..., seq, dim] -> [heads, ..., seq, head_dim].
    q_heads = np.stack(np.split(q, n_heads, axis=-1), axis=0)
    k_heads = np.stack(np.split(k, n_heads, axis=-1), axis=0)
    v_heads = np.stack(np.split(v, n_heads, axis=-1), axis=0)

    scores = np.matmul(q_heads, k_heads.swapaxes(-1, -2)) / np.sqrt(head_dim)

    # Additive causal mask: 0 on and below the diagonal, -inf above it.
    n_query, n_key = scores.shape[-2:]
    scores = scores + np.triu(np.full((n_query, n_key), -np.inf), k=1)

    # Subtract the row maximum before exponentiating so large scores cannot
    # overflow; masked entries become exp(-inf) == 0.
    scores = scores - scores.max(axis=-1, keepdims=True)
    weights = np.exp(scores)
    weights = weights / weights.sum(axis=-1, keepdims=True)

    per_head = np.matmul(weights, v_heads)

    # Iterating the leading axis yields one array per head; join them on the last axis.
    return np.concatenate(list(per_head), axis=-1)


def transformer_block(model, i, inputs, n_heads=12):
    """Apply transformer block `i` to `inputs`.

    Weights are read from `model` with ``model.get_tensor`` under the
    ``h.{i}.*`` names. The block performs, in order: layer norm, causal
    multi-head self-attention, output projection, residual add, layer norm,
    MLP (``c_fc`` -> ``new_gelu`` -> ``c_proj``), residual add.

    Args:
        model: object exposing ``get_tensor(name)``; the attention weights it
            returns are split with NumPy.
        i: index of the block, used to build the weight names.
        inputs: Tensor of activations entering the block.
        n_heads: number of attention heads. Defaults to 12, the value that was
            previously hard-coded.

    Returns:
        The Tensor produced by the second residual addition.
    """
    ln_1_w = model.get_tensor(f"h.{i}.ln_1.weight")
    ln_1_b = model.get_tensor(f"h.{i}.ln_1.bias")

    ln_1 = layer_norm(inputs, ln_1_w, ln_1_b)

    # The q, k and v projections are stored fused along the last axis.
    attn_w_qkv = model.get_tensor(f"h.{i}.attn.c_attn.weight")
    attn_b_qkv = model.get_tensor(f"h.{i}.attn.c_attn.bias")

    attn_w_q, attn_w_k, attn_w_v = np.split(attn_w_qkv, 3, axis=-1)
    attn_b_q, attn_b_k, attn_b_v = np.split(attn_b_qkv, 3, axis=-1)

    q = ln_1.mmul(attn_w_q) + attn_b_q
    k = ln_1.mmul(attn_w_k) + attn_b_k
    v = ln_1.mmul(attn_w_v) + attn_b_v

    # Attention runs on the raw `.data` arrays and is wrapped in a fresh Tensor.
    # NOTE(review): this presumably disconnects attention from any autograd
    # graph; acceptable for inference, confirm before training.
    attention_reshaped_np = _causal_self_attention(q.data, k.data, v.data, n_heads)

    cproj_w = Tensor(model.get_tensor(f"h.{i}.attn.c_proj.weight"))
    cproj_b = Tensor(model.get_tensor(f"h.{i}.attn.c_proj.bias"))

    attention_reshaped = Tensor(attention_reshaped_np)

    crosstalk = attention_reshaped.mmul(cproj_w) + cproj_b

    after_residual = crosstalk + inputs

    ln2_w = Tensor(model.get_tensor(f"h.{i}.ln_2.weight"), name="ln2_w")
    ln2_b = Tensor(model.get_tensor(f"h.{i}.ln_2.bias"), name="ln2_b")

    after_ln2 = layer_norm(after_residual, ln2_w, ln2_b)

    mlp_c_fc_w = Tensor(model.get_tensor(f"h.{i}.mlp.c_fc.weight"), name="fc_w")
    mlp_c_fc_b = Tensor(model.get_tensor(f"h.{i}.mlp.c_fc.bias"), name="fc_b")

    after_up = after_ln2.mmul(mlp_c_fc_w) + mlp_c_fc_b
    after_up_a = new_gelu(after_up)

    mlp_c_proj_w = Tensor(model.get_tensor(f"h.{i}.mlp.c_proj.weight"), name="proj_w")
    mlp_c_proj_b = Tensor(model.get_tensor(f"h.{i}.mlp.c_proj.bias"), name="proj_b")

    after_down = after_up_a.mmul(mlp_c_proj_w) + mlp_c_proj_b

    output = after_down + after_residual
    return output


# res = transformer_block(model, 0, embeddings)

In [None]:
def transformer(model, tokens, n_layers=12):
    """Run the transformer stack on `tokens` and return the final hidden states.

    Looks up token and position embeddings (``wte.weight`` / ``wpe.weight``),
    adds them, passes the sum through `n_layers` transformer blocks and applies
    the final layer norm (``ln_f.weight`` / ``ln_f.bias``).

    Args:
        model: object exposing ``get_tensor(name)`` for the weights.
        tokens: sequence of token ids; positions are ``0 .. len(tokens) - 1``.
        n_layers: number of blocks to apply. Defaults to 12, the value that
            was previously hard-coded.

    Returns:
        The layer-normalized output of the last block.
    """
    wte = Tensor(model.get_tensor("wte.weight"))
    wpe = Tensor(model.get_tensor("wpe.weight"))

    token_embeddings = embedding(wte, tokens)

    positions = np.arange(len(tokens))
    position_embeddings = embedding(wpe, positions)

    hidden = token_embeddings + position_embeddings

    # Each block consumes the previous block's output.
    for i in range(n_layers):
        hidden = transformer_block(model, i, hidden)

    ln_f_w = Tensor(model.get_tensor("ln_f.weight"))
    ln_f_b = Tensor(model.get_tensor("ln_f.bias"))

    return layer_norm(hidden, ln_f_w, ln_f_b)


# NOTE(review): presumably switches off gradient tracking in tidygrad for
# inference; confirm against the tidygrad sources.
tidygrad.tensor._grad = False

# Placeholder token ids (1..10) for a smoke test of the transformer stack.
tokens = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# Run the stack on the placeholder ids and print the resulting Tensor summary.
res = transformer(model, tokens)
print(res)

Tensor[10, 768](" op=Add):
    v=array[10, 768] n=7680 (60Kb) x∈[-15.634, 197.272] μ=0.346 σ=6.708
    


In [None]:
# Prompt and tokenizer are re-created here; this rebinds the `text`,
# `tokenizer` and `tokens` names defined in earlier cells.
text = "In a hole in the ground there lived a"
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Encode the prompt into token ids and show them.
tokens = tokenizer.encode(text)  # returns a list of integers
print(tokens)
# tokens = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


def gpt2_language_model(model, token_ids):
    """Return vocabulary logits for every position of `token_ids`.

    The hidden states from `transformer` are multiplied by the ``wte.weight``
    matrix with its last two axes swapped, i.e. the token-embedding matrix is
    reused as the output projection.
    """
    hidden = transformer(model, token_ids)

    # Swap the last two axes of the embedding matrix so `mmul` projects the
    # hidden states onto the vocabulary.
    unembedding = Tensor(model.get_tensor("wte.weight").swapaxes(-1, -2))
    return hidden.mmul(unembedding)

# Compute logits for the encoded prompt and print the Tensor summary.
res = gpt2_language_model(model, tokens)
print(res)
# Greedy pick: arg-max over the last axis, taken at the last position, then
# decoded back to text. As the cell's last expression it is displayed.
tokenizer.decode( res.data.argmax(axis=-1)[-1])

[818, 257, 7604, 287, 262, 2323, 612, 5615, 257]
Tensor[9, 50257](" op=Matmul):
    v=array[9, 50257] n=452313 (3.5Mb) x∈[-119.274, -27.010] μ=-87.018 σ=20.724
    


' man'

In [None]:
# Greedy generation of 10 more tokens.
# NOTE: `tokens` is extended in place, so re-running this cell continues from
# the already-extended list instead of from the original prompt.
print("=== Generating ===")
print("Input: ", tokenizer.decode(tokens))
for i in range(10):
    # The model is re-run on the whole sequence for every new token.
    res = gpt2_language_model(model, tokens)
    # Append the highest-scoring token id at the last position.
    tokens.append(res.data.argmax(axis=-1)[-1])
    print("Output:", tokenizer.decode(tokens))

=== Generating ===
Input:  In a hole in the ground there lived a


Output: In a hole in the ground there lived a man
Output: In a hole in the ground there lived a man who
Output: In a hole in the ground there lived a man who had
Output: In a hole in the ground there lived a man who had been
Output: In a hole in the ground there lived a man who had been killed
Output: In a hole in the ground there lived a man who had been killed by
Output: In a hole in the ground there lived a man who had been killed by a
Output: In a hole in the ground there lived a man who had been killed by a bullet
Output: In a hole in the ground there lived a man who had been killed by a bullet.
Output: In a hole in the ground there lived a man who had been killed by a bullet.

