In [None]:
import tidygrad
from tidygrad.tensor import Tensor

import tidygrad.func as F
# from tidygrad.func import  embedding, layer_norm, stack, concat
import numpy as np
from lovely_numpy import Lo

from transformers import GPT2Tokenizer

In [None]:
from safetensors import safe_open

In [None]:
# Download the model weights if needed
# !wget -c https://huggingface.co/gpt2/resolve/main/model.safetensors -O gpt2.safetensors
# !wget -c https://huggingface.co/gpt2-medium/resolve/main/model.safetensors -O gpt2-medium.safetensors
# !wget -c https://huggingface.co/gpt2-large/resolve/main/model.safetensors -O gpt2-large.safetensors
# !wget -c https://huggingface.co/gpt2-xl/resolve/main/model.safetensors -O gpt2-xl.safetensors

In [None]:
class Gpt2Variant:
    """Per-checkpoint settings needed to load and run one GPT-2 size.

    The constructor stores its arguments unchanged on the instance:
      weight_file -- path of the safetensors file holding the weights
      n_head      -- number of attention heads per transformer block
      n_layer     -- number of transformer blocks
    """

    def __init__(self, weight_file, n_head, n_layer):
        # No validation or conversion: the values are kept exactly as given.
        self.weight_file, self.n_head, self.n_layer = weight_file, n_head, n_layer

# Registry of the supported checkpoints: weight file plus head and layer counts.
gpt2_variants = {
    "gpt2": Gpt2Variant(weight_file="gpt2.safetensors", n_head=12, n_layer=12),
    "gpt2-medium": Gpt2Variant(weight_file="gpt2-medium.safetensors", n_head=16, n_layer=24),
    "gpt2-large": Gpt2Variant(weight_file="gpt2-large.safetensors", n_head=20, n_layer=36),
    "gpt2-xl": Gpt2Variant(weight_file="gpt2-xl.safetensors", n_head=25, n_layer=48),
}

# Key into `gpt2_variants`; also the name passed to the tokenizer loader.
gpt2_variant = "gpt2-xl"

text = "In a hole in the ground there lived a"
tokenizer = GPT2Tokenizer.from_pretrained(gpt2_variant)

# Placeholder token ids 1..10 for a first smoke test of the transformer.
# `text` is not tokenized here; that happens in a later cell.
tokens = list(range(1, 11))

In [None]:
# Open the weight file of the selected variant. The handle is kept at module
# level because every later cell fetches tensors from it via `get_tensor`.
weights_path = gpt2_variants[gpt2_variant].weight_file
model = safe_open(weights_path, framework="np")

In [None]:
import tidygrad.func as F

In [None]:
def transformer_block(model, i, input, n_head):
    """Run GPT-2 transformer block `i` on `input` and return the result.

    Steps, in order: layer norm, causal multi-head self-attention, attention
    output projection, residual add, second layer norm, MLP (up-projection,
    GELU, down-projection), second residual add. All weights are read from
    `model` under the `h.{i}.` prefix.

    Args:
        model: Weight source exposing `get_tensor(name)`.
        i: Block index used to build the weight names.
        input: Tensor whose last axis is the embedding dimension. The heads
            are merged with `F.concat(...)[0]`, so a single un-batched
            sequence is assumed -- TODO confirm before passing batches.
            (The name shadows the builtin but is part of the signature.)
        n_head: Number of attention heads.

    Returns:
        The block output Tensor (`after_down + after_residual`).

    Raises:
        AssertionError: If the embedding dimension is not divisible by `n_head`.
    """
    dim = input.shape[-1]
    assert dim % n_head == 0

    ln_1_w = model.get_tensor(f"h.{i}.ln_1.weight")
    ln_1_b = model.get_tensor(f"h.{i}.ln_1.bias")

    ln_1 = F.layer_norm(input, ln_1_w, ln_1_b)

    # c_attn holds three equal-width slices along its last axis; they are
    # used below as the query, key and value projections, in that order.
    attn_w_qkv = model.get_tensor(f"h.{i}.attn.c_attn.weight")
    attn_b_qkv = model.get_tensor(f"h.{i}.attn.c_attn.bias")

    attn_w_q, attn_w_k, attn_w_v = np.split(attn_w_qkv, 3, axis=-1)
    attn_b_q, attn_b_k, attn_b_v = np.split(attn_b_qkv, 3, axis=-1)

    q = ln_1.mmul(attn_w_q) + attn_b_q
    k = ln_1.mmul(attn_w_k) + attn_b_k
    v = ln_1.mmul(attn_w_v) + attn_b_v

    # Cut the last axis into n_head pieces and stack them on a new leading
    # axis so that all heads are processed by a single batched matmul.
    q_chunked = F.stack(q.split(n=n_head, axis=-1), axis=0)
    k_chunked = F.stack(k.split(n=n_head, axis=-1), axis=0)
    v_chunked = F.stack(v.split(n=n_head, axis=-1), axis=0)

    # Scaled dot-product scores; dim / n_head is the per-head width.
    attention = q_chunked.mmul(k_chunked.transpose(-1, -2)) / np.sqrt(dim / n_head)

    # Causal softmax, computed in a numerically stable way.
    #
    # Exponentiating the raw scores and zeroing the future positions afterwards
    # breaks as soon as one score overflows: exp() yields inf, inf * 0 is nan
    # at a masked position and inf / inf is nan at a visible one, which poisons
    # the whole row. Instead a constant is added to the scores before exp():
    #   * visible positions get minus the row maximum taken over the visible
    #     positions only, so the largest exponent in every row is exactly 0;
    #   * masked (future) positions get -inf, so exp() gives exactly 0 there.
    # Softmax is invariant to a per-row shift, so the result is unchanged
    # mathematically; the shift is plain NumPy data and acts as a constant.
    visible = np.tril(np.ones(attention.shape, dtype=bool), k=0)
    scores = attention.data
    row_max = np.where(visible, scores, -np.inf).max(axis=-1, keepdims=True, initial=-np.inf)
    shift = np.where(visible, -row_max, -np.inf).astype(scores.dtype)

    ee = np.exp(attention + shift)
    softmaxed = ee / ee.sum(axis=-1, keepdims=True)

    attention_output = softmaxed.mmul(v_chunked)

    # Undo the head stacking: split along the head axis, join the pieces on
    # the feature axis, then drop the leftover leading axis.
    attention_chunks = attention_output.split(axis=0, n=n_head)
    attention_reshaped = F.concat(attention_chunks, axis=-1)
    attention_reshaped = attention_reshaped[0]

    cproj_w = Tensor(model.get_tensor(f"h.{i}.attn.c_proj.weight"))
    cproj_b = Tensor(model.get_tensor(f"h.{i}.attn.c_proj.bias"))

    crosstalk = attention_reshaped.mmul(cproj_w) + cproj_b

    # First residual connection: attention result plus the block input.
    after_residual = crosstalk + input

    ln2_w = Tensor(model.get_tensor(f"h.{i}.ln_2.weight"), name="ln2_w")
    ln2_b = Tensor(model.get_tensor(f"h.{i}.ln_2.bias"), name="ln2_b")

    after_ln2 = F.layer_norm(after_residual, ln2_w, ln2_b)

    mlp_c_fc_w = Tensor(model.get_tensor(f"h.{i}.mlp.c_fc.weight"), name="fc_w")
    mlp_c_fc_b = Tensor(model.get_tensor(f"h.{i}.mlp.c_fc.bias"), name="fc_b")

    after_up = after_ln2.mmul(mlp_c_fc_w) + mlp_c_fc_b
    after_up_a = F.gelu(after_up)

    mlp_c_proj_w = Tensor(model.get_tensor(f"h.{i}.mlp.c_proj.weight"), name="proj_w")
    mlp_c_proj_b = Tensor(model.get_tensor(f"h.{i}.mlp.c_proj.bias"), name="proj_b")

    after_down = after_up_a.mmul(mlp_c_proj_w) + mlp_c_proj_b

    # Second residual connection: MLP result plus the post-attention stream.
    output = after_down + after_residual
    return output

# res = transformer_block(model, 0, embeddings)

In [None]:
def transformer(model, tokens, n_layer, n_head):
    """Run the GPT-2 block stack over `tokens` and return the final hidden states.

    Token and position embeddings are looked up and summed, passed through
    `n_layer` calls to `transformer_block`, and normalized with the `ln_f`
    layer norm.

    Args:
        model: Weight source exposing `get_tensor(name)`.
        tokens: Sequence of token ids; its length sets the position indices.
        n_layer: Number of transformer blocks to apply (indices 0..n_layer-1).
        n_head: Number of attention heads, forwarded to every block.

    Returns:
        The Tensor produced by the final layer norm.
    """
    token_table = Tensor(model.get_tensor("wte.weight"))
    position_table = Tensor(model.get_tensor("wpe.weight"))

    # One position index per input token: 0, 1, ..., len(tokens) - 1.
    position_ids = np.arange(len(tokens))
    hidden = F.embedding(token_table, tokens) + F.embedding(position_table, position_ids)

    for layer in range(n_layer):
        hidden = transformer_block(model, layer, hidden, n_head)

    final_w = Tensor(model.get_tensor("ln_f.weight"))
    final_b = Tensor(model.get_tensor("ln_f.bias"))
    return F.layer_norm(hidden, final_w, final_b)

# Smoke test: push ten placeholder token ids (1..10) through the full stack
# with gradient tracking disabled and show a summary of the result.
tokens = list(range(1, 11))
variant = gpt2_variants[gpt2_variant]

with tidygrad.no_grad():
    res = transformer(model, tokens, variant.n_layer, variant.n_head)
    print(res)

Tensor[10, 1600](" op=Add):
    v=array[10, 1600] f32 n=16000 (62Kb) x∈[-5.412, 10.720] μ=0.017 σ=1.065
    


In [None]:
# Token-embedding matrix with its last two axes swapped, so hidden states can
# be multiplied by it to get one score per vocabulary entry.
wte_weight = model.get_tensor("wte.weight")
wte = Tensor(wte_weight.swapaxes(-1, -2))

In [None]:
# Prompt used for the single next-token prediction below.
text = "In a hole in the ground there lived a"
# NOTE(review): the tokenizer for `gpt2_variant` is also loaded in the
# configuration cell; this reload makes the cell runnable on its own.
tokenizer = GPT2Tokenizer.from_pretrained(gpt2_variant)

# Replaces the placeholder ids from the smoke test with the encoded prompt.
tokens = tokenizer.encode(text)  # returns a list of integers
print(tokens)
# Placeholder ids from the smoke test, kept for quick experiments:
# tokens = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

def gpt2_language_model(model, token_ids, wte, n_layer, n_head):
    """Compute next-token logits for the sequence `token_ids`.

    Args:
        model: Weight source exposing `get_tensor(name)`.
        token_ids: Sequence of token ids forming the context.
        wte: Matrix the last hidden state is multiplied by to get the logits;
            the notebook passes the token-embedding matrix with its last two
            axes swapped.
        n_layer: Number of transformer blocks.
        n_head: Number of attention heads per block.

    Returns:
        A `(logits, last_hidden)` pair, where `last_hidden` is the final row
        of the transformer output and `logits` is `last_hidden.mmul(wte)`.
    """
    hidden = transformer(model, token_ids, n_layer, n_head)

    # Only the last position is needed to predict the next token.
    last_hidden = hidden[-1, :]
    return last_hidden.mmul(wte), last_hidden

variant = gpt2_variants[gpt2_variant]

with tidygrad.no_grad():
    logits, res = gpt2_language_model(model, tokens, wte, n_layer=variant.n_layer, n_head=variant.n_head)
    print(res)

# Greedy choice: the id with the largest logit, decoded back to text. It is
# left as the cell's last expression so the notebook displays it.
next_token = logits.data.argmax(axis=-1)
tokenizer.decode(next_token)

[818, 257, 7604, 287, 262, 2323, 612, 5615, 257]
Tensor[1600](" op=Slice):
    v=array[1600] f32 6.2Kb x∈[-5.825, 4.088] μ=0.007 σ=1.243
    


' hob'

In [None]:
from tqdm.auto import tqdm

In [None]:
# Greedy generation: start from the encoded prompt and append the most likely
# next token ten times, printing the decoded text after every step.
text = "In a hole in the ground there lived a"
tokens = tokenizer.encode(text)  # list of integer token ids

print("=== Generating ===")
print("Input: ", tokenizer.decode(tokens))

# Rebuilt here so this cell does not depend on the earlier `wte` cell.
wte = Tensor(model.get_tensor("wte.weight").swapaxes(-1, -2))
variant = gpt2_variants[gpt2_variant]

with tidygrad.no_grad():
    for i in tqdm(range(10)):
        # The whole sequence is re-run on every step; nothing is cached.
        logits, res = gpt2_language_model(model, tokens, wte, n_layer=variant.n_layer, n_head=variant.n_head)
        next_token = logits.data.argmax(axis=-1)
        tokens.append(next_token)
        print("Output:", tokenizer.decode(tokens))

=== Generating ===
Input:  In a hole in the ground there lived a


  0%|          | 0/10 [00:00<?, ?it/s]

Output: In a hole in the ground there lived a hob
Output: In a hole in the ground there lived a hobbit
Output: In a hole in the ground there lived a hobbit.
Output: In a hole in the ground there lived a hobbit.

Output: In a hole in the ground there lived a hobbit.


Output: In a hole in the ground there lived a hobbit.

He
Output: In a hole in the ground there lived a hobbit.

He had
Output: In a hole in the ground there lived a hobbit.

He had big
Output: In a hole in the ground there lived a hobbit.

He had big feet
Output: In a hole in the ground there lived a hobbit.

He had big feet,
