In [None]:
import tidygrad as tg
from tidygrad import Tensor
import numpy as np

import huggingface_hub

import datasets

In [None]:
# ds = datasets.load_dataset("roneneldan/TinyStories")

In [None]:
# Hyperparameters of the small GPT-2 style model built in this notebook.
n_vocab, ndim = 1024, 512  # rows / columns of the token-embedding table "wte"
n_layers, n_heads = 2, 4   # transformer blocks / attention heads per block
ctx_len = 128              # rows of the position table "wpe"; also the sample sequence length

In [None]:
def gpt2_new(n_vocab, n_layers, n_heads, ndim, n_ctx=None):
    """Create a model holding the GPT-2 parameter layout.

    Args:
        n_vocab: Number of rows of the token-embedding table ``wte``.
        n_layers: Number of transformer blocks (``h.0`` .. ``h.{n_layers - 1}``).
        n_heads: Accepted so the signature mirrors the forward pass; it does
            not influence any parameter shape and is not used here.
        ndim: Embedding width shared by all blocks.
        n_ctx: Number of rows of the position-embedding table ``wpe``. When
            left as ``None`` the notebook-level ``ctx_len`` is used, which is
            the value this function always relied on.

    Returns:
        The ``tg.model.Model`` constructed from the name -> shape dictionary.
    """
    if n_ctx is None:
        # Backward-compatible fallback to the notebook-level setting.
        n_ctx = ctx_len

    shape_dict = {
        "wte": [n_vocab, ndim],
        "wpe": [n_ctx, ndim],
        "ln_f.weight": [ndim],
        "ln_f.bias": [ndim],
    }

    for i in range(n_layers):
        # Shapes of one transformer block, keyed by the name suffix after "h.{i}.".
        # Insertion order is the same as the flat key order used before.
        block_shapes = {
            "ln_1.weight": [ndim],
            "ln_1.bias": [ndim],
            # Query, key and value projections share a single matrix.
            "attn.c_attn.weight": [ndim, 3 * ndim],
            "attn.c_attn.bias": [3 * ndim],
            "attn.c_proj.weight": [ndim, ndim],
            "attn.c_proj.bias": [ndim],
            "ln_2.weight": [ndim],
            "ln_2.bias": [ndim],
            # The MLP widens to 4 * ndim and projects back to ndim.
            "mlp.c_fc.weight": [ndim, 4 * ndim],
            "mlp.c_fc.bias": [4 * ndim],
            "mlp.c_proj.weight": [4 * ndim, ndim],
            "mlp.c_proj.bias": [ndim],
        }
        for suffix, shape in block_shapes.items():
            shape_dict[f"h.{i}.{suffix}"] = shape

    return tg.model.Model(shape_dict)

# Instantiate the model from the notebook hyperparameters.
model = gpt2_new(
    n_vocab=n_vocab,
    n_layers=n_layers,
    n_heads=n_heads,
    ndim=ndim,
)

t = Tensor(123, requires_grad=False)
t1 = t + t

t1.requires_grad is False
t1.parents is []


t1.requires_grad(True)

t1.requires_grad is True

But `t1` still has no parents!

In that case `t1.op` should be `Load`, not `Add`.



In [None]:
def gpt2_init(model):
    """Replace every parameter of ``model`` with a freshly initialised tensor.

    Initialisation scheme:
      * layer-norm gains (``ln_1``/``ln_2``/``ln_f`` ``.weight``): ones, so a
        freshly initialised layer norm passes its normalised input through;
      * all other ``.weight`` tensors: normal, std 0.02;
      * ``.bias`` tensors: zeros;
      * ``wte``: normal, std 0.02; ``wpe``: normal, std 0.01.

    The scale is applied to the NumPy array before it is wrapped, so the
    object stored in ``model.params`` is the named ``Tensor`` itself rather
    than the product of a ``Tensor`` and a float.

    Args:
        model: Object with a ``params`` mapping whose values expose ``.shape``.
            The mapping is modified in place.
    """
    # Snapshot the keys: entries are reassigned while looping.
    for k in list(model.params.keys()):
        shape = model.params[k].shape
        if k.endswith(".weight"):
            # The component right before ".weight" names the layer, e.g. "ln_1".
            is_layer_norm = k.split(".")[-2].startswith("ln_")
            if is_layer_norm:
                values = np.ones(shape)
            else:
                values = np.random.randn(*shape) * 0.02
            model.params[k] = Tensor(values, name=k)
        elif k.endswith(".bias"):
            model.params[k] = Tensor(np.zeros(shape), name=k)

    # The embedding tables have no ".weight"/".bias" suffix and are handled explicitly.
    for key, std in (("wte", 0.02), ("wpe", 0.01)):
        shape = model.params[key].shape
        model.params[key] = Tensor(np.random.randn(*shape) * std, name=key)
    

# Overwrite the parameters of `model` with freshly initialised tensors.
gpt2_init(model)
# NOTE(review): presumably flags every parameter as requiring gradients so that
# backward() fills them in -- confirm against tidygrad's Model API.
model.requires_grad(True)


In [None]:
import tidygrad.func as F

In [None]:
def gpt2_transformer_block(model: tg.model.Model, x, n_heads, i):
    """Run transformer block ``i``: causal self-attention followed by the MLP.

    Each of the two sub-layers is preceded by a layer norm and followed by a
    residual addition.

    Args:
        model: Model whose ``params`` mapping holds the ``h.{i}.*`` tensors.
        x: Input activations; assumed to be (batch, seq, ndim) -- TODO confirm.
        n_heads: Number of chunks the q/k/v projections are split into along
            their last axis (one chunk per attention head).
        i: Index of the block, used to build the parameter names.

    Returns:
        The MLP output added to the post-attention residual.
    """
    def get_params(s):
        return model.params[f"h.{i}.{s}"]

    # ---- self-attention ----
    normed = F.layer_norm(x, get_params("ln_1.weight"), get_params("ln_1.bias"))

    # c_attn holds the query / key / value projections side by side on the last axis.
    w_q, w_k, w_v = get_params("attn.c_attn.weight").split(3, axis=-1)
    b_q, b_k, b_v = get_params("attn.c_attn.bias").split(3, axis=-1)

    q = normed.mmul(w_q) + b_q
    k = normed.mmul(w_k) + b_k
    v = normed.mmul(w_v) + b_v

    # Cut the last axis into n_heads chunks and stack them on a new leading axis.
    q_heads = F.stack(q.split(n=n_heads, axis=-1), axis=0)
    k_heads = F.stack(k.split(n=n_heads, axis=-1), axis=0)
    v_heads = F.stack(v.split(n=n_heads, axis=-1), axis=0)

    # After the split the last axis already is the per-head width, so it is
    # used for the scale as-is. Dividing it by n_heads a second time would
    # make the divisor too small and inflate the scores.
    head_dim = q_heads.shape[-1]
    scores = q_heads.mmul(k_heads.transpose(-1, -2)) / np.sqrt(head_dim)

    # Causal mask: np.tril keeps the lower triangle of the last two axes, so a
    # position only receives weight from itself and earlier positions.
    mask = np.tril(np.ones(scores.shape), k=0)
    # NOTE(review): exp() is taken without subtracting the row maximum first,
    # so very large scores could overflow.
    unnormalised = np.exp(scores) * mask
    weights = unnormalised / unnormalised.sum(axis=-1, keepdims=True)

    per_head = weights.mmul(v_heads)
    # Undo the head stacking: split along axis 0, join the pieces on the last
    # axis, then index away the leading axis (presumably of length 1).
    merged = F.concat(per_head.split(axis=0, n=n_heads), axis=-1)
    merged = merged[0]

    attn_out = merged.mmul(get_params("attn.c_proj.weight")) + get_params("attn.c_proj.bias")
    after_residual = attn_out + x

    # ---- MLP ----
    after_ln2 = F.layer_norm(after_residual, get_params("ln_2.weight"), get_params("ln_2.bias"))

    hidden = after_ln2.mmul(get_params("mlp.c_fc.weight")) + get_params("mlp.c_fc.bias")
    hidden = F.gelu(hidden)

    mlp_out = hidden.mmul(get_params("mlp.c_proj.weight")) + get_params("mlp.c_proj.bias")

    return mlp_out + after_residual

def gpt2(model, input, n_layers, n_heads):
    """Forward pass of the model up to and including the final layer norm.

    Args:
        model: Model whose ``params`` mapping holds ``wte``, ``wpe``,
            ``ln_f.*`` and the per-block tensors.
        input: Integer token ids; the size of the last axis is taken as the
            sequence length.
        n_layers: Number of transformer blocks to apply.
        n_heads: Number of attention heads, forwarded to every block.

    Returns:
        The layer-normalised output of the last block.
    """
    # One position index per element of the last axis of the input.
    positions = np.arange(input.shape[-1])

    hidden = (
        F.embedding(model.params["wte"], input)
        + F.embedding(model.params["wpe"], positions)
    )

    for layer in range(n_layers):
        print("layer", layer)
        hidden = gpt2_transformer_block(model=model, x=hidden, n_heads=n_heads, i=layer)

    return F.layer_norm(
        hidden,
        w=model.params["ln_f.weight"],
        b=model.params["ln_f.bias"],
    )


In [None]:
# res = gpt2(model, np.arange(256).reshape(2, -1), n_layers=n_layers, n_heads=n_heads)
# res.sum().backward()

In [None]:
# from tidygrad.training import one_hot_encode_batch

In [None]:
def one_hot_encode_batch(y, n_classes):
    """One-hot encode integer labels.

    Args:
        y: Integer index (or array of indices) used to select rows of the
            ``n_classes`` x ``n_classes`` identity matrix.
        n_classes: Width of each one-hot vector.

    Returns:
        A ``Tensor`` wrapping the selected identity rows, i.e. ``y`` with one
        extra trailing axis of length ``n_classes``.
    """
    # Row k of the identity matrix is the one-hot vector for class k.
    return Tensor(np.eye(n_classes)[y])

In [None]:
def language_modeling_loss(model, input, target, n_layers, n_heads):
    """Cross-entropy loss of the model's predictions against ``target``.

    The output projection reuses the token-embedding table ``wte``
    (transposed), and the number of classes is read from that table instead of
    a notebook-level constant, so the function works for any model passed in.

    Args:
        model: Model whose ``params`` mapping holds the GPT-2 tensors.
        input: Integer token ids fed to the model.
        target: Integer token ids to predict; one-hot encoded internally.
        n_layers: Number of transformer blocks to apply.
        n_heads: Number of attention heads per block.

    Returns:
        Whatever ``F.CrossEntropy_loss`` returns for the logits and the
        one-hot targets.
    """
    hidden = gpt2(model, input, n_layers, n_heads)

    wte = model.params["wte"]
    logits = hidden.mmul(wte.transpose(-1, -2), name="logits")

    # wte has one row per vocabulary entry, so its first axis is the class count.
    targets_one_hot = one_hot_encode_batch(target, n_classes=wte.shape[0])
    return F.CrossEntropy_loss(logits, targets_one_hot)


# Smoke test: loss on a random batch of 2 sequences of ctx_len tokens each.
# With a single bound, randint draws from [0, n_vocab).
loss = language_modeling_loss(
    model,
    input=np.random.randint(n_vocab, size=(2, ctx_len)),
    target=np.random.randint(n_vocab, size=(2, ctx_len)),
    n_layers=n_layers,
    n_heads=n_heads,
)

# print("loss", loss)

layer 0
layer 1


In [None]:
# Make NumPy raise FloatingPointError on floating-point errors (overflow,
# invalid value, ...) instead of only warning, so numerical problems in the
# backward pass surface immediately.
np.seterr(all="raise")
# Sum the loss tensor; presumably this reduces it to a single value to
# differentiate -- confirm against tidygrad's Tensor.sum.
l = loss.sum()
# Prints the un-reduced `loss` tensor, and does so before backward() runs.
print(loss)

l.backward()

Tensor[2, 128, 1](name="" op=Div parents=[,]):
    v=array[2, 128, 1] n=256 (2Kb) x∈[0.007, 0.007] μ=0.007 σ=9.689e-06
    ∇=array[2, 128, 1] n=256 (2Kb) [38;2;127;127;127mall_zeros[0m
