In [None]:
import torch
from torch import nn
torch.set_printoptions(precision=2, sci_mode=False)
torch.set_default_dtype(torch.float64)
import numpy as np
import torch.nn.functional as F
import math
from pathlib import Path
import torch.nn as nn


## GPT

In [653]:
from safetensors import safe_open

path = "/Users/uonliaquat/Downloads/model.safetensors"

with safe_open(path, framework="pt", device="cpu") as f:
    h0_keys = [k for k in f.keys() if k.startswith("h.0")]

for key in h0_keys:
    print(key)

h.0.attn.bias
h.0.attn.c_attn.bias
h.0.attn.c_attn.weight
h.0.attn.c_proj.bias
h.0.attn.c_proj.weight
h.0.ln_1.bias
h.0.ln_1.weight
h.0.ln_2.bias
h.0.ln_2.weight
h.0.mlp.c_fc.bias
h.0.mlp.c_fc.weight
h.0.mlp.c_proj.bias
h.0.mlp.c_proj.weight


## Read C Model

In [None]:
from safetensors import safe_open

filename = "/Users/uonliaquat/workspace/zerograd/c_model.safetensors"

output_c = {}

with safe_open(filename, framework="pt", device="cpu") as f:
    for key in f.keys():
        output_c[key] = f.get_tensor(key)

# Inspect
print(len(output_c))
print(list(output_c.keys()))

In [None]:
import torch
import torch.nn as nn

class LayerNorm(nn.Module):
    def __init__(self, normalized_shape, weight, bias, eps=1e-5):
        super().__init__()
        self.normalized_shape = normalized_shape
        self.eps = eps

        # MUST be Parameters
        self.weight = nn.Parameter(weight.clone())
        self.bias = nn.Parameter(bias.clone())

    def forward(self, x):
        # Compute mean & variance over last dim
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)

        x_hat = (x - mean) / torch.sqrt(var + self.eps)

        # Explicit reshape (matches PyTorch internals)
        return x_hat * self.weight.view(1, 1, -1) + self.bias.view(1, 1, -1)
    

ln_ref = torch.nn.LayerNorm(768)
ln_custom = LayerNorm(
    768,
    ln_ref.weight.data,
    ln_ref.bias.data,
    ln_ref.eps
)

x = torch.randn(2, 5, 768)

print(torch.allclose(ln_ref(x), ln_custom(x), atol=1e-6))

## Python GPT

In [667]:
import math
import torch
import torch.nn as nn
from transformers import GPT2Model

import torch
from torch import nn
from torch.nn import functional as F
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module

import torch
from torch import nn

import torch
from torch import nn

class LayerNorm(nn.Module):
    def __init__(
        self,
        normalized_shape,
        eps: float = 1e-5,
        elementwise_affine: bool = True,
        device=None,
        dtype=None,
    ):
        super().__init__()

        if isinstance(normalized_shape, int):
            normalized_shape = (normalized_shape,)
        self.normalized_shape = tuple(normalized_shape)
        self.eps = eps
        self.elementwise_affine = elementwise_affine

        factory_kwargs = {"device": device, "dtype": dtype}

        if elementwise_affine:
            self.weight = nn.Parameter(
                torch.ones(self.normalized_shape, **factory_kwargs)
            )
            self.bias = nn.Parameter(
                torch.zeros(self.normalized_shape, **factory_kwargs)
            )
        else:
            self.register_parameter("weight", None)
            self.register_parameter("bias", None)

    def forward(self, x, out, h, l) -> torch.Tensor:
        # Normalize over the last len(normalized_shape) dimensions
        dims = tuple(range(-len(self.normalized_shape), 0))

        mean = x.mean(dim=dims, keepdim=True)
        #out[f'h.{h}.ln_1.mean'] = mean
        var = x.var(dim=dims, keepdim=True, unbiased=False)
        #out[f'h.{h}.ln_1.var'] = var
        out[f'h.{h}.ln_{l}.mean_var'] = torch.cat([mean, var], dim=-1)
        x_hat = (x - mean) / torch.sqrt(var + self.eps)

        out[f'h.{h}.ln_{l}.x_norm'] = x_hat

        if self.elementwise_affine:
            x_hat = x_hat * self.weight
            out[f'h.{h}.ln_{l}.x_norm_scaled'] = x_hat
            x_hat = x_hat + self.bias
            out[f'h.{h}.ln_{l}.x_norm_shifted'] = x_hat
        return x_hat

    def extra_repr(self) -> str:
        return (
            f"normalized_shape={self.normalized_shape}, "
            f"eps={self.eps}, "
            f"elementwise_affine={self.elementwise_affine}"
        )
    

class GPT2_Full_Debug(nn.Module):
    def __init__(self, model_name="gpt2", device="cpu", dtype=torch.float32):
        super().__init__()

        gpt2 = GPT2Model.from_pretrained(model_name)
        gpt2.eval()

        self.n_heads = gpt2.config.n_head
        self.hidden_size = gpt2.config.n_embd
        self.head_dim = self.hidden_size // self.n_heads
        self.n_layers = gpt2.config.n_layer
        self.vocab_size = gpt2.config.vocab_size

        # Embeddings
        self.wte = nn.Embedding.from_pretrained(gpt2.wte.weight.detach().to(dtype), freeze=True)
        self.wpe = nn.Embedding.from_pretrained(gpt2.wpe.weight.detach().to(dtype), freeze=True)

        # Transformer blocks
        self.blocks = nn.ModuleList()
        for i, block in enumerate(gpt2.h):
            b = nn.ModuleDict({
                "ln_1": LayerNorm(self.hidden_size, eps=block.ln_1.eps),
                "c_attn": nn.Linear(self.hidden_size, 3 * self.hidden_size),
                "c_proj_attn": nn.Linear(self.hidden_size, self.hidden_size),
                "ln_2": LayerNorm(self.hidden_size, eps=block.ln_2.eps),
                "c_fc": nn.Linear(self.hidden_size, 4 * self.hidden_size),
                "c_proj_mlp": nn.Linear(4 * self.hidden_size, self.hidden_size),
            })
            # Copy weights
            b["ln_1"].weight.data.copy_(block.ln_1.weight)
            b["ln_1"].bias.data.copy_(block.ln_1.bias)

            b["c_attn"].weight.data.copy_(block.attn.c_attn.weight.T)
            b["c_attn"].bias.data.copy_(block.attn.c_attn.bias)

            b["c_proj_attn"].weight.data.copy_(block.attn.c_proj.weight.T)
            b["c_proj_attn"].bias.data.copy_(block.attn.c_proj.bias)

            b["ln_2"].weight.data.copy_(block.ln_2.weight)
            b["ln_2"].bias.data.copy_(block.ln_2.bias)

            b["c_fc"].weight.data.copy_(block.mlp.c_fc.weight.T)
            b["c_fc"].bias.data.copy_(block.mlp.c_fc.bias)

            b["c_proj_mlp"].weight.data.copy_(block.mlp.c_proj.weight.T)
            b["c_proj_mlp"].bias.data.copy_(block.mlp.c_proj.bias)

            self.blocks.append(b)

        # Final layer norm
        self.ln_f = nn.LayerNorm(self.hidden_size, eps=gpt2.ln_f.eps)
        self.ln_f.weight.data.copy_(gpt2.ln_f.weight)
        self.ln_f.bias.data.copy_(gpt2.ln_f.bias)

        # LM head (weight tied)
        self.lm_head = nn.Linear(self.hidden_size, self.vocab_size, bias=False)
        self.lm_head.weight.data.copy_(self.wte.weight)

        self.to(device=device, dtype=dtype)

    def forward(self, input_ids):
        bsz, seq_len = input_ids.shape
        device = input_ids.device
        out = {}

        # Embeddings
        pos_ids = torch.arange(seq_len, device=device).unsqueeze(0)
        tok_emb = self.wte(input_ids)
        pos_emb = self.wpe(pos_ids)
        x = tok_emb + pos_emb
        out["token_emb"] = tok_emb
        out["pos_emb"] = pos_emb

        # Iterate over blocks
        for layer_idx, b in enumerate(self.blocks):
            #layer_out = {}
            out[f"h.{layer_idx}.input_embedding"] = x

            # LN1
            x_ln1 = b["ln_1"](x, out, layer_idx, 1)
            out[f"h.{layer_idx}.ln_1"] = x_ln1

            # QKV
            qkv = b["c_attn"](x_ln1)
            q, k, v = qkv.split(self.hidden_size, dim=2)
            out[f"h.{layer_idx}.qkv"] = qkv
            out[f"h.{layer_idx}.attn.q"] = q
            out[f"h.{layer_idx}.attn.k"] = k
            out[f"h.{layer_idx}.attn.v"] = v

            # Split heads
            def split_heads(x):
                return x.view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2)

            qh = split_heads(q)[0]
            kh = split_heads(k)[0]
            vh = split_heads(v)[0]

            qh_list = torch.split(qh, 1, dim=0)  # split into chunks of size 1
            kh_list = torch.split(kh, 1, dim=0)  # split into chunks of size 1
            vh_list = torch.split(vh, 1, dim=0)  # split into chunks of size 1 
            
            for i, (q, k, v) in enumerate(zip(qh_list, kh_list, vh_list)):

                out[f"h.{layer_idx}.attn.{i}.q_head"] = q
                out[f"h.{layer_idx}.attn.{i}.k_head"] = k
                out[f"h.{layer_idx}.attn.{i}.v_head"] = v

            # Attention
            kh_t = kh.transpose(-2, -1)
            qk = torch.matmul(qh, kh_t)
            qk_scaled = qk / math.sqrt(self.head_dim)
            causal_mask = torch.tril(torch.ones(seq_len, seq_len, device=device)).view(1, 1, seq_len, seq_len)
            qk_masked = qk_scaled.masked_fill(causal_mask == 0, float("-inf"))
            attn_probs = torch.softmax(qk_masked, dim=-1)
            attn_ctx_heads = torch.matmul(attn_probs, vh)
            attn_ctx = attn_ctx_heads.transpose(1, 2).contiguous().view(bsz, seq_len, self.hidden_size)
            attn_out = b["c_proj_attn"](attn_ctx)

            out[f"h.{layer_idx}.attn.c_proj"] = attn_out

            #resid 1
            x = x + attn_out
            out[f"h.{layer_idx}.resid.1"] = x

            # LN2 + MLP
            x_ln2 = b["ln_2"](x, out, layer_idx, 2)
            mlp_out = b["c_proj_mlp"](torch.nn.functional.gelu(b["c_fc"](x_ln2)))
            out[f"h.{layer_idx}.mlp"] = mlp_out

            # resid 2
            x = x + mlp_out
            out[f"h.{layer_idx}.resid.2"] = x

            # out[f"h.{layer_idx}.ln_2"] = x_ln2
            # out[f"h.{layer_idx}.mlp_fc_out"] = mlp_out
            # out[f"h.{layer_idx}.block_output"] = x

            #out[f"layer_{layer_idx}"] = layer_out

        # Final layer norm
        x_final_ln = self.ln_f(x)
        out["ln_f"] = x_final_ln

        # LM head
        logits = self.lm_head(x_final_ln)
        out["head"] = logits
        probs = torch.softmax(logits, dim=-1)
        # out["logits"] = logits
        out["probs"] = probs

        # Greedy next token
        next_token_prob_dist = probs[:, -1, :]
        out['next_token_prob_dist'] = next_token_prob_dist
        next_token_id = torch.argmax(next_token_prob_dist, dim=-1)
        out["next_token_id"] = next_token_id

        return out

In [675]:
import torch
from transformers import GPT2Tokenizer

# -------------------------------
# Device
# -------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

# -------------------------------
# Paragraph
# -------------------------------
#paragraph = ("Hello I'm a language model")

paragraph = ("Artificial intelligence in 2026 is")



# -------------------------------
# Tokenizer
# -------------------------------
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
input_ids = tokenizer.encode(paragraph, return_tensors="pt").to(device)
#print("token_ids: ", input_ids)


# -------------------------------
# Set context window to 6
# -------------------------------
context_len = 7
if input_ids.shape[1] > context_len:
    input_ids = input_ids[:, -context_len:]  # keep last 52 tokens

cur_len = input_ids.shape[1]

# -------------------------------
# Initialize GPT2 debug model
# -------------------------------
model = GPT2_Full_Debug(model_name="gpt2-medium", device=device, dtype=torch.float32)
model.eval()

# -------------------------------
# Prepare for generation
# -------------------------------
num_generate = 50  # number of new tokens
generated_ids = input_ids[0].tolist()  # start with initial context

# Store intermediate outputs for all steps
all_steps_outputs = []

# -------------------------------
# Generation loop
# -------------------------------
for step in range(num_generate):
    cur_input = torch.tensor([generated_ids[-context_len:]], device=device)
    print(cur_input)
    with torch.no_grad():
        outputs = model(cur_input)

    # Save intermediate outputs for this step
    all_steps_outputs.append(outputs)

    # Get next token (greedy)
    next_token_id = outputs['next_token_id'].item()

    # Append to generated list
    generated_ids.append(next_token_id)

# -------------------------------
# Decode full generated sequence
# -------------------------------
full_text = tokenizer.decode(generated_ids)
#print("\n=== Full generated text (initial + 50 new tokens) ===\n")
print(full_text)

# -------------------------------
# Optional: print next token for first 5 steps
# -------------------------------
# print("\n=== First 5 generated token IDs and strings ===")
# for step_out in all_steps_outputs:
#     tid = step_out['next_token_id'].item()
#     print(tid, tokenizer.decode([tid]))

output_p = all_steps_outputs[0]
# list(output_p.keys())

tensor([[8001, 9542, 4430,  287, 1160, 2075,  318]])
tensor([[9542, 4430,  287, 1160, 2075,  318, 1016]])
tensor([[4430,  287, 1160, 2075,  318, 1016,  284]])
tensor([[ 287, 1160, 2075,  318, 1016,  284,  307]])
tensor([[1160, 2075,  318, 1016,  284,  307,  257]])
tensor([[2075,  318, 1016,  284,  307,  257, 1263]])
tensor([[ 318, 1016,  284,  307,  257, 1263, 1730]])
tensor([[1016,  284,  307,  257, 1263, 1730,   13]])
tensor([[ 284,  307,  257, 1263, 1730,   13,  198]])
tensor([[ 307,  257, 1263, 1730,   13,  198,  198]])
tensor([[ 257, 1263, 1730,   13,  198,  198,    1]])
tensor([[1263, 1730,   13,  198,  198,    1,   40]])
tensor([[1730,   13,  198,  198,    1,   40,  892]])
tensor([[ 13, 198, 198,   1,  40, 892, 340]])
tensor([[198, 198,   1,  40, 892, 340, 338]])
tensor([[198,   1,  40, 892, 340, 338, 257]])
tensor([[  1,  40, 892, 340, 338, 257, 922]])
tensor([[  40,  892,  340,  338,  257,  922, 1517]])
tensor([[ 892,  340,  338,  257,  922, 1517,  326]])
tensor([[ 340,  338, 

In [None]:
atol = 1e-4
h = 0
for h in range(0, 12):
    print(torch.allclose(output_p['token_emb'],  output_c['token_emb'], atol=atol))
    print(torch.allclose(output_p['pos_emb'],  output_c['pos_emb'], atol=atol))
    print(torch.allclose(output_p[f'h.{h}.input_embedding'], output_c[f'h.{h}.input_embedding'], atol=atol))
    print(torch.allclose(output_p[f'h.{h}.ln_1.mean_var'], output_c[f'h.{h}.ln_1.mean_var'], atol=atol))
    print(torch.allclose(output_p[f'h.{h}.ln_1.x_norm'], output_c[f'h.{h}.ln_1.x_norm'], atol=atol))
    print(torch.allclose(output_p[f'h.{h}.ln_1.x_norm_shifted'], output_c[f'h.{h}.ln_1.x_norm_shifted'], atol=atol))
    print(torch.allclose(output_p[f'h.{h}.ln_1.x_norm_scaled'], output_c[f'h.{h}.ln_1.x_norm_scaled'], atol=atol))
    print(torch.allclose(output_p[f'h.{h}.ln_1'], output_c[f'h.{h}.ln_1'], atol=atol))
    print(torch.allclose(output_p[f'h.{h}.attn.q'], output_c[f'h.{h}.attn.q'], atol=atol))
    print(torch.allclose(output_p[f'h.{h}.attn.k'], output_c[f'h.{h}.attn.k'], atol=atol))
    print(torch.allclose(output_p[f'h.{h}.attn.v'], output_c[f'h.{h}.attn.v'], atol=atol))


    for i in range(0, 12):
        print(torch.allclose(output_p[f"h.{h}.attn.{i}.q_head"], output_c[f"h.{h}.attn.{i}.q_head"], atol=atol))
        print(torch.allclose(output_p[f"h.{h}.attn.{i}.k_head"], output_c[f"h.{h}.attn.{i}.k_head"], atol=atol))
        print(torch.allclose(output_p[f"h.{h}.attn.{i}.v_head"], output_c[f"h.{h}.attn.{i}.v_head"], atol=atol))


    #c_proj
    print(torch.allclose(output_p[f"h.{0}.attn.c_proj"], output_c[f"h.{0}.attn.c_proj"], atol=atol))

    #residual connection 1
    print(torch.allclose(output_p[f"h.{0}.resid.1"], output_c[f"h.{0}.resid.1"], atol=atol))

    # ln_2
    print(torch.allclose(output_p[f'h.{h}.ln_2.mean_var'], output_c[f'h.{h}.ln_2.mean_var'], atol=atol))
    print(torch.allclose(output_p[f'h.{h}.ln_2.x_norm'], output_c[f'h.{h}.ln_2.x_norm'], atol=atol))
    print(torch.allclose(output_p[f'h.{h}.ln_2.x_norm_shifted'], output_c[f'h.{h}.ln_2.x_norm_shifted'], atol=atol))
    print(torch.allclose(output_p[f'h.{h}.ln_2.x_norm_scaled'], output_c[f'h.{h}.ln_2.x_norm_scaled'], atol=atol))
    print(torch.allclose(output_p[f'h.{h}.mlp'], output_c[f'h.{h}.mlp'], atol=atol))

    #residual connection 2
    print(torch.allclose(output_p[f"h.{0}.resid.2"], output_c[f"h.{0}.resid.2"], atol=atol))


print(torch.allclose(output_p[f"ln_f"], output_c[f"ln_f"], atol=atol))
print(torch.allclose(output_p[f"head"], output_c[f"head"], atol=atol))
print(torch.allclose(output_p[f"probs"], output_c[f"probs"], atol=atol))
print(torch.allclose(output_p[f"next_token_prob_dist"], output_c[f"next_token_prob_dist"], atol=atol))



## Gnerating Vocab

In [None]:
from transformers import GPT2Tokenizer

# Load tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokens = tokenizer.convert_ids_to_tokens(list(range(tokenizer.vocab_size)))

# Choose a separator (must not appear in any token)
SEP = "|"  # safe ASCII separator

def clean_token(tok):
    tok = tok.replace('Ġ', ' ')   # space marker
    tok = tok.replace('Ċ', '<NL>')  # newline marker as visible string
    # Optional: replace any other non-ASCII sequences
    tok = ''.join(c if ord(c) < 128 else '<U>' for c in tok)
    return tok

with open("gpt2_vocab.txt", "w", encoding="utf-8") as f:
    f.write(SEP.join(clean_token(tok) for tok in tokens))

## GPT Hugging Face

In [665]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# GPT-2 has no pad token by default, use eos token
tokenizer.pad_token = tokenizer.eos_token
pad_token_id = tokenizer.pad_token_id  # now this is 50256

prompt = "Artificial intelligence in 2026 is"

input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
max_context = len(input_ids[0])
print(f"max_context: {max_context}")
print(input_ids)

if input_ids.shape[1] > max_context:
    input_ids = input_ids[:, -max_context:]
elif input_ids.shape[1] < max_context:
    pad_length = max_context - input_ids.shape[1]
    pad_ids = torch.full((1, pad_length), pad_token_id, device=device)
    input_ids = torch.cat([pad_ids, input_ids], dim=1)

print("Input IDs shape:", input_ids.shape)

output_ids = model.generate(
    input_ids,
    max_length=max_context + 50,  # generate 50 new tokens after context
    do_sample=False              # greedy
)

generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print('\n\n', generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


max_context: 7
tensor([[8001, 9542, 4430,  287, 1160, 2075,  318]])
Input IDs shape: torch.Size([1, 7])


 Artificial intelligence in 2026 is a new field of research in which the size of a feature set is increased or decreased with each passing year. This is because of the way in which the brain processes information.

The researchers used a computer program called DeepMind to create a neural


In [660]:
input_ids[0]

tensor([318])