In [281]:
import torch
from torch import nn
torch.set_printoptions(precision=2, sci_mode=False)
torch.set_default_dtype(torch.float64)
import numpy as np
import torch.nn.functional as F
import math
from pathlib import Path
import torch.nn as nn


In [54]:
def load_tensors(filename):
    tensors_dict = {}
    
    # Tracking current tensor state
    current_name = "default_tensor"
    current_metadata = {}
    current_data = []
    
    meta_keys = ['size', 'ndim', 'shape', 'stride', 'elem_size', 'requires_grad']

    def finalize_tensor(name, meta, data):
        """Helper to reshape data and store in the dictionary."""
        if not data:
            return
        
        target_shape = [int(s) for s in meta.get('shape', [len(data)])]
        # Using float64 (double) as your images show high precision decimals
        # if name == current_name:
        #     tensors_dict = torch.tensor(data, dtype=torch.float32).reshape(target_shape)
        # else:
        tensors_dict[name] = torch.tensor(data, dtype=torch.float64).reshape(target_shape)

    with open(filename, 'r') as f:
        for line in f:
            # Split by comma and remove empty strings/whitespace
            parts = [p.strip() for p in line.split(',') if p.strip()]
            
            if not parts:
                continue

            label = parts[0].replace(':', '')

            # 1. Check if it's a known metadata key
            if label in meta_keys:
                vals = [float(v) for v in parts[1:]]
                current_metadata[label] = vals[0] if len(vals) == 1 else vals
            
            else:
                try:
                    # 2. Try to parse as numeric data
                    row_data = [float(p) for p in parts]
                    current_data.extend(row_data)
                except ValueError:
                    # 3. If it's a string but NOT metadata, it's a new tensor name
                    # Save the previous tensor first
                    if current_data:
                        finalize_tensor(current_name, current_metadata, current_data)
                    
                    # Reset for the new tensor
                    current_name = label
                    current_metadata = {}
                    current_data = []

    # Finalize the last tensor in the file
    finalize_tensor(current_name, current_metadata, current_data)
    if "default_tensor" in tensors_dict.keys():
        tensors_dict = tensors_dict["default_tensor"]
    return tensors_dict

In [55]:
def load_gpt_model(path):
    gpt_model = {}
    folder_path = Path(path)
    for file in folder_path.iterdir():
        if file.is_file():  # Ensure it's a file and not a subfolder
            filename = Path(file.name).stem
            # print(f"Filename: {filename}")
            gpt_model[filename] = load_tensors(file)
            # print(f"Full Path: {file}")
    return gpt_model

In [56]:
gpt_model_c = load_gpt_model("/Users/uonliaquat/workspace/zerograd/models")
gpt_model_c["transformer_layer_0__self_attention_layer_heads_proj"]

{'Weights': tensor([[ 0.85,  0.90, -0.04,  ..., -0.58, -0.18,  0.63],
         [ 0.28, -0.13, -0.90,  ..., -0.52,  0.49,  0.62],
         [ 0.62,  0.66, -0.76,  ...,  0.83,  0.33, -0.73],
         ...,
         [ 0.94,  0.39,  0.47,  ...,  0.83,  0.61, -0.22],
         [-0.61,  0.08, -0.45,  ...,  0.30, -0.30, -0.89],
         [-0.06,  0.91, -0.10,  ..., -0.90,  0.63, -0.89]]),
 'Output': tensor([[[    -2.76,     -0.48,      5.78,      0.43,      5.98,      6.62,
               -6.77,      1.45,      6.02,     -8.07,      5.96,     14.18,
               -1.58,      3.50,      8.39,      1.56,      2.96,    -13.13,
               -3.33,      4.71,      5.05,      7.32,      7.52,      9.71,
                4.27,    -10.34,      2.62,      6.44,    -10.13,     -6.80,
               -1.95,    -16.84],
          [    -0.42,     -5.68,     -5.70,     10.29,     12.53,      4.34,
                4.55,      1.88,      2.43,     -8.14,     -1.08,      5.87,
               10.10,     -5.06,    

In [57]:
# base_path = '../models'
# gpt_model_c = {
#     "input_tokens":  load_tensors(f'{base_path}/input_tokens.csv')['default_tensor'].long(),
#     "token_embeddings_layer": load_tensors(f'{base_path}/gpt_model.token_embed_layer.csv'),
#     "pos_embeddings_layer": load_tensors(f'{base_path}/gpt_model.pos_embed_layer.csv'),
#     "position_indicies":  load_tensors(f'{base_path}/gpt_model.workspace.position_indicies.csv')['default_tensor'].long(),
#     "input_embeddings": load_tensors(f'{base_path}/gpt_model.workspace.input_embeddings.csv')['default_tensor']
# }
# gpt_model_c

In [58]:
def tensors_within_tolerance(a, b, atol):
    if a.shape != b.shape:
        print("Shape mismatch:", a.shape, b.shape)
        return False

    max_diff = round((a - b).abs().max().item(), 2)
    print("max |diff|:", max_diff)

    return max_diff <= atol

## GPT

In [59]:
for key in gpt_model_c.keys():
    print(key)

transformer_layer_0__self_attention_layer_attention_scores_0
gpt_model.pos_embed_layer
transformer_layer_0__self_attention_layer_attention_scores_1
input_tokens
transformer_layer_0__self_attention_layer_context_vecs_0
transformer_layer_0__self_attention_layer_w_key
transformer_layer_0__self_attention_layer_context_vecs_1
transformer_layer_0__self_attention_layer_w_query
gpt_model.token_embed_layer
transformer_layer_0__self_attention_layer_attention_scores_scaled_0
gpt_model.workspace.position_indices
transformer_layer_0__self_attention_layer_attention_scores_scaled_1
transformer_layer_0__self_attention_layer_keys_transposed_0
transformer_layer_0__self_attention_layer_queries_chunks_1
transformer_layer_0__self_attention_layer_queries_chunks_0
transformer_layer_0__self_attention_layer_keys_transposed_1
transformer_layer_0__self_attention_layer_w_value
transformer_layer_0__self_attention_layer_keys_chunks_0
transformer_layer_0__self_attention_layer_keys_chunks_1
transformer_layer_0__self_

In [60]:
def find_key(target_dict, substring):
    """
    Returns the first key that contains the substring.
    Returns None if no match is found.
    """
    return next((k for k in target_dict if substring in k), None)

# Usage:

In [61]:
key_transposed_global = None
class SelfAttentionMultiHead(nn.Module):
    def __init__(self, self_attention_layer_c, n_heads, atol):
        super().__init__()
        
        self.n_heads = n_heads
        self.atol = atol
        self.self_attention_layer_c = self_attention_layer_c
        
        W_query_weights =   self_attention_layer_c[find_key(self_attention_layer_c, 'w_query')]['Weights']
        W_key_weights =     self_attention_layer_c[find_key(self_attention_layer_c, 'w_key')]['Weights']
        W_value_weights =   self_attention_layer_c[find_key(self_attention_layer_c, 'w_value')]['Weights']
        head_proj_weights =   self_attention_layer_c[find_key(self_attention_layer_c, 'heads_proj')]['Weights']

        # print("W_query_weights\n ", W_query_weights)
        W_query_weights = W_query_weights.t()
        W_key_weights = W_key_weights.t()
        W_value_weights = W_value_weights.t()
        head_proj_weights = head_proj_weights.t()

        self.W_query = nn.Linear(W_query_weights.shape[0], W_query_weights.shape[1], bias=False)
        self.W_query.weight = nn.Parameter(W_query_weights)

        self.W_key = nn.Linear(W_key_weights.shape[0], W_key_weights.shape[1], bias=False)
        self.W_key.weight = nn.Parameter(W_key_weights)

        self.W_value = nn.Linear(W_value_weights.shape[0], W_value_weights.shape[1], bias=False)
        self.W_value.weight = nn.Parameter(W_value_weights)

        self.heads_proj = nn.Linear(head_proj_weights.shape[0], head_proj_weights.shape[1], bias=False)
        self.heads_proj.weight = nn.Parameter(head_proj_weights)

    
    def forward(self, x):
        # print(f"Input Embeddings: \nShape: {x.shape}\n{x}")
        # print(f"W_Query\nShape: {self.W_query.weight.shape}\n{self.W_query.weight}")
        # print(f"W_Key\nShape:   {self.W_key.weight.shape}\n{self.W_key.weight}")
        # print(f"W_Value\nShape: {self.W_value.weight.shape}\n{self.W_value.weight}")
        # print(f"heads_proj\nShape: {self.heads_proj.weight.shape}\n{self.heads_proj.weight}")
        print("===========================================")

        query = self.W_query(x)
        key = self.W_key(x)
        value = self.W_value(x)
        self.layer_name = 'self_attention_layer'
        query_matched = tensors_within_tolerance(query, self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_w_query')]['Output'], self.atol)
        key_matched = tensors_within_tolerance(key,  self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_w_key')]['Output'], self.atol)
        value_matched = tensors_within_tolerance(value,  self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_w_value')]['Output'], self.atol)

        print(f"query_matched:   {query_matched}")
        print(f"key_matched:     {key_matched}")
        print(f"value_matched:   {value_matched}")
        # print("Python", query)
        # print("C", self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_w_query')]['Output'])

        queries_chnuks  = torch.chunk(query, self.n_heads, -1)
        keys_chnuks     = torch.chunk(key , self.n_heads, -1)
        values_chnuks   = torch.chunk(value, self.n_heads, -1)

        for head in range(0, self.n_heads):
            #print(self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_queries_chunks_{head}')])
            #print(self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_queries_chunks_{head}')]['Output'].shape)
            query_chnuks_matched    = tensors_within_tolerance(queries_chnuks[head],  self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_queries_chunks_{head}')], self.atol)
            key_chnuks_matched      = tensors_within_tolerance(keys_chnuks[head],     self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_keys_chunks_{head}')], self.atol)
            value_chnuks_matched    = tensors_within_tolerance(values_chnuks[head],   self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_values_chunks_{head}')], self.atol)
            print(f"query_chnuks_matched:   {query_chnuks_matched}")
            print(f"key_chnuks_matched:     {key_chnuks_matched}")
            print(f"value_chnuks_matched:   {value_chnuks_matched}")

        context_vecs = []
        for head in range(0, self.n_heads):
            #print(self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_context_vecs_{head}')])
            
            print(f"=================================== HEAD {head} ===================================\n")
            key_transposed = keys_chnuks[head].transpose(1, 2)
            key_transposed_matched = tensors_within_tolerance(key_transposed, self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_keys_transposed_{head}')], self.atol)
            print(f"key_transposed_matched:   {key_transposed_matched}")

            attention_scores = queries_chnuks[head] @ key_transposed
            attention_scores_matched = tensors_within_tolerance(attention_scores, self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_attention_scores_{head}')], self.atol)
            print(f"attention_scores_matched:   {attention_scores_matched}")


            attention_scores_scaled = attention_scores * 1/math.sqrt(keys_chnuks[head].shape[1])
            attention_scores_scaled_matched = tensors_within_tolerance(attention_scores_scaled, self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_attention_scores_scaled_{head}')], self.atol)
            print(f"attention_scores_scaled_matched:   {attention_scores_scaled_matched}")

            attention_weights = F.softmax(attention_scores_scaled, dim=-1)
            attention_weights_matched = tensors_within_tolerance(attention_weights, self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_attention_weights_{head}')], self.atol)
            print(f"attention_weights_matched:   {attention_weights_matched}")

            context_vec = attention_weights @ values_chnuks[head]
            context_vec_matched = tensors_within_tolerance(context_vec, self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_context_vecs_{head}')], self.atol)
            print(f"context_vec_matched:   {context_vec_matched}")

            context_vecs.append(context_vec)
        #     print(f"=================================================================================\n")

        concat_heads = torch.cat(context_vecs, dim=-1)
        concat_heads_matched = tensors_within_tolerance(concat_heads, self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_concat_heads')], self.atol)
        print(f"concat_heads_matched:   {concat_heads_matched}")

        projected_context_vecs = self.heads_proj(concat_heads)
        projected_context_vecs_matched = tensors_within_tolerance(projected_context_vecs, self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_heads_proj')]['Output'], self.atol)
        print(f"projected_context_vecs_matched:   {projected_context_vecs_matched}")



In [62]:
class FeedForwardNetwork(nn.Module):
    def __init__(self, feed_forward_network_c):
        super().__init__()
        self.input = nn.Linear(2, 5)
        self.output = nn.Linear(5, 2)

    def forward(self, x):
        pass

In [63]:
class Transformer(nn.Module):
    def __init__(self, transformer_layer_c, n_heads, atol):
        super().__init__() 
        self.n_heads = n_heads
        self.atol = atol
        self_attention_layer_c = {k: v for k, v in transformer_layer_c.items() if "self_attention_layer" in k}
        feed_forward_network_c = {k: v for k, v in transformer_layer_c.items() if "feed_forward_network" in k}
        self.self_attention_multi_head  = SelfAttentionMultiHead(self_attention_layer_c, n_heads, atol)
        #self.feed_forward_network       = FeedForwardNetwork(feed_forward_network_c)
    
    def forward(self, x):
        x = self.self_attention_multi_head(x)
        #x = self.feed_forward_network(x)
        return x


In [64]:
class TransformerBlock(nn.Module):
    def __init__(self, transformer_block_c, n_layers, n_heads, atol):
        super().__init__()
        self.n_layers = n_layers
        self.atol = atol
        self.transformer_layers = {}
        for layer_no in range(0, n_layers):
            transformer_layer_c = {k: v for k, v in transformer_block_c.items() if k.startswith(f"transformer_layer_{layer_no}")}
            self.transformer_layers[layer_no] = Transformer(transformer_layer_c, n_heads, atol)

    def forward(self, x):
        for layer_no in range(0, self.n_layers):
            x = self.transformer_layers[layer_no](x)
        return x

In [65]:
class GPT(nn.Module):
    def __init__(self, gpt_model_c, n_layers, n_heads, atol):
        super().__init__()   
        self.atol = atol
           
        num_token_embeddings            = gpt_model_c['gpt_model.token_embed_layer']['Weights'].shape[0]
        token_embedding_dim             = gpt_model_c['gpt_model.token_embed_layer']['Weights'].shape[1]
        num_pos_embeddings              = gpt_model_c['gpt_model.pos_embed_layer']['Weights'].shape[0]
        pos_embedding_dim               = gpt_model_c['gpt_model.pos_embed_layer']['Weights'].shape[1]

        print(f"num_token_embeddings: {num_token_embeddings}")
        print(f"token_embedding_dim: {token_embedding_dim}")
        print(f"num_pos_embeddings: {num_pos_embeddings}")
        print(f"pos_embedding_dim: {pos_embedding_dim}")

        self.token_embeddings_layer     = nn.Embedding(num_embeddings=num_token_embeddings, embedding_dim=token_embedding_dim)
        self.pos_embeddings_layer       = nn.Embedding(num_embeddings=num_pos_embeddings, embedding_dim=pos_embedding_dim)

        self.token_embeddings_layer.weight.data.copy_(gpt_model_c['gpt_model.token_embed_layer']['Weights'])
        self.pos_embeddings_layer.weight.data.copy_(gpt_model_c['gpt_model.pos_embed_layer']['Weights'])
        
 
        assert(gpt_model_c['gpt_model.token_embed_layer']['Weights'].shape == self.token_embeddings_layer.weight.shape)
        assert(gpt_model_c['gpt_model.token_embed_layer']['Weights'].shape == self.pos_embeddings_layer.weight.shape)

        
        self.token_embeddings_c         = gpt_model_c['gpt_model.token_embed_layer']['Output']
        self.pos_embeddings_c           = gpt_model_c['gpt_model.pos_embed_layer']['Output']
        self.input_embeddings_c         = gpt_model_c['gpt_model.workspace.input_embeddings']
        self.position_indicies_c        = gpt_model_c['gpt_model.workspace.position_indices'].long()

        transformer_block_c = {k: v for k, v in gpt_model_c.items() if "transformer_layer" in k}
        self.transformer_block = TransformerBlock(transformer_block_c, n_layers, n_heads, self.atol)

    def forward(self, input_tokens_c):
        #print(f"x.shape:    {input_tokens_c.shape}")
        token_embeddings    = self.token_embeddings_layer(input_tokens_c)[0]
        print(self.position_indicies_c)
        pos_embeddings      = self.pos_embeddings_layer(self.position_indicies_c)[0]
        input_embeddings    = token_embeddings + pos_embeddings


        token_embeddings_matched    = tensors_within_tolerance(token_embeddings,    self.token_embeddings_c,    self.atol)
        pos_embeddings_matched      = tensors_within_tolerance(pos_embeddings,      self.pos_embeddings_c,      self.atol)
        input_embeddings_matched    = tensors_within_tolerance(input_embeddings,    self.input_embeddings_c,    self.atol)

        print(f"token_embeddings_matched:   {token_embeddings_matched}")
        print(f"pos_embeddings_matched:     {pos_embeddings_matched}")
        print(f"input_embeddings_matched:   {input_embeddings_matched}")


        # print("***********Token Embeddings***********\n")
        # print(self.token_embeddings_c[0], "\n")
        # print(token_embeddings[0], "\n\n")

        # print("***********Pose Embeddings***********\n")
        # print(self.pos_embeddings_c[0], "\n")
        # print(pos_embeddings[0], "\n\n")


        # print("***********Input Embeddings***********\n")
        # print(self.input_embeddings_c[0], "\n")
        # print(input_embeddings[0], "\n\n")

        # assert(
        #     token_embeddings_matched and 
        #     pos_embeddings_matched and 
        #     input_embeddings_matched
        # )
        
        contextual_embddings = self.transformer_block(input_embeddings)


        # print(token_embeddings - self.token_embed_layer_output_c)
        return contextual_embddings

In [67]:
atol = 0.00000001
gpt = GPT(gpt_model_c=gpt_model_c, n_layers=1, n_heads=2, atol=atol)
input_embeddings = gpt(gpt_model_c['input_tokens'].long())

num_token_embeddings: 6
token_embedding_dim: 32
num_pos_embeddings: 6
pos_embedding_dim: 32
tensor([[[0, 1, 2, 3, 4, 5],
         [0, 1, 2, 3, 4, 5]]])
max |diff|: 0.0
max |diff|: 0.0
max |diff|: 0.0
token_embeddings_matched:   True
pos_embeddings_matched:     True
input_embeddings_matched:   True
max |diff|: 0.0
max |diff|: 0.0
max |diff|: 0.0
query_matched:   True
key_matched:     True
value_matched:   True
max |diff|: 0.0
max |diff|: 0.0
max |diff|: 0.0
query_chnuks_matched:   True
key_chnuks_matched:     True
value_chnuks_matched:   True
max |diff|: 0.0
max |diff|: 0.0
max |diff|: 0.0
query_chnuks_matched:   True
key_chnuks_matched:     True
value_chnuks_matched:   True

max |diff|: 0.0
key_transposed_matched:   True
max |diff|: 0.0
attention_scores_matched:   True
max |diff|: 0.0
attention_scores_scaled_matched:   True
max |diff|: 0.0
attention_weights_matched:   True
max |diff|: 0.0
context_vec_matched:   True

max |diff|: 0.0
key_transposed_matched:   True
max |diff|: 0.0
attent

In [57]:
for key in gpt_model_c.keys():
    print(key)

transformer_layer_0__self_attention_layer_context_vecs_3
transformer_layer_0__self_attention_layer_attention_scores_0
gpt_model.pos_embed_layer
transformer_layer_0__self_attention_layer_attention_scores_1
transformer_layer_0__self_attention_layer_context_vecs_2
input_tokens
transformer_layer_0__self_attention_layer_context_vecs_0
transformer_layer_0__self_attention_layer_w_key
transformer_layer_0__self_attention_layer_attention_scores_3
transformer_layer_0__self_attention_layer_attention_scores_2
transformer_layer_0__self_attention_layer_context_vecs_1
transformer_layer_0__self_attention_layer_context_vecs_5
transformer_layer_0__self_attention_layer_values_chunks_9
transformer_layer_0__self_attention_layer_attention_scores_6
transformer_layer_0__self_attention_layer_attention_scores_7
transformer_layer_0__self_attention_layer_values_chunks_8
transformer_layer_0__self_attention_layer_context_vecs_4
transformer_layer_0__self_attention_layer_context_vecs_6
transformer_layer_0__self_attent

In [595]:
key_transposed_global.shape

AttributeError: 'NoneType' object has no attribute 'shape'

In [333]:
# class SelfAttention:
#     def __init__(self, self_attention_layer_c):
#         W_query_weights =   self_attention_layer_c['W_Query']
#         W_key_weights =     self_attention_layer_c['W_Key']
#         W_value_weights =   self_attention_layer_c['W_Value']

#         W_query_weights = W_query_weights.t()
#         W_key_weights = W_key_weights.t()
#         W_value_weights = W_value_weights.t()

#         self.W_query = nn.Linear(W_query_weights.shape[0], W_query_weights.shape[1], bias=False)
#         self.W_query.weight = nn.Parameter(W_query_weights)

#         self.W_key = nn.Linear(W_key_weights.shape[0], W_key_weights.shape[1], bias=False)
#         self.W_key.weight = nn.Parameter(W_key_weights)

#         self.W_value = nn.Linear(W_value_weights.shape[0], W_value_weights.shape[1], bias=False)
#         self.W_value.weight = nn.Parameter(W_value_weights)
    
#     def forward(self, x):
#         print(f"Input Embeddings: \nShape: {x.shape}\n{x}")
#         print(f"W_Query\nShape: {self.W_query.weight.shape}\n{self.W_query.weight}")
#         print(f"W_Key\nShape:   {self.W_key.weight.shape}\n{self.W_key.weight}")
#         print(f"W_Value\nShape: {self.W_value.weight.shape}\n{self.W_value.weight}")
#         print("===========================================")

#         query = self.W_query(x)
#         key = self.W_key(x)
#         value = self.W_value(x)
        

#         # print(f"Query\nShape: {query.shape}\n{query}")
#         # print(f"Key\nShape:   {key.shape}\n{key}")
#         # print(f"Value\nShape: {value.shape}\n{value}")
#         # print("===========================================")

#         key_transposed = key.t()
#         print(f"Key transposed\nShape: {key_transposed.shape}\n{key_transposed}")

#         attention_scores = query @ key_transposed
#         print(f"Attention Scores\nShape: {attention_scores.shape}\n{attention_scores}")

#         attention_scores_scaled = attention_scores * 1/math.sqrt(key.shape[1])
#         print(f"Attention Scores Scaled\nShape: {attention_scores_scaled.shape}\n{attention_scores_scaled}")

#         attention_weights = F.softmax(attention_scores_scaled, dim=1)
#         print(f"Attention Weights\nShape: {attention_weights.shape}\n{attention_weights}")

#         context_vecs = attention_weights @ value
#         print(f"Context Vecs\nShape: {context_vecs.shape}\n{context_vecs}")





In [332]:
# self_attention = SelfAttention(self_attention_layer_c)

# self_attention.forward(input_embeddings_c)

## Safe Tensors

In [547]:
from safetensors import safe_open

path = "/Users/uonliaquat/Downloads/model.safetensors"

with safe_open(path, framework="pt", device="cpu") as f:
    keys = list(f.keys())

for key in keys:
    print(key)

h.0.attn.bias
h.0.attn.c_attn.bias
h.0.attn.c_attn.weight
h.0.attn.c_proj.bias
h.0.attn.c_proj.weight
h.0.ln_1.bias
h.0.ln_1.weight
h.0.ln_2.bias
h.0.ln_2.weight
h.0.mlp.c_fc.bias
h.0.mlp.c_fc.weight
h.0.mlp.c_proj.bias
h.0.mlp.c_proj.weight
h.1.attn.bias
h.1.attn.c_attn.bias
h.1.attn.c_attn.weight
h.1.attn.c_proj.bias
h.1.attn.c_proj.weight
h.1.ln_1.bias
h.1.ln_1.weight
h.1.ln_2.bias
h.1.ln_2.weight
h.1.mlp.c_fc.bias
h.1.mlp.c_fc.weight
h.1.mlp.c_proj.bias
h.1.mlp.c_proj.weight
h.10.attn.bias
h.10.attn.c_attn.bias
h.10.attn.c_attn.weight
h.10.attn.c_proj.bias
h.10.attn.c_proj.weight
h.10.ln_1.bias
h.10.ln_1.weight
h.10.ln_2.bias
h.10.ln_2.weight
h.10.mlp.c_fc.bias
h.10.mlp.c_fc.weight
h.10.mlp.c_proj.bias
h.10.mlp.c_proj.weight
h.11.attn.bias
h.11.attn.c_attn.bias
h.11.attn.c_attn.weight
h.11.attn.c_proj.bias
h.11.attn.c_proj.weight
h.11.ln_1.bias
h.11.ln_1.weight
h.11.ln_2.bias
h.11.ln_2.weight
h.11.mlp.c_fc.bias
h.11.mlp.c_fc.weight
h.11.mlp.c_proj.bias
h.11.mlp.c_proj.weight
h.2.at

In [546]:
import torch
from transformers import GPT2Model

# Load GPT-2 small
model = GPT2Model.from_pretrained("gpt2")
model.eval()

# Dictionary to store all activations
activations = {}

# Generic hook for all modules
def register_all_hooks(model):
    """
    Registers a forward hook for every submodule in the model.
    Stores the output of each module in the activations dict.
    """
    def make_hook(name):
        def hook(module, inputs, output):
            # Only store tensors (or tuples of tensors)
            if torch.is_tensor(output):
                activations[name] = output.detach().cpu()
            elif isinstance(output, (tuple, list)):
                # handle modules that return tuples
                activations[name] = tuple(
                    o.detach().cpu() if torch.is_tensor(o) else o
                    for o in output
                )
        return hook

    for name, module in model.named_modules():
        module.register_forward_hook(make_hook(name))

# Register hooks on all modules
register_all_hooks(model)

# Example input
input_ids = torch.tensor([[464, 318, 257, 1332]])  # "Once upon a time"

# Forward pass
with torch.no_grad():
    model(input_ids)

# Check all stored activations
for k in activations:
    print(f"{k}: {activations[k].shape if torch.is_tensor(activations[k]) else 'tuple'}")

wte: torch.Size([1, 4, 768])
wpe: torch.Size([1, 4, 768])
drop: torch.Size([1, 4, 768])
h.0.ln_1: torch.Size([1, 4, 768])
h.0.attn.c_attn: torch.Size([1, 4, 2304])
h.0.attn.c_proj: torch.Size([1, 4, 768])
h.0.attn.resid_dropout: torch.Size([1, 4, 768])
h.0.attn: tuple
h.0.ln_2: torch.Size([1, 4, 768])
h.0.mlp.c_fc: torch.Size([1, 4, 3072])
h.0.mlp.act: torch.Size([1, 4, 3072])
h.0.mlp.c_proj: torch.Size([1, 4, 768])
h.0.mlp.dropout: torch.Size([1, 4, 768])
h.0.mlp: torch.Size([1, 4, 768])
h.0: tuple
h.1.ln_1: torch.Size([1, 4, 768])
h.1.attn.c_attn: torch.Size([1, 4, 2304])
h.1.attn.c_proj: torch.Size([1, 4, 768])
h.1.attn.resid_dropout: torch.Size([1, 4, 768])
h.1.attn: tuple
h.1.ln_2: torch.Size([1, 4, 768])
h.1.mlp.c_fc: torch.Size([1, 4, 3072])
h.1.mlp.act: torch.Size([1, 4, 3072])
h.1.mlp.c_proj: torch.Size([1, 4, 768])
h.1.mlp.dropout: torch.Size([1, 4, 768])
h.1.mlp: torch.Size([1, 4, 768])
h.1: tuple
h.2.ln_1: torch.Size([1, 4, 768])
h.2.attn.c_attn: torch.Size([1, 4, 2304])
h.

In [563]:
import torch
from transformers import GPT2Model

model = GPT2Model.from_pretrained("gpt2")
model.eval()

activations = {}

def register_all_hooks(model):
    def make_hook(name):
        def hook(module, inputs, output):
            if torch.is_tensor(output):
                activations[name] = output.detach().cpu()
            elif isinstance(output, (tuple, list)):
                # handle modules that return tuples (e.g., attention)
                activations[name] = tuple(
                    o.detach().cpu() if torch.is_tensor(o) else o
                    for o in output
                )
        return hook

    for name, module in model.named_modules():
        module.register_forward_hook(make_hook(name))

register_all_hooks(model)

# Example run
input_ids = torch.tensor([[464, 318, 257, 1332]])  # "Once upon a time"
with torch.no_grad():
    model(input_ids)

In [564]:
for key in activations.keys():
    print(key)

wte
wpe
drop
h.0.ln_1
h.0.attn.c_attn
h.0.attn.c_proj
h.0.attn.resid_dropout
h.0.attn
h.0.ln_2
h.0.mlp.c_fc
h.0.mlp.act
h.0.mlp.c_proj
h.0.mlp.dropout
h.0.mlp
h.0
h.1.ln_1
h.1.attn.c_attn
h.1.attn.c_proj
h.1.attn.resid_dropout
h.1.attn
h.1.ln_2
h.1.mlp.c_fc
h.1.mlp.act
h.1.mlp.c_proj
h.1.mlp.dropout
h.1.mlp
h.1
h.2.ln_1
h.2.attn.c_attn
h.2.attn.c_proj
h.2.attn.resid_dropout
h.2.attn
h.2.ln_2
h.2.mlp.c_fc
h.2.mlp.act
h.2.mlp.c_proj
h.2.mlp.dropout
h.2.mlp
h.2
h.3.ln_1
h.3.attn.c_attn
h.3.attn.c_proj
h.3.attn.resid_dropout
h.3.attn
h.3.ln_2
h.3.mlp.c_fc
h.3.mlp.act
h.3.mlp.c_proj
h.3.mlp.dropout
h.3.mlp
h.3
h.4.ln_1
h.4.attn.c_attn
h.4.attn.c_proj
h.4.attn.resid_dropout
h.4.attn
h.4.ln_2
h.4.mlp.c_fc
h.4.mlp.act
h.4.mlp.c_proj
h.4.mlp.dropout
h.4.mlp
h.4
h.5.ln_1
h.5.attn.c_attn
h.5.attn.c_proj
h.5.attn.resid_dropout
h.5.attn
h.5.ln_2
h.5.mlp.c_fc
h.5.mlp.act
h.5.mlp.c_proj
h.5.mlp.dropout
h.5.mlp
h.5
h.6.ln_1
h.6.attn.c_attn
h.6.attn.c_proj
h.6.attn.resid_dropout
h.6.attn
h.6.ln_2
h.6.m

In [565]:
activations["h.0.ln_1"].shape

torch.Size([1, 4, 768])

In [737]:
from safetensors import safe_open

path = "/Users/uonliaquat/Downloads/model.safetensors"

with safe_open(path, framework="pt", device="cpu") as f:
    h0_keys = [k for k in f.keys() if k.startswith("h.0")]

for key in h0_keys:
    print(key)

h.0.attn.bias
h.0.attn.c_attn.bias
h.0.attn.c_attn.weight
h.0.attn.c_proj.bias
h.0.attn.c_proj.weight
h.0.ln_1.bias
h.0.ln_1.weight
h.0.ln_2.bias
h.0.ln_2.weight
h.0.mlp.c_fc.bias
h.0.mlp.c_fc.weight
h.0.mlp.c_proj.bias
h.0.mlp.c_proj.weight


In [1276]:
path = "/Users/uonliaquat/Downloads/model.safetensors"

with safe_open(path, framework="pt", device="cpu") as f:
    # ln_weight = f.get_tensor("h.0.ln_1.weight")
    # ln_bias = f.get_tensor("h.0.ln_1.bias")
    h0_ln1_weight = f.get_tensor("h.0.ln_1.weight")
    h0_ln1_bias = f.get_tensor("h.0.ln_1.bias")


h0_ln1_weight.shape, h0_ln1_bias.shape,

(torch.Size([768]), torch.Size([768]))

## Read C Model

In [1391]:
from safetensors import safe_open

filename = "/Users/uonliaquat/workspace/zerograd/c_model.safetensors"

gpt_c = {}

with safe_open(filename, framework="pt", device="cpu") as f:
    for key in f.keys():
        gpt_c[key] = f.get_tensor(key)

# Inspect
print(len(gpt_c))
print(list(gpt_c.keys())[:10])

148
['gpt.embeddings.0', 'gpt.h.0.attn.0.attention_scores', 'gpt.h.0.attn.0.attention_scores_scaled', 'gpt.h.0.attn.0.attention_weights', 'gpt.h.0.attn.0.context_vecs', 'gpt.h.0.attn.0.key_transposed', 'gpt.h.0.attn.0.output', 'gpt.h.0.attn.1.attention_scores', 'gpt.h.0.attn.1.attention_scores_scaled', 'gpt.h.0.attn.1.attention_weights']


## Python GPT

In [1439]:
import math
import torch
import torch.nn as nn
from transformers import GPT2Model


class GPT2_Full_Debug(nn.Module):
    def __init__(self, model_name="gpt2", device="cpu", dtype=torch.float32):
        super().__init__()

        gpt2 = GPT2Model.from_pretrained(model_name)
        gpt2.eval()

        # ============================================================
        # Constants
        # ============================================================
        self.n_heads = gpt2.config.n_head      # 12
        self.hidden_size = gpt2.config.n_embd  # 768
        self.head_dim = self.hidden_size // self.n_heads
        self.vocab_size = gpt2.config.vocab_size

        # ============================================================
        # Embeddings
        # ============================================================
        self.wte = nn.Embedding.from_pretrained(gpt2.wte.weight.detach().to(dtype), freeze=True)
        self.wpe = nn.Embedding.from_pretrained(gpt2.wpe.weight.detach().to(dtype), freeze=True)

        # ============================================================
        # First Transformer Block
        # ============================================================
        block0 = gpt2.h[0]

        # ---- LN1 ----
        self.ln_1 = nn.LayerNorm(self.hidden_size, eps=block0.ln_1.eps, elementwise_affine=True)
        self.ln_1.weight.data.copy_(block0.ln_1.weight)
        self.ln_1.bias.data.copy_(block0.ln_1.bias)

        # ---- Attention QKV ----
        self.c_attn = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=True)
        self.c_attn.weight.data.copy_(block0.attn.c_attn.weight.T)
        self.c_attn.bias.data.copy_(block0.attn.c_attn.bias)

        # ---- Attention output proj ----
        self.c_proj_attn = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
        self.c_proj_attn.weight.data.copy_(block0.attn.c_proj.weight.T)
        self.c_proj_attn.bias.data.copy_(block0.attn.c_proj.bias)

        # ---- LN2 ----
        self.ln_2 = nn.LayerNorm(self.hidden_size, eps=block0.ln_2.eps, elementwise_affine=True)
        self.ln_2.weight.data.copy_(block0.ln_2.weight)
        self.ln_2.bias.data.copy_(block0.ln_2.bias)

        # ---- MLP ----
        self.c_fc = nn.Linear(self.hidden_size, 4 * self.hidden_size, bias=True)
        self.c_proj_mlp = nn.Linear(4 * self.hidden_size, self.hidden_size, bias=True)
        self.c_fc.weight.data.copy_(block0.mlp.c_fc.weight.T)
        self.c_fc.bias.data.copy_(block0.mlp.c_fc.bias)
        self.c_proj_mlp.weight.data.copy_(block0.mlp.c_proj.weight.T)
        self.c_proj_mlp.bias.data.copy_(block0.mlp.c_proj.bias)

        # ============================================================
        # Final GPT-2 LayerNorm
        # ============================================================
        self.ln_f = nn.LayerNorm(self.hidden_size, eps=gpt2.ln_f.eps, elementwise_affine=True)
        self.ln_f.weight.data.copy_(gpt2.ln_f.weight)
        self.ln_f.bias.data.copy_(gpt2.ln_f.bias)

        # ============================================================
        # LM Head (tied weights)
        # ============================================================
        self.lm_head = nn.Linear(self.hidden_size, self.vocab_size, bias=False)
        self.lm_head.weight.data.copy_(self.wte.weight)  # weight tying

        self.to(device=device, dtype=dtype)

    # ============================================================
    # Forward with FULL DEBUG TRACE
    # ============================================================
    def forward(self, input_ids):
        bsz, seq_len = input_ids.shape
        device = input_ids.device
        out = {}

        # ============================================================
        # Embeddings
        # ============================================================
        pos_ids = torch.arange(seq_len, device=device).unsqueeze(0)
        tok_emb = self.wte(input_ids)
        pos_emb = self.wpe(pos_ids)
        x = tok_emb + pos_emb

        out["token_emb"] = tok_emb
        out["pos_emb"] = pos_emb
        out["embeddings"] = x

        # ============================================================
        # LN 1
        # ============================================================
        x_ln1 = self.ln_1(x)
        out["ln_1"] = x_ln1

        # ============================================================
        # QKV Projection
        # ============================================================
        qkv = self.c_attn(x_ln1)
        q, k, v = qkv.split(self.hidden_size, dim=2)

        out["qkv"] = qkv
        out["q"] = q
        out["k"] = k
        out["v"] = v

        # ============================================================
        # Split Heads
        # ============================================================
        def split_heads(x):
            return x.view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2)

        qh = split_heads(q)
        kh = split_heads(k)
        vh = split_heads(v)

        out["q_heads"] = qh
        out["k_heads"] = kh
        out["v_heads"] = vh

        # ============================================================
        # K^T
        # ============================================================
        kh_t = kh.transpose(-2, -1)
        out["k_transpose"] = kh_t

        # ============================================================
        # QK^T
        # ============================================================
        qk = torch.matmul(qh, kh_t)
        out["qk"] = qk

        # ============================================================
        # Scale
        # ============================================================
        qk_scaled = qk * (1.0 / math.sqrt(self.head_dim))
        out["qk_scaled"] = qk_scaled

        # ============================================================
        # Causal Mask
        # ============================================================
        causal_mask = torch.tril(torch.ones(seq_len, seq_len, device=device)).view(1, 1, seq_len, seq_len)
        qk_masked = qk_scaled.masked_fill(causal_mask == 0, float("-inf"))
        out["qk_masked"] = qk_masked

        # ============================================================
        # Softmax
        # ============================================================
        attn_probs = torch.softmax(qk_masked, dim=-1)
        out["attn_probs"] = attn_probs

        # ============================================================
        # Attention Output per head
        # ============================================================
        attn_ctx_heads = torch.matmul(attn_probs, vh)
        out["attn_ctx_heads"] = attn_ctx_heads

        # ============================================================
        # Merge Heads
        # ============================================================
        attn_ctx = attn_ctx_heads.transpose(1, 2).contiguous()
        attn_ctx = attn_ctx.view(bsz, seq_len, self.hidden_size)
        out["attn_ctx"] = attn_ctx

        # ============================================================
        # Attention Projection + Residual
        # ============================================================
        attn_out = self.c_proj_attn(attn_ctx)
        out["attn_out"] = attn_out

        x_resid1 = x + attn_out
        out["resid_1"] = x_resid1

        # ============================================================
        # LN 2
        # ============================================================
        x_ln2 = self.ln_2(x_resid1)
        out["ln_2"] = x_ln2

        # ============================================================
        # MLP
        # ============================================================
        mlp_fc = self.c_fc(x_ln2)
        mlp_gelu = torch.nn.functional.gelu(mlp_fc)
        mlp_out = self.c_proj_mlp(mlp_gelu)

        out["mlp_fc"] = mlp_fc
        out["mlp_gelu"] = mlp_gelu
        out["mlp_out"] = mlp_out

        # ============================================================
        # Block Residual
        # ============================================================
        x_block = x_resid1 + mlp_out
        out["block_output"] = x_block

        # ============================================================
        # FINAL GPT-2 LayerNorm
        # ============================================================
        x_final_ln = self.ln_f(x_block)
        out["ln_final"] = x_final_ln

        # ============================================================
        # LM HEAD
        # ============================================================
        logits = self.lm_head(x_final_ln)
        probs = torch.softmax(logits, dim=-1)

        out["logits"] = logits
        out["probs"] = probs

        out["lm_head.weight"] = self.lm_head.weight

        last_token_probs = probs[:, -1, :]  # [bsz, vocab_size]

        # Option 1: Greedy (argmax)
        next_token_id = torch.argmax(last_token_probs, dim=-1)  # [bsz]

        out["next_token_id"] = next_token_id

        return out

In [1440]:
# Create embedding module
gpt2_p = GPT2_Full_Debug().eval()

# Example input
input_ids = torch.tensor([[7454, 2402, 257, 640]])

with torch.no_grad():
    out = gpt2_p(input_ids)

# print(out["embeddings"].shape)  # (1, 4, 768)
# print(out["ln_1"].shape)        # (1, 4, 768)
# print(out["c_attn"].shape)      # (1, 4, 2304)

out

{'token_emb': tensor([[[-0.05,  0.02,  0.08,  ...,  0.27,  0.00,  0.09],
          [-0.03,  0.00,  0.01,  ..., -0.12, -0.05,  0.07],
          [-0.05,  0.01,  0.05,  ...,  0.04,  0.07, -0.04],
          [ 0.15,  0.02,  0.03,  ...,  0.01, -0.16, -0.12]]],
        dtype=torch.float32),
 'pos_emb': tensor([[[    -0.02,     -0.20,      0.00,  ...,     -0.04,      0.03,
                0.05],
          [     0.02,     -0.05,     -0.09,  ...,      0.03,      0.01,
               -0.00],
          [     0.00,     -0.08,      0.05,  ...,      0.02,      0.02,
               -0.02],
          [    -0.00,     -0.07,      0.11,  ...,      0.01,      0.02,
               -0.01]]], dtype=torch.float32),
 'embeddings': tensor([[[-0.07, -0.18,  0.08,  ...,  0.23,  0.03,  0.15],
          [-0.01, -0.05, -0.09,  ..., -0.09, -0.04,  0.07],
          [-0.05, -0.08,  0.10,  ...,  0.06,  0.09, -0.06],
          [ 0.15, -0.06,  0.14,  ...,  0.02, -0.14, -0.13]]],
        dtype=torch.float32),
 'ln_1': tenso

In [1441]:
out["next_token_id"]

tensor([640])

In [1400]:
gpt_c['head.weight'].shape, out["lm_head.weight"].shape

(torch.Size([768, 50257]), torch.Size([50257, 768]))

In [1401]:
torch.max(gpt_c['head.weight'] - out["lm_head.weight"].T)

tensor(0., dtype=torch.float32, grad_fn=<MaxBackward1>)

In [1404]:
torch.max(out["embeddings"] - gpt_c['gpt.embeddings.0'])

tensor(0., dtype=torch.float32)

In [1405]:
torch.max(out["ln_1"] - gpt_c['gpt.h.0.ln_0.output'])

tensor(    0.00, dtype=torch.float32)

In [1406]:
torch.max(out["qkv"] - gpt_c['gpt.h.0.c_attn.output'])

tensor(    0.00, dtype=torch.float32)

In [1407]:
torch.max(out['q'] - gpt_c["gpt.h.0.q"])

tensor(    0.00, dtype=torch.float32)

In [1408]:
torch.max(out['k'] - gpt_c["gpt.h.0.k"])

tensor(    0.00, dtype=torch.float32)

In [1409]:
torch.max(out['v'] - gpt_c["gpt.h.0.v"])

tensor(    0.00, dtype=torch.float32)

In [1410]:
for h in range(12):
    head = out['k_heads'][:, h, :, :]
    print(h, head.shape, gpt_c[f'gpt.h.0.k_head.{h}'].shape)
    print(torch.max(head - gpt_c[f'gpt.h.0.k_head.{h}']))

0 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
1 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
2 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
3 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
4 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
5 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
6 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
7 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
8 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
9 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
10 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
11 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tens

In [1411]:
for h in range(12):
    head = out['q_heads'][:, h, :, :]
    print(h, head.shape, gpt_c[f'gpt.h.0.q_head.{h}'].shape)
    print(torch.max(head - gpt_c[f'gpt.h.0.q_head.{h}']))

0 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
1 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
2 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
3 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
4 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
5 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
6 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
7 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
8 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
9 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
10 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
11 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tens

In [1412]:
for h in range(12):
    head = out['v_heads'][:, h, :, :]
    print(h, head.shape, gpt_c[f'gpt.h.0.v_head.{h}'].shape)
    print(torch.max(head - gpt_c[f'gpt.h.0.v_head.{h}']))

0 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
1 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
2 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
3 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
4 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
5 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
6 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
7 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
8 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
9 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
10 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
11 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tens

# K^T

In [1413]:
for h in range(12):
    k_transpose = out['k_transpose'][:, h, :, :]
    print(h, k_transpose.shape, gpt_c[f'gpt.h.0.attn.{h}.key_transposed'].shape)
    print(torch.max(k_transpose - gpt_c[f'gpt.h.0.attn.{h}.key_transposed']))


0 torch.Size([1, 64, 4]) torch.Size([1, 64, 4])
tensor(    0.00, dtype=torch.float32)
1 torch.Size([1, 64, 4]) torch.Size([1, 64, 4])
tensor(    0.00, dtype=torch.float32)
2 torch.Size([1, 64, 4]) torch.Size([1, 64, 4])
tensor(    0.00, dtype=torch.float32)
3 torch.Size([1, 64, 4]) torch.Size([1, 64, 4])
tensor(    0.00, dtype=torch.float32)
4 torch.Size([1, 64, 4]) torch.Size([1, 64, 4])
tensor(    0.00, dtype=torch.float32)
5 torch.Size([1, 64, 4]) torch.Size([1, 64, 4])
tensor(    0.00, dtype=torch.float32)
6 torch.Size([1, 64, 4]) torch.Size([1, 64, 4])
tensor(    0.00, dtype=torch.float32)
7 torch.Size([1, 64, 4]) torch.Size([1, 64, 4])
tensor(    0.00, dtype=torch.float32)
8 torch.Size([1, 64, 4]) torch.Size([1, 64, 4])
tensor(    0.00, dtype=torch.float32)
9 torch.Size([1, 64, 4]) torch.Size([1, 64, 4])
tensor(    0.00, dtype=torch.float32)
10 torch.Size([1, 64, 4]) torch.Size([1, 64, 4])
tensor(    0.00, dtype=torch.float32)
11 torch.Size([1, 64, 4]) torch.Size([1, 64, 4])
tens

# Q.K^T

In [1414]:
for h in range(12):
    qk = out['qk'][:, h, :, :]
    print(h, qk.shape, gpt_c[f'gpt.h.0.attn.{h}.attention_scores'].shape)
    print(torch.max(qk - gpt_c[f'gpt.h.0.attn.{h}.attention_scores']))


0 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
1 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
2 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
3 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
4 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
5 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
6 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
7 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
8 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
9 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
10 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
11 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch

## (Q.K^T)/sqrt(embd)

In [1415]:
for h in range(12):
    qk_masked = out['qk_masked'][:, h, :, :]
    print(h, qk_masked.shape, gpt_c[f'gpt.h.0.attn.{h}.attention_scores_scaled'].shape)
    print(torch.max(qk_masked - gpt_c[f'gpt.h.0.attn.{h}.attention_scores_scaled']))
    # print(qk_scaled)
    # print(gpt_c[f'gpt.h.0.attn.{h}.attention_scores_scaled'])


0 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
1 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
2 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
3 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
4 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
5 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
6 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(0., dtype=torch.float32)
7 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
8 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
9 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
10 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
11 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float

## softmaxx((Q.K^T)/sqrt(embd))

In [1416]:
for h in range(12):
    attn_probs = out['attn_probs'][:, h, :, :]
    print(h, attn_probs.shape, gpt_c[f'gpt.h.0.attn.{h}.attention_weights'].shape)
    print(torch.max(attn_probs - gpt_c[f'gpt.h.0.attn.{h}.attention_weights']))
    # print(qk_scaled)
    # print(gpt_c[f'gpt.h.0.attn.{h}.attention_scores_scaled'])


0 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
1 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
2 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
3 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
4 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
5 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
6 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
7 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
8 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
9 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
10 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch.float32)
11 torch.Size([1, 4, 4]) torch.Size([1, 4, 4])
tensor(    0.00, dtype=torch

## softmax((Q.K^T)/sqrt(embd)).V

In [1417]:
for h in range(12):
    attn_ctx_heads = out['attn_ctx_heads'][:, h, :, :]
    print(h, attn_ctx_heads.shape, gpt_c[f'gpt.h.0.attn.{h}.context_vecs'].shape)
    print(torch.max(attn_ctx_heads - gpt_c[f'gpt.h.0.attn.{h}.context_vecs']))
    # print(qk_scaled)
    # print(gpt_c[f'gpt.h.0.attn.{h}.attention_scores_scaled'])


0 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
1 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
2 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
3 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
4 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
5 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
6 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
7 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
8 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
9 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
10 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tensor(    0.00, dtype=torch.float32)
11 torch.Size([1, 4, 64]) torch.Size([1, 4, 64])
tens

In [1418]:
attn_ctx = out['attn_ctx']
print(h, attn_ctx.shape, gpt_c[f'gpt.h.0.context_vec'].shape)
print(torch.max(attn_ctx - gpt_c[f'gpt.h.0.context_vec']))

11 torch.Size([1, 4, 768]) torch.Size([1, 4, 768])
tensor(    0.00, dtype=torch.float32)


In [1419]:
attn_out = out['attn_out']
print(h, attn_out.shape, gpt_c[f'gpt.h.0.c_proj.output'].shape)
print(torch.max(attn_out - gpt_c[f'gpt.h.0.c_proj.output']))

11 torch.Size([1, 4, 768]) torch.Size([1, 4, 768])
tensor(    0.00, dtype=torch.float32)


In [1420]:
torch.max(out['resid_1'] - gpt_c["gpt.h.0.res_out.0"])

tensor(    0.00, dtype=torch.float32)

In [1421]:
torch.max(out['resid_1'] - gpt_c["gpt.h.0.res_out.0"])

tensor(    0.00, dtype=torch.float32)

In [1422]:
torch.max(out['ln_2'] - gpt_c["gpt.h.0.ln_1.output"])

tensor(    0.00, dtype=torch.float32)

In [1423]:
torch.max(out['mlp_fc'] - gpt_c["gpt.h.0.mlp.c_fc.output"])

tensor(    0.00, dtype=torch.float32)

In [1424]:
torch.max(out['mlp_gelu'] - gpt_c["gpt.h.0.mlp.gelu"])

tensor(    0.00, dtype=torch.float32)

In [1425]:
torch.max(out['mlp_out'] - gpt_c["gpt.h.0.mlp.output"])

tensor(    0.00, dtype=torch.float32)

In [1426]:
torch.max(out['block_output'] - gpt_c["gpt.h.0.res_out.1"])

tensor(    0.00, dtype=torch.float32)

In [1427]:
torch.max(out['ln_final'] - gpt_c["gpt.ln_f.output"])

tensor(    0.00, dtype=torch.float32)

In [1432]:
torch.max(out['logits'] - gpt_c["gpt.head.output"])

tensor(    0.00, dtype=torch.float32)

In [1438]:
torch.max(out['probs'] - gpt_c["gpt.output"])

tensor(    -0.00, dtype=torch.float32)

In [1355]:
gpt_c["gpt.out_proj.output"]

tensor([[[-0.03, -0.03, -0.03,  ...,  0.00,  0.00,  0.00],
         [ 0.05,  0.05,  0.05,  ...,  0.00,  0.00,  0.00],
         [ 0.10,  0.10,  0.10,  ...,  0.00,  0.00,  0.00],
         [-0.01, -0.01, -0.01,  ...,  0.00,  0.00,  0.00]]],
       dtype=torch.float32)

In [1376]:
list(out.keys())

['token_emb',
 'pos_emb',
 'embeddings',
 'ln_1',
 'qkv',
 'q',
 'k',
 'v',
 'q_heads',
 'k_heads',
 'v_heads',
 'k_transpose',
 'qk',
 'qk_scaled',
 'qk_masked',
 'attn_probs',
 'attn_ctx_heads',
 'attn_ctx',
 'attn_out',
 'resid_1',
 'ln_2',
 'mlp_fc',
 'mlp_gelu',
 'mlp_out',
 'block_output',
 'ln_final',
 'logits',
 'probs',
 'lm_head.weight']

In [1431]:
gpt_c["gpt.head.output"]

tensor([[[ -0.74,  -1.82,  -5.20,  ..., -10.75,  -7.67,  -2.77],
         [ -9.92, -12.56, -16.40,  ..., -19.87, -12.17, -12.26],
         [-15.63, -11.38, -16.86,  ..., -22.26, -17.58, -14.94],
         [ -8.44,  -8.45, -14.33,  ..., -16.61, -18.35, -10.95]]],
       dtype=torch.float32)

In [1430]:
list(gpt_c.keys())

['gpt.embeddings.0',
 'gpt.h.0.attn.0.attention_scores',
 'gpt.h.0.attn.0.attention_scores_scaled',
 'gpt.h.0.attn.0.attention_weights',
 'gpt.h.0.attn.0.context_vecs',
 'gpt.h.0.attn.0.key_transposed',
 'gpt.h.0.attn.0.output',
 'gpt.h.0.attn.1.attention_scores',
 'gpt.h.0.attn.1.attention_scores_scaled',
 'gpt.h.0.attn.1.attention_weights',
 'gpt.h.0.attn.1.context_vecs',
 'gpt.h.0.attn.1.key_transposed',
 'gpt.h.0.attn.1.output',
 'gpt.h.0.attn.10.attention_scores',
 'gpt.h.0.attn.10.attention_scores_scaled',
 'gpt.h.0.attn.10.attention_weights',
 'gpt.h.0.attn.10.context_vecs',
 'gpt.h.0.attn.10.key_transposed',
 'gpt.h.0.attn.10.output',
 'gpt.h.0.attn.11.attention_scores',
 'gpt.h.0.attn.11.attention_scores_scaled',
 'gpt.h.0.attn.11.attention_weights',
 'gpt.h.0.attn.11.context_vecs',
 'gpt.h.0.attn.11.key_transposed',
 'gpt.h.0.attn.11.output',
 'gpt.h.0.attn.2.attention_scores',
 'gpt.h.0.attn.2.attention_scores_scaled',
 'gpt.h.0.attn.2.attention_weights',
 'gpt.h.0.attn.2.co

In [1442]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

paragraph = (
    "In recent years, artificial intelligence has become an important tool "
    "in many industries, helping researchers analyze data, automate tasks, "
    "and improve decision making. Machine learning models are trained on "
    "large datasets and can generate text, recognize patterns, and predict "
    "outcomes based on previous examples."
)

# Encode
token_ids = tokenizer.encode(paragraph)

print("Number of tokens:", len(token_ids))
print("Token IDs:")
print(token_ids)

# Optional: decode back to verify
decoded = tokenizer.decode(token_ids)
print("\nDecoded text:")
print(decoded)

Number of tokens: 52
Token IDs:
[818, 2274, 812, 11, 11666, 4430, 468, 1716, 281, 1593, 2891, 287, 867, 11798, 11, 5742, 4837, 16602, 1366, 11, 43511, 8861, 11, 290, 2987, 2551, 1642, 13, 10850, 4673, 4981, 389, 8776, 319, 1588, 40522, 290, 460, 7716, 2420, 11, 7564, 7572, 11, 290, 4331, 10906, 1912, 319, 2180, 6096, 13]

Decoded text:
In recent years, artificial intelligence has become an important tool in many industries, helping researchers analyze data, automate tasks, and improve decision making. Machine learning models are trained on large datasets and can generate text, recognize patterns, and predict outcomes based on previous examples.


In [1443]:
decoded

'In recent years, artificial intelligence has become an important tool in many industries, helping researchers analyze data, automate tasks, and improve decision making. Machine learning models are trained on large datasets and can generate text, recognize patterns, and predict outcomes based on previous examples.'

In [1445]:
len(token_ids)

52

In [None]:
818, 2274, 812, 11, 11666, 4430, 468, 1716, 281, 1593, 2891, 287, 867, 11798, 11, 5742, 4837, 16602, 1366, 11, 43511, 8861, 11, 290, 2987, 2551, 1642, 13, 10850, 4673, 4981, 389, 8776, 319, 1588, 40522, 290, 460, 7716, 2420, 11, 7564, 7572, 11, 290, 4331, 10906, 1912, 319, 2180, 6096, 13