In [53]:
import torch
from torch import nn
torch.set_printoptions(precision=2, sci_mode=False)
torch.set_default_dtype(torch.float64)
import numpy as np
import torch.nn.functional as F
import math
from pathlib import Path


In [54]:
def load_tensors(filename):
    tensors_dict = {}
    
    # Tracking current tensor state
    current_name = "default_tensor"
    current_metadata = {}
    current_data = []
    
    meta_keys = ['size', 'ndim', 'shape', 'stride', 'elem_size', 'requires_grad']

    def finalize_tensor(name, meta, data):
        """Helper to reshape data and store in the dictionary."""
        if not data:
            return
        
        target_shape = [int(s) for s in meta.get('shape', [len(data)])]
        # Using float64 (double) as your images show high precision decimals
        # if name == current_name:
        #     tensors_dict = torch.tensor(data, dtype=torch.float32).reshape(target_shape)
        # else:
        tensors_dict[name] = torch.tensor(data, dtype=torch.float64).reshape(target_shape)

    with open(filename, 'r') as f:
        for line in f:
            # Split by comma and remove empty strings/whitespace
            parts = [p.strip() for p in line.split(',') if p.strip()]
            
            if not parts:
                continue

            label = parts[0].replace(':', '')

            # 1. Check if it's a known metadata key
            if label in meta_keys:
                vals = [float(v) for v in parts[1:]]
                current_metadata[label] = vals[0] if len(vals) == 1 else vals
            
            else:
                try:
                    # 2. Try to parse as numeric data
                    row_data = [float(p) for p in parts]
                    current_data.extend(row_data)
                except ValueError:
                    # 3. If it's a string but NOT metadata, it's a new tensor name
                    # Save the previous tensor first
                    if current_data:
                        finalize_tensor(current_name, current_metadata, current_data)
                    
                    # Reset for the new tensor
                    current_name = label
                    current_metadata = {}
                    current_data = []

    # Finalize the last tensor in the file
    finalize_tensor(current_name, current_metadata, current_data)
    if "default_tensor" in tensors_dict.keys():
        tensors_dict = tensors_dict["default_tensor"]
    return tensors_dict

In [55]:
def load_gpt_model(path):
    gpt_model = {}
    folder_path = Path(path)
    for file in folder_path.iterdir():
        if file.is_file():  # Ensure it's a file and not a subfolder
            filename = Path(file.name).stem
            # print(f"Filename: {filename}")
            gpt_model[filename] = load_tensors(file)
            # print(f"Full Path: {file}")
    return gpt_model

In [56]:
gpt_model_c = load_gpt_model("/Users/uonliaquat/workspace/zerograd/models")
gpt_model_c["transformer_layer_0__self_attention_layer_heads_proj"]

{'Weights': tensor([[ 0.85,  0.90, -0.04,  ..., -0.58, -0.18,  0.63],
         [ 0.28, -0.13, -0.90,  ..., -0.52,  0.49,  0.62],
         [ 0.62,  0.66, -0.76,  ...,  0.83,  0.33, -0.73],
         ...,
         [ 0.94,  0.39,  0.47,  ...,  0.83,  0.61, -0.22],
         [-0.61,  0.08, -0.45,  ...,  0.30, -0.30, -0.89],
         [-0.06,  0.91, -0.10,  ..., -0.90,  0.63, -0.89]]),
 'Output': tensor([[[    -2.76,     -0.48,      5.78,      0.43,      5.98,      6.62,
               -6.77,      1.45,      6.02,     -8.07,      5.96,     14.18,
               -1.58,      3.50,      8.39,      1.56,      2.96,    -13.13,
               -3.33,      4.71,      5.05,      7.32,      7.52,      9.71,
                4.27,    -10.34,      2.62,      6.44,    -10.13,     -6.80,
               -1.95,    -16.84],
          [    -0.42,     -5.68,     -5.70,     10.29,     12.53,      4.34,
                4.55,      1.88,      2.43,     -8.14,     -1.08,      5.87,
               10.10,     -5.06,    

In [57]:
# base_path = '../models'
# gpt_model_c = {
#     "input_tokens":  load_tensors(f'{base_path}/input_tokens.csv')['default_tensor'].long(),
#     "token_embeddings_layer": load_tensors(f'{base_path}/gpt_model.token_embed_layer.csv'),
#     "pos_embeddings_layer": load_tensors(f'{base_path}/gpt_model.pos_embed_layer.csv'),
#     "position_indicies":  load_tensors(f'{base_path}/gpt_model.workspace.position_indicies.csv')['default_tensor'].long(),
#     "input_embeddings": load_tensors(f'{base_path}/gpt_model.workspace.input_embeddings.csv')['default_tensor']
# }
# gpt_model_c

In [58]:
def tensors_within_tolerance(a, b, atol):
    if a.shape != b.shape:
        print("Shape mismatch:", a.shape, b.shape)
        return False

    max_diff = round((a - b).abs().max().item(), 2)
    print("max |diff|:", max_diff)

    return max_diff <= atol

## GPT

In [59]:
for key in gpt_model_c.keys():
    print(key)

transformer_layer_0__self_attention_layer_attention_scores_0
gpt_model.pos_embed_layer
transformer_layer_0__self_attention_layer_attention_scores_1
input_tokens
transformer_layer_0__self_attention_layer_context_vecs_0
transformer_layer_0__self_attention_layer_w_key
transformer_layer_0__self_attention_layer_context_vecs_1
transformer_layer_0__self_attention_layer_w_query
gpt_model.token_embed_layer
transformer_layer_0__self_attention_layer_attention_scores_scaled_0
gpt_model.workspace.position_indices
transformer_layer_0__self_attention_layer_attention_scores_scaled_1
transformer_layer_0__self_attention_layer_keys_transposed_0
transformer_layer_0__self_attention_layer_queries_chunks_1
transformer_layer_0__self_attention_layer_queries_chunks_0
transformer_layer_0__self_attention_layer_keys_transposed_1
transformer_layer_0__self_attention_layer_w_value
transformer_layer_0__self_attention_layer_keys_chunks_0
transformer_layer_0__self_attention_layer_keys_chunks_1
transformer_layer_0__self_

In [60]:
def find_key(target_dict, substring):
    """
    Returns the first key that contains the substring.
    Returns None if no match is found.
    """
    return next((k for k in target_dict if substring in k), None)

# Usage:

In [61]:
key_transposed_global = None
class SelfAttentionMultiHead(nn.Module):
    def __init__(self, self_attention_layer_c, n_heads, atol):
        super().__init__()
        
        self.n_heads = n_heads
        self.atol = atol
        self.self_attention_layer_c = self_attention_layer_c
        
        W_query_weights =   self_attention_layer_c[find_key(self_attention_layer_c, 'w_query')]['Weights']
        W_key_weights =     self_attention_layer_c[find_key(self_attention_layer_c, 'w_key')]['Weights']
        W_value_weights =   self_attention_layer_c[find_key(self_attention_layer_c, 'w_value')]['Weights']
        head_proj_weights =   self_attention_layer_c[find_key(self_attention_layer_c, 'heads_proj')]['Weights']

        # print("W_query_weights\n ", W_query_weights)
        W_query_weights = W_query_weights.t()
        W_key_weights = W_key_weights.t()
        W_value_weights = W_value_weights.t()
        head_proj_weights = head_proj_weights.t()

        self.W_query = nn.Linear(W_query_weights.shape[0], W_query_weights.shape[1], bias=False)
        self.W_query.weight = nn.Parameter(W_query_weights)

        self.W_key = nn.Linear(W_key_weights.shape[0], W_key_weights.shape[1], bias=False)
        self.W_key.weight = nn.Parameter(W_key_weights)

        self.W_value = nn.Linear(W_value_weights.shape[0], W_value_weights.shape[1], bias=False)
        self.W_value.weight = nn.Parameter(W_value_weights)

        self.heads_proj = nn.Linear(head_proj_weights.shape[0], head_proj_weights.shape[1], bias=False)
        self.heads_proj.weight = nn.Parameter(head_proj_weights)

    
    def forward(self, x):
        # print(f"Input Embeddings: \nShape: {x.shape}\n{x}")
        # print(f"W_Query\nShape: {self.W_query.weight.shape}\n{self.W_query.weight}")
        # print(f"W_Key\nShape:   {self.W_key.weight.shape}\n{self.W_key.weight}")
        # print(f"W_Value\nShape: {self.W_value.weight.shape}\n{self.W_value.weight}")
        # print(f"heads_proj\nShape: {self.heads_proj.weight.shape}\n{self.heads_proj.weight}")
        print("===========================================")

        query = self.W_query(x)
        key = self.W_key(x)
        value = self.W_value(x)
        self.layer_name = 'self_attention_layer'
        query_matched = tensors_within_tolerance(query, self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_w_query')]['Output'], self.atol)
        key_matched = tensors_within_tolerance(key,  self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_w_key')]['Output'], self.atol)
        value_matched = tensors_within_tolerance(value,  self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_w_value')]['Output'], self.atol)

        print(f"query_matched:   {query_matched}")
        print(f"key_matched:     {key_matched}")
        print(f"value_matched:   {value_matched}")
        # print("Python", query)
        # print("C", self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_w_query')]['Output'])

        queries_chnuks  = torch.chunk(query, self.n_heads, -1)
        keys_chnuks     = torch.chunk(key , self.n_heads, -1)
        values_chnuks   = torch.chunk(value, self.n_heads, -1)

        for head in range(0, self.n_heads):
            #print(self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_queries_chunks_{head}')])
            #print(self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_queries_chunks_{head}')]['Output'].shape)
            query_chnuks_matched    = tensors_within_tolerance(queries_chnuks[head],  self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_queries_chunks_{head}')], self.atol)
            key_chnuks_matched      = tensors_within_tolerance(keys_chnuks[head],     self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_keys_chunks_{head}')], self.atol)
            value_chnuks_matched    = tensors_within_tolerance(values_chnuks[head],   self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_values_chunks_{head}')], self.atol)
            print(f"query_chnuks_matched:   {query_chnuks_matched}")
            print(f"key_chnuks_matched:     {key_chnuks_matched}")
            print(f"value_chnuks_matched:   {value_chnuks_matched}")

        context_vecs = []
        for head in range(0, self.n_heads):
            #print(self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_context_vecs_{head}')])
            
            print(f"=================================== HEAD {head} ===================================\n")
            key_transposed = keys_chnuks[head].transpose(1, 2)
            key_transposed_matched = tensors_within_tolerance(key_transposed, self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_keys_transposed_{head}')], self.atol)
            print(f"key_transposed_matched:   {key_transposed_matched}")

            attention_scores = queries_chnuks[head] @ key_transposed
            attention_scores_matched = tensors_within_tolerance(attention_scores, self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_attention_scores_{head}')], self.atol)
            print(f"attention_scores_matched:   {attention_scores_matched}")


            attention_scores_scaled = attention_scores * 1/math.sqrt(keys_chnuks[head].shape[1])
            attention_scores_scaled_matched = tensors_within_tolerance(attention_scores_scaled, self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_attention_scores_scaled_{head}')], self.atol)
            print(f"attention_scores_scaled_matched:   {attention_scores_scaled_matched}")

            attention_weights = F.softmax(attention_scores_scaled, dim=-1)
            attention_weights_matched = tensors_within_tolerance(attention_weights, self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_attention_weights_{head}')], self.atol)
            print(f"attention_weights_matched:   {attention_weights_matched}")

            context_vec = attention_weights @ values_chnuks[head]
            context_vec_matched = tensors_within_tolerance(context_vec, self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_context_vecs_{head}')], self.atol)
            print(f"context_vec_matched:   {context_vec_matched}")

            context_vecs.append(context_vec)
        #     print(f"=================================================================================\n")

        concat_heads = torch.cat(context_vecs, dim=-1)
        concat_heads_matched = tensors_within_tolerance(concat_heads, self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_concat_heads')], self.atol)
        print(f"concat_heads_matched:   {concat_heads_matched}")

        projected_context_vecs = self.heads_proj(concat_heads)
        projected_context_vecs_matched = tensors_within_tolerance(projected_context_vecs, self.self_attention_layer_c[find_key(self.self_attention_layer_c, f'{self.layer_name}_heads_proj')]['Output'], self.atol)
        print(f"projected_context_vecs_matched:   {projected_context_vecs_matched}")



In [62]:
class FeedForwardNetwork(nn.Module):
    def __init__(self, feed_forward_network_c):
        super().__init__()
        self.input = nn.Linear(2, 5)
        self.output = nn.Linear(5, 2)

    def forward(self, x):
        pass

In [63]:
class Transformer(nn.Module):
    def __init__(self, transformer_layer_c, n_heads, atol):
        super().__init__() 
        self.n_heads = n_heads
        self.atol = atol
        self_attention_layer_c = {k: v for k, v in transformer_layer_c.items() if "self_attention_layer" in k}
        feed_forward_network_c = {k: v for k, v in transformer_layer_c.items() if "feed_forward_network" in k}
        self.self_attention_multi_head  = SelfAttentionMultiHead(self_attention_layer_c, n_heads, atol)
        #self.feed_forward_network       = FeedForwardNetwork(feed_forward_network_c)
    
    def forward(self, x):
        x = self.self_attention_multi_head(x)
        #x = self.feed_forward_network(x)
        return x


In [64]:
class TransformerBlock(nn.Module):
    def __init__(self, transformer_block_c, n_layers, n_heads, atol):
        super().__init__()
        self.n_layers = n_layers
        self.atol = atol
        self.transformer_layers = {}
        for layer_no in range(0, n_layers):
            transformer_layer_c = {k: v for k, v in transformer_block_c.items() if k.startswith(f"transformer_layer_{layer_no}")}
            self.transformer_layers[layer_no] = Transformer(transformer_layer_c, n_heads, atol)

    def forward(self, x):
        for layer_no in range(0, self.n_layers):
            x = self.transformer_layers[layer_no](x)
        return x

In [65]:
class GPT(nn.Module):
    def __init__(self, gpt_model_c, n_layers, n_heads, atol):
        super().__init__()   
        self.atol = atol
           
        num_token_embeddings            = gpt_model_c['gpt_model.token_embed_layer']['Weights'].shape[0]
        token_embedding_dim             = gpt_model_c['gpt_model.token_embed_layer']['Weights'].shape[1]
        num_pos_embeddings              = gpt_model_c['gpt_model.pos_embed_layer']['Weights'].shape[0]
        pos_embedding_dim               = gpt_model_c['gpt_model.pos_embed_layer']['Weights'].shape[1]

        print(f"num_token_embeddings: {num_token_embeddings}")
        print(f"token_embedding_dim: {token_embedding_dim}")
        print(f"num_pos_embeddings: {num_pos_embeddings}")
        print(f"pos_embedding_dim: {pos_embedding_dim}")

        self.token_embeddings_layer     = nn.Embedding(num_embeddings=num_token_embeddings, embedding_dim=token_embedding_dim)
        self.pos_embeddings_layer       = nn.Embedding(num_embeddings=num_pos_embeddings, embedding_dim=pos_embedding_dim)

        self.token_embeddings_layer.weight.data.copy_(gpt_model_c['gpt_model.token_embed_layer']['Weights'])
        self.pos_embeddings_layer.weight.data.copy_(gpt_model_c['gpt_model.pos_embed_layer']['Weights'])
        
 
        assert(gpt_model_c['gpt_model.token_embed_layer']['Weights'].shape == self.token_embeddings_layer.weight.shape)
        assert(gpt_model_c['gpt_model.token_embed_layer']['Weights'].shape == self.pos_embeddings_layer.weight.shape)

        
        self.token_embeddings_c         = gpt_model_c['gpt_model.token_embed_layer']['Output']
        self.pos_embeddings_c           = gpt_model_c['gpt_model.pos_embed_layer']['Output']
        self.input_embeddings_c         = gpt_model_c['gpt_model.workspace.input_embeddings']
        self.position_indicies_c        = gpt_model_c['gpt_model.workspace.position_indices'].long()

        transformer_block_c = {k: v for k, v in gpt_model_c.items() if "transformer_layer" in k}
        self.transformer_block = TransformerBlock(transformer_block_c, n_layers, n_heads, self.atol)

    def forward(self, input_tokens_c):
        #print(f"x.shape:    {input_tokens_c.shape}")
        token_embeddings    = self.token_embeddings_layer(input_tokens_c)[0]
        print(self.position_indicies_c)
        pos_embeddings      = self.pos_embeddings_layer(self.position_indicies_c)[0]
        input_embeddings    = token_embeddings + pos_embeddings


        token_embeddings_matched    = tensors_within_tolerance(token_embeddings,    self.token_embeddings_c,    self.atol)
        pos_embeddings_matched      = tensors_within_tolerance(pos_embeddings,      self.pos_embeddings_c,      self.atol)
        input_embeddings_matched    = tensors_within_tolerance(input_embeddings,    self.input_embeddings_c,    self.atol)

        print(f"token_embeddings_matched:   {token_embeddings_matched}")
        print(f"pos_embeddings_matched:     {pos_embeddings_matched}")
        print(f"input_embeddings_matched:   {input_embeddings_matched}")


        # print("***********Token Embeddings***********\n")
        # print(self.token_embeddings_c[0], "\n")
        # print(token_embeddings[0], "\n\n")

        # print("***********Pose Embeddings***********\n")
        # print(self.pos_embeddings_c[0], "\n")
        # print(pos_embeddings[0], "\n\n")


        # print("***********Input Embeddings***********\n")
        # print(self.input_embeddings_c[0], "\n")
        # print(input_embeddings[0], "\n\n")

        # assert(
        #     token_embeddings_matched and 
        #     pos_embeddings_matched and 
        #     input_embeddings_matched
        # )
        
        contextual_embddings = self.transformer_block(input_embeddings)


        # print(token_embeddings - self.token_embed_layer_output_c)
        return contextual_embddings

In [67]:
atol = 0.00000001
gpt = GPT(gpt_model_c=gpt_model_c, n_layers=1, n_heads=2, atol=atol)
input_embeddings = gpt(gpt_model_c['input_tokens'].long())

num_token_embeddings: 6
token_embedding_dim: 32
num_pos_embeddings: 6
pos_embedding_dim: 32
tensor([[[0, 1, 2, 3, 4, 5],
         [0, 1, 2, 3, 4, 5]]])
max |diff|: 0.0
max |diff|: 0.0
max |diff|: 0.0
token_embeddings_matched:   True
pos_embeddings_matched:     True
input_embeddings_matched:   True
max |diff|: 0.0
max |diff|: 0.0
max |diff|: 0.0
query_matched:   True
key_matched:     True
value_matched:   True
max |diff|: 0.0
max |diff|: 0.0
max |diff|: 0.0
query_chnuks_matched:   True
key_chnuks_matched:     True
value_chnuks_matched:   True
max |diff|: 0.0
max |diff|: 0.0
max |diff|: 0.0
query_chnuks_matched:   True
key_chnuks_matched:     True
value_chnuks_matched:   True

max |diff|: 0.0
key_transposed_matched:   True
max |diff|: 0.0
attention_scores_matched:   True
max |diff|: 0.0
attention_scores_scaled_matched:   True
max |diff|: 0.0
attention_weights_matched:   True
max |diff|: 0.0
context_vec_matched:   True

max |diff|: 0.0
key_transposed_matched:   True
max |diff|: 0.0
attent

In [57]:
for key in gpt_model_c.keys():
    print(key)

transformer_layer_0__self_attention_layer_context_vecs_3
transformer_layer_0__self_attention_layer_attention_scores_0
gpt_model.pos_embed_layer
transformer_layer_0__self_attention_layer_attention_scores_1
transformer_layer_0__self_attention_layer_context_vecs_2
input_tokens
transformer_layer_0__self_attention_layer_context_vecs_0
transformer_layer_0__self_attention_layer_w_key
transformer_layer_0__self_attention_layer_attention_scores_3
transformer_layer_0__self_attention_layer_attention_scores_2
transformer_layer_0__self_attention_layer_context_vecs_1
transformer_layer_0__self_attention_layer_context_vecs_5
transformer_layer_0__self_attention_layer_values_chunks_9
transformer_layer_0__self_attention_layer_attention_scores_6
transformer_layer_0__self_attention_layer_attention_scores_7
transformer_layer_0__self_attention_layer_values_chunks_8
transformer_layer_0__self_attention_layer_context_vecs_4
transformer_layer_0__self_attention_layer_context_vecs_6
transformer_layer_0__self_attent

In [595]:
key_transposed_global.shape

AttributeError: 'NoneType' object has no attribute 'shape'

In [333]:
# class SelfAttention:
#     def __init__(self, self_attention_layer_c):
#         W_query_weights =   self_attention_layer_c['W_Query']
#         W_key_weights =     self_attention_layer_c['W_Key']
#         W_value_weights =   self_attention_layer_c['W_Value']

#         W_query_weights = W_query_weights.t()
#         W_key_weights = W_key_weights.t()
#         W_value_weights = W_value_weights.t()

#         self.W_query = nn.Linear(W_query_weights.shape[0], W_query_weights.shape[1], bias=False)
#         self.W_query.weight = nn.Parameter(W_query_weights)

#         self.W_key = nn.Linear(W_key_weights.shape[0], W_key_weights.shape[1], bias=False)
#         self.W_key.weight = nn.Parameter(W_key_weights)

#         self.W_value = nn.Linear(W_value_weights.shape[0], W_value_weights.shape[1], bias=False)
#         self.W_value.weight = nn.Parameter(W_value_weights)
    
#     def forward(self, x):
#         print(f"Input Embeddings: \nShape: {x.shape}\n{x}")
#         print(f"W_Query\nShape: {self.W_query.weight.shape}\n{self.W_query.weight}")
#         print(f"W_Key\nShape:   {self.W_key.weight.shape}\n{self.W_key.weight}")
#         print(f"W_Value\nShape: {self.W_value.weight.shape}\n{self.W_value.weight}")
#         print("===========================================")

#         query = self.W_query(x)
#         key = self.W_key(x)
#         value = self.W_value(x)
        

#         # print(f"Query\nShape: {query.shape}\n{query}")
#         # print(f"Key\nShape:   {key.shape}\n{key}")
#         # print(f"Value\nShape: {value.shape}\n{value}")
#         # print("===========================================")

#         key_transposed = key.t()
#         print(f"Key transposed\nShape: {key_transposed.shape}\n{key_transposed}")

#         attention_scores = query @ key_transposed
#         print(f"Attention Scores\nShape: {attention_scores.shape}\n{attention_scores}")

#         attention_scores_scaled = attention_scores * 1/math.sqrt(key.shape[1])
#         print(f"Attention Scores Scaled\nShape: {attention_scores_scaled.shape}\n{attention_scores_scaled}")

#         attention_weights = F.softmax(attention_scores_scaled, dim=1)
#         print(f"Attention Weights\nShape: {attention_weights.shape}\n{attention_weights}")

#         context_vecs = attention_weights @ value
#         print(f"Context Vecs\nShape: {context_vecs.shape}\n{context_vecs}")





In [332]:
# self_attention = SelfAttention(self_attention_layer_c)

# self_attention.forward(input_embeddings_c)

## Multi Head Attention

In [331]:
# self_attention = SelfAttentionMultiHead(self_attention_layer_c, 2)

# self_attention.forward(input_embeddings_c)