In [138]:
import torch
from torch import nn
torch.set_printoptions(precision=2, sci_mode=False)
import numpy as np
import torch.nn.functional as F
import math


In [140]:
def load_tensors(filename):
    """Parse a comma-separated tensor dump into a dict of named tensors.

    The file is read line by line. Each non-empty line is split on commas and
    classified by its first field (with any ':' removed):

    * one of the metadata keys ('size', 'ndim', 'shape', 'stride',
      'elem_size', 'requires_grad'): the remaining fields are parsed as
      floats and stored as metadata of the tensor currently being read;
    * otherwise, if every field parses as a float: the values are appended
      to the current tensor's flat data;
    * otherwise: the line starts a new tensor named after its first field,
      and the tensor collected so far is stored first (if it has any data).

    Data that appears before any name line is stored as 'default_tensor'.

    Args:
        filename: Path of the text file to read.

    Returns:
        dict mapping tensor name to a float32 ``torch.Tensor``, reshaped to
        the 'shape' metadata when present and left 1-D otherwise.
    """
    tensors_dict = {}

    # Tracking current tensor state
    current_name = "default_tensor"
    current_metadata = {}
    current_data = []

    meta_keys = ['size', 'ndim', 'shape', 'stride', 'elem_size', 'requires_grad']

    def finalize_tensor(name, meta, data):
        """Helper to reshape data and store in the dictionary."""
        if not data:
            return

        shape_meta = meta.get('shape', [len(data)])
        # A metadata line with a single value is stored as a bare float (see
        # the parsing loop below), so a 1-D 'shape' must be wrapped before
        # it can be iterated.
        if not isinstance(shape_meta, (list, tuple)):
            shape_meta = [shape_meta]
        target_shape = [int(s) for s in shape_meta]

        # Values are stored as float32; callers cast (e.g. .long()) as needed.
        tensors_dict[name] = torch.tensor(data, dtype=torch.float32).reshape(target_shape)

    with open(filename, 'r') as f:
        for line in f:
            # Split by comma and remove empty strings/whitespace
            parts = [p.strip() for p in line.split(',') if p.strip()]

            if not parts:
                continue

            label = parts[0].replace(':', '')

            # 1. Check if it's a known metadata key
            if label in meta_keys:
                vals = [float(v) for v in parts[1:]]
                current_metadata[label] = vals[0] if len(vals) == 1 else vals

            else:
                try:
                    # 2. Try to parse as numeric data
                    row_data = [float(p) for p in parts]
                    current_data.extend(row_data)
                except ValueError:
                    # 3. If it's a string but NOT metadata, it's a new tensor name
                    # Save the previous tensor first
                    if current_data:
                        finalize_tensor(current_name, current_metadata, current_data)

                    # Reset for the new tensor
                    current_name = label
                    current_metadata = {}
                    current_data = []

    # Finalize the last tensor in the file
    finalize_tensor(current_name, current_metadata, current_data)

    return tensors_dict

In [None]:
# base_path = '../tensors'
# input_tokens = load_tensors(f'{base_path}/input_tokens.csv')['default_tensor'].long()
# token_embeddings_layer = load_tensors(f'{base_path}/gpt_model.token_embed_layer.csv')
# pos_embeddings_layer = load_tensors(f'{base_path}/gpt_model.pos_embed_layer.csv')
# pos_embeddings_layer = load_tensors(f'{base_path}/gpt_model.workspace.position_indicies.csv')
# input_embeddings = load_tensors(f'{base_path}/gpt_model.input_embeddings.csv')['default_tensor']

In [150]:
# Reference tensors read from the CSV dumps under `base_path`.
# Token ids and position indices are cast to integer (long) tensors so they
# can be used as embedding lookups; the two *_layer entries keep the whole
# dict returned by load_tensors (e.g. 'Weights' and 'Output').
base_path = '../tensors'
gpt_model_c = dict(
    input_tokens=load_tensors(f'{base_path}/input_tokens.csv')['default_tensor'].long(),
    token_embeddings_layer=load_tensors(f'{base_path}/gpt_model.token_embed_layer.csv'),
    pos_embeddings_layer=load_tensors(f'{base_path}/gpt_model.pos_embed_layer.csv'),
    position_indicies=load_tensors(f'{base_path}/gpt_model.workspace.position_indicies.csv')['default_tensor'].long(),
    input_embeddings=load_tensors(f'{base_path}/gpt_model.input_embeddings.csv')['default_tensor'],
)
gpt_model_c

{'input_tokens': tensor([[[1, 2, 0, 2, 5, 3],
          [0, 0, 4, 3, 2, 1]]]),
 'token_embeddings_layer': {'Weights': tensor([[-1.00, -0.74,  0.51, -0.08,  0.07, -0.56],
          [-0.91,  0.36,  0.36,  0.87, -0.23,  0.04],
          [ 0.66, -0.93, -0.89,  0.06,  0.34, -0.98],
          [-0.23, -0.87, -0.17,  0.37,  0.18,  0.86],
          [ 0.69,  0.05, -0.82,  0.31, -0.17,  0.40],
          [ 0.82,  0.52, -0.48, -0.91,  0.47, -0.34]]),
  'Output': tensor([[[-0.91,  0.36,  0.36,  0.87, -0.23,  0.04],
           [ 0.66, -0.93, -0.89,  0.06,  0.34, -0.98],
           [-1.00, -0.74,  0.51, -0.08,  0.07, -0.56],
           [ 0.66, -0.93, -0.89,  0.06,  0.34, -0.98],
           [ 0.82,  0.52, -0.48, -0.91,  0.47, -0.34],
           [-0.23, -0.87, -0.17,  0.37,  0.18,  0.86]],
  
          [[-1.00, -0.74,  0.51, -0.08,  0.07, -0.56],
           [-1.00, -0.74,  0.51, -0.08,  0.07, -0.56],
           [ 0.69,  0.05, -0.82,  0.31, -0.17,  0.40],
           [-0.23, -0.87, -0.17,  0.37,  0.18,  0

In [164]:
# Inspect the position-index tensor that GPT.forward feeds to the positional embedding layer.
gpt_model_c['position_indicies']

tensor([[[0, 1, 2, 3, 4, 5],
         [0, 1, 2, 3, 4, 5]]])

In [189]:
def tensors_within_tolerance(a, b, atol):
    """Report whether two tensors agree element-wise within an absolute tolerance.

    Prints the largest absolute difference (rounded to 2 decimals for
    readability), or a shape-mismatch message when the shapes differ.

    Args:
        a: First tensor.
        b: Second tensor; must have the same shape as ``a`` to match.
        atol: Largest absolute element-wise difference that still counts as
            a match.

    Returns:
        True when the shapes are equal and ``max(|a - b|) <= atol``,
        otherwise False.
    """
    if a.shape != b.shape:
        print("Shape mismatch:", a.shape, b.shape)
        return False

    # Empty tensors have no elements that could disagree, and .max() raises
    # on them.
    if a.numel() == 0:
        print("max |diff|:", 0.0)
        return True

    max_diff = (a - b).abs().max().item()
    # Round for display only: comparing the rounded value would silently
    # accept differences of up to 0.005 above atol.
    print("max |diff|:", round(max_diff, 2))

    return max_diff <= atol

## GPT

In [190]:
class GPT(nn.Module):
    """Embedding stage of a GPT model that checks itself against reference tensors.

    The token and positional embedding tables are initialised from the
    'Weights' tensors in ``gpt_model_c``. ``forward`` recomputes the token,
    positional and summed input embeddings and asserts that each one matches
    the corresponding reference tensor within ``self.atol``.
    """

    def __init__(self, gpt_model_c):
        """Build both embedding layers from the reference weights.

        Args:
            gpt_model_c: dict with the keys
                'token_embeddings_layer' and 'pos_embeddings_layer' (each a
                dict holding 'Weights' and 'Output' tensors),
                'input_embeddings' and 'position_indicies'.
        """
        super().__init__()
        token_weights_c                 = gpt_model_c['token_embeddings_layer']['Weights']
        pos_weights_c                   = gpt_model_c['pos_embeddings_layer']['Weights']

        num_token_embeddings            = token_weights_c.shape[0]
        token_embedding_dim             = token_weights_c.shape[1]
        num_pos_embeddings              = pos_weights_c.shape[0]
        pos_embedding_dim               = pos_weights_c.shape[1]

        self.token_embeddings_layer     = nn.Embedding(num_embeddings=num_token_embeddings, embedding_dim=token_embedding_dim)
        self.pos_embeddings_layer       = nn.Embedding(num_embeddings=num_pos_embeddings, embedding_dim=pos_embedding_dim)

        self.token_embeddings_layer.weight.data.copy_(token_weights_c)
        self.pos_embeddings_layer.weight.data.copy_(pos_weights_c)

        # Each layer is checked against its own reference weights; the two
        # tables may legitimately differ in their number of rows.
        assert(token_weights_c.shape == self.token_embeddings_layer.weight.shape)
        assert(pos_weights_c.shape == self.pos_embeddings_layer.weight.shape)

        # Reference tensors that forward() compares its results against.
        self.token_embeddings_c         = gpt_model_c['token_embeddings_layer']['Output']
        self.pos_embeddings_c           = gpt_model_c['pos_embeddings_layer']['Output']
        self.input_embeddings_c         = gpt_model_c['input_embeddings']
        self.position_indicies_c        = gpt_model_c['position_indicies']

        # Absolute tolerance used for every reference comparison.
        self.atol = 0.02

    def forward(self, input_tokens_c):
        """Compute the input embeddings and verify them against the references.

        Args:
            input_tokens_c: Integer tensor of token ids. Its leading axis is
                dropped after the lookup (``[0]``), as is the leading axis of
                the stored position indices.

        Returns:
            The sum of token and positional embeddings.

        Raises:
            AssertionError: if any of the three intermediate tensors differs
                from its reference by more than ``self.atol``.
        """
        print(f"x.shape:    {input_tokens_c.shape}")
        token_embeddings    = self.token_embeddings_layer(input_tokens_c)[0]
        # Positions come from the reference dump, not from the input length.
        pos_embeddings      = self.pos_embeddings_layer(self.position_indicies_c)[0]
        input_embeddings    = token_embeddings + pos_embeddings

        print(f"token_embeddings.shape: {token_embeddings.shape}")
        print(f"pos_embeddings.shape:   {pos_embeddings.shape}")
        print(f"input_embeddings.shape: {input_embeddings.shape}")

        token_embeddings_matched    = tensors_within_tolerance(token_embeddings,    self.token_embeddings_c,    self.atol)
        pos_embeddings_matched      = tensors_within_tolerance(pos_embeddings,      self.pos_embeddings_c,      self.atol)
        input_embeddings_matched    = tensors_within_tolerance(input_embeddings,    self.input_embeddings_c,    self.atol)

        print(f"token_embeddings_matched:   {token_embeddings_matched}")
        print(f"pos_embeddings_matched:     {pos_embeddings_matched}")
        print(f"input_embeddings_matched:   {input_embeddings_matched}")

        assert(
            token_embeddings_matched and
            pos_embeddings_matched and
            input_embeddings_matched
        )

        return input_embeddings

In [191]:
# Build the model from the reference tensors and run the embedding stage on the
# reference token ids; forward() prints the shapes and asserts that each
# intermediate result matches its reference tensor.
gpt = GPT(gpt_model_c=gpt_model_c)
input_embeddings = gpt(gpt_model_c['input_tokens'])

x.shape:    torch.Size([1, 2, 6])
token_embeddings.shape: torch.Size([2, 6, 6])
pos_embeddings.shape:   torch.Size([2, 6, 6])
input_embeddings.shape: torch.Size([2, 6, 6])
max |diff|: 0.0
max |diff|: 0.0
max |diff|: 0.01
token_embeddings_matched:   True
pos_embeddings_matched:     True
input_embeddings_matched:   True


In [107]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention with preset weights.

    The query/key/value projections are bias-free linear layers whose weights
    are taken from ``self_attention_layer_c`` so that each projection
    computes ``x @ W``. ``forward`` prints every intermediate tensor so the
    values can be compared by eye against another implementation.
    """

    def __init__(self, self_attention_layer_c):
        """Create the three projections.

        Args:
            self_attention_layer_c: dict with 2-D tensors under 'W_Query',
                'W_Key' and 'W_Value', each laid out as (in_features,
                out_features).
        """
        super().__init__()
        self.W_query = self._linear_from(self_attention_layer_c['W_Query'])
        self.W_key = self._linear_from(self_attention_layer_c['W_Key'])
        self.W_value = self._linear_from(self_attention_layer_c['W_Value'])

    @staticmethod
    def _linear_from(weights):
        """Return a bias-free nn.Linear that computes ``x @ weights``."""
        in_features, out_features = weights.shape
        layer = nn.Linear(in_features, out_features, bias=False)
        # nn.Linear stores its weight as (out_features, in_features).
        layer.weight = nn.Parameter(weights.t())
        return layer

    def forward(self, x):
        """Apply self-attention to ``x`` and return the context vectors.

        Args:
            x: Tensor whose last axis has the projections' input size and
                whose second-to-last axis indexes the sequence positions,
                e.g. (seq, dim) or (batch, seq, dim).

        Returns:
            Tensor of context vectors with the same leading axes as ``x``
            and the value projection's output size on the last axis.
        """
        print(f"Input Embeddings: \nShape: {x.shape}\n{x}")
        print(f"W_Query\nShape: {self.W_query.weight.shape}\n{self.W_query.weight}")
        print(f"W_Key\nShape:   {self.W_key.weight.shape}\n{self.W_key.weight}")
        print(f"W_Value\nShape: {self.W_value.weight.shape}\n{self.W_value.weight}")
        print("===========================================")

        query = self.W_query(x)
        key = self.W_key(x)
        value = self.W_value(x)

        # Swap the last two axes so this also works with a leading batch axis;
        # for 2-D input it is the same as key.t().
        key_transposed = key.transpose(-2, -1)
        print(f"Key transposed\nShape: {key_transposed.shape}\n{key_transposed}")

        attention_scores = query @ key_transposed
        print(f"Attention Scores\nShape: {attention_scores.shape}\n{attention_scores}")

        # Scale by sqrt(d_k), the size of each key vector.
        attention_scores_scaled = attention_scores / math.sqrt(key.shape[-1])
        print(f"Attention Scores Scaled\nShape: {attention_scores_scaled.shape}\n{attention_scores_scaled}")

        # Normalise over the key positions (last axis).
        attention_weights = F.softmax(attention_scores_scaled, dim=-1)
        print(f"Attention Weights\nShape: {attention_weights.shape}\n{attention_weights}")

        context_vecs = attention_weights @ value
        print(f"Context Vecs\nShape: {context_vecs.shape}\n{context_vecs}")

        return context_vecs





In [108]:
# NOTE(review): `self_attention_layer_c` and `input_embeddings_c` are not defined
# in any cell visible in this part of the notebook — presumably they come from an
# earlier kernel session; confirm they are defined before a Restart & Run All.
self_attention = SelfAttention(self_attention_layer_c)

self_attention.forward(input_embeddings_c)

Input Embeddings: 
Shape: torch.Size([6, 4])
tensor([[-1.00, -0.74,  0.51, -0.08],
        [ 0.07, -0.56, -0.91,  0.36],
        [ 0.36,  0.87, -0.23,  0.04],
        [ 0.66, -0.93, -0.89,  0.06],
        [ 0.34, -0.98, -0.23, -0.87],
        [-0.17,  0.37,  0.18,  0.86]])
W_Query
Shape: torch.Size([4, 4])
Parameter containing:
tensor([[ 0.69, -0.17, -0.48,  0.27],
        [ 0.05,  0.40, -0.91,  0.51],
        [-0.82,  0.82,  0.47,  0.98],
        [ 0.31,  0.52, -0.34, -0.27]], requires_grad=True)
W_Key
Shape:   torch.Size([4, 4])
Parameter containing:
tensor([[-0.51,  0.30, -0.45, -0.52],
        [ 0.97, -0.85, -0.13, -0.45],
        [ 0.45,  0.26,  0.53, -0.28],
        [ 0.51,  0.77, -0.04, -0.67]], requires_grad=True)
W_Value
Shape: torch.Size([4, 4])
Parameter containing:
tensor([[-0.03,  0.81,  0.97,  0.90],
        [ 0.80,  0.01, -0.01, -0.85],
        [ 0.82,  0.03, -0.47,  0.00],
        [-0.88, -0.36, -0.82, -0.23]], requires_grad=True)
Key transposed
Shape: torch.Size([4, 6]

## Multi-Head Attention

In [111]:
class SelfAttentionMultiHead(nn.Module):
    """Multi-head scaled dot-product self-attention with preset weights.

    Queries, keys and values are produced by one bias-free projection each,
    then split along the feature axis into ``num_heads`` chunks. Attention is
    computed per chunk, the per-head results are concatenated and passed
    through the 'heads_proj' projection. ``forward`` prints every
    intermediate tensor so the values can be compared by eye against another
    implementation.
    """

    def __init__(self, self_attention_layer_c, num_heads):
        """Create the projections.

        Args:
            self_attention_layer_c: dict with 2-D tensors under 'W_Query',
                'W_Key', 'W_Value' and 'heads_proj', each laid out as
                (in_features, out_features).
            num_heads: Number of chunks the projected features are split
                into.
        """
        super().__init__()
        self.W_query = self._linear_from(self_attention_layer_c['W_Query'])
        self.W_key = self._linear_from(self_attention_layer_c['W_Key'])
        self.W_value = self._linear_from(self_attention_layer_c['W_Value'])
        self.heads_proj = self._linear_from(self_attention_layer_c['heads_proj'])

        self.num_heads = num_heads

    @staticmethod
    def _linear_from(weights):
        """Return a bias-free nn.Linear that computes ``x @ weights``."""
        in_features, out_features = weights.shape
        layer = nn.Linear(in_features, out_features, bias=False)
        # nn.Linear stores its weight as (out_features, in_features).
        layer.weight = nn.Parameter(weights.t())
        return layer

    def forward(self, x):
        """Apply multi-head self-attention to ``x``.

        Args:
            x: Tensor whose last axis has the projections' input size and
                whose second-to-last axis indexes the sequence positions,
                e.g. (seq, dim) or (batch, seq, dim).

        Returns:
            The concatenated per-head context vectors after the
            'heads_proj' projection.
        """
        print(f"W_Query\nShape: {self.W_query.weight.shape}\n{self.W_query.weight}")
        print(f"W_Key\nShape:   {self.W_key.weight.shape}\n{self.W_key.weight}")
        print(f"W_Value\nShape: {self.W_value.weight.shape}\n{self.W_value.weight}")
        print(f"heads_proj\nShape: {self.heads_proj.weight.shape}\n{self.heads_proj.weight}")
        print("===========================================")

        query = self.W_query(x)
        key = self.W_key(x)
        value = self.W_value(x)

        print(f"queries\nShape: {query.shape}\n{query}")
        print(f"keys\nShape:   {key.shape}\n{key}")
        print(f"values\nShape: {value.shape}\n{value}")

        # Split the feature (last) axis into one slice per head.
        queries_chunks = torch.chunk(query, self.num_heads, dim=-1)
        keys_chunks = torch.chunk(key, self.num_heads, dim=-1)
        values_chunks = torch.chunk(value, self.num_heads, dim=-1)

        context_vecs = []
        for head in range(0, self.num_heads):
            print(f"=================================== HEAD {head} ===================================\n")
            # Swap the last two axes so a leading batch axis is supported;
            # for 2-D input this is the same as .t().
            key_transposed = keys_chunks[head].transpose(-2, -1)
            print(f"Key transposed\nShape: {key_transposed.shape}\n{key_transposed}")

            attention_scores = queries_chunks[head] @ key_transposed
            print(f"Attention Scores\nShape: {attention_scores.shape}\n{attention_scores}")

            # Scale by sqrt of the per-head key size.
            attention_scores_scaled = attention_scores / math.sqrt(keys_chunks[head].shape[-1])
            print(f"Attention Scores Scaled\nShape: {attention_scores_scaled.shape}\n{attention_scores_scaled}")

            # Normalise over the key positions (last axis).
            attention_weights = F.softmax(attention_scores_scaled, dim=-1)
            print(f"Attention Weights\nShape: {attention_weights.shape}\n{attention_weights}")
            context_vec = attention_weights @ values_chunks[head]
            print(f"Context Vec\nShape: {context_vec.shape}\n{context_vec}")
            context_vecs.append(context_vec)
            print(f"=================================================================================\n")

        concat_heads = torch.cat(context_vecs, dim=-1)
        print(f"concat_heads\nShape: {concat_heads.shape}\n{concat_heads}")
        projected_context_vecs = self.heads_proj(concat_heads)
        print(f"projected_context_vecs\nShape: {projected_context_vecs.shape}\n{projected_context_vecs}")

        return projected_context_vecs



In [112]:
# NOTE(review): `self_attention_layer_c` and `input_embeddings_c` are not defined
# in any cell visible in this part of the notebook — presumably they come from an
# earlier kernel session; confirm they are defined before a Restart & Run All.
# This also rebinds `self_attention`, which an earlier cell assigned to a
# SelfAttention instance.
self_attention = SelfAttentionMultiHead(self_attention_layer_c, 2)

self_attention.forward(input_embeddings_c)

4 4
W_Query
Shape: torch.Size([4, 4])
Parameter containing:
tensor([[ 0.69, -0.17, -0.48,  0.27],
        [ 0.05,  0.40, -0.91,  0.51],
        [-0.82,  0.82,  0.47,  0.98],
        [ 0.31,  0.52, -0.34, -0.27]], requires_grad=True)
W_Key
Shape:   torch.Size([4, 4])
Parameter containing:
tensor([[-0.51,  0.30, -0.45, -0.52],
        [ 0.97, -0.85, -0.13, -0.45],
        [ 0.45,  0.26,  0.53, -0.28],
        [ 0.51,  0.77, -0.04, -0.67]], requires_grad=True)
W_Value
Shape: torch.Size([4, 4])
Parameter containing:
tensor([[-0.03,  0.81,  0.97,  0.90],
        [ 0.80,  0.01, -0.01, -0.85],
        [ 0.82,  0.03, -0.47,  0.00],
        [-0.88, -0.36, -0.82, -0.23]], requires_grad=True)
heads_proj
Shape: torch.Size([4, 4])
Parameter containing:
tensor([[-0.45,  0.88,  0.66,  0.74],
        [ 0.83, -0.90, -0.75,  0.26],
        [ 0.06,  0.52, -0.97,  0.47],
        [-0.07,  0.54,  0.38,  0.45]], requires_grad=True)
queries
Shape: torch.Size([6, 4])
tensor([[-0.83, -0.85,  0.37, -0.85],
     