In [1]:
import torch
from torch import nn
torch.set_printoptions(precision=2, sci_mode=False)
import numpy as np
import torch.nn.functional as F


In [2]:
import torch

def load_multiple_tensors(filename):
    """Load one or more tensors from a comma-separated text dump.

    The file is read line by line. Each line is split on commas (empty
    fields are dropped) and classified by its first field, with any ':'
    removed:

    * a known metadata key (``size``, ``ndim``, ``shape``, ``stride``,
      ``elem_size``, ``requires_grad``): the remaining fields are stored
      as metadata for the current tensor;
    * a line whose fields are all numeric: the values are appended to the
      current tensor's flat data;
    * anything else: the line starts a new tensor with that name, after
      the previous tensor (if it collected any data) has been stored.

    Data that appears before any name line is stored under the key
    ``"default_tensor"``. If the same name occurs twice, the later tensor
    overwrites the earlier one.

    Args:
        filename: Path of the text file to read.

    Returns:
        dict mapping each tensor name to a ``torch.float64`` tensor,
        reshaped to the recorded ``shape`` metadata, or left 1-D when no
        ``shape`` line was seen for that tensor.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a ``shape`` entry is not numeric.
        RuntimeError: If the recorded shape does not match the number of
            values collected (raised by ``Tensor.reshape``).
    """
    tensors_dict = {}

    # State of the tensor currently being accumulated.
    current_name = "default_tensor"
    current_metadata = {}
    current_data = []

    meta_keys = ('size', 'ndim', 'shape', 'stride', 'elem_size', 'requires_grad')

    def parse_meta_value(token):
        """Return the token as a float, or unchanged if it is not numeric."""
        # Metadata such as ``requires_grad`` may be written as text
        # (e.g. "true"); keep it verbatim instead of aborting the load.
        try:
            return float(token)
        except ValueError:
            return token

    def finalize_tensor(name, meta, data):
        """Reshape the collected values and store them under ``name``."""
        if not data:
            return

        shape = meta.get('shape', [len(data)])
        # A single-valued metadata line is stored as a bare scalar, so a
        # 1-D tensor's shape arrives here as e.g. 3.0 rather than [3.0].
        if not isinstance(shape, (list, tuple)):
            shape = [shape]
        target_shape = [int(s) for s in shape]

        # float64 keeps the full precision of the decimals in the dump.
        tensors_dict[name] = torch.tensor(data, dtype=torch.float64).reshape(target_shape)

    with open(filename, 'r') as f:
        for line in f:
            # Split on commas and drop empty / whitespace-only fields.
            parts = [p.strip() for p in line.split(',') if p.strip()]
            if not parts:
                continue

            label = parts[0].replace(':', '')

            # 1. Metadata line for the current tensor.
            if label in meta_keys:
                vals = [parse_meta_value(v) for v in parts[1:]]
                current_metadata[label] = vals[0] if len(vals) == 1 else vals
                continue

            try:
                # 2. Numeric data row.
                row_data = [float(p) for p in parts]
            except ValueError:
                # 3. Neither metadata nor numbers: a new tensor name.
                # Store the previous tensor (no-op if it has no data) and
                # reset the accumulation state.
                finalize_tensor(current_name, current_metadata, current_data)
                current_name = label
                current_metadata = {}
                current_data = []
            else:
                current_data.extend(row_data)

    # Store the last tensor in the file.
    finalize_tensor(current_name, current_metadata, current_data)

    return tensors_dict

# Load the exported attention weights (a dict of named tensors).
self_attention_layer_c = load_multiple_tensors('../output/self_attention_layer.csv')
# From the embeddings file, keep only the tensor stored under the loader's
# 'default_tensor' key.
input_embeddings_c = load_multiple_tensors('../output/input_embeddings.csv')['default_tensor']

In [3]:
# Display the loaded weight tensors, keyed by name.
self_attention_layer_c

{'W_Query': tensor([[ 0.69,  0.05, -0.82,  0.31],
         [-0.17,  0.40,  0.82,  0.52],
         [-0.48, -0.91,  0.47, -0.34],
         [ 0.27,  0.51,  0.98, -0.27]], dtype=torch.float64),
 'W_Key': tensor([[-0.51,  0.97,  0.45,  0.51],
         [ 0.30, -0.85,  0.26,  0.77],
         [-0.45, -0.13,  0.53, -0.04],
         [-0.52, -0.45, -0.28, -0.67]], dtype=torch.float64),
 'W_Value': tensor([[-0.03,  0.80,  0.82, -0.88],
         [ 0.81,  0.01,  0.03, -0.36],
         [ 0.97, -0.01, -0.47, -0.82],
         [ 0.90, -0.85,  0.00, -0.23]], dtype=torch.float64)}

In [16]:
import math
class SelfAttention:
    """Single-head scaled dot-product self-attention using preloaded weights.

    The query, key and value projections are bias-free ``nn.Linear``
    layers whose weights are taken from the supplied dict, so that each
    projection computes ``x @ W`` for the corresponding stored matrix
    ``W``.
    """

    def __init__(self, self_attention_layer_c):
        """Build the three projections from a dict of weight matrices.

        Args:
            self_attention_layer_c: Mapping with the keys ``'W_Query'``,
                ``'W_Key'`` and ``'W_Value'``, each a 2-D tensor laid out
                as (in_features, out_features).

        Raises:
            KeyError: If one of the three keys is missing.
        """
        self.W_query = self._build_projection(self_attention_layer_c['W_Query'])
        self.W_key = self._build_projection(self_attention_layer_c['W_Key'])
        self.W_value = self._build_projection(self_attention_layer_c['W_Value'])

    @staticmethod
    def _build_projection(weights):
        """Wrap an (in_features, out_features) matrix in a bias-free nn.Linear."""
        in_features, out_features = weights.shape
        layer = nn.Linear(in_features, out_features, bias=False)
        # nn.Linear stores its weight as (out_features, in_features) and
        # computes x @ weight.T, so the transpose makes the layer compute
        # x @ weights.
        layer.weight = nn.Parameter(weights.t())
        return layer

    def forward(self, x):
        """Compute self-attention context vectors, printing each step.

        Args:
            x: Input embeddings of shape (seq_len, in_features), or with
                extra leading batch dimensions. Its dtype must match the
                dtype of the projection weights.

        Returns:
            Tensor of context vectors with the same leading dimensions as
            ``x`` and a last dimension equal to the value projection's
            output size.
        """
        print(f"Input Embeddings: \nShape: {x.shape}\n{x}")
        print(f"W_Query\nShape: {self.W_query.weight.shape}\n{self.W_query.weight}")
        print(f"W_Key\nShape:   {self.W_key.weight.shape}\n{self.W_key.weight}")
        print(f"W_Value\nShape: {self.W_value.weight.shape}\n{self.W_value.weight}")
        print("===========================================")

        query = self.W_query(x)
        key = self.W_key(x)
        value = self.W_value(x)

        # Swap only the last two axes so leading batch dimensions survive.
        key_transposed = key.transpose(-2, -1)
        print(f"Key transposed\nShape: {key_transposed.shape}\n{key_transposed}")

        attention_scores = query @ key_transposed
        print(f"Attention Scores\nShape: {attention_scores.shape}\n{attention_scores}")

        # Scale by sqrt(d_k), the size of the key vectors.
        attention_scores_scaled = attention_scores / math.sqrt(key.shape[-1])
        print(f"Attention Scores Scaled\nShape: {attention_scores_scaled.shape}\n{attention_scores_scaled}")

        # Normalise over the key axis so each query's weights sum to 1.
        attention_weights = F.softmax(attention_scores_scaled, dim=-1)
        print(f"Attention Weights\nShape: {attention_weights.shape}\n{attention_weights}")

        context_vecs = attention_weights @ value
        print(f"Context Vecs\nShape: {context_vecs.shape}\n{context_vecs}")

        return context_vecs





In [17]:
# Build the attention layer from the loaded weights and run it on the
# embeddings; forward prints each intermediate tensor.
self_attention = SelfAttention(self_attention_layer_c=self_attention_layer_c)
self_attention.forward(x=input_embeddings_c)

Input Embeddings: 
Shape: torch.Size([6, 4])
tensor([[-1.00, -0.74,  0.51, -0.08],
        [ 0.07, -0.56, -0.91,  0.36],
        [ 0.36,  0.87, -0.23,  0.04],
        [ 0.66, -0.93, -0.89,  0.06],
        [ 0.34, -0.98, -0.23, -0.87],
        [-0.17,  0.37,  0.18,  0.86]], dtype=torch.float64)
W_Query
Shape: torch.Size([4, 4])
Parameter containing:
tensor([[ 0.69, -0.17, -0.48,  0.27],
        [ 0.05,  0.40, -0.91,  0.51],
        [-0.82,  0.82,  0.47,  0.98],
        [ 0.31,  0.52, -0.34, -0.27]], dtype=torch.float64, requires_grad=True)
W_Key
Shape:   torch.Size([4, 4])
Parameter containing:
tensor([[-0.51,  0.30, -0.45, -0.52],
        [ 0.97, -0.85, -0.13, -0.45],
        [ 0.45,  0.26,  0.53, -0.28],
        [ 0.51,  0.77, -0.04, -0.67]], dtype=torch.float64, requires_grad=True)
W_Value
Shape: torch.Size([4, 4])
Parameter containing:
tensor([[-0.03,  0.81,  0.97,  0.90],
        [ 0.80,  0.01, -0.01, -0.85],
        [ 0.82,  0.03, -0.47,  0.00],
        [-0.88, -0.36, -0.82, -0.23