Recall that the transpose of a product of matrices is equal to the product of their transposes in reverse order, i.e., $(AB)^T = B^T A^T$.

The transpose of a matrix product of more than two matrices can be written similarly. For instance, consider three matrices $A$, $B$, and $C$, then $(ABC)^T = C^T B^T A^T$.

Now, let's apply these properties to $W_{Q K}^{1.4} W_{O V}^{0.7} W_E^T A$:

$(W_{Q K}^{1.4} W_{O V}^{0.7} W_E^T A)^T = A^T (W_E^T)^T (W_{O V}^{0.7})^T (W_{Q K}^{1.4})^T$

Transposing twice returns the original matrix, so $(W_E^T)^T = W_E$. The transposes on $W_{Q K}^{1.4}$ and $W_{O V}^{0.7}$, however, cannot be dropped: a matrix equals its own transpose only when it is symmetric, and being a fixed parameter of the neural network does not make it symmetric.

So, we have:

$(W_{Q K}^{1.4} W_{O V}^{0.7} W_E^T A)^T = A^T W_E (W_{O V}^{0.7})^T (W_{Q K}^{1.4})^T$

I hope this helps! Let me know if you have any further questions.

We can rearrange the terms in the matrix product like this:

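\begin{align*}
W_{QK}^{1.4} W_{OV}^{0.7} W_E^T A &= \left(W_{QK}^{1.4} W_{OV}^{0.7}\right) \left(W_E^T A\right) \\
&= \left(\left(W_E^T A\right)^T \left(W_{QK}^{1.4} W_{OV}^{0.7}\right)^T\right)^T \\
&= \underbrace{\left(A^T W_E \left(W_{OV}^{0.7}\right)^T \left(W_{QK}^{1.4}\right)^T\right)^T}_{\text{rearranged terms}}
\end{align*}

The key steps are:

Break apart the matrix product into two matrix products using associativity of matrix multiplication.

Since matrix multiplication is not commutative in general, we cannot simply swap the two factors. Instead we use the identity $XY = \left(Y^T X^T\right)^T$, which reverses their order at the cost of transposing each factor and the whole product.

We then expand the inner transposes with $(AB)^T = B^T A^T$: $\left(W_E^T A\right)^T = A^T W_E$ and $\left(W_{QK}^{1.4} W_{OV}^{0.7}\right)^T = \left(W_{OV}^{0.7}\right)^T \left(W_{QK}^{1.4}\right)^T$. Note that $\left(W_{QK}^{1.4}\right)^T$ cannot be replaced by $\left(W_K^{1.4}\right)^T$: $W_Q$ and $W_K$ are separate learned matrices, not transposes of each other.

Finally, we rearrange the terms again into the desired form.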

So by repeatedly using the associative property and transpose identities, we can rearrange the product into the desired form. The key is recognizing that we can regroup (re-parenthesize) the matrices in a product however we want as long as their left-to-right order is unchanged, and that the order of the factors can only be reversed by taking a transpose.

### Engineering

In [1]:
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
num_gpus = 4
world_size = 16

In [7]:
# Print the GPU index paired with each rank: the rank modulo num_gpus, so
# consecutive ranks cycle through the GPU indices in order.
for rank in range(world_size):
    print(f"rank: {rank} > gpu: {rank % num_gpus}")

rank: 0 > gpu: 0
rank: 1 > gpu: 1
rank: 2 > gpu: 2
rank: 3 > gpu: 3
rank: 4 > gpu: 0
rank: 5 > gpu: 1
rank: 6 > gpu: 2
rank: 7 > gpu: 3
rank: 8 > gpu: 0
rank: 9 > gpu: 1
rank: 10 > gpu: 2
rank: 11 > gpu: 3
rank: 12 > gpu: 0
rank: 13 > gpu: 1
rank: 14 > gpu: 2
rank: 15 > gpu: 3


In [None]:
container runtime, kubelet, kube-proxy

In [8]:
import socketserver

In [None]:
class EchoRequestHandler(socketserver.StreamRequestHandler):
    """Stream request handler that prints one line received from the client."""

    def handle(self):
        """Read a single line from the connection and print it.

        ``StreamRequestHandler`` exposes the incoming stream as ``self.rfile``;
        it has no ``rread`` attribute, so accessing that name raised
        ``AttributeError``.
        """
        # readline() returns bytes, including the trailing newline if present.
        print(self.rfile.readline())

In [9]:
class RegularState:
    """Snapshot of plain attributes that can be committed and restored.

    Each entry of ``states`` is exposed as an attribute on the instance.
    ``commit`` records the current attribute values; ``restore`` resets the
    attributes to the most recently committed values.

    Args:
        states: Mapping (or iterable of ``(name, value)`` pairs) giving the
            initial attribute values.
    """

    def __init__(self, states):
        # dict() accepts both a mapping and an iterable of pairs. Iterating a
        # dict directly would yield only its keys and break the unpacking.
        self._saved_state = dict(states)
        self._apply_saved_state()

    def _apply_saved_state(self):
        """Set every saved name/value pair as an attribute on ``self``."""
        for name, value in self._saved_state.items():
            setattr(self, name, value)

    def restore(self):
        """Reset the tracked attributes to the last committed values."""
        self._apply_saved_state()

    def commit(self):
        """Record the current values of the tracked attributes."""
        self._saved_state = {
            name: getattr(self, name) for name in self._saved_state
        }

In [10]:
class State(RegularState):
    """State that combines handler-managed objects with plain attributes.

    ``model``, ``optim`` and any extra keyword arguments are split by
    ``get_handlers`` into values that have a dedicated handler and the
    remaining plain values, which are tracked by ``RegularState``.

    Args:
        model: Object stored under the ``"model"`` key.
        optim: Object stored under the ``"optim"`` key.
        **kwargs: Additional named values to track.
    """

    def __init__(self, model, optim, **kwargs):
        kwargs.update({"model": model, "optim": optim})
        handlers, regular_states = get_handlers(kwargs)
        # super() must be called; bare `super.__init__` addresses the builtin
        # type itself and fails.
        super().__init__(regular_states)
        self._handlers = handlers

    def commit(self):
        """Commit every handler, then the plain attributes."""
        # _handlers is a dict keyed by name; iterate the handler objects.
        for handler in self._handlers.values():
            handler.commit()
        RegularState.commit(self)

    def restore(self):
        """Restore every handler, then the plain attributes."""
        for handler in self._handlers.values():
            handler.restore()
        # Restore (not commit) the plain attributes as well.
        RegularState.restore(self)

In [None]:
message passing, file system, shared memory

In [None]:
// Element-wise addition of two int arrays: c[i] = a[i] + b[i] for every
// i < total_elements. Each thread handles at most one element.
__global__ void add(int* a, int* b, int* c, int total_elements) {
    // Flat global index built from the block index, block size and thread index.
    int tid = (blockIdx.x * blockDim.x) + threadIdx.x;

    // Threads whose index falls past the end of the arrays do nothing.
    if (tid < total_elements) {
        c[tid] = a[tid] + b[tid];
    }
}

In [13]:
import copy

In [None]:
class ModelStateHandler:
    """Commit/restore handler for an object exposing ``state_dict``.

    Args:
        model: Object providing ``state_dict()`` and ``load_state_dict()``.
            It is kept on ``self.value``.
    """

    def __init__(self, model):
        self.value = model
        # Deep-copy so later in-place changes to the live state do not alter
        # the snapshot.
        self._saved_model_state = copy.deepcopy(model.state_dict())

    def commit(self):
        """Snapshot the current state of the wrapped model."""
        # Use the wrapped model; there is no `model` name in this scope.
        self._saved_model_state = copy.deepcopy(self.value.state_dict())

    def restore(self):
        """Load the last snapshot back into the wrapped model."""
        self.value.load_state_dict(self._saved_model_state)

    def sync(self):
        """Pass the wrapped model to ``broadcast_parameters``."""
        # NOTE(review): `broadcast_parameters` is not defined in the visible
        # source — confirm where it is imported from.
        broadcast_parameters(self.value)

In [12]:
def get_handler(v):
    """Return a handler instance for ``v``, or ``None`` if none applies.

    Walks ``handler_registry`` as ``(handler_type, handler_cls)`` pairs and
    wraps ``v`` in the first ``handler_cls`` whose ``handler_type`` matches.

    Args:
        v: Value to look up a handler for.

    Returns:
        ``handler_cls(v)`` for the first matching entry, otherwise ``None``.
    """
    # NOTE(review): `handler_registry` is not defined in the visible source;
    # presumably an iterable of (type, handler class) pairs — confirm.
    for handler_type, handler_cls in handler_registry:
        # Match on the value's type, not on the handler class that wraps it.
        if isinstance(v, handler_type):
            return handler_cls(v)
    return None

In [11]:
def get_handlers(states):
    """Split ``states`` into handler-managed entries and plain entries.

    Args:
        states: Mapping (or iterable of ``(name, value)`` pairs).

    Returns:
        A ``(handlers, remainders)`` tuple of dicts: ``handlers`` maps each
        name to the handler returned by ``get_handler``; ``remainders`` maps
        the names for which ``get_handler`` returned ``None`` to their
        original values.
    """
    handlers = {}
    remainders = {}

    # dict() accepts a mapping or pairs; .items() yields (name, value), whereas
    # iterating a dict directly yields only the keys.
    for k, v in dict(states).items():
        handler = get_handler(v)
        if handler is None:
            remainders[k] = v
        else:
            # Keep the handler itself, not the raw value it wraps.
            handlers[k] = handler
    return handlers, remainders

In [None]:
scheduler, api server, controller manager, etcd

In [15]:
class RegularState:
    """Snapshot of plain attributes that can be committed and restored.

    Args:
        states: Mapping (or iterable of ``(name, value)`` pairs) giving the
            initial attribute values. Each entry is exposed as an attribute.
    """

    def __init__(self, states):
        self._saved_states = dict(states)
        # Expose the initial values as attributes right away; otherwise
        # commit() would fail on getattr for names that were never set.
        self._apply_saved_states()

    def _apply_saved_states(self):
        """Set every saved name/value pair as an attribute on ``self``."""
        # .items() yields (name, value); iterating the dict yields keys only.
        for k, v in self._saved_states.items():
            setattr(self, k, v)

    def restore(self):
        """Reset the tracked attributes to the last committed values."""
        self._apply_saved_states()

    def commit(self):
        """Record the current values of the tracked attributes."""
        new_states = {}
        for k in self._saved_states.keys():
            new_states[k] = getattr(self, k)
        self._saved_states = new_states

In [16]:
class State(RegularState):
    """State that combines handler-managed objects with plain attributes.

    Args:
        model: Object stored under the ``"model"`` key.
        optim: Object stored under the ``"optim"`` key.
        kwargs: Optional dict of additional named values to track.
    """

    def __init__(self, model, optim, kwargs=None):
        # Work on a copy so the caller's dict is not mutated.
        states = dict(kwargs) if kwargs is not None else {}
        states.update({"model": model, "optim": optim})
        handlers, regular_states = get_handlers(states)
        self._handlers = handlers

        # NOTE(review): this binds whatever get_handlers stored under each
        # name (the handler object) as the attribute — confirm whether the
        # wrapped value was intended instead.
        for k, v in self._handlers.items():
            setattr(self, k, v)

        RegularState.__init__(self, states=regular_states)

    def restore(self):
        """Restore every handler, then the plain attributes."""
        for handler in self._handlers.values():
            handler.restore()
        RegularState.restore(self)

    def commit(self):
        """Commit every handler, then the plain attributes."""
        for handler in self._handlers.values():
            handler.commit()
        RegularState.commit(self)

In [21]:
import copy

In [22]:
class ModelStateHandler:
    """Commit/restore handler for an object exposing ``state_dict``.

    Args:
        model: Object providing ``state_dict()`` and ``load_state_dict()``.
    """

    def __init__(self, model):
        self.set_value(model)
        # Deep-copy so later in-place changes to the live state do not alter
        # the snapshot.
        self._saved_model_state = copy.deepcopy(
            model.state_dict()
        )

    def set_value(self, value):
        """Store ``value`` as the wrapped object on ``self.value``."""
        # This class has no base class providing set_value, so define it here.
        self.value = value

    def restore(self):
        """Load the last snapshot back into the wrapped model."""
        self.value.load_state_dict(self._saved_model_state)

    def commit(self):
        """Snapshot the current state of the wrapped model."""
        self._saved_model_state = copy.deepcopy(self.value.state_dict())

In [20]:
def get_handler(v):
    """Return a handler instance for ``v``, or ``None`` if none applies.

    Args:
        v: Value to look up a handler for.

    Returns:
        ``handler_cls(v)`` for the first registry entry whose
        ``handler_type`` matches ``v``, otherwise ``None``.
    """
    # NOTE(review): `handler_registry` is not defined in the visible source;
    # presumably an iterable of (type, handler class) pairs — confirm.
    for handler_type, handler_cls in handler_registry:
        # Match on the value's type, not on the handler class that wraps it.
        if isinstance(v, handler_type):
            return handler_cls(v)

    return None

In [17]:
def get_handlers(states):
    """Split ``states`` into handler-managed entries and plain entries.

    Args:
        states: Mapping (or iterable of ``(name, value)`` pairs).

    Returns:
        A ``(handlers, remainders)`` tuple of dicts: ``handlers`` maps each
        name to the handler returned by ``get_handler``; ``remainders`` maps
        the names for which ``get_handler`` returned ``None`` to their
        original values.
    """
    handlers = {}
    remainders = {}

    # .items() yields (name, value); iterating a dict yields only the keys.
    for k, v in dict(states).items():
        # The handler is chosen from the value, not from its name.
        handler = get_handler(v)
        if handler is None:
            remainders[k] = v
        else:
            handlers[k] = handler
    return handlers, remainders

In [None]:
container runtime, scheduler, etcd, api server

In [23]:
world_size = 16
tensor_model_parallel_size = 2
pipeline_model_parallel_size = 4

In [24]:
num_pipeline_model_parallel_groups = 4

In [26]:
# Walk the ranks in consecutive blocks of num_pipeline_model_parallel_groups
# and, within each block, print the ranks reached from each starting offset j
# using a stride of tensor_model_parallel_size.
for i in range(pipeline_model_parallel_size):
    # [start_rank, end_rank) is the half-open rank range of block i.
    start_rank = i*num_pipeline_model_parallel_groups
    end_rank = (i+1)*num_pipeline_model_parallel_groups
    
    for j in range(tensor_model_parallel_size):
        # Every tensor_model_parallel_size-th rank, starting at offset j.
        ranks = list(range(
            start_rank+j,
            end_rank,
            tensor_model_parallel_size
        ))
        print(ranks)

[0, 2]
[1, 3]
[4, 6]
[5, 7]
[8, 10]
[9, 11]
[12, 14]
[13, 15]


In [None]:
clock cycle 1: backward(4, 3), recompute(3, 3)
clock cycle 2: backward(3, 3), recompute(2, 3)
clock cycle 3: backward(2, 3), recompute(1, 3)

In [None]:
hostdiscovery, 3 notifications, elastic driver, torchstate

In [None]:
elastic driver, 3 notifs, torchstate, host discovery

### MechInterp

In [27]:
from itertools import product

In [None]:
combinations = product(range(n_heads), range(n_layers))

In [None]:
tokens = model.to_tokens(text)

In [None]:
_, cache = model.run_with_cache(tokens)

In [None]:
def compute_score(attn_pattern=None, target_pattern=None):
    """Placeholder for scoring an attention pattern against a target.

    The parameters match how this function is called by the head-scoring
    loop (``compute_score(attn_pattern, target_pattern)``); both default to
    ``None`` so a zero-argument call keeps working.

    Args:
        attn_pattern: Attention pattern to score.
        target_pattern: Pattern to compare against.

    Returns:
        ``None`` — the scoring logic is not implemented yet.
    """
    # TODO: implement the actual score.
    return None

In [None]:
results = torch.zeros(n_layers, n_heads)

In [None]:
# Fill results[layer, head] with the score of each head's cached attention
# pattern against target_pattern.
for layer_idx in range(n_layers):
    # Cache key of this layer's attention pattern.
    hook_name = f"blocks.{layer_idx}.attn.hook_pattern"
    layer_attn_pattern = cache[hook_name]
    for head_idx in range(n_heads):
        # The head is selected on the second of four indexed dimensions;
        # presumably (batch, head, query_pos, key_pos) — TODO confirm.
        attn_pattern = layer_attn_pattern[:, head_idx, :, :]
        score = compute_score(attn_pattern, target_pattern)
        results[layer_idx][head_idx] = score

In [28]:
from contextlib import contextmanager

In [29]:
@contextmanager
def use_hooks(model, hooks):
    """Temporarily register forward pre-hooks on ``model.transformer.h[1]``.

    Args:
        model: Object whose ``transformer.h[1]`` supports
            ``register_forward_pre_hook``.
        hooks: Iterable of hook callables to register.

    Yields:
        The list of handles returned by ``register_forward_pre_hook``, in
        registration order. Every handle is removed when the ``with`` block
        exits, including on error.
    """
    # Created before the try so the finally clause can always see it.
    handlers = []
    try:
        for hook in hooks:
            # Keep each returned handle; without it the hook can never be
            # removed again.
            handlers.append(
                model.transformer.h[1].register_forward_pre_hook(hook)
            )
        yield handlers
    finally:
        for handler in handlers:
            handler.remove()

In [None]:
step 1: residual = embed(tokens) + pos_embed(tokens)
step 2: residual = blocks(residual)
step 3: residual = ln_final(residual)
step 4: logits = unembed(residual)

In [None]:
mlp + mid_resid

In [None]:
step 1: ln1 = ln1(resid_pre)
step 2: attn_out = head1(ln1) + head2(ln1)
step 3: mid_resid = resid_pre + attn_out
step 4: ln2 = ln2(mid_resid)
step 5: mlp = mlp(ln2)
step 6: post_resid = mlp + mid_resid

In [30]:
import streamlit as st

In [None]:
st.sidebar.

In [None]:
mid_resid = resid_pre + attn_out

In [None]:
MLP(Attn(t@W_E^T))@W_U^T

In [None]:
tokens = model.to_tokens(text)

In [None]:
_, cache = model.run_with_cache(tokens)

In [None]:
embed = cache["hook_embed"]
pos_embed = cache["hook_pos_embed"]
head_outputs = cache["result", prev_layer_idx]

In [None]:
input_components = torch.cat(
    [embed, pos_embed, head_outputs],
    dim=-1
)

In [31]:
from einops import einsum

In [None]:
W_Q = model.W_Q[layer_idx]
query_components = einsum(
    input_components, W_Q,
    ""
)

In [None]:
query_contributions = query_components.pow(2).sum(dim=-1)

In [None]:
W_pos = model.W_pos
W_Q = model.W_Q[layer_idx, head_idx]
W_K = model.W_K[layer_idx, head_idx]

In [None]:
# Bilinear form for query-key scores: queries come from W_Q and keys from W_K,
# so the product is W_Q @ W_K.T.
# assumes W_Q and W_K are both (d_model, d_head), making this (d_model, d_model)
# — TODO confirm against the model's weight layout
QK_circuit = W_Q @ W_K.T

In [None]:
pos_by_pos_scores = W_pos @ QK_circuit @ W_pos.T

In [None]:
def mask_scores(scores):
    """Apply a causal mask to a matrix of attention scores.

    Entries on or below the main diagonal are kept; entries above it are
    replaced with a large negative value so a softmax over the last
    dimension gives them ~0 weight.

    Args:
        scores: Tensor of attention scores.
            # assumes rows index the query position and columns the key
            # position — TODO confirm

    Returns:
        Tensor of the same shape as ``scores`` with the upper triangle
        (excluding the diagonal) set to ``-1e9``.
    """
    # tril keeps column <= row, i.e. the current and earlier positions. triu
    # would keep the later positions instead.
    masks = torch.tril(torch.ones_like(scores)).bool()
    # Build the fill value with the same dtype and device as `scores`.
    neg_inf = scores.new_tensor(-1e9)
    return torch.where(masks, scores, neg_inf)

In [None]:
# Scale the scores by sqrt(d_head), then mask them. The masking function
# defined in this file is `mask_scores`.
masked_pos_by_pos = mask_scores(
    pos_by_pos_scores/(d_head**0.5)
)

In [None]:
# Softmax over the last dimension of the masked scores. The variable holding
# them is `masked_pos_by_pos`.
pos_by_pos_pattern = F.softmax(masked_pos_by_pos, dim=-1)

In [None]:
tokens = model.to_tokens(text)

In [None]:
_, cache = model.run_with_cache(tokens)

In [None]:
embed = cache["hook_embed"]
pos_embed = cache["hook_pos_embed"]
head_outputs = cache["result", layer_idx-1]

In [None]:
input_components = torch.cat([
    embed, pos_embed, head_outputs
], dim=-1)

In [None]:
W_Q = model.W_Q[layer_idx, head_idx]
query_components = einsum(
    input_components,
    W_Q,
    ""
)

In [None]:
W_K = model.W_K[layer_idx, head_idx]
key_components = einsum(
    input_components,
    W_K,
    ""
)

In [None]:
decomposed_scores = einsum(
    query_components,
    key_components,
    ""
)

In [None]:
W_E = model.W_E

W_Q = model.W_Q[1, 4]
W_K = model.W_K[1, 4]

W_O = model.W_O[0, 7]
W_V = model.W_V[0, 7]

In [None]:
Q = W_E @ W_Q

In [None]:
model.resid_accumte

In [None]:
v@W_E^T@W_QK@[v1, v2]

In [None]:
probs @ W_O + bias

### Persistence

In [32]:
import torch
from torch import nn
import torch.nn.functional as F

In [45]:
a = 10
a1 = hex(id(a))

In [46]:
a = 15
a2 = hex(id(a))

What is the `output`? Explain

In [47]:
output = a1 == a2

In [48]:
output

False

**Explain**

The `id()` function in Python returns the memory address of an object. When you do `a = 10`, Python creates an integer object with the value of `10` and assigns the variable `a` to point to that object. When you call `id(a)`, it returns the memory address of the object `10`.

Now, when you later reassign `a = 15`, Python creates a new integer object with the value of `15` and changes the variable a to point to this new object. So, the memory address returned by `id(a)` after the reassignment is the address of the new object `15`, which is different from the previous object `10`.

Hence, `a1` and `a2` hold the memory addresses of two different objects (`10` and `15`, respectively), so `a1 == a2` is `False`.

In [None]:
step 1: download the parquet file from s3
step 2: load the parquet file using apache arrow
step 3: extract the schema


In [50]:
from sqlalchemy.orm import sessionmaker

In [None]:
# Session factory bound to the database engine.
# NOTE(review): `engine` is not defined in the visible source — it must be
# created before this line runs.
Session = sessionmaker(bind=engine)

In [51]:
import pytest

In [None]:
@pytest.mark.parametrize(
    ("value", "expected"),
    [(1, 1), (2, 4)],
)
def test_square(value, expected):
    """Check that ``square`` returns the expected result for each case."""
    # The argument is named `value` because `input` would shadow the builtin.
    assert square(value) == expected

In [52]:
from typing import TypedDict

In [53]:
class NutritionInformation(TypedDict):
    """Typed dictionary with an integer ``value`` and a string ``unit``."""

    # Numeric amount.
    value: int
    # Unit of the amount, as a string.
    unit: str

In [56]:
class RecipeNutritionInformation(TypedDict):
    """Typed dictionary with a recipe count and two nutrition entries."""

    # Integer count of recipes.
    recipes_used: int
    # Calories, as a NutritionInformation dict (value + unit).
    calories: NutritionInformation
    # Carbs, as a NutritionInformation dict (value + unit).
    carbs: NutritionInformation

In [None]:
class Model(nn.Module):
    """Module that registers a single buffer named ``means``."""

    def __init__(self):
        super().__init__()
        # NOTE(review): `_one_vector` is not defined in the visible source —
        # it must be provided elsewhere before this class is instantiated.
        self.register_buffer("means", _one_vector)

In [None]:
torch.cuda.current_device()

In [57]:
def mse(preds, targs):
    """Return the mean of the squared differences between two inputs.

    Args:
        preds: Predicted values (must support ``-``, ``.pow`` and ``.mean``).
        targs: Target values, compatible with ``preds`` under subtraction.

    Returns:
        The mean of ``(preds - targs) ** 2``.
    """
    residual = preds - targs
    squared = residual.pow(2)
    return squared.mean()

In [58]:
def split_dataset(dataset, percent):
    """Split a sliceable dataset into a leading and a trailing part.

    Args:
        dataset: Object supporting ``len()`` and slicing.
        percent: Fraction of the items that go into the first part; the cut
            index is ``len(dataset) * percent`` truncated to an int.

    Returns:
        A ``(first, rest)`` tuple of slices of ``dataset``.
    """
    boundary = int(len(dataset) * percent)
    first = dataset[:boundary]
    rest = dataset[boundary:]
    return first, rest

In [None]:
class BatchNorm(nn.Module):
    """Normalization layer using a single scalar mean and variance.

    In training mode the input is normalized with the mean and variance of
    the whole input tensor, and running estimates of both are updated. In
    eval mode the running estimates are used instead.

    Args:
        mom: Interpolation weight for the running statistics (the fraction
            moved toward the current batch statistics on each update).
        eps: Constant added to the variance before the square root.
    """

    def __init__(self, mom, eps):
        super().__init__()
        self.mom = mom
        self.eps = eps
        # register_buffer takes the buffer name first and returns None, so its
        # result must not be assigned; it creates self.mean / self.var itself.
        self.register_buffer("mean", torch.zeros(1))
        self.register_buffer("var", torch.ones(1))

    def _normalize(self, x, mean, var):
        """Return ``x`` shifted by ``mean`` and divided by sqrt(var + eps)."""
        return (x - mean) / (var + self.eps).sqrt()

    def scale(self, mean, var, x):
        """Update the running statistics and return the normalized input.

        Args:
            mean: Batch mean to move the running mean toward.
            var: Batch variance to move the running variance toward.
            x: Input tensor to normalize with ``mean`` and ``var``.

        Returns:
            ``x`` normalized with the given batch statistics.
        """
        # The running buffers are bookkeeping only; keep them out of autograd.
        with torch.no_grad():
            self.mean.lerp_(mean, self.mom)
            self.var.lerp_(var, self.mom)
        return self._normalize(x, mean, var)

    def forward(self, x):
        """Normalize ``x`` and, in training mode, update running statistics."""
        if not self.training:
            return self._normalize(x, self.mean, self.var)

        with torch.no_grad():
            mean, var = x.mean(), x.var()

        return self.scale(mean, var, x)

In [None]:
class BatchNorm(nn.Module):
    """Normalization layer with statistics taken over dimension 0.

    In training mode the input is normalized with the mean and variance
    computed along ``dim=0``, and running estimates of both are updated. In
    eval mode the running estimates are used instead.

    Args:
        mom: Interpolation weight for the running statistics (the fraction
            moved toward the current batch statistics on each update).
        eps: Constant added to the variance before the square root.
        num_features: Length of the running-statistic buffers; it must match
            the shape of ``x.mean(dim=0)``. Defaults to 1, the original
            buffer size.
    """

    def __init__(self, mom, eps, num_features=1):
        super().__init__()
        self.mom = mom
        self.eps = eps
        # The unfinished `self.mults = nn` assignment (it stored the torch.nn
        # module itself) was dropped; no learnable scale is defined here.
        self.register_buffer("mean", torch.zeros(num_features))
        self.register_buffer("var", torch.ones(num_features))

    def update_stats(self, x):
        """Compute batch statistics and fold them into the running buffers.

        Args:
            x: Input tensor; statistics are taken along ``dim=0``.

        Returns:
            ``(mean, var)`` of the current batch.
        """
        mean = x.mean(dim=0)
        # Variance, not a second mean.
        var = x.var(dim=0)
        self.mean.lerp_(mean, self.mom)
        self.var.lerp_(var, self.mom)
        return mean, var

    def forward(self, x):
        """Normalize ``x`` and, in training mode, update running statistics."""
        if self.training:
            with torch.no_grad():
                mean, var = self.update_stats(x)
        else:
            mean, var = self.mean, self.var

        # Divide by the standard deviation: sqrt(var + eps), not var + eps.
        x = (x-mean)/(var+self.eps).sqrt()
        return x