### Engineering

In [1]:
import torch
from torch import torch
import torch.nn.functional as F

In [2]:
import torch.distributed as dist
import torch.distributed.rpc as rpc

In [None]:
# NOTE(review): elsewhere in this notebook `set_device_map` is invoked as a method
# on an RPC backend-options object (`options.set_device_map(...)`). Called bare like
# this, the name is not defined in any visible cell — presumably shorthand; confirm.
set_device_map("worker1", {"cuda:0": "cuda:1"})

In [None]:
class ParallelContext:
    def init_rpc_workers(self, host, port):
        """Initialise the RPC framework for this process when pipeline parallelism is on.

        Does nothing when ``self.pipeline_parallel_size`` is 1 or less.

        Args:
            host: Address of the rendezvous endpoint (used in the ``tcp://`` init method).
            port: Port of the rendezvous endpoint.
        """
        if self.pipeline_parallel_size <= 1:
            return

        rank = self.get_local_rank(ParallelMode.GLOBAL)
        world_size = self.get_world_size(ParallelMode.GLOBAL)

        # The TensorPipe options class is the one that provides `set_device_map`;
        # the base `RpcBackendOptions` does not.
        options = rpc.TensorPipeRpcBackendOptions(
            init_method=f"tcp://{host}:{port}"
        )

        if torch.cuda.is_available():
            for other_rank in self.get_ranks_in_group(ParallelMode.GLOBAL):
                if other_rank == rank:
                    continue
                # NOTE(review): maps local device index `rank` to remote device index
                # `other_rank`, i.e. assumes device index == rank — confirm.
                options.set_device_map(
                    WORKER_NAME.format(other_rank),
                    {rank: other_rank}
                )

        rpc.init_rpc(
            # Register under the same formatted name that peers use in their
            # device maps; passing the raw template would give every worker
            # an identical name.
            name=WORKER_NAME.format(rank),
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )

In [None]:
int *h_a, *h_b, *h_c;

In [None]:
size_t bytes = sizeof(int)*n;

In [None]:
/* Allocate the three host buffers, `bytes` bytes each.
 * NOTE(review): the malloc results are not checked for NULL here. */
h_a = (int*)malloc(bytes);
h_b = (int*)malloc(bytes);
h_c = (int*)malloc(bytes);

In [None]:
message passing, collective communication, p2p communication

In [None]:
prob[0] = sigmoid(logit0 - logit1)

In [4]:
receiver_heads = [(7, 3), (7, 9), (8, 6), (8, 10)]
receiver_layers = [7, 8]

In [None]:
_, clean_cache = model.run_with_cache(clean_tokens)
_, corrupted_cache = model.run_with_cache(corrupted_tokens)

In [5]:
from itertools import product
from functools import partial

In [6]:
n_heads = 12

In [7]:
# Every (layer, head) pair in the layers below the highest receiver layer.
sender_heads = [
    (layer, head)
    for layer in range(max(receiver_layers))
    for head in range(n_heads)
]

In [11]:
def patch_sender_head_output(acts, hook, clean_cache, corrupted_cache, trg_head):
    """Forward hook: freeze head outputs to the clean run, patching in one corrupted head.

    Every head at this hook point is overwritten with its cached clean value.
    If the hook sits in the target layer, the target head is then overwritten
    with its cached corrupted value, so only the sender head carries the
    corrupted signal downstream.

    Args:
        acts: Activation tensor at this hook point; the head axis is dim 2
            (as implied by the ``[:, :, head]`` indexing). Modified in place.
        hook: Hook point object exposing ``name`` and ``layer()``.
        clean_cache: Mapping from hook name to activations of the clean run.
        corrupted_cache: Mapping from hook name to activations of the corrupted run.
        trg_head: ``(layer_idx, head_idx)`` of the sender head to patch.

    Returns:
        The patched ``acts`` tensor.
    """
    trg_layer_idx, trg_head_idx = trg_head

    # Freeze everything to the clean run first (previously non-target layers were
    # replaced wholesale by the corrupted run and `clean_cache` was never used).
    acts[...] = clean_cache[hook.name]

    if hook.layer() == trg_layer_idx:
        acts[:, :, trg_head_idx] = corrupted_cache[hook.name][:, :, trg_head_idx]

    return acts

In [13]:
def patch_receiver_head_input(acts, hook, patched_cache, receiver_heads):
    """Forward hook: overwrite the receiver heads of this layer with cached values.

    Args:
        acts: Activation tensor at this hook point; heads are indexed on dim 2.
            Modified in place.
        hook: Hook point object exposing ``name`` and ``layer()``.
        patched_cache: Mapping from hook name to the activations to copy from.
        receiver_heads: Iterable of ``(layer_idx, head_idx)`` pairs.

    Returns:
        The same ``acts`` tensor after patching.
    """
    current_layer = hook.layer()

    # Keep only the heads that live in the layer this hook is attached to.
    selected = []
    for layer, head in receiver_heads:
        if layer == current_layer:
            selected.append(head)

    source = patched_cache[hook.name]
    acts[:, :, selected] = source[:, :, selected]
    return acts

In [12]:
from transformer_lens.utils import get_act_name

In [None]:
# Path patching: for each candidate sender head, (1) run the clean prompt with the
# sender's output patched and cache every activation, then (2) re-run the clean
# prompt with only the receiver heads' inputs replaced by those cached values.
for layer_idx, head_idx in sender_heads:  # iterate the pairs directly; range() rejects a list
    model.reset_hooks()

    # Pass 1: patch the sender head and record the resulting activations.
    sender_hook_name = get_act_name("z", layer_idx)
    sender_hook = partial(
        patch_sender_head_output,
        clean_cache=clean_cache,
        corrupted_cache=corrupted_cache,
        trg_head=(layer_idx, head_idx)
    )
    model.add_hook(sender_hook_name, sender_hook)
    _, patched_cache = model.run_with_cache(clean_tokens)
    # Drop the sender hook so the second pass is affected by the receiver hooks only.
    model.reset_hooks()

    # Pass 2: feed the cached values into the receiver heads, one hook per receiver layer.
    receiver_hook = partial(
        patch_receiver_head_input,
        patched_cache=patched_cache,
        receiver_heads=receiver_heads
    )
    # `fwd_hooks` takes (hook name, hook function) pairs.
    # NOTE(review): `patched_logits` is overwritten on every iteration, as before.
    patched_logits = model.run_with_hooks(
        clean_tokens,
        fwd_hooks=[
            (get_act_name("v", receiver_layer), receiver_hook)
            for receiver_layer in receiver_layers
        ]
    )

In [None]:
W_E = model.W_E
W_U = model.W_U

OV_circuit = model.W_V[layer_idx, head_idx] @ model.W_O[layer_idx, head_idx]

In [None]:
full_OV_circuit = W_E @ OV_circuit @ W_U

In [None]:
step 1: logit_diff = W_U[:, 0] - W_U[:, 1]
step 2: approximate layer norm
step 3: inverse transformation
step 4: coeff.T @ logit_diff

In [None]:
- clock cycle 1: F_{i, j}
- clock cycle 2: F_{i+1, j}, F_{i, j+1}
- clock cycle 3: F_{i+2, j}, F_{i+1, j+1}, F_{i, j+2} 

In [None]:
- clock cycle 1: F_{0, 0}
- clock cycle 2: F_{1, 0}, F_{0, 1}
- clock cycle 3: F_{2, 0}, F_{1, 1}
- clock cycle 4: F_{2, 1}

In [None]:
tokens = model.to_tokens(text)

In [None]:
_, cache = model.run_with_cache(tokens)

In [None]:
cache["pattern", layer_idx][:, head_idx][:, -1, 0]

In [None]:
- step 1: head_out = L0H00(pre_resid)
- step 2: resid0 = pre_resid + head_out
- step 3: mlp1_out = MLP1(resid0)
- step 4: resid1 = resid0 + mlp1_out
- step 5: mlp2_out = MLP2(resid1)
- step 6: resid2 = resid1 + mlp2_out

In [14]:
layer_idx, head_idx = 0, 0

In [None]:
W_OV = model.W_V[layer_idx, head_idx] @ model.W_O[layer_idx, head_idx]

In [None]:
W_E = model.W_E
# The embedding matrix is laid out [d_vocab, d_model], so token ids select rows.
# Indexing columns would pick d_model coordinates and break the later `@ W_OV`.
open_embeddings = W_E[open_idx]
close_embeddings = W_E[close_idx]

In [None]:
open_resid = open_embeddings + open_embeddings @ W_OV
close_resid = close_embeddings + close_embeddings @ W_OV

In [None]:
# Compare along the feature (last) axis. The default dim=1 raises for 1-D
# vectors and gives the same result as dim=-1 for 2-D inputs.
similarity = torch.cosine_similarity(open_resid, close_resid, dim=-1)

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the model: a single linear layer mapping 5 input features to 1 output
model = nn.Linear(5, 1)

# Initialize the optimizer over the model's parameters
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Dummy data: 10 random samples with 5 features each, and 10 random targets
# (no seed is set, so the values differ on every run)
input_tensor = torch.randn(10, 5)
target_tensor = torch.randn(10, 1)

# Forward pass
output = model(input_tensor)

# Loss calculation: mean squared error between prediction and target
loss_function = nn.MSELoss()
loss = loss_function(output, target_tensor)

# Backward pass: populates .grad on the model's parameters
loss.backward()

# Optimizer step: one Adam update using the gradients computed above
optimizer.step()


In [2]:
# Snapshot the (name, parameter) pairs once so both reports walk the same entries
named_params = list(model.named_parameters())

# Current parameter values
for name, param in named_params:
    print(f"{name}: {param.data}")

# Gradients currently stored on each parameter
for name, param in named_params:
    print(f"{name} gradient: {param.grad}")

# Per-parameter optimizer state
optimizer_state = optimizer.state_dict()['state']
print("Optimizer states:", optimizer_state)


weight: tensor([[-0.3182,  0.2353, -0.1796, -0.0823, -0.4102]])
bias: tensor([-0.3449])
weight gradient: tensor([[ 0.1302,  0.5341, -0.2746, -0.0766, -0.8658]])
bias gradient: tensor([0.5989])
Optimizer states: {0: {'step': tensor(1.), 'exp_avg': tensor([[ 0.0130,  0.0534, -0.0275, -0.0077, -0.0866]]), 'exp_avg_sq': tensor([[1.6940e-05, 2.8530e-04, 7.5381e-05, 5.8721e-06, 7.4961e-04]])}, 1: {'step': tensor(1.), 'exp_avg': tensor([0.0599]), 'exp_avg_sq': tensor([0.0004])}}


In [3]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
mlp > residual stream > attention head

In [None]:
accuracy, latency, data quality, distribution shift

In [4]:
layer_idx, head_idx = 9, 9

In [None]:
W_OV = model.W_V[layer_idx, head_idx] @ model.W_O[layer_idx, head_idx]

In [None]:
_, cache = model.run_with_cache(clean_tokens)

In [None]:
W_E = model.W_E
io_embeddings = model.W_E

In [None]:
# Pairwise similarity (dot product) within each modality.
text_similarity = text_embedding @ text_embedding.T
# Spelling fixed: the following cell reads `image_similarity`.
image_similarity = image_embedding @ image_embedding.T

In [None]:
target = F.softmax(
    (text_similarity + image_similarity)/2,
    dim=-1
)

In [None]:
_, cache = model.run_with_cache(clean_tokens)

In [5]:
from transformer_lens.utils import get_act_name

In [6]:
layer_idx, head_idx = 9, 9

In [7]:
hook_name = get_act_name("z", layer_idx)

In [8]:
hook_name

'blocks.9.attn.hook_z'

In [None]:
# `hook_z` activations carry the head axis at dim 2 (the same `[:, :, head]` layout
# used by the patching hooks in this notebook); `[:, head_idx]` would select a
# sequence position instead of a head.
output = cache[hook_name][:, :, head_idx] @ model.W_O[layer_idx, head_idx]

In [None]:
io_dirs = model.W_U[:, io_idxs]
s_dirs = model.W_U[:, s_idxs]

In [None]:
projection_in_io_dirs = output @ io_dirs
projection_in_s_dirs = output @ s_dirs

In [None]:
pattern = cache["pattern", layer_idx][:, head_idx]

In [None]:
attn_from_end_to_io = pattern[:, -1, io_tokens]
attn_from_end_to_s = pattern[:, -1, s_tokens]

In [None]:
class ResidualLayerNorm(nn.Module):
    """Add-and-norm block: ``LayerNorm(dropout(x) + residual)``.

    Args:
        d_model: Size of the last (feature) dimension that is normalised.
        dropout: Dropout probability applied to ``x`` before the residual add.
    """

    def __init__(self, d_model, dropout):
        super().__init__()
        self.ln = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, residual):
        """Return the layer-normalised sum of ``dropout(x)`` and ``residual``."""
        # The layer norm was constructed but never applied; apply it to the sum.
        return self.ln(self.dropout(x) + residual)

In [None]:
def func():
    """Placeholder; the body has not been written yet."""
    pass

In [None]:
def add_hook(model):
    """Register the forward hook ``add`` on every layer in ``model.layers``.

    Args:
        model: Object exposing an iterable ``layers`` attribute whose items
            support ``register_forward_hook``.

    Returns:
        A list with the value returned by each ``register_forward_hook`` call,
        in layer order, so the caller can detach the hooks later instead of
        leaving them attached for good.
    """
    return [layer.register_forward_hook(add) for layer in model.layers]

In [9]:
class Optimizer:
    """Minimal optimizer skeleton holding a parameter collection and a learning rate."""

    def __init__(self, params, lr):
        """Store the parameters and learning rate.

        Args:
            params: Iterable of parameter objects exposing a ``grad`` attribute.
            lr: Learning rate.
        """
        # Materialise the iterable so a one-shot generator can be walked on
        # every later call.
        self.params = list(params)
        self.lr = lr

    def zero_grad(self):
        """Clear the stored gradient of every tracked parameter."""
        # Iterate the instance attribute; a bare `params` is undefined in this scope.
        for p in self.params:
            # None rather than 0: a torch tensor's `.grad` accepts only a Tensor or None.
            p.grad = None

In [10]:
import uuid

In [11]:
int(uuid.uuid4())

283543319665403365987628748860915578569

In [21]:
int(uuid.uuid4()) % 10000 

2246

In [23]:
import random
random.randint(0, 9999)

9865

In [24]:
from enum import Enum, auto

In [25]:
class JobType(Enum):
    """Enumeration with two members, FORWARD and BACKWARD."""

    # Values written out explicitly; they match what sequential numbering from 1 yields.
    FORWARD = 1
    BACKWARD = 2

In [26]:
JobType.FORWARD

<JobType.FORWARD: 1>