### Engineering

In [3]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
scatter > all reduce > identity > gather

In [None]:
embedding, linear, layer norm, attention

start, begin forward, begin backward, finished backward, finished batch

In [None]:
{"worker1": {"cuda:0": "cuda:1"}}

In [4]:
import torch.distributed as dist
import torch.distributed.rpc as rpc

In [None]:
class ParallelContext:
    """Parallel-context sketch; only the RPC bootstrap is defined in this cell.

    The method below relies on ``self.pipeline_parallel_size``,
    ``self.get_local_rank``, ``self.get_world_size`` and
    ``self.get_ranks_in_group``, as well as the module-level names
    ``ParallelMode`` and ``WORKER_NAME``; none of them are defined in this cell.
    """

    def init_rpc_workers(self, host, port):
        """Start ``torch.distributed.rpc`` for this process when pipelining is on.

        Args:
            host: Host name or address used to build the ``tcp://`` init method.
            port: Port used to build the ``tcp://`` init method.

        Returns:
            None. Does nothing when ``self.pipeline_parallel_size`` is not
            greater than 1.
        """
        # RPC is only set up when there is more than one pipeline stage.
        if self.pipeline_parallel_size <= 1:
            return

        rank = self.get_local_rank(ParallelMode.GLOBAL)
        world_size = self.get_world_size(ParallelMode.GLOBAL)

        # ``set_device_map`` is defined on the TensorPipe options class, not on
        # the base ``RpcBackendOptions``, so the TensorPipe class must be used.
        options = rpc.TensorPipeRpcBackendOptions(
            init_method=f"tcp://{host}:{port}"
        )

        if torch.cuda.is_available():
            for other in self.get_ranks_in_group(ParallelMode.GLOBAL):
                if other == rank:
                    continue
                # Maps this worker's device index ``rank`` to device index
                # ``other`` on the peer.
                # NOTE(review): assumes each rank drives the CUDA device whose
                # index equals its rank — confirm for multi-node setups.
                options.set_device_map(
                    WORKER_NAME.format(other),
                    {rank: other}
                )

        rpc.init_rpc(
            name=WORKER_NAME.format(rank),
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )

In [8]:
class _P2P:
    """Point-to-point sender built on ``torch.distributed.send``."""

    def _send_metadata(self, data, dst_rank, parallel_context, parallel_mode):
        """Send three header tensors describing ``data`` to ``dst_rank``.

        The headers go out in this order: the dtype id looked up in
        ``DTYPE_TO_ID``, a 0/1 ``requires_grad`` flag, and the shape.
        """
        process_group = parallel_context.get_group(parallel_mode)

        def _emit(header):
            # Every header uses the same destination and process group.
            dist.send(header, dst=dst_rank, group=process_group)

        _emit(torch.tensor(DTYPE_TO_ID[data.dtype]))
        _emit(torch.tensor(int(bool(data.requires_grad))))
        _emit(torch.tensor(data.shape))

    def send(self, data, dst_rank, parallel_context, parallel_mode):
        """Send the metadata headers for ``data``, then ``data`` itself."""
        process_group = parallel_context.get_group(parallel_mode)
        self._send_metadata(data, dst_rank, parallel_context, parallel_mode)
        dist.send(data, dst=dst_rank, group=process_group)

In [6]:
def send(data, src_rank, dst_rank, parallel_context, parallel_mode):
    """Send ``data`` to ``dst_rank``, but only from the process at ``src_rank``.

    Every other process returns immediately without sending anything.
    The rank is read from ``parallel_context.get_local_rank(parallel_mode)``.
    """
    if parallel_context.get_local_rank(parallel_mode) != src_rank:
        return

    _P2P().send(data, dst_rank, parallel_context, parallel_mode)

In [None]:
ptr = (int*)malloc(69)

In [None]:
/* Return the constant 0. */
int zero() {
    return 0;  /* the statement terminator was missing */
}

In [None]:
node > pod > container

In [None]:
_, cache = model.run_with_cache(corrupted_tokens)

In [None]:
batch_size, seq_len = corrupted_tokens.size()

In [None]:
means = torch.zeros(n_layers, batch_size, seq_len, n_heads, d_heads)

In [11]:
from einops import reduce

In [10]:
from transformer_lens.utils import get_act_name

In [None]:
# For every layer, average the attention "z" activations over each group of
# samples and store the result in ``means`` for every sample of that group.
for layer_idx in range(n_layers):
    hook_name = get_act_name("z", layer_idx)
    z = cache[hook_name]

    for sample_idxs in corrupted_groups:
        # NOTE(review): assumes ``sample_idxs`` holds batch indices and that
        # ``z`` is laid out as (batch_size, seq_len, n_heads, d_head) — confirm.
        template_z = z[sample_idxs]
        # einops ``reduce`` needs an "input -> output" pattern plus the
        # reduction to apply; here the batch axis is averaged away.
        mean_z = reduce(
            template_z,
            "batch_size seq_len n_heads d_head -> seq_len n_heads d_head",
            "mean",
        )
        # Broadcast the group mean to each sample of the group.
        means[layer_idx, sample_idxs] = mean_z.to(means.device)

In [13]:
receiver_heads = [(7, 3), (7, 9), (8, 6), (8, 10)]
receiver_layers = [7, 8]

In [22]:
def patch_sender_head_output(
    acts, hook, clean_cache, corrupted_cache, target_head
):
    """Hook function: patch one head from the corrupted cache, else use clean.

    Args:
        acts: Activation passed to the hook; indexed as ``[:, :, head]``.
        hook: Hook object exposing ``layer()`` and ``name``.
        clean_cache: Mapping from hook name to the clean activation.
        corrupted_cache: Mapping from hook name to the corrupted activation.
        target_head: ``(layer_idx, head_idx)`` pair of the head to patch.

    Returns:
        ``acts`` modified in place when the hook sits on the target layer;
        otherwise the clean-cache entry for this hook.
    """
    sender_layer, sender_head = target_head

    # Hooks on any other layer hand back the clean activation unchanged.
    if hook.layer() != sender_layer:
        return clean_cache[hook.name]

    # On the target layer only the chosen head is overwritten (in place).
    corrupted_z = corrupted_cache[hook.name]
    acts[:, :, sender_head] = corrupted_z[:, :, sender_head]
    return acts

In [24]:
def patch_receiver_head_input(acts, hook, patched_cache, receiver_heads):
    """Hook function: overwrite this layer's receiver heads from ``patched_cache``.

    Args:
        acts: Activation passed to the hook; indexed as ``[:, :, heads]``.
        hook: Hook object exposing ``layer()`` and ``name``.
        patched_cache: Mapping from hook name to the replacement activation.
        receiver_heads: Iterable of ``(layer_idx, head_idx)`` pairs.

    Returns:
        ``acts``, modified in place for the heads whose layer matches the hook.
    """
    selected = []
    for layer, head in receiver_heads:
        if layer == hook.layer():
            selected.append(head)

    replacement = patched_cache[hook.name]
    acts[:, :, selected] = replacement[:, :, selected]
    return acts

In [23]:
from itertools import product
from functools import partial

In [17]:
n_heads = 12

In [19]:
sender_heads = list(product(range(max(receiver_layers)), range(n_heads)))

In [None]:
_, clean_cache = model.run_with_cache(clean_tokens)
_, corrupted_cache = model.run_with_cache(corrupted_tokens)

In [None]:

# For each candidate sender head: (1) run the clean prompt with that head's
# output taken from the corrupted run and cache the activations; (2) run the
# clean prompt again with the receiver heads' inputs taken from that cache.
for layer_idx, head_idx in sender_heads:
    model.reset_hooks()

    # Pass 1: patch the sender head and record the resulting activations.
    # NOTE(review): the hook is attached only to the sender layer's "z" hook,
    # so the clean-cache branch of patch_sender_head_output never runs here —
    # confirm whether it should be attached to every layer's "z" hook.
    hook_name = get_act_name("z", layer_idx)
    hook_func = partial(
        patch_sender_head_output,
        clean_cache=clean_cache,
        corrupted_cache=corrupted_cache,
        target_head=(layer_idx, head_idx)
    )
    model.add_hook(hook_name, hook_func)
    _, patched_cache = model.run_with_cache(clean_tokens)

    # Remove the sender hook so it does not also fire during pass 2.
    model.reset_hooks()

    # Pass 2: the receiver hook takes ``patched_cache``/``receiver_heads``, so
    # it must wrap patch_receiver_head_input, and it has to be attached on the
    # receiver layers (the function filters heads by ``hook.layer()``).
    hook_func = partial(
        patch_receiver_head_input,
        patched_cache=patched_cache,
        receiver_heads=receiver_heads
    )
    patched_logits = model.run_with_hooks(
        clean_tokens,
        fwd_hooks=[
            (get_act_name("v", receiver_layer), hook_func)
            for receiver_layer in receiver_layers
        ]
    )
    # NOTE(review): ``patched_logits`` is overwritten on every iteration;
    # compute and store the per-sender metric here before the next pass.

In [None]:
/* Yields the integer constant zero; takes no arguments. */
int zero()
{
    const int result = 0;
    return result;
}

In [None]:
torch.roll(x, shifts=1, dim=0)

In [None]:
step 1: get global rank
step 2: resize vocab size
step 3: resize lm_head
step 4: parallelize embedding, linear, attn, layernorm

In [None]:
step 1: wait input
step 2: get input
step 3: construct task
step 4: put
step 5: wait for in
step 6:

In [None]:
def broadcast_with_forward_and_backward():
    """Placeholder with no behavior yet; calling it returns ``None``."""
    return None

In [25]:
import threading

In [26]:
lock = threading.Lock()

In [27]:
def run():
    """Call ``print_numbers()`` while holding the module-level ``lock``.

    The lock is released even if ``print_numbers()`` raises.
    """
    lock.acquire()
    try:
        print_numbers()
    finally:
        lock.release()

In [None]:
t1 = threading.Thread(target=run)

In [None]:
forward(x) -> output = forward(x) -> backward(output)

In [28]:
from typing import Callable

In [29]:
def foo(func: Callable[[int, int], str]) -> str:
    """Stub showing a ``Callable`` annotation for a two-int, str-returning function.

    The body is not implemented: ``func`` is never called and the function
    currently returns ``None`` despite the ``str`` return annotation.
    """

In [None]:
[x**2 for x in l]

In [None]:
batch_size, seq_len = corrupted_tokens.size()

In [None]:
means = torch.zeros(n_layers, batch_size, seq_len, n_heads, d_head)

In [None]:
_, cache = model.run_with_cache(corrupted_tokens)

In [30]:
from einops import reduce

In [None]:
# For every layer, average the attention "z" activations over each group of
# samples and store the result in ``means`` for every sample of that group.
for layer_idx in range(n_layers):
    hook_name = get_act_name("z", layer_idx)
    for sample_idxs in corrupted_groups:
        # NOTE(review): assumes ``sample_idxs`` holds batch indices, so it
        # indexes the first axis rather than the sequence axis — confirm.
        # einops ``reduce`` also requires the reduction name as its third
        # argument; the batch axis is averaged away here.
        mean_z = reduce(
            cache[hook_name][sample_idxs],
            "batch_size seq_len n_heads d_head -> seq_len n_heads d_head",
            "mean",
        )
        # Broadcast the group mean to each sample of the group.
        means[layer_idx, sample_idxs] = mean_z.to(means.device)
        

In [None]:
softmax(x@W_Q@W_K.T@x.T) @ x @ W_V @ W_O

In [None]:
name mover heads, s-inhibition heads, duplication token head

In [None]:
tokens = model.to_tokens(text)

In [31]:
n_layers = 3
n_heads = 12

In [None]:
# One hook name per layer for the MLP output and for the attention result.
# The original lines used ``in range(...)`` / ``, range(...)`` where a
# ``for layer_idx in range(...)`` comprehension was needed.
mlp_names = [get_act_name("mlp_out", layer_idx) for layer_idx in range(n_layers)]
# NOTE(review): the "result" hook is looked up per layer here; the original
# mentioned ``range(n_heads)`` — confirm whether a per-head split is wanted.
attn_names = [get_act_name("result", layer_idx) for layer_idx in range(n_layers)]

In [None]:
hook_names = ["embed", "pos_embed"] + mlp_names + attn_names

In [32]:
from einops import einsum

In [None]:
# Project every input component through this head's query matrix.
W_Q = model.W_Q[layer_idx, head_idx]
# einops ``einsum`` takes the pattern as its last argument; it was missing.
# NOTE(review): assumes the last axis of ``input_components`` is d_model and
# that ``W_Q`` is (d_model, d_head) — confirm.
decomposed_q = einsum(
    input_components, W_Q,
    "... d_model, d_model d_head -> ... d_head"
)

In [None]:
# Project every input component through this head's key matrix.
W_K = model.W_K[layer_idx, head_idx]
# einops ``einsum`` takes the pattern as its last argument; it was missing.
# NOTE(review): assumes the last axis of ``input_components`` is d_model and
# that ``W_K`` is (d_model, d_head) — confirm.
decomposed_k = einsum(
    input_components, W_K,
    "... d_model, d_model d_head -> ... d_head"
)

In [None]:
# Dot every query component/position with every key component/position.
# The original pattern was an empty string, which einops rejects.
# NOTE(review): assumes ``decomposed_q`` and ``decomposed_k`` are both
# (component, position, d_head) — confirm before relying on the axis order.
decomposed_scores = einsum(
    decomposed_q, decomposed_k,
    "q_comp q_pos d_head, k_comp k_pos d_head -> q_comp k_comp q_pos k_pos"
)

In [None]:
mlp, attn, layer norm, embedding, residual

In [None]:
policy, value, q

In [None]:
def discount_reward(rewards, discount_factor):
    """Scale each reward by ``discount_factor`` raised to its time step.

    The original cell was an unfinished draft (stray character, empty
    ``torch.pow()`` call, no return value). This completion returns the
    per-step discounted rewards; it does not accumulate them into returns.

    Args:
        rewards: Tensor or sequence of rewards; the last axis is treated as
            time. Integer inputs are converted to float.
        discount_factor: Scalar discount applied once per time step.

    Returns:
        Tensor of the same shape as ``rewards`` (at least 1-D) where entry
        ``t`` along the last axis equals ``rewards[t] * discount_factor ** t``.
    """
    rewards = torch.atleast_1d(torch.as_tensor(rewards))
    if not rewards.is_floating_point():
        rewards = rewards.float()

    # Exponents 0, 1, 2, ... along the time axis.
    steps = torch.arange(
        rewards.shape[-1], dtype=rewards.dtype, device=rewards.device
    )
    factors = torch.pow(discount_factor, steps)
    return rewards * factors

In [1]:
def hello():
    """Print the single character ``h`` followed by a newline."""
    greeting = "h"
    print(greeting)

In [4]:
xs = {lambda x: isinstance(x, int): hello()}

h


In [None]:
only run slice_weight if the module doesn't have the attribute parallel_info, and ...