### Engineering

In [1]:
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
import torch.distributed as dist
import torch.distributed.rpc as rpc

In [3]:
WORKER_NAME = "WORKER_{}"

In [None]:
class ParallelContext:
    """Early draft of the RPC bootstrap used for pipeline parallelism."""

    def init_rpc_workers(self, host, port):
        """Register this process as an RPC worker named after its rank.

        Args:
            host: Host name or address of the RPC rendezvous endpoint.
            port: Port of the RPC rendezvous endpoint.
        """
        rank = self.get_local_rank(ParallelMode.GLOBAL)
        ranks_in_a_group = self.get_ranks_in_a_group(ParallelMode.GLOBAL)

        # TensorPipe options are the ones that expose `set_device_map`.
        options = rpc.TensorPipeRpcBackendOptions(
            init_method=f"tcp://{host}:{port}"
        )

        # Device maps only make sense when CUDA devices exist.
        if torch.cuda.is_available():
            for _rank in ranks_in_a_group:
                if _rank == rank:
                    continue
                # NOTE(review): maps local device `rank` to remote device
                # `_rank`; this presumes each process can address one device
                # per rank -- confirm against the launch topology.
                options.set_device_map(
                    WORKER_NAME.format(_rank), {rank: _rank}
                )

        rpc.init_rpc(
            name=WORKER_NAME.format(rank),
            rank=rank,
            # Assumes the GLOBAL group lists every participating rank.
            world_size=len(ranks_in_a_group),
            rpc_backend_options=options,
        )

In [None]:
ptr = (int*)malloc(69)

In [None]:
class ParallelContext:
    """Second draft of the RPC bootstrap used for pipeline parallelism."""

    def init_rpc_workers(self, host, port):
        """Start the RPC agent for this process when pipeline parallelism is on.

        Does nothing when ``self.pipeline_parallel_size`` is 1 or less.

        Args:
            host: Host name or address of the RPC rendezvous endpoint.
            port: Port of the RPC rendezvous endpoint.
        """
        if self.pipeline_parallel_size > 1:
            rank = self.get_global_rank()
            ranks = self.get_ranks_in_group(ParallelMode.GLOBAL)
            # Assumes the GLOBAL group lists every participating rank.
            world_size = len(ranks)

            # TensorPipe options are the ones that expose `set_device_map`.
            options = rpc.TensorPipeRpcBackendOptions(
                init_method=f"tcp://{host}:{port}"
            )

            if torch.cuda.is_available():
                for _rank in ranks:
                    if _rank == rank:
                        continue
                    # NOTE(review): maps local device `rank` to remote device
                    # `_rank` -- confirm against the launch topology.
                    options.set_device_map(
                        WORKER_NAME.format(_rank), {rank: _rank}
                    )

            rpc.init_rpc(
                name=WORKER_NAME.format(rank),
                rank=rank,
                world_size=world_size,
                rpc_backend_options=options,
            )

In [None]:
class Recompute(torch.autograd.Function):
    """Autograd node that re-runs ``function`` on ``input`` during backward.

    The forward pass does no computation: it remembers the function, its input
    and a shared ``recomputed`` container, and passes ``phony`` through so the
    node is linked into the autograd graph.
    """

    @staticmethod
    def forward(ctx, phony, recomputed, function, input):
        """Stash the state needed for recomputation and return ``phony``.

        Args:
            ctx: Autograd context.
            phony: Tensor returned unchanged; its only role is to attach this
                node to the graph.
            recomputed: Mutable container (anything with ``append``) that
                receives the recomputed ``(output, input_leaf)`` pair.
            function: Callable invoked as ``function(input_leaf)`` in backward.
            input: Tensor the function is re-applied to.
        """
        ctx.recomputed = recomputed
        ctx.function = function
        ctx.input = input

        return phony

    @staticmethod
    def backward(ctx, grad_input):
        """Recompute the forward pass and publish it through ``recomputed``.

        Returns:
            Four ``None`` gradients, one per ``forward`` argument.
        """
        function = ctx.function
        # Detach so the recomputed graph starts at a fresh leaf that carries
        # the same requires_grad flag as the original input.
        input_leaf = ctx.input.detach().requires_grad_(
            ctx.input.requires_grad
        )

        with torch.enable_grad():
            output = function(input_leaf)

        # forward() stored the container under `recomputed`; reading
        # `ctx.recompute` raised AttributeError.
        ctx.recomputed.append((output, input_leaf))
        # Nothing has been back-propagated through the fresh leaf yet, so its
        # .grad is still None: this node contributes no gradient of its own.
        return None, None, None, None

In [None]:
m+n-1

In [None]:
step 1: mapping
step 2: extract
step 3: create a placeholder tensor with the mapped value
step 4: send

In [4]:
class Checkpoint(torch.autograd.Function):
    """Autograd node that runs ``function`` without building a graph.

    The backward pass relies on a recomputed ``(output, input_leaf)`` pair
    being available in the shared ``recomputed`` container (in this notebook,
    ``Recompute.backward`` appends it).
    """

    @staticmethod
    def forward(ctx, phony, recomputed, function, input):
        """Run ``function(input)`` under ``torch.no_grad()`` and return it.

        Args:
            ctx: Autograd context.
            phony: Not used by this method.
            recomputed: Mutable container (anything with ``pop``) from which
                backward takes the recomputed ``(output, input_leaf)`` pair.
            function: Callable applied to ``input``.
            input: Tensor fed to ``function``.
        """
        ctx.recomputed = recomputed
        with torch.no_grad():
            output = function(input)
        return output

    @staticmethod
    def backward(ctx, grad_input):
        """Back-propagate through the recomputed graph.

        Args:
            ctx: Autograd context.
            grad_input: Gradient of the loss with respect to this node's
                output (the name is kept from the original draft).

        Returns:
            ``(None, None, None, input_leaf.grad)`` -- only ``input`` receives
            a gradient.
        """
        # forward() stored the container under `recomputed`; reading
        # `ctx.recompute` raised AttributeError.
        output, input_leaf = ctx.recomputed.pop()
        with torch.enable_grad():
            torch.autograd.backward(output, grad_input)

        return None, None, None, input_leaf.grad

In [None]:
def by_row_parallelism(inputs, weights):
    """Simulate a 2-way row-parallel matmul on a single device.

    The weight matrix is split into a top and a bottom block of rows, and the
    last axis of ``inputs`` is split at the matching position. Each half
    produces a partial product, and the partial products are summed (the step
    an all-reduce performs across devices).

    Args:
        inputs: Array/tensor whose last axis has length ``weights.shape[0]``.
        weights: 2-D array/tensor of shape ``(in_features, out_features)``.

    Returns:
        The same values as ``inputs @ weights``.
    """
    inp_per_partition = inputs.shape[-1] // 2
    w_per_partition = weights.shape[0] // 2

    # `...` keeps any leading batch axes intact instead of assuming 2-D input.
    x1, x2 = inputs[..., :inp_per_partition], inputs[..., inp_per_partition:]
    # The second half must also come from `weights` (the draft used an
    # undefined name `w`).
    w1, w2 = weights[:w_per_partition, :], weights[w_per_partition:, :]

    out1 = x1 @ w1
    out2 = x2 @ w2

    return out1 + out2

In [None]:
step 1: logit lens across accumulated residual stream
step 2: logit lens across decomposed residual stream
step 3: logit lens across decomposed attention layer 

In [7]:
from transformer_lens import HookedTransformerConfig, HookedTransformer

In [None]:
cfg = HookedTransformerConfig(**params)

In [None]:
model = HookedTransformer(cfg=cfg)

In [None]:
handles = []

In [None]:
for hook_func in hooks:
    model.ln_f.register_forward_pre_hook(hook_func)

In [None]:
hook_func[1].remove()

In [None]:
# Run the model once on the clean tokens and once on the corrupted tokens.
# Only the second return value (the cache) is kept; the first is discarded.
_, clean_cache = model.run_with_cache(clean_tokens)
_, corrupted_cache = model.run_with_cache(corrupted_tokens)

In [8]:
def patch_head(acts, hook, clean_cache, corrupted_cache, target_head):
    """Hook function that swaps in activations from the corrupted run.

    Args:
        acts: Activation tensor handed to the hook; indexed as
            ``acts[:, :, head]``.
        hook: Hook point object providing ``layer()`` and ``name``.
        clean_cache: Accepted for signature compatibility; not read here.
        corrupted_cache: Mapping from hook name to the corrupted activations.
        target_head: ``(layer_index, head_index)`` pair identifying the head.

    Returns:
        In the target layer: ``acts`` itself, with the target head's slice
        overwritten in place by the corrupted slice. In any other layer: the
        full corrupted activation stored under ``hook.name``.
    """
    target_layer, head = target_head

    is_target_layer = hook.layer() == target_layer
    corrupted_acts = corrupted_cache[hook.name]

    if not is_target_layer:
        return corrupted_acts

    # In-place write: the caller's tensor is modified.
    acts[:, :, head] = corrupted_acts[:, :, head]
    return acts

In [11]:
from itertools import product
from functools import partial

In [10]:
from transformer_lens.utils import get_act_name

In [None]:
# Every (layer, head) pair. `product` needs one iterable per axis;
# `product(range(n_layers, n_heads))` yields 1-tuples that cannot be unpacked
# into (layer_idx, head_idx).
combinations = list(product(range(n_layers), range(n_heads)))
results = torch.zeros(n_layers, n_heads)

for layer_idx, head_idx in combinations:
    # Drop the hook left by the previous iteration so only one head is patched.
    model.reset_hooks()
    hook_name = get_act_name("z", layer_idx)
    hook_func = partial(
        patch_head,
        clean_cache=clean_cache,
        corrupted_cache=corrupted_cache,
        target_head=(layer_idx, head_idx)
    )
    model.add_hook(hook_name, hook_func)

    patched_logits, _ = model.run_with_cache(clean_tokens)
    results[layer_idx, head_idx] = compute_ioi_metric(patched_logits)

# Leave the model unpatched for the cells that follow.
model.reset_hooks()

In [None]:
step 1: convert input tokens to fourier basis
step 2: do trig 
step 3: 

In [None]:
W_pos = model.W_pos

# W_pos[:, k] is a 1-D slice, so the similarity must be taken along dim 0;
# the default dim=1 does not exist for 1-D tensors.
# NOTE(review): this compares columns 0 and 1 of W_pos; if the intent is to
# compare two positions, index rows instead -- confirm the layout of W_pos.
torch.cosine_similarity(W_pos[:, 0], W_pos[:, 1], dim=0)

In [None]:
A@x@W_OV^{0}@W_OV^{1}

In [12]:
from einops import reduce

In [None]:
# Average `induction_stripe` over its `n_heads` axis, leaving one value per
# `seq_len` entry.
# NOTE(review): if a per-head score is intended, the output pattern would be
# "-> n_heads" instead -- confirm.
induction_score = reduce(
    induction_stripe,
    "n_heads seq_len -> seq_len", reduction="mean"
)

In [14]:
d_model = 16
d_head = 4

In [15]:
# Toy value matrix: copies the first `d_head` coordinates of the input.
W_V = torch.zeros(d_head, d_model)
# Sized by d_head rather than a literal 4, so the cell follows the config.
W_V[:, :d_head] = torch.eye(d_head)

In [16]:
W_V

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [18]:
# Toy output matrix: writes the d_head head coordinates into rows 8 onwards.
W_O = torch.zeros(d_model, d_head)
# Sized by d_head rather than literals 4 and 12, so the cell follows the config.
W_O[8:8 + d_head, :] = torch.eye(d_head)

In [19]:
W_O

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])

In [21]:
n_heads = 69

In [22]:
# One (n_heads x n_heads) zero matrix per composition type.
scores = {
    kind: torch.zeros(n_heads, n_heads)
    for kind in ("Q", "K", "V")
}

In [23]:
def compute_composition_score(W_A, W_B):
    """Composition score ``||W_A @ W_B||_F / (||W_A||_F * ||W_B||_F)``.

    Args:
        W_A: Left matrix (tensor).
        W_B: Right matrix (tensor), shape-compatible with ``W_A @ W_B``.

    Returns:
        A 0-dim tensor holding the ratio of Frobenius norms.
    """
    # The Frobenius norm is the square root of the sum of squares. Without the
    # square roots the ratio is the square of the composition score.
    W_AB_norm = (W_A @ W_B).pow(2).sum().sqrt()
    W_A_norm = W_A.pow(2).sum().sqrt()
    W_B_norm = W_B.pow(2).sum().sqrt()

    return W_AB_norm / (W_A_norm * W_B_norm)

In [24]:
from einops import rearrange

In [None]:
# NOTE: these rebind W_O and W_V, replacing the toy matrices built earlier in
# the notebook with the model's own weights.
W_O = model.W_O
W_V = model.W_V

W_Q = model.W_Q
W_K = model.W_K

W_OV = W_V @ W_O
# Swap the last two axes of W_K so that the product is W_Q @ W_K^T. The draft
# line stopped at `W_Q @ rea`, which raised NameError.
W_QK = W_Q @ rearrange(
    W_K, "... d_model d_head -> ... d_head d_model"
)

In [None]:
# Q-composition score between head i (indexed at layer 0) and head j (indexed
# at layer 1). The draft called compute_composition_score() with no arguments.
for i in range(n_heads):
    for j in range(n_heads):
        scores["Q"][i, j] = compute_composition_score(
            W_OV[0, i],
            W_QK[1, j]
        )

step 1: the duplicate-token head detects duplicated tokens and writes that information to the duplicated token's position
step 2: the S-inhibition head moves that information to the end token
step 3:  

In [None]:
step 1: resid @ W_U
step 2: resid = embed + unembed + sum()

In [None]:
# One (n_heads x n_heads) zero matrix per composition type.
scores = {
    kind: torch.zeros(n_heads, n_heads)
    for kind in ("Q", "K", "V")
}

In [25]:
def compute_composition_score(W_A, W_B):
    """Composition score ``||W_A @ W_B||_F / (||W_A||_F * ||W_B||_F)``.

    Args:
        W_A: Left matrix (tensor).
        W_B: Right matrix (tensor), shape-compatible with ``W_A @ W_B``.

    Returns:
        A 0-dim tensor holding the ratio of Frobenius norms.
    """
    # The Frobenius norm is the square root of the sum of squares. Without the
    # square roots the ratio is the square of the composition score.
    W_AB_norm = (W_A @ W_B).pow(2).sum().sqrt()
    W_A_norm = W_A.pow(2).sum().sqrt()
    W_B_norm = W_B.pow(2).sum().sqrt()

    return W_AB_norm / (W_A_norm * W_B_norm)

In [26]:
from einops import rearrange

In [None]:
# Pull the attention weight tensors off the model.
# NOTE: W_O and W_V rebind the names of the toy matrices built earlier in the
# notebook.
W_O = model.W_O
W_V = model.W_V
W_Q = model.W_Q
W_K = model.W_K

# Combined value-output product.
# NOTE(review): presumably batched over leading (layer, head) axes, as the
# later W_OV[0, i] indexing suggests -- confirm the weight layout.
W_OV = W_V @ W_O
# Combined query-key product: the rearrange swaps the last two axes of W_K,
# i.e. this is W_Q @ W_K^T over the trailing two axes.
W_QK = W_Q @ rearrange(
    W_K, "... d_model d_head -> ... d_head d_model"
)

In [None]:
# Fill the three composition-score tables for every pair of
# (head i indexed at 0, head j indexed at 1) along the leading axis.
for i in range(n_heads):
    for j in range(n_heads):
        # Q: first-index-0 OV product against first-index-1 QK product.
        scores["Q"][i, j] = compute_composition_score(W_OV[0, i], W_QK[1, j])
        # K: as Q, but against the transposed QK product.
        scores["K"][i, j] = compute_composition_score(W_OV[0, i], W_QK[1, j].T)
        # V: OV product against OV product.
        scores["V"][i, j] = compute_composition_score(W_OV[0, i], W_OV[1, j])

In [None]:
_, cache = model.run_with_cache(tokens)

In [None]:
# accumulated_resid is provided by the activation cache returned from
# run_with_cache, not by the model object.
accumulated_resid = cache.accumulated_resid(layer=-1, pos_slice=-1)

In [None]:
def compute_logit_diff(resid, model, answer_tokens):
    """Project ``resid`` onto the correct-minus-incorrect unembedding directions.

    Args:
        resid: Tensor that is right-multiplied by the direction matrix.
        model: Object exposing the unembedding matrix as ``model.W_U``.
        answer_tokens: Tensor whose first axis has exactly two entries,
            unpacked as ``(correct_tokens, incorrect_tokens)``.

    Returns:
        ``resid @ (W_U[:, correct_tokens] - W_U[:, incorrect_tokens])``.
    """
    unembed = model.W_U
    correct_tokens, incorrect_tokens = answer_tokens.unbind(dim=0)

    correct_directions = unembed[:, correct_tokens]
    incorrect_directions = unembed[:, incorrect_tokens]
    logit_diff_directions = correct_directions - incorrect_directions

    return resid @ logit_diff_directions

In [None]:
W_E @ W_Q @ W_K @ W_E

In [None]:
MLP(Attn(x @ W_E)) @ W_U

In [None]:
v_2 @ W_E @ W_Q @ W_K @ W_E @ [v_0, v_1]

In [None]:
def by_row_parallelism(inputs, weights):
    """Simulate a 2-way row-parallel matmul on a single device.

    The weight matrix is split into a top and a bottom block of rows, and the
    last axis of ``inputs`` is split at the matching position. Each half
    produces a partial product, and the partial products are summed (the step
    an all-reduce performs across devices).

    Args:
        inputs: Array/tensor whose last axis has length ``weights.shape[0]``.
        weights: 2-D array/tensor of shape ``(in_features, out_features)``.

    Returns:
        The same values as ``inputs @ weights``.
    """
    w_per_partition = weights.shape[0] // 2
    inp_per_partition = inputs.shape[-1] // 2

    # `...` keeps any leading batch axes intact instead of assuming 2-D input.
    inp1, inp2 = inputs[..., :inp_per_partition], inputs[..., inp_per_partition:]
    # Row parallelism partitions the ROWS of the weight matrix. The draft
    # sliced columns, which cannot be multiplied with the halved inputs.
    w1, w2 = weights[:w_per_partition, :], weights[w_per_partition:, :]

    out1 = inp1 @ w1
    out2 = inp2 @ w2

    return out1 + out2

In [27]:
import torch.distributed as dist
import torch.distributed.rpc as rpc

In [None]:
class ParallelContext:
    """RPC bootstrap used for pipeline parallelism."""

    def init_rpc_workers(self, host, port):
        """Start the RPC agent for this process when pipeline parallelism is on.

        Does nothing when ``self.pipeline_parallel_size`` is 1 or less.

        Args:
            host: Host name or address of the RPC rendezvous endpoint.
            port: Port of the RPC rendezvous endpoint.
        """
        if self.pipeline_parallel_size > 1:
            # This method lives on ParallelContext itself, so the rank helpers
            # are called on `self`.
            # NOTE(review): the draft went through `self.parallel_context`,
            # which looks copied from a class that wraps a context -- confirm.
            rank = self.get_global_rank()
            ranks = self.get_ranks_in_group(ParallelMode.GLOBAL)
            world_size = self.get_local_world_size(ParallelMode.GLOBAL)

            options = rpc.TensorPipeRpcBackendOptions(
                init_method=f"tcp://{host}:{port}"
            )

            if torch.cuda.is_available():
                for _rank in ranks:
                    if _rank == rank:
                        continue

                    # NOTE(review): maps local device `rank` to remote device
                    # `_rank` -- confirm against the launch topology.
                    options.set_device_map(
                        WORKER_NAME.format(_rank), {rank: _rank}
                    )

            rpc.init_rpc(
                # format() needs the rank; with no argument it raises
                # IndexError on the "{}" placeholder.
                name=WORKER_NAME.format(rank),
                rank=rank,
                world_size=world_size,
                rpc_backend_options=options
            )

global distributed group, tensor parallel group, pipeline parallel group, data parallel group

In [None]:
min(clock_idx+1, n_partitions)

In [None]:
def by_column_parallelism(inputs, weights):
    """Simulate a 2-way column-parallel matmul on a single device.

    The weight matrix is split into a left and a right block of columns. Each
    block yields a slice of the output features, and the slices are joined
    along the last (feature) axis.

    Args:
        inputs: Tensor whose last axis has length ``weights.shape[0]``.
        weights: 2-D tensor of shape ``(in_features, out_features)``.

    Returns:
        The same values as ``inputs @ weights``.
    """
    per_partition = weights.shape[-1] // 2
    w1, w2 = weights[:, :per_partition], weights[:, per_partition:]
    out1 = inputs @ w1
    out2 = inputs @ w2

    # Each partial output holds a slice of the output features, so they are
    # joined along the last axis. The default dim=0 stacked them as extra rows.
    return torch.cat([out1, out2], dim=-1)

In [None]:
scatter, all-reduce

In [None]:
forward(x) > output = forward(x) -> backward(output)

In [None]:
step 1: determine global rank
step 2: initialize global distributed group
step 3: initialize parallel groups
step 4: set device
step 5: set seed

In [None]:
two uncertainty principles
quantization of action, quantization of angular momentum

In [29]:
from torch.distributed.rpc import RRef

In [30]:
class Agent:
    """Object that keeps an ``RRef`` to itself in its ``id`` attribute."""

    def __init__(self):
        # Wrap this instance in an RRef and store the reference on it.
        self_reference = RRef(self)
        self.id = self_reference

In [None]:
agent_rref.rpc_sync().init() 

In [None]:
neural plasticity, reliable recording, biocompatibility

In [None]:
step 1: record
step 2: 

In [None]:
get_reward(states, actions) + gamma*value_function(states+1).mean()

In [None]:
discover
search space
inductive bias

In [1]:
def compress_gradient(grad, other_tensor_shape):
    """
        Sums a gradient down towards the shape of the tensor it belongs to
        (needed when broadcasting made the gradient larger during reverse mode).

        Parameters:
        - grad: gradient array.
        - other_tensor_shape: shape of the target tensor.

        Returns:
        - the reduced gradient.

        The target shape is not validated against grad's shape.
    """
    target_ndim = len(other_tensor_shape)

    # Sum away the leading axes that the target shape does not have.
    while grad.ndim > target_ndim:
        grad = grad.sum(axis=0)

    # Axes of size 1 in the target are summed but kept as length-1 axes.
    unit_axes = [axis for axis, size in enumerate(other_tensor_shape) if size == 1]
    for axis in unit_axes:
        grad = grad.sum(axis=axis, keepdims=True)

    return grad

In [2]:
import numpy as np

In [3]:
grad = np.array([[1., 2., 3.], [4., 5., 6.]])

In [7]:
compress_gradient(grad, (2,))

array([5., 7., 9.])

In [8]:
compress_gradient(grad, (1,))

array([21.])