### Engineering

In [1]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
sync, data, handshake

In [None]:
class ParameterSharding:
    """Greedily partition optimizer parameter groups across ranks.

    Args:
        params_groups: iterable of dict-like groups; each group has a
            "params" entry whose items expose ``numel()``.
        parallel_context: object exposing ``get_world_size()``.
    """

    def __init__(self, params_groups, parallel_context):
        self.params_groups = params_groups
        self.parallel_context = parallel_context

    def shard(self):
        """Assign every parameter to the rank that currently holds the fewest elements.

        Returns:
            A list with one entry per rank. Each entry is a list containing a
            shallow copy of every original group (same keys), whose "params"
            value holds only the parameters assigned to that rank (it may be
            empty). The input groups are not modified.
        """
        world_size = self.parallel_context.get_world_size()
        params_per_rank = [[] for _ in range(world_size)]
        numel_per_rank = [0] * world_size

        for param_group in self.params_groups:
            # parameters of the current group, bucketed per rank
            params = [[] for _ in range(world_size)]
            for p in param_group["params"]:
                # least-loaded rank so far; ties go to the lowest rank index
                next_rank = numel_per_rank.index(min(numel_per_rank))
                params[next_rank].append(p)
                numel_per_rank[next_rank] += p.numel()

            # give every rank its own copy of the group holding only its share
            for rank, rank_params in enumerate(params):
                params_per_rank[rank].append({**param_group, "params": rank_params})

        return params_per_rank

In [2]:
import torch.distributed as dist

In [None]:
# Maximum over every element of `xs` on this rank. `torch.max` without `dim`
# already returns a 0-dim tensor, so it must not be indexed with [0].
local_max = torch.max(xs)

# `dist.all_reduce` reduces its argument in place and returns None for a
# synchronous call, so reduce a copy and keep the tensor itself.
global_max = local_max.clone()
dist.all_reduce(
    global_max,
    op=dist.ReduceOp.MAX
)

In [None]:
# Subtract `global_max` from `xs`; both names come from earlier cells.
normalized_xs = xs - global_max

In [6]:
from einops import rearrange

In [7]:
class _ParallelCrossEntropy(torch.autograd.Function):
    """Per-token cross entropy over logits whose vocabulary axis is split across ranks.

    Each rank holds a contiguous slice of the vocabulary of width
    ``parallel_logits.shape[-1]``; rank ``r`` owns token ids
    ``[r * width, (r + 1) * width)``.

    NOTE(review): `ParallelMode` and `all_reduce` are used here but are not
    imported in any visible cell -- confirm where they come from. `all_reduce`
    is assumed to return the reduced (summed) tensor.
    NOTE(review): logits are exponentiated without subtracting the global max
    first, so large logits can overflow -- consider the max-shift sketched in
    the earlier cells.
    """

    @staticmethod
    def forward(ctx, parallel_logits, targets, parallel_context):
        """Return ``log(sum_v exp(logit_v)) - logit_target`` with the shape of `targets`.

        Args:
            parallel_logits: local logits slice, rearranged as "b s v".
            targets: integer token ids, rearranged as "b s".
            parallel_context: object exposing ``get_local_rank(mode)``.
        """
        def get_vocab_range(partition_size, rank):
            start_idx = partition_size * rank
            end_idx = start_idx + partition_size
            return start_idx, end_idx

        partition_size = parallel_logits.shape[-1]
        rank = parallel_context.get_local_rank(ParallelMode.TENSOR)
        vocab_start_idx, vocab_end_idx = get_vocab_range(partition_size, rank)

        # True where the target id lives on another rank (end index is exclusive).
        target_mask = (targets < vocab_start_idx) | (targets >= vocab_end_idx)
        # Shift ids into local coordinates; foreign ids get a dummy index 0.
        masked_targets = targets.clone() - vocab_start_idx
        masked_targets[target_mask] = 0

        masked_targets_1d = rearrange(masked_targets, "b s -> (b s)")
        target_mask_1d = rearrange(target_mask, "b s -> (b s)")
        logits_2d = rearrange(parallel_logits, "b s v -> (b s) v")

        row_idx = torch.arange(masked_targets_1d.size(0), device=logits_2d.device)
        # Advanced indexing returns a copy, so zeroing below does not touch the logits.
        predicted_logits = logits_2d[row_idx, masked_targets_1d]
        # Rows whose target is foreign contribute 0 so the sum across ranks
        # recovers exactly the one real target logit.
        predicted_logits[target_mask_1d] = 0.
        predicted_logits = all_reduce(
            predicted_logits,
            parallel_context=parallel_context,
            parallel_mode=ParallelMode.TENSOR
        )
        # Back to the (b, s) layout so it lines up with the summed exponentials.
        predicted_logits = predicted_logits.reshape(targets.shape)

        exp_logits = parallel_logits.exp()
        sum_exp_logits = all_reduce(
            exp_logits.sum(dim=-1),
            parallel_context=parallel_context,
            parallel_mode=ParallelMode.TENSOR
        )

        loss = sum_exp_logits.log() - predicted_logits

        # Local slice of the global softmax, needed for the gradient.
        softmax = exp_logits / sum_exp_logits.unsqueeze(-1)
        ctx.save_for_backward(softmax, target_mask_1d, masked_targets_1d)
        return loss

    @staticmethod
    def backward(ctx, grad_output):
        """Gradient w.r.t. the local logits: ``softmax - one_hot(target)``, scaled by `grad_output`."""
        softmax, target_mask_1d, masked_targets_1d = ctx.saved_tensors

        grad_2d = softmax.reshape(-1, softmax.shape[-1]).clone()
        row_idx = torch.arange(grad_2d.size(0), device=grad_2d.device)
        # Subtract 1 at the target column only on the rank that owns the target.
        grad_2d[row_idx, masked_targets_1d] -= (~target_mask_1d).to(grad_2d.dtype)

        grad_logits = grad_2d.view_as(softmax) * grad_output.unsqueeze(-1)
        # One gradient per forward input: logits, targets, parallel_context.
        return grad_logits, None, None

In [5]:
class ParallelCrossEntropy(nn.Module):
    """Module wrapper around `_ParallelCrossEntropy`.

    Args:
        parallel_context: forwarded unchanged to `_ParallelCrossEntropy.apply`.
    """

    def __init__(self, parallel_context):
        super().__init__()
        self.parallel_context = parallel_context

    def forward(self, logits, targets):
        """Return the loss from `_ParallelCrossEntropy` divided by ``len(targets)``.

        NOTE(review): ``len(targets)`` is the size of the first dimension only;
        the result is not reduced to a scalar here -- confirm that is intended.
        """
        loss = _ParallelCrossEntropy.apply(
            logits, targets, self.parallel_context
        )

        # Out-of-place division: avoids mutating the autograd function's output.
        loss = loss / len(targets)
        return loss

In [8]:
from contextlib import contextmanager

In [9]:
@contextmanager
def use_stream(stream):
    """Context manager that runs its body under `stream` when it is a CUDA stream.

    Anything that is not a `torch.cuda.Stream` instance makes this a no-op
    context: the body runs without any stream being selected.
    """
    is_cuda_stream = isinstance(stream, torch.cuda.Stream)

    if is_cuda_stream:
        with torch.cuda.stream(stream):
            yield
    else:
        yield

In [None]:
# Two CUDA events created with timing enabled.
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

In [None]:
# Record the start marker. The events are named `start_event` / `end_event`
# elsewhere in this notebook; `start` / `end` are never defined.
start_event.record()

# Placeholder for the workload being timed; not defined in this notebook.
hardshit()

end_event.record()

# Wait for all queued GPU work to finish before the events are read.
torch.cuda.synchronize()

In [None]:
# Time between the two recorded events (PyTorch documents the unit as milliseconds).
elapsed_time = start_event.elapsed_time(end_event)

In [None]:
// Three int pointers, declared but not allocated here; the h_ prefix presumably marks host-side buffers.
int *h_a, *h_b, *h_c;

In [None]:
// Size in bytes of an array of n ints; n is not defined in this cell.
size_t bytes = sizeof(int)*n;

In [None]:
// Allocate `bytes` bytes for each buffer (statements need a terminating ';').
h_a = (int*)malloc(bytes);
h_b = (int*)malloc(bytes);
h_c = (int*)malloc(bytes);

In [None]:
elasticdriver, torchstate, 3notifications, hostdiscovery

In [11]:
class Copy(torch.autograd.Function):
    """Copy a tensor to the device of `next_stream`; gradients are copied back to the device of `prev_stream`."""

    @staticmethod
    def forward(ctx, prev_stream, next_stream, input):
        """Move `input` to ``next_stream.device`` with both streams selected.

        Args:
            prev_stream: CUDA stream on the source side.
            next_stream: CUDA stream on the destination side.
            input: tensor to copy.

        Returns:
            The copy of `input` on ``next_stream.device``.
        """
        ctx.prev_stream = prev_stream
        ctx.next_stream = next_stream
        # Stream on the destination device on which the copy is expected to be
        # consumed -- presumably the default stream; confirm against callers.
        compute_stream = torch.cuda.default_stream(
            next_stream.device
        )

        with torch.cuda.stream(prev_stream), torch.cuda.stream(next_stream):
            moved_input = input.to(next_stream.device)

            # Tell the caching allocator `input` is in use on `prev_stream`.
            input.record_stream(prev_stream)
            # The copy is created while `next_stream` is selected, so recording
            # it there adds nothing; record the stream that will consume it.
            moved_input.record_stream(compute_stream)

        return moved_input

    @staticmethod
    def backward(ctx, grad_input):
        """Move the incoming gradient to ``prev_stream.device``.

        Returns:
            One entry per `forward` argument: ``None`` for the two streams and
            the moved gradient for `input`.
        """
        compute_stream = torch.cuda.default_stream(ctx.prev_stream.device)

        with torch.cuda.stream(ctx.prev_stream), torch.cuda.stream(ctx.next_stream):
            moved_grad = grad_input.to(ctx.prev_stream.device)

            # Incoming gradient is in use on `next_stream`.
            grad_input.record_stream(ctx.next_stream)
            # The moved gradient will be consumed on the source device's compute stream.
            moved_grad.record_stream(compute_stream)

        # autograd requires as many gradients as forward had inputs
        # (prev_stream, next_stream, input); streams are not differentiable.
        return None, None, moved_grad

In [None]:
message passing, file system, shared memory

In [None]:
A @ W_E @ W_Q @ W_K @ W_E @ B

In [None]:
step 1: num b

In [None]:
# Register every function in `hooks` as a forward pre-hook on `model.ln_f`,
# keeping the returned handles so the hooks can be removed later.
handles = []
for hook_func in hooks:
    handles.append(model.ln_f.register_forward_pre_hook(hook_func))

In [None]:
# Remove only the hook whose handle is at index 1; the other handles are untouched.
handles[1].remove()

In [12]:
def compute_score(pattern, target_pattern):
    """Share of `pattern`'s total sum that falls where `target_pattern` weights it.

    Computes ``sum(pattern * target_pattern) / sum(pattern)`` over all elements.
    """
    overlap = (pattern * target_pattern).sum()
    total = pattern.sum()
    return overlap / total

In [None]:
# Tokenize `text`, then run the model keeping the second returned value as
# `cache`; the first returned value is discarded.
tokens = model.to_tokens(text)
_, cache = model.run_with_cache(tokens)

In [None]:
# One score per (layer, head).
data = torch.zeros(n_layers, n_heads)

for layer_idx in range(n_layers):
    # The hook name and the cached tensor depend only on the layer, so look
    # them up once per layer instead of once per head.
    hook_name = f"blocks.{layer_idx}.attn.hook_pattern"
    layer_patterns = cache[hook_name]

    for head_idx in range(n_heads):
        # Index 1 of the cached tensor selects the head.
        pattern = layer_patterns[:, head_idx]
        data[layer_idx, head_idx] = compute_score(
            pattern, target_pattern
        )

In [None]:
v_2 @ W_E @ W_Q @ W_K @ W_E @ [v0, v_1]

In [None]:
step 1:  logit0 - logit1
step 2: logit0 = final_resid @ W_U[0], logit1 = final_resid @ W_U[1]
step 3: logit0 - logit1 = final_resid @ (W_U[0] - W_U[1])

In [None]:
# Slice of `model.W_in` for one layer and one neuron: every entry along the middle axis.
input_direction = model.W_in[layer_idx, :, neuron_idx]

In [None]:
softmax(x @ W_Q @ W_K.T @ x) @ x @ W_V @ W_O

In [None]:
# Pull out the weights used below: the model-wide W_E and W_U, plus W_V / W_O
# for the single (layer_idx, head_idx) head.
W_E = model.W_E
W_V = model.W_V[layer_idx, head_idx]
W_O = model.W_O[layer_idx, head_idx]
W_U = model.W_U

In [None]:
# Chain of matrix products W_E -> W_V -> W_O -> W_U.
# NOTE(review): presumably a (vocab x vocab) matrix, which can be very large -- confirm before materializing.
full_OV_circuit = W_E @ W_V @ W_O @ W_U

In [None]:
# Query/key weights are taken from layer 1, head 4; output/value weights from
# layer 0, head 7. This rebinds W_E / W_O / W_V from the earlier cell.
W_E = model.W_E
W_Q = model.W_Q[1, 4]
W_K = model.W_K[1, 4]
W_O = model.W_O[0, 7]
W_V = model.W_V[0, 7]

In [None]:
# Query side: embeddings straight into W_Q.
Q = W_E @ W_Q
# Key side: embeddings first pass through the other head's W_V @ W_O, then into W_K.
K = W_E @ W_V @ W_O @ W_K

In [None]:
# Layer index read as a global by later cells (e.g. compute_consine_similarity).
layer_idx = 3

In [None]:
# Run the model on `past_moves`, keeping only the second returned value; rebinds the global `cache`.
_, cache = model.run_with_cache(past_moves)

In [None]:
# Indices of the 10 entries with the largest standard deviation, taken over
# dims 0 and 1 of the cached "post" tensor for `layer_idx`.
top_neurons = cache["post", layer_idx].std(dim=[0, 1]).argsort(descending=True)[:10]

In [13]:
from einops import einsum

In [14]:
def compute_consine_similarity(neuron_idx, feature):
    """Cosine similarity between one neuron's output weights and `feature`.

    Reads the globals `model` and `layer_idx`; the weight vector is
    ``model.W_out[layer_idx, neuron_idx]``.

    Args:
        neuron_idx: index of the neuron within the layer.
        feature: tensor whose last dimension matches the weight vector.

    Returns:
        The dot product of the two unit-normalized vectors, reduced over the
        last dimension (a 0-dim tensor when `feature` is 1-D).
    """
    W_out = model.W_out[layer_idx, neuron_idx]

    # Normalize out of place: `/=` would overwrite the caller's `feature`
    # tensor and the tensor indexed out of `model.W_out`.
    W_out_unit = W_out / W_out.norm(dim=-1, keepdim=True)
    feature_unit = feature / feature.norm(dim=-1, keepdim=True)

    return (W_out_unit * feature_unit).sum(dim=-1)

In [None]:
# One similarity value per entry of `top_neurons`, each computed against `blank_dir`.
heatmap_blank = []
for neuron_idx in top_neurons:
    heatmap_blank.append(compute_consine_similarity(
        neuron_idx,
        feature=blank_dir
    ))

In [None]:
# Run the model on `board_history`, keeping only the second returned value; rebinds the global `cache`.
_, cache = model.run_with_cache(board_history)

In [15]:
# Pick a single neuron to inspect: layer 5, neuron 1393 (rebinds both globals).
layer_idx, neuron_idx = 5, 1393

In [None]:
# Values at `neuron_idx` along the last axis of the cached tensor.
# NOTE(review): `hook_name` is whatever an earlier cell left in scope -- confirm it names the intended hook.
mlp_neurons = cache[hook_name][:, :, neuron_idx]
# 99th percentile of those values.
threshold = mlp_neurons.quantile(0.99)
# Boolean mask of entries above the threshold; despite the name this rebinds
# `top_neurons`, which earlier held neuron indices.
top_neurons = mlp_neurons > threshold

In [None]:
# Among the entries selected by the `top_neurons` mask, the mean over the last
# axis of "board_states equals 2". NOTE(review): the meaning of the value 2 is not visible here.
(board_states == 2)[top_neurons].float().mean(dim=-1)

In [None]:
step 1: accumul
step 2: calculate the logit difference direction
step 3: proj

In [None]:
step 1: duplication heads detect duplicated tokens, write the information to S
step 2: s-inhibition heads move that information to END
step 3: name mover heads based on 

In [None]:
# Tokenize `names` with the model's tokenizer helper.
name_tokens = model.to_tokens(names)

In [None]:
# Shorthand handles to individual model components.
embed = model.embed
mlp0 = model.blocks[0].mlp
# NOTE(review): this is block 0's `ln1`, and a later cell feeds its output to
# `mlp0` -- confirm `ln1` (rather than another norm of the block) is intended.
ln1 = model.blocks[0].ln1
unembed = model.unembed

In [None]:
# Embed the name tokens, then add the output of block 0's MLP applied to the
# normalized embeddings (attention is skipped entirely).
text_embeddings = embed(name_tokens)
resid_after_mlp0 = text_embeddings + mlp0(ln1(text_embeddings))

In [16]:
from transformer_lens.utils import get_act_name

In [None]:
def resid2logits(resid):
    """Apply the global `model`'s `ln_final`, then its `unembed`, to `resid`."""
    normalized = model.ln_final(resid)
    logits = model.unembed(normalized)
    return logits

In [None]:
# Fraction of positions whose own token is among the head's top-5 predictions,
# stored per (layer, head) instead of being overwritten on every iteration.
copying_scores = torch.zeros(n_layers, n_heads)

for layer_idx in range(n_layers):
    for head_idx in range(n_heads):
        W_OV = model.W_V[layer_idx, head_idx] @ \
            model.W_O[layer_idx, head_idx]

        resid = resid_after_mlp0 @ W_OV
        logits = resid2logits(resid)
        # `topk` returns (values, indices); token ids are the indices.
        top_predictions = torch.topk(logits, k=5, dim=-1).indices
        # assumes `name_tokens` matches the leading dims of `logits` -- TODO confirm
        is_copied = (top_predictions == name_tokens.unsqueeze(-1)).any(dim=-1)
        # bool tensors have no mean; convert first.
        percentage = is_copied.float().mean()
        copying_scores[layer_idx, head_idx] = percentage

In [None]:
register > cache > ram > hard drive > external

In [None]:
# Rank index one below `local_rank`, wrapping around to `local_world_size - 1` when `local_rank` is 0.
(local_rank - 1) % local_world_size

In [None]:
step 1: acts
step 2: split
step 3: local self attention
step 4:

In [None]:
step 1: wait inp
step 2: get inp
step 3: construct
step 4: put
step 5: wait oup
step 6: get
step 7: put

In [None]:

step 1: initialize global distributed group
step 2: initialize parallel groups
step 3: set device
step 4: set seed

In [17]:
import threading

In [18]:
# Shared event object; `run` (defined in the next cell) waits on it.
event = threading.Event()

In [19]:
def run():
    """Print "wait", block until the global `event` is set, then print "received"."""
    print("wait")
    event.wait()
    print("received")

In [None]:
# Create (but do not start) a thread whose target is `run`.
thread = threading.Thread(target=run)

In [None]:
# Start the thread. NOTE(review): no visible cell calls `event.set()`, so `run` would wait indefinitely.
thread.start()

In [None]:
class ParameterSharding:
    """Greedily partition optimizer parameter groups across ranks.

    Args:
        param_groups: iterable of dict-like groups; each group has a "params"
            entry whose items expose ``numel()``.
        parallel_context: object exposing ``get_world_size()``.
    """

    def __init__(self, param_groups, parallel_context):
        self.param_groups = param_groups
        self.parallel_context = parallel_context

    def shard(self):
        """Assign every parameter to the rank that currently holds the fewest elements.

        Returns:
            A list with one entry per rank. Each entry is a list containing a
            shallow copy of every original group (same keys), whose "params"
            value holds only the parameters assigned to that rank (it may be
            empty). The input groups are not modified.
        """
        world_size = self.parallel_context.get_world_size()
        params_per_rank = [[] for _ in range(world_size)]
        numel_per_rank = [0] * world_size

        for param_group in self.param_groups:
            # partitioned params of the current group
            param_lists = [[] for _ in range(world_size)]

            for p in param_group["params"]:
                # least-loaded rank so far; ties go to the lowest rank index
                next_rank = numel_per_rank.index(min(numel_per_rank))
                param_lists[next_rank].append(p)
                numel_per_rank[next_rank] += p.numel()

            for rank, params in enumerate(param_lists):
                # shallow copy so the caller's group keeps its full "params" list
                param_g = {**param_group, "params": params}
                params_per_rank[rank].append(param_g)

        return params_per_rank

In [20]:
def convert_to_distribution(states):
    """Turn the q-values of `states` into a probability distribution via softmax.

    Calls the global `q_function` on `states` and normalizes over the last
    dimension, so each row of a batched result sums to 1.

    The denominator must be the sum of the exponentials, not of the raw
    q-values; `torch.softmax` also avoids overflow for large values.
    """
    q_values = q_function(states)
    return torch.softmax(q_values, dim=-1)

In [None]:
record, send, memory, process

In [None]:
# NOTE(review): this cell is unfinished -- the assignment to `states` has no
# right-hand side, so the cell does not parse. TODO: complete the episode loop.
for _ in range(N_EPISODES):
    state, _ = env.reset()
    done = False
    
    while not done:
        states = 

In [21]:
import torch

In [25]:
# Query the CUDA memory allocator's current usage for device cuda:0 (the saved output below is 0).
torch.cuda.memory_allocated(device=torch.device("cuda:0"))

0