### Engineering

In [1]:
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
import torch.distributed as dist
import torch.distributed.rpc as rpc

In [3]:
class DataParallel:
    """Wrap a module so its gradients are averaged across data-parallel ranks.

    Args:
        module: The ``nn.Module`` whose parameters receive gradient hooks.
        parallel_context: Object exposing ``data_parallel_size`` and
            ``get_group(mode)``; it supplies the process group used for the
            gradient all-reduce.
    """

    def __init__(self, module, parallel_context):
        self.module = module
        self.parallel_context = parallel_context

    def parallelize(self):
        """Register gradient hooks when there is more than one data-parallel rank.

        Returns:
            The same module instance that was passed to the constructor.
        """
        module = self.module

        if self.parallel_context.data_parallel_size > 1:
            self._register_hook(module)

        return module

    def _register_hook(self, module):
        """Attach the gradient-averaging hook to every trainable parameter."""
        for p in module.parameters():
            if p.requires_grad:
                # Hand autograd the bound method itself. Calling it here
                # (``self._register_bw_hook()``) would raise TypeError because
                # the hook expects the gradient as its argument.
                p.register_hook(self._register_bw_hook)

    def _register_bw_hook(self, grad):
        """Backward hook: return ``grad`` averaged over the data-parallel group.

        Args:
            grad: Gradient tensor supplied by autograd for one parameter.

        Returns:
            A new tensor holding the sum over ranks of ``grad / data_parallel_size``.
        """
        data_parallel_size = self.parallel_context.data_parallel_size
        process_group = self.parallel_context.get_group(ParallelMode.DATA)

        # Divide first (out of place, so autograd's tensor is not mutated),
        # then sum across ranks: the result is the mean gradient.
        new_grad = grad / data_parallel_size
        dist.all_reduce(new_grad, op=dist.ReduceOp.SUM, group=process_group)

        return new_grad

In [None]:
criteria 1: num_running = num_workers
criteria 2: num_workers < max

In [None]:
jobselector > spawn initial workers > pool watcher

In [None]:
node > pod > container

In [4]:
import socketserver

In [None]:
with socketserver.ThreadingTCPServer(
    (MASTER_HOST, MASTER_PORT),
    EchoRequestHandler
) as server:
    pass

In [None]:
partition2(microbatch1)

In [5]:
from transformer_lens.utils import get_act_name

In [None]:
_, cache = model.run_with_cache(tokens)

In [None]:
# Cache keys for the activations read in the following cells.
# NOTE(review): `post_final_ln_name` is indexed in a later cell
# (`cache[post_final_ln_name]`) but is not assigned in any visible cell, so a
# fresh Restart & Run All would raise NameError there — TODO define it here.
pre_final_ln_name = get_act_name("post", 2)

pre_head20_ln_name = get_act_name("pre", 1)
post_head20_ln_name = get_act_name("normalized", 1, "ln1")

In [None]:
pre_final_ln = cache[pre_final_ln_name][:, :, 0]
post_final_ln = cache[post_final_ln_name][:, :, 0]

pre_head20_ln = cache[pre_head20_ln_name][:, :, 1]
post_head20_ln = cache[post_head20_ln_name][:, :, 1]

In [None]:
final_ln_coefs = fit_ln(pre_final_ln, post_final_ln)
head20_ln_coefs = fit_ln(pre_head20_ln, post_head20_ln)

In [None]:
logit_diff = model.W_U[:, 0] - model.W_U[:, 1]

In [6]:
layer_idx, head_idx = 2, 0

In [None]:
W_OV = model.W_V[layer_idx, head_idx] @ model.W_O[layer_idx, head_idx]

In [None]:
head20_ln_coefs.T @ W_OV @ final_ln_coefs.T @ logit_diff

In [7]:
def get_k(tokens, layer_idx, head_idx):
    """Run the notebook-level ``model`` on ``tokens`` and return one head's cached "k" activation.

    The cache entry named by ``get_act_name("k", layer_idx)`` is sliced with
    ``[:, :, head_idx]`` and returned.
    """
    _logits, activation_cache = model.run_with_cache(tokens)
    layer_keys = activation_cache[get_act_name("k", layer_idx)]
    return layer_keys[:, :, head_idx]

In [8]:
layer_idx, head_idx = 0, 0

In [None]:
all_open_k = get_k(all_open_tokens, layer_idx, head_idx)
all_close_k = get_k(all_close_tokens, layer_idx, head_idx)

In [None]:
all_k_avg = (all_open_k + all_close_k) / 2

In [9]:
def patch_k(acts, hook, new_k, head_idx):
    """Hook function: overwrite one slice of ``acts`` with ``new_k`` and return ``acts``.

    The write happens in place at ``acts[:, :, head_idx]``; ``hook`` is accepted
    for signature compatibility and is not used.
    """
    target = (slice(None), slice(None), head_idx)
    acts[target] = new_k
    return acts

In [10]:
from functools import partial

In [None]:
hook_func = partial(
    patch_k,
    new_k=all_k_avg,
    head_idx=head_idx
)

In [None]:
hook_name = get_act_name("k", layer_idx)

In [None]:
model.add_hook(hook_name, hook_func)

In [None]:
_, patched_cache = model.run_with_cache(all_open_tokens)

In [None]:
pattern = patched_cache["pattern", layer_idx][:, head_idx]

In [None]:
# Scale every vector along the last dimension to unit length. keepdim=True
# keeps the reduced axis so the norms broadcast against W; without it a 2-D W
# either fails to broadcast or divides the wrong axis.
W /= W.norm(dim=-1, keepdim=True)

In [None]:
# Pairwise cosine similarity between rows, assuming W is 2-D (n, d) — TODO confirm.
# Broadcasting (n, 1, d) against (1, n, d) and reducing over the last axis
# gives an (n, n) matrix; cosine_similarity(W, W.T) only pairs elements
# positionally and does not compute this.
similarities = torch.cosine_similarity(W.unsqueeze(1), W.unsqueeze(0), dim=-1)

In [None]:
microbatch n > microbatch n-1 > microbatch n-2 > ...

In [15]:
class Scatter(torch.autograd.Function):
    """Keep this rank's slice of the last dimension; all-gather gradients on the way back.

    Both directions use the default process group.
    """

    @staticmethod
    def forward(ctx, input):
        """Return the ``rank``-th of ``world_size`` equal chunks along the last dim.

        Raises:
            ValueError: If the last dimension is not divisible by the world
                size (``torch.chunk`` would otherwise yield uneven or too few
                chunks, which breaks the all-gather in ``backward``).
        """
        world_size = dist.get_world_size()
        rank = dist.get_rank()

        last_dim = input.size(-1)
        if last_dim % world_size != 0:
            raise ValueError(
                f"last dimension ({last_dim}) must be divisible by "
                f"world size ({world_size})"
            )

        chunks = torch.chunk(
            input,
            chunks=world_size,
            dim=-1
        )
        return chunks[rank]

    @staticmethod
    def backward(ctx, grad_output):
        """Gather every rank's gradient slice and concatenate along the last dim."""
        world_size = dist.get_world_size()
        # Collectives need dense buffers; autograd may hand over an expanded
        # or strided gradient, so make it contiguous first.
        grad_output = grad_output.contiguous()
        grads = [torch.zeros_like(grad_output) for _ in range(world_size)]
        dist.all_gather(grads, grad_output)
        return torch.cat(grads, dim=-1)

In [14]:
class Reduce(torch.autograd.Function):
    """All-reduce a tensor in the forward pass; pass gradients through untouched in backward."""

    @staticmethod
    def forward(ctx, tensor):
        # all_reduce writes its result into `tensor`, so the object handed
        # back is the very tensor the caller supplied.
        dist.all_reduce(tensor)
        return tensor

    @staticmethod
    def backward(ctx, upstream_grad):
        # Identity: the incoming gradient is returned as-is.
        return upstream_grad

In [11]:
class RowParallelLinear(nn.Module):
    """Linear layer whose input features are split across ranks.

    Each rank holds a weight of shape ``(output_size, input_size // world_size)``
    and a bias of shape ``(output_size,)``.

    Args:
        input_size: Total number of input features before splitting.
        output_size: Number of output features.

    Raises:
        ValueError: If ``input_size`` is not divisible by the world size.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        world_size = dist.get_world_size()

        # Fail early: floor division would silently drop features and the
        # mismatch would only surface as a shape error inside forward().
        if input_size % world_size != 0:
            raise ValueError(
                f"input_size ({input_size}) must be divisible by "
                f"world size ({world_size})"
            )
        input_per_partition = input_size // world_size

        self.weight = nn.Parameter(torch.randn(
            output_size, input_per_partition
        ))
        self.bias = nn.Parameter(torch.randn(
            output_size
        ))

    def forward(self, input):
        """Slice the input, apply the local weight, reduce across ranks, add the bias."""
        input_parallel = Scatter.apply(input)
        # Bias is left out of F.linear and added once after the reduction.
        output_parallel = F.linear(input_parallel, self.weight)
        outputs = Reduce.apply(output_parallel)
        return outputs + self.bias

In [None]:
new > ready > running > blocked > terminated

In [16]:
import os

In [None]:
class MPU:
    """Initialise ``torch.distributed`` for this process and select its CUDA device.

    Args:
        master_addr: Host used to build the ``tcp://`` rendezvous URL.
        master_port: Port used to build the ``tcp://`` rendezvous URL.
        backend: Backend name forwarded to ``dist.init_process_group``.

    Raises:
        RuntimeError: If ``RANK`` or ``WORLD_SIZE`` is missing from the environment.
    """

    def __init__(self, master_addr, master_port, backend):
        if not dist.is_initialized():
            try:
                # Environment values are strings; init_process_group and the
                # modulo in set_device both need integers.
                rank = int(os.environ["RANK"])
                world_size = int(os.environ["WORLD_SIZE"])
            except KeyError as exc:
                raise RuntimeError(
                    f"environment variable {exc.args[0]} must be set"
                ) from exc

            init_method = f"tcp://{master_addr}:{master_port}"

            dist.init_process_group(
                backend=backend,
                init_method=init_method,
                rank=rank,
                world_size=world_size
            )

            self.set_device(rank)

    def set_device(self, rank):
        """Bind this process to CUDA device ``rank % device_count``; no-op without GPUs."""
        n_devices = torch.cuda.device_count()

        if n_devices > 0:
            torch.cuda.set_device(rank%n_devices)

In [None]:
dist.broadcast(x, src=0)

In [None]:
data prefetching, memory mapping, lazy loading

In [18]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch.distributed.rpc as rpc

In [None]:
step 1: calculate the gradients
step 2: average
step 3: 

In [None]:
dist.broadcast(x, src=0, async_op=True)

In [None]:
torch.split(x, split_size_or_sections=3, dim=0)

In [None]:
num_gpus_for_each_model = tensor_model_parallel_size * pipeline_model_parallel

In [None]:
torch.clamp_min_(2)

In [None]:
rpc.get_worker_info()

In [None]:
#!/bin/bash

In [19]:
def intervene_resid(resid, hook, scale, feature, position):
    """Hook function: remove ``scale`` times the ``feature`` component at ``position``.

    Args:
        resid: Activation tensor indexed as ``resid[:, position]``; modified in
            place and returned.
        hook: Unused; accepted for hook-signature compatibility.
        scale: Multiplier on the removed component (1 removes it entirely).
        feature: Direction vector; it is normalised on a copy, so the caller's
            tensor is left untouched.
        position: Index (or indices) into the second dimension of ``resid``.

    Returns:
        ``resid`` after the in-place update.
    """
    # Out-of-place normalisation: `feature /= ...` would overwrite the tensor
    # the caller passed in.
    unit_feature = feature / feature.norm(dim=-1, keepdim=True)
    # Scalar coefficient of each selected vector along the unit direction.
    coefficient = resid[:, position] @ unit_feature
    # Subtract the projected *vector* (coefficient times direction), not the
    # bare coefficient, which would not line up with the feature axis.
    resid[:, position] -= scale * coefficient.unsqueeze(-1) * unit_feature
    return resid

In [20]:
import torch
from torch import nn
import torch.nn.functional as F

In [22]:
from collections import OrderedDict


In [23]:
 model = nn.Sequential(OrderedDict([
    ('layer1', nn.Sequential(OrderedDict([
        ('fc', nn.Linear(4, 8)),
        ('relu', nn.ReLU())
    ]))),
    ('layer2', nn.Sequential(OrderedDict([
        ('fc', nn.Linear(8, 4)),
        ('relu', nn.ReLU())
    ]))),
    ('layer3', nn.Sequential(OrderedDict([
        ('fc', nn.Linear(4, 8)),
        ('relu', nn.ReLU())
    ]))),
]))

In [25]:
for 

Sequential(
  (fc): Linear(in_features=4, out_features=8, bias=True)
  (relu): ReLU()
)

In [26]:
class SelfAttention(nn.Module):
    """Scaled dot-product attention over the last two dimensions.

    Args:
        d_head: Head dimension; scores are divided by ``sqrt(d_head)``.
    """

    def __init__(self, d_head):
        super().__init__()
        self.d_head = d_head

    def forward(self, q, k, v):
        """Return ``(output, probs)`` for queries ``q``, keys ``k`` and values ``v``.

        ``probs`` is the softmax of ``q @ k^T / sqrt(d_head)`` over the key
        axis, and ``output`` is ``probs @ v``. Leading batch dimensions are
        handled by ``torch.matmul`` broadcasting.
        """
        # transpose swaps only the last two axes; permute(-1, -2) needs one
        # index per dimension and fails on anything but 2-D input.
        k = k.transpose(-1, -2)
        scores = torch.matmul(q, k) / (self.d_head**0.5)
        # Normalise over keys explicitly; the implicit-dim form is deprecated
        # and picks dim 0 for 3-D input.
        probs = F.softmax(scores, dim=-1)
        output = torch.matmul(probs, v)
        return output, probs

In [27]:
import random
import string

def generate_random_string(length=15):
    """Return ``length`` characters drawn one at a time from ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

random_string = generate_random_string() 
print(random_string)

APvaqEsh2Y76QvE
