In [None]:
friction, normal, gravity, air resistance, applied force

normal, friction, air resistance, applied force, gravity

### Engineering

In [1]:
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
class Copy(torch.autograd.Function):
    """Autograd-aware copy of a tensor between the devices of two CUDA streams.

    The forward pass moves ``input`` to ``next_stream.device``; the backward
    pass moves the incoming gradient back to ``prev_stream.device``.
    """

    @staticmethod
    def forward(ctx, prev_stream, next_stream, input):
        """Copy ``input`` to the device of ``next_stream``.

        Args:
            ctx: autograd context; both streams are stored on it for backward.
            prev_stream: CUDA stream associated with the source side of the copy.
            next_stream: CUDA stream associated with the destination side.
            input: tensor to move.

        Returns:
            The result of ``input.to(next_stream.device)``.
        """
        ctx.prev_stream = prev_stream
        ctx.next_stream = next_stream

        compute_stream = torch.cuda.default_stream(next_stream.device)

        # ``torch.cuda`` has no ``use_stream``; ``torch.cuda.stream`` is the
        # context manager that makes a stream current.
        with torch.cuda.stream(prev_stream), torch.cuda.stream(next_stream):
            moved_input = input.to(next_stream.device)

            # Mark each tensor as in use on the stream that touches it
            # (see Tensor.record_stream in the PyTorch docs).
            input.record_stream(prev_stream)
            moved_input.record_stream(compute_stream)

        return moved_input

    @staticmethod
    def backward(ctx, grad_input):
        """Copy the incoming gradient back to the device of ``prev_stream``.

        Args:
            ctx: autograd context populated by ``forward``.
            grad_input: gradient flowing back from the output of ``forward``.

        Returns:
            ``(None, None, moved_grad)``: no gradient for the two stream
            arguments, and the gradient moved to ``prev_stream.device`` for
            the tensor argument.
        """
        prev_stream = ctx.prev_stream
        next_stream = ctx.next_stream

        compute_stream = torch.cuda.default_stream(prev_stream.device)

        with torch.cuda.stream(prev_stream), torch.cuda.stream(next_stream):
            moved_grad_input = grad_input.to(prev_stream.device)

            grad_input.record_stream(next_stream)
            moved_grad_input.record_stream(compute_stream)

        return None, None, moved_grad_input

In [None]:
elasticdriver
notificationservice
notificationmanager

In [3]:
import os

In [4]:
class MPU:
    """Initialises ``torch.distributed`` for this process and selects a CUDA device."""

    def __init__(self, master_addr, master_port, backend):
        """Set up the default process group if it is not already initialised.

        Args:
            master_addr: value exported as the ``MASTER_ADDR`` environment variable.
            master_port: value exported as the ``MASTER_PORT`` environment variable.
            backend: backend name passed to ``torch.distributed.init_process_group``.

        Raises:
            KeyError: if ``RANK`` or ``WORLD_SIZE`` is missing from the environment.
            ValueError: if either of them is not an integer string.
        """
        if not torch.distributed.is_initialized():
            # ``os.getenv`` is a function and cannot be subscripted; read the
            # mapping instead. Environment values are strings, so convert them
            # before using them as integers below.
            rank = int(os.environ["RANK"])
            world_size = int(os.environ["WORLD_SIZE"])

            os.environ["MASTER_ADDR"] = str(master_addr)
            os.environ["MASTER_PORT"] = str(master_port)

            torch.distributed.init_process_group(
                rank=rank,
                world_size=world_size,
                backend=backend
            )

            n_devices = torch.cuda.device_count()
            if n_devices > 0:
                # Wrap the rank around the number of visible devices.
                torch.cuda.set_device(rank % n_devices)

In [None]:
step 1: b

In [None]:
class Pipeline:
    """Feeds batches through a sequence of partitions following a schedule.

    ``Task``, ``spawn_workers`` and ``DetermisticScheduler`` are not defined in
    this cell; they are expected to exist in the notebook namespace by the
    time ``fit``/``compute`` run.
    """

    def __init__(self, batches, partritions, devices, scheduler=None):
        """Store the pipeline inputs.

        Args:
            batches: indexable, mutable collection of batches; entries are
                overwritten with partition outputs during ``compute``.
            partritions: indexable collection of callables applied to batches.
            devices: passed to ``spawn_workers`` in ``fit``.
            scheduler: object exposing ``generate(n_batches, n_partritions)``.
                When omitted, a ``DetermisticScheduler()`` is created per
                instance (rather than once at definition time, which shared
                one scheduler across all pipelines).
        """
        self.batches = batches
        self.partritions = partritions
        self.devices = devices
        self.scheduler = DetermisticScheduler() if scheduler is None else scheduler

    def fit(self):
        """Run every schedule produced by the scheduler through the workers."""
        n_batches = len(self.batches)
        n_partritions = len(self.partritions)

        # The attribute is ``devices``; reading ``self.device`` raised AttributeError.
        with spawn_workers(self.devices) as (in_queues, out_queues):
            for schedule in self.scheduler.generate(n_batches, n_partritions):
                self.compute(schedule, in_queues, out_queues)

    def compute(self, schedule, in_queues, out_queues):
        """Submit one schedule's tasks, then collect their outputs.

        Args:
            schedule: iterable of ``(batch_idx, partrition_idx)`` pairs.
            in_queues: per-partition queues that receive ``Task`` objects.
            out_queues: per-partition queues whose items expose ``.output``.
        """
        batches = self.batches
        partritions = self.partritions

        def make_compute(batch, partrition):
            # Bind the current batch/partition so the task does not see
            # whatever the loop variables hold later.
            def wrapper():
                return partrition(batch)
            return wrapper

        for batch_idx, partrition_idx in schedule:
            task = Task(
                compute=make_compute(batches[batch_idx], partritions[partrition_idx])
            )
            in_queues[partrition_idx].put(task)

        for batch_idx, partrition_idx in schedule:
            result = out_queues[partrition_idx].get()
            batches[batch_idx] = result.output

In [None]:
step 1: wait for data transfer
step 2: get input
step 3: construct a task
step 4: put the task into in_queues
step 5: wait and get the output
step 6: put the output

In [None]:
fence: backward dependency, data transfer
compute: 

In [None]:
step 1: partition
step 2: split a mini-batch into micro-batches
step 3: cuda stream
step 4: pipe
step 5: collect results

In [None]:
clock cycle 1: f(0, 0)
clock cycle 2: f(1, 0), f(0, 1)
clock cycle 3: f(2, 0), f(1, 1)
clock cycle 4: f(2, 1)

### Mech

In [None]:
key=persis

In [5]:
from transformer_lens.utils import get_act_name

In [6]:
hook_input = get_act_name("resid_post", 2)

In [7]:
hook_output = get_act_name("normalized")

In [8]:
hook_output

'ln_final.hook_normalized'

In [None]:
_, cache = model.run_with_cache(
    tokens,
    names_filter=lambda x: x in [hook_input, hook_output]
)

In [None]:
from einops import re

embed + pos_embed + attn00 + attn01 + mlp0 + 

attn10 + attn11 + mlp1

In [None]:
W_U = model.W_U
logit_diffence_direction = W_U[:, 0] - W_U[:, 1]

In [None]:
head_names = [get_act_name("result", layer_idx) for layer_idx in range(n_layers)]
mlp_names = [get_act_name("mlp_out", layer_idx) for layer_idx in range(n_layers)]

In [None]:
hook_names = ["hook_embed", "hook_pos_embed"]

In [None]:
# extend, not append: append would insert each list as a single nested element,
# so a later `name in hook_names` check would never match a hook-name string.
hook_names.extend(head_names)
hook_names.extend(mlp_names)

In [None]:
_, cache = model.run_with_cache(
    tokens,
    names_filter=lambda x: x in hook_names
)

In [None]:
weight x input = output

input = weight.T @ output

In [None]:
pre_ln = fn_ln_coefs.T @ fn_ln_output

In [None]:
# torch.tensor() cannot build a tensor from a list holding a multi-element
# tensor; torch.stack adds the intended leading "component" axis instead.
input_components = torch.stack([cache["embed"] + cache["pos_embed"]])

In [None]:
# Accumulate every layer's components; previously each iteration overwrote
# input_components, discarding the embedding term and all earlier layers.
# NOTE(review): torch.cat needs matching shapes outside dim 0 — confirm the
# cached head/MLP activations are laid out compatibly with input_components.
for head_name, mlp_name in zip(head_names, mlp_names):
    input_components = torch.cat([
        input_components,
        cache[head_name],
        cache[mlp_name]
    ])

In [9]:
from einops import einsum

In [None]:
contributions = einsum(
    input_components,
    pre_lnc
)

In [None]:
weight @ input = 

### Engineering

In [None]:
main worker > worker thread > task > cuda stream

In [None]:
step 1: fp32, fp16 of the weight
step 2: do forward and backward using fp16
step 3: cast 
ipda

In [None]:
def profile_times(model, batch):
    records = []
    
    for layer in model:
        

In [10]:
import socketserver

In [11]:
def find_port(server_factory, start_port=1024, end_port=65536):
    """Create a server on the first port in the range that accepts a bind.

    Args:
        server_factory: callable taking an ``(host, port)`` address tuple and
            returning a server object; it is expected to raise ``OSError``
            when the port cannot be used.
        start_port: first port to try (inclusive).
        end_port: port at which to stop (exclusive).

    Returns:
        ``(server, port)`` for the first port the factory accepts, or ``None``
        if every port in the range was rejected.
    """
    for port in range(start_port, end_port):
        addr = ("", port)
        try:
            # The original passed the undefined name ``add``; the resulting
            # NameError was swallowed by a broad ``except``, so no port was
            # ever found. Only bind failures are skipped now.
            server = server_factory(addr)
        except OSError:
            continue
        return server, port

    return None

In [None]:
class RowParallelLinear(nn.Module):
    """Linear layer whose input features are split across distributed ranks.

    Each rank holds a weight of shape ``(output_size, input_size // world_size)``,
    multiplies it with its own slice of the input's last dimension, and the
    partial outputs are combined with ``all_reduce``.
    """

    def __init__(self, input_size, output_size):
        """Create this rank's weight shard.

        Args:
            input_size: total number of input features across all ranks.
            output_size: number of output features.

        Raises:
            ValueError: if ``input_size`` is not divisible by the world size.
        """
        super().__init__()
        world_size = torch.distributed.get_world_size()
        if input_size % world_size != 0:
            raise ValueError(
                f"input_size ({input_size}) must be divisible by "
                f"world size ({world_size})"
            )
        self.input_size_per_partrition = input_size // world_size
        self.output_size = output_size

        self.weight = nn.Parameter(torch.randn(
            self.output_size,
            self.input_size_per_partrition
        ))

    def forward(self, input):
        """Apply this rank's shard and all-reduce the partial outputs.

        Args:
            input: tensor whose last dimension holds the full ``input_size``
                features.

        Returns:
            The output tensor after ``torch.distributed.all_reduce``.
        """
        rank = torch.distributed.get_rank()
        # Split the feature (last) dimension into per-rank chunks. The chunk
        # width is the per-partition size, not ``shape[-1] // rank`` (which
        # divided by zero on rank 0), and the split must not run over dim 0.
        input_chunks = torch.split(input, self.input_size_per_partrition, dim=-1)

        input_parallel = input_chunks[rank]
        output_parallel = F.linear(input_parallel, self.weight)

        torch.distributed.all_reduce(output_parallel)

        return output_parallel

In [12]:
tensor_model_parallel_size = 2
num_tensor_model_parallel_groups = 8

In [13]:
for i in range(num_tensor_model_parallel_groups):
    ranks = list(range(
        i*tensor_model_parallel_size,
        (i+1)*tensor_model_parallel_size
    ))
    
    print(ranks)

[0, 1]
[2, 3]
[4, 5]
[6, 7]
[8, 9]
[10, 11]
[12, 13]
[14, 15]


In [None]:
step 1: scale the loss using a scaling factor
step 2: compute the grad with respect to the scaled loss
step 3: unscale the grad using the scaling factor
step 4: update params with respect to the unscaled grad

### MechInterp

In [None]:
step 1: P(0) = sigmoid(logit0 - logit1)
step 2: logit0 = final_ln @ unembed
step 3: 

In [None]:
step 1: ln1 = ln1(resid_pre)
step 2: attn_outputs = head1(ln1) + head2(ln1) + 
step 3: mid_resid = resid_pre + attn_outputs
step 4: ln2 = ln2(mid_resid)
step 5: mlp = mlp(ln2)
step 6: pos_resid = mid_resid + mlp

In [None]:
MLP(Attn(x@W_E))@W_U

In [None]:
cache["blocks.0.attn.hook_pattern"]

In [None]:
step 1: approximate
step 2: logit diff
step 3: decompose logit
step 4: project 

### MLE

In [14]:
from functools import lru_cache

In [15]:
{x for x in "hello"}

{'e', 'h', 'l', 'o'}

In [None]:
class Wizard(User):
    def __init__(self):
        super().__init__(self, email)
        sel.fname = 

In [20]:
from typing import List, Union

In [None]:
x: List[Union[int, float, str]]

In [None]:
step 1: approximate 

In [21]:
from transformer_lens import FactoredMatrix

In [None]:
AB = FactoredMatrix(AB)

In [None]:
AB.eigenvalues

In [None]:
weight @ input
weight-1

In [None]:
softmax(x @ W_Q @ W_K.T @ x.T) @ x @ W_V @ W_O

In [22]:
from itertools import product

In [None]:
# product() needs iterables; n_layers is used as an int count elsewhere
# (range(n_layers)), and n_heads is presumably a count as well — TODO confirm.
combinations = product(range(n_heads), range(n_layers))

In [None]:
step 1: logit diff
step 2: decompose logit
step 3: compute logit difference direction
step 4: project the output of 

In [None]:
step 1: record all the intermediate activations
step 2: check the attention pattern of heads
step 3: spot induction head, if yes, dig further
step 4: decompose the query and vector
step 5: compute the contribution of each pair
step 6: identify the pair that 
step 7: construct the full circuit

In [None]:
step 1: approximate layer norm
step 2: reverse transformation

In [None]:
step 1: decompose the logits
step 2: compute logit difference direction
step 3: approximate
step 4: reverse transformation
step 5: project

In [None]:
x@W_OV

In [None]:
clean_tokens = model.to_tokens(clean_prompt)
# Tokenise the corrupted prompt; the original passed `corrupted_tokens` to its
# own definition. NOTE(review): assumes the prompt variable is named
# `corrupted_prompt`, mirroring `clean_prompt` — confirm.
corrupted_tokens = model.to_tokens(corrupted_prompt)

In [None]:
correct_token = model.to_single_token(" John")
incorrect_token = model.to_single_token(" Mary")

In [None]:
# len() of a tensor is the size of its first dimension; to_tokens returns a
# [batch, position] tensor, so the sequence length is the last dimension.
seq_len = clean_tokens.shape[-1]

In [None]:
results = torch.zeros(n_layers, seq_len)

In [23]:
def patch_activation(activations, hook, position_idx, clean_cache=None):
    """Forward hook that overwrites one position with its clean-run activation.

    Args:
        activations: activation tensor passed to the hook; indexed as
            ``[:, position_idx, :]`` and modified in place.
        hook: hook point object; its ``name`` selects the entry in the cache.
        position_idx: index along the second dimension to patch.
        clean_cache: mapping from hook name to clean activations. Defaults to
            the notebook-level ``clean_activations``.

    Returns:
        ``activations`` with the selected position replaced. The full tensor
        is returned (the original returned only the clean slice and ignored
        the incoming activations).
    """
    if clean_cache is None:
        clean_cache = clean_activations

    activations[:, position_idx, :] = clean_cache[hook.name][:, position_idx, :]
    return activations

In [24]:
from functools import partial

In [None]:
def compute_logit_difference(logits, correct_token, incorrect_token):
    """Mean final-position logit gap between the correct and incorrect token.

    Args:
        logits: tensor indexed as ``[batch, position, vocab]``.
        correct_token: vocabulary index of the correct answer.
        incorrect_token: vocabulary index of the incorrect answer.

    Returns:
        A 0-dim tensor: ``logit[correct] - logit[incorrect]`` at the last
        position, averaged over the batch.
    """
    final_logits = logits[:, -1, :]
    # Index the vocabulary (last) dimension; indexing with the token id
    # directly selected a batch row instead.
    difference = final_logits[:, correct_token] - final_logits[:, incorrect_token]
    return difference.mean()

In [None]:
# Patch the clean residual stream into the corrupted run, one (layer, position)
# at a time, and record the resulting logit difference.
for layer_idx in range(n_layers):
    hook_name = get_act_name("resid_pre", layer_idx)
    for position_idx in range(seq_len):
        hook_func = partial(patch_activation, position_idx=position_idx)
        # run_with_hooks is the HookedTransformer entry point that takes
        # fwd_hooks and returns the model output; run_with_cache returns a
        # (logits, cache) pair instead.
        patched_logits = model.run_with_hooks(
            corrupted_tokens,
            fwd_hooks=[(hook_name, hook_func)]
        )
        logit_difference = compute_logit_difference(
            patched_logits,
            correct_token,
            incorrect_token
        )
        results[layer_idx, position_idx] = logit_difference

In [None]:
head_names

In [None]:
_, cache = model.run_with_cache(
    tokens,
    names_filter=lambda x: x in hook_names
)

In [None]:
cache.accu

In [None]:
step 1: approximate layer norm
step 2: reverse transformation
step 3: logit difference direction
step 4: 

In [None]:
n_features = 5

In [None]:
interference[torch.arange(n_features), torch.arange(n_features)] = 0.

In [None]:
polysemanticity = interference.pow(2).sum(dim=-1)