### Engineering

In [1]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
new > ready > running > blocked > terminated

In [None]:
next_stream.record_event()

In [None]:
scheduler, controller manager, etcd, api server

In [None]:
node > pod > container

In [3]:
class VocabParallelEmbedding(nn.Module):
    """Embedding table sharded along the vocabulary dimension across ranks.

    Every rank holds a contiguous slice of ``num_embedding // world_size``
    rows. In ``forward`` each rank looks up only the tokens that fall inside
    its own slice, contributes zeros for every other token, and the partial
    results are summed across ranks with an all-reduce.

    Args:
        num_embedding: total vocabulary size (must be divisible by the
            world size).
        embedding_dim: size of each embedding vector.

    Raises:
        ValueError: if ``num_embedding`` is not divisible by the world size.
    """

    def __init__(self, num_embedding, embedding_dim):
        super().__init__()
        world_size = torch.distributed.get_world_size()
        if num_embedding % world_size != 0:
            # Floor division would silently drop the tail of the vocabulary,
            # leaving those token ids without an owner on any rank.
            raise ValueError(
                f"num_embedding ({num_embedding}) must be divisible by "
                f"world_size ({world_size})"
            )
        self.num_embedding_per_partrition = num_embedding // world_size
        self.embedding_dim = embedding_dim

        self.weight = nn.Parameter(torch.randn(
            self.num_embedding_per_partrition,
            self.embedding_dim
        ))
        self.vocab_start_idx, self.vocab_end_idx = self.get_vocab_range(
            self.num_embedding_per_partrition
        )

    def get_vocab_range(self, num_embedding_per_partrition):
        """Return the half-open range ``[start, end)`` of token ids owned by this rank."""
        rank = torch.distributed.get_rank()
        start_idx = num_embedding_per_partrition * rank
        end_idx = start_idx + num_embedding_per_partrition
        return start_idx, end_idx

    def forward(self, tokens):
        """Embed ``tokens`` and sum the per-rank partial results.

        Args:
            tokens: integer tensor of global token ids (any shape).

        Returns:
            Tensor of shape ``tokens.shape + (embedding_dim,)``.
        """
        # The owned range is half-open, so ``vocab_end_idx`` itself belongs to
        # the next rank and must be masked here (``>=``, not ``>``).
        out_of_range = (tokens < self.vocab_start_idx) | (tokens >= self.vocab_end_idx)

        # Shift to local row indices; out-of-range tokens are pointed at row 0
        # only so the lookup stays in bounds -- their output is zeroed below.
        local_tokens = tokens - self.vocab_start_idx
        local_tokens[out_of_range] = 0

        embeddings = F.embedding(local_tokens, self.weight)
        # Zero exactly the positions this rank does not own so that the sum
        # across ranks yields each token's embedding once.
        embeddings = embeddings.masked_fill(out_of_range.unsqueeze(-1), 0.0)

        # NOTE(review): this collective is called directly on the activation;
        # confirm whether an autograd-aware reduction is required for training.
        torch.distributed.all_reduce(embeddings)

        return embeddings

In [None]:
monitor
reassign if a node leaves
build a new communication ring if a new node joins

In [4]:
class Copy(torch.autograd.Function):
    """Autograd function that copies a tensor between devices on given streams.

    ``forward`` moves ``input`` to the device of ``next_stream``; ``backward``
    moves the incoming gradient back to the device of ``prev_stream``. The
    two stream arguments receive no gradient.
    """

    @staticmethod
    def forward(ctx, prev_stream, next_stream, input):
        """Copy ``input`` to ``next_stream.device`` and return the copy."""
        ctx.prev_stream = prev_stream
        ctx.next_stream = next_stream

        compute_stream = torch.cuda.default_stream(next_stream.device)

        # ``torch.cuda`` has no ``use_stream``; ``torch.cuda.stream`` is the
        # context manager that selects the current stream.
        with torch.cuda.stream(prev_stream), torch.cuda.stream(next_stream):
            moved_input = input.to(next_stream.device)

            # Tell the caching allocator which extra streams touch each
            # tensor so its memory is not reused while the copy is in flight.
            input.record_stream(prev_stream)
            moved_input.record_stream(compute_stream)

        return moved_input

    @staticmethod
    def backward(ctx, grad_input):
        """Copy the gradient back to ``prev_stream.device``."""
        prev_stream = ctx.prev_stream
        next_stream = ctx.next_stream

        compute_stream = torch.cuda.default_stream(prev_stream.device)

        with torch.cuda.stream(prev_stream), torch.cuda.stream(next_stream):
            moved_grad = grad_input.to(prev_stream.device)

            grad_input.record_stream(next_stream)
            moved_grad.record_stream(compute_stream)

        # One entry per forward argument: the two streams take no gradient.
        return None, None, moved_grad

In [None]:
class Pipeline:
    """Drive micro-batches through model partitions following a schedule.

    Args:
        n_microbatches: number of micro-batches handed to the scheduler.
        n_partritions: number of partitions handed to the scheduler.
        devices: devices passed to ``spawn_worker``.
        scheduler: object exposing ``generate(n_microbatches, n_partritions)``
            that yields lists of ``(microbatch_idx, partrition_idx)`` pairs.
            Defaults to a fresh ``DetermisticScheduler()`` per instance.
        batches: indexable, mutable collection of micro-batches; each entry
            is overwritten with the output of the partition that processed it.
        partritions: indexable collection of callables, one per partition.
    """

    def __init__(
        self,
        n_microbatches, n_partritions, devices,
        scheduler=None, batches=None, partritions=None
    ):
        self.n_microbatches = n_microbatches
        self.n_partritions = n_partritions
        self.devices = devices
        # Build the default lazily: a default evaluated in the signature is
        # created once at class-definition time and shared by every instance.
        self.scheduler = scheduler if scheduler is not None else DetermisticScheduler()
        # ``compute`` reads these, so they have to be stored on the instance.
        self.batches = batches
        self.partritions = partritions

    def fit(self):
        """Run every clock cycle produced by the scheduler."""
        n_microbatches = self.n_microbatches
        n_partritions = self.n_partritions
        devices = self.devices
        scheduler = self.scheduler

        with spawn_worker(devices) as (in_queues, out_queues):
            for schedule in scheduler.generate(n_microbatches, n_partritions):
                self.compute(schedule, in_queues, out_queues)

    def compute(self, schedule, in_queues, out_queues):
        """Submit one clock cycle of tasks, then collect their outputs."""
        batches = self.batches
        partritions = self.partritions

        def bind(batch, partrition):
            # Bind the current values now; a bare closure over the loop
            # variables would see only their final values.
            def wrapper():
                return partrition(batch)
            return wrapper

        for microbatch_idx, partrition_idx in schedule:
            batch = batches[microbatch_idx]
            partrition = partritions[partrition_idx]

            task = Task(compute=bind(batch, partrition))
            in_queues[partrition_idx].put(task)

        for microbatch_idx, partrition_idx in schedule:
            queue_output = out_queues[partrition_idx].get()
            # Assumes the worker result exposes ``.output`` -- TODO confirm
            # against the worker implementation.
            batches[microbatch_idx] = queue_output.output

In [None]:
clock cycle 1: F(0, 0)
clock cycle 2: F(0, 1), F(1, 0)
clock cycle 3: F(1, 1), F(2, 0)
clock cycle 4: F(2, 1)

In [None]:
step 1: split a mini-batch into microbatches
step 2: create cuda streams
step 3: run the pipeline
step 4: gather the output of all micro-batches

In [None]:

step 0: wait for data transfer
step 1: get the input
step 2: construct task
step 3: put the task into the corresponding partition's in_queue
step 4: wait and get the output
step 5: put the output into the next partition's in_queue

In [None]:
build backward dependencies, data transfer and compute, put the output

In [None]:
step 1: backward(layer4), and backward(layer3)
step 2: recompute the activations
step 3: do backward 2
step 4: continue

In [None]:
criteria 1: all cuda operations that associate with x in gpu0...
criteria 2: 

### MechInterp

output_layer2 = embed + pos_embed + layer1 + layer2
= embed + pos_embed + attn00 + attn01 + mlp0 + attn10 + attn11 + mlp1

In [None]:
logits = model(past_moves)
log_probs = torch.log_softmax(logits[:, -1, :], dim=-1)

In [None]:
board_states = torch.zeros(board_size*board_size)
board_states[next_possible_moves] = log_probs

step 1: approximate nonlinear
step 2: probs => logit difference
step 3: decompose logits
step 4: project each input component onto the target token's direction in unembedding space

In [None]:
softmax(x@W_Q@W_K@x.T) @ W_V 

In [None]:
tokens = model.to_tokens(text)

In [None]:
_, cache = model.run_with_cache(tokens)

In [None]:
embed = cache["hook_embed"]
pos_embed = cache["hook_pos_embed"]
head_outputs = cache["result", layer_idx-1]

In [None]:
input_components = torch.cat([
    embed,
    pos_embed,
    head_outputs
], dim=-1)

In [5]:
from einops import einsum

In [None]:
W_Q = model.W_Q
query_components = einsum(
    input_components,
    W_Q,
    ""
)

In [None]:
W_K = model.W_K
key_components = einsum(
    input_components, W_K,
    ""
)

In [None]:
decomposed_attention_scores = einsum(
    query_components,
    key_components,
    ""
)

In [None]:
W_E @ W_QK^{1, 4} @ W_OV_{0, 7} @ W_E.T

In [None]:
W_E = model.W_E
W_Q = model.W_Q[1, 4]
W_K = model.W_K[1, 4]
W_O = model.W_O[0, 7]
W_V = model.W_V[0, 7]

In [None]:
Q = W_E @ W_Q

In [None]:
K = W_E W_V @ W_O.T @ W_K

In [None]:
ob_rref.owner()

In [None]:
# torch.Tensor has no ``to_numpy``; the conversion method is ``numpy()``.
# Detach and move to CPU first so it also works for GPU / grad-tracking tensors.
tensor.detach().cpu().numpy()

In [None]:
# Each entry of ``optimizer.param_groups`` is a dict; its tensors live under
# the "params" key (a dict has no ``.parameters()`` method).
for param_group in optimizer.param_groups:
    for param in param_group["params"]:
        print(param.shape)

In [None]:
forward(x) > output = forward(x) > backward(output)

In [None]:
class Pipeline:
    """Drive micro-batches through model partitions following a schedule.

    Args:
        microbatches: indexable, mutable collection of micro-batches; each
            entry is overwritten with the output of the partition that
            processed it.
        partritions: indexable collection of callables, one per partition.
        devices: devices passed to ``spawn_worker``.
        scheduler: object exposing ``generate(n_microbatches, n_partritions)``
            that yields lists of ``(microbatch_idx, partrition_idx)`` pairs.
            Defaults to a fresh ``DetermisticScheduler()`` per instance.
    """

    def __init__(
        self,
        microbatches, partritions, devices,
        scheduler=None
    ):
        super().__init__()
        self.batches = microbatches
        self.partritions = partritions
        self.devices = devices
        # Build the default lazily: a default evaluated in the signature is
        # created once at class-definition time and shared by every instance.
        self.scheduler = scheduler if scheduler is not None else DetermisticScheduler()

    def fit(self):
        """Run every clock cycle produced by the scheduler."""
        batches = self.batches
        partritions = self.partritions
        devices = self.devices
        scheduler = self.scheduler

        n_microbatches = len(batches)
        n_partritions = len(partritions)

        with spawn_worker(devices) as (in_queues, out_queues):
            for schedule in scheduler.generate(n_microbatches, n_partritions):
                self.compute(schedule, in_queues, out_queues)

    def compute(self, schedule, in_queues, out_queues):
        """Submit one clock cycle of tasks, then collect their outputs.

        Args:
            schedule: iterable of ``(microbatch_idx, partrition_idx)`` pairs.
            in_queues: per-partition queues that receive ``Task`` objects.
            out_queues: per-partition queues that yield the task results.
        """
        batches = self.batches
        partritions = self.partritions

        def bind(batch, partrition):
            # Bind the current values now; a bare closure over the loop
            # variables would see only their final values.
            def wrapper():
                return partrition(batch)
            return wrapper

        for microbatch_idx, partrition_idx in schedule:
            batch = batches[microbatch_idx]
            partrition = partritions[partrition_idx]

            # Pass the bound zero-argument callable, not the factory itself.
            task = Task(compute=bind(batch, partrition))
            in_queues[partrition_idx].put(task)

        for microbatch_idx, partrition_idx in schedule:
            result = out_queues[partrition_idx].get()
            batches[microbatch_idx] = result.output

In [6]:
from torch.utils.data import Sampler

In [None]:
class EvenSampler(Sampler):
    """Sampler that yields the even indices ``0, 2, 4, ...`` of ``data``.

    Args:
        data: sized collection whose indices are sampled.
    """

    def __init__(self, data):
        # Store the argument; ``self.data = self.data`` read an attribute
        # that did not exist yet and raised AttributeError.
        self.data = data

    def __iter__(self):
        return iter(range(0, len(self.data), 2))

    def __len__(self):
        # Number of even indices in ``range(len(data))``.
        return (len(self.data) + 1) // 2

ElasticDriver, HostDiscovery, NotificationManager, NotificationService, NotificationClient, TorchState

In [7]:
tensor_model_parallel_size = 2

In [8]:
num_tensor_model_parallel_groups = 8

In [None]:
for i in range(num_tensor_model_parallel_groups):
    # Group i owns the contiguous ranks [i*size, (i+1)*size); the end bound
    # must be scaled by the group size, otherwise most groups come out empty.
    ranks = range(
        i*tensor_model_parallel_size,
        (i+1)*tensor_model_parallel_size
    )

clock cycle 1: backward(m, n)
clock cycle 2: backward(m-1, n), backward(m, n-1)
clock cycle 3: backward(m-2, n), backward(m-1, n-1), backward(m, n-2)

In [None]:
sync
restore
commit/save
res

In [None]:
step 1: two prompts
step 2: record all the intermediate activations
step 3: iteratively ...
step 4: compute logit diff

In [None]:
W_U[:, tokens]

In [None]:
prob0 = sigmoid(logit0-logit1)

logit0 = fn_ln @ W_U[0]

In [None]:
tokens = model.to_tokens(text)

In [9]:
data = None

In [10]:
def extract_neuron_activations(activations, hook):
    """Forward hook that stashes ``activations`` in the module-level ``data``.

    Args:
        activations: value produced at the hook point.
        hook: the hook point object (unused).

    Returns:
        ``activations`` unchanged, so the forward pass is not altered.
    """
    # Without ``global`` the assignment creates a local variable and the
    # module-level ``data`` stays ``None``.
    global data
    data = activations
    return activations

In [None]:
hook_name = f"blocks.{layer_idx}.mlp.hook_post"

In [None]:
model.run_with_cache(
    tokens,
    fwd_hooks=[(hook_name, extract_neuron_activations)]
)

In [None]:
arg_max = torch.argmax(data, dim=-1)

In [11]:
def print_shape(module, inp):
    """Forward pre-hook that prints the shape of every input to ``module``.

    Args:
        module: the module about to run (unused).
        inp: the positional inputs. A forward pre-hook receives them as a
            tuple; a bare tensor is accepted too.

    Returns:
        None, so the inputs are passed through unmodified.
    """
    # A tuple has no ``.shape``; unwrap it and report each input separately.
    inputs = inp if isinstance(inp, (tuple, list)) else (inp,)
    for item in inputs:
        if hasattr(item, "shape"):
            print(item.shape)

In [None]:
model.blocks[1].register_forward_pre_hook(print_shape)

In [None]:
W_U = model.W_U

In [None]:
correct_resid_direction = W_U[:, correct_token]
incorrect_resid_direction = W_U[:, incorrect_token]

In [None]:
logit_diff_direction = correct_resid_direction - incorrect_resid_direction

In [None]:
n_features = 5

In [None]:
interferences[
    torch.arange(n_features),
    torch.arange(n_features)
] = 0.

In [None]:
# Sum of squared interference with every other feature, per feature.
# Uses ``interferences``, the tensor whose diagonal was zeroed above.
polysemanticity = interferences.pow(2).sum(dim=-1)

In [None]:
model.accumulated_resid(
    layer=layer_idx,
    pos_slice=target_positions
)

In [None]:
QK, OV 

In [12]:
import pytest

In [None]:
@pytest.mark.parametrize(
    "input, output",
    [(1, 1), (2, 4)]
)
def test_square(input, output):
    """Check ``square`` against each (input, expected output) pair."""
    assert square(input) == output

In [None]:
setattr

In [None]:
from tp

In [None]:
docker network create mongo-network

In [13]:
from typing import Union

In [None]:
x: Union[str, int, float] = []

In [14]:
from functools import lru_cache

In [None]:
ready, succeed, failed, cooldown, blacklisted

In [15]:
tensor_model_parallel_size

2

In [16]:
num_tensor_model_parallel_groups

8

In [17]:
for i in range(num_tensor_model_parallel_groups):
    ranks = list(range(
        i*tensor_model_parallel_size,
        (i+1)*tensor_model_parallel_size
    ))
    
    print(ranks)

[0, 1]
[2, 3]
[4, 5]
[6, 7]
[8, 9]
[10, 11]
[12, 13]
[14, 15]


In [None]:
torch.distributed.Work

In [18]:
from torch.profiler import profile, ProfilerActivity

In [None]:
with profile(
    activities=[ProfilerActivity.CPU]
) as prof:
    

In [None]:
# Each entry of ``optimizer.param_groups`` is a dict; its tensors live under
# the "params" key (a dict has no ``.parameters()`` method).
for param_group in optimizer.param_groups:
    for param in param_group["params"]:
        print(param.shape)

In [None]:
stream = torch.cuda.Stream(device)

In [None]:
with torch.cuda.device(device):
    with torch.cuda.stream(stream):
        mean = xs.mean()

In [19]:
def probability_scores(image_embedding, text_embedding):
    """Softmax over cosine similarities between image and text embeddings.

    Args:
        image_embedding: tensor whose last dimension is the embedding size.
        text_embedding: tensor whose last dimension is the embedding size.

    Returns:
        ``softmax(image @ text.T, dim=-1)`` computed on unit-length embeddings.
    """
    # Normalise every embedding on its own. A bare ``.norm()`` is the norm of
    # the whole tensor, which for a batch rescales all rows by one shared
    # factor instead of making each row unit length.
    image_embedding = F.normalize(image_embedding, dim=-1)
    text_embedding = F.normalize(text_embedding, dim=-1)

    similarities = image_embedding @ text_embedding.T
    probs = F.softmax(similarities, dim=-1)

    return probs

In [21]:
import torch.distributed.rpc as rpc

In [20]:
handlers = []

In [None]:
for ob_rref in ob_rrefs:
    # ``async`` is a reserved word, so ``rpc.async`` cannot even be parsed.
    # The RRef's ``rpc_async()`` proxy launches ``run`` on the RRef's owner
    # and returns a future that can be waited on later.
    handlers.append(ob_rref.rpc_async().run())

In [None]:
for handler in handlers:
    handler.wait()

In [22]:
from einops import rearrange

In [None]:
result = rearrange(
    images, 
    "b c (h w) -> b c h w",
    h=64
)

In [None]:
# The function is ``torch.zeros_like``; ``torch.zero_likes`` does not exist.
torch.zeros_like(RANDOM_TENSOR)

In [23]:
class SelfAttention(nn.Module):
    """Scaled dot-product attention.

    Args:
        d_head: head dimension; scores are divided by ``sqrt(d_head)``.
    """

    def __init__(self, d_head):
        super().__init__()
        self.d_head = d_head

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k`` / ``v``.

        Args:
            q: queries, ``(..., n_queries, d_head)``.
            k: keys, ``(..., n_keys, d_head)``.
            v: values, ``(..., n_keys, d_value)``.
            mask: optional tensor broadcastable to the score shape; positions
                equal to ``True`` are excluded from attention.

        Returns:
            Tuple ``(output, attn_patterns)``.
        """
        # Swap only the last two axes. ``permute`` needs one entry per axis,
        # so ``permute(-2, -1)`` fails on batched input and is a no-op on 2-D.
        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.d_head ** 0.5)

        if mask is not None:
            # ``masked_fill`` is out-of-place: keep the returned tensor.
            scores = scores.masked_fill(mask == True, -1e9)  # noqa: E712

        attn_patterns = F.softmax(scores, dim=-1)
        output = torch.matmul(attn_patterns, v)
        return output, attn_patterns

In [None]:
class BottleneckResidualBlock(nn.Module):
    def __init__(self):
        

In [None]:
(first_part @ filter1).sum() + (first_part @ filter2).sum()

In [None]:
new_xs = xs.at[2].set(100)

In [None]:
grad_loss = jax.grad(loss)

In [24]:
from ray import tune

In [None]:
def training_function(config):
    """Evaluate ``objective`` at the point in ``config`` and report the score.

    Args:
        config: mapping that provides the "x" and "y" values to evaluate.
    """
    tune.report(score=objective(config["x"], config["y"]))

In [None]:
result = tune.run(
    training_function,
    config=config
)

In [None]:
accuracy, latency, memory

In [25]:
from torch.distributions import Categorical

In [27]:
class ActorCritic(nn.Module):
    """Separate actor and critic MLPs for a discrete action space.

    Args:
        n_observations: size of the observation vector.
        n_actions: number of discrete actions.
        n_hidden: width of both hidden layers in each network.
    """

    def __init__(self, n_observations, n_actions, n_hidden):
        super().__init__()
        self.actor_network = nn.Sequential(
            nn.Linear(n_observations, n_hidden),
            nn.Tanh(),
            nn.Linear(n_hidden, n_hidden),
            nn.Tanh(),
            nn.Linear(n_hidden, n_actions)
        )
        # The value head is a plain linear output. A trailing Sigmoid would
        # confine value estimates to (0, 1), so returns outside that range
        # could never be fitted.
        self.critic_network = nn.Sequential(
            nn.Linear(n_observations, n_hidden),
            nn.Tanh(),
            nn.Linear(n_hidden, n_hidden),
            nn.Tanh(),
            nn.Linear(n_hidden, 1)
        )

    def forward(self, observations):
        """Sample an action and evaluate the state.

        Args:
            observations: tensor whose last dimension is ``n_observations``.

        Returns:
            Tuple ``(action, log_prob, entropy, critic_value)``; the first
            three come from the categorical policy built on the actor logits,
            ``critic_value`` has a trailing dimension of size 1.
        """
        logits = self.actor_network(observations)
        dist = Categorical(logits=logits)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        entropy = dist.entropy()
        critic_value = self.critic_network(observations)
        return action, log_prob, entropy, critic_value

In [28]:
import gymnasium as gym

In [None]:
envs = gym.vector.SyncVectorEnv([
    lambda: gym.make("CartPole-v1"), 
])

In [29]:
def clip(ratio, epsilon):
    """Clamp ``ratio`` elementwise to the interval ``[1 - epsilon, 1 + epsilon]``."""
    lower = 1 - epsilon
    upper = 1 + epsilon
    return ratio.clamp(min=lower, max=upper)