### Engineering

In [1]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
def wait_stream(source_stream, target_stream):
    """Make ``source_stream`` wait until the work queued on ``target_stream`` is done.

    Args:
        source_stream: The stream (or any non-CUDA placeholder standing for the
            CPU) that has to wait.
        target_stream: The stream (or non-CUDA placeholder) being waited on.

    Behavior:
        * target is a CUDA stream and source is a CUDA stream: the wait is
          queued on the source stream via ``source_stream.wait_stream``.
        * target is a CUDA stream and source is not: the host blocks on
          ``target_stream.synchronize()``.
        * target is not a CUDA stream: nothing is done.
    """
    if not isinstance(target_stream, torch.cuda.Stream):
        return

    if isinstance(source_stream, torch.cuda.Stream):
        # GPU waits for GPU
        source_stream.wait_stream(target_stream)
    else:
        # CPU waits for GPU (the method is `synchronize`, not `syncronous`)
        target_stream.synchronize()

In [3]:
class Wait(torch.autograd.Function):
    """Identity autograd function that inserts stream waits in both passes.

    Forward: ``next_stream`` waits for ``prev_stream``, then ``input`` is
    returned unchanged. Backward: the roles are swapped (``prev_stream`` waits
    for ``next_stream``) and the incoming gradient is passed through.
    """

    @staticmethod
    def forward(ctx, prev_stream, next_stream, input):
        # Keep both streams so backward can issue the mirrored wait.
        ctx.prev_stream = prev_stream
        ctx.next_stream = next_stream

        wait_stream(
            source_stream=next_stream,
            target_stream=prev_stream
        )

        return input

    @staticmethod
    def backward(ctx, grad_input):
        prev_stream = ctx.prev_stream
        next_stream = ctx.next_stream

        wait_stream(
            source_stream=prev_stream,
            target_stream=next_stream
        )

        # One gradient per forward argument: the two streams are not
        # differentiable (None), the tensor gets the gradient unchanged.
        # (Concatenating a list with a tensor, as before, raises.)
        return None, None, grad_input

In [None]:
clock cycle 1: backward(m, n)
clock cycle 2: backward(m, n-1), backward(m-1, n)
clock cycle 3: backward(m, n-2), backward(m-1, n-1), backward(m-2, n)

In [None]:
node > pod > container

In [None]:
step 1: scale the loss using the scaling factor
step 2: calculate the gradient using the scaled loss
step 3: unscale the gradient using the scaling factor
step 4: update the model's parameters with respect to unscaled gradient

In [5]:
class VocabParallelEmbedding(nn.Module):
    """Embedding table split along the vocabulary dimension across ranks.

    Each rank owns a contiguous slice ``[vocab_start_idx, vocab_end_idx)`` of
    the vocabulary. Tokens outside the local slice contribute a zero vector,
    and the per-rank results are summed with ``all_reduce``.

    Args:
        num_embeddings: Total vocabulary size; must be divisible by the world size.
        embedding_dim: Size of each embedding vector.

    Raises:
        ValueError: If ``num_embeddings`` is not divisible by the world size.

    Note:
        ``weight`` is allocated with ``torch.empty`` and is not initialised here.
    """

    def __init__(self, num_embeddings, embedding_dim):
        super().__init__()
        world_size = torch.distributed.get_world_size()
        if num_embeddings % world_size != 0:
            # Floor division would silently leave the trailing ids unowned.
            raise ValueError(
                f"num_embeddings ({num_embeddings}) must be divisible by "
                f"world_size ({world_size})"
            )
        self.num_embeddings_per_partrition = num_embeddings // world_size
        self.embedding_dim = embedding_dim

        self.weight = nn.Parameter(torch.empty(
            self.num_embeddings_per_partrition,
            self.embedding_dim
        ))
        self.vocab_start_idx, self.vocab_end_idx = self.get_vocab_range(
            self.num_embeddings_per_partrition
        )

    def get_vocab_range(self, num_embeddings_per_partrition):
        """Return the half-open ``(start, end)`` vocabulary range owned by this rank."""
        rank = torch.distributed.get_rank()
        start_idx = rank * num_embeddings_per_partrition
        # The end is one partition past the start (it used to equal the start).
        end_idx = start_idx + num_embeddings_per_partrition
        return start_idx, end_idx

    def forward(self, tokens):
        """Embed ``tokens`` using the local vocabulary slice and sum across ranks.

        Args:
            tokens: Integer tensor of token ids; it is not modified.

        Returns:
            Tensor of shape ``tokens.shape + (embedding_dim,)``.
        """
        # True where the token id is NOT owned by this rank (end is exclusive).
        out_of_range = (tokens < self.vocab_start_idx) | (tokens >= self.vocab_end_idx)

        # Shift to local indices; park foreign tokens on row 0 so the lookup
        # stays in bounds. Their rows are zeroed right after.
        local_tokens = tokens - self.vocab_start_idx
        local_tokens[out_of_range] = 0

        embeddings = F.embedding(local_tokens, self.weight)
        embeddings = embeddings.masked_fill(out_of_range.unsqueeze(-1), 0.0)

        torch.distributed.all_reduce(embeddings)

        return embeddings

In [None]:
step 1: partition
step 2: rank
step 3: init local
step 4: sync
step 5: move to buckets

In [None]:
# Two CUDA events created with timing enabled, used as the begin/end markers
# of the timed region.
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

In [None]:
# Record `start`, launch the workload being timed, then record `end`.
start.record()

# NOTE(review): `hardshit` is not defined in the visible cells; presumably a
# placeholder for the workload to measure.
hardshit()

end.record()

In [None]:
# Event recording is asynchronous with respect to the host, so wait until the
# `end` event has actually completed before reading the elapsed time.
end.synchronize()
# Milliseconds elapsed between the `start` and `end` events.
elapsed_time_ms = start.elapsed_time(end)

In [None]:
new > running > blocked > terminate

In [6]:
class RowParallelLinear(nn.Module):
    """Linear layer whose weight is split along the input dimension across ranks.

    Each rank holds an ``(output_size, input_size // world_size)`` slice of the
    weight, multiplies it with its slice of the input's last dimension, and the
    partial products are summed with ``all_reduce``. The bias is added once,
    after the reduction.

    Args:
        input_size: Full input feature size; must be divisible by the world size.
        output_size: Output feature size.

    Raises:
        ValueError: If ``input_size`` is not divisible by the world size.

    Note:
        ``weight`` and ``bias`` are allocated with ``torch.empty`` and are not
        initialised here.
    """

    def __init__(self, input_size, output_size):
        # Required before parameters can be registered on the module.
        super().__init__()
        world_size = torch.distributed.get_world_size()
        if input_size % world_size != 0:
            raise ValueError(
                f"input_size ({input_size}) must be divisible by "
                f"world_size ({world_size})"
            )
        self.input_size_per_partrition = input_size // world_size
        self.output_size = output_size

        self.weight = nn.Parameter(torch.empty(
            self.output_size,
            self.input_size_per_partrition
        ))
        self.bias = nn.Parameter(torch.empty(
            self.output_size
        ))

    def forward(self, input):
        """Apply the row-parallel linear transform to the full ``input``.

        Args:
            input: Tensor whose last dimension has size ``input_size``.

        Returns:
            Tensor with the last dimension replaced by ``output_size``.
        """
        rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()

        last_dim_size = input.shape[-1]
        last_dim_size_per_partrition = last_dim_size // world_size
        input_chunks = torch.split(
            input,
            last_dim_size_per_partrition,
            dim=-1
        )

        # This rank only multiplies its own slice of the features.
        input_parallel = input_chunks[rank]
        output_parallel = F.linear(input_parallel, self.weight)
        torch.distributed.all_reduce(output_parallel)

        # Add the bias after the sum; adding it before would count it once per rank.
        return output_parallel + self.bias

In [7]:
# Number of ranks in the worked example (the groups printed further down cover
# ranks 0-15). NOTE(review): the group-enumeration loop does not read this name.
world_size = 16

In [8]:
# Used by the loop below both as the inner-loop count and as the stride
# between ranks inside each printed group.
tensor_model_parallel_size = 2

In [9]:
# Number of outer-loop iterations in the loop below, i.e. how many contiguous
# rank blocks are enumerated.
pipeline_model_parallel_size = 4

In [10]:
# Width of each contiguous rank block in the loop below.
# NOTE(review): equals world_size // pipeline_model_parallel_size (16 // 4) for
# these values; confirm whether it should be derived rather than hard-coded.
num_pipeline_model_parallel_groups = 4

In [11]:
# Enumerate rank groups: the outer loop walks contiguous blocks of
# `num_pipeline_model_parallel_groups` ranks; inside each block, the inner loop
# picks every `tensor_model_parallel_size`-th rank starting at offset j.
# NOTE(review): this looks like a data-parallel group layout — confirm intent.
for i in range(pipeline_model_parallel_size):
    # Half-open rank interval [start_rank, end_rank) covered by block i.
    start_rank = i*num_pipeline_model_parallel_groups
    end_rank = (i+1)*num_pipeline_model_parallel_groups
    
    for j in range(tensor_model_parallel_size):
        ranks = list(range(
            start_rank+j,
            end_rank,
            tensor_model_parallel_size
        ))
        
        print(ranks)

[0, 2]
[1, 3]
[4, 6]
[5, 7]
[8, 10]
[9, 11]
[12, 14]
[13, 15]


In [None]:
# Rank of the current process; used by the point-to-point example in the next cell.
rank = torch.distributed.get_rank()

In [None]:
# Point-to-point transfer from rank 69 to rank 42.
# `isend`/`irecv` are asynchronous and return a request handle; the handle is
# waited on so the tensor is safe to reuse (sender) or read (receiver).
if rank == 69:
    # The destination keyword is `dst` (not `dest`).
    send_request = torch.distributed.isend(x, dst=42)
    send_request.wait()
elif rank == 42:
    recv_request = torch.distributed.irecv(tensor_will_be_received_data, src=69)
    recv_request.wait()

In [None]:
Forward(x) -> output = Forward(x) -> Backward(output)

In [None]:
deployment, configmap, services

### MechInterp

In [13]:
# Pairs identifying the attention heads to inspect. The loop further down
# unpacks each tuple as (head_idx, layer_idx).
induction_heads = [(6, 9), (4, 2)]

In [None]:
# Tokenize `repeated_text` with a BOS token prepended.
# NOTE(review): `model` and `repeated_text` are not defined in the visible cells.
tokens = model.to_tokens(repeated_text, prepend_bos=True)

In [None]:
# Run the model on the tokens; the first return value is discarded and only
# the activation cache is kept.
_, cache = model.run_with_cache(tokens)

In [14]:
from transformer_lens.utils import get_act_name

In [None]:
# For each listed head, look up the cached "attn" activation of its layer and
# select batch element 0, head `head_idx`.
# NOTE(review): tuples are unpacked as (head_idx, layer_idx); confirm that
# `induction_heads` is not stored as (layer, head).
# NOTE(review): `attention_pattern` is overwritten on every iteration and is
# not used inside the loop.
for head_idx, layer_idx in induction_heads:
    hook_name = get_act_name("attn", layer_idx)
    attention_pattern = cache[hook_name][0, head_idx]

In [None]:
# Size of the last dimension of `interference`.
n_features = interference.shape[-1]

In [None]:
# Element-wise pow(2).sqrt() yields the absolute value, so this sums
# |interference| over the last dimension.
# NOTE(review): if an L2 norm was intended, the sum has to come before the
# sqrt — confirm.
polysemanticity = interference.pow(2).sqrt().sum(dim=-1)

In [None]:
# Sum `x` over its last dimension while `device` is the selected CUDA device.
# NOTE(review): `device` and `x` are not defined in the visible cells.
with torch.cuda.device(device):
    total = x.sum(dim=-1)

In [15]:
from contextlib import contextmanager

In [16]:
@contextmanager
def first_context():
    """Context manager that prints a message on entry and on exit.

    The exit message is printed even when the body of the ``with`` block
    raises; the exception then propagates unchanged.
    """
    print("entering the context")
    try:
        yield
    finally:
        # Without try/finally the exit step is skipped if the body raises.
        print("leaving the context")

In [None]:
if some_number > 6
    println("yes")
endif 

In [17]:
class MultiplyConstant(torch.autograd.Function):
    """Custom autograd function whose forward pass returns its input unchanged.

    NOTE(review): despite the name, no constant is involved in the visible
    code — the forward pass is the identity. The backward pass therefore
    passes the incoming gradient through unchanged.
    """

    @staticmethod
    def forward(ctx, input):
        # Identity: nothing has to be saved for the backward pass.
        return input

    @staticmethod
    def backward(ctx, grad_output):
        # Autograd calls backward with one gradient per forward output and
        # expects one gradient per forward input.
        return grad_output

In [18]:
@contextmanager
def use_stream(stream):
    """Run the ``with`` body on ``stream`` when it is a CUDA stream.

    For anything that is not a ``torch.cuda.Stream`` the body simply runs
    without any stream being selected.
    """
    if isinstance(stream, torch.cuda.Stream):
        with torch.cuda.stream(stream):
            yield
    else:
        yield

In [None]:
# Display the rank of the current process as the cell output.
torch.distributed.get_rank()

In [None]:
# NOTE(review): called without arguments here — presumably a placeholder;
# confirm the inputs and target devices to pass.
nn.parallel.scatter()

In [None]:
from pyspark import SparkConf

In [None]:
# Create a Spark configuration object with no explicit settings passed in.
conf = SparkConf()

In [None]:
# Set the application name on the configuration.
# NOTE(review): `app_name` is not defined in the visible cells.
conf.setAppName(app_name)

In [None]:
from pyspark import SparkContext

In [None]:
# Create the Spark context from the configuration object `conf`.
sc = SparkContext(conf=conf)

In [19]:
class RowParallelLinear(nn.Module):
    """Linear layer with its weight partitioned along the input features.

    Every rank stores an ``(output_size, input_size // world_size)`` weight
    slice and processes the matching slice of the input's last dimension. The
    per-rank partial outputs are summed with ``all_reduce`` and the bias is
    added once, after that sum.

    Args:
        input_size: Full input feature size; must be divisible by the world size.
        output_size: Output feature size.

    Raises:
        ValueError: If ``input_size`` is not divisible by the world size.

    Note:
        ``weight`` and ``bias`` come from ``torch.empty`` and are not
        initialised here.
    """

    def __init__(self, input_size, output_size):
        # nn.Module must be initialised before parameters are assigned.
        super().__init__()
        world_size = torch.distributed.get_world_size()
        if input_size % world_size != 0:
            raise ValueError(
                f"input_size ({input_size}) must be divisible by "
                f"world_size ({world_size})"
            )
        self.input_size_per_partrition = input_size // world_size
        self.output_size = output_size

        self.weight = nn.Parameter(torch.empty(
            self.output_size,
            self.input_size_per_partrition
        ))
        self.bias = nn.Parameter(torch.empty(
            self.output_size
        ))

    def forward(self, input):
        """Compute the linear transform of the full ``input`` across ranks.

        Args:
            input: Tensor whose last dimension has size ``input_size``.

        Returns:
            Tensor with the last dimension replaced by ``output_size``.
        """
        rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()

        dim_size = input.shape[-1]
        dim_size_per_partrition = dim_size // world_size
        input_chunks = torch.split(
            input,
            dim_size_per_partrition,
            dim=-1
        )
        # Only this rank's slice of the features is multiplied locally.
        input_parallel = input_chunks[rank]
        output_parallel = F.linear(input_parallel, self.weight)

        torch.distributed.all_reduce(output_parallel)

        # Bias goes in after the reduction; otherwise it is summed once per rank.
        return output_parallel + self.bias

In [None]:
n_partrition
split
process

In [20]:
import pytest

In [None]:
@pytest.mark.parametrize(
    ("input", "output"),
    [(1, 1), (2, 4)]
)
def test_square(input, output):
    """Check that ``square(input)`` equals ``output`` for each parametrized pair.

    NOTE(review): ``square`` is not defined in the visible cells.
    """
    assert square(input) == output

In [21]:
from abc import ABC, abstractclassmethod

In [None]:
class Switchable(ABC):
    """Abstract base class for switchable objects.

    NOTE(review): the original cell stopped at the class header (no colon, no
    body); the abstract methods still have to be filled in.
    """

In [22]:
class PositionalEmbedding(nn.Module):
    """Fixed sinusoidal positional embedding.

    For position ``p`` and feature pair ``k``::

        PE[p, 2k]     = sin(p / n ** (2k / d_model))
        PE[p, 2k + 1] = cos(p / n ** (2k / d_model))

    Args:
        n: Base of the geometric progression of wavelengths (e.g. 10000).
        d_model: Size of each embedding vector.
    """

    def __init__(self, n, d_model):
        super().__init__()
        self.n, self.d_model = n, d_model

    def forward(self, tokens):
        """Return the ``(len(tokens), d_model)`` table of positional embeddings.

        Only the length of ``tokens`` is used; its values are ignored.
        """
        seq_len = len(tokens)

        # Column vector of positions: (seq_len, 1).
        positions = torch.arange(seq_len, dtype=torch.float32).unsqueeze(1)

        # One exponent 2k / d_model per (sin, cos) feature pair. The exponent
        # is a division by d_model, not a multiplication.
        pair_exponents = torch.arange(0, self.d_model, 2, dtype=torch.float32) / self.d_model
        denominators = float(self.n) ** pair_exponents

        # (seq_len, ceil(d_model / 2)) angles, shared by each sin/cos pair.
        angles = positions / denominators

        embeddings = torch.zeros(seq_len, self.d_model)
        embeddings[:, 0::2] = torch.sin(angles)
        # With an odd d_model the last pair has no cos column.
        embeddings[:, 1::2] = torch.cos(angles[:, : self.d_model // 2])

        return embeddings

In [23]:
def create_one_hot(idx, size):
    """Return a zero tensor of shape ``size`` with the entry at ``idx`` set to 1."""
    one_hot = torch.zeros(size)
    one_hot[idx] = 1
    return one_hot

In [None]:
# Multiply a length-5 one-hot vector (index 3 set) with `user_factors`;
# presumably this selects row 3 of `user_factors` — confirm its shape.
create_one_hot(3, 5).T @ user_factors

In [None]:
def add_x(layer, inp, out):
    """Forward-hook callback: pass the layer and its output to ``add_log``.

    The hook's ``inp`` argument is ignored.
    NOTE(review): ``add_log`` is not defined in the visible cells.
    """
    add_log(layer, out)

In [None]:
def add_hook(model):
    """Register ``add_x`` as a forward hook on every layer yielded by ``model``.

    Args:
        model: An iterable of modules (iterated directly with ``for``).

    Returns:
        The list of handles returned by ``register_forward_hook``, in iteration
        order, so the hooks can later be detached with ``handle.remove()``.
        Previously the handles were discarded.
    """
    return [layer.register_forward_hook(add_x) for layer in model]

In [None]:
class Optimizer:
    """Minimal gradient-descent optimizer: ``p <- p - lr * p.grad``.

    Args:
        params: Iterable of tensors to update. It is materialised into a list
            so a generator (e.g. ``model.parameters()``) survives repeated steps.
        lr: Learning rate multiplied with each gradient.
    """

    def __init__(self, params, lr):
        self.params = list(params)
        self.lr = lr

    def step(self):
        """Apply one in-place update to every trainable parameter with a gradient."""
        # In-place updates of leaf tensors that require grad are only allowed
        # with autograd recording disabled.
        with torch.no_grad():
            for p in self.params:
                # Skip frozen parameters and ones that received no gradient.
                if p.requires_grad and p.grad is not None:
                    p.sub_(p.grad * self.lr)

In [None]:
# NOTE(review): called without arguments here — presumably a placeholder;
# confirm the outputs and target device to pass.
nn.parallel.gather()

In [None]:
def tokenize(x):
    """Run ``tokenizer`` on the ``"sentence1"`` field of ``x``.

    NOTE(review): the original line ended in an unterminated string
    (``x["se``); the key was completed to ``"sentence1"`` based on the column
    used in the following cell — confirm.
    """
    return tokenizer(x["sentence1"])

In [None]:
# Apply `tokenizer` through `.map` on the "sentence1" column of `small_dataset`.
# NOTE(review): confirm that the selected column exposes `.map`, and whether
# `small_dataset.map(tokenize)` was intended instead.
small_dataset["sentence1"].map(tokenizer)

In [None]:
# Build an RLTrainer for "PPO" with a scaling config requesting 3 workers.
# NOTE(review): `RLTrainer` and `ScalingConfig` are not imported in the visible
# cells; confirm the argument names against the library version in use.
trainer = RLTrainer(
    "PPO",
    config=ScalingConfig(num_workers=3)
)

In [24]:
from ray import tune
from ray.tune.callback import Callback

In [25]:
class PrintCallback(Callback):
    """Tune callback that prints ``"x"`` every time ``on_trial_result`` is invoked."""

    def on_trial_result(self, iteration, trials, trial, result, **info):
        # None of the arguments are used; the call itself is the signal.
        print("x")

In [None]:
# Run the tuning job with the print callback attached.
# `callbacks` expects a list of Callback instances, not a single instance.
tune.run(
    objective,
    config=search_space,
    callbacks=[PrintCallback()]
)

In [None]:
# Take 10 environment steps, each with an action sampled from the action space.
# NOTE(review): the return value of `env.step` is discarded and the
# environment is never reset in this cell.
for step in range(10):
    action = env.action_space.sample()
    env.step(action)

In [None]:
def clip(ratio, epsilon):
    """Clamp ``ratio`` element-wise into ``[1 - epsilon, 1 + epsilon]``."""
    lower_bound = 1 - epsilon
    upper_bound = 1 + epsilon
    return torch.clamp(ratio, min=lower_bound, max=upper_bound)