### Engineering

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
def is_grad_enabled(input):
    """Tell whether autograd would track operations on ``input``.

    Args:
        input: Object exposing a ``requires_grad`` attribute (a tensor).

    Returns:
        ``False`` when gradient mode is globally disabled; otherwise the
        value of ``input.requires_grad``.
    """
    # Global switch first: inside ``torch.no_grad()`` nothing is recorded,
    # whatever the tensor's own flag says.
    if not torch.is_grad_enabled():
        return False
    return input.requires_grad

In [None]:
def _broadcast(input):
    return input.clone()

In [None]:
def _reduce(input):
    world_size = torch.distributed.get_world_size()
    
    if world_size == 1:
        return input
    
    torch.distributed.all_reduce(input)
    
    return input

In [None]:
class Broadcast(torch.autograd.Function):
    """Autograd function: ``_broadcast`` in forward, ``_reduce`` in backward."""

    @staticmethod
    def forward(ctx, input):
        """Return ``_broadcast(input)``; nothing is saved on ``ctx``."""
        replicated = _broadcast(input)
        return replicated

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``_reduce(grad_output)`` as the gradient for ``input``."""
        reduced_grad = _reduce(grad_output)
        return reduced_grad

In [None]:
def broadcast_with_forward_and_backward(inputs):
    """Broadcast ``inputs``, going through autograd only when it is needed.

    Args:
        inputs: Tensor to broadcast.

    Returns:
        ``Broadcast.apply(inputs)`` when ``is_grad_enabled(inputs)`` is true,
        otherwise the plain ``_broadcast(inputs)`` result.
    """
    # No gradient will flow: skip the autograd.Function wrapper entirely.
    if not is_grad_enabled(inputs):
        return _broadcast(inputs)

    return Broadcast.apply(inputs)

In [None]:
world_size = torch.distributed.get_world_size()

In [None]:
sorted_params = sorted(param_list, key=lambda x: x.numel(), reverse=True)

In [None]:
numel_per_rank = [0 for _ in range(world_size)]

In [None]:
param_per_rank = [[] for _ in range(world_size)]

In [None]:
# Greedy balancing: every parameter goes to the rank that currently holds the
# fewest elements (ties resolve to the lowest rank index).
for param in sorted_params:
    next_device = min(
        range(len(numel_per_rank)),
        key=numel_per_rank.__getitem__,
    )

    numel_per_rank[next_device] += param.numel()
    param_per_rank[next_device].append(param)

In [None]:
step 1: send
step 2: sender continues their execution
step 3: receiver stops
step 4: if, then

In [None]:
class VocabParallelEmbedding(nn.Module):
    """Embedding table whose vocabulary rows are split across ranks.

    Each rank stores the rows for the contiguous token-id range
    ``[vocab_start_idx, vocab_end_idx)``. ``forward`` returns this rank's
    partial result only: tokens outside the local range get an all-zero
    vector, so the full embedding is the sum of the outputs of all ranks.

    Args:
        num_embedding: Total vocabulary size across all ranks.
        embedding_dim: Size of each embedding vector.

    Raises:
        ValueError: If ``num_embedding`` is not divisible by the world size.
    """

    def __init__(self, num_embedding, embedding_dim):
        super().__init__()

        world_size = torch.distributed.get_world_size()
        if num_embedding % world_size != 0:
            # An uneven split would leave the trailing token ids owned by
            # no rank, silently mapping them to zero vectors everywhere.
            raise ValueError(
                f"num_embedding ({num_embedding}) must be divisible by "
                f"the world size ({world_size})"
            )

        self.num_embedding = num_embedding
        self.embedding_dim = embedding_dim
        # The vocabulary (rows) is what gets partitioned; every rank keeps
        # full-width vectors so the partial outputs can be summed.
        self.num_embedding_per_partition = num_embedding // world_size
        self.embedding = nn.Embedding(
            self.num_embedding_per_partition,
            embedding_dim=embedding_dim
        )
        self.vocab_start_idx, self.vocab_end_idx = self.get_vocab_range(
            self.num_embedding_per_partition
        )

    def get_vocab_range(self, embedding_dim_per_partrition):
        """Return the ``(start, end)`` token-id range owned by this rank.

        Args:
            embedding_dim_per_partrition: Number of vocabulary rows held by
                each rank. The parameter name is kept unchanged for
                backward compatibility.

        Returns:
            Tuple ``(start_idx, end_idx)``; ``end_idx`` is exclusive.
        """
        rows_per_rank = embedding_dim_per_partrition
        rank = torch.distributed.get_rank()
        start_idx = rank * rows_per_rank
        end_idx = start_idx + rows_per_rank
        return start_idx, end_idx

    def forward(self, tokens):
        """Look up the locally stored embeddings for ``tokens``.

        Args:
            tokens: Integer tensor of global token ids. It is not modified.

        Returns:
            Tensor of shape ``tokens.shape + (embedding_dim,)`` with zero
            vectors at positions whose token belongs to another rank.
        """
        # True where the token is NOT stored on this rank.
        mask = (tokens < self.vocab_start_idx) | (tokens >= self.vocab_end_idx)

        # Shift to local row indices; the subtraction allocates a new tensor,
        # so the in-place write below never touches the caller's ``tokens``.
        local_tokens = tokens - self.vocab_start_idx
        # Foreign tokens are pointed at row 0 to keep the lookup in range;
        # their output is zeroed right after.
        local_tokens[mask] = 0

        embeddings = self.embedding(local_tokens)
        embeddings[mask] = 0.

        return embeddings

In [None]:
step 1: replicate the model
step 2: micro-batch
step 3: grad
step 4: average all the gradient

In [None]:
register > cached sram > main memory dram > 

In [None]:
step 1: partition the params
step 2: 

In [None]:
gradient, params, optimizer 

In [None]:
import os

In [None]:
class MPU:
    """Initialises the default ``torch.distributed`` process group.

    Construction is a no-op when a process group already exists.

    Args:
        master_addr: Address exported as ``MASTER_ADDR``.
        master_port: Port exported as ``MASTER_PORT``; converted to ``str``
            because environment values must be strings.
        backend: Backend name forwarded to ``init_process_group``.

    Raises:
        KeyError: If ``RANK`` or ``WORLD_SIZE`` is missing from the
            environment.
        ValueError: If either of them is not an integer.
    """

    def __init__(self, master_addr, master_port, backend):
        if torch.distributed.is_initialized():
            return

        # Environment values are strings; both the modulo below and
        # ``init_process_group`` need real integers.
        rank = int(os.environ["RANK"])
        world_size = int(os.environ["WORLD_SIZE"])
        os.environ["MASTER_ADDR"] = str(master_addr)
        os.environ["MASTER_PORT"] = str(master_port)

        device_count = torch.cuda.device_count()

        if device_count > 0:
            # Spread ranks round-robin over the visible CUDA devices.
            device = rank % device_count
            torch.cuda.set_device(device)

        torch.distributed.init_process_group(
            rank=rank,
            world_size=world_size,
            backend=backend
        )

In [None]:
class ParallelMLP(nn.Module):
    """Two-layer MLP: column-parallel expansion, GELU, row-parallel projection.

    Args:
        hidden_size: Width of the input and output; the inner layer is four
            times as wide.
    """

    def __init__(self, hidden_size):
        super().__init__()
        inner_size = hidden_size*4

        self.dense_h_to_4h = ColumnParallelLinear(
            input_size=hidden_size,
            output_size=inner_size
        )
        self.gelu = nn.GELU()
        self.dense_4h_to_h = RowParallelLinear(
            input_size=inner_size,
            output_size=hidden_size
        )

    def forward(self, x):
        """Apply ``dense_h_to_4h``, then GELU, then ``dense_4h_to_h`` to ``x``."""
        expanded = self.dense_h_to_4h(x)
        activated = self.gelu(expanded)
        return self.dense_4h_to_h(activated)

In [None]:
from torch.utils.data import Dataset

In [None]:
class CachedDataset(Dataset):
    """Dataset that packs requested items into one flat cache tensor.

    The file is loaded lazily on the first ``prefetch`` call. After a
    prefetch, ``cache`` is a 1-D tensor holding the flattened items back to
    back and ``cache_index`` maps each item index to its start offset in
    ``cache``.

    Args:
        filename: Path passed to ``torch.load``.
    """

    def __init__(self, filename):
        self.filename = filename
        # Filled in by the first prefetch().
        self.data = None
        self.cache = None
        self.cache_index = {}

    def prefetch(self, idxs):
        """Copy the items at ``idxs`` into a freshly allocated cache.

        Does nothing when every requested index is already cached (which
        includes an empty ``idxs``).

        Args:
            idxs: Iterable of indices into the loaded data.
        """
        idxs = list(idxs)
        if all(i in self.cache_index for i in idxs):
            return

        # ``not tensor`` is ambiguous for multi-element tensors, hence the
        # explicit None check.
        if self.data is None:
            # NOTE: torch.load unpickles the file; only load trusted files.
            self.data = torch.load(self.filename)

        items = [self.data[i].reshape(-1) for i in idxs]
        total_numbers = sum(item.numel() for item in items)

        # Every slot is overwritten below, so no initialisation is needed.
        self.cache = torch.empty(total_numbers, dtype=items[0].dtype)
        # Offsets from a previous cache are meaningless for the new buffer.
        self.cache_index = {}

        offset = 0
        for i, item in zip(idxs, items):
            n_numbers = item.numel()
            self.cache[offset:offset+n_numbers] = item
            self.cache_index[i] = offset
            offset += n_numbers

In [None]:
for i in range(num_tensor_model_parallel_groups):
    # TODO: the loop body was never written; ``pass`` keeps the cell valid.
    pass

In [None]:
step 1: load data to main memory
step 2: the size
step 3: reserve
step 4: load


In [None]:
shared memory, file system, message passing

In [None]:
step 1: partition
step 2: move
step 3: init local optimizer
step 4: sync local optimizer
step 5: move local params to a bucket

### ML Engineering

In [None]:
key, value, timestamp

In [None]:
import threading

In [None]:
thread = threading.Thread(target=print_numbers)

In [None]:
thread.start()

In [None]:
from typing import cast, List

In [None]:
numbers = cast(List[int], numbers)

In [None]:
step 1: create a network
step 2: attach the network
step 3: receive IP
step 4: communicate through the IPs

In [None]:
from contextlib import contextmanager

In [None]:
@contextmanager
def first_context():
    """Context manager that prints an empty line when it is entered.

    Yields:
        ``None``; the ``with`` body runs at the point of the ``yield``.
    """
    print("")
    # A @contextmanager function must be a generator that yields exactly
    # once; without the yield, entering the context raises TypeError.
    yield

### AI

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
def tokenize(x):
    """Tokenise the ``sentence1`` field of a dataset row.

    Args:
        x: Mapping for one dataset row; must contain ``"sentence1"``.

    Returns:
        Dict with the tokenizer output stored under ``"sentence1"``.
    """
    # The row is indexed by its text column; the previous key "tokenizer"
    # is not a column the rest of this notebook reads from the dataset.
    return {
        "sentence1": tokenizer(x["sentence1"])
    }

In [None]:
tokenized_dataset = small_dataset.map(tokenize)

In [None]:
def filter_func(x):
    """Keep rows whose ``sentence1`` text begins with a capital ``F``."""
    sentence = x["sentence1"]
    return sentence.startswith("F")

In [None]:
small_dataset.filter(filter_func)

In [None]:
torch.distributed.broadcast(x, src=0, async_op=True)

In [None]:
# Print the shape of every parameter the optimizer manages.
for param_group in optimizer.param_groups:
    # Each group stores its tensors under the "params" key (plural).
    for param in param_group["params"]:
        print(param.shape)

In [None]:
def patch_func(activations, hook):
    """Zero index 4 of the third axis of ``activations``, in place.

    Args:
        activations: Tensor with at least four axes; modified in place.
        hook: Unused; present to satisfy the hook-function signature.

    Returns:
        The same ``activations`` object after the write.
    """
    target = (slice(None), slice(None), 4, slice(None))
    activations[target] = 0.
    return activations

In [None]:
model.run_with_hooks(
    tokens,
    fwd_hooks=[(hook_name, patch_func)]
)

In [None]:
class VocabParallelEmbedding(nn.Module):
    """Embedding table whose vocabulary rows are split across ranks.

    Each rank stores the rows for the contiguous token-id range
    ``[vocab_start_idx, vocab_end_idx)``. In ``forward`` every rank looks up
    the tokens it owns, emits zeros for the rest, and the partial results
    are summed with an all-reduce so all ranks end up with the full
    embedding.

    Args:
        num_embeddings: Total vocabulary size across all ranks.
        embedding_dim: Size of each embedding vector.

    Raises:
        ValueError: If ``num_embeddings`` is not divisible by the world size.
    """

    def __init__(self, num_embeddings, embedding_dim):
        super().__init__()
        world_size = torch.distributed.get_world_size()
        if num_embeddings % world_size != 0:
            # An uneven split would leave the trailing token ids owned by
            # no rank, silently mapping them to zero vectors everywhere.
            raise ValueError(
                f"num_embeddings ({num_embeddings}) must be divisible by "
                f"the world size ({world_size})"
            )

        self.world_size = world_size
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        # The vocabulary (rows) is partitioned; vectors stay full width so
        # that summing the per-rank outputs is meaningful.
        self.num_embeddings_per_partition = num_embeddings // world_size

        # NOTE: torch.empty leaves the values uninitialised; they must be
        # initialised or loaded from a checkpoint before use.
        self.weight = nn.Parameter(torch.empty(
            self.num_embeddings_per_partition,
            self.embedding_dim
        ))
        self.vocab_start_idx, self.vocab_end_idx = self.get_vocab_range(
            self.num_embeddings_per_partition
        )

    def get_vocab_range(self, embedding_dim_per_partrition):
        """Return the ``(start, end)`` token-id range owned by this rank.

        Args:
            embedding_dim_per_partrition: Number of vocabulary rows held by
                each rank. The parameter name is kept unchanged for
                backward compatibility.

        Returns:
            Tuple ``(start_idx, end_idx)``; ``end_idx`` is exclusive.
        """
        rows_per_rank = embedding_dim_per_partrition
        rank = torch.distributed.get_rank()
        start_idx = rank * rows_per_rank
        end_idx = start_idx + rows_per_rank
        return start_idx, end_idx

    def forward(self, tokens):
        """Embed ``tokens`` and sum the partial results across ranks.

        Args:
            tokens: Integer tensor of global token ids. It is not modified.

        Returns:
            Tensor of shape ``tokens.shape + (embedding_dim,)``.
        """
        # True where the token is NOT stored on this rank.
        mask = (tokens < self.vocab_start_idx) | (tokens >= self.vocab_end_idx)

        # Shift to local row indices; the subtraction allocates a new tensor,
        # so the in-place write below never touches the caller's ``tokens``.
        masked_tokens = tokens - self.vocab_start_idx
        # Foreign tokens are pointed at row 0 to keep the lookup in range;
        # their output is zeroed right after.
        masked_tokens[mask] = 0

        embeddings = F.embedding(masked_tokens, self.weight)
        embeddings[mask] = 0.

        # With a single rank the local result is already complete.
        if self.world_size > 1:
            torch.distributed.all_reduce(embeddings)

        return embeddings

In [None]:
from einops import rearrange

In [None]:
# Regroup the last two axes of `images` into a sequence axis (n_h n_w) and a
# feature axis (p_h p_w c), following the pattern string below.
# NOTE(review): no axis lengths (e.g. p_h=..., p_w=...) are passed, so the
# split of "(p_h n_h)" / "(p_w n_w)" is presumably under-determined — confirm
# and supply the patch sizes.
output = rearrange(
    images,
    "b c (p_h n_h) (p_w n_w) -> b (n_h n_w) (p_h p_w c)"
)

In [None]:
tokens = model.to_tokens(text)

In [None]:
logits = model(tokens)

In [None]:
probs = F.softmax(logits, dim=-1)

In [None]:
last_token_probs = probs[:, -1, :]

In [None]:
target = tokens[1:]

In [None]:
# Negative log of the probabilities selected by `target`. The probabilities
# variable defined earlier in this notebook is `last_token_probs`.
predicted_log_probs = -last_token_probs[target].log()

In [None]:
repeated_tokens = model.to_tokens(repeated_text)

In [None]:
induction_heads = [(6, 9), (4, 2)]

In [None]:
attention_patterns = []

In [None]:
# Keep only the second returned value (the cache); the first is discarded.
# NOTE(review): no input is passed here — presumably the tokens to run
# (e.g. `repeated_tokens`) are expected as an argument; confirm.
_, cache = model.run_with_cache()

In [None]:
for head_idx, layer_idx in induction_heads:
    # TODO: the loop body was never written; ``pass`` keeps the cell valid.
    pass

In [None]:
text_embeddings = model.embed(tokens)

In [None]:
positional_embeddings = model.pos_embed(tokens)

In [None]:
embedding = text_embeddings + positional_embeddings

In [None]:
residual = embedding

In [None]:
for block in model.blocks:
    residual = block(residual)

In [None]:
residual = model.ln_final(residual)

In [None]:
logits = model.unembed(residual)

In [None]:
task = torch.distributed.broadcast(x, src=0, async_op=True)

In [None]:
task.wait()

In [None]:
n_features = 5

In [None]:
interference[torch.arange(n_features), torch.arange(n_features)] = 0

In [None]:
polysemanticity = interference.pow(2).sum(dim=-1).sqrt() 

In [None]:
cache["hook_embed"]

In [None]:
# Positional-embedding activations; the key is spelled "hook_pos_embed".
cache["hook_pos_embed"]

In [None]:
linear, residual connection, layernorm, multi, mask-multi,