### Engineering

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
broadcast, gather, reduce, scatter

In [None]:
activation, gradient, optimizer states

In [None]:
tensor parallelism and pipeline parallelism

In [None]:
threaded
process
vectorization
stream

In [None]:
class f(torch.autograd.Function):
    """Identity in the forward pass; all-reduces the gradient in the backward pass."""

    @staticmethod
    def forward(ctx, input):
        """Pass ``input`` through untouched."""
        return input

    @staticmethod
    def backward(ctx, grad_output):
        """All-reduce ``grad_output`` in place via ``torch.distributed`` and return it."""
        dist = torch.distributed
        # In-place collective: after this call grad_output holds the reduced value.
        dist.all_reduce(grad_output)
        return grad_output

In [None]:
class g(torch.autograd.Function):
    """All-gather along the last dimension in forward; slice the gradient in backward.

    Forward collects the tensor of every rank and concatenates them on
    ``dim=-1``. Backward splits the incoming gradient into ``world_size``
    chunks on ``dim=-1`` and returns the chunk that belongs to this rank.
    """

    @staticmethod
    def forward(ctx, input):
        """Return the concatenation (last dim) of ``input`` from all ranks."""
        world_size = torch.distributed.get_world_size()
        # One receive buffer per rank, same shape/dtype/device as the local tensor.
        inputs = [torch.empty_like(input) for _ in range(world_size)]
        torch.distributed.all_gather(inputs, input)
        return torch.cat(inputs, dim=-1)

    @staticmethod
    def backward(ctx, grad_output):
        """Return this rank's slice of ``grad_output`` along the last dim."""
        rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()

        # The size must come from grad_output: the name `input` is not defined
        # in this method and would resolve to the Python builtin.
        dim_size = grad_output.shape[-1]
        chunk_dim_size = dim_size // world_size
        grad_split = torch.split(grad_output, chunk_dim_size, dim=-1)
        # split() yields views; hand autograd a contiguous tensor.
        return grad_split[rank].contiguous()

In [None]:
class ColumnParallelLinear(nn.Module):
    """Linear layer whose output features are partitioned across ranks.

    Each rank owns ``output_size // world_size`` output features. The forward
    pass applies ``f`` to the input, runs the local linear projection and
    applies ``g`` to the local result.

    Args:
        input_size: number of input features.
        output_size: total number of output features across all ranks.
        world_size: number of partitions the output features are split into.

    Raises:
        ValueError: if ``output_size`` is not divisible by ``world_size``.
    """

    def __init__(self, input_size, output_size, world_size):
        super().__init__()

        if output_size % world_size != 0:
            # Floor division would silently drop the remainder features.
            raise ValueError(
                f"output_size ({output_size}) must be divisible by "
                f"world_size ({world_size})"
            )

        self.input_size = input_size
        # NOTE: attribute name (including its spelling) is kept as-is so that
        # existing code reading it keeps working.
        self.output_size_per_patrition = output_size // world_size

        self.weight = nn.Parameter(torch.empty(
            self.output_size_per_patrition,
            self.input_size
        ))
        self.bias = nn.Parameter(torch.empty(
            self.output_size_per_patrition
        ))

        # torch.empty returns uninitialised memory; give the parameters
        # well-defined starting values.
        nn.init.xavier_uniform_(self.weight)
        nn.init.zeros_(self.bias)

    def forward(self, x):
        """Project ``x`` with the local shard and pass the result through ``g``."""
        input_parallel = f.apply(x)
        output_parallel = F.linear(input_parallel, self.weight, self.bias)
        outputs = g.apply(output_parallel)
        return outputs

In [None]:
- clock cycle 1: F_{1, 1}
- clock cycle 2: F_{1, 2}, F_{2, 1}
- clock cycle 3: F_{1, 3}, F_{2, 2}, F_{3, 1}

In [None]:
def compute_forward_pass_using_data_parallelism(
    model, inout, device_ids, output_id
):
    """Run one data-parallel forward pass and gather the outputs.

    Args:
        model: module to replicate.
            NOTE(review): ``replicate`` presumably needs the module's
            parameters to already live on a CUDA device - confirm with caller.
        inout: input batch; it is split along dim 0 across ``device_ids``.
        device_ids: devices to scatter the input to and replicate the model on.
        output_id: device on which the per-replica outputs are gathered.

    Returns:
        The per-replica outputs gathered on ``output_id``.
    """
    # Use the `inout` argument; the bare name `input` is the Python builtin.
    inputs = nn.parallel.scatter(inout, device_ids)

    # A small batch can produce fewer chunks than devices; parallel_apply
    # needs exactly one replica per chunk.
    used_devices = device_ids[:len(inputs)]
    models = nn.parallel.replicate(model, used_devices)

    logit = nn.parallel.parallel_apply(models, inputs, devices=used_devices)

    logits = nn.parallel.gather(logit, output_id)
    return logits

In [None]:
micro-batch n, micro-batch n-1,....

In [None]:
step 1: keep an fp32 master copy of the weights and make an fp16 copy of it
step 2: compute the loss in fp16
step 3: scale the loss
step 4: compute the gradient in fp16 with respect to the scaled loss
step 5: unscale the gradient
step 6: cast the gradient to fp32
step 7: update parameters using fp32 of the gradient and fp32 of the weight

In [None]:
def compute_total_memory(model):
    """Return the number of bytes occupied by the parameters of ``model``.

    Each parameter contributes ``numel() * element_size()`` bytes, so the
    result depends on the parameters' dtypes. Gradients, buffers and any
    optimizer state are not counted.

    Args:
        model: object exposing ``parameters()`` (e.g. an ``nn.Module``).

    Returns:
        Total parameter size in bytes as an ``int`` (0 if there are none).
    """
    # `param.size` is a bound method, not a number; count elements explicitly.
    return sum(
        param.numel() * param.element_size()
        for param in model.parameters()
    )

In [None]:
rank = torch.distributed.get_rank()

# broadcast is a collective operation: every rank of the group has to call it.
# Rank 0 (src) sends `x`; all other ranks receive into their own `x`.
# Guarding the call with `rank == 0` would leave rank 0 waiting for peers
# that never join.
torch.distributed.broadcast(x, src=0)

In [None]:
list_ranks = []

In [None]:
num_tensor_model_parallel_groups = 8

In [None]:
tensor_model_parallel_size = 2

In [None]:
rank = torch.distributed.get_rank()

In [None]:
# Build one process group per block of `tensor_model_parallel_size`
# consecutive ranks and remember the group the current rank belongs to.
for i in range(num_tensor_model_parallel_groups):
    start_rank = i * tensor_model_parallel_size
    # Each group spans `tensor_model_parallel_size` ranks, not a single one.
    ranks = range(start_rank, start_rank + tensor_model_parallel_size)

    list_ranks.append(ranks)

    # new_group must be entered by every process, including those that are
    # not members of the group being created.
    new_group = torch.distributed.new_group(ranks=list(ranks))

    if rank in ranks:
        group = new_group
        

In [None]:
list_ranks

[range(0, 1),
 range(2, 3),
 range(4, 5),
 range(6, 7),
 range(8, 9),
 range(10, 11),
 range(12, 13),
 range(14, 15)]

In [None]:
- task of partition j must be on the j-th device
- F_{m, n} must be completed before F_{m+1, n}
- B_{m, n} must be completed before B_{m-1, n}

In [None]:
- clock cycle 1: b_{m, n}
- clock cycle 2: b_{m, n-1}, b_{m-1, n}
- clock cycle 3: b_{m, n-2}, b_{m-1, n-1}, b_{m-2, n}

In [None]:
step 1: record
step 2: split
step 3: move to 

### ML Engineering

In [None]:
import subprocess

In [None]:
subprocess.run(["echo", "hello world"])

hello world


CompletedProcess(args=['echo', 'hello world'], returncode=0)

In [None]:
blue = x[[1, 3], [0, 2]]

In [None]:
from prefect.deployments import Deployment

In [None]:
deployment_dev = Deployment.build_from_flow(
    training,
    name="model_training-dev"
)

### Sc

In [None]:
eye > optic nerve > visual cortex

In [None]:
neuron 1 > axon > synapse > dendrite

### AI

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
from torch.optim import Optimizer

In [None]:
class CustomOptimizer(Optimizer):
    """Toy optimizer that scales every trainable parameter by 6.9 per step.

    Args:
        params: iterable of parameters (or parameter-group dicts) to update.
    """

    def __init__(self, params):
        defaults = dict()
        # super() has to be called; `super.__init__` is the builtin type's method.
        super().__init__(params, defaults)

    def step(self, closure=None):
        """Update all parameters once.

        Args:
            closure: optional callable that re-evaluates the model and
                returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` if it was not given.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            # Each group is a dict; its tensors live under the "params" key.
            for p in group["params"]:
                if not p.requires_grad:
                    continue
                p.data.mul_(6.9)

        return loss

In [None]:
from transformers import DataCollatorForLanguageModeling

In [None]:
collator = DataCollatorForLanguageModeling(tokenizer, mlm=True)

In [None]:
output = collator([tokenized_text])

In [None]:
from ray import tune

In [None]:
def target(param):
    """Evaluate ``objective`` at the sampled point and report the score.

    Args:
        param: mapping with the keys ``"x"`` and ``"y"``.
    """
    score = objective(param["x"], param["y"])
    tune.report(score=score)

In [None]:
result = tune.run(
    target,
    config=config
)

In [None]:
class CustomOptimizer(Optimizer):
    """Toy optimizer that scales every trainable parameter by 6.9 per step.

    Args:
        params: iterable of parameters (or parameter-group dicts) to update.
    """

    def __init__(self, params):
        defaults = dict()
        super().__init__(params, defaults)

    def step(self, closure=None):
        """Update all parameters once.

        Args:
            closure: optional callable that re-evaluates the model and
                returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` if it was not given.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            # A param group is a dict; iterating it directly yields its keys.
            # The tensors themselves are stored under "params".
            for p in group["params"]:
                if not p.requires_grad:
                    continue

                p.data.mul_(6.9)

        return loss

In [None]:
import torch.distributed.rpc as rpc

In [None]:
rpc.remote("worker_1", create_tensor)

In [None]:
for param in model.parameters():
    param.requires_grad = False

In [None]:
# Re-enable gradients for every block whose index is 6 or higher.
for i, block in enumerate(model.blocks):
    if i < 6:
        # Blocks 0-5 are left as they are.
        continue
    for param in block.parameters():
        param.requires_grad = True

In [None]:
for param in model.transformer.ln_final.parameters():
    param.requires_grad = True

In [None]:
for param in model.lm_head.parameters():
    param.requires_grad = True

In [None]:
tokens = model.to_tokens(text)

In [None]:
logits = model(tokens)

In [None]:
# Logits at the final sequence position (uses `logits` from the model call).
last_token_logits = logits[:, -1, :]

In [None]:
# Normalise over the last dimension explicitly; relying on the implicit
# dimension choice of log_softmax is deprecated.
last_token_log_prob = F.log_softmax(last_token_logits, dim=-1)

In [None]:
target_token = tokens[1:]

In [None]:
# Negative log-probability picked out at the target indices
# (`target_token` is the name assigned in the preceding cell).
predicted_log_probs = -torch.gather(last_token_log_prob, dim=-1, index=target_token)

In [None]:
from einops import einsum

In [None]:
interference[torch.arange(5), torch.arange(5)] = 0.

In [None]:
interference.pow(2).sum(dim=-1).sqrt()

In [None]:
class Model(nn.Module):
    """Two-stage network whose stages can live on different devices.

    ``net1`` (Linear 69 -> 420 followed by ReLU) is placed on ``device_0`` and
    ``net2`` (Linear 420 -> 69) on ``device_1``. The forward pass moves the
    intermediate activation to the device of ``net2`` between the two stages.

    Args:
        device_0: device for the first stage (default ``"cuda:0"``).
        device_1: device for the second stage (default ``"cuda:1"``).
    """

    def __init__(self, device_0="cuda:0", device_1="cuda:1"):
        # nn.Module must be initialised before sub-modules can be assigned.
        super().__init__()

        device_0 = torch.device(device_0)
        device_1 = torch.device(device_1)

        self.net1 = nn.Sequential(
            nn.Linear(69, 420),
            nn.ReLU()
        ).to(device_0)

        # Named `net2` so it matches both `net1` and the attribute read in forward().
        self.net2 = nn.Sequential(
            nn.Linear(420, 69)
        ).to(device_1)

    def forward(self, x):
        """Run ``x`` through both stages and return the output of ``net2``."""
        x = self.net1(x)
        next_device = next(self.net2.parameters()).device

        # Tensor.to() is not in-place: the result has to be kept.
        x = x.to(next_device)

        x = self.net2(x)
        return x

In [None]:
text_embeddings = cache["hook_embed"]

In [None]:
positional_embeddings = cache["hook_pos_embed"]

In [None]:
components = []

In [None]:
components.append(text_embeddings)

In [None]:
components.append(positional_embeddings)

In [None]:
from transformer_lens.utils import get_act_name

In [None]:
# For layers 0-3, look up the "attn_out" and "mlp_out" activations in the
# cache and append them to `components` (attention first, then MLP, per layer).
for layer in range(4):
    # get_act_name builds the key under which the activation is cached.
    attn_name = get_act_name("attn_out", layer)
    mlp_name = get_act_name("mlp_out", layer)
    components.append(cache[attn_name])
    components.append(cache[mlp_name])

In [None]:
class Optimizer:
    """Minimal gradient-descent optimizer: ``p <- p - lr * p.grad``.

    Args:
        params: iterable of parameters to update.
        lr: step size.
    """

    def __init__(self, params, lr):
        # Materialise the iterable: a generator such as model.parameters()
        # would otherwise be exhausted after the first step().
        self.params = list(params)
        self.lr = lr

    def step(self):
        """Apply one update to every parameter that requires and has a gradient."""
        for p in self.params:
            # Parameters that took no part in the backward pass have no gradient.
            if not p.requires_grad or p.grad is None:
                continue
            # Keyword `alpha` form; the positional (scalar, tensor) overload
            # of add_ is deprecated.
            p.data.add_(p.grad.data, alpha=-self.lr)

In [None]:
interference[torch.arange(5), torch.arange(5)] = 0

In [None]:
poly = interference.pow(2).sum(dim=-1).sqrt()