### Engineering

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
intra-layer, inter-layer, data

In [None]:
thread
process
vectorization
stream

In [None]:
import time

In [None]:
def profile_times(model, batch):
    """Measure forward + backward wall-clock time for every layer of ``model``.

    Layers are executed in order and each one consumes the outputs of the
    previous layer, so every layer is timed on inputs of the shape it sees
    in a real forward pass.

    Args:
        model: Iterable of ``nn.Module`` layers applied sequentially
            (for example an ``nn.Sequential``).
        batch: Iterable of input tensors for the first layer.

    Returns:
        dict mapping each layer object to the elapsed time in seconds.

    Note:
        Gradients are computed with ``torch.autograd.grad`` so the ``.grad``
        fields of the parameters are left untouched. CUDA kernels are
        launched asynchronously; when profiling on a GPU, synchronize the
        device around this call to get meaningful numbers.
    """
    records = {}
    inputs = list(batch)

    for layer in model:
        # Cut the graph between layers so the backward pass below only
        # covers the layer currently being timed.
        inputs = [x.detach().requires_grad_(x.requires_grad) for x in inputs]
        grad_targets = [p for p in layer.parameters() if p.requires_grad]
        grad_targets.extend(x for x in inputs if x.requires_grad)

        start_time = time.perf_counter()
        outputs = [layer(x) for x in inputs]

        outputs_with_grad = [y for y in outputs if y.requires_grad]
        if outputs_with_grad and grad_targets:
            # Outputs are usually non-scalar, so explicit grad_outputs are
            # required; their values are irrelevant for timing purposes.
            torch.autograd.grad(
                outputs_with_grad,
                grad_targets,
                grad_outputs=[y.detach() for y in outputs_with_grad],
                allow_unused=True,
            )

        records[layer] = time.perf_counter() - start_time
        inputs = outputs

    return records

In [None]:
class RowParallelLinear(nn.Module):
    """Linear layer whose weight is partitioned along the input dimension.

    Every rank holds a ``(output_size, input_size // world_size)`` slice of
    the weight. In ``forward`` each rank multiplies its slice of the input
    by its slice of the weight, and the partial products are summed across
    ranks with an all-reduce.

    Args:
        input_size: Size of the last dimension of the full (unsplit) input.
        output_size: Size of the last dimension of the output.

    Requires the default ``torch.distributed`` process group to be
    initialised before construction.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        world_size = torch.distributed.get_world_size()

        # The attribute name keeps its original spelling so existing
        # callers that read it keep working.
        self.input_size_per_patrition = input_size // world_size
        self.output_size = output_size

        self.weight = nn.Parameter(torch.empty(
            self.output_size,
            self.input_size_per_patrition
        ))
        self.bias = nn.Parameter(torch.empty(
            self.output_size
        ))

        # torch.empty returns uninitialised memory; give the parameters
        # defined values. A zero bias is also identical on every rank.
        nn.init.xavier_normal_(self.weight)
        nn.init.zeros_(self.bias)

    def forward(self, input):
        """Return ``input @ W_full.T + bias`` computed across all ranks.

        Args:
            input: Tensor whose last dimension is the full input size; the
                same tensor is expected on every rank.

        Returns:
            Tensor with last dimension ``output_size``.
        """
        rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()

        # Step 1: keep only this rank's slice of the input features.
        chunk_size = input.shape[-1] // world_size
        input_parallel = torch.split(input, chunk_size, dim=-1)[rank]

        # Step 2: partial product for this partition. The bias is left out
        # here on purpose: the all-reduce below sums over ranks, so adding
        # it now would count it world_size times.
        output_parallel = F.linear(input_parallel, self.weight)

        # Step 3: sum the partial products of all ranks (in place).
        torch.distributed.all_reduce(output_parallel)

        return output_parallel + self.bias

In [None]:
step 1: split the input
step 2: compute the output of each partition
step 3: all reduce

In [None]:
# Global ranks that are passed to torch.distributed.new_group in a later cell.
ranks = [0, 1, 3, 5]

In [None]:
# Sub-group handle; remains None on processes whose rank is not in `ranks`.
group = None

In [None]:
# Rank of this process as reported by torch.distributed.
rank = torch.distributed.get_rank()

In [None]:
# torch.distributed.new_group is a collective over the default group: every
# process has to call it, including ranks that will not be members. Calling
# it only on the member ranks leaves them waiting for the others.
subgroup = torch.distributed.new_group(ranks=ranks)
if rank in ranks:
    # Only members keep the handle; on other ranks `group` is left unchanged.
    group = subgroup

In [None]:
# Broadcast `x` from rank 0 within the sub-group. Processes where `group`
# is still None skip the call.
# NOTE(review): `x` is not defined in the visible cells — confirm it exists.
if group is not None:
    torch.distributed.broadcast(x, src=0, group=group)

In [None]:
class VocabParallelEmbedding(nn.Module):
    """Embedding table partitioned along the vocabulary dimension.

    Each rank stores ``num_embeddings // world_size`` consecutive rows of
    the table. In ``forward`` a rank looks up only the token ids that fall
    in its range, contributes zeros for all other ids, and the per-rank
    results are summed with an all-reduce.

    Args:
        num_embeddings: Size of the full vocabulary.
        embedding_dim: Size of each embedding vector.

    Requires the default ``torch.distributed`` process group to be
    initialised before construction.
    """

    def __init__(self, num_embeddings, embedding_dim):
        super().__init__()
        world_size = torch.distributed.get_world_size()
        # The attribute name keeps its original spelling so existing
        # callers that read it keep working.
        self.num_embedding_per_patrition = num_embeddings // world_size
        self.embedding_dim = embedding_dim

        self.embedding = nn.Embedding(
            self.num_embedding_per_patrition, embedding_dim
        )
        self.start_vocab_idx, self.end_vocab_idx = self.get_vocab_range(
            num_embeddings
        )

    def get_vocab_range(self, num_embeddings):
        """Return the half-open ``(start, end)`` vocab id range of this rank."""
        world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()
        num_embedding_per_patrition = num_embeddings // world_size
        start_vocab_idx = rank * num_embedding_per_patrition
        end_vocab_idx = start_vocab_idx + num_embedding_per_patrition

        return start_vocab_idx, end_vocab_idx

    def forward(self, input):
        """Embed token ids using the rows owned by all ranks.

        Args:
            input: Integer tensor of token ids, any shape.

        Returns:
            Tensor of shape ``input.shape + (embedding_dim,)``.
        """
        # True where the token id belongs to another rank's partition.
        mask = (input < self.start_vocab_idx) | (input >= self.end_vocab_idx)

        # Shift to local row indices; foreign ids are pointed at row 0 so
        # the lookup stays in bounds (their rows are zeroed afterwards).
        local_input = (input - self.start_vocab_idx).masked_fill(mask, 0)

        output_parallel = self.embedding(local_input)
        # Zero the rows of every masked position, whatever the rank of
        # `input`; the mask is broadcast over the embedding dimension.
        output_parallel = output_parallel.masked_fill(mask.unsqueeze(-1), 0.0)

        # Each id is owned by exactly one rank, so the sum reassembles the
        # full lookup.
        torch.distributed.all_reduce(output_parallel)

        return output_parallel

In [None]:
class f(torch.autograd.Function):
    """Identity in the forward pass, all-reduce of the gradient in backward.

    Used at the entry of a tensor-parallel region: every rank receives the
    same input, and the gradients the ranks compute for that input are
    summed on the way back.
    """

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` unchanged."""
        return input

    @staticmethod
    def backward(ctx, grad_output):
        """Return the sum of ``grad_output`` over all ranks."""
        # Gradients handed to backward must not be modified in place (other
        # consumers may hold the same tensor), so reduce a private copy.
        grad_input = grad_output.clone()
        torch.distributed.all_reduce(grad_input)
        return grad_input

In [None]:
class g(torch.autograd.Function):
    """All-gather along the last dimension in forward, slice in backward."""

    @staticmethod
    def forward(ctx, input):
        """Concatenate the ``input`` of every rank along the last dimension."""
        n_ranks = torch.distributed.get_world_size()
        # One receive buffer per rank, filled by the collective below.
        buffers = [torch.zeros_like(input) for _ in range(n_ranks)]
        torch.distributed.all_gather(buffers, input)
        return torch.cat(buffers, dim=-1)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the slice of ``grad_output`` that belongs to this rank."""
        n_ranks = torch.distributed.get_world_size()
        my_rank = torch.distributed.get_rank()
        # Width of one rank's share of the last dimension.
        width = grad_output.shape[-1] // n_ranks
        pieces = grad_output.split(width, dim=-1)
        return pieces[my_rank]
In [None]:
class ColumnParallelLinear(nn.Module):
    """Linear layer whose weight is partitioned along the output dimension.

    Every rank holds a ``(output_size // world_size, input_size)`` slice of
    the weight and the matching slice of the bias. ``forward`` computes the
    local slice of the output and gathers the slices of all ranks along the
    last dimension.

    Args:
        input_size: Size of the last dimension of the input.
        output_size: Size of the last dimension of the full output.
        world_size: Number of ranks the output dimension is split across.
    """

    def __init__(self, input_size, output_size, world_size):
        super().__init__()

        self.input_size = input_size
        # The attribute name keeps its original spelling so existing
        # callers that read it keep working.
        self.output_size_per_patrition = output_size // world_size

        self.weight = nn.Parameter(torch.empty(
            self.output_size_per_patrition,
            self.input_size
        ))
        self.bias = nn.Parameter(torch.empty(
            self.output_size_per_patrition
        ))

        # torch.empty returns uninitialised memory; give the parameters
        # defined values before first use.
        nn.init.xavier_normal_(self.weight)
        nn.init.zeros_(self.bias)

    def forward(self, input):
        """Return the full-width output assembled from all ranks.

        Args:
            input: Tensor whose last dimension is ``input_size``.

        Returns:
            The per-rank outputs concatenated along the last dimension.
        """
        # `f` is the identity in forward and all-reduces the gradient in backward.
        input_parallel = f.apply(input)
        output_parallel = F.linear(input_parallel, self.weight, self.bias)
        # `g` all-gathers the per-rank slices along the last dimension.
        output = g.apply(output_parallel)
        return output

### ML Engineering

In [None]:
from prefect import flow

In [None]:
@flow
def my_flow():
    # Call print_values once per value, in this order.
    for value in (111, 222):
        print_values(value)

In [None]:
from functools import lru_cache

### AI

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
from torch.profiler import profile, ProfilerActivity

In [None]:
# Record CPU activity while `hardcore()` runs; the profiler object is bound to `prof`.
# NOTE(review): `hardcore` is not defined in the visible cells.
with profile(
    activities=[ProfilerActivity.CPU]
) as prof:
    hardcore()

In [None]:
split, truncation, sliding

In [None]:
from ray import tune
from ray.tune.schedulers import HyperBandScheduler

In [None]:
# Launch a Ray Tune run of `objective` over `search_space` with a HyperBand scheduler.
# NOTE(review): `objective` and `search_space` are not defined in the visible cells.
tune.run(
    objective,
    config=search_space,
    scheduler=HyperBandScheduler()
)

In [None]:
# Create a CUDA stream on `device` (`device` is defined outside the visible cells).
stream = torch.cuda.Stream(device=device)

In [None]:
# Select `device`, make `stream` the current stream, and compute the mean of
# `xs` inside that context.
# NOTE(review): `xs` is not defined in the visible cells.
with torch.cuda.device(device):
    with torch.cuda.stream(stream):
        mean = xs.mean()

In [None]:
from torch.profiler import schedule

In [None]:
# Profile CPU activity over 8 calls of `model(inputs)`. `prof.step()` is called
# after every iteration so the profiler schedule can advance.
# NOTE(review): `schedular_params` (sic), `model` and `inputs` are defined
# outside the visible cells.
with profile(
    activities=[ProfilerActivity.CPU],
    schedule=schedule(**schedular_params)
) as prof:
    for idx in range(8):
        model(inputs)
        prof.step()

In [None]:
import jax

In [None]:
# jax.grad returns a function, which has to be called with `x` (it cannot be
# subscripted). With has_aux=True, `square` must return a (value, aux) pair and
# the call yields (gradient, aux).
jax.grad(square, has_aux=True)(x)

In [None]:
# Pick the entry of `activations` at index (1, 2).
# NOTE(review): what the two positions denote is not shown here — TODO confirm.
act_head = activations[1, 2]

In [None]:
from torchvision import transforms as tfms

In [None]:
transform = tfms.Compose([
    tfms.ToTensor(),
    # The original cell ended in a dangling `tfm.` fragment (a syntax error).
    # Completed to match the finished version of this pipeline that appears
    # later in the notebook — TODO confirm the intended transform.
    tfms.Normalize(0.3, 0.9),
])

In [None]:
# Look up the value stored in `cache` under `act_name`.
cache[act_name]

In [None]:
# Entry of `act_head` at index (1, 2).
act_head[1, 2]

In [None]:
# Map `correct_token` to its residual direction via the model's helper method.
correct_residual_direction = model.tokens_to_residual_directions(correct_token)

In [None]:
# Map `incorrect_token` to its residual direction via the model's helper method.
incorrect_residual_direction = model.tokens_to_residual_directions(incorrect_token)

In [None]:
# Difference between the correct-token and incorrect-token directions.
logit_diff_residual = correct_residual_direction - incorrect_residual_direction

In [None]:
# NOTE(review): `rpc` is not imported in the visible cells, and the call is
# made without arguments — torch.distributed.rpc.rpc_sync expects at least a
# destination worker and a callable. Left as written; confirm the intended call.
rpc.rpc_sync()

In [None]:
step 1: tokenize the prompt
step 2: act
step 3: exec
step 4: 

In [None]:
from torch.distributions import Categorical

In [None]:
# Categorical distribution parameterised by `probs` (defined outside the visible cells).
dist = Categorical(probs=probs)

In [None]:
# Draw a sample from the distribution.
action = dist.sample()

In [None]:
# Log-probability of the sampled `action` under `dist`.
log_prob = dist.log_prob(action)

In [None]:
# Select from `unembedding` along its second axis at each token's index.
# NOTE(review): presumably `unembedding` is (d_model, vocab) — TODO confirm.
correct_residual_direction = unembedding[:, correct_token]
incorrect_residual_direction = unembedding[:, incorrect_token]

In [None]:
# Difference of the two directions; the value is displayed, not stored.
correct_residual_direction - incorrect_residual_direction

In [None]:
from torchvision import transforms as tfms

In [None]:
# Convert to a tensor, then normalise with mean 0.3 and std 0.9.
_to_tensor = tfms.ToTensor()
_normalize = tfms.Normalize(mean=0.3, std=0.9)
transform = tfms.Compose([_to_tensor, _normalize])

In [None]:
from einops import einsum

In [None]:
# For each leading index b, contract the shared axis d between every pair of
# rows (f1, f2) of W, giving a result with axes (b, f1, f2).
interference = einsum(
    W, W,
    "b f1 d, b f2 d -> b f1 f2"
)

In [None]:
from einops import reduce

In [None]:
# Average over the `w` axis, leaving one value per `h` entry.
induction_score = reduce(induction_stripe, "h w -> h", reduction="mean")

In [None]:
# Collects the cache entries appended by the loop in the following cell.
outputs = []

In [None]:
# Gather the attention `hook_result` entry of each of the first two layers.
# The key prefix is "blocks." (plural), the same prefix the later version of
# this loop in the notebook uses; the singular "block." looks like a typo.
for layer in range(2):
    outputs.append(cache[f"blocks.{layer}.attn.hook_result"])

In [None]:
import gymnasium as gym

In [None]:
# Three independent CartPole-v1 environments stepped synchronously; each
# factory builds its own environment instance.
env_factories = [lambda: gym.make("CartPole-v1") for _ in range(3)]
envs = gym.vector.SyncVectorEnv(env_factories)

In [None]:
from torch.distributions import Categorical

In [None]:
# Build a categorical distribution from `probs` (defined outside the visible cells).
dist = Categorical(probs=probs)

In [None]:
# Sample an action from `dist`.
action = dist.sample()

In [None]:
# The original line stopped at the incomplete attribute `dist.log_`, which
# raises AttributeError, and assigned it over `action`. Compute the
# log-probability of the sampled action and keep `action` intact.
log_prob = dist.log_prob(action)

In [None]:
# NOTE(review): called without arguments — torch.jit.script expects the
# function or nn.Module to compile. Left as written; confirm the intended target.
torch.jit.script() 

In [None]:
# Index `activations` with the key ("attn", 2).
# NOTE(review): presumably (activation name, layer index) — TODO confirm.
activations["attn", 2]

In [None]:
# Log-probability of `action` under `dist`; the value is displayed, not stored.
dist.log_prob(action)

In [None]:
# Fresh list for the cache entries appended by the loop in the following cell.
outputs = []

In [None]:
# Gather the attention `hook_result` entry of each of the first three layers.
# The key is built from the loop variable `layer`; the original f-string
# referenced `block`, which this loop never defines.
for layer in range(3):
    outputs.append(cache[f"blocks.{layer}.attn.hook_result"])

In [None]:
# Concatenate the collected tensors along the second-to-last dimension.
outputs = torch.cat(outputs, dim=-2)

In [None]:
class ShortcutProjection(nn.Module):
    """Projection for a residual shortcut connection.

    Maps the shortcut branch to ``out_channels`` channels and applies the
    given ``stride`` so it can be added to the output of the main branch.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels of the projected feature map.
        stride: Stride of the projection convolution.
    """

    def __init__(self, in_channels, out_channels, stride):
        super().__init__()
        # nn.Conv2d requires a kernel size. A 1x1 kernel makes this a pure
        # per-pixel channel projection; spatial size changes only via stride.
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=stride
        )

    def forward(self, x):
        """Project ``x`` of shape (N, in_channels, H, W) with the 1x1 convolution."""
        return self.conv(x)

In [None]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable affine map.

    Args:
        features: Size of the last dimension of the input.
        eps: Constant added to the variance for numerical stability.
    """

    def __init__(self, features, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.adds = nn.Parameter(torch.zeros(features))   # learnable shift
        self.mults = nn.Parameter(torch.ones(features))   # learnable scale

    def forward(self, x):
        """Normalise ``x`` over its last dimension, then scale and shift.

        Args:
            x: Tensor whose last dimension has size ``features``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        # keepdim=True keeps the reduced axis so the statistics broadcast
        # against x. unbiased=False is the population variance, which is
        # what torch.nn.LayerNorm uses.
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        x = (x - mean) / (var + self.eps).sqrt()

        return self.adds + self.mults * x