### Engineering

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
world_size = 16

In [None]:
num_gpus = 4

In [None]:
process_to_gpu = []

In [None]:
# Assign processes to GPUs round-robin: rank r runs on GPU r mod num_gpus.
process_to_gpu.extend(rank % num_gpus for rank in range(world_size))

In [None]:
process_to_gpu

[0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]

In [None]:
# Order a copy of the parameters from most to fewest elements.
sorted_param = list(param_list)
sorted_param.sort(key=lambda p: p.numel(), reverse=True)

In [None]:
# Per-rank bookkeeping: running element count and the parameters assigned.
numel_per_rank = [0] * world_size
# Built with a comprehension so every rank gets its own, independent list.
param_per_rank = [list() for _rank in range(world_size)]

In [None]:
# Greedy balancing: hand each parameter to the rank that currently holds the
# fewest elements (ties go to the lowest rank, as list.index returns the
# first match).
for param in sorted_param:
    next_rank = numel_per_rank.index(min(numel_per_rank))
    param_per_rank[next_rank].append(param)
    # Track the element count, not the tensor itself, so the counters stay
    # plain integers that min() can compare.
    numel_per_rank[next_rank] += param.numel()

In [None]:
class RowParallelLinear(nn.Module):
    """Linear layer whose weight is sharded along the input dimension.

    Every rank holds an ``(output_size, input_size // world_size)`` slice of
    the weight. In ``forward`` each rank multiplies its own slice of the
    input's last dimension with its weight shard, the partial products are
    summed across ranks with an all-reduce, and the bias is added once.

    ``torch.distributed`` must be initialized before the layer is built or
    called. Parameters are allocated with ``torch.empty`` and are therefore
    uninitialized until the caller fills them.

    Args:
        input_size: Full (unsharded) size of the input's last dimension.
        output_size: Size of the output's last dimension.

    Raises:
        ValueError: If ``input_size`` is not divisible by the world size.
    """

    def __init__(self, input_size, output_size):
        super().__init__()

        world_size = torch.distributed.get_world_size()
        if input_size % world_size != 0:
            raise ValueError(
                f"input_size ({input_size}) must be divisible by the "
                f"world size ({world_size})"
            )
        self.input_size_per_partition = input_size // world_size
        self.output_size = output_size

        self.weight = nn.Parameter(torch.empty(
            self.output_size,
            self.input_size_per_partition
        ))

        self.bias = nn.Parameter(torch.empty(
            self.output_size
        ))

    @property
    def input_size_per_partrition(self):
        """Read-only alias kept for callers using the old misspelled name."""
        return self.input_size_per_partition

    def forward(self, input):
        """Apply the sharded linear map to the full (unsharded) ``input``.

        Args:
            input: Tensor whose last dimension is split evenly across ranks.

        Returns:
            The all-reduced product plus bias; the last dimension has size
            ``output_size``.
        """
        world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()

        # Keep only this rank's slice of the last dimension.
        dim_size = input.shape[-1]
        dim_size_per_partition = dim_size // world_size
        input_chunks = torch.split(
            input,
            split_size_or_sections=dim_size_per_partition,
            dim=-1
        )
        input_parallel = input_chunks[rank]

        # The bias is deliberately left out here: the all-reduce sums the
        # partial products of every rank, so adding the bias beforehand
        # would count it world_size times.
        output_parallel = F.linear(input_parallel, self.weight)

        # NOTE(review): this in-place all_reduce is not an autograd-aware
        # op; confirm gradient handling before using this for training.
        torch.distributed.all_reduce(output_parallel)

        return output_parallel + self.bias

In [None]:
# Walk the ranks one pipeline-stage block at a time; inside a block, every
# tensor_model_parallel_size-th rank starting at offset j forms one group.
for i in range(pipeline_model_parallel_size):
    start_rank = i*num_pipeline_model_parallel_groups
    end_rank = (i+1)*num_pipeline_model_parallel_groups
    
    for j in range(tensor_model_parallel_size):
        ranks = list(range(start_rank+j, end_rank, tensor_model_parallel_size))

In [None]:
parameters, forward pass, optimizer

In [None]:
registers < cache < main memory < hard drive < backup

In [None]:
# Walk the ranks one pipeline-stage block at a time; inside a block, every
# tensor_model_parallel_size-th rank starting at offset j forms one group.
# NOTE(review): the block width is num_pipeline_model_parallel_groups
# (presumably world_size // pipeline_model_parallel_size — confirm), not the
# number of stages; the two only coincide when world_size equals
# pipeline_model_parallel_size squared.
for i in range(pipeline_model_parallel_size):
    start_rank = i*num_pipeline_model_parallel_groups
    end_rank = (i+1)*num_pipeline_model_parallel_groups
    
    for j in range(tensor_model_parallel_size):
        ranks = list(range(
            start_rank+j,
            end_rank,
            tensor_model_parallel_size
        ))

### ML Engineering

In [None]:
from metaflow import FlowSpec, profile, project, step

In [None]:
@project(name="project_69")
class TrainFlow(FlowSpec):
    """Three-step Metaflow flow: ``start`` -> ``train`` -> ``end``."""

    @step
    def start(self):
        """Entry point; hands over to the training step."""
        self.next(self.train)
    
    @step
    def train(self):
        """Run training, then transition to the final step."""
        # NOTE(review): this calls the module-level ``train`` function (not
        # this method); it must be defined or imported elsewhere.
        train()
        # Every step except ``end`` has to declare its successor.
        self.next(self.end)
    
    @step
    def end(self):
        """Terminal step; nothing left to do."""
        pass

In [None]:
from functools import reduce

In [None]:
reduce(lambda x, y: x + y, numbers)

In [None]:
region > vpc > az > subnet > resource

### MechInterp

In [None]:
import pysvelte

In [None]:
induction_heads = [(6, 9), (4, 2)]

In [None]:
from transformer_lens.utils import get_act_name

In [None]:
attn_patterns = []

In [None]:
# run_with_cache returns (model output, activation cache); to_tokens only
# tokenizes and would not yield this pair.
repeated_logits, repeated_activations = model.run_with_cache(repeated_text)

In [None]:
batch_idx = 0

In [None]:
# NOTE(review): assumes each entry of induction_heads is a (layer, head)
# pair, the usual TransformerLens ordering — confirm against where the list
# is defined.
for layer_idx, head_idx in induction_heads:
    hook_name = get_act_name("attn", layer_idx)
    attn_patterns.append(repeated_activations[hook_name][batch_idx, head_idx])

In [None]:
act_head = activations[1, 2]

In [None]:
cache[act_name]

In [None]:
correct_direction = unembedding[:, correct_token]
incorrect_direction = unembedding[:, incorrect_token]

In [None]:
text_embeddings = model.embed(tokens)
positional_embeddings = model.pos_embed(tokens)

In [None]:
# Sum of the text and positional embeddings.
embeddings = text_embeddings + positional_embeddings

In [None]:
residual = embeddings

In [None]:
for block in model.blocks:
    residual = block(residual)

In [None]:
residual = model.ln_final(residual)

In [None]:
# Unembed the final residual stream.
logits = model.unembed(residual)

### AI

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
ray.wait()

In [None]:
# Named `stream` because the following cell enters torch.cuda.stream(stream).
stream = torch.cuda.Stream(device=device)

In [None]:
# Select `device`, then make `stream` current while the mean is computed.
with torch.cuda.device(device), torch.cuda.stream(stream):
    mean = xs.mean(dim=-1)

In [None]:
nn.Flatten(start_dim=1, end_dim=3)

In [None]:
rref.local_value()

In [None]:
class EncoderLayer(nn.Module):
    """Encoder block: attention, then feed-forward, each followed by a norm.

    The block wires ``MultiHeadAttention``, two ``ResidualLayerNorm`` modules
    and a ``PositionWiseFeedForward``; those classes are defined elsewhere.

    Args:
        d_model: Passed to every sub-module (presumably the model width).
        n_heads: Passed to ``MultiHeadAttention``.
        d_ff: Passed to ``PositionWiseFeedForward``.
        dropout: Passed to the norm and feed-forward sub-modules.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, n_heads)
        self.norm_1 = ResidualLayerNorm(d_model, dropout)
        self.norm_2 = ResidualLayerNorm(d_model, dropout)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff, dropout)

    def forward(self, embeddings):
        """Return the block output for ``embeddings``."""
        # The attention module returns a pair; only its first element is used.
        attended, _attn_weights = self.mha(embeddings)
        after_attention = self.norm_1(attended, embeddings)
        transformed = self.feed_forward(after_attention)
        return self.norm_2(transformed, after_attention)

In [None]:
def probability_scores(image_embeddings, text_embeddings):
    """Return softmax-normalised cosine similarities of images versus texts.

    Args:
        image_embeddings: Tensor of shape ``(num_images, dim)``.
        text_embeddings: Tensor of shape ``(num_texts, dim)``.

    Returns:
        Tensor of shape ``(num_images, num_texts)``; each row is a
        probability distribution over the texts.
    """
    # Normalise each embedding vector on its own (along the last dimension),
    # so the matrix product below yields cosine similarities.
    image_embeddings = F.normalize(image_embeddings, dim=-1)
    text_embeddings = F.normalize(text_embeddings, dim=-1)

    # Compare the normalised embeddings, not their norms.
    similarities = image_embeddings @ text_embeddings.T
    probs = F.softmax(similarities, dim=-1)
    return probs