### Engineering

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
import os

In [None]:
class MPU:
    """Set up the default ``torch.distributed`` process group for this process.

    Construction is a no-op when a default process group already exists.
    """

    def __init__(self, master_addr, master_port, backend):
        """Initialize distributed state unless it is already initialized.

        Args:
            master_addr: Value exported as the ``MASTER_ADDR`` environment variable.
            master_port: Value exported as ``MASTER_PORT`` (int or str).
            backend: Backend name forwarded to ``init_process_group``.
        """
        if not torch.distributed.is_initialized():
            self.initialize_distributed(master_addr, master_port, backend)

    def process_to_device(self, rank):
        """Select the CUDA device ``rank % device_count``; do nothing without CUDA."""
        device_count = torch.cuda.device_count()

        if device_count > 0:
            device = rank % device_count
            torch.cuda.set_device(device)

    def initialize_distributed(self, master_addr, master_port, backend):
        """Create the default process group from ``RANK`` / ``WORLD_SIZE``.

        Raises:
            KeyError: If ``RANK`` or ``WORLD_SIZE`` is missing from the environment.
        """
        if torch.distributed.is_initialized():
            return

        # os.environ[...] (not os.getenv[...]) so a missing variable fails
        # with an explicit KeyError naming the variable.
        rank = int(os.environ["RANK"])
        world_size = int(os.environ["WORLD_SIZE"])
        os.environ["MASTER_ADDR"] = master_addr
        # Environment values must be strings; accept an integer port too.
        os.environ["MASTER_PORT"] = str(master_port)

        self.process_to_device(rank)

        torch.distributed.init_process_group(
            backend=backend,
            rank=rank,
            world_size=world_size,
        )

In [None]:
class ColumnParallelLinear(nn.Module):
    """Linear layer holding one shard of the output features.

    Each instance owns ``output_size // world_size`` rows of the weight and
    the matching bias entries. ``forward`` computes the local shard and, when
    a process group with more than one rank is active, all-gathers the shards
    and concatenates them along the last dimension.
    """

    def __init__(self, input_size, output_size, world_size):
        """
        Args:
            input_size: Number of input features.
            output_size: Total number of output features across all ranks.
            world_size: Number of partitions the output features are split into.
        """
        super().__init__()
        self.input_size = input_size
        self.output_size_per_partrition = output_size // world_size

        self.weight = nn.Parameter(torch.empty(
            self.output_size_per_partrition,
            self.input_size
        ))
        self.bias = nn.Parameter(torch.empty(
            self.output_size_per_partrition
        ))

        # torch.empty returns uninitialized memory; give the parameters
        # well-defined starting values.
        nn.init.xavier_uniform_(self.weight)
        nn.init.zeros_(self.bias)

    def forward(self, input):
        """Return the (gathered) linear output for ``input``."""
        output_partrition = F.linear(input, self.weight, self.bias)

        # Nothing to gather when running without a process group.
        if not (torch.distributed.is_available() and torch.distributed.is_initialized()):
            return output_partrition

        world_size = torch.distributed.get_world_size()
        if world_size == 1:
            return output_partrition

        # NOTE(review): all_gather fills freshly allocated buffers, so the
        # gathered tensor is presumably detached from autograd -- confirm
        # before relying on gradients flowing through this layer.
        outputs = [torch.empty_like(output_partrition) for _ in range(world_size)]
        torch.distributed.all_gather(outputs, output_partrition)
        return torch.cat(outputs, dim=-1)

In [None]:
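step 1: replicate the model on every device
step 2: divide the mini-batch into micro-batches, one per replica
step 3: run the forward and backward pass on each replica
step 4: average the gradients across replicas
step 5: send the averaged gradients to all devices so every replica applies the same update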

In [None]:
tensor parallelism, pipeline parallelism

In [None]:
def compute_total_memory(model):
    """Return the number of bytes occupied by ``model``'s parameters.

    Only tensors yielded by ``model.parameters()`` are counted; each
    contributes ``numel() * element_size()`` bytes.

    Args:
        model: Object exposing a ``parameters()`` iterator of tensors.

    Returns:
        int: Total parameter size in bytes (0 for a model without parameters).
    """
    return sum(
        param.numel() * param.element_size()
        for param in model.parameters()
    )

In [None]:
file system, shared memory, message passing

In [None]:
def _broadcast(input):
    return input.clone()

In [None]:
def _reduce(grad_output):
    world_size = torch.distributed.get_world_size(group=process_group)
    
    if world_size == 1:
        return grad_output

    torch.distributed.all_reduce(grad_output)
    return grad_output

In [None]:
class Broadcast(torch.autograd.Function):
    """Autograd function pairing ``_broadcast`` (forward) with ``_reduce`` (backward)."""

    @staticmethod
    def forward(ctx, input):
        """Forward pass: delegate to ``_broadcast``; ``ctx`` is not used."""
        replicated = _broadcast(input)
        return replicated

    @staticmethod
    def backward(ctx, grad_output):
        """Backward pass: delegate the incoming gradient to ``_reduce``."""
        reduced_grad = _reduce(grad_output)
        return reduced_grad

In [None]:
def broadcast_with_forward_and_backward(inputs):
    """Broadcast ``inputs``, going through autograd only when needed.

    When ``is_grad_enable(inputs)`` is truthy the call is routed through
    ``Broadcast.apply``; otherwise ``_broadcast`` is called directly.
    """
    # NOTE(review): is_grad_enable is not defined in the visible source;
    # presumably provided elsewhere -- confirm.
    broadcast_fn = Broadcast.apply if is_grad_enable(inputs) else _broadcast
    return broadcast_fn(inputs)

In [None]:
from torch.utils.data import Dataset

In [None]:
class CachedDataset(Dataset):
    """Dataset wrapper that packs selected items into one flat cache tensor.

    The backing data is loaded lazily from ``filename`` with ``torch.load``
    the first time ``prefetch`` needs it.
    """

    def __init__(self, filename):
        """
        Args:
            filename: Path handed to ``torch.load`` on first use.
        """
        self.filename = filename
        self.cache_index = []  # indices currently held in ``cache``, in order
        self.data = None       # loaded lazily by ``prefetch``
        self.cache = None      # flat buffer built by ``prefetch``

    def prefetch(self, idxs):
        """Copy the items at ``idxs`` back-to-back into ``self.cache``.

        Does nothing when every requested index is already cached (this
        includes an empty ``idxs``). Otherwise the cache is rebuilt to hold
        exactly the requested items, flattened, in the order given.

        Args:
            idxs: Iterable of indices into the loaded data.
        """
        idxs = list(idxs)

        if all(i in self.cache_index for i in idxs):
            return

        # Explicit None check: truthiness of loaded data is ambiguous for a
        # multi-element tensor and wrong for an empty container.
        if self.data is None:
            # NOTE: torch.load unpickles the file; only use trusted files.
            self.data = torch.load(self.filename)

        # Flatten so items of any shape fit into the 1-D buffer.
        items = [self.data[i].reshape(-1) for i in idxs]

        # Size the buffer for the requested items only.
        total_elements = sum(item.numel() for item in items)
        self.cache = torch.empty(total_elements, dtype=items[0].dtype)
        self.cache_index.clear()

        offset = 0

        for i, item in zip(idxs, items):
            n_elements = item.numel()
            self.cache[offset:offset + n_elements] = item
            self.cache_index.append(i)
            # Advance the write position so items do not overwrite each other.
            offset += n_elements

In [None]:
world_size = 16

In [None]:
tensor_model_parallel_size = 2

In [None]:
pipeline_model_parallel_size = 4

In [None]:
num_pipeline_model_parallel_groups = world_size // pipeline_model_parallel_size

In [None]:
num_pipeline_model_parallel_groups

4

In [None]:
# Walk consecutive blocks of `num_pipeline_model_parallel_groups` ranks and,
# inside each block, print the ranks taken with a stride of
# `tensor_model_parallel_size`.
# NOTE(review): this looks like the data-parallel group layout -- confirm.
for i in range(pipeline_model_parallel_size):
    start_rank = i * num_pipeline_model_parallel_groups
    end_rank = start_rank + num_pipeline_model_parallel_groups

    for j in range(tensor_model_parallel_size):
        ranks = [*range(start_rank + j, end_rank, tensor_model_parallel_size)]
        print(ranks)

[0, 2]
[1, 3]
[4, 6]
[5, 7]
[8, 10]
[9, 11]
[12, 14]
[13, 15]


In [None]:
broadcast, scatter, reduce, gather

In [None]:
class ColumnParallelLinear(nn.Module):
    """Linear layer holding one shard of the output features.

    Each instance owns ``output_size // world_size`` rows of the weight and
    the matching bias entries. ``forward`` computes the local shard and, when
    a process group with more than one rank is active, all-gathers the shards
    and concatenates them along the last dimension.
    """

    def __init__(self, input_size, output_size, world_size):
        """
        Args:
            input_size: Number of input features.
            output_size: Total number of output features across all ranks.
            world_size: Number of partitions the output features are split into.
        """
        super().__init__()
        self.input_size = input_size
        self.output_size_per_partrition = output_size // world_size

        self.weight = nn.Parameter(torch.empty(
            self.output_size_per_partrition,
            self.input_size
        ))
        self.bias = nn.Parameter(torch.empty(
            self.output_size_per_partrition
        ))

        # torch.empty returns uninitialized memory; give the parameters
        # well-defined starting values.
        nn.init.xavier_uniform_(self.weight)
        nn.init.zeros_(self.bias)

    def forward(self, input):
        """Return the (gathered) linear output for ``input``."""
        output_parallel = F.linear(input, self.weight, self.bias)

        # Nothing to gather when running without a process group.
        if not (torch.distributed.is_available() and torch.distributed.is_initialized()):
            return output_parallel

        world_size = torch.distributed.get_world_size()
        if world_size == 1:
            return output_parallel

        # NOTE(review): all_gather fills freshly allocated buffers, so the
        # gathered tensor is presumably detached from autograd -- confirm
        # before relying on gradients flowing through this layer.
        outputs = [torch.empty_like(output_parallel) for _ in range(world_size)]
        torch.distributed.all_gather(outputs, output_parallel)
        return torch.cat(outputs, dim=-1)

In [None]:
def by_column_parallelism(inputs, weights):
    """Multiply ``inputs`` by ``weights`` one column-half at a time.

    ``weights`` is cut in two along dimension 1 at half of its last-dimension
    size; ``inputs`` is multiplied by each half and the two partial outputs
    are concatenated along the last dimension.
    """
    half = weights.shape[-1] // 2
    shards = (weights[:, :half], weights[:, half:])

    partial_outputs = [torch.matmul(inputs, shard) for shard in shards]
    return torch.cat(partial_outputs, dim=-1)

In [None]:
step 1: load the training data
step 2: list the items to prefetch
step 3: compute the total size of those items
step 4: allocate one contiguous memory buffer of that size
step 5: copy each item into that buffer

In [None]:
step 1: partition the parameters
step 2: move to rank
step 3: move to device
step 4: init local optimizer
step 5: do local step
step 6: broadcast
step 7: update all

In [None]:
inputs = torch.randn(2, 4)

In [None]:
weights = torch.randn(4, 2)

In [None]:
def by_row_parallelism(inputs, weights):
    """Multiply ``inputs`` by ``weights`` using a two-way row split.

    ``inputs`` is chunked in two along its last dimension and ``weights``
    along dimension 0; the two partial products are summed.
    """
    x_first, x_second = inputs.chunk(2, dim=-1)
    w_first, w_second = weights.chunk(2, dim=0)

    return torch.matmul(x_first, w_first) + torch.matmul(x_second, w_second)

In [None]:
by_row_parallelism(inputs, weights)

tensor([[-2.8840,  0.0045],
        [ 2.1557,  0.1922]])

In [None]:
inputs @ weights

tensor([[-2.8840,  0.0045],
        [ 2.1557,  0.1922]])

In [None]:
send sync, recv sync
send sync, recv async
send async, recv sync
send async, recv async

### ML Engineering

In [None]:
from metaflow import FlowSpec, conda, step

In [None]:
class TrainingFlow(FlowSpec):
    """Metaflow flow with three steps chained as start -> train -> end."""

    @step
    def start(self):
        """Entry step; transitions straight to ``train``."""
        self.next(self.train)
    
    @conda(python="3.11.0", libraries={"pytorch": "2.0.0"})
    @step
    def train(self):
        """Call ``train()`` and transition to ``end``.

        Decorated with ``@conda`` requesting Python 3.11.0 and pytorch 2.0.0.
        """
        # NOTE(review): `train()` resolves to a module-level name, not this
        # method; it is not defined in the visible source -- confirm it exists.
        train()
        self.next(self.end)
    
    @step
    def end(self):
        """Final step; does nothing."""
        pass

In [None]:
docker start 31b

In [None]:
docker network create mongo-network

In [None]:
from pydantic import BaseModel

In [None]:
class User(BaseModel):
    """Pydantic model with an integer ``user_id`` and a string ``username``."""

    # Declared as int.
    user_id: int
    # Declared as str.
    username: str

In [None]:
from metaflow import S3

In [None]:
with S3() as s3:
    res = s3.get(URL)

In [None]:
from functools import partial

In [None]:
result = partial(lambda x, y: x + y, numbers)

In [None]:
docker logs monitor_app

### AI

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
for param_groups in optimizer.param_groups:
    for param in param_groups["params"]:
        print(param.shape)

In [None]:
torch.amax(6, dim=-1)

In [None]:
stream = torch.cuda.Stream(device=device)

In [None]:
with torch.cuda.device(device):
    with torch.cuda.stream(stream):
        mean = xs.mean(dim=-1)

In [None]:
torch.cuda.set_device()

In [None]:
features are represented as directions
features are linear representation

In [None]:
step 1: choose a component C
step 2: two prompts x1 and x2
step 3: record activations of C in x1
step 4: activation patching 
step 5: compare the difference in the output logits

In [None]:
cache["embed.hook_"]

In [None]:
clean_tokens = model.to_tokens(clean_prompt)

In [None]:
corrupted_tokens = model.to_tokens(corrupted_prompt)

In [None]:
correct_token = model.to_single_token(" John")
incorrect_token = model.to_single_token(" Mary")

In [None]:
# run_with_cache returns (output, cache); a plain model(...) call returns only
# the output, and the cache is what gets indexed by hook name further down.
_, clean_activations = model.run_with_cache(clean_tokens)

In [None]:
n_tokens = clean_tokens.shape[-1]

In [None]:
from transformer_lens.utils import get_act_name

In [None]:
from functools import partial

In [None]:
batch_idx = 0

In [None]:
def patch_component(corrupted_activations, hook, position, clean_activations):
    """Forward hook that patches one position with the clean activation.

    Overwrites ``corrupted_activations[batch_idx, position, :]`` in place with
    the same slice of ``clean_activations[hook.name]`` (``batch_idx`` is read
    from module scope).

    Args:
        corrupted_activations: Activation tensor of the corrupted run, indexed
            as ``[batch, position, :]``.
        hook: Hook object; its ``name`` selects the entry in ``clean_activations``.
        position: Position index to patch.
        clean_activations: Mapping from hook name to the clean-run activation.

    Returns:
        The patched ``corrupted_activations`` tensor, so the hooked forward
        pass continues with the patched activation rather than the cache.
    """
    clean_slice = clean_activations[hook.name][batch_idx, position, :]
    corrupted_activations[batch_idx, position, :] = clean_slice
    return corrupted_activations

In [None]:
def compute_logit_diff(logits, correct_token, incorrect_token):
    """Return the correct-minus-incorrect logit at the last position.

    ``logits`` is indexed as ``[:, -1, :]`` to take the final position, then
    the ``incorrect_token`` column is subtracted from the ``correct_token`` one.
    """
    final_position = logits[:, -1, :]
    return final_position[:, correct_token] - final_position[:, incorrect_token]

In [None]:
data = torch.zeros(n_layers, n_tokens)

In [None]:
# For every (layer, position) pair: run the corrupted tokens with a hook on
# that layer's "resid_pre" activation that patches the given position, then
# store the resulting logit difference in `data`.
for layer_idx in range(n_layers):
    for position in range(n_tokens):
        hook_name = get_act_name("resid_pre", layer_idx)
        hook_func = partial(
            patch_component,
            position=position,
            clean_activations=clean_activations,
        )

        corrupted_logits = model.run_with_hooks(
            corrupted_tokens,
            return_type="logits",
            fwd_hooks=[(hook_name, hook_func)],
        )
        logit_diff = compute_logit_diff(corrupted_logits, correct_token, incorrect_token)
        data[layer_idx, position] = logit_diff

In [None]:
x[0, 2, 2]

In [None]:
from einops import repeat

In [None]:
# The new axis in the pattern must carry the same name as the keyword that
# supplies its length (here `n`).
output = repeat(x, "h w -> h w n", n=2)

In [None]:
repeated_tokens = model.to_tokens(repeated_tokens)

In [None]:
repeated_logits = model(repeated_tokens)

In [None]:
probs = F.softmax(repeated_logits, dim=-1)

In [None]:
last_probs = probs[:, -1, :]

In [None]:
target = repeated_tokens[1:]

In [None]:
predicted_log_probs = -last_probs[target].log()

In [None]:
def probability_scores(image_embedding, text_embedding):
    """Return softmax-normalized cosine similarities between images and texts.

    Both inputs are L2-normalized along their last dimension, multiplied as
    ``image @ text.T``, and passed through a softmax over the last dimension.

    Args:
        image_embedding: Tensor whose last dimension is the embedding dimension.
        text_embedding: 2-D tensor with the same embedding dimension.

    Returns:
        Tensor of similarities turned into probabilities; each row sums to 1.
    """
    # keepdim=True keeps the norm broadcastable against the embedding axis;
    # without it the division lines up with the wrong dimension.
    image_norm = image_embedding.norm(dim=-1, keepdim=True)
    image_embedding = image_embedding / image_norm

    text_norm = text_embedding.norm(dim=-1, keepdim=True)
    text_embedding = text_embedding / text_norm

    similarities = image_embedding @ text_embedding.T
    return F.softmax(similarities, dim=-1)

In [None]:
class EncoderLayer(nn.Module):
    """Encoder layer: attention then feed-forward, each followed by a
    ``ResidualLayerNorm`` that also receives the sub-layer's input."""

    def __init__(self, d_model, n_heads, d_ff, dropout):
        """Build the attention, feed-forward and the two residual-norm modules.

        Args:
            d_model: Passed to every sub-module.
            n_heads: Passed to ``MultiHeadAttention``.
            d_ff: Passed to ``PositionFeedForward``.
            dropout: Passed to the feed-forward and residual-norm modules.
        """
        super().__init__()
        self.mha = MultiHeadAttention(d_model, n_heads)
        self.norm_1 = ResidualLayerNorm(d_model, dropout)
        self.feed_forward = PositionFeedForward(d_model, d_ff, dropout)
        self.norm_2 = ResidualLayerNorm(d_model, dropout)

    def forward(self, embeddings):
        """Return ``(layer_output, attention_weights)`` for ``embeddings``."""
        attended, attn_weights = self.mha(embeddings)
        hidden = self.norm_1(attended, embeddings)
        encoded = self.norm_2(self.feed_forward(hidden), hidden)
        return encoded, attn_weights

In [None]:
from einops import repeat

In [None]:
output = repeat(x, "h w -> h w n", n=2)

In [None]:
discount_returns

In [None]:
for discount_return, prob in zip():
    total_loss += discount_return * -prob.log()

In [None]:
isend