### Engineering

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
import time

In [None]:
stream1 = torch.cuda.Stream()
stream2 = torch.cuda.Stream()

In [None]:
# Time two operations that are issued on separate CUDA streams.
start_time = time.time()

# Work queued on different streams is allowed to overlap on the device.
with torch.cuda.stream(stream1):
    operation_a()

with torch.cuda.stream(stream2):
    operation_b()

# Wait for all queued GPU work to finish before reading the clock again.
torch.cuda.synchronize()

end_time = time.time()
elapsed_time = end_time - start_time
elapsed_time

In [None]:
world_size = 16
num_gpus = 4

In [None]:
# Assign every global rank to a GPU index in round-robin order.
for rank in range(world_size):
    gpu = rank % num_gpus
    print(f"rank: {rank} -> gpu: {gpu}")

rank: 0 -> gpu: 0
rank: 1 -> gpu: 1
rank: 2 -> gpu: 2
rank: 3 -> gpu: 3
rank: 4 -> gpu: 0
rank: 5 -> gpu: 1
rank: 6 -> gpu: 2
rank: 7 -> gpu: 3
rank: 8 -> gpu: 0
rank: 9 -> gpu: 1
rank: 10 -> gpu: 2
rank: 11 -> gpu: 3
rank: 12 -> gpu: 0
rank: 13 -> gpu: 1
rank: 14 -> gpu: 2
rank: 15 -> gpu: 3


In [None]:
class f(torch.autograd.Function):
    """Identity in the forward pass; all-reduce of the gradient in the backward pass."""

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` unchanged; nothing is saved on ``ctx``."""
        return input

    @staticmethod
    def backward(ctx, grad_output):
        """All-reduce the incoming gradient over the default process group.

        Returns:
            The reduced gradient with respect to ``input``. When no process
            group has been initialised the gradient is passed through as is.
        """
        if not (torch.distributed.is_available() and torch.distributed.is_initialized()):
            # Single-process run: there is nobody to reduce with.
            return grad_output

        # all_reduce writes in place; PyTorch's autograd guidance says backward()
        # must not modify the gradients it receives, so reduce a copy instead.
        reduced_grad = grad_output.clone()
        torch.distributed.all_reduce(reduced_grad)
        return reduced_grad

In [None]:
class g(torch.autograd.Function):
    """All-gather along the last dimension in forward; keep this rank's slice in backward."""

    @staticmethod
    def _world_size():
        """Return the default group's size, or 1 when no process group is initialised."""
        if torch.distributed.is_available() and torch.distributed.is_initialized():
            return torch.distributed.get_world_size()
        return 1

    @staticmethod
    def forward(ctx, input):
        """Gather ``input`` from every rank and concatenate the pieces on the last dim."""
        world_size = g._world_size()
        if world_size == 1:
            # Nothing to gather from; the concatenation of one piece is the input.
            return input

        input = input.contiguous()
        inputs = [torch.zeros_like(input) for _ in range(world_size)]
        # all_gather needs both the output list and the local tensor to send.
        torch.distributed.all_gather(inputs, input)
        return torch.cat(inputs, dim=-1)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the slice of ``grad_output`` that corresponds to this rank's input."""
        world_size = g._world_size()
        if world_size == 1:
            return grad_output

        rank = torch.distributed.get_rank()
        dim_size_per_partrition = grad_output.shape[-1] // world_size
        grad_chunks = torch.split(grad_output, dim_size_per_partrition, dim=-1)
        return grad_chunks[rank].contiguous()

In [None]:
class ColumnParallelLinear(nn.Module):
    """Linear layer whose output features are split evenly across ``world_size`` ranks.

    Each rank owns ``output_size // world_size`` rows of the weight matrix and
    the matching slice of the bias. The forward pass applies ``f`` to the input,
    computes the local linear projection and then applies ``g`` to the result.

    Args:
        input_size: Number of input features.
        output_size: Total number of output features over all ranks.
        world_size: Number of partitions the output features are split into.

    Raises:
        ValueError: If ``output_size`` is not divisible by ``world_size``.
    """

    def __init__(self, input_size, output_size, world_size):
        # Must be an nn.Module (not an autograd.Function) so that the
        # nn.Parameter attributes below are registered and visible to optimizers.
        super().__init__()
        if output_size % world_size != 0:
            raise ValueError(
                f"output_size ({output_size}) must be divisible by world_size ({world_size})"
            )

        self.input_size = input_size
        self.output_size_per_partrition = output_size // world_size

        self.weight = nn.Parameter(torch.empty(
            self.output_size_per_partrition,
            self.input_size
        ))
        self.bias = nn.Parameter(torch.empty(
            self.output_size_per_partrition
        ))

        # torch.empty leaves the memory uninitialised; give both parameters defined values.
        nn.init.xavier_uniform_(self.weight)
        nn.init.zeros_(self.bias)

    def forward(self, input):
        """Project ``input`` with the local weight shard and pass the result through ``g``."""
        input_parallel = f.apply(input)
        output_parallel = F.linear(input_parallel, self.weight, self.bias)
        outputs = g.apply(output_parallel)
        return outputs

In [None]:
step 1: determine the location of activation checkpoints
step 2: compute the activations, save the activation checkpoints
step 3: compute the gradient of the last part
step 4: recompute the forward pass if needed

In [None]:
optimizer-related variables, parameters, gradient

In [None]:
import os

In [None]:
class MPU:
    """Initialises ``torch.distributed`` for this process and binds it to a GPU.

    Args:
        master_addr: Value written to the ``MASTER_ADDR`` environment variable.
        master_port: Value written to ``MASTER_PORT`` (converted to ``str``).
        backend: Backend name forwarded to ``torch.distributed.init_process_group``.
    """

    def __init__(self, master_addr, master_port, backend):
        if not torch.distributed.is_initialized():
            self.initialize_distributed(master_addr, master_port, backend)

    def process_to_gpu(self, rank):
        """Select CUDA device ``rank % device_count``; do nothing when no GPU is visible."""
        num_devices = torch.cuda.device_count()
        if num_devices > 0:
            device = rank % num_devices
            torch.cuda.set_device(device)

    def initialize_distributed(self, master_addr, master_port, backend):
        """Create the default process group unless one already exists.

        Reads ``RANK`` and ``WORLD_SIZE`` from the environment.

        Raises:
            KeyError: If ``RANK`` or ``WORLD_SIZE`` is not set.
        """
        if torch.distributed.is_initialized():
            return

        rank = int(os.environ["RANK"])
        world_size = int(os.environ["WORLD_SIZE"])
        os.environ["MASTER_ADDR"] = master_addr
        # Environment values must be strings; callers may pass the port as an int.
        os.environ["MASTER_PORT"] = str(master_port)

        self.process_to_gpu(rank)

        torch.distributed.init_process_group(
            backend=backend,
            rank=rank,
            world_size=world_size,
        )

In [None]:
def _broadcast(input):
    return input.clone()

In [None]:
def _scatter(input):
    """Split ``input`` along its last dimension and return the chunk for this rank.

    The chunk width is ``input.shape[-1] // world_size``; both the world size
    and the rank are queried for ``parallel_group``.
    """
    group_size = torch.distributed.get_world_size(group=parallel_group)
    group_rank = torch.distributed.get_rank(group=parallel_group)

    chunk_width = input.shape[-1] // group_size
    return torch.split(input, chunk_width, dim=-1)[group_rank]

In [None]:
class Broadcast(torch.autograd.Function):
    """Clone the input in forward; all-reduce the gradient in backward."""

    @staticmethod
    def forward(ctx, input):
        """Return a copy of ``input`` via ``_broadcast``."""
        return _broadcast(input)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient for ``input``.

        The gradient must have the same shape as the forward input, so it is
        reduced with ``_reduce`` rather than sliced with ``_scatter`` (a slice
        would only cover ``1 / world_size`` of the last dimension).
        """
        return _reduce(grad_output)

In [None]:
def broadcast_with_forward_and_backward(inputs):
    """Broadcast ``inputs``, going through autograd only when a gradient is needed.

    Args:
        inputs: Tensor to broadcast.

    Returns:
        ``Broadcast.apply(inputs)`` when autograd is enabled and ``inputs``
        requires grad, otherwise the plain ``_broadcast(inputs)`` result.
    """
    needs_grad = torch.is_grad_enabled() and inputs.requires_grad
    if needs_grad:
        outputs = Broadcast.apply(inputs)
    else:
        # _broadcast is a plain function, so it is called directly (it has no ``.apply``).
        outputs = _broadcast(inputs)

    return outputs

In [None]:
step 1: the sender sends
step 2: the sender continues its execution
step 3: the receiver stops and waits
step 4: once the receiver has received the message, it continues its execution

In [None]:
# NOTE(review): the original line was a truncated import (a SyntaxError); completed
# to the import that a later cell in this notebook uses — confirm this is the intended name.
from torch.utils.data import Dataset

In [None]:
step 1: load the training data to ram
step 2: measure the size of the next
step 3: allocate a portion in memory
step 4: load


In [None]:
num_tensor_model_parallel_groups = 8

In [None]:
tensor_model_parallel_size = 2

In [None]:
# Each tensor-model-parallel group is a run of consecutive ranks.
for i in range(num_tensor_model_parallel_groups):
    first_rank = i * tensor_model_parallel_size
    ranks = list(range(first_rank, first_rank + tensor_model_parallel_size))

    print(ranks)

[0, 1]
[2, 3]
[4, 5]
[6, 7]
[8, 9]
[10, 11]
[12, 13]
[14, 15]


In [None]:
optimizer, forward, param

In [None]:
send sync, receive sync
send sync, receive async
send async, receive sync
send async, receive async

In [None]:
def _broadcast(input):
    return input.clone()

In [None]:
def _reduce(grad_output):
    """All-reduce ``grad_output`` in place over ``parallel_group`` and return it.

    When the group contains a single process the tensor is returned untouched.
    """
    if torch.distributed.get_world_size(group=parallel_group) != 1:
        torch.distributed.all_reduce(grad_output, group=parallel_group)

    return grad_output

In [None]:
class Broadcast(torch.autograd.Function):
    """Autograd function: ``_broadcast`` in forward, ``_reduce`` on the gradient in backward."""

    @staticmethod
    def forward(ctx, input):
        """Return the ``_broadcast`` copy of ``input``."""
        output = _broadcast(input)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``grad_output`` after it has gone through ``_reduce``."""
        grad_input = _reduce(grad_output)
        return grad_input

In [None]:
def is_grad_enable(inputs):
    """Tell whether autograd is globally enabled and ``inputs`` requires a gradient."""
    if not torch.is_grad_enabled():
        return False
    return inputs.requires_grad

In [None]:
def broadcast_with_forward_and_backward(inputs):
    """Broadcast ``inputs``; use the autograd-aware ``Broadcast`` only when a gradient is needed."""
    if not is_grad_enable(inputs):
        # No gradient will flow: a plain copy is enough.
        return _broadcast(inputs)

    return Broadcast.apply(inputs)

In [None]:
lazy loading
data prefetching
memory mapping

In [None]:
def create_continuous_memory(memory_size, dtype=torch.float32):
    """Allocate an uninitialised 1-D tensor that fits within ``memory_size`` bytes.

    Args:
        memory_size: Size budget in bytes.
        dtype: Element type of the buffer; defaults to ``torch.float32``.

    Returns:
        A 1-D tensor with ``memory_size // element_size`` elements, where
        ``element_size`` is the byte width of ``dtype``.
    """
    # Ask torch for the element width instead of hard-coding 4 bytes.
    element_size = torch.empty((), dtype=dtype).element_size()
    n_numbers = memory_size // element_size
    return torch.empty(n_numbers, dtype=dtype)

In [None]:
from torch.utils.data import Dataset

In [None]:
class CachedDataset(Dataset):
    """Dataset that lazily loads a file and packs requested items into one flat cache.

    Args:
        filename: Path passed to ``torch.load`` the first time data is needed.
    """

    def __init__(self, filename):
        self.filename = filename
        # Maps an item index to the offset of its first element inside ``self.cache``.
        self.cache_index = {}
        # Populated by the first ``prefetch`` call that misses the cache.
        self.data = None
        # Flat buffer holding the most recently prefetched items.
        self.cache = None

    def prefetch(self, idxs):
        """Copy the items at ``idxs`` into one contiguous 1-D cache tensor.

        Does nothing when every requested index is already cached. Otherwise
        the cache is rebuilt to hold exactly the requested items, in the given
        order, and ``cache_index`` records where each item starts.

        Args:
            idxs: Iterable of item indices into the loaded data.
        """
        idxs = list(idxs)
        if all(i in self.cache_index for i in idxs):
            return

        # ``is None`` rather than truthiness: bool() of a multi-element tensor raises.
        if self.data is None:
            # NOTE: torch.load unpickles the file; only use it on trusted files.
            self.data = torch.load(self.filename)

        items = [self.data[i] for i in idxs]
        # The buffer holds only the requested items, so size it by their total element count.
        total_elements = sum(item.numel() for item in items)
        self.cache = torch.empty(total_elements, dtype=items[0].dtype)

        self.cache_index = {}
        offset = 0

        for i, item in zip(idxs, items):
            n = item.numel()
            # Flatten so items of any shape fit the 1-D buffer.
            self.cache[offset:offset + n] = item.reshape(-1)
            self.cache_index[i] = offset
            # Advance by the number of elements written, not by the item index.
            offset += n

In [None]:
file system, shared memory, message passing

In [None]:
world_size = 16
tensor_model_parallel_size = 2

In [None]:
pipeline_model_paralell_size = 4

In [None]:
num_pipeline_model_parallel_groups = world_size // tensor_model_parallel_size

In [None]:
num_tensor_model_parallel_groups

8

In [None]:
for stage_idx in range(pipeline_model_paralell_size):
    start_rank = stage_idx*num_pipeline_model_parallel_groups
    end_rank = (stage_idx+1)*num_pipeline_model_parallel_groups
    
    for i in range(tensor_model_parallel_size):
        ranks = list(range(
            start_rank+i,
            end_rank,
            tensor_model_parallel_size
        ))
        
        print(ranks)

[0, 2, 4, 6]
[1, 3, 5, 7]
[8, 10, 12, 14]
[9, 11, 13, 15]
[16, 18, 20, 22]
[17, 19, 21, 23]
[24, 26, 28, 30]
[25, 27, 29, 31]


### ML Engineering

In [None]:
from metaflow import FlowSpec, step, schedule

In [None]:
@schedule(daily=True)
class HelloFlow(FlowSpec):
    """Linear three-step flow (start -> middle step -> end) decorated with a daily schedule."""
    @step
    def start(self):
        """Entry step; hands off to the next step without doing any work."""
        self.next(self.fuck)
    
    # NOTE(review): consider giving this step a descriptive, professional name.
    @step
    def fuck(self):
        """Print a constant and continue to ``end``."""
        print(69)
        self.next(self.end)
    
    @step
    def end(self):
        """Terminal step; does nothing."""
        pass

In [None]:
from metaflow import project

In [None]:
@project(name="project_69")
class TrainFlow(FlowSpec):
    """Linear flow (start -> train -> end) decorated with the ``project_69`` project name."""
    @step
    def start(self):
        """Entry step; transitions straight to ``train``."""
        self.next(self.train)
    
    @step
    def train(self):
        """Call the module-level ``train()`` and continue to ``end``."""
        # NOTE(review): the bare ``train()`` resolves to a global callable, not this
        # step method — confirm it is defined before the flow runs.
        train()
        self.next(self.end)
    
    @step
    def end(self):
        """Terminal step; does nothing."""
        pass

In [None]:
@project(name="project_69")
class EvaluateFlow(FlowSpec):
    """Linear flow (start -> evaluate -> end) decorated with the ``project_69`` project name."""

    @step
    def start(self):
        """Entry step; transitions straight to ``evaluate``."""
        self.next(self.evaluate)

    @step
    def evaluate(self):
        """Call the module-level ``evaluate()`` and continue to ``end``.

        The step is named after what it runs; it was previously called
        ``train`` although it performs evaluation.
        """
        # Inside a method the bare name resolves to the global callable, not this step.
        evaluate()
        self.next(self.end)

    @step
    def end(self):
        """Terminal step; does nothing."""
        pass

In [None]:
class TrainingFlow(FlowSpec):
    """Flow that branches from ``start`` into ``train`` and ``eval``, then joins before ``end``."""
    @step
    def start(self):
        """Entry step; fans out to the ``train`` and ``eval`` branches."""
        self.next(self.train, self.eval)
    
    @step
    def train(self):
        """Branch that prints a training message and proceeds to ``join``."""
        print("training...")
        self.next(self.join)
    
    @step
    def eval(self):
        """Branch that prints an evaluation message and proceeds to ``join``."""
        print("evaluating...")
        self.next(self.join)
    
    @step
    def join(self, inputs):
        """Join step for the two branches; ``inputs`` is accepted but not used here."""
        self.next(self.end)
    
    @step
    def end(self):
        """Terminal step; does nothing."""
        pass

In [None]:
docker run --net mongo-network mongo

In [None]:
docker pull redis

In [None]:
step 1: add a NAT gateway to the VPC
step 2, 3, 4: create a route table, route it to the NAT gateway, attach it
step 5: 

In [None]:
public ip, private ip, elastic ip

In [None]:
docker images

In [None]:
in-server
as-a-service
edge

In [None]:
from typing import overload, List, Union

In [None]:
@overload
def getitem(x: str) -> str:
    """Overload signature: a ``str`` argument yields a ``str``."""
    ...

In [None]:
@overload
def getitem(x: List[int]) -> int:
    """Overload signature: a ``List[int]`` argument yields an ``int``."""
    ...

In [None]:
from typing import Callable

In [None]:
def foo() -> Callable[[int, int], int]:
    """Build and return a two-argument integer addition function."""

    def add(x: int, y: int) -> int:
        """Return the sum of ``x`` and ``y``."""
        total = x + y
        return total

    return add

In [None]:
region > vpc > availability zone > subnet > resource

In [None]:
docker rmi redis:22.2

In [None]:
docker network create 

In [None]:
docker rm 23a

In [None]:
docker start

In [None]:
docker run postgres:4.3

In [None]:
docker stop 12b

### AI

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
stream1 = torch.cuda.Stream()
stream2 = torch.cuda.Stream()

In [None]:
# Issue the two independent reductions on different streams so they may overlap;
# previously both ran on stream1 and stream2 was never used.
with torch.cuda.stream(stream1):
    x_mean = x.mean(dim=-1)

with torch.cuda.stream(stream2):
    y_mean = y.mean(dim=-1)

In [None]:
from torch.optim import Optimizer

In [None]:
class CustomOptimizer(Optimizer):
    """Toy optimizer that multiplies every trainable parameter by a constant factor.

    Args:
        params: Iterable of parameters or parameter-group dicts.
        scale: Multiplicative factor applied on each ``step``; defaults to 6.9.
    """

    def __init__(self, params, scale=6.9):
        defaults = dict(scale=scale)
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure=None):
        """Scale each parameter that requires grad by its group's ``scale``.

        Args:
            closure: Optional callable that re-evaluates the model and returns
                the loss, following the standard ``Optimizer.step`` signature.

        Returns:
            The closure's return value, or ``None`` when no closure is given.
        """
        loss = None
        if closure is not None:
            # The closure may need gradients even though the update itself does not.
            with torch.enable_grad():
                loss = closure()

        for param_group in self.param_groups:
            scale = param_group["scale"]
            for param in param_group["params"]:
                if not param.requires_grad:
                    continue

                param.mul_(scale)

        return loss

In [None]:
model.embed(tokens)

In [None]:
class Model(nn.Module):
    """Minimal module wrapping an (initially empty) ``nn.Sequential`` container."""

    def __init__(self):
        super().__init__()
        # Instantiate the container; assigning the bare class would register no sub-module.
        self.net = nn.Sequential()

    def forward(self, x):
        """Run ``x`` through ``self.net``; an empty container returns ``x`` unchanged."""
        return self.net(x)

In [None]:
# Named clean_/corrupted_ because the following cells read ``clean_tokens`` and
# ``corrupted_tokens``; the previous ``correct_tokens``/``incorrect_tokens`` names were never used.
clean_tokens = model.to_tokens(clean_prompt)
corrupted_tokens = model.to_tokens(corrupted_prompt)

In [None]:
correct_token = model.to_single_token(" John")
incorrect_token = model.to_single_token(" Mary")

In [None]:
# Assuming ``model`` is a TransformerLens HookedTransformer: run_with_cache returns
# (output, cache) in that order. The cache is what the patching loop reads as
# ``clean_activations``.
clean_logits, clean_activations = model.run_with_cache(clean_tokens)

In [None]:
n_tokens = clean_tokens.shape[-1]

In [None]:
data = torch.zeros(n_layers, n_tokens)

In [None]:
batch_idx = 0

In [None]:
def patch_component(corrupted_activations, hook, position, clean_activations):
    """Overwrite one position of the corrupted activations with the clean ones.

    For the module-level ``batch_idx`` and the given ``position``, the values
    stored under ``hook.name`` in ``clean_activations`` are written into
    ``corrupted_activations`` in place, and that same tensor is returned.
    """
    clean_slice = clean_activations[hook.name][batch_idx, position, :]
    corrupted_activations[batch_idx, position, :] = clean_slice
    return corrupted_activations

In [None]:
from transformer_lens.utils import get_act_name

In [None]:
from functools import partial

In [None]:
def compute_logit_diff(logits, correct_token, incorrect_token):
    """Return the correct-minus-incorrect logit at the last sequence position.

    ``logits`` is indexed as ``[:, -1, :]``, so it must have three dimensions;
    the result has one value per entry of the first dimension.
    """
    final_position = logits[:, -1, :]
    return final_position[:, correct_token] - final_position[:, incorrect_token]

In [None]:
# Patch the clean residual stream into the corrupted run, one (layer, position) at a time.
for layer_idx in range(n_layers):
    # The hook point depends only on the layer, so resolve it once per layer.
    hook_name = get_act_name("resid_pre", layer_idx)

    for position in range(n_tokens):
        hook_func = partial(patch_component, position=position, clean_activations=clean_activations)
        # Assuming ``model`` is a TransformerLens HookedTransformer: the method that
        # runs with temporary hooks is ``run_with_hooks`` (plural).
        corrupted_logits = model.run_with_hooks(
            corrupted_tokens,
            fwd_hooks=[(hook_name, hook_func)]
        )
        logit_diff = compute_logit_diff(corrupted_logits, correct_token, incorrect_token)
        data[layer_idx][position] = logit_diff

In [None]:
tokens = model.to_tokens(text)

In [None]:
logits = model(tokens)

In [None]:
log_probs = F.log_softmax(logits, dim=-1)

In [None]:
last_token_logits = logits[:, -1, :]

In [None]:
predicted_log_prob = 

In [None]:
induction_heads = [(6, 9), (4, 2)]

In [None]:
tokens = model.to_tokens(repeated_text)

In [None]:
# Discard the first returned value and keep the activation cache.
_, cache = model.run_with_cache(tokens)

In [None]:
for head_idx, layer_idx in induction_heads:
    hook_name = get_act_name("attn", layer_idx)
    attention_pattern = cache[hook_name][:, head_idx]

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
docker network ls

In [None]:
logits = output.logits

In [None]:
probs = F.softmax(logits, dim=-1)

In [None]:
sorted_probs = torch.argmax(probs, dim=-1)

In [None]:
step 1: choose the component
step 2: prompt x1 and x2
step 3: record the output and activation at C in prompt 1
step 4: activation patching
step 5:

In [None]:
def dropout_layer(activations, dropout):
    """Apply inverted dropout to ``activations``.

    Each element is zeroed independently with probability ``dropout``; the
    survivors are scaled by ``1 / (1 - dropout)`` so the expected value is
    unchanged. The output always has the same shape as the input.

    Args:
        activations: Floating-point tensor to apply dropout to.
        dropout: Drop probability in ``[0, 1]``.

    Returns:
        A tensor shaped like ``activations``.
    """
    assert 0 <= dropout <= 1

    if dropout == 1:
        return torch.zeros_like(activations)

    if dropout == 0:
        return activations

    # Uniform noise in [0, 1) gives a keep probability of exactly 1 - dropout
    # (normal noise from randn_like would not).
    keep_mask = (torch.rand_like(activations) > dropout).to(activations.dtype)

    # Multiply by the mask instead of boolean indexing, which would drop
    # elements and flatten the tensor.
    return keep_mask * activations / (1.0 - dropout)

In [None]:
# Print the shape of every parameter managed by the optimizer.
for param_group in optimizer.param_groups:
    # Index the loop variable (``param_group``), not the undefined ``param_groups``.
    for param in param_group["params"]:
        print(param.shape)

In [None]:
dist.get_rank()

In [None]:
scatter

In [None]:
step 1: partition the parameters
step 2: assign each parameter to a specific rank
step 3: allocate the parameter to device
step 4: local optimizer
step 5: do local update
step 6: broadcast 

In [None]:
from einops import einsum

In [None]:
einops_output = einsum(x, y, "batch dim, batch dim ->")

In [None]:
world_size = 16

In [None]:
tensor_model_parallel_size = 2

In [None]:
pipeline_model_parallel_size = 4

In [None]:
pipeline_model_parallel_groups = 4

In [None]:
# For each block of ranks, print one strided rank list per tensor-parallel offset.
for i in range(pipeline_model_parallel_size):
    start_rank = i * pipeline_model_parallel_groups
    end_rank = start_rank + pipeline_model_parallel_groups

    for j in range(tensor_model_parallel_size):
        first_rank = start_rank + j
        ranks = list(range(first_rank, end_rank, tensor_model_parallel_size))

        print(ranks)

[0, 2]
[1, 3]
[4, 6]
[5, 7]
[8, 10]
[9, 11]
[12, 14]
[13, 15]


In [None]:
step 1: tokenize the prompt
step 2: tokenize the observation and append to the prompt
step 3: take action
step 4: execute the action
step 5: repeat step 1

In [None]:
latent representation of the current observation, recurrent state

In [None]:
impulse

momentum: fb

In [None]:
eye > optic ne

In [None]:
import threading

In [None]:
lock = threading.Lock()

In [None]:
def target():
    """Call ``increment_counter()`` while holding the module-level ``lock``."""
    lock.acquire()
    try:
        increment_counter()
    finally:
        lock.release()

In [None]:
# Keep a reference to every worker so each one can be joined.
threads = []
for _ in range(4):
    thread = threading.Thread(target=target)
    thread.start()
    threads.append(thread)

# Block until all workers have finished, so the cell only completes once their work is done.
for thread in threads:
    thread.join()

In [None]:
class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
        dropout: Probability passed to ``nn.Dropout``.
    """

    def __init__(self, d_model, d_ff, dropout):
        super().__init__()
        # Layers are created in application order and wrapped in one container.
        layers = [
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the feed-forward stack to ``x``."""
        output = self.net(x)
        return output