### Engineering

In [1]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
# Register every callable in `hooks` as a forward pre-hook on model.ln_final and
# keep the returned handles so individual hooks can be removed later.
handles = [model.ln_final.register_forward_pre_hook(hook) for hook in hooks]

In [None]:
# Detach only the hook stored at index 1; this cell removes no other handle.
handles[1].remove()

In [None]:
# Call the model's embed and pos_embed modules on the same `tokens`; the two
# results are added together in the following cell.
text_embeddings = model.embed(tokens)
positional_embeddings = model.pos_embed(tokens)

In [None]:
embeddings = text_embeddings + positional_embeddings

In [None]:
residual = embeddings

In [None]:
# Pass the residual through each entry of model.blocks in order, feeding every
# block's output into the next one.
for block in model.blocks:
    residual = block(residual)

In [None]:
# Apply ln_final to the residual, then unembed the normalized result to get logits.
residual = model.ln_final(residual)
logits = model.unembed(residual)

In [None]:
# Convert both prompts to tokens via model.to_tokens.
# NOTE(review): the patching sweep further down indexes both runs by the same
# position, so the two prompts presumably need equal token length — confirm.
clean_tokens = model.to_tokens(clean_prompt)
corrupted_tokens = model.to_tokens(corrupted_prompt)

In [None]:
# run_with_cache returns a (logits, activation cache) pair. The original line
# began with a bare comma (SyntaxError); the first element now gets a real name
# rather than `_`, which IPython reserves for the last cell output.
corrupted_logits, corrupted_activations = model.run_with_cache(corrupted_tokens)

In [2]:
from transformer_lens.utils import get_act_name

In [None]:
n_tokens = clean_tokens.shape[-1]

In [3]:
from functools import partial

In [4]:
def patch_activation(activations, hook, position_idx, corrupted_activations):
    """Overwrite one position of `activations` with the cached corrupted values.

    Args:
        activations: tensor indexed as [:, position, :]; modified in place.
        hook: object whose `.name` is the key into `corrupted_activations`.
        position_idx: index along dim 1 selecting the slice to replace.
        corrupted_activations: mapping from hook name to a tensor indexed the
            same way as `activations`.

    Returns:
        The same `activations` object, after the in-place write.
    """
    # Look up the cached tensor recorded under this hook's name.
    source = corrupted_activations[hook.name]
    # Copy only the requested position; every other position is left untouched.
    activations[:, position_idx, :] = source[:, position_idx, :]
    return activations

In [None]:
clean_logits = model(clean_tokens)

In [6]:
def compute_logit_difference(clean_logits, corrupted_logits, target_token):
    """Return the element-wise difference of two logit tensors at one dim-1 index.

    Args:
        clean_logits: tensor indexed as [:, i, :].
        corrupted_logits: tensor indexed the same way as `clean_logits`.
        target_token: index applied to dim 1 of both tensors. Despite the
            name, it is used as an index on the middle axis, not the last one.

    Returns:
        `clean_logits[:, target_token, :] - corrupted_logits[:, target_token, :]`.
    """
    clean_slice = clean_logits[:, target_token, :]
    corrupted_slice = corrupted_logits[:, target_token, :]
    return clean_slice - corrupted_slice

In [None]:
# Activation-patching sweep: for every (layer, position) pair, rerun the clean
# prompt with that layer's "resid_pre" activation at that position replaced by
# the value cached from the corrupted run, then compare the resulting logits
# against the unpatched clean logits.
for layer_idx in range(n_layers):
    hook_name = get_act_name("resid_pre", layer_idx)
    for position_idx in range(n_tokens):
        # Bind the position and the cache so the hook matches the
        # (activations, hook) call signature used by run_with_hooks.
        hook_func = partial(
            patch_activation,
            position_idx=position_idx,
            corrupted_activations=corrupted_activations
        )

        # fwd_hooks must be a list of (hook_name, hook_function) pairs; the
        # original passed a flat two-item list [hook_name, hook_func].
        patched_logits = model.run_with_hooks(
            clean_tokens,
            fwd_hooks=[(hook_name, hook_func)]
        )

        # compute_logit_difference requires a third argument; the original
        # call omitted it and raised TypeError.
        # NOTE(review): -1 selects the last index along dim 1 — confirm that
        # this is the intended target.
        logit_difference = compute_logit_difference(
            clean_logits, patched_logits, target_token=-1
        )
    

In [None]:
def print_shape(module, input):
    """Forward pre-hook that prints the shape of each positional input.

    PyTorch calls forward pre-hooks as `hook(module, args)`, where `args` is
    the tuple of positional arguments given to the module, so calling `.shape`
    on it directly raises AttributeError. A bare tensor is still accepted so
    direct calls keep working.

    Args:
        module: the module about to run (unused).
        input: tuple/list of positional inputs, or a single tensor.

    Returns:
        None, so the module's inputs are left unmodified.
    """
    args = input if isinstance(input, (tuple, list)) else (input,)
    for arg in args:
        # Skip non-tensor positional arguments that carry no shape.
        if hasattr(arg, "shape"):
            print(arg.shape)

In [None]:
model.h[1].register_forward_pre_hook(print_shape)

In [7]:
from contextlib import contextmanager

In [None]:
@contextmanager
def use_hooks(model, hooks):
    """Temporarily attach forward pre-hooks to `model.transformer.h[1]`.

    Every hook is registered on entry and removed on exit, including when the
    body of the `with` block raises.

    Args:
        model: object exposing `transformer.h`, indexable at position 1.
        hooks: iterable of callables accepted by `register_forward_pre_hook`.

    Yields:
        The list of handles for the registered hooks.
    """
    module = model.transformer.h[1]
    # Created before the try block so `finally` can always iterate it, even if
    # a registration fails part-way through.
    handles = []
    try:
        for hook in hooks:
            handles.append(module.register_forward_pre_hook(hook))
        # A @contextmanager generator must yield exactly once; without this
        # the `with` statement fails with "generator didn't yield".
        yield handles
    finally:
        for handle in handles:
            handle.remove()

In [None]:
step 1: choose the target component
step 2: prepare two prompts (clean and corrupted)
step 3: record the activations for both prompts
step 4: patch the target component's activations from the corrupted prompt into the clean-prompt run
step 5: record the activations of the target receiver nodes
step 6: run the clean prompt again with the receiver nodes patched

In [8]:
import torch.distributed as dist

In [None]:
if dist.get_rank() == 59

In [None]:
#include <iostream>
using namespace std;

In [None]:
// Minimal class holding a book's title.
// Fixes over the original: a class header takes no parentheses, the
// constructor parameter needs a type, and the definition must end with ';'.
class Book {
    public:
        std::string title;

        // Construct a Book with the given title.
        Book(std::string aTitle) {
            title = aTitle;
        }
};

In [9]:
n_microbatches = 4

In [12]:
n_partritions = 3

In [21]:
n_clock_cycles = n_microbatches + n_partritions - 1

In [22]:
n_clock_cycles

6

In [32]:
# For every clock cycle, list the (microbatch_idx, partrition_idx) pairs that
# are scheduled in that cycle and print them.
for clock_idx in range(n_clock_cycles):
    # Lowest partition whose microbatch index (clock_idx - partition) is still
    # below n_microbatches.
    start_partrition = max(clock_idx+1-n_microbatches, 0)
    # Exclusive upper bound: a partition index above clock_idx would give a
    # negative microbatch index.
    end_partrition = min(clock_idx+1, n_partritions)
    
    tasks = []
    for partrition_idx in range(start_partrition, end_partrition):
        # In this cycle, partition p is paired with microbatch (clock_idx - p).
        microbatch_idx = clock_idx - partrition_idx
        tasks.append((microbatch_idx, partrition_idx))
    
    print(tasks)

[(0, 0)]
[(1, 0), (0, 1)]
[(2, 0), (1, 1), (0, 2)]
[(3, 0), (2, 1), (1, 2)]
[(3, 1), (2, 2)]
[(3, 2)]


In [29]:
start_partrition

0

In [30]:
end_partrition

1

In [None]:
register memory > cache > main memory > disk > external disk

In [33]:
from torch.utils.data import Dataset

In [34]:
class CachedDataset(Dataset):
    """Dataset that lazily loads a tensor file and packs selected rows into a flat cache.

    Attributes are documented inline in `__init__`.

    NOTE(review): `__getitem__` / `__len__` are not defined here, so the class
    cannot yet be indexed like a regular Dataset.
    """

    def __init__(self, filename):
        self.filename = filename
        # Loaded on first prefetch() via torch.load(self.filename).
        self.data = None
        # Maps each cached index to its (offset, n_elements) slice of self.cache.
        self.cached_idxs = {}
        # Flat buffer of prefetched elements; becomes a 1-D tensor after prefetch().
        self.cache = []

    def prefetch(self, idxs):
        """Copy the rows at `idxs` into one contiguous 1-D cache tensor.

        Does nothing when every requested index is already cached. Otherwise
        the cache is rebuilt from scratch to hold exactly the requested rows,
        flattened and concatenated in the order given.

        Args:
            idxs: iterable of indices into the loaded data.
        """
        # Materialize once: the indices are traversed several times below.
        idxs = list(idxs)
        if all(i in self.cached_idxs for i in idxs):
            return

        # `not tensor` is ambiguous for multi-element tensors, so compare
        # against None explicitly.
        if self.data is None:
            # torch.load unpickles the file; only use it on trusted files.
            self.data = torch.load(self.filename)

        n_elements = sum(self.data[i].numel() for i in idxs)
        # Allocate a separate buffer; the original assigned it to self.data
        # and thereby destroyed the loaded dataset.
        # assumes self.data exposes a single .dtype (i.e. is a tensor) — TODO confirm
        cache = torch.zeros(n_elements, dtype=self.data.dtype)
        cached_idxs = {}
        offset = 0
        for i in idxs:
            # reshape(-1) also handles rows that are not contiguous in memory.
            flat = self.data[i].reshape(-1)
            count = flat.numel()
            cache[offset:offset + count] = flat
            cached_idxs[i] = (offset, count)
            offset += count

        self.cache = cache
        self.cached_idxs = cached_idxs

In [None]:
container runtime, kubelet, kube-proxy

In [35]:
tensor_model_parallel_size = 2

In [36]:
num_tensor_model_parallel_groups = 8

In [37]:
# Print one list of ranks per group: group i holds the
# tensor_model_parallel_size consecutive ranks starting at
# i * tensor_model_parallel_size.
for i in range(num_tensor_model_parallel_groups):
    ranks = list(range(
        i*tensor_model_parallel_size,
        (i+1)*tensor_model_parallel_size
    ))
    
    print(ranks)

[0, 1]
[2, 3]
[4, 5]
[6, 7]
[8, 9]
[10, 11]
[12, 13]
[14, 15]


In [38]:
world_size = 16

In [39]:
tensor_model_paralell_size = 2

In [40]:
pipeline_model_paralell_size = 4

In [41]:
num_pipeline_model_parallel_groups = world_size // pipeline_model_paralell_size

In [42]:
num_pipeline_model_parallel_groups

4

In [None]:
# NOTE(review): unfinished draft. `range()` with no arguments raises TypeError,
# and start_rank / end_rank are computed but never used. The cell that follows
# contains the completed version of this loop.
for i in range(num_pipeline_model_parallel_groups):
    start_rank = i*num_pipeline_model_parallel_groups
    end_rank = (i+1)*num_pipeline_model_parallel_groups
    
    for j in range(tensor_model_paralell_size):
        ranks = list(range(
            
        ))

In [43]:
# Split the ranks into consecutive blocks of num_pipeline_model_parallel_groups
# ranks, then within each block print the ranks taken with a stride of
# tensor_model_paralell_size, once per starting offset j.
# NOTE(review): the outer loop runs pipeline_model_paralell_size times while the
# block width is num_pipeline_model_parallel_groups; both are 4 with the values
# set above, so the mix-up (if any) is invisible here — confirm which is meant.
for i in range(pipeline_model_paralell_size):
    start_rank = i*num_pipeline_model_parallel_groups
    end_rank = (i+1)*num_pipeline_model_parallel_groups
    
    for j in range(tensor_model_paralell_size):
        # Every tensor_model_paralell_size-th rank in [start_rank + j, end_rank).
        ranks = list(range(
            start_rank+j,
            end_rank,
            tensor_model_paralell_size
        ))
        
        print(ranks)

[0, 2]
[1, 3]
[4, 6]
[5, 7]
[8, 10]
[9, 11]
[12, 14]
[13, 15]


In [None]:
broadcast, reduce, scatter, gather

In [None]:
rank = torch.distributed.get_rank()

# Point-to-point transfer from rank 69 to rank 42. isend/irecv are
# non-blocking and return a request object; the buffers are only safe to reuse
# or read after wait() returns, so each side waits on its own request.
if rank == 69:
    send_request = torch.distributed.isend(tensor, dst=42)
    send_request.wait()
elif rank == 42:
    recv_request = torch.distributed.irecv(tensor_will_be_received_data, src=69)
    recv_request.wait()

In [45]:
from airflow.decorators import dag, task

In [None]:
@dag(dag_id=dag_id, start_date=start_date)
def workflow():
    """Two-task DAG: task_1 publishes x = 2, task_2 reads it and publishes y = x + 3.

    Values are exchanged through XCom using the TaskInstance methods
    `xcom_push` / `xcom_pull`; the original `ti.push` / `ti.get` do not match
    those method names.
    """
    @task
    def task_1(ti=None):
        # `ti` is filled in from the task context at run time.
        ti.xcom_push(key="x", value=2)

    @task
    def task_2(ti=None):
        x = ti.xcom_pull(task_ids="task_1", key="x")
        y = x + 3
        ti.xcom_push(key="y", value=y)

    # Instantiate both tasks and make task_2 run after task_1; without this
    # the decorated functions are defined but never added to the DAG.
    task_1() >> task_2()

In [47]:
from airflow.decorators import dag
from airflow.decorators.python import PythonOperator

In [None]:
@dag(dag_id=dag_id, start_date=start_date)
def xx():
    # NOTE(review): this binds the PythonOperator class itself to a local name
    # and creates no operator instance; the cell looks unfinished.
    task_1 = PythonOperator

In [48]:
from torch.optim import Optimizer

In [50]:
# class CustomOptimizer(Optimizer):
#     def __init__(self, params):
#         defaults = dict()
#         super().__init__(self, params, defaults)
    
#     def step(self):
#         for param_group in self.param_groups:
#             for param in param_group:
#                 if param.requires_grad:
#                     param = param

In [51]:
from torch.profiler import profiler, ProfilerActivity

In [None]:
with profiler(activities=[ProfilerActivity.CPU], schedule=pro)

In [None]:
x.repeat((2, 2))

In [None]:
task = torch.distributed.broadcast(x, src=0, async_op=True)

In [None]:
task.wait()

In [None]:
p, i, d, n