### Engineering

In [1]:
# Number of GPU indices that ranks are mapped onto (used as the modulus in the loop below).
num_gpus = 4

In [2]:
# Total number of ranks enumerated by the mapping loop below.
world_size = 16

In [3]:
# Print, for every rank in [0, world_size), the GPU index it maps to.
# The index is rank modulo num_gpus, so ranks wrap around the GPUs round-robin.
for rank in range(world_size):
    print(f"rank: {rank} -> {rank%num_gpus}")

rank: 0 -> 0
rank: 1 -> 1
rank: 2 -> 2
rank: 3 -> 3
rank: 4 -> 0
rank: 5 -> 1
rank: 6 -> 2
rank: 7 -> 3
rank: 8 -> 0
rank: 9 -> 1
rank: 10 -> 2
rank: 11 -> 3
rank: 12 -> 0
rank: 13 -> 1
rank: 14 -> 2
rank: 15 -> 3


In [None]:
# Notes: reassign workload, monitor, ...

In [None]:
// Three int pointers. The h_ prefix presumably marks host-side buffers — TODO confirm.
int *h_a, *h_b, *h_c;

In [None]:
// NOTE(review): these three statements are identical. Each one assigns h_a and
// passes the pointer h_a itself to malloc, where a byte count is expected;
// h_b and h_c are never allocated. Presumably the intent was one allocation
// per buffer (h_a, h_b, h_c) with a size such as n * sizeof(int) — confirm the
// element count before fixing.
h_a = (int*)malloc(h_a);
h_a = (int*)malloc(h_a);
h_a = (int*)malloc(h_a);

In [4]:
from contextlib import contextmanager

In [None]:
@contextmanager
def use_hooks(model, hooks, layer_idx=1):
    """Temporarily attach forward pre-hooks to one transformer block.

    Every callable in ``hooks`` is registered as a forward pre-hook on
    ``model.transformer.h[layer_idx]``. All hooks that were registered are
    removed again when the ``with`` block exits, whether it exits normally
    or through an exception.

    Args:
        model: Object exposing ``model.transformer.h``, an indexable
            collection of ``torch.nn.Module`` blocks.
        hooks: Iterable of callables with the forward-pre-hook signature
            ``hook(module, args)``.
        layer_idx: Index of the block in ``model.transformer.h`` to hook.
            Defaults to 1, the block this helper originally hard-coded.

    Yields:
        None. The hooks are active for the duration of the ``with`` body.
    """
    target_block = model.transformer.h[layer_idx]
    handles = []
    try:
        for hook in hooks:
            # Keep the handle: without it the hook can never be removed.
            handles.append(target_block.register_forward_pre_hook(hook))
        yield
    finally:
        # Also runs when registration fails part-way, so hooks that were
        # already attached do not leak.
        for handle in handles:
            handle.remove()

In [5]:
from transformer_lens.utils import get_act_name

In [None]:
# Build the hook names later used to index the activation cache.
# NOTE(review): in transformer_lens, get_act_name("post", 2) appears to resolve to the
# layer-2 MLP post-activation hook rather than the residual stream entering the final
# LayerNorm — confirm this is the intended "pre final LN" activation.
# NOTE(review): unlike pre_final_ln_name, post_final_ln holds a hook *name* here and is
# rebound to the cached activation in a later cell.
pre_final_ln_name = get_act_name("post", 2)
post_final_ln = get_act_name("normalized")

In [None]:
# Run the model on `tokens` and keep only the second return value (the activation cache);
# the first return value is discarded. `model` and `tokens` are not defined in the visible cells.
_, cache = model.run_with_cache(tokens)

In [None]:
# Look up the two cached activations by hook name.
# NOTE(review): the second line overwrites post_final_ln (a hook name) with the value it
# indexes, so re-running this cell on its own would index the cache with that value
# instead of the name.
pre_final_ln = cache[pre_final_ln_name]
post_final_ln = cache[post_final_ln]

In [6]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit a linear regression from pre_final_ln (inputs) to post_final_ln (targets).
# The fitted estimator is only displayed as the cell output; it is not bound to a name.
# NOTE(review): scikit-learn expects 2-D array-likes — presumably the cached activations
# need flattening / conversion first; confirm their shape.
LinearRegression().fit(pre_final_ln, post_final_ln)

In [None]:
# Register every hook on model.ln_f as a forward hook, keeping the returned handles.
handles = [model.ln_f.register_forward_hook(hook) for hook in hooks]

# Detach only the second hook (index 1); the others stay registered.
handles[1].remove()

In [7]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch.distributed.rpc as rpc

In [8]:
class DataParallel:
    """Attach gradient-averaging hooks to a module's trainable parameters.

    ``parallelize`` returns the wrapped module unchanged. When
    ``parallel_context.data_parallel_size`` is greater than 1, it first
    registers a tensor hook on every parameter with ``requires_grad`` set.
    The hook divides the incoming gradient by ``data_parallel_size`` and sums
    the result across the process group returned by
    ``parallel_context.get_group(ParallelMode.DATA)``.

    ``ParallelMode`` must be in scope when the hook fires; it is not imported
    in the visible cells.
    """

    def __init__(self, module, parallel_context):
        """Store the module to wrap and the context describing the parallel setup."""
        self.module = module
        self.parallel_context = parallel_context

    @torch.no_grad()
    def parallelize(self):
        """Register the averaging hooks if needed and return the wrapped module."""
        wrapped = self.module

        # Nothing to synchronise with a single data-parallel replica.
        if self.parallel_context.data_parallel_size <= 1:
            return wrapped

        self._register_backward_hook(wrapped)
        return wrapped

    def _register_backward_hook(self, module):
        """Hook ``_avg_grad`` onto every parameter that requires a gradient."""
        trainable = (param for param in module.parameters() if param.requires_grad)
        for param in trainable:
            param.register_hook(self._avg_grad)

    def _avg_grad(self, grad):
        """Return ``grad`` divided by the replica count and summed over the data group."""
        ctx = self.parallel_context
        replica_count = ctx.data_parallel_size
        data_group = ctx.get_group(ParallelMode.DATA)

        # Divide before the SUM all-reduce so the reduced value is the mean.
        averaged = grad / replica_count
        dist.all_reduce(averaged, op=dist.ReduceOp.SUM, group=data_group)
        return averaged

In [9]:
# Parameters for the schedule built in the next cell:
# number of partitions and number of micro-batches.
n_partitions = 4
n_microbatches = 3

In [10]:
# One clock cycle per anti-diagonal of the (microbatch, partition) grid.
n_clock_cycles = n_partitions + n_microbatches - 1

# schedules[clock] lists the (microbatch_idx, partition_idx) tasks for that clock cycle.
# Within a cycle the active partitions run from max(clock + 1 - n_microbatches, 0)
# up to, but not including, min(clock + 1, n_partitions); each partition handles
# micro-batch number clock - partition.
schedules = [
    [
        (clock_idx - partition_idx, partition_idx)
        for partition_idx in range(
            max(clock_idx + 1 - n_microbatches, 0),
            min(clock_idx + 1, n_partitions),
        )
    ]
    for clock_idx in range(n_clock_cycles)
]

In [11]:
# Display the per-clock-cycle task lists as (microbatch_idx, partition_idx) tuples.
schedules

[[(0, 0)],
 [(1, 0), (0, 1)],
 [(2, 0), (1, 1), (0, 2)],
 [(2, 1), (1, 2), (0, 3)],
 [(2, 2), (1, 3)],
 [(2, 3)]]

In [12]:
def reverse_nested_list(lst):
    """Reverse the non-list items of ``lst``, recursing into nested lists.

    Nested lists keep their relative order and are placed after all non-list
    items; each nested list is processed recursively by the same rule.
    Non-list items come out in reverse encounter order. For a list that
    contains only sub-lists of scalars, the outer order is unchanged and
    every sub-list is reversed.

    Args:
        lst: Iterable whose items are either lists or arbitrary other values.

    Returns:
        A new list; ``lst`` itself is not modified.
    """
    scalars, sublists = [], []
    for entry in lst:
        if isinstance(entry, list):
            sublists.append(reverse_nested_list(entry))
        else:
            scalars.append(entry)

    # Front-inserting each scalar is the same as reversing them once at the end.
    scalars.reverse()
    return scalars + sublists

In [13]:
# Apply reverse_nested_list to the schedule; the result is displayed, not stored.
reverse_nested_list(schedules)

[[(0, 0)],
 [(0, 1), (1, 0)],
 [(0, 2), (1, 1), (2, 0)],
 [(0, 3), (1, 2), (2, 1)],
 [(1, 3), (2, 2)],
 [(2, 3)]]

In [15]:
# list.reverse() flips only the outer list, in place; the inner lists are untouched.
lst = [[1], [2, 3], [4, 5, 6]]
lst.reverse()  # was `lsschedulest.reverse()`, an undefined name that raised NameError
print(lst)

[[4, 5, 6], [2, 3], [1]]


In [16]:
# Reverse the order of the clock cycles in place. This mutates `schedules`,
# so re-running the cell flips the order back.
schedules.reverse()

In [17]:
# Display `schedules` after the in-place reversal.
schedules

[[(2, 3)],
 [(2, 2), (1, 3)],
 [(2, 1), (1, 2), (0, 3)],
 [(2, 0), (1, 1), (0, 2)],
 [(1, 0), (0, 1)],
 [(0, 0)]]

In [18]:
def variance(x):
    """Return ``x`` centred on its mean and divided by ``x.var()``.

    NOTE(review): despite the name, this does not return the variance. It
    looks like a normalisation, but it divides by the variance rather than
    the standard deviation — confirm which quantity was intended.

    Args:
        x: Array-like object providing ``mean()`` and ``var()`` methods and
            supporting element-wise subtraction and division.
    """
    centred = x - x.mean()
    return centred / x.var()

In [None]:
class SelfAttention(nn.Module):
    """Scaled dot-product attention for a single head.

    Computes ``softmax(q @ k^T / sqrt(d_head)) @ v``, with the softmax taken
    over the key axis (the last axis of the score matrix).

    Args:
        d_head: Head dimension used to scale the attention logits.
    """

    def __init__(self, d_head):
        super().__init__()
        self.d_head = d_head

    def forward(self, q, k, v, mask=None):
        """Apply attention.

        Args:
            q: Query tensor of shape ``(..., n_queries, d_head)``.
            k: Key tensor of shape ``(..., n_keys, d_head)``.
            v: Value tensor of shape ``(..., n_keys, d_value)``.
            mask: Optional tensor broadcastable to ``(..., n_queries, n_keys)``.
                Positions where it is True (non-zero) are excluded from
                attention.

        Returns:
            Tensor of shape ``(..., n_queries, d_value)``.
        """
        # transpose swaps only the last two axes; permute would need the full
        # axis order and fails on batched input.
        attn_logits = torch.matmul(q, k.transpose(-1, -2)) / (self.d_head ** 0.5)

        if mask is not None:
            # Use the most negative finite value so masked positions get
            # (numerically) zero weight. masked_fill is not in-place, so the
            # result has to be kept.
            fill_value = torch.finfo(attn_logits.dtype).min
            attn_logits = attn_logits.masked_fill(mask.bool(), fill_value)

        # Normalise over the key axis explicitly; an implicit dim is deprecated.
        attn_weights = F.softmax(attn_logits, dim=-1)
        return torch.matmul(attn_weights, v)

In [None]:
# Apply the @ (matrix-multiplication) operator to first_part and filter_1.
# Neither name is defined in the visible cells — TODO confirm where they come from.
first_part @ filter_1

In [59]:
import torch

In [60]:
# Result value that is later set on the future.
VALUE = 69

Create a future object whose result is set to `VALUE`, and register a callback on it so that `69420` is printed automatically once the result is set.

In [61]:
def callback(fut):
    """Print 69420; the ``fut`` argument is accepted but not used."""
    print(69420)

In [62]:
# Create a torch future with no result set yet, then display its repr.
fut = torch.futures.Future()
fut

<torch.jit.Future at 0x7ff2d2352680>

In [63]:
# Register the callback first, then complete the future with VALUE.
# Setting the result triggers the callback, which prints 69420.
fut.add_done_callback(callback)
fut.set_result(VALUE)

69420


In [64]:
# Read back the result stored on the completed future.
fut.value()

69