### Engineering

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
# Global ranks that make up the custom (non-contiguous) process group.
ranks = [0, 1, 3, 6]

In [None]:
# Handle for the custom process group; stays None until a group is created.
group = None

In [None]:
# Rank of this process in the default process group (which must already be
# initialised for this call to succeed).
rank = torch.distributed.get_rank()

In [None]:
# `new_group` is itself a collective over the default group: every process in
# the job must call it with the same `ranks`, including processes that will not
# be members of the new group. Non-member ranks receive a non-member sentinel
# instead of a usable group handle.
group = torch.distributed.new_group(ranks=ranks)

In [None]:
# Broadcast is a collective: every member of `group` has to enter the call
# (rank 0 sends, the other members receive into `x`). If only the source rank
# calls it, the operation can hang and the receivers never get the data.
if rank in ranks:
    torch.distributed.broadcast(x, src=0, group=group)

In [None]:
class ParallelMLP(nn.Module):
    """Transformer MLP built from parallel linear layers: h -> 4h -> GELU -> h.

    The first projection is a ``ColumnParallelLinear`` that expands
    ``hidden_size`` to ``4 * hidden_size``; the second is a
    ``RowParallelLinear`` that projects back down to ``hidden_size``.

    NOTE(review): ``RowParallelLinear`` is not defined in the visible part of
    this file; its sharding/communication behaviour should be confirmed there.

    Args:
        hidden_size: Width of the model's hidden representation.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.dense_h_to_4h = ColumnParallelLinear(
            input_size=hidden_size,
            output_size=hidden_size * 4,
        )
        self.gelu = nn.GELU()
        self.dense_4h_to_h = RowParallelLinear(
            input_size=hidden_size * 4,
            output_size=hidden_size,
        )

    def forward(self, input):
        """Apply expand -> GELU -> contract to ``input`` and return the result."""
        intermediate = self.dense_h_to_4h(input)
        # The second projection must consume the *activated* tensor; passing
        # the pre-activation value would silently drop the non-linearity.
        intermediate = self.gelu(intermediate)
        return self.dense_4h_to_h(intermediate)

In [None]:
class ColumnParallelLinear(nn.Module):
    """Linear layer whose output features are partitioned across ranks.

    Each rank owns a ``(output_size // world_size, input_size)`` slice of the
    weight and the matching slice of the bias. ``forward`` computes the local
    slice of the output and all-gathers the slices along the last dimension.

    The parameters are allocated with ``torch.empty`` and are therefore
    uninitialised; they must be initialised or loaded before use.

    Args:
        input_size: Number of input features.
        output_size: Total number of output features across all ranks.
        world_size: Number of partitions the output features are split into.

    Raises:
        ValueError: If ``output_size`` is not divisible by ``world_size``
            (floor division would otherwise silently drop output features).
    """

    def __init__(self, input_size, output_size, world_size):
        super().__init__()
        if output_size % world_size != 0:
            raise ValueError(
                f"output_size ({output_size}) must be divisible by "
                f"world_size ({world_size})"
            )
        self.input_size = input_size
        self.output_size_per_partrition = output_size // world_size
        self.weight = nn.Parameter(torch.empty(
            self.output_size_per_partrition,
            self.input_size
        ))
        self.bias = nn.Parameter(torch.empty(
            self.output_size_per_partrition
        ))

    def forward(self, input):
        """Return the full output by gathering every rank's local slice."""
        output_parallel = F.linear(input, self.weight, self.bias)

        # The receive list must have one buffer per rank of the default group.
        world_size = torch.distributed.get_world_size()
        outputs = [torch.empty_like(output_parallel) for _ in range(world_size)]
        # NOTE: torch.distributed.all_gather copies into the pre-allocated
        # buffers and is not recorded by autograd, so no gradient flows back
        # through this gather to `weight`/`bias`.
        torch.distributed.all_gather(outputs, output_parallel)
        outputs = torch.cat(outputs, dim=-1)
        return outputs

In [None]:
class f(torch.autograd.Function):
    """Identity in the forward pass; all-reduce of the gradient in backward."""

    @staticmethod
    def forward(ctx, input):
        # Nothing to communicate on the way forward: hand the tensor through.
        return input

    @staticmethod
    def backward(ctx, grad_output):
        # Sum the incoming gradient in place across the default process group
        # (SUM is the default reduction; spelled out here for clarity).
        torch.distributed.all_reduce(
            grad_output,
            op=torch.distributed.ReduceOp.SUM,
        )
        return grad_output

In [None]:
class g(torch.autograd.Function):
    """All-gather along the last dim in forward; keep the local slice in backward."""

    @staticmethod
    def forward(ctx, input):
        # One receive buffer per rank of the default process group.
        n_ranks = torch.distributed.get_world_size()
        gathered = []
        for _ in range(n_ranks):
            gathered.append(torch.empty_like(input))

        torch.distributed.all_gather(gathered, input)
        return torch.cat(gathered, dim=-1)

    @staticmethod
    def backward(ctx, grad_output):
        my_rank = torch.distributed.get_rank()

        # Width of one rank's slice of the gathered last dimension.
        shard_width = grad_output.shape[-1] // torch.distributed.get_world_size()

        # Cut the gradient into per-rank slices and return this rank's one.
        return torch.split(grad_output, shard_width, dim=-1)[my_rank]

In [None]:
class ColumnParallelLinear(nn.Module):
    """Linear layer whose output features are partitioned across ranks.

    The partition count is the world size of the default process group, read
    at construction time. Each rank owns a
    ``(output_size // world_size, input_size)`` weight slice and the matching
    bias slice. ``forward`` wraps the local matmul between the autograd
    functions ``f`` (identity forward, all-reduce backward) and ``g``
    (all-gather forward, slice backward).

    The parameters are allocated with ``torch.empty`` and are therefore
    uninitialised; they must be initialised or loaded before use.

    Args:
        input_size: Number of input features.
        output_size: Total number of output features across all ranks.

    Raises:
        ValueError: If ``output_size`` is not divisible by the world size
            (floor division would otherwise silently drop output features).
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        world_size = torch.distributed.get_world_size()
        if output_size % world_size != 0:
            raise ValueError(
                f"output_size ({output_size}) must be divisible by "
                f"world_size ({world_size})"
            )
        self.input_size = input_size
        self.output_size_per_partrition = output_size // world_size
        self.weight = nn.Parameter(torch.empty(
            self.output_size_per_partrition,
            self.input_size
        ))
        self.bias = nn.Parameter(torch.empty(
            self.output_size_per_partrition
        ))

    def forward(self, input):
        """Return the full output assembled from every rank's local slice."""
        # `f` leaves the activation untouched but all-reduces its gradient.
        input_parallel = f.apply(input)
        output_parallel = F.linear(input_parallel, self.weight, self.bias)
        # `g` gathers the per-rank slices along the last dimension.
        return g.apply(output_parallel)

In [None]:
def create_continuous_memory(memory_size):
    """Allocate an uninitialised 1-D float32 tensor for a byte budget.

    Args:
        memory_size: Budget in bytes; it is floor-divided by four (the size
            of one float32) to get the element count.

    Returns:
        A float32 tensor with ``memory_size // 4`` elements.
    """
    bytes_per_fp32 = 4
    return torch.empty(memory_size // bytes_per_fp32, dtype=torch.float32)

In [None]:
step 1: record the elapsed time for each layer
step 2: determine the number of layers for each partition
step 3: split the
step 4: move

In [None]:
# Number of consecutive ranks in each tensor-model-parallel group.
tensor_model_parallel_size = 2

In [None]:
# Number of tensor-model-parallel groups to enumerate.
num_tensor_model_parallel_groups = 8

In [None]:
# List the ranks of each tensor-model-parallel group: group i owns the
# `tensor_model_parallel_size` consecutive ranks starting at
# i * tensor_model_parallel_size.
for i in range(num_tensor_model_parallel_groups):
    first_rank = i * tensor_model_parallel_size
    ranks = list(range(first_rank, first_rank + tensor_model_parallel_size))

    print(ranks)

[0, 1]
[2, 3]
[4, 5]
[6, 7]
[8, 9]
[10, 11]
[12, 13]
[14, 15]


### ML Engineering

In [None]:
from metaflow import parallel_map

In [None]:
# Map `plus_69` over `numbers` with metaflow's parallel_map (presumably fanning
# the calls out to worker processes -- confirm against the metaflow docs).
# NOTE(review): `plus_69` and `numbers` are not defined in the visible cells.
results = parallel_map(plus_69, numbers)

### MechInterp

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
# Accumulator for the model components collected in the following cells.
components = []

In [None]:
# Value stored under "hook_embed" in `cache` (presumably a TransformerLens
# activation cache holding the token-embedding output -- TODO confirm).
text_embeddings = cache["hook_embed"]

In [None]:
# Value stored under "hook_pos_embed" in `cache` (presumably the positional
# embedding output -- TODO confirm).
positional_embeddings = cache["hook_pos_embed"]

In [None]:
# The two embedding outputs come first, in this order.
components.extend((text_embeddings, positional_embeddings))

In [None]:
# Number of layers whose attention/MLP outputs are collected below.
n_layers = 3

In [None]:
from transformer_lens import utils

In [None]:
# Collect each layer's cached attention and MLP outputs. `get_act_name` only
# builds the hook *name*; the activation itself has to be looked up in `cache`,
# otherwise `components` would mix tensors (the embeddings) with plain strings.
for i in range(n_layers):
    attn_out = cache[utils.get_act_name("attn_out", i)]
    mlp_out = cache[utils.get_act_name("mlp_out", i)]
    components.append(attn_out)
    components.append(mlp_out)

In [None]:
features are directions
features are linear representations

In [None]:
# Size of the last dimension of `interference`, used as the feature count.
n_features = interference.shape[-1]

In [None]:
# Zero the (i, i) entries over the first two dims, presumably so a feature's
# overlap with itself is not counted as interference.
diag = torch.arange(n_features)
interference[diag, diag] = 0

In [None]:
from einops import einsum

In [None]:
# Zero the (i, i) entries over the first two dims, presumably so a feature's
# overlap with itself is not counted as interference.
diag = torch.arange(n_features)
interference[diag, diag] = 0.

In [None]:
# Euclidean (L2) norm of `interference` along its last dimension.
polysemanticity = torch.sqrt(torch.square(interference).sum(dim=-1))

In [None]:
step 1
step 2 x1 x2
step 3: extract
step 4: normalize
step 5:

### AI

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
from contextlib import contextmanager

In [None]:
@contextmanager
def use_stream(stream):
    """Run the enclosed block with ``stream`` as the current CUDA stream.

    Args:
        stream: A ``torch.cuda.Stream`` to make current for the duration of
            the ``with`` body. Any other object (for example ``None`` or a
            CPU placeholder) leaves the current stream untouched.

    Yields:
        None. The context manager exists only for its side effect.
    """
    if not isinstance(stream, torch.cuda.Stream):
        # Nothing to switch for a non-CUDA stream. A @contextmanager generator
        # must still yield exactly once, so yield before returning; a bare
        # `return` here would raise "generator didn't yield".
        yield
        return

    with torch.cuda.stream(stream):
        yield

In [None]:
step 1: tokenize prompt
step 2: observation
step 3: pred
step 4: take action
step 5: repeat

In [None]:
# Register the extra special tokens with the tokenizer.
# NOTE(review): `tokenizer` and `SPECIAL_TOKENS` are defined outside the
# visible cells; presumably this grows the vocabulary -- confirm.
tokenizer.add_special_tokens(SPECIAL_TOKENS)

In [None]:
# Resize the model's embedding table to the tokenizer's current length.
# NOTE(review): Hugging Face models name this method `resize_token_embeddings`;
# confirm that `resize_text_embedding` actually exists on this model class.
model.resize_text_embedding(len(tokenizer))

In [None]:
# Freeze the whole model: turn off gradient tracking for every parameter.
for frozen_param in model.parameters():
    frozen_param.requires_grad_(False)

In [None]:
# Re-enable gradients only for the blocks at index 6 and above; blocks with a
# lower index keep whatever `requires_grad` setting they already have.
for i, block in enumerate(model.blocks):
    if i < 6:
        continue
    for param in block.parameters():
        param.requires_grad = True

In [None]:
for b

In [None]:
2, 3, 3