### Engineering

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
class f(torch.autograd.Function):
    """Identity in the forward pass; all-reduce of the gradient in the backward pass.

    When ``torch.distributed`` is unavailable or no process group has been
    initialised, the backward pass is a plain identity as well, i.e. the
    function behaves as if it ran on a single rank.
    """

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` unchanged."""
        return input

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``grad_output`` combined across ranks with ``all_reduce``.

        Args:
            ctx: Autograd context (unused).
            grad_output: Gradient flowing back into this function's output.

        Returns:
            The all-reduced gradient (``all_reduce`` with its default
            reduction op), or ``grad_output`` itself when no process group
            is initialised.
        """
        dist = torch.distributed
        if not (dist.is_available() and dist.is_initialized()):
            # No process group: there is nothing to reduce over.
            return grad_output

        # Reduce into a copy. Autograd may hand the very same gradient tensor
        # to other nodes of the graph, so mutating it in place could corrupt
        # the gradients they see.
        reduced = grad_output.clone()
        dist.all_reduce(reduced)
        return reduced

In [None]:
class g(torch.autograd.Function):
    """All-gather along the last dimension forward; keep this rank's slice backward.

    When ``torch.distributed`` is unavailable or no process group has been
    initialised, both passes are the identity, i.e. the function behaves as
    if it ran on a single rank.
    """

    @staticmethod
    def forward(ctx, input):
        """Gather ``input`` from every rank and concatenate on the last dim.

        Args:
            ctx: Autograd context (unused).
            input: This rank's tensor; ``all_gather`` receives one
                ``empty_like(input)`` buffer per rank.

        Returns:
            The per-rank tensors concatenated along ``dim=-1`` in the order
            ``all_gather`` fills them, or ``input`` itself when no process
            group is initialised.
        """
        dist = torch.distributed
        if not (dist.is_available() and dist.is_initialized()):
            return input

        world_size = dist.get_world_size()
        gathered = [torch.empty_like(input) for _ in range(world_size)]
        dist.all_gather(gathered, input)
        return torch.cat(gathered, dim=-1)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the slice of ``grad_output`` that belongs to this rank.

        The last dimension is cut into ``world_size`` pieces of size
        ``grad_output.shape[-1] // world_size`` and the piece at index
        ``rank`` is returned.
        """
        dist = torch.distributed
        if not (dist.is_available() and dist.is_initialized()):
            return grad_output

        rank = dist.get_rank()
        world_size = dist.get_world_size()

        chunk_size = grad_output.shape[-1] // world_size
        grad_chunks = torch.split(grad_output, chunk_size, dim=-1)
        # Return a dense copy rather than a view, so the slice does not keep
        # the full gathered gradient's storage alive.
        return grad_chunks[rank].contiguous()

In [None]:
class ColumnParallelLinear(nn.Module):
    """Linear layer whose output features are split across distributed ranks.

    Each rank owns ``output_size // world_size`` rows of the weight matrix and
    the matching slice of the bias. ``forward`` passes the input through ``f``,
    applies the local ``F.linear`` and passes the result through ``g``.

    Args:
        input_size: Number of input features.
        output_size: Total number of output features over all ranks; must be
            divisible by the world size.

    Raises:
        ValueError: If ``output_size`` is not divisible by the world size.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        dist = torch.distributed
        # Without an initialised process group the layer is constructed as a
        # single-rank layer that owns every output feature.
        if dist.is_available() and dist.is_initialized():
            world_size = dist.get_world_size()
        else:
            world_size = 1

        if output_size % world_size != 0:
            # Floor division would silently drop the remaining features.
            raise ValueError(
                f"output_size ({output_size}) must be divisible by "
                f"world_size ({world_size})"
            )

        self.input_size = input_size
        # NOTE: the misspelled attribute name is kept for existing callers.
        self.output_size_per_patrition = output_size // world_size

        self.weight = nn.Parameter(torch.empty(
            self.output_size_per_patrition,
            self.input_size
        ))
        self.bias = nn.Parameter(torch.empty(
            self.output_size_per_patrition
        ))
        # torch.empty returns uninitialised memory; give the parameters
        # defined starting values.
        self.reset_parameters()

    @property
    def output_size_per_partition(self):
        """Correctly spelled alias of ``output_size_per_patrition``."""
        return self.output_size_per_patrition

    def reset_parameters(self):
        """Initialise weight and bias with the scheme ``nn.Linear`` uses.

        NOTE(review): every rank draws from its own RNG state; ranks seeded
        identically will produce identical shards - confirm seeding upstream.
        """
        nn.init.kaiming_uniform_(self.weight, a=5 ** 0.5)
        bound = self.input_size ** -0.5 if self.input_size > 0 else 0.0
        nn.init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        """Apply the sharded linear map and return the result of ``g``."""
        input_parallel = f.apply(input)
        output_parallel = F.linear(input_parallel, self.weight, self.bias)
        outputs = g.apply(output_parallel)
        return outputs

In [None]:
from typing import OrderedDict

In [None]:
def split_model(model, balances, devices):
    """Split a sequence of layers into consecutive partitions on given devices.

    Args:
        model: Iterable of layers (for example an ``nn.Sequential``).
        balances: Number of layers in each partition, in order. Every entry
            must be positive and the entries must sum to the number of layers.
        devices: Target device for each partition; needs at least one entry
            per partition.

    Returns:
        A list of ``nn.Sequential`` partitions. Partition ``k`` holds the next
        ``balances[k]`` layers and has been moved to ``devices[k]``.

    Raises:
        ValueError: If ``balances`` contains a non-positive entry, does not
            sum to the number of layers, or has more entries than ``devices``.
    """
    layers = list(model)

    if any(size < 1 for size in balances):
        raise ValueError("every entry of balances must be positive")
    if sum(balances) != len(layers):
        raise ValueError(
            f"balances sum to {sum(balances)} but the model has "
            f"{len(layers)} layers"
        )
    if len(devices) < len(balances):
        raise ValueError(
            f"got {len(balances)} partitions but only {len(devices)} devices"
        )

    partitions = []
    start = 0
    for size, device in zip(balances, devices):
        partition = nn.Sequential(*layers[start:start + size])
        partition.to(device)
        partitions.append(partition)
        start += size

    return partitions

In [None]:
# Communication patterns: point-to-point (p2p), message passing, collective communication

In [None]:
# Collective operations: gather, reduce, scatter

### Sci

### ML Engineering

In [None]:
from prefect import flow

In [None]:
from metaflow import FlowSpec, step, Parameter

In [None]:
class Training(FlowSpec):
    """Three-step flow (start -> train -> end) that prints the ``secret`` parameter."""

    # 69 is the value the train step used to hard-code; as the parameter
    # default it can now be overridden when the flow is run.
    secret = Parameter("secret", default=69)

    @step
    def start(self):
        """Entry step; hands over to ``train``."""
        self.next(self.train)

    @step
    def train(self):
        """Print the value of ``secret`` and hand over to ``end``."""
        # Metaflow exposes parameters as read-only attributes, so the value is
        # only read here instead of being reassigned.
        print(f"secret {self.secret}")
        self.next(self.end)

    @step
    def end(self):
        """Final step; does nothing."""
        pass

In [None]:
from metaflow import Flow

In [None]:
Flow("CountFlow").latest_run

In [None]:
from prefect.deployments import Deployment

In [None]:
Deployment.build_from_flow(
    flow=run_workflowb
)

In [None]:
from metaflow import Flow

In [None]:
class Training(FlowSpec):
    """Three-step flow (start -> train -> end) wrapping a call to ``train()``."""

    @step
    def start(self):
        """Entry step; transitions straight to ``train``."""
        self.next(self.train)

    @step
    def train(self):
        """Call the module-level ``train`` function, then move on to ``end``."""
        # Inside the method body the bare name resolves to the module-level
        # ``train`` callable, not to this step.
        train()
        self.next(self.end)

    @step
    def end(self):
        """Terminal step; performs no work."""
        pass

In [None]:
FROM x x 

In [None]:
import threading

In [None]:
lock = threading.Lock()

In [None]:
def target():
    """Thread entry point: call ``increment_counter`` while holding ``lock``."""
    # ``lock`` is the module-level ``threading.Lock``; holding it means only
    # one thread at a time executes the call below.
    with lock:
        increment_counter()

In [None]:
# Run ``target`` on three threads and wait for all of them to finish, so that
# code running after this cell sees the completed result.
threads = [threading.Thread(target=target) for _ in range(3)]
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()

In [None]:
xs[2]

In [None]:
from prefect.deployments import Deployment

In [None]:
deployment_dev = Deployment.build_from_flow(
    flow=training
)

In [None]:
# Switch off gradient tracking for every parameter of the model.
for param in model.parameters():
    param.requires_grad_(False)

In [None]:
# Re-enable gradients for the blocks at index 6 and above in
# ``model.transformer.h``; blocks 0-5 are left as they are.
for i, block in enumerate(model.transformer.h):
    if i < 6:
        continue
    for param in block.parameters():
        param.requires_grad_(True)

In [None]:
model.ln_final

In [None]:
model.lm_head

In [None]:
tokenzier.add_special_tokens(SPECIAL_TOKENS)

In [None]:
model.resize_embedding

### AI

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
from transformers import AutoModel

In [None]:
class CustomTokenClassifier(nn.Module):
    """Encoder loaded via ``AutoModel`` followed by dropout and a linear head.

    Args:
        checkpoint: Identifier passed to ``AutoModel.from_pretrained``.
        n_labels: Output size of the linear classification head.
        dropout: Dropout probability applied before the head.
    """

    def __init__(self, checkpoint, n_labels, dropout):
        # Must run before any submodule is assigned; without it nn.Module
        # rejects the attribute assignments below.
        super().__init__()
        self.model = AutoModel.from_pretrained(checkpoint)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's config when it reports a hidden
        # size; 768 (the previously hard-coded width) remains the fallback.
        hidden_size = getattr(self.model.config, "hidden_size", 768)
        self.classifier = nn.Linear(hidden_size, n_labels)

    def forward(self, input_ids, attention_mask):
        """Return the head's output for the encoder's ``last_hidden_state``.

        Args:
            input_ids: Forwarded to the encoder as ``input_ids``.
            attention_mask: Forwarded to the encoder as ``attention_mask``.

        Returns:
            The classifier output computed from the dropped-out
            ``last_hidden_state``; its last dimension has size ``n_labels``.
        """
        encoded = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        hidden = self.dropout(encoded.last_hidden_state)
        return self.classifier(hidden)

In [None]:
act_head = activations[1, 2]

In [None]:
outputs = []

In [None]:
for i in range(2):
    outputs.append(cache[f"blocks.{i}.attn.hook_result"])

In [None]:
WA[:, :, 0]

In [None]:
tokens = model.to_tokens(text)

In [None]:
logits = model(tokens)

In [None]:
target_token = tokens[1:]

In [None]:
# torch.nn.functional has no `log_probs`; log-probabilities over the last
# dimension of the logits come from `log_softmax`.
log_probs = F.log_softmax(logits, dim=-1)

In [None]:
last_token_logits = log_probs[:, -1, :]

In [None]:
targ

In [None]:
logits = model(tokens)

In [None]:
last_token_logits = logits[:, -1, :]

In [None]:
target_tokens = tokens[1:]

In [None]:
# NOTE(review): `last_tokens_logits` is never assigned in the cells above (the
# earlier cell binds `last_token_logits`), and `Tensor.gather` needs `dim` and
# `index` arguments - TODO confirm the intended name and indices.
loss = -last_tokens_logits.gather()

In [None]:
torch.split(x, split_size_or_sections=3)

In [None]:
torch.split(x, split_size_or_sections=[1, 3, 2])

In [None]:
torch.jit.script()

In [None]:
weights = attn_weights.diagonal(dim1=-2, dim2=-1, offset=-1)

In [None]:
from torchvision import transforms as tfms

In [None]:
transform = tfms.Compose([
    tfms.ToTensor(),
    tfms.Normalize(0.3, 0.9)
])

In [None]:
dist.log_prob(action)

In [None]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable scale and shift.

    Args:
        features: Size of the last dimension of the inputs.
        eps: Constant added to the variance before the square root, for
            numerical stability. Defaults to ``1e-5``.
    """

    def __init__(self, features, eps=1e-5):
        super().__init__()
        self.features = features
        self.eps = eps
        self.adds = nn.Parameter(torch.zeros(features))
        self.mults = nn.Parameter(torch.ones(features))

    def forward(self, x):
        """Normalise each feature vector of ``x``, then scale and shift it.

        Args:
            x: Tensor whose last dimension has size ``features``.

        Returns:
            A tensor with the same shape as ``x``.
        """
        # Statistics are taken per feature vector (last dim), not over the
        # whole tensor, so every position is normalised independently.
        mean = x.mean(dim=-1, keepdim=True)
        # Biased variance, matching torch.nn.functional.layer_norm.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        normalised = (x - mean) / (var + self.eps).sqrt()
        return self.adds + self.mults * normalised

In [None]:
import torch.distributed.rpc as rpc

In [None]:
rref = rpc.remote("worker_1", create_tensor)