### Engineering

In [1]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
broadcast, gather, reduce, scatter

In [2]:
import threading

In [3]:
event = threading.Event()

In [None]:
def run_worker():
    """Block the calling thread until the module-level ``event`` is set."""
    event.wait()

In [None]:
# Build a thread object whose target is run_worker; the thread is not started here.
worker_thread = threading.Thread(
    target=run_worker
)

In [None]:
#include <iostream>

In [None]:
using namespace std;

In [None]:
// A book described by its title.
class Book {
    public:
        // Title of the book.
        std::string title;

        // Builds a book whose title is a copy of aTitle.
        explicit Book(const std::string& aTitle) : title(aTitle) {}
};

In [None]:
// Allocates 69 *bytes* (not 69 ints) on the heap; the caller owns the buffer
// and must release it with free(ptr).
int* ptr = (int*)malloc(69);

In [None]:
def gather_tensors(x):
    """Collect ``x`` from every rank and concatenate the copies along the last axis.

    Args:
        x: Tensor contributed by the calling rank.

    Returns:
        The result of ``torch.cat`` over one buffer per rank, with ``dim=-1``.
    """
    dist = torch.distributed
    # One receive buffer per rank, each shaped like the local tensor.
    buffers = [torch.empty_like(x) for _ in range(dist.get_world_size())]
    dist.all_gather(buffers, x)
    return torch.cat(buffers, dim=-1)

In [4]:
import socketserver

In [None]:
class EchoRequestHandler(socketserver.StreamRequestHandler):
    """Stream request handler that prints the first line sent by the client."""

    def handle(self):
        """Read a single line from ``self.rfile`` and print it."""
        first_line = self.rfile.readline()
        print(first_line)

In [5]:
lock = threading.Lock()

In [None]:
def run_worker():
    """Call ``print_numbers()`` while holding the module-level ``lock``."""
    # NOTE(review): print_numbers is not defined in the visible cells — confirm where it comes from.
    with lock:
        print_numbers()

In [None]:
t1 = threading.Thread(target=run_worker)

In [8]:
def wait_stream(source_stream, target_stream):
    """Make ``source_stream`` wait for the work queued on ``target_stream``.

    Args:
        source_stream: The stream that waits. Anything that is not a
            ``torch.cuda.Stream`` is treated as the CPU (host).
        target_stream: The stream being waited on.

    Returns:
        None. When ``target_stream`` is not a CUDA stream nothing is done.
    """
    if not isinstance(target_stream, torch.cuda.Stream):
        # The target is the CPU: no synchronization is performed.
        return

    if isinstance(source_stream, torch.cuda.Stream):
        # GPU waits for GPU: order the source stream after the target stream.
        source_stream.wait_stream(target_stream)
    else:
        # CPU waits for GPU: block the host until the target stream has drained.
        target_stream.synchronize()

In [9]:
class Wait(torch.autograd.Function):
    """Pass ``input`` through unchanged while synchronizing two streams.

    In the forward pass ``next_stream`` waits for ``prev_stream``; in the
    backward pass the direction is reversed, so ``prev_stream`` waits for
    ``next_stream``.
    """

    @staticmethod
    def forward(ctx, prev_stream, next_stream, input):
        """Record both streams on ``ctx``, synchronize, and return ``input``."""
        ctx.prev_stream = prev_stream
        ctx.next_stream = next_stream

        wait_stream(
            source_stream=next_stream,
            target_stream=prev_stream
        )
        return input

    @staticmethod
    def backward(ctx, grad_output):
        """Synchronize in the reverse direction and pass the gradient through.

        Returns:
            One entry per ``forward`` argument: ``None`` for the two streams
            (they are not differentiable) and ``grad_output`` for ``input``.
        """
        wait_stream(
            source_stream=ctx.prev_stream,
            target_stream=ctx.next_stream
        )

        # The previous code returned the builtin `input`, which is not a gradient.
        return None, None, grad_output

In [10]:
from functools import wraps

In [11]:
def my_decorator(func):
    """Wrap ``func`` in a pass-through wrapper that preserves its metadata.

    Args:
        func: The callable to decorate.

    Returns:
        A wrapper that forwards every positional and keyword argument to
        ``func`` and returns its result.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Forward the call arguments; calling func() bare would drop them.
        return func(*args, **kwargs)

    return wrapper

In [12]:
import socketserver

In [None]:
# Serve requests with EchoRequestHandler on (MASTER_HOST, MASTER_PORT);
# serve_forever() blocks until the server is shut down, and the `with` block
# closes the listening socket on exit.
with socketserver.ThreadingTCPServer((MASTER_HOST, MASTER_PORT), EchoRequestHandler) as server:
    server.serve_forever()

In [None]:
client: send
manager: handle
service: receive

In [13]:
import socketserver

In [14]:
class EchoRequestHandler(socketserver.StreamRequestHandler):
    """Stream request handler stub whose ``handle`` does nothing."""
    def handle(self):
        # Placeholder: nothing is read from or written to the connection here.
        pass

In [15]:
n_microbatches, n_partritions = 4, 3

In [16]:
n_clock_cycles = n_microbatches+n_partritions-1

In [17]:
n_clock_cycles

6

In [20]:
# For every clock cycle, print the list of (microbatch_idx, partrition_idx) tasks of that cycle.
for clock_idx in range(n_clock_cycles):
    # Partition indices visited in this cycle: the lower bound is clamped at 0
    # and the upper bound at n_partritions.
    start_partrition = max(clock_idx+1-n_microbatches, 0)
    end_partrition = min(clock_idx+1, n_partritions)
    
    tasks = []
    for partrition_idx in range(start_partrition, end_partrition):
        # Within one cycle the microbatch and partition indices always sum to clock_idx.
        microbatch_idx = clock_idx - partrition_idx
        tasks.append((microbatch_idx, partrition_idx))
    
    print(tasks)

[(0, 0)]
[(1, 0), (0, 1)]
[(2, 0), (1, 1), (0, 2)]
[(3, 0), (2, 1), (1, 2)]
[(3, 1), (2, 2)]
[(3, 2)]


In [None]:
class Recompute(torch.autograd.Function):
    """Re-run ``function`` on ``input`` during the backward pass.

    The forward pass only stores its arguments and returns ``phony``
    unchanged. The backward pass recomputes ``function(input)`` with gradient
    tracking on a detached copy of the input and appends the
    ``(output, input_leaf)`` pair to the shared ``recomputed`` list.
    """

    @staticmethod
    def forward(ctx, phony, recomputed, function, input):
        """Stash the arguments on ``ctx`` and return ``phony`` as-is."""
        ctx.recomputed = recomputed
        ctx.function = function
        ctx.input = input

        return phony

    @staticmethod
    def backward(ctx, grad_input):
        """Recompute the forward output and publish it through ``ctx.recomputed``.

        Returns:
            ``None`` for each of the four ``forward`` arguments; this function
            contributes no gradient of its own.
        """
        input = ctx.input
        # Fresh leaf, so the recomputed graph is independent of the original one.
        input_leaf = input.detach().requires_grad_(input.requires_grad)

        # Gradient mode is off inside backward by default; torch.enable_grad()
        # turns it back on (torch.grad_enabled does not exist).
        with torch.enable_grad():
            output = ctx.function(input_leaf)

        ctx.recomputed.append((output, input_leaf))

        # autograd expects exactly one entry per forward argument.
        return None, None, None, None

In [None]:
// Three int pointers. The `*` binds to each declarator, so it has to be
// repeated: `int* h_a, h_b, h_c;` would declare h_b and h_c as plain ints.
int *h_a, *h_b, *h_c;

In [None]:
// Size in bytes of an array of n ints.
size_t size = sizeof(int)*n;

In [None]:
h_a = (int*)malloc(size)
h_b = (int*)malloc(size)
h_c = (int*)malloc(size)

In [None]:
main worker > worker > task > cuda stream

In [None]:
event = threading.Event()

In [None]:
def run_worker():
    """Block the calling thread until the module-level ``event`` is set."""
    event.wait()

In [None]:
worker_thread = threading.Thread(target=run_worker)

In [21]:
class Checkpoint(torch.autograd.Function):
    """Run ``function`` without building a graph and defer gradients to a recomputation.

    The forward pass evaluates ``function(input)`` under ``torch.no_grad()``.
    The backward pass pops an ``(output, input_leaf)`` pair from the shared
    ``recomputed`` list, backpropagates through that recomputed output and
    returns the gradient accumulated on ``input_leaf``.
    """

    @staticmethod
    def forward(ctx, phony, recomputed, function, input):
        """Store the arguments on ``ctx`` and return ``function(input)``."""
        ctx.recomputed = recomputed
        ctx.function = function
        ctx.input = input

        with torch.no_grad():
            output = function(input)

        # Return the computed output, not the untouched input.
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Backpropagate through the recomputed output.

        Returns:
            ``None`` for ``phony``, ``recomputed`` and ``function``, followed by
            ``input_leaf.grad`` (``None`` when the leaf does not require grad).
        """
        output, input_leaf = ctx.recomputed.pop()

        # Gate on the output: it can require grad through tensors captured by
        # `function` even when the input itself does not.
        if output.requires_grad:
            torch.autograd.backward(output, grad_output)

        return None, None, None, input_leaf.grad

In [None]:
def by_row_parallelism(inputs, weights):
    """Compute ``inputs @ weights`` as the sum of two partial products.

    The weight matrix is split by rows into two shards and ``inputs`` is split
    along its last axis at the same point, so each input slice is multiplied
    with the matching row shard.

    Args:
        inputs: Tensor whose last axis has the same length as ``weights.shape[0]``.
        weights: 2-D weight matrix.

    Returns:
        The sum of the two partial matrix products.
    """
    # A single split point keeps the input columns and weight rows aligned.
    split = inputs.shape[-1] // 2

    inp1, inp2 = inputs[..., :split], inputs[..., split:]
    w1, w2 = weights[:split, :], weights[split:, :]

    return inp1 @ w1 + inp2 @ w2

In [None]:
criteria 1:
criteria 2: 

In [None]:
# Start a threading TCP server bound to (MASTER_HOST, MASTER_PORT) that hands
# requests to EchoRequestHandler; serve_forever() blocks inside the `with` block.
with socketserver.ThreadingTCPServer(
    (MASTER_HOST, MASTER_PORT),
    EchoRequestHandler
) as server:
    server.serve_forever()

In [None]:
def by_column_parallelism(inputs, weights):
    """Multiply ``inputs`` with two column shards of ``weights`` and join the results.

    Args:
        inputs: Left operand of the matrix products.
        weights: Weight matrix, split in two along its last axis.

    Returns:
        The two partial products concatenated along the last axis.
    """
    half = weights.shape[-1] // 2
    column_shards = (weights[:, :half], weights[:, half:])

    partial_outputs = [inputs @ shard for shard in column_shards]
    return torch.cat(partial_outputs, dim=-1)

In [None]:
client, manager, service

In [22]:
n_microbatches = 4
n_partritions = 3

In [24]:
n_clock_cycles = n_microbatches + n_partritions - 1

In [25]:
# For every clock cycle, print the list of (microbatch_idx, partrition_idx) tasks of that cycle.
for clock_idx in range(n_clock_cycles):
    # Partition indices visited in this cycle: the lower bound is clamped at 0
    # and the upper bound at n_partritions.
    start_partrition = max(clock_idx+1-n_microbatches, 0)
    end_partrition = min(clock_idx+1, n_partritions)
    
    tasks = []
    for partrition_idx in range(start_partrition, end_partrition):
        # Within one cycle the microbatch and partition indices always sum to clock_idx.
        microbatch_idx = clock_idx - partrition_idx
        tasks.append((microbatch_idx, partrition_idx))
        
    print(tasks)

[(0, 0)]
[(1, 0), (0, 1)]
[(2, 0), (1, 1), (0, 2)]
[(3, 0), (2, 1), (1, 2)]
[(3, 1), (2, 2)]
[(3, 2)]


In [None]:
Checkpoint.forward() > Recompute.forward() > Recompute.backward() > Checkpoint.backward()

### MLE

In [None]:
# Handler registered for GET /users/{uid}; `uid` is annotated as int.
# NOTE(review): `app` is not defined in the visible cells — presumably a FastAPI app; confirm.
@app.get("/users/{uid}")
def get_user(uid: int):
    # Stub: the handler body is empty, so it returns None.
    pass

In [26]:
from typing import Optional

In [None]:
# Handler registered for GET /files.
# `is_deleted` is an optional boolean query parameter: a bare `Optional` is not
# a usable annotation, and without a default the parameter would be required.
# NOTE(review): bool is inferred from the parameter name — confirm the intended type.
@app.get("/files")
def files(is_deleted: Optional[bool] = None):
    # Stub: the handler body is empty, so it returns None.
    pass

In [27]:
from airflow.decorators import dag
from airflow.operators.python import PythonOperator

In [None]:
# DAG definition with a single PythonOperator task that calls say_hello.
# NOTE(review): this function is named `dag`, which rebinds the imported `dag`
# decorator once the cell has run; re-running the cell needs a fresh import.
@dag(dag_id=dag_id)
def dag():
    task_1 = PythonOperator(
        python_callable=say_hello,
        # Airflow task ids may only contain alphanumerics, dashes, dots and
        # underscores, so the previous "tasl_1 " (typo plus trailing space) is rejected.
        task_id="task_1"
    )

In [28]:
from fastapi import status

In [None]:
# Handler registered for GET /blog that responds with HTTP 404 by default.
# The path-operation keyword for the default response code is `status_code`.
@app.get("/blog", status_code=status.HTTP_404_NOT_FOUND)
def blog():
    # Stub: the handler body is empty, so it returns None.
    pass

### Engineering

In [None]:
target_tokens = tokens[1:]

In [None]:
W_U = model.W_U
W_U_correct_tokens = W_U[:, target_tokens]

In [None]:
_, cache = model.run_with_cache(tokens)

In [None]:
embed = cache[hook_name]

In [30]:
from einops import einsum

In [None]:
# Per-position contribution of the embedding to the correct next-token logit:
# each position's vector is dotted with the unembedding column of its target token.
# einops.einsum requires the pattern string as its last argument.
# NOTE(review): the pattern assumes W_U_correct_tokens is (d_model, pos) and
# embed[:-1] is (pos, d_model), i.e. the cached activation has no batch axis — confirm.
embed_attributions = einsum(
    W_U_correct_tokens,
    embed[:-1],
    "d_model pos, pos d_model -> pos"
)

In [None]:
tokens = model.to_tokens(repeated_text)

In [None]:
# Run the model on the tokens and keep the activation cache; to_tokens() only
# tokenizes and does not return a (logits, cache) pair.
_, cache = model.run_with_cache(tokens)

In [None]:
induction_heads = [(6, 9), (4, 2)]

In [None]:
attention_patterns = []

In [31]:
from transformer_lens.utils import get_act_name

In [None]:
# Collect the attention pattern of every (head_idx, layer_idx) pair listed in
# induction_heads, taken from batch element 0 of the cached activations.
# The list is iterated directly: range() does not accept a list.
for head_idx, layer_idx in induction_heads:
    hook_name = get_act_name("attn", layer_idx)
    attention_patterns.append(cache[hook_name][0, head_idx])

In [32]:
from transformer_lens import HookedTransformer, HookedTransformerConfig

In [None]:
config = HookedTransformerConfig(**params)

In [None]:
model = HookedTransformer(cfg=config)

In [None]:
slices = tensor.unbind(dim=0)

In [None]:
step 1: prompt yes and no
step 2:

In [None]:
unembed(ln_final(final_residual_stream))

In [None]:
# Run the model and cache only the activations whose hook name ends with "mlp_out".
# run_with_cache forwards its positional arguments to the model, so the input
# tokens have to be passed explicitly.
model.run_with_cache(
    tokens,
    names_filter=lambda x: x.endswith("mlp_out")
)

In [None]:
_, cache = model.run_with_cache(
    tokens
)

In [33]:
from transformer_lens.utils import get_act_name

In [34]:
hook_name = get_act_name("pattern", 0, "attn")

In [35]:
hook_name

'blocks.0.attn.hook_pattern'

In [None]:
def patch(activations, hook):
    """Overwrite index 1 of the second axis with the matching slice of the clean run.

    Args:
        activations: Activation tensor passed to the hook; modified in place.
        hook: Hook point; ``hook.name`` selects the entry of ``clean_acts``.

    Returns:
        The patched ``activations`` tensor.
    """
    # Assignment is a statement, so it cannot be combined with `return`.
    activations[:, 1, :] = clean_acts[hook.name][:, 1, :]
    return activations

In [None]:
# Forward pass with the `patch` hook attached at hook_name. Temporary forward
# hooks are accepted by run_with_hooks (which returns the model output), not by
# run_with_cache.
patched_logits = model.run_with_hooks(
    clean_tokens,
    fwd_hooks=[(hook_name, patch)]
)

In [36]:
from itertools import product

In [None]:
combinations = list(product(range(n_heads), range(n_layers)))

In [None]:
corrupted_prompt = "John told Mary: 'Persistence is all you need.' Mary replied back to "

In [None]:
clean_tokens = model.to_tokens(prompt)

In [None]:
corrupted_tokens = model.to_tokens(corrupted_prompt)

In [None]:
# Run the model on the corrupted prompt and keep only the activation cache.
_, corrupted_activations = model.run_with_cache(
    corrupted_tokens
)

In [37]:
head_idx, layer_idx = 6, 9

In [None]:
hook_name = get_act_name("attn", layer_idx)

In [None]:
corrupted_head = corrupted_activations[hook_name][0, head_idx, :, :]

In [38]:
def patch_corrupted_head(activations, hook):
    """Replace one head's slice of ``activations`` with ``corrupted_head``.

    Args:
        activations: Activation tensor passed to the hook; modified in place at
            batch element 0 and head ``head_idx`` (a module-level variable).
        hook: Hook point (unused).

    Returns:
        The patched ``activations`` tensor.
    """
    # `activations` is already the tensor for this hook point, so it is indexed
    # directly rather than by hook name.
    activations[0, head_idx, :, :] = corrupted_head
    return activations

In [None]:
# Cache the activations of a clean run in which the chosen head is patched.
# run_with_cache takes no fwd_hooks argument, so the hook is attached with
# add_hook first; the hook defined in this notebook is patch_corrupted_head.
model.add_hook(hook_name, patch_corrupted_head)
try:
    _, patched_activations = model.run_with_cache(
        clean_tokens
    )
finally:
    # Always detach the hook so later runs are not patched by accident.
    model.reset_hooks()

In [None]:
corrupted_receiver = patched_activations[receiver_hook_name]

In [None]:
def patch_corrupted_receiver(activations, hook):
    """Hook that discards ``activations`` and returns the module-level ``corrupted_receiver``."""
    return corrupted_receiver

In [None]:
# Forward pass on the clean tokens with the receiver activation replaced.
# run_with_hooks accepts fwd_hooks and returns only the model output, so there
# is no cache to unpack.
logits = model.run_with_hooks(
    clean_tokens,
    fwd_hooks=[(receiver_hook_name, patch_corrupted_receiver)]
)

In [None]:
target_token = model.to_single_token(" John")

In [None]:
def compute_logit_diff(clean_logits, corrupted_logits, target_token):
    """Difference of the target-token logit at the last position, clean minus corrupted.

    Args:
        clean_logits: Three-axis logits tensor of the clean run.
        corrupted_logits: Three-axis logits tensor of the corrupted run.
        target_token: Index along the last axis.

    Returns:
        A tensor with one difference per entry of the first axis.
    """
    # Index the last axis with the token id; `[:, -1, :][target_token]` would
    # index the first axis instead.
    clean_target = clean_logits[:, -1, target_token]
    corrupted_target = corrupted_logits[:, -1, target_token]
    return clean_target - corrupted_target

In [None]:
logits = unembed @ final_residual_stream

final_residual = embed + layer_1 + layer_2

logits = unembed @ (embed + layer_1 + layer_2)
= unembed @ embed + unembed @ layer_1 + unembed @ layer_2

In [41]:
import torch.distributed.rpc as rpc
from torch.distributed.rpc import RRef

In [42]:
class Agent:
    """Object that stores an ``RRef`` to itself in ``self.id``."""
    def __init__(self):
        # Wrap this instance in a torch.distributed.rpc RRef.
        self.id = RRef(self)

In [44]:
from einops import einsum

In [None]:
output = einsum(
    x, y, "b d, b d ->"
)

In [None]:
rpc.get_worker_info(worker_id)

In [None]:
# Call init("heloo") on the object behind agent_rref through the RRef's
# rpc_sync() proxy.
agent_rref.rpc_sync().init("heloo")

In [None]:
class Observer:
    """Steps an environment using actions obtained from a remote agent."""

    def __init__(self, env):
        """Keep the environment and create an ``RRef`` to this observer."""
        self.env = env
        self.id = RRef(self)

    def run_episode(self, agent_rref, max_steps=69):
        """Run one episode of at most ``max_steps`` steps.

        Args:
            agent_rref: Reference whose ``rpc_sync()`` proxy exposes
                ``select_action(observer_id=..., state=...)``.
            max_steps: Upper bound on the number of environment steps
                (defaults to the previously hard-coded 69).
        """
        # Use the observer's own environment rather than a global `env`.
        state, _ = self.env.reset()

        for _ in range(max_steps):
            action = agent_rref.rpc_sync().select_action(
                observer_id=self.id,
                state=state
            )

            # The chosen action has to be handed to the environment.
            state, reward, done, _ = self.env.step(action)

            if done:
                break

In [46]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
text_similarities = text_embeddings @ text_embeddings.T
image_similarities = image_embeddings @ image_embeddings.T

In [None]:
target = F.softmax(
    (text_similarities + image_similarities) / (2*temperature),
    dim=-1
)

In [None]:
image encoder, text encoder, project

In [None]:
class DotProduct(nn.Module):
    """Look up user and movie factor columns and multiply them elementwise."""

    def forward(self, x):
        """Return the elementwise product of the selected factor columns.

        ``x[0]`` selects columns of ``self.user_factors`` and ``x[1]`` selects
        columns of ``self.movie_factors``; no reduction is applied to the product.
        """
        user_index, movie_index = x[0], x[1]
        selected_users = self.user_factors[:, user_index]
        selected_movies = self.movie_factors[:, movie_index]
        return selected_users * selected_movies

In [None]:
image > image encoder > embedding

In [None]:
RUN mkdir /home/app

In [None]:
image encoder, text encoder, projection, contrastive loss

In [None]:
cell, forget, output, input

In [None]:
import jax

In [None]:
vmapped_multiply = jax.vmap(multiply)

In [None]:
# NOTE(review): `vmapped_multipl` is not defined in the visible cells (an earlier
# cell defines `vmapped_multiply`); this line looks truncated — confirm the intended call.
results = vmapped_multipl

In [48]:
from einops import rearrange

In [None]:
# Split the two trailing axes of `images` into (n_h, p_h) and (n_w, p_w) and
# merge p_h, p_w and c into a single last axis.
# NOTE(review): no axis lengths (e.g. p_h / p_w) are passed as keyword arguments,
# so the composite axes cannot be decomposed as written — confirm the intended patch size.
output = rearrange(
    images,
    "bs c (n_h p_h) (n_w p_w) -> bs n_h n_w (p_h p_w c)"
)