### Engineering

In [1]:
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
import torch.distributed as dist
import torch.distributed.rpc as rpc

In [None]:
kubelet, kube-proxy, container runtime

In [None]:
inputs = inputs.view(-1)

In [None]:
probs = F.softmax(switch(inputs), dim=-1)

In [None]:
_, idxs = torch.max(probs, dim=-1)

In [None]:
step 1: mask input
step 2: calcualte local embedding
step 3: calculate global embedding
step 4: sum

In [None]:
broadcast > gather

In [None]:
def by_row_parallelism(inputs, weights):
    inp_per_partition = inputs.shape[-1] // 2
    w_per_partition = weights.shape[0] // 2
    
    inp1 = inputs[:, :inp_per_partition]
    inp2 = inputs[:, inp_per_partition:]
    w1 = weights[:w_per_partition, :]
    w2 = weights[w_per_partition:, :]
    
    out1 = inp1 @ w1
    out2 = inp2 @ w2
    
    return out1 + out2

In [None]:
step 1: wait data
step 2: construct task
step 3: put
step 4: execute
step 5: wait
step 6: get the output
step 7: put

In [None]:
step 1: 
step 2: resize embedding size, and lm_ea

In [None]:
job selection > initial workers > pool watcher

In [3]:
import threading

In [4]:
event = threading.Event()

In [5]:
def run_worker():
    event.wait()
    print("received")

In [None]:
worker_thread = threading.Thread(
    target=run_worker
)

In [None]:
worker_thread.start()

In [None]:
ready, running, suceed, failed, cooldown, blacklist

In [8]:
from typing import overload, List

In [9]:
@overload
def getitem(x: str) -> str:
    pass

In [10]:
@overload
def getitem(x: List[int]) -> int:
    pass

In [None]:
scatter > reduce > iden

In [11]:
import torch.nn.functional as F

In [13]:
top1_idx = torch.tensor([1, 3, 2])

In [17]:
mask = F.one_hot(top1_idx, num_classes=5)

In [21]:
mask

tensor([[0, 1, 0, 0, 0],
        [0, 0, 0, 1, 0],
        [0, 0, 1, 0, 0]])

In [20]:
torch.mean(mask.float(), dim=0)

tensor([0.0000, 0.3333, 0.3333, 0.3333, 0.0000])

In [None]:
def intervene_resid(resid, position, feature):
    feature /= feature.norm(dim=-1)
    feature_projection = resid[0, position] @ feature
    resid[]

In [None]:
receiver_heads = [(7, 3), (7, 9), (8, 6), (8, 10)]

In [None]:
receiver_layers = [7, 8]

In [None]:
_, clean_cache = model.run_with_cache(clean_tokens)
_, corrupted_cache = model.run_with_cache(corrupted_tokens)

In [27]:
def patch_sender_head(resid, hook, clean_cache, corrupted_cache, target_head):
    trg_head_idx, trg_layer_idx = target_head
    
    if hook.layer() == trg_layer_idx:
        resid[:, :, head_idx] = corrupted_cache[hook.name][:, :, head_idx]
    else:
        resid = clean_cache[hook.name]
    
    return resid

In [23]:
from itertools import product

In [None]:
sender_heads = list(product(range(n_heads), range(max(receiver_layers))))

In [25]:
from transformer_lens.utils import get_act_name
from functools import partial

In [None]:
for head_idx, layer_idx in sender_heads:
    hook_name = get_act_name("z", layer_idx)
    hook_func = partial(
        patch_sender_head,
        clean_cache=clean_cache,
        corrupted_cache=corrupted_cache,
        target_head=(head_idx, layer_idx)
    )
    
    model.add_hook(hook_name, hook_func)
    _, patched_cache = model.run_with_cache(tokens)
    
    # filter

In [None]:
def probability_scores(image_embedding, text_embedding):
    image_embedding /= image_embedding.norm()

In [35]:
# Define function to calculate memory, FLOPs, and training time
def calculate_training_metrics(tensor_parallel_size, data_parallel_size, pipeline_parallel_size,
                               num_layers, num_params, num_heads, hidden_size, throughput_gpu,
                               context_length, batch_size, num_epochs):
    # Constants
    bytes_per_param_fp32 = 4  # fp32 uses 4 bytes per parameter
    
    num_gpus = tensor_parallel_size*data_parallel_size*pipeline_parallel_size
    
    # Memory Calculations
    ## Model Memory
    model_memory = num_params * bytes_per_param_fp32
    ## Optimizer Memory (Vanilla AdamW uses 12 bytes per parameter)
    optimizer_memory = 12 * num_params
    ## Activation Memory
    ### Using equation for memory_activations^Selective Recomputation
    activation_memory = context_length * batch_size * hidden_size * num_layers * (10 + 24 / tensor_parallel_size) * \
    (5*((num_heads*context_length)/(hidden_size*tensor_parallel_size)))
    ## Gradient Memory (Stored in fp32)
    gradient_memory = num_params * bytes_per_param_fp32
    ## Total Training Memory (3D-parallelism with ZeRO-1)
    total_memory = (model_memory / (pipeline_parallel_size * tensor_parallel_size)) + \
                   (optimizer_memory / num_gpus) + \
                   (activation_memory / tensor_parallel_size) + \
                   (gradient_memory / pipeline_parallel_size)

    # FLOPs Calculations
    ## Using C = tau * T => T = C / tau
    ### Forward Pass FLOPs
    forward_flops = 2 * num_params * (context_length * batch_size)
    ### Backward Pass FLOPs
    backward_flops = 4 * num_params * (context_length * batch_size)
    ## Total FLOPs per Epoch
    total_flops_per_epoch = (forward_flops + backward_flops)
    ## Total Training Time per Epoch in seconds
    training_time_per_epoch = total_flops_per_epoch / (throughput_gpu * 1e12)
    ## Total Training Time for all Epochs in seconds
    total_training_time = training_time_per_epoch * num_epochs

    # Print Results
    print(f"\nResults:")
    print(f"Model Memory: {model_memory / 1e9} GB")
    print(f"Optimizer Memory: {optimizer_memory / 1e9} GB")
    print(f"Activation Memory: {activation_memory / 1e9} GB")
    print(f"Gradient Memory: {gradient_memory / 1e9} GB")
    print(f"Total Training Memory: {total_memory / 1e9} GB")
    print(f"Total FLOPs per Epoch: {total_flops_per_epoch:.2e}")
    print(f"Training Time per Epoch: {training_time_per_epoch:.2f} seconds")
    print(f"Total Training Time: {total_training_time:.2f} seconds")

In [42]:
calculate_training_metrics(
    tensor_parallel_size=2,
    data_parallel_size=2,
    pipeline_parallel_size=2,
    num_layers=12,
    num_params=125e9,
    num_heads=12,
    hidden_size=768,
    throughput_gpu=120,
    context_length=512,
    batch_size=9765,
    num_epochs=25
)


Results:
Model Memory: 500.0 GB
Optimizer Memory: 1500.0 GB
Activation Memory: 20273.9023872 GB
Gradient Memory: 500.0 GB
Total Training Memory: 10699.4511936 GB
Total FLOPs per Epoch: 3.75e+18
Training Time per Epoch: 31248.00 seconds
Total Training Time: 781200.00 seconds


In [None]:
tokens = model.to_tokens(text)

In [None]:
for layer_idx in range(n_layers):
    for head_idx in range(n_heads):
        _, cache = model.run_with_cache(tokens)
        target_pattern = cache["pattern", layer_idx][:, head_idx]

In [None]:
clean_tokens = model.to_tokens(clean_prompts)

In [None]:
correct_tokens = modell.to_tokens("Mary Tom", prepend_bos=False)
incorrect_tokens = modell.to_tokens("John James", prepend_bos=False)

In [None]:
def compute_ioi_metric(logits):
    pass

In [None]:
clean_logits = model.

In [None]:
class DotProduct(nn.Module):
    def forward(self, x):
        

In [None]:
state, reward, done, truncated, info = env.step()

while done:
    #reward
    # max action
    # take action
    #predicte the enxt reward
    #target
    # loss
    # update

In [None]:
def convert_to_distributions(states):
    q_values = q_function(states)
    z = q_values.sum(dim=-1)