Here is a mathematical explanation for why representing latent features as orthogonal vectors makes reconstruction easier in an autoencoder:

Let's say our autoencoder has an encoder function $f$ that maps the input $x$ to a latent representation $z = f(x)$, and a decoder $g$ that reconstructs the input as $\hat{x} = g(z)$.

If the dimensions of $z$ are orthogonal unit vectors $z_1, z_2, ..., z_n$, then we can write:

$z = a_1 z_1 + a_2 z_2 + ... + a_n z_n$

where $a_i$ are scalar coefficients that indicate the strength of each latent feature $z_i$ in the representation of $x$.

Since the $z_i$ vectors are orthogonal (dot product is 0), we have:

$z_i^\top z_j = 0$ for $i \neq j$

and because they are unit vectors:

$z_i^\top z_i = 1$

This allows us to easily "decode" the contribution of each latent feature via:

$a_i = z_i^\top z$

So the decoder can reconstruct $x$ as:

$\hat{x} = g(z) = g(a_1 z_1 + ... + a_n z_n)$

$= a_1 g(z_1) + ... + a_n g(z_n)$

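where $g(z_i)$ represents the contribution of the $i$-th latent feature to the reconstruction. Note that the second equality holds only when the decoder $g$ is linear (e.g. $g(z) = Wz$); for a nonlinear decoder it is at best an approximation.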

The orthogonality allows the decoder to cleanly disentangle and decode the contribution of each latent dimension, making reconstruction simpler. If the latent features overlapped, it would be harder to isolate each one's individual contribution to the input.

### Engineering

In [1]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
step 1: partition
step 2: forward: gather, do the forward, discard
step 3: backward: gather, do the backward, discard
step 4: 

In [None]:
api server, scheduler, controller manager, etcd

In [None]:
grd

step 1: init batch, processed data
step 2: unprocessed data
step 3: num of workers
step 4: partitioning

In [None]:
step 1: F(0, 0)
step 2: F(0, 1), F(1, 0)
step 3: F(1, 1), F(2, 0)
step 4: F(2, 1)

In [3]:
def compute_total_memory(model):
    """Return the number of bytes occupied by the parameters of ``model``.

    Each parameter contributes ``numel() * element_size()`` bytes, i.e. its
    element count times the per-element size of its dtype.  Only tensors
    yielded by ``model.parameters()`` are counted.

    Args:
        model: Any object exposing ``parameters()`` that yields tensors
            (e.g. a ``torch.nn.Module``).

    Returns:
        int: Total parameter size in bytes (0 when there are no parameters).
    """
    # `param.storage` is a method, so the former `param.storage.size()`
    # raised AttributeError; `element_size()` is the bytes-per-element value
    # the byte count actually needs.
    return sum(
        param.numel() * param.element_size() for param in model.parameters()
    )

In [4]:
class HostUpdatedInterrupt(RuntimeError):
    """RuntimeError subclass that carries a ``skip_sync`` value to the handler.

    NOTE(review): the name suggests it is raised when the set of hosts
    changes; confirm against the raising site.

    Args:
        skip_sync: Value stored on the exception as ``self.skip_sync``.
    """

    def __init__(self, skip_sync):
        # Forward to the base initializer so `args` is populated even when
        # `skip_sync` is passed by keyword (it was left empty before).
        super().__init__(skip_sync)
        self.skip_sync = skip_sync

In [None]:
clock_idx+1-n_microbatches

In [None]:
step 1: normalize the loss based on n/n_epochs
step 2: calculate the gradient with respect to the normalized loss
step 3: sum the gradient
step 4: if n =, update, if not, repeat step 1

In [None]:
grid > thread block > thread

In [None]:
class Pipeline:
    """Run micro-batches through partitions by exchanging tasks over queues.

    ``fit`` asks the scheduler for successive schedules and hands each one to
    ``compute``.  A schedule is an iterable of
    ``(microbatch_idx, partrition_idx)`` pairs.

    NOTE(review): ``DetermisticScheduler``, ``spawk_worker`` and ``Task`` are
    not defined in the visible source; their spelling and semantics (including
    which of ``in_queues`` / ``out_queues`` feeds the workers) should be
    confirmed against their definitions.

    Args:
        batches: Indexable collection of micro-batches; each entry must expose
            ``get()``.
        partritions: Indexable collection of callables, one per partition.
        devices: Passed unchanged to ``spawk_worker``.
        scheduler: Object with ``generate(n_batches, n_partritions)``.
            Defaults to a fresh ``DetermisticScheduler()`` per pipeline.
    """

    def __init__(
        self,
        batches, partritions, devices,
        scheduler=None
    ):
        self.batches = batches
        self.partritions = partritions
        self.devices = devices
        # Build the default lazily: a call in the signature would run once at
        # definition time and be shared by every Pipeline instance.
        self.scheduler = (
            DetermisticScheduler() if scheduler is None else scheduler
        )

    def fit(self):
        """Execute every schedule produced by the scheduler."""
        n_batches = len(self.batches)
        n_partritions = len(self.partritions)

        with spawk_worker(self.devices) as (in_queues, out_queues):
            for schedule in self.scheduler.generate(n_batches, n_partritions):
                self.compute(schedule, in_queues, out_queues)

    def compute(self, schedule, in_queues, out_queues):
        """Submit one task per schedule entry, then collect the outputs.

        Args:
            schedule: Iterable of ``(microbatch_idx, partrition_idx)`` pairs.
            in_queues: Per-partition queues the finished tasks are read from.
            out_queues: Per-partition queues the new tasks are put on.
        """
        batches = self.batches
        partritions = self.partritions
        # The schedule is walked twice; materialize it so a generator is not
        # exhausted by the submission pass.
        schedule = list(schedule)

        def compute_func(batch, partrition):
            # Bind this entry's batch and partition into a zero-arg callable.
            def wrapper():
                return partrition(batch)
            return wrapper

        for microbatch_idx, partrition_idx in schedule:
            batch = batches[microbatch_idx].get()
            # Pass the bound closure, not the factory itself, so the task
            # has the batch and partition it is supposed to run.
            task = Task(
                compute=compute_func(batch, partritions[partrition_idx])
            )
            out_queues[partrition_idx].put(task)

        for microbatch_idx, partrition_idx in schedule:
            # Read from the queue of the partition the task was sent to;
            # indexing by micro-batch would pick an unrelated queue.
            output_task = in_queues[partrition_idx].get()
            batches[microbatch_idx] = output_task.output

In [None]:
next_stream.record_event(event)

In [None]:
forward, backward, optimizer-state

In [None]:
grid > thread block > thread

In [None]:
m+n-1

In [None]:
model parameters, optimizer parameters, gradients

In [5]:
tensor_model_parallel_size = 2
num_tensor_model_parallel_groups = 8

In [6]:
# Print the ranks of each tensor-model-parallel group: group `i` owns the
# `tensor_model_parallel_size` consecutive ranks starting at
# `i * tensor_model_parallel_size`.
for i in range(num_tensor_model_parallel_groups):
    ranks = [
        i * tensor_model_parallel_size + offset
        for offset in range(tensor_model_parallel_size)
    ]

    print(ranks)

[0, 1]
[2, 3]
[4, 5]
[6, 7]
[8, 9]
[10, 11]
[12, 13]
[14, 15]


In [7]:
def by_column_parallel(inputs, weights):
    """Compute ``inputs @ weights`` by splitting ``weights`` into two column shards.

    The weight matrix is cut in half along its last axis, each half is
    multiplied with ``inputs`` separately, and the two partial outputs are
    joined back along the last axis.

    Args:
        inputs: Tensor whose last dimension matches ``weights.shape[0]``.
        weights: 2-D tensor; it is sliced as ``weights[:, a:b]``.

    Returns:
        Tensor equal to ``torch.matmul(inputs, weights)``.
    """
    partrition_size = weights.shape[-1] // 2
    w1 = weights[:, :partrition_size]
    w2 = weights[:, partrition_size:]

    out1 = torch.matmul(inputs, w1)
    out2 = torch.matmul(inputs, w2)

    # Each shard yields a slice of the output columns, so the pieces belong
    # side by side on the last axis; the default dim=0 stacked them as rows.
    return torch.cat([out1, out2], dim=-1)

In [None]:
forward(x) > out = forward(x) > backward(out)

In [None]:
loss = loss_func(output, labels)

In [None]:
step 1: gather
step 2: do backward
step 3: discard
step 4: average 

In [None]:
x.record_stream(prev_stream)
x.record_stream(default_stream)

In [None]:
step 1: record the elapsed time
step 2: determine the number of layers in each partition
step 3: 

### MechInterp

In [8]:
from transformer_lens.utils import get_act_name

In [None]:
head_names = [get_act_name("result", layer_idx) for layer_idx in range(n_layers)]
mlp_names = [get_act_name("mlp_out", layer_idx) for layer_idx in range(n_layers)]

In [None]:
# Flat list of every hook name to cache. All pieces are joined with `+`:
# writing `..., mlp_names` instead builds a 2-tuple of lists, and a string
# membership test against that tuple never matches.
all_hook_names = ["hook_embed", "hook_pos_embed"] + head_names + mlp_names

In [None]:
_, cache = model.run_with_cache(
    tokens,
    names_filter=lambda x: x in all_hook_names
)

In [None]:
input_components = torch.tensor([cache["embed"] + cache["pos_embed"]]).unsqueeze(0)

In [None]:
for head_name, mlp_name in zip(head_names, mlp_names):
    input_components = torch.cat([
        input_components,
        cache[head_name],
        cache[mlp_name]
    ])

In [None]:
W_U = model.W_U
logit_diff_direction = W_U[:, 0] - W_U[:, 1]

In [None]:
pre_final_ln_direction = coeff.T @ logit_diff_direction

In [9]:
from einops import einsum

In [None]:
contributions = einsum(
    input_components,
    pre_final_ln_direction,
    ""
)

In [None]:
# Scale every row of W to unit length so that W_normalized @ W_normalized.T
# yields cosine similarities. Without `dim`, `norm` reduces over the whole
# tensor and all rows are merely divided by one shared scalar.
W_normalized = W / W.norm(dim=-1, keepdim=True)

In [None]:
similarities = W_normalized @ W_normalized.T

In [None]:
fig.add_trace()

In [None]:
features are linear representations
features are represented as directions

In [None]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=df_stocks.date, y=df_stocks.AAPL))

In [None]:
step 1: logit_diff_direction = W_U[0] - W_U[1]
step 2: approximate layer norm
step 3: backward transformation
step 4: project the pre layer norm to logit_diff_direction

In [11]:
from transformer_lens import HookedTransformerConfig, HookedTransformer

In [None]:
config = HookedTransformerConfig(**params)

In [None]:
model = HookedTransformer(cfg=config)

In [None]:
q, k, v, pattern, output

In [None]:
import 

step 1: rewrite probs as logit difference
step 2: decompose the logits
step 3: calculate the logit difference direction
step 4: project the output of each input components to the logit diff direction

In [None]:
W_OV

In [12]:
n_features = 5

In [None]:
interference[
    torch.arange(n_features),
    torch.arange(n_features)
] = 0.

In [None]:
polysemanticity = interference.pow(2).sum(-1).sqrt()

In [None]:
step 1: x = embed + pos_embed + sum(12 heads)
step 2: x@W_Q
step 3: [embed + pos_embed + sum(12 heads)]@W_Q
step 4...

In [None]:
model.W_E

In [None]:
(their + mine)/2: 

In [13]:
import plotly.express as px

In [None]:
px.line(x=df_stocks.date, y=[])

In [None]:
model.embed(tokens)

In [None]:
step 1: logit_diff_direction = W_U[0] - W_U[1]
step 2: approximate layer norm
step 3: inverse transformation
step 4: project pre ln to logit_diff

In [None]:
cache.accumulated_resid(layer=layer_idx, pos_slice=target_positions)

In [14]:
from typing import List

In [None]:
numbers: List[int] = [1, 2, 3, 4, 5]

In [None]:
public, private, elastic

In [15]:
class Christmas:
    """Toy class with a ``color`` attribute and a ``__missing__`` method."""

    def __init__(self) -> None:
        self.color: str = "red"

    def __missing__(self) -> None:
        """Print a fixed "can't find" message."""
        # NOTE(review): Python only invokes `__missing__` automatically on
        # dict subclasses (as `__missing__(self, key)`); on this plain class
        # it runs only when called explicitly -- confirm the intent.
        print("can't find")

In [25]:
from typing import Sequence

`x` has a fixed size and has no order type

In [30]:
x = [3, 5, "persistence", 41.1, "fuck ya"]

Add type hints to `x`

In [31]:
from typing import Union, List

In [32]:
x: List[Union[str, int, float]] = [3, 5, "persistence", 41.1, "fuck ya"]

In [33]:
x

[3, 5, 'persistence', 41.1, 'fuck ya']

In [34]:
import torch.distributed.rpc as rpc

In [35]:
class Observer:
    """RPC worker that steps an environment with actions from a remote agent.

    Args:
        env: Environment object.  ``reset()`` must return a 2-tuple whose
            first item is the initial state, and ``step(action)`` a 4-tuple
            ``(state, reward, done, info)``.
            NOTE(review): a 2-tuple ``reset`` with a 4-tuple ``step`` mixes
            two environment API generations -- confirm which one is used.
    """

    def __init__(self, env):
        # Worker id assigned by the RPC framework; sent with every request.
        self.id = rpc.get_worker_info().id
        self.env = env

    def run_episode(self, agent_rref, max_steps=69):
        """Play one episode, asking the remote agent for every action.

        Args:
            agent_rref: Reference to the agent; ``agent_rref.rpc_sync()`` must
                expose ``select_action(observer_id, state)``.
            max_steps: Upper bound on the number of environment steps.
        """
        state, _ = self.env.reset()

        for _ in range(max_steps):
            # The rpc_sync() proxy already performs the blocking remote call;
            # wrapping its bound method in another rpc.rpc_sync was redundant
            # and passed a proxy attribute where a function is expected.
            action = agent_rref.rpc_sync().select_action(self.id, state)

            # Use the observer's own environment; the bare `env` name was
            # never defined in this scope.
            state, _reward, done, _ = self.env.step(action)

            if done:
                break

In [None]:
text_target = text_embedding @ text_embedding.T
image_target = image_embedding @ image_embedding.T

target = F.softmax(
    (text_target+image_target)/(2*temperature),
    dim=-1
)

In [None]:
x.repeat((2, 2))

In [36]:
from torch.utils.data import DataLoader, random_split

In [None]:
train_set, test_set = random_split(dataset, lengths=[6, 4])

In [None]:
train_loader = DataLoader(train_set, batch_size=2)
test_loader = DataLoader(test_set, batch_size=2)

In [37]:
import torch.distributed as dist

In [None]:
dist.broadcast(x, src=0, async_op=True)

In [None]:
step 1: partitioning
step 2: gather, do forward pass, discard
step 3: gather, do backward pass, discard
step 4: grad
step 5: update 

In [None]:
owner, id, name

In [38]:
from typing import List, Union

In [39]:
x: List[Union[str, int, float]] = []

In [None]:
x.repeat((3, 2))

In [40]:
from torch.nn.utils import clip_grad_norm_

In [None]:
clip_grad_norm_(param_1, max_norm=GRAD_CLIP_NORM)

In [None]:
def func(x):
    """Tokenize the ``"sentence1"`` field of the record ``x``."""
    sentence = x["sentence1"]
    return tokenize(sentence)

In [None]:
small_dataset.map(func)

In [None]:
from 

In [41]:
import torch

In [42]:
x = torch.randn(1, requires_grad=True)

In [43]:
y = torch.randn(1, requires_grad=True)

In [45]:
(x+y).mean().backward()

In [46]:
x.grad

tensor([1.])