### ML Engineering

In [None]:
from pymongo import MongoClient

In [None]:
# Open a MongoDB client for the deployment addressed by URL.
# NOTE(review): URL is not defined in the visible cells; it must exist before this runs.
client = MongoClient(URL)

In [None]:
from airflow.decorators import dag

In [None]:
# Declare a DAG with the decorator API. The DAG arguments are passed to the
# decorator call itself; the decorated function only holds the DAG body.
# NOTE: dag_id and start_date must be defined before this cell runs.
@dag(dag_id=dag_id, start_date=start_date)
def example():
    pass

In [None]:
from airflow import DAG
from airflow.operators.bash import BashOperator

In [None]:
# Minimal DAG holding a single bash task.
# A DAG is identified by dag_id (task_id is an operator argument), so the
# DAG is configured the same way as the other DAG cells in this notebook.
# NOTE: dag_id and start_date must be defined before this cell runs.
with DAG(
    dag_id=dag_id,
    start_date=start_date
) as dag:
    # Instantiating the operator inside the `with` block attaches it to `dag`.
    task = BashOperator(
        task_id="x",
        bash_command="echo hello world"
    )

In [None]:
from airflow.operators.python import PythonOperator

In [None]:
def task_1(ti):
    """Publish the value 2 to XCom under the key "x".

    Args:
        ti: Object exposing ``xcom_push(key, value)``.
    """
    xcom_key, xcom_value = "x", 2
    ti.xcom_push(xcom_key, xcom_value)

In [None]:
def task_2(ti):
    """Read "x" as pushed by ``task_1``, add 3, and publish the sum as "y".

    Args:
        ti: Object exposing ``xcom_pull(task_ids=..., key=...)`` and
            ``xcom_push(key, value)``.
    """
    # The producing task is selected with `task_ids` and the entry with `key`.
    # With a single task id the pulled value is the pushed object itself,
    # so it is used directly rather than indexed.
    x = ti.xcom_pull(task_ids="task_1", key="x")
    y = x + 3
    ti.xcom_push("y", y)

In [None]:
# Two-task DAG: task_1 pushes a value to XCom, task_2 consumes it.
with DAG(
    dag_id=dag_id,
    start_date=start_date
) as dag:
    # Operators must be constructed with keyword arguments (task_id=...).
    # NOTE: these assignments rebind the names task_1 / task_2 from the
    # callables to the operators, so re-running this cell without re-running
    # the function definitions would pass an operator as python_callable.
    task_1 = PythonOperator(task_id="task_1", python_callable=task_1)
    task_2 = PythonOperator(task_id="task_2", python_callable=task_2)

    # task_1 runs before task_2.
    task_1 >> task_2

In [None]:
from datetime import datetime

In [None]:
# Single-task DAG that runs say_hello (defined outside the visible cells).
with DAG(dag_id="nnn", start_date=datetime(2023, 5, 31)) as dag:
    # Operators must be constructed with keyword arguments (task_id=...).
    task = PythonOperator(task_id="task", python_callable=say_hello)

In [None]:
from airflow.decorators import dag

In [None]:
# Decorator-based DAG declaration that sets only start_date; the body is empty.
# NOTE(review): no dag_id is passed here — presumably Airflow falls back to
# the function name; confirm against the Airflow version in use.
@dag(start_date=start_date)
def fck():
    pass

In [None]:
from airflow.decorators import dag, task

In [None]:
# TaskFlow-style DAG: two producer tasks feed a single consumer task.
@dag(dag_id=dag_id, start_date=start_date)
def example():
    @task
    def get_name():
        pass

    @task
    def get_age():
        pass

    @task
    def greet(name, age):
        pass

    # Passing the producers' results to greet() wires the dependencies:
    # get_name and get_age are invoked first (in that order), then greet.
    greet(get_name(), get_age())

In [None]:
from metaflow import FlowSpec, step

In [None]:
class CountFlow(FlowSpec):
    """Fan out over ``words``, measure each word's length, then join."""

    @step
    def start(self):
        # `words` must be defined outside this class (not in the visible cells).
        self.words = words
        # Run one `count` branch per element of self.words.
        self.next(self.count, foreach="words")

    @step
    def count(self):
        # self.input is the element assigned to this foreach branch.
        self.length = len(self.input)
        self.next(self.join)

    @step
    def join(self, inputs):
        # A step that joins foreach branches receives them as `inputs`;
        # gather the per-branch results computed in `count`.
        self.lengths = [branch.length for branch in inputs]
        self.next(self.end)

    @step
    def end(self):
        # Terminal step of the flow.
        pass

In [None]:
import pytest

In [None]:
# Check square() against (value, expected) pairs.
# The argument names avoid shadowing the builtin `input`.
@pytest.mark.parametrize(
    "value, expected",
    [(1, 1), (2, 4)],
)
def test_square(value, expected):
    assert square(value) == expected

In [None]:
# Fan-in dependency: task_3 and task_2 are both set upstream of task_1.
# NOTE(review): task_3 is not defined in the visible cells.
[task_3, task_2] >> task_1

### Engineering

In [None]:
step 1: create the data on cpu
step 2: reserve a portion in memory in gpu
step 3: copy the data from cpu to gpu
step 4: determine ___
step 5: launch a new kernel
step 6: execute
step 7: copy the results from gpu to cpu
step 8: memory deallocation

In [None]:
grid > thread block > thread

In [None]:
global memory

In [None]:
step 1: replicate the model
step 2: mini-batch -> micro-batch
step 3: do forward and backward
step 4: average the gradient
step 5: update according to the average gradient

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
class VocabParallelEmbedding(nn.Module):
    """Embedding table whose vocabulary rows are sharded across ranks.

    Each rank owns a contiguous slice of ``num_embeddings // world_size``
    rows. In ``forward`` every rank embeds only the tokens that fall inside
    its slice, emits zeros for all other tokens, and the partial results are
    summed across ranks so each rank ends up with the full embeddings.

    When ``torch.distributed`` is unavailable or no process group has been
    initialized, the module behaves as a single partition (world size 1,
    rank 0) and no collective is issued.

    Args:
        num_embeddings: Total vocabulary size; must be divisible by the
            world size.
        embedding_dim: Size of each embedding vector.

    Raises:
        ValueError: If ``num_embeddings`` is not divisible by the world size.
    """

    def __init__(self, num_embeddings, embedding_dim):
        super().__init__()
        self.world_size = (
            torch.distributed.get_world_size() if self._is_distributed() else 1
        )
        if num_embeddings % self.world_size != 0:
            raise ValueError(
                f"num_embeddings ({num_embeddings}) must be divisible by "
                f"the world size ({self.world_size})"
            )

        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        # The vocabulary (rows) is partitioned, not the embedding dimension.
        self.num_embeddings_per_partition = num_embeddings // self.world_size

        # Left uninitialized, as in the original; callers are expected to
        # initialize or load the weights.
        self.weight = nn.Parameter(torch.empty(
            self.num_embeddings_per_partition,
            self.embedding_dim
        ))
        self.vocab_start_idx, self.vocab_end_idx = self.get_vocab_range(
            self.num_embeddings_per_partition
        )

    @staticmethod
    def _is_distributed():
        """Return True when a torch.distributed process group is usable."""
        return (
            torch.distributed.is_available()
            and torch.distributed.is_initialized()
        )

    def get_vocab_range(self, num_embeddings_per_partition):
        """Return the half-open ``[start, end)`` vocabulary range of this rank."""
        rank = torch.distributed.get_rank() if self._is_distributed() else 0
        start_idx = rank * num_embeddings_per_partition
        end_idx = start_idx + num_embeddings_per_partition
        return start_idx, end_idx

    def forward(self, tokens):
        """Embed ``tokens`` (an integer tensor of token ids).

        Returns a tensor of shape ``tokens.shape + (embedding_dim,)``.
        ``tokens`` itself is not modified.
        """
        # True where the token belongs to another rank's slice; the range is
        # half-open, so vocab_end_idx itself is already out of range.
        out_of_range = (
            (tokens < self.vocab_start_idx) | (tokens >= self.vocab_end_idx)
        )

        # Shift global token ids to row indices of the local weight shard.
        # The subtraction allocates a new tensor, so the caller's `tokens`
        # stays intact. Out-of-range ids are pointed at row 0 only to keep
        # the lookup in bounds; their output is zeroed right after.
        local_tokens = tokens - self.vocab_start_idx
        local_tokens[out_of_range] = 0

        embeddings = F.embedding(local_tokens, weight=self.weight)
        # Zero the rows for tokens this rank does not own, so that the sum
        # across ranks contains exactly one contribution per token.
        embeddings = embeddings.masked_fill(out_of_range.unsqueeze(-1), 0.0)

        if self.world_size > 1:
            torch.distributed.all_reduce(embeddings)

        return embeddings

In [None]:
step 1: keep both an fp16 copy and an fp32 copy of the weights
step 2: do forward pass in fp16
step 3: update 

In [None]:
import os

In [None]:
class MPU:
    """Initialize the default ``torch.distributed`` process group.

    Does nothing when a process group is already initialized. Otherwise the
    rank and world size are read from the ``RANK`` and ``WORLD_SIZE``
    environment variables, the rendezvous address is exported through
    ``MASTER_ADDR`` / ``MASTER_PORT``, the process group is created, and, if
    CUDA devices are visible, the current device is set to
    ``rank % device_count``.

    Args:
        master_addr: Address of the rank-0 host.
        master_port: Port on the rank-0 host (int or str).
        backend: Backend name passed to ``init_process_group``.

    Raises:
        RuntimeError: If ``RANK`` or ``WORLD_SIZE`` is not set.
        ValueError: If ``RANK`` or ``WORLD_SIZE`` is not an integer.
    """

    def __init__(self, master_addr, master_port, backend):
        if torch.distributed.is_initialized():
            return

        try:
            # Environment values are strings; the process group needs ints.
            rank = int(os.environ["RANK"])
            world_size = int(os.environ["WORLD_SIZE"])
        except KeyError as err:
            raise RuntimeError(
                f"environment variable {err.args[0]} must be set"
            ) from err

        # os.environ only accepts strings, so a numeric port is converted.
        os.environ["MASTER_ADDR"] = str(master_addr)
        os.environ["MASTER_PORT"] = str(master_port)

        torch.distributed.init_process_group(
            rank=rank,
            world_size=world_size,
            backend=backend
        )

        # Map the rank onto one of the locally visible GPUs, if any.
        device_count = torch.cuda.device_count()
        if device_count > 0:
            torch.cuda.set_device(rank % device_count)

In [None]:
from torch.utils.data import Dataset

In [None]:
class CachedDataset:
    """Lazily loaded dataset with a flat, contiguous cache of selected items.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, filename):
        self.filename = filename
        # Loaded with torch.load on the first prefetch that needs it.
        self.data = None
        # 1-D tensor holding the flattened cached items back to back.
        self.cache = None
        # Maps a cached item index to its start offset inside self.cache.
        self.cache_index = {}

    def prefetch(self, idxs):
        """Copy the items at ``idxs`` into one contiguous 1-D cache tensor.

        If every requested index is already cached, nothing happens.
        Otherwise the previous cache is discarded and rebuilt to contain
        exactly the requested items, in first-occurrence order.

        Args:
            idxs: Iterable of hashable indices into the loaded data.
        """
        # Materialize once (idxs may be a one-shot iterator) and drop
        # duplicates so no item is stored twice.
        idxs = list(dict.fromkeys(idxs))
        if all(i in self.cache_index for i in idxs):
            return

        # `not tensor` is ambiguous for multi-element tensors; test for None.
        if self.data is None:
            self.data = torch.load(self.filename)

        items = [self.data[i] for i in idxs]
        total_elements = sum(item.numel() for item in items)
        # idxs is non-empty here (an empty request returned early above).
        self.cache = torch.empty(total_elements, dtype=items[0].dtype)
        self.cache_index.clear()

        offset = 0
        for i, item in zip(idxs, items):
            n_elements = item.numel()
            # Flatten so items of any shape fit the 1-D cache slice.
            self.cache[offset:offset + n_elements] = item.reshape(-1)
            self.cache_index[i] = offset
            offset += n_elements

In [None]:
from torch.profiler import ProfilerActivity, profile

In [None]:
# Print the shape of every parameter held by the optimizer, one parameter
# group at a time. `optimizer` is defined outside the visible cells.
for param_group in optimizer.param_groups:
    for param in param_group["params"]:
        print(param.shape)

In [None]:
# Tokenize repeated_text with the model's own to_tokens helper.
# `model` and `repeated_text` are defined outside the visible cells.
tokens = model.to_tokens(repeated_text)

In [None]:
# Run the model on the tokens and keep only the activation cache; the first
# element of the returned pair is discarded.
_, cache = model.run_with_cache(tokens)

In [None]:
# Pairs consumed by the loop further down, which unpacks each one as
# (head_idx, layer_idx).
induction_heads = [(6, 9), (4, 2)]

In [None]:
# Empty list; presumably meant to collect one attention pattern per head.
attn_patterns = []

In [None]:
for head_idx, layer_idx in induction_heads:
    layer_attention_pattern = cache["attn", layer_idx]
    attention_pattern = layer_attention_pattern[0, head_idx]