In [1]:
import os
import torch
from torch import nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
import lightning as L

import torch.distributed as dist

  from .autonotebook import tqdm as notebook_tqdm


In [32]:
class Encoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = nn.Sequential(nn.Linear(28 * 28, 64), nn.ReLU(), nn.Linear(64, 3))

    def forward(self, x):
        return self.l1(x)


class Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 28 * 28))

    def forward(self, x):
        return self.l1(x)

In [4]:
class LitAutoEncoder(L.LightningModule):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def training_step(self, batch, batch_idx):
        # training_step defines the train loop.
        x, _ = batch
        x = x.view(x.size(0), -1)
        z = self.encoder(x)
        x_hat = self.decoder(z)
        loss = F.mse_loss(x_hat, x)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

In [6]:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [7]:
dataset = MNIST(os.getcwd(), download=True, transform=transforms.ToTensor())
train_loader = DataLoader(dataset)

100%|██████████████████████████████████████| 9.91M/9.91M [00:00<00:00, 12.7MB/s]
100%|███████████████████████████████████████| 28.9k/28.9k [00:00<00:00, 318kB/s]
100%|██████████████████████████████████████| 1.65M/1.65M [00:00<00:00, 3.09MB/s]
100%|██████████████████████████████████████| 4.54k/4.54k [00:00<00:00, 9.51MB/s]


In [14]:
import torch.utils.data as data

# use 20% of training data for validation
train_set_size = int(len(dataset) * 0.8)
valid_set_size = len(dataset) - train_set_size

# split the train set into two
seed = torch.Generator().manual_seed(42)
train_set, valid_set = data.random_split(dataset, [train_set_size, valid_set_size], generator=seed)

In [16]:
train_loader = DataLoader(train_set)
validation_loader = DataLoader(valid_set)

In [19]:
from lightning.pytorch.callbacks import DeviceStatsMonitor

In [20]:
# model
autoencoder = LitAutoEncoder(Encoder(), Decoder())

# train model
trainer = L.Trainer(callbacks=[DeviceStatsMonitor()])
trainer.fit(model=autoencoder, train_dataloaders=train_loader, val_dataloaders=validation_loader)

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name    | Type    | Params | Mode 
--------------------------------------------
0 | encoder | Encoder | 50.4 K | train
1 | decoder | Decoder | 51.2 K | train
--------------------------------------------
101 K     Trainable params
0         Non-trainable params
101 K     Total params
0.407     Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Epoch 0:   6%|█                  | 2655/48000 [00:06<01:58, 382.49it/s, v_num=4]


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

In [None]:
command = ["./master_command.py", "--retailerid", ','.join([str(rid) for rid in retailerIds]),
                           "--marketplaces", 'Walmart-OPDBS',
                           "--start_weekid", str(weekid), "--end_weekid", str(weekid),
                           "--run_all_steps", "1", "--local", "0", "--branch", "improve-speed-walmart-test"]

In [None]:
"resourceRequirements": [
                      {
                         "type": "MEMORY",
                         "value": str(1024*80)
                      },
                      {
                         "type": "VCPU",
                         "value": "20"
                      }
                   ],

In [24]:
from pytorch_lightning import Trainer

Epoch 0:  17%|██▎           | 10135/60000 [84:04:38<413:40:02,  0.03it/s, v_num=0]
Epoch 5:  35%|█████▋          | 2120/6000 [84:02:12<153:48:10,  0.01it/s, v_num=1]
Epoch 0:   3%|▍             | 1823/60000 [84:01:45<2681:36:18,  0.01it/s, v_num=2]
Epoch 0:   7%|█             | 3555/48000 [83:39:38<1045:56:04,  0.01it/s, v_num=3]


In [2]:
from lightning import Trainer
import sys
import os

sys.path.append(os.path.dirname(os.getcwd()))

from models.llama4.args import ModelArgs
from box import Box

In [3]:
from models.llama4.model import Transformer

In [4]:
import yaml

yaml_config_filepath = '../models/llama4/config.yaml'
with open(yaml_config_filepath, 'r') as f:
    all_configs = Box(yaml.safe_load(f))

In [5]:
args = ModelArgs(**all_configs.config_yaml.model.model_args)

In [6]:
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '12355'

# Initialize the process group
dist.init_process_group(
    backend='gloo',     # or 'gloo' for CPU-only
    rank=0,
    world_size=1
)



In [13]:
model = Transformer(args)

AssertionError: Torch not compiled with CUDA enabled

In [9]:
torch.distributed.get_rank()

0

In [10]:
from fairscale.nn.model_parallel.initialize import initialize_model_parallel

In [12]:
initialize_model_parallel(model_parallel_size_=1,pipeline_length=1)

> initializing model parallel with size 1
> initializing ddp with size 1
> initializing pipeline with size 1


In [35]:
os.path.dirname(os.getcwd())

'/Users/yjiang/Documents/personal/workspace/projects/llama-models'

In [18]:
import torch
seq_len = 100
attention_chunk_size = 20
block_pos = torch.abs(
        (torch.arange(seq_len).unsqueeze(0) // attention_chunk_size)
        - (torch.arange(seq_len).unsqueeze(1) // attention_chunk_size)
    )

In [19]:
block_pos

tensor([[0, 0, 0,  ..., 4, 4, 4],
        [0, 0, 0,  ..., 4, 4, 4],
        [0, 0, 0,  ..., 4, 4, 4],
        ...,
        [4, 4, 4,  ..., 0, 0, 0],
        [4, 4, 4,  ..., 0, 0, 0],
        [4, 4, 4,  ..., 0, 0, 0]])

In [20]:
torch.arange(seq_len).unsqueeze(0) // attention_chunk_size

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
         3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
         4, 4, 4, 4]])

In [21]:
torch.arange(seq_len).unsqueeze(1) // attention_chunk_size

tensor([[0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [3],
        [3],
        [3],
        [3],
        [3],
        [3],
        [3],
        [3],
        [3],
        [3],
        [3],
        [3],
        [3],
        [3],
        [3],
        [3],
        [3],