In [2]:
import os
from typing import Iterable

os.environ["CUDA_VISIBLE_DEVICES"] = "7"

import numpy as np

import torch
from torch import nn
import torch.nn.functional as F

def get_num_parameters(model: nn.Module) -> int:
    return sum(param.numel() for param in model.parameters())

def get_device(index: int = 0) -> torch.device:
    """Try to use the GPU if possible, otherwise, use CPU."""
    if torch.cuda.is_available():
        return torch.device(f"cuda:{index}")
    else:
        return torch.device("cpu")

# Moule Parameters

In [3]:
input_dim = 16384
output_dim = 32

w = nn.Parameter(torch.randn(input_dim, output_dim))
assert isinstance(w, torch.Tensor)  # Behaves like a tensor
assert type(w.data) == torch.Tensor  # Access the underlying tensor

# Custom Model

In [4]:
class Linear(nn.Module):
    """Simple linear layer."""
    def __init__(self, input_dim: int, output_dim: int):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(input_dim, output_dim) / np.sqrt(input_dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x @ self.weight
    
class Cruncher(nn.Module):
    def __init__(self, dim: int, num_layers: int):
        super().__init__()
        self.layers = nn.ModuleList([
            Linear(dim, dim)
            for i in range(num_layers)
        ])
        self.final = Linear(dim, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Apply linear layers
        B, D = x.size()
        for layer in self.layers:
            x = layer(x)

        # Apply final head
        x = self.final(x)
        assert x.size() == torch.Size([B, 1])

        # Remove the last dimension
        x = x.squeeze(-1)
        assert x.size() == torch.Size([B])
        
        return x
    
D = 64  # Dimension
num_layers = 2
model = Cruncher(dim=D, num_layers=num_layers)

param_sizes = [
    (name, param.numel())
    for name, param in model.state_dict().items()
]
assert param_sizes == [
    ("layers.0.weight", D * D),
    ("layers.1.weight", D * D),
    ("final.weight", D),
]
num_parameters = get_num_parameters(model)
assert num_parameters == (D * D) + (D * D) + D

# Remember to move the model to the GPU.
device = get_device()
model = model.to(device)

# Run the model on some data.
B = 8  # Batch size
x = torch.randn(B, D, device=device)
y = model(x)
assert y.size() == torch.Size([B])

# Note about Random

In [5]:
# There are three places to set the random seed which you should do all at once just to be safe.
# Torch
seed = 0
torch.manual_seed(seed)

# NumPy
import numpy as np
np.random.seed(seed)

# Python
import random
random.seed(seed)

# Data Loading

In [6]:
def get_batch(data: np.array, batch_size: int, sequence_length: int, device: str) -> torch.Tensor:
    # Sample batch_size random positions into data.
    start_indices = torch.randint(len(data) - sequence_length, (batch_size,))
    assert start_indices.size() == torch.Size([batch_size])

    # Index into the data.
    x = torch.tensor([data[start:start + sequence_length] for start in start_indices])
    assert x.size() == torch.Size([batch_size, sequence_length])

    # Pinned memory
    # By default, CPU tensors are in paged memory. We can explicitly pin.
    if torch.cuda.is_available():
        x = x.pin_memory()
    # This allows us to copy x from CPU into GPU asynchronously.
    x = x.to(device, non_blocking=True)
    # This allows us to do two things in parallel (not done here):
    # 1. Fetch the next batch of data into CPU
    # 2. Process x on the GPU.

    return x

# serialization
orig_data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=np.int32)
orig_data.tofile("data.npy")

# loading
# Use memmap to lazily load only the accessed parts into memory.
data = np.memmap("data.npy", dtype=np.int32)
assert np.array_equal(data, orig_data)

# A data loader generates a batch of sequences for training.
B = 2  # Batch size
L = 4  # Length of sequence
x = get_batch(data, batch_size=B, sequence_length=L, device=get_device())
assert x.size() == torch.Size([B, L])

  x = torch.tensor([data[start:start + sequence_length] for start in start_indices])


# Optimizer

In [7]:
class SGD(torch.optim.Optimizer):
    def __init__(self, params: Iterable[nn.Parameter], lr: float = 0.01):
        super(SGD, self).__init__(params, dict(lr=lr))

    def step(self):
        for group in self.param_groups:
            lr = group["lr"]
            for p in group["params"]:
                grad = p.grad.data
                p.data -= lr * grad
                
                
class AdaGrad(torch.optim.Optimizer):
    def __init__(self, params: Iterable[nn.Parameter], lr: float = 0.01):
        super(AdaGrad, self).__init__(params, dict(lr=lr))

    def step(self):
        for group in self.param_groups:
            lr = group["lr"]
            for p in group["params"]:
                # Optimizer state
                state = self.state[p]
                grad = p.grad.data

                # Get squared gradients g2 = sum_{i<t} g_i^2
                g2 = state.get("g2", torch.zeros_like(grad))

                # Update optimizer state
                g2 += torch.square(grad)
                state["g2"] = g2
                
                # Update parameters
                p.data -= lr * grad / torch.sqrt(g2 + 1e-5)

# recall deep linear model
B = 2
D = 4
num_layers = 2
model = Cruncher(dim=D, num_layers=num_layers).to(get_device())

optimizer = AdaGrad(model.parameters(), lr=0.01)
state = model.state_dict()  # @inspect state

# Compute gradients
x = torch.randn(B, D, device=get_device())
y = torch.tensor([4., 5.], device=get_device())
pred_y = model(x)
loss = F.mse_loss(input=pred_y, target=y)
loss.backward()

# Take a step
optimizer.step()
state = model.state_dict()  # @inspect state

# Free up the memory (optional)
optimizer.zero_grad(set_to_none=True)

In [8]:
# memory
# Parameters
num_parameters = (D * D * num_layers) + D  # @inspect num_parameters
assert num_parameters == get_num_parameters(model)

# Activations
num_activations = B * D * num_layers  # @inspect num_activations

# Gradients
num_gradients = num_parameters  # @inspect num_gradients

# Optimizer states
num_optimizer_states = num_parameters  # @inspect num_optimizer_states

# Putting it all together, assuming float32
total_memory = 4 * (num_parameters + num_activations + num_gradients + num_optimizer_states)  # @inspect total_memory

# Compute (for one step)
flops = 6 * B * num_parameters  # @inspect flops

# Train Loop

In [12]:
def train(name: str, get_batch,
          D: int, num_layers: int,
          B: int, num_train_steps: int, lr: float):
    model = Cruncher(dim=D, num_layers=0).to(get_device())
    optimizer = SGD(model.parameters(), lr=0.01)

    for t in range(num_train_steps):
        # Get data
        x, y = get_batch(B=B)

        # Forward (compute loss)
        pred_y = model(x)
        loss = F.mse_loss(pred_y, y)

        print(f"Step {t}, Loss: {loss.item():.4f}, ")

        # Backward (compute gradients)
        loss.backward()

        # Update parameters
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)

D = 16
true_w = torch.arange(D, dtype=torch.float32, device=get_device())
def get_batch(B: int) -> tuple[torch.Tensor, torch.Tensor]:
    x = torch.randn(B, D).to(get_device())
    true_y = x @ true_w
    return (x, true_y)

# Let's do a basic run
train("simple", get_batch, D=D, num_layers=0, B=4, num_train_steps=10, lr=0.01)

Step 0, Loss: 1292.5894, 
Step 1, Loss: 1166.7372, 
Step 2, Loss: 1209.9089, 
Step 3, Loss: 1480.7728, 
Step 4, Loss: 1109.9225, 
Step 5, Loss: 906.7762, 
Step 6, Loss: 479.8622, 
Step 7, Loss: 425.7617, 
Step 8, Loss: 919.3986, 
Step 9, Loss: 3253.2998, 


In [13]:
# Do some hyperparameter tuning
train("simple", get_batch, D=D, num_layers=0, B=4, num_train_steps=10, lr=0.1)

Step 0, Loss: 1160.1960, 
Step 1, Loss: 584.6788, 
Step 2, Loss: 682.3929, 
Step 3, Loss: 542.1847, 
Step 4, Loss: 1326.8734, 
Step 5, Loss: 517.3848, 
Step 6, Loss: 600.0370, 
Step 7, Loss: 3213.6191, 
Step 8, Loss: 2038.8077, 
Step 9, Loss: 208.5746, 


# Checkpointing

In [14]:
model = Cruncher(dim=64, num_layers=3).to(get_device())
optimizer = AdaGrad(model.parameters(), lr=0.01)

# save the checkpoint
checkpoint = {
    "model": model.state_dict(),
    "optimizer": optimizer.state_dict(),
}
torch.save(checkpoint, "model_checkpoint.pt")

# load the checkpoint
loaded_checkpoint = torch.load("model_checkpoint.pt")