# CUDA vs CPU RLCT estimation

This notebook measures how fast RLCT estimation is on CUDA vs on CPU. We check this using a standard normal crossing model.

In [4]:
import timeit
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.sgd import SGD

import numpy as np

sys.path.insert(1, "/home/paperspace/devinterp")  # TODO fix path


from devinterp.optim.sgld import SGLD
from devinterp.optim.sgnht import SGNHT
from devinterp.slt.sampler import estimate_rlct, sample
from devinterp.zoo.normal_crossing import PolyModel

assert torch.cuda.is_available()

In [5]:
from functools import partial

sigma = 0.25
lr = 0.0005
criterion = F.mse_loss


def timeit_rlct_estimation_wrapper(model, device, cores):
    return estimate_rlct(
        model,
        train_loader,
        criterion=criterion,
        optimizer_kwargs=dict(
            lr=lr,
            diffusion_factor=0.01,
            bounding_box_size=1.0,
            num_samples=len(train_data),
        ),
        sampling_method=SGNHT,
        num_chains=cores,
        num_draws=1_000,
        num_burnin_steps=0,
        num_steps_bw_draws=1,
        verbose=False,
        device=device,
        cores=cores
    )
num_train_samples=500_000
batch_size = num_train_samples
x = torch.normal(0, 2, size=(num_train_samples,))
y = sigma * torch.normal(0, 1, size=(num_train_samples,))
train_data = TensorDataset(x, y)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
for cores in (1, 4):
    for device in ("cuda", "cpu"):
        powers = torch.tensor([1, 2], device=device)
        model = PolyModel(powers)
        w_true = torch.zeros_like(powers)
        timeit_rlct_function = partial(timeit_rlct_estimation_wrapper, *(model, device, cores))
        time_taken = timeit.timeit(
            timeit_rlct_function,
            number=5
        )
        print(
            f"{num_train_samples} samples on {device}, {cores} cores/chains: {time_taken:.2f}s per estimate"
        )

500000 samples 500000 batch_size on cuda, 1 cores/chains: 52.60s per estimate
500000 samples 500000 batch_size on cpu, 1 cores/chains: 62.75s per estimate
500000 samples 500000 batch_size on cuda, 4 cores/chains: 113.27s per estimate
