In [1]:
import fvdb
from fvdb.nn import VDBTensor
import torch
from UNet import SparseUNet
import tqdm

#fvdb.nn.SparseConv3d.allow_tf32 = False

In [2]:
from torch.utils.data import Dataset
import os

class ChunkDataset(Dataset):
    def __init__(self, chunksPath):
        self.paths = []

        for filename in os.listdir(chunksPath):
            self.paths.append(f"{chunksPath}/{filename}")

    def __len__(self):
        return len(self.paths)
    
    def __getitem__(self, idx):
        grid_batch, labels, names = fvdb.load(self.paths[idx], device='cuda')
        labels.to(torch.long)

        return VDBTensor(grid_batch, labels)

In [3]:
dataset = ChunkDataset("data/training_data/chunks")
dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, collate_fn=fvdb.jcat)

In [4]:
num_classes = 0
with open("minecraft-serialization/block_list.txt", 'r') as file:
    num_classes = sum(1 for line in file)

model = SparseUNet(num_classes).to('cuda')
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [5]:
# Training Loop
epochs = 5
with tqdm.tqdm(total=epochs) as pbar:
    for epoch in range(epochs):
        for i, vdb_tensor in enumerate(dataloader):
            target = vdb_tensor.data.jdata.squeeze().to(torch.long)
            print(target.min(), target.max())
            actives = vdb_tensor.grid.jagged_like(torch.ones(target.shape[0], device='cuda', dtype=torch.float32).unsqueeze(1))

            X = VDBTensor(grid=vdb_tensor.grid, data=actives)
            optimizer.zero_grad()
            y_hat = model(X)

            l = loss(y_hat.data.jdata, target)

            l.backward()
            optimizer.step()
            pbar.set_description(f"Epoch {epoch}, batch {i}, loss {l.item()}")
        pbar.update(1)

  0%|          | 0/5 [00:00<?, ?it/s]

tensor(1, device='cuda:0') tensor(1023, device='cuda:0')


Epoch 0, batch 0, loss 7.020885467529297:   0%|          | 0/5 [00:06<?, ?it/s]

tensor(1, device='cuda:0') tensor(1023, device='cuda:0')


Epoch 0, batch 1, loss 7.010582447052002:   0%|          | 0/5 [00:09<?, ?it/s]

tensor(1, device='cuda:0') tensor(1023, device='cuda:0')


Epoch 0, batch 2, loss 7.00816011428833:   0%|          | 0/5 [00:11<?, ?it/s] 

tensor(1, device='cuda:0') tensor(1023, device='cuda:0')


Epoch 0, batch 3, loss 7.0204973220825195:   0%|          | 0/5 [00:48<?, ?it/s]

tensor(1, device='cuda:0') tensor(305666608, device='cuda:0')


Epoch 0, batch 3, loss 7.0204973220825195:   0%|          | 0/5 [02:42<?, ?it/s]u:250: nll_loss_forward_reduce_cuda_kernel_2d
: block: [0,0,0], thread: [2,0,0] Assertion `t >= 0 && t < n_classes` failed.
/opt/conda/conda-bld/pytorch_1720538459595/work/aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [17,0,0] Assertion `t >= 0 && t < n_classes` failed.
/opt/conda/conda-bld/pytorch_1720538459595/work/aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [25,0,0] Assertion `t >= 0 && t < n_classes` failed.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
