# Zenith vs PyTorch: Large Dataset Benchmark (~1.2 GB)

In [None]:
!pip install pyarrow torch --quiet
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader as TorchDataLoader, TensorDataset
import time
import os
print("Ready")

In [None]:
# Config
# Benchmark knobs shared by every later cell.
NUM_SAMPLES = 100_000   # number of synthetic images to generate
BATCH_SIZE = 256        # mini-batch size for both training loops
EPOCHS = 3              # timed passes over the data per benchmark

# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print(f"Device: {device}")
print(f"Samples: {NUM_SAMPLES:,}")

In [None]:
# Generate data
# Synthetic inputs: NUM_SAMPLES float32 arrays of shape (3, 32, 32) with
# values in [0, 1), plus one int64 label in [0, 10) per sample.
print("Generating synthetic data...")
rng = np.random.default_rng()
# Draw float32 directly: np.random.rand() would first materialise a float64
# array twice this size and only then copy it down to float32.
data_images = rng.random((NUM_SAMPLES, 3, 32, 32), dtype=np.float32)
# high=10 is exclusive, so labels are 0..9.
data_labels = rng.integers(0, 10, size=NUM_SAMPLES, dtype=np.int64)
print(f"Shape: {data_images.shape}, Size: {data_images.nbytes/1e9:.2f} GB")

In [None]:
# Save as Parquet
# Each image is serialised to its raw bytes and stored in the 'img' column;
# the labels array goes into 'lbl' unchanged.
print("Saving Parquet...")
table = pa.table(
    {
        # Built inline so the temporary list of byte strings is not kept
        # alive in the notebook namespace after the table is created.
        'img': list(map(np.ndarray.tobytes, data_images)),
        'lbl': data_labels,
    }
)
pq.write_table(table, 'data.parquet')
print(f"Saved: {os.path.getsize('data.parquet')/1e6:.0f} MB")

In [None]:
# Model
class CNN(nn.Module):
    """Small two-stage convolutional classifier used by both benchmarks.

    Each stage is a 3x3 convolution (padding 1) followed by ReLU and a 2x2
    max-pool, so the spatial size is halved twice. ``fc1`` is sized for 64
    channels on an 8x8 grid, which means the model expects inputs of shape
    ``(N, 3, 32, 32)`` and produces ``(N, 10)`` logits.
    """

    def __init__(self):
        super().__init__()
        self.c1 = nn.Conv2d(3, 32, 3, padding=1)
        self.c2 = nn.Conv2d(32, 64, 3, padding=1)
        self.pool = nn.MaxPool2d(2)
        self.fc1 = nn.Linear(64*8*8, 256)
        self.fc2 = nn.Linear(256, 10)

    def forward(self, x):
        """Return unnormalised class scores of shape ``(N, 10)`` for batch ``x``."""
        x = self.pool(F.relu(self.c1(x)))
        x = self.pool(F.relu(self.c2(x)))
        # Flatten everything except the batch axis. Unlike reshape(-1, 64*8*8),
        # this never folds surplus spatial elements into the batch dimension:
        # a wrongly sized input fails at fc1 instead of silently changing N.
        x = x.flatten(1)
        return self.fc2(F.relu(self.fc1(x)))

In [None]:
# ZENITH BENCHMARK
# Trains a fresh CNN on batches gathered from the memory-mapped Parquet
# table and records the wall-clock time of each epoch in z_times.
print("="*50)
print("ZENITH (Arrow/Parquet)")
print("="*50)

tbl = pq.read_table('data.parquet', memory_map=True)
# Use the row count of the file that was actually read, so the loop stays in
# bounds even if NUM_SAMPLES was edited after the file was written.
num_rows = tbl.num_rows
model = CNN().to(device)
opt = optim.Adam(model.parameters())
crit = nn.CrossEntropyLoss()

z_times = []
for ep in range(EPOCHS):
    model.train()
    # CUDA kernels run asynchronously; drain the queue so the timer starts
    # and stops on completed work rather than on enqueued work.
    if device == 'cuda':
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    idx = np.random.permutation(num_rows)
    for i in range(0, num_rows, BATCH_SIZE):
        # Gather a shuffled batch of rows, then decode each image's raw
        # bytes back into a (3, 32, 32) float32 array.
        batch = tbl.take(idx[i:i+BATCH_SIZE])
        imgs = np.array([np.frombuffer(b, np.float32).reshape(3,32,32) for b in batch['img'].to_pylist()])
        lbls = batch['lbl'].to_numpy()
        x = torch.from_numpy(imgs).to(device)
        y = torch.from_numpy(lbls).to(device)
        opt.zero_grad()
        loss = crit(model(x), y)
        loss.backward()
        opt.step()
    if device == 'cuda':
        torch.cuda.synchronize()
    z_times.append(time.perf_counter()-t0)
    print(f"Epoch {ep+1}: {z_times[-1]:.2f}s")

# Skip the first (warm-up) epoch when averaging, but fall back to all epochs
# when only one was run so the average does not divide by zero.
z_steady = z_times[1:] or z_times
z_avg = sum(z_steady)/len(z_steady)
print(f"Avg: {z_avg:.2f}s")

In [None]:
# PYTORCH BENCHMARK
# Trains a fresh CNN from the in-memory numpy arrays through a standard
# DataLoader and records the wall-clock time of each epoch in pt_times.
print("\n" + "="*50)
print("PYTORCH DataLoader")
print("="*50)

# Use original numpy arrays
pt_loader = TorchDataLoader(
    TensorDataset(torch.from_numpy(data_images), torch.from_numpy(data_labels)),
    batch_size=BATCH_SIZE, shuffle=True, num_workers=2,
    # Pinned host memory only helps host-to-GPU copies; skip it on CPU runs.
    pin_memory=(device == 'cuda'),
)

model = CNN().to(device)
opt = optim.Adam(model.parameters())
# Defined here as well so this cell does not depend on a loss object left
# behind by an earlier cell.
crit = nn.CrossEntropyLoss()

pt_times = []
for ep in range(EPOCHS):
    model.train()
    # CUDA kernels run asynchronously; drain the queue so the timer starts
    # and stops on completed work rather than on enqueued work.
    if device == 'cuda':
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    for x, y in pt_loader:
        x, y = x.to(device), y.to(device)
        opt.zero_grad()
        loss = crit(model(x), y)
        loss.backward()
        opt.step()
    if device == 'cuda':
        torch.cuda.synchronize()
    pt_times.append(time.perf_counter()-t0)
    print(f"Epoch {ep+1}: {pt_times[-1]:.2f}s")

# Skip the first (warm-up) epoch when averaging, but fall back to all epochs
# when only one was run so the average does not divide by zero.
pt_steady = pt_times[1:] or pt_times
pt_avg = sum(pt_steady)/len(pt_steady)
print(f"Avg: {pt_avg:.2f}s")

In [None]:
# RESULTS
# Print both steady-state epoch averages and the speed-up of the faster one.
for header_line in ("\n" + "="*50, "RESULTS", "="*50):
    print(header_line)

print(f"Zenith:  {z_avg:.2f}s")
print(f"PyTorch: {pt_avg:.2f}s")

# A tie is reported on the PyTorch branch, because only a strictly smaller
# Zenith average counts as a Zenith win.
zenith_wins = z_avg < pt_avg
if not zenith_wins:
    print(f"PyTorch is {z_avg/pt_avg:.2f}x faster")
else:
    print(f"Zenith is {pt_avg/z_avg:.2f}x faster")