# New JEPADataset (faster loading)

In [1]:
from torch.utils.data import Dataset, DataLoader
import torch
import h5py
from JEPADataset import JEPADataset

In [6]:
# Which data split to read; it is interpolated into both the directory and the file name below.
tag = 'train'
# Relative path to the preprocessed HDF5 file for the chosen split.
dataset_path = f"../../data/{tag}/processed_{tag}_20_30_torch.h5"
# num_jets presumably caps how many jets are loaded (the loader reports
# "number of jets: 10000" for this call) — TODO confirm against JEPADataset.
dataset = JEPADataset(dataset_path, num_jets=10000)

Loading file from ../../data/train/processed_train_20_30_torch.h5
number of jets: 10000


In [7]:
# Single source of truth for the batch size: it sets both the DataLoader's
# batching and the number of full batches drawn below, so the two cannot drift apart.
batch_size = 100
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
train_iter = iter(train_loader)

# Pull every *full* batch out of the loader (a trailing partial batch, if any, is
# not drawn). Only the last batch drawn stays bound to the names below, which the
# following cells inspect.
for _ in range(len(dataset) // batch_size):
    x, particle_features, subjets, particle_indices, subjet_mask, particle_mask = next(train_iter)

In [8]:
# Report the shape of every tensor in the current batch, one line per tensor.
#   x       : (batch_size, N_subjets, N_part, N_part_ftr)
#             dim 3 ordering: ['part_deta', 'part_dphi', 'part_pt_log', 'part_e_log']
#   subjets : dim 2 ordering: ['subjet_pt', 'subjet_eta', 'subjet_phi', 'subjet_E', 'subjet_num_ptcls']
named_tensors = (
    ("x", x),
    ("particle features", particle_features),
    ("subjets", subjets),
    ("particle indices", particle_indices),
    ("subjet_mask", subjet_mask),
    ("particle_mask", particle_mask),
)
for label, tensor in named_tensors:
    print(label, tensor.shape)

x torch.Size([100, 20, 30, 4])
particle features torch.Size([100, 128, 4])
subjets torch.Size([100, 20, 5])
particle indices torch.Size([100, 20, 30])
subjet_mask torch.Size([100, 20])
particle_mask torch.Size([100, 20, 30])


In [9]:
# Report the dtype of every tensor in the current batch, one line per tensor.
named_tensors = (
    ("x", x),
    ("particle features", particle_features),
    ("subjets", subjets),
    ("particle indices", particle_indices),
    ("subjet_mask", subjet_mask),
    ("particle_mask", particle_mask),
)
for label, tensor in named_tensors:
    print(label, tensor.dtype)

x torch.float32
particle features torch.float64
subjets torch.float64
particle indices torch.int32
subjet_mask torch.bool
particle_mask torch.bool


# Old dataset (data loading is slow)

In [1]:
from torch.utils.data import Dataset, DataLoader
import torch
import h5py
from JetDataset import JetDataset
# Which data split to read; it is interpolated into both the directory and the file name below.
# NOTE(review): this is the 'val' split even though the resulting objects are
# named train_* — consider renaming to avoid confusion.
tag = 'val'
# labels=True presumably makes __getitem__ return the trailing `labels` entry
# listed in the loader's printed message — TODO confirm against JetDataset.
train_dataset = JetDataset(f"../../data/{tag}/{tag}_20_30_new.h5", num_jets=10000, config=None, labels=True)

Initializing JetDataset with file: ../../data/val/val_20_30_new.h5
Loading features and subjets from HDF5 file
Filtered to 261239 good jets
Final dataset size: 10000 jets
__getitem__ returns (x, particle_features, subjets, indices, subjet_mask, particle_mask, labels)


In [2]:
# Wrap the old dataset in a shuffling loader and draw a single batch of 100 jets.
train_loader = DataLoader(train_dataset, batch_size=100, shuffle=True)
train_iter = iter(train_loader)

# Each batch is a 7-tuple; unpack it into the names the inspection cells below use.
first_batch = next(train_iter)
(
    x,
    particle_features,
    subjets,
    particle_indices,
    subjet_mask,
    particle_mask,
    labels,
) = first_batch

In [3]:
# Report the shape of every tensor in the current batch, one line per tensor.
#   x       : (batch_size, N_subjets, N_part, N_part_ftr)
#             dim 3 ordering: ['part_deta', 'part_dphi', 'part_pt_log', 'part_e_log']
#   subjets : dim 2 ordering: ['subjet_pt', 'subjet_eta', 'subjet_phi', 'subjet_E', 'subjet_num_ptcls']
named_tensors = (
    ("x", x),
    ("particle features", particle_features),
    ("subjets", subjets),
    ("particle indices", particle_indices),
    ("subjet_mask", subjet_mask),
    ("particle_mask", particle_mask),
    ("labels", labels),
)
for label, tensor in named_tensors:
    print(label, tensor.shape)

x torch.Size([100, 20, 30, 4])
particle features torch.Size([100, 128, 4])
subjets torch.Size([100, 20, 5])
particle indices torch.Size([100, 20, 30])
subjet_mask torch.Size([100, 20])
particle_mask torch.Size([100, 20, 30])
labels torch.Size([100])


In [4]:
# Report the dtype of every tensor in the current batch, one line per tensor.
named_tensors = (
    ("x", x),
    ("particle features", particle_features),
    ("subjets", subjets),
    ("particle indices", particle_indices),
    ("subjet_mask", subjet_mask),
    ("particle_mask", particle_mask),
    ("labels", labels),
)
for label, tensor in named_tensors:
    print(label, tensor.dtype)

x torch.float32
particle features torch.float64
subjets torch.float64
particle indices torch.float64
subjet_mask torch.float32
particle_mask torch.float32
labels torch.int64


In [None]:
# Tensor.to returns a converted tensor and leaves the original untouched, so the
# result has to be rebound for the conversion to take effect. The old dataset
# yields these indices as float64; int32 matches what the new JEPADataset reports.
particle_indices = particle_indices.to(torch.int32)
# Show only the resulting dtype rather than dumping the whole index tensor.
particle_indices.dtype

## Check that `x` matches `particle_features` gathered via `particle_indices`

In [None]:
# Rebuild each subjet's block of x from the flat per-jet particle_features using
# particle_indices, and compare it element-wise with what the dataset returned.
# Sizes are taken from x itself instead of being hard-coded, and mismatches are
# collected so the cell yields one pass/fail result instead of a wall of booleans.
num_jets_to_check = min(10, x.shape[0])
mismatches = []
for jet_idx in range(num_jets_to_check):
    for subjet_idx in range(x.shape[1]):
        # The last subjet feature is used as the number of real particles in the subjet.
        num_real_ptcls = int(subjets[jet_idx, subjet_idx, -1])
        # Start from zeros so slots beyond the real particles stay zero.
        particle_features_at_subjet_idx = torch.zeros_like(x[jet_idx, subjet_idx])
        # Indices may arrive as a float or int32 tensor; indexing needs int64.
        real_indices = particle_indices[jet_idx, subjet_idx].long()[:num_real_ptcls]
        particle_features_at_subjet_idx[:num_real_ptcls, :] = particle_features[jet_idx, real_indices, :]
        if not (particle_features_at_subjet_idx == x[jet_idx, subjet_idx]).all():
            mismatches.append((jet_idx, subjet_idx))

assert not mismatches, f"x differs from the reconstruction at (jet_idx, subjet_idx): {mismatches}"
print(f"x matches the reconstruction for all {num_jets_to_check * x.shape[1]} subjets checked")

In [None]:
# Display the particle indices of one subjet of jet 0.
# NOTE(review): subjet_idx is not set in this cell; it is whatever an earlier cell
# left behind (the verification loop's last value on a top-to-bottom run) —
# confirm this is the subjet intended.
particle_indices[0, subjet_idx]

In [None]:
# Shape of the most recently reconstructed per-subjet feature block
# (the cells that build it allocate it as torch.zeros((30, 4))).
particle_features_at_subjet_idx.shape

In [None]:
# Element-wise comparison of the most recently reconstructed block with the
# matching entry of x. Index x with the same jet_idx/subjet_idx that produced
# particle_features_at_subjet_idx; a hard-coded jet 0 would compare the block
# against a different jet whenever jet_idx != 0.
particle_features_at_subjet_idx == x[jet_idx, subjet_idx]

In [None]:
# Display x for the first jet and first subjet of the current batch.
x[0, 0]

In [None]:
# Reconstruct the feature block of one specific subjet (jet 1, subjet 1) by hand
# so it can be compared by eye with the corresponding entry of x.
jet_idx, subjet_idx = 1, 1

# The last subjet feature is used as the count of real particles in the subjet.
num_real_ptcls = int(subjets[jet_idx, subjet_idx, -1])

# Indices of the real particles, cast to int64 so they can index a tensor.
picked_indices = particle_indices[jet_idx, subjet_idx].long()[:num_real_ptcls]

# Zero-initialised block; only the leading num_real_ptcls rows get filled.
particle_features_at_subjet_idx = torch.zeros((30, 4))
particle_features_at_subjet_idx[:num_real_ptcls] = particle_features[jet_idx, picked_indices, :]
particle_features_at_subjet_idx

In [None]:
# Entry of x for the currently selected (jet_idx, subjet_idx), shown so it can be
# compared by eye with the reconstructed block.
x[jet_idx, subjet_idx]

In [None]:
# Last feature of the selected subjet; the reconstruction cells cast it to int and
# use it as the number of real particles in the subjet.
subjets[jet_idx, subjet_idx, -1]