In [1]:
import os
# Dataset split to convert; every path in this cell is derived from it.
# NOTE(review): the saved cell outputs were produced with tag = 'val' -- confirm
# which split is intended before re-running.
tag = 'train'

# Semi-processed input file for the chosen split.
file_path = f"/j-jepa-vol/J-JEPA/data/top/normalized/semi-processed/{tag}/{tag}_20_30.h5"

# Output location: directory (created if missing) plus base file name.
dataset_path = f"/j-jepa-vol/J-JEPA/data/top/normalized/{tag}/"
file_name = f"processed_{tag}_20_30.h5"
os.makedirs(dataset_path, exist_ok=True)
out_path = f"{dataset_path}{file_name}"

# Report whether the input is present; the cell continues either way.
print(
    f"File exists: {file_path}"
    if os.path.exists(file_path)
    else f"File does not exist: {file_path}"
)

File exists: /j-jepa-vol/J-JEPA/data/top/normalized/semi-processed/val/val_20_30.h5


In [2]:
import random
import h5py
from JetDataset import JetDataset
import torch
from torch.utils.data import Dataset, DataLoader


# Load the semi-processed jets from `file_path` (defined in the first cell).
# Per the log printed by JetDataset, each item is the tuple
# (x, particle_features, subjets, indices, subjet_mask, particle_mask).
# NOTE(review): despite its name, `train_dataset` holds whichever split `tag`
# selects (the saved output shows the val file).
#
# Alternative local input, kept for reference:
# train_dataset = JetDataset(f"/mnt/d/physic/data/train_20_30_new.h5", config=None)
train_dataset = JetDataset(file_path)
# Legacy Windows output path, kept for reference (presumably superseded by
# `out_path` from the first cell -- confirm):
# dataset_path = f"D:\physic\data\processed_{tag}_20_30_torch.h5"




Initializing JetDataset with file: /j-jepa-vol/J-JEPA/data/top/normalized/semi-processed/val/val_20_30.h5
Loading features and subjets from HDF5 file
Filtered to 261239 good jets
Final dataset size: 261239 jets
__getitem__ returns (x, particle_features, subjets, indices, subjet_mask, particle_mask)


In [3]:
from tqdm import tqdm
import h5py
import torch
import numpy as np
from torch.utils.data import DataLoader, Subset

# Batch size used by the DataLoader inside process_and_save.
batch_size = 1000

# Size of the source dataset and of each of the first two parts.
total_data = len(train_dataset)
part_size = total_data // 3

# Three contiguous index ranges: the first two hold exactly `part_size` items,
# the last one runs to the end so it also absorbs the remainder of the
# integer division.
datasets = [
    Subset(train_dataset, range(start, stop))
    for start, stop in (
        (0, part_size),
        (part_size, 2 * part_size),
        (2 * part_size, total_data),
    )
]

# Function to process and save each part
def process_and_save(part_data, file_path):
    """Write every item of `part_data` to a new HDF5 file, one batch at a time.

    Six resizable datasets are created in the file -- 'x', 'particle_features',
    'subjets', 'particle_indices', 'subjet_mask' and 'particle_mask' -- and
    grown along axis 0 as batches arrive, so the whole part never has to be
    held in memory at once.

    Args:
        part_data: map-style dataset whose items are 6-tuples of tensors in the
            order (x, particle_features, subjets, particle_indices,
            subjet_mask, particle_mask).
        file_path: destination HDF5 file; it is opened with mode 'w', so an
            existing file is overwritten.

    Raises:
        ValueError: if `part_data` is empty (raised before the file is opened,
            so no empty stub file is left behind).

    Notes:
        * Reads the module-level `batch_size`.
        * Batches are drawn with shuffle=True, so the row order in the file
          differs from the order in `part_data`.
        * The per-item shape of each dataset is taken from the first batch.
    """
    if len(part_data) == 0:
        raise ValueError(f"Cannot save an empty dataset to {file_path}")

    # On-disk dtype of each dataset, keyed by dataset name.
    storage_dtypes = {
        'x': 'float32',
        'particle_features': 'float64',
        'subjets': 'float64',
        'particle_indices': 'int32',
        'subjet_mask': 'bool',
        'particle_mask': 'bool',
    }

    train_loader = DataLoader(part_data, batch_size=batch_size, shuffle=True)
    with h5py.File(file_path, 'w') as hf:
        # Running row count. Using it as the write offset (instead of
        # batches_done * batch_size) stays correct even if a batch is short.
        num_written = 0
        for data in tqdm(train_loader):
            x, particle_features, subjets, particle_indices, subjet_mask, particle_mask = [
                d.detach().cpu() for d in data
            ]
            batch = {
                'x': x,
                'particle_features': particle_features,
                'subjets': subjets,
                # Integer casts kept as-is; the masks end up in datasets
                # declared as 'bool' and are converted on write below.
                'particle_indices': particle_indices.to(torch.int32),
                'subjet_mask': subjet_mask.to(torch.int32),
                'particle_mask': particle_mask.to(torch.int32),
            }

            if num_written == 0:
                # First batch: create the (initially empty) extendable datasets,
                # sized from each tensor's own trailing dimensions.
                for name, tensor in batch.items():
                    item_shape = tuple(tensor.shape[1:])
                    hf.create_dataset(
                        name,
                        shape=(0, *item_shape),
                        maxshape=(None, *item_shape),
                        dtype=storage_dtypes[name],
                    )

            start = num_written
            stop = start + x.shape[0]
            for name, tensor in batch.items():
                dset = hf[name]
                dset.resize(stop, axis=0)
                # Convert explicitly to the dataset's dtype before writing.
                dset[start:stop] = tensor.numpy().astype(dset.dtype, copy=False)
            num_written = stop

# Save each part to a separate file: <out_path>_p1.hdf5, _p2.hdf5, _p3.hdf5.
# The loop variables are deliberately named `part_*`: rebinding the global
# `file_path` here would replace the *input* path set in the first cell, so
# re-running the JetDataset cell afterwards would load an output part instead.
for part_number, part_dataset in enumerate(datasets, start=1):
    part_path = f"{out_path}_p{part_number}.hdf5"
    process_and_save(part_dataset, part_path)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88/88 [01:46<00:00,  1.21s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88/88 [01:46<00:00,  1.21s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88/88 [01:44<00:00,  1.18s/it]


In [4]:
from torch.utils.data import Dataset, DataLoader
import torch
import h5py
import os
import numpy as np

class HDF5Dataset(Dataset):
    """In-memory dataset built from every ``.hdf5`` file in a directory.

    Each file must contain the datasets listed in `_KEYS`. They are read fully
    into memory and concatenated along axis 0, file by file, in sorted
    file-name order.

    Args:
        directory_path: directory scanned (non-recursively) for ``.hdf5`` files.

    Raises:
        ValueError: if the directory contains no ``.hdf5`` file.
    """

    # HDF5 dataset names read from each file; also the instance attribute names
    # and the order of the tuple returned by __getitem__.
    _KEYS = ('x', 'particle_features', 'subjets',
             'particle_indices', 'subjet_mask', 'particle_mask')

    def __init__(self, directory_path):
        # One list of per-file arrays for each dataset name.
        columns = {key: [] for key in self._KEYS}

        # os.listdir() returns entries in arbitrary order; sorting makes the
        # row order of the concatenated arrays reproducible across runs.
        for filename in sorted(os.listdir(directory_path)):
            if not filename.endswith(".hdf5"):
                continue
            file_path = os.path.join(directory_path, filename)
            with h5py.File(file_path, 'r') as file:
                print(f"Loading {filename}")
                for key in self._KEYS:
                    # [:] copies the data into a NumPy array so it stays valid
                    # after the file is closed.
                    columns[key].append(file[key][:])
            print(f"Loaded {filename}")

        if not columns['x']:
            raise ValueError(f"No .hdf5 files found in {directory_path!r}")

        # Concatenate all datasets from all files
        self.x = np.concatenate(columns['x'], axis=0)
        self.particle_features = np.concatenate(columns['particle_features'], axis=0)
        self.subjets = np.concatenate(columns['subjets'], axis=0)
        self.particle_indices = np.concatenate(columns['particle_indices'], axis=0)
        self.subjet_mask = np.concatenate(columns['subjet_mask'], axis=0)
        self.particle_mask = np.concatenate(columns['particle_mask'], axis=0)

    def __len__(self):
        """Return the number of rows along axis 0 of `x`."""
        return self.x.shape[0]

    def __getitem__(self, idx):
        """Return row `idx` of every array, in `_KEYS` order."""
        return (self.x[idx], self.particle_features[idx], self.subjets[idx],
                self.particle_indices[idx], self.subjet_mask[idx], self.particle_mask[idx])

# Usage example: read every .hdf5 file found in `dataset_path` (the directory
# the part files were written to) back into a single in-memory dataset.
dataset = HDF5Dataset(dataset_path)


Loading processed_val_20_30.h5_p2.hdf5
Loaded processed_val_20_30.h5_p2.hdf5
Loading processed_val_20_30.h5_p1.hdf5
Loaded processed_val_20_30.h5_p1.hdf5
Loading processed_val_20_30.h5_p3.hdf5
Loaded processed_val_20_30.h5_p3.hdf5


In [5]:
# Sanity check: the reloaded row count should equal the "Final dataset size"
# that JetDataset reported for the source file.
len(dataset)

261239

# Save everything into a single file (works for file sizes < 5GB)

In [10]:
from tqdm import tqdm
import h5py
import torch
import numpy as np
from torch.utils.data import DataLoader
# Output location for the single-file export.
# NOTE(review): `tag` is overridden here, independently of the first cell.
tag="val"
file_name = f"processed_{tag}_20_30_small.h5"
dataset_path = f"/j-jepa-vol/J-JEPA/data/small/"
os.makedirs(dataset_path, exist_ok=True)
out_path = dataset_path+file_name

# Batch size read by process_and_save when it builds its DataLoader.
batch_size = 1000

# Write the whole of `train_dataset` to one file. process_and_save (defined in
# the split-and-save cell) performs exactly the create/resize/append sequence
# that used to be spelled out inline here, so the two code paths cannot drift.
# NOTE(review): this writes *all* of `train_dataset`; the single-batch progress
# bar in the saved output implies it held at most 1000 jets when this was run --
# confirm how the "small" subset is meant to be selected.
process_and_save(train_dataset, out_path)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.64it/s]


In [None]:
from torch.utils.data import Dataset, DataLoader
import torch
import h5py

class HDF5Dataset(Dataset):
    """In-memory dataset backed by a single processed HDF5 file.

    The six datasets 'x', 'particle_features', 'subjets', 'particle_indices',
    'subjet_mask' and 'particle_mask' are read fully into NumPy arrays when the
    object is constructed; the file is closed again before __init__ returns.

    NOTE(review): this class shares its name with the directory-based
    HDF5Dataset defined in an earlier cell; whichever cell ran last is the one
    in effect.

    Args:
        file_path: path of the HDF5 file to load.
    """

    def __init__(self, file_path):
        # Keep the handle in a local variable. Storing it on the instance would
        # leave a closed h5py.File attached to the dataset, and h5py objects
        # cannot be pickled (which multi-process DataLoader workers may need).
        with h5py.File(file_path, 'r') as file:
            self.x = file['x'][:]  # [:] loads the entire dataset into memory
            self.particle_features = file['particle_features'][:]
            self.subjets = file['subjets'][:]
            self.particle_indices = file['particle_indices'][:]
            self.subjet_mask = file['subjet_mask'][:]
            self.particle_mask = file['particle_mask'][:]

    def __len__(self):
        """Return the number of rows along axis 0 of `x`."""
        return self.x.shape[0]

    def __getitem__(self, idx):
        """Return row `idx` of every in-memory array as a 6-tuple."""
        return (self.x[idx], self.particle_features[idx], self.subjets[idx],
                self.particle_indices[idx], self.subjet_mask[idx], self.particle_mask[idx])

# Usage: load the single processed file for the current `tag` into memory.
dataset_path = "/j-jepa-vol/J-JEPA/data/top/val/"
file_name = f"processed_{tag}_20_30.h5"
dataset = HDF5Dataset(f"{dataset_path}{file_name}")


In [5]:
train_loader = DataLoader(dataset, batch_size=1000, shuffle=True)
# Smoke test: pull every batch once. Iterating the loader directly lets it
# decide when to stop; a hand-computed count of `len(dataset) // 1000 + 1`
# asks for one batch too many (StopIteration) whenever len(dataset) is an
# exact multiple of the batch size.
for batch in train_loader:
    x, particle_features, subjets, particle_indices, subjet_mask, particle_mask = batch

In [4]:
# Number of jets in the currently loaded `dataset`.
# NOTE(review): the saved output (261878) differs from the 261239 reported
# earlier in this notebook, so this cell was presumably run against a
# different file -- confirm.
len(dataset)

261878

In [18]:
from torch.utils.data import Dataset
import torch
import h5py
import os
import numpy as np

class JEPADataset(Dataset):
    """In-memory dataset built from every ``.hdf5`` / ``.h5`` file in a directory.

    Each file must contain the datasets listed in `_KEYS`. They are read fully
    into memory and concatenated along axis 0, file by file, in sorted
    file-name order, so the rows kept by `num_jets` are the same on every run.

    Args:
        directory_path: directory scanned (non-recursively) for data files.
        num_jets: if truthy, every array is sliced to ``[:num_jets]`` after
            concatenation; ``None`` (the default) or ``0`` keeps all rows.

    Raises:
        ValueError: if the directory contains no matching file.
    """

    # HDF5 dataset names read from each file; also the instance attribute names
    # and the order of the tuple returned by __getitem__.
    _KEYS = ('x', 'particle_features', 'subjets',
             'particle_indices', 'subjet_mask', 'particle_mask')
    # File-name suffixes treated as data files.
    _EXTENSIONS = (".hdf5", ".h5")

    def __init__(self, directory_path, num_jets=None):
        # One list of per-file arrays for each dataset name.
        columns = {key: [] for key in self._KEYS}

        # os.listdir() returns entries in arbitrary order; without sorting, the
        # subset selected by `num_jets` would depend on the filesystem.
        for filename in sorted(os.listdir(directory_path)):
            if not filename.endswith(self._EXTENSIONS):
                continue
            file_path = os.path.join(directory_path, filename)
            with h5py.File(file_path, 'r') as file:
                print(f"Loading {filename}")
                for key in self._KEYS:
                    # [:] copies the data into a NumPy array so it stays valid
                    # after the file is closed.
                    columns[key].append(file[key][:])
            print(f"Loaded {filename}")

        if not columns['x']:
            raise ValueError(f"No .hdf5 or .h5 files found in {directory_path!r}")

        # Concatenate all datasets from all files
        arrays = {key: np.concatenate(columns[key], axis=0) for key in self._KEYS}

        # Optional truncation to the first `num_jets` rows.
        if num_jets:
            arrays = {key: values[:num_jets] for key, values in arrays.items()}

        self.x = arrays['x']
        self.particle_features = arrays['particle_features']
        self.subjets = arrays['subjets']
        self.particle_indices = arrays['particle_indices']
        self.subjet_mask = arrays['subjet_mask']
        self.particle_mask = arrays['particle_mask']

    def __len__(self):
        """Return the number of rows along axis 0 of `x`."""
        return self.x.shape[0]

    def __getitem__(self, idx):
        """Return row `idx` of every array, in `_KEYS` order."""
        return (self.x[idx], self.particle_features[idx], self.subjets[idx],
                self.particle_indices[idx], self.subjet_mask[idx], self.particle_mask[idx])