## Preprocess JetClass dataset and save it in .h5 format

In [1]:
import numpy as np
import awkward as ak
import vector
import uproot
vector.register_awkward()
import torch
from torch.utils.data import Dataset
import h5py

In [2]:
def preprocess_jet_data(parquet_file, labels, config, h5_file):
    """Preprocess jets stored in a parquet file and write them to an HDF5 file.

    Processing steps, in order:

    1. Load the jets and shuffle them (uses NumPy's global random state).
    2. Build per-particle four-vectors and the jet four-vector (sum of the
       particles of each jet), then compute each particle's pT and its
       eta/phi offsets relative to the jet.
    3. Keep only particles with ``|etarel| < 0.8`` and ``|phirel| < 0.8``.
    4. Drop jets left with fewer than ``config.min_constituent`` particles.
    5. Sort particles by descending pT and replace pT with ``log(pT) - 1.8``.
    6. Pad with zeros / clip every jet to ``config.max_particles`` particles
       and multiply etarel/phirel by ``config.scaling_factor``.

    Args:
        parquet_file: Path of the input parquet file. Records must expose
            ``part.px``, ``part.py``, ``part.pz``, ``part.energy`` and one
            per-jet field for every name in ``labels``.
        labels: Sequence of label field names. The stored class index of a
            jet is the argmax over these fields, in the given order.
        config: Object exposing ``min_constituent``, ``max_particles`` and
            ``scaling_factor`` as attributes.
        h5_file: Output path; an existing file is overwritten.

    Writes the gzip-compressed datasets:
        ``pf_features``: (n_jets, max_particles, 3) holding
            (log(pT) - 1.8, etarel * scale, phirel * scale).
        ``pf_mask``: (n_jets, max_particles) float32, 1.0 for real particles
            and 0.0 for padding.
        ``label``: (n_jets,) class index.
        ``event_id``: (n_jets,) running index 0..n_jets-1 in stored order.
    """
    data = ak.from_parquet(parquet_file)
    print(f"Loaded {len(data)} jets from {parquet_file}")

    # Shuffle the jets.
    data = data[np.random.permutation(len(data))]

    part = data["part"]
    p4 = vector.zip({
        "px": part["px"],
        "py": part["py"],
        "pz": part["pz"],
        "energy": part["energy"],
    })
    p4_jet = ak.sum(p4, axis=1)

    # Only the three quantities that end up in the output are carried along;
    # this keeps the filtering and sorting below as light as possible.
    raw_data = ak.zip({
        "part_pT": p4.pt,
        "part_etarel": p4.deltaeta(p4_jet),
        "part_phirel": p4.deltaphi(p4_jet),
    })

    # Apply cuts: keep particles inside the 0.8 x 0.8 window around the jet.
    cuts = (np.abs(raw_data["part_etarel"]) < 0.8) & (np.abs(raw_data["part_phirel"]) < 0.8)
    raw_data = raw_data[cuts]

    # Filter jets with enough constituents.
    particle_count = ak.count(raw_data["part_pT"], axis=1)
    valid_jets = (particle_count >= config.min_constituent)
    raw_data = raw_data[valid_jets]
    n_particles = ak.to_numpy(particle_count[valid_jets])

    # Sort by descending pT and transform pT.
    sorted_indices = ak.argsort(raw_data["part_pT"], ascending=False, axis=1)
    sorted_data = raw_data[sorted_indices]
    sorted_data["part_pT"] = np.log(sorted_data["part_pT"]) - 1.8

    # Pad and stack features.
    max_particles = config.max_particles
    scaling_factor = config.scaling_factor
    part_pT = ak.fill_none(ak.pad_none(sorted_data["part_pT"], max_particles, clip=True), 0)
    part_etarel = ak.fill_none(ak.pad_none(sorted_data["part_etarel"], max_particles, clip=True), 0)
    part_phirel = ak.fill_none(ak.pad_none(sorted_data["part_phirel"], max_particles, clip=True), 0)
    features = ak.concatenate([
        part_pT[:, :, None],
        (part_etarel * scaling_factor)[:, :, None],
        (part_phirel * scaling_factor)[:, :, None],
    ], axis=-1)
    features_np = ak.to_numpy(features)

    # Labels: convert one column at a time instead of looping over every jet
    # in Python, then take the index of the largest entry per jet.
    label_array_2d = np.stack(
        [ak.to_numpy(data[label][valid_jets]) for label in labels], axis=1
    )
    labels_array = np.argmax(label_array_2d, axis=1)

    # Mask: built from the real constituent counts rather than from
    # "feature != 0", so a particle whose log(pT) - 1.8 happens to be exactly
    # 0 is not mistaken for padding.
    n_kept = np.minimum(n_particles, max_particles)
    masks = (np.arange(max_particles)[None, :] < n_kept[:, None]).astype(np.float32)

    num_jets = len(labels_array)
    event_ids = np.arange(num_jets)

    # Save to .h5
    with h5py.File(h5_file, "w") as f:
        f.create_dataset("pf_features", data=features_np, compression="gzip")
        f.create_dataset("pf_mask", data=masks, compression="gzip")
        f.create_dataset("label", data=labels_array, compression="gzip")
        # Running index of each jet in the stored (shuffled, filtered) order.
        f.create_dataset("event_id", data=event_ids, compression="gzip")

    print(f"Saved preprocessed data to {h5_file}")
    

In [3]:
from omegaconf import OmegaConf

# Load the YAML configuration and echo the two entries used below:
# the training parquet path and the list of label field names.
cfg = OmegaConf.load("../config/config_parquet.yaml")
print(cfg.data.train_dir)
print(cfg.labels)

/sciclone/home/hnayak/scr10/Transfer_Learning/dataset/test_20M/train/train.parquet
['label_QCD', 'label_Tbqq']


In [4]:

# Preprocess the training split and write it to the working directory.
preprocess_jet_data(
    parquet_file=cfg['data']['train_dir'],
    labels=cfg['labels'],
    config=cfg,
    h5_file="jetclass_train_preprocessed.h5",
)


Loaded 2000000 jets from /sciclone/home/hnayak/scr10/Transfer_Learning/dataset/test_20M/train/train.parquet
Saved preprocessed data to jetclass_train_preprocessed.h5


In [5]:
# Preprocess the validation split and write it to the working directory.
preprocess_jet_data(
    parquet_file=cfg['data']['val_dir'],
    labels=cfg['labels'],
    config=cfg,
    h5_file="jetclass_val_preprocessed.h5",
)


Loaded 1000000 jets from /sciclone/home/hnayak/scr10/Transfer_Learning/dataset/test_20M/val/val.parquet
Saved preprocessed data to jetclass_val_preprocessed.h5


In [6]:
# Preprocess the test split and write it to the working directory.
preprocess_jet_data(
    parquet_file=cfg['data']['test_dir'],
    labels=cfg['labels'],
    config=cfg,
    h5_file="jetclass_test_preprocessed.h5",
)


Loaded 1000000 jets from /sciclone/home/hnayak/scr10/Transfer_Learning/dataset/test_20M/test/test.parquet
Saved preprocessed data to jetclass_test_preprocessed.h5


In [7]:
# Sanity check of the written training file: list the datasets, then print
# the shapes followed by the dtypes of features, mask and label (one per line).
with h5py.File("/sciclone/home/hnayak/scr10/Transfer_Learning/data/jetclass_train_preprocessed.h5", "r") as f:
    print(f.keys())
    print(*(f[key].shape for key in ("pf_features", "pf_mask", "label")), sep="\n")
    print(*(f[key].dtype for key in ("pf_features", "pf_mask", "label")), sep="\n")

<KeysViewHDF5 ['event_id', 'label', 'pf_features', 'pf_mask']>
(1990243, 128, 3)
(1990243, 128)
(1990243,)
float64
float32
int64


In [17]:
# Open the top-quark test file read-only. The handle is deliberately left open
# so that later cells can inspect it; close it with file.close() when done.
file =  h5py.File("/sciclone/home/hnayak/scr10/Transfer_Learning/dataset/top_quark_dataset/test.h5", "r") 


In [18]:
# List the top-level entries of the currently open HDF5 handle.
file.keys()

<KeysViewHDF5 ['table']>

In [14]:
# Load the preprocessed test split through the project's `dataset` module and
# report how many jets it holds.
# NOTE(review): a later cell indexes four fields per batch from this dataset,
# whereas the MultiClassJetDataset class defined further down in this notebook
# returns three -- presumably the module version also yields event_id; confirm.
from dataset import MultiClassJetDataset
dataset = MultiClassJetDataset("/sciclone/home/hnayak/scr10/Transfer_Learning/data/jetclass_test_preprocessed.h5")
print (len(dataset))

995132


In [16]:
from torch.utils.data import DataLoader

# Pull a single shuffled sample through a DataLoader and print the shapes,
# then the contents, of the first four fields of the batch.
dataloader = DataLoader(dataset, shuffle=True, batch_size=1, num_workers=4)
for batch in dataloader:
    print(*(batch[k].shape for k in range(4)))
    print(*(batch[k] for k in range(4)))
    break

torch.Size([1, 128, 3]) torch.Size([1, 128]) torch.Size([1]) torch.Size([1])
tensor([[[ 2.5733e+00, -1.4225e-02, -2.6030e-01],
         [ 2.4845e+00,  7.9609e-02,  6.4605e-02],
         [ 2.0133e+00,  1.5142e-01,  5.4746e-02],
         [ 1.9797e+00,  9.3986e-02,  5.8190e-02],
         [ 1.9063e+00,  1.4691e-01,  7.0281e-02],
         [ 1.8047e+00, -1.6003e-03, -2.7472e-01],
         [ 1.6587e+00,  9.1668e-02,  4.3957e-02],
         [ 1.6528e+00, -9.3245e-03, -2.9003e-01],
         [ 1.5666e+00,  1.4648e-01,  8.4111e-02],
         [ 1.4410e+00, -9.7422e-02,  4.1950e-01],
         [ 1.2628e+00,  1.5879e-04, -2.6866e-01],
         [ 1.2300e+00, -2.8788e-01,  4.6101e-01],
         [ 8.8006e-01, -1.9393e-02, -2.7397e-01],
         [ 6.6410e-01,  1.2717e-01,  7.2722e-02],
         [ 6.2907e-01, -2.0264e-01,  3.6536e-01],
         [ 6.0362e-01, -3.5567e-02, -2.8543e-01],
         [ 4.1902e-01, -1.3368e-01,  4.0586e-01],
         [ 2.9793e-01,  1.2026e-01,  3.8098e-02],
         [ 2.7669e-01, 

In [62]:
# Print the padding mask of the first jet (1 = real particle, 0 = padding).
# NOTE(review): the `file` handle opened earlier in this notebook lists only a
# 'table' key, so this cell presumably ran against a handle to one of the
# preprocessed files opened in a cell that is not shown -- confirm.
print(file['pf_mask'][0])

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]


In [None]:
class MultiClassJetDataset(Dataset):
    """In-memory dataset of preprocessed jets read from an HDF5 file.

    The file must contain the datasets ``pf_features``, ``pf_mask`` and
    ``label``. All requested rows are read eagerly in ``__init__`` and kept as
    tensors; each item is the tuple ``(features, mask, label)``.

    Args:
        h5FilePath: Path of the HDF5 file to read.
        n_load: Number of leading jets to load. ``-1`` (the default), any
            other negative value, or ``None`` loads every jet in the file.
    """

    def __init__(self, h5FilePath, n_load=-1):
        super().__init__()
        self.h5FilePath = h5FilePath
        self.n_load = n_load

        # Using n_load=-1 directly as the slice stop would silently drop the
        # last jet, so the "load everything" sentinel is translated first.
        rows = self._load_slice(n_load)
        with h5py.File(self.h5FilePath, "r") as f:
            self.pf_features = torch.from_numpy(f["pf_features"][rows])
            self.pf_mask = torch.from_numpy(f["pf_mask"][rows])
            self.labels = torch.from_numpy(f["label"][rows]).long()

        # Heuristic layout check: a second axis of length 17 or a last axis of
        # length 128 is treated as (jets, features, particles) and swapped to
        # (jets, particles, features).
        if self.pf_features.shape[1] == 17 or self.pf_features.shape[-1] == 128:
            self.pf_features = self.pf_features.permute(0, 2, 1)
            print (f"The input shape is WRONG. Corrected to {self.pf_features.shape}")

        # Drop a singleton second axis from the mask, e.g. (jets, 1, particles).
        if self.pf_mask.shape[1] == 1:
            print (f"The mask shape is wrong and is of shape {self.pf_mask.shape}")
            self.pf_mask = self.pf_mask.squeeze(1)
            print (f"Converted the shape of mask to {self.pf_mask.shape}")

        # TODO: make the selection more generic based on labels, e.g. if
        # labels [0, 1, 5] are given, keep only those classes for training.

    @staticmethod
    def _load_slice(n_load):
        """Return the row slice for ``n_load`` (None or negative means all rows)."""
        if n_load is None or n_load < 0:
            return slice(None)
        return slice(n_load)

    def __len__(self):
        """Return the number of loaded jets."""
        return self.labels.shape[0]

    def to(self, device):
        """Move all stored tensors to ``device`` and return the dataset itself."""
        self.pf_features = self.pf_features.to(device)
        self.pf_mask = self.pf_mask.to(device)
        self.labels = self.labels.to(device)
        return self

    def device(self):
        """Return the device that currently holds the feature tensor."""
        return self.pf_features.device

    def __getitem__(self, idx):
        """Return ``(features, mask, label)`` for the jet(s) selected by ``idx``."""
        return self.pf_features[idx], self.pf_mask[idx], self.labels[idx]

In [22]:
# Build a small dataset from the preprocessed training file, passing
# n_load=100 to limit how many jets are read.
ds = MultiClassJetDataset("/sciclone/home/hnayak/scr10/Transfer_Learning/data/jetclass_train_preprocessed.h5", n_load=100)

In [25]:
# Wrap the small dataset in a shuffling DataLoader with batches of 32 jets.
train_loader = torch.utils.data.DataLoader(ds, batch_size=32, shuffle=True) 

In [26]:
# Fetch one batch and print the shapes of its features, mask and labels.
for i, m, o in train_loader:
    print(*(tensor.shape for tensor in (i, m, o)))
    break

torch.Size([32, 128, 3]) torch.Size([32, 128]) torch.Size([32])
