In [None]:
# Download the small JetClass example dataset

In [1]:
import numpy as np
import awkward as ak
import uproot
import vector
vector.register_awkward()

In [2]:
import os
import shutil
import zipfile
import tarfile
import urllib
import requests
from tqdm import tqdm

In [3]:
def _download(url, fname, chunk_size=1024):
    '''Stream ``url`` into the local file ``fname`` while showing a tqdm progress bar.

    Adapted from https://gist.github.com/yanqd0/c13ed29e29432e3cf3e7c38467f42f51

    The payload is first written to ``<fname>.part`` and only moved to ``fname``
    once the whole body has been received, so a failed or interrupted download
    never leaves a truncated file under the final name.

    Args:
        url: Address of the resource to fetch.
        fname: Destination path; also used as the progress-bar label.
        chunk_size: Number of bytes requested per read from the response stream.

    Raises:
        requests.HTTPError: If the server answers with an error status (4xx/5xx).
    '''
    tmp_name = f'{fname}.part'
    try:
        # The context manager releases the connection even if writing fails.
        with requests.get(url, stream=True) as resp:
            # Fail loudly instead of saving an error page as the data file.
            resp.raise_for_status()
            # Content-Length may be absent; fall back to 0 in that case.
            total = int(resp.headers.get('content-length', 0))
            with open(tmp_name, 'wb') as file, tqdm(
                desc=fname,
                total=total,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for data in resp.iter_content(chunk_size=chunk_size):
                    size = file.write(data)
                    bar.update(size)
        # Only a fully received file becomes visible under the final name.
        os.replace(tmp_name, fname)
    except BaseException:
        # Also covers KeyboardInterrupt: drop the partial file, then re-raise.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)
        raise

In [4]:
# Fetch the example file once; if it is already on disk the download is skipped
example_file = 'JetClass_example_100k.root'
if not os.path.exists(example_file):
    _download(
        'https://hqu.web.cern.ch/datasets/JetClass/example/JetClass_example_100k.root',
        example_file,
    )

JetClass_example_100k.root: 100%|██████████| 130M/130M [00:08<00:00, 15.3MiB/s] 


Exploring the File

In [5]:
# Open the downloaded ROOT file with uproot and fetch the object stored under the key 'tree'
tree = uproot.open(example_file)['tree']

In [6]:
# Display the content of the "tree": one row per branch with its name,
# its C++ typename and the interpretation uproot uses to read it
tree.show()

name                 | typename                 | interpretation                
---------------------+--------------------------+-------------------------------
part_px              | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
part_py              | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
part_pz              | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
part_energy          | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
part_deta            | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
part_dphi            | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
part_d0val           | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
part_d0err           | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
part_dzval           | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
part_dzerr           | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
part_charge          | std::

In [None]:
# Load all arrays in the tree at once
# Each array is a column of the table and is accessed by its branch name, e.g. table['part_px']
table = tree.arrays()

In [None]:
# Arrays of a scalar type (bool/int/float) hold a single value per entry,
# so they can be converted to a flat numpy array directly, e.g.
table['label_QCD'].to_numpy()

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [9]:
# Arrays of a vector type (the std::vector<float> branches) are loaded as a jagged
# awkward array, i.e. the number of elements varies from row to row
table['part_px']

# A jagged array can be (zero-) padded to become a regular numpy array (see later)

In [10]:
# Build Lorentz 4-vectors by zipping the per-particle (px, py, pz, energy) columns
p4 = vector.zip(
    {
        'px': table['part_px'],
        'py': table['part_py'],
        'pz': table['part_pz'],
        'energy': table['part_energy'],
    }
)

In [11]:
# Get the transverse momentum (pt), derived by `vector` from the 4-vector components
p4.pt

In [12]:
# Get the pseudorapidity (eta)
p4.eta

In [13]:
# Get the azimuthal angle (phi)
p4.phi

In [14]:
def _pad(a, maxlen, value=0, dtype='float32'):
    '''Pad or truncate every row of ``a`` to exactly ``maxlen`` entries.

    Args:
        a: The rows to pad. Three kinds of input are handled:
            * a numpy array with at least two dimensions whose second axis
              already has length ``maxlen`` -- returned unchanged (same
              object, no dtype cast);
            * an ``ak.Array`` -- a 1-D array is first turned into rows of
              length 1;
            * any other sized sequence of rows (numpy arrays, lists, tuples).
        maxlen: Target row length; longer rows are cut, shorter ones filled.
        value: Fill value used for the missing entries.
        dtype: Element type of the result.

    Returns:
        For ``ak.Array`` input an ``ak.Array`` (convert with ``.to_numpy()``
        if a numpy array is needed); for the generic fallback a numpy array of
        shape ``(len(a), maxlen)``.
    '''
    # Fast path: already a regular array of the requested width.
    if isinstance(a, np.ndarray) and a.ndim >= 2 and a.shape[1] == maxlen:
        return a

    if isinstance(a, ak.Array):
        if a.ndim == 1:
            # Give flat input a second axis so it can be padded row-wise.
            a = ak.unflatten(a, 1)
        # clip=True truncates long rows; short rows get None, replaced by `value`.
        a = ak.fill_none(ak.pad_none(a, maxlen, clip=True), value)
        return ak.values_astype(a, dtype)

    # Generic fallback: start from a buffer full of `value`, copy row prefixes in.
    x = np.full((len(a), maxlen), value, dtype=dtype)
    for idx, s in enumerate(a):
        if not len(s):
            continue
        # np.asarray also accepts plain lists/tuples, which have no .astype().
        trunc = np.asarray(s[:maxlen], dtype=dtype)
        x[idx, :len(trunc)] = trunc
    return x

In [15]:
# Apply zero-padding: every row of pt is padded or truncated to exactly 128 entries
# (fill value 0 and dtype float32 are the _pad defaults), then convert to a numpy array
_pad(p4.pt, maxlen=128).to_numpy()

array([[140.19296 ,  95.284584,  87.84807 , ...,   0.      ,   0.      ,
          0.      ],
       [244.67009 ,  62.332603,  45.159416, ...,   0.      ,   0.      ,
          0.      ],
       [143.15791 ,  91.48589 ,  25.372644, ...,   0.      ,   0.      ,
          0.      ],
       ...,
       [157.69547 , 101.245445,  79.816284, ...,   0.      ,   0.      ,
          0.      ],
       [ 88.65814 ,  80.69194 ,  79.14036 , ...,   0.      ,   0.      ,
          0.      ],
       [171.13641 , 121.71926 ,  59.68036 , ...,   0.      ,   0.      ,
          0.      ]], dtype=float32)