# Particle flow network data preprocessing

Preprocessing for the photon jet task. The goal is to turn all of the images into point clouds.

May 13: Just do all 960 points.

In [1]:
import os
import time
import h5py
import numpy as np
import math

from tqdm import tqdm, trange
from matplotlib import pyplot as plt
from pprint import pprint

In [2]:
# Utility functions and whatnot
def convert_size(size_bytes):
    """
    Format a byte count as a human-readable string using 1024-based units.
        e.g. convert_size(1536) -> "1.5 KB"

    size_bytes: non-negative number of bytes (int or float).

    Returns "0B" for zero; otherwise "<value> <unit>", where the value is
    rounded to two decimals and the unit is one of B, KB, ..., YB.
    Values below one byte stay in "B"; values beyond the largest unit are
    expressed in "YB" (e.g. "1024.0 YB").

    Raises ValueError if size_bytes is negative.
    """
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        # The previous math.log-based version also raised ValueError here,
        # just with an unhelpful "math domain error" message.
        raise ValueError(f"size_bytes must be non-negative, got {size_bytes}")

    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")

    # Step up one unit at a time instead of computing floor(log_1024(size)):
    # dividing a float by 1024 is exact, and the loop bounds keep the unit
    # index inside size_name for both tiny (< 1 byte) and huge inputs.
    value = float(size_bytes)
    i = 0
    while value >= 1024 and i < len(size_name) - 1:
        value /= 1024
        i += 1

    s = round(value, 2)
    return "%s %s" % (s, size_name[i])


# Human-readable class names. The order matches the integer labels assigned
# further down in this notebook (0, 1, 2 for pions, photons, scalars).
class_labels = [
    "pion",
    "photon",
    "scalar",
]

## Grab data

In [3]:
# Directory that holds the processed .npz input files.
data_dir = "/usatlas/atlas01/atlasdisk/users/atlas_wifeng/photon-jet/data/processed/scalar_test"
# Make data_dir the working directory so that the relative file names used by
# the load and save cells below resolve against it.
os.chdir(data_dir)

In [4]:
# ~5 sec
def _load_npz(path):
    """Read every array of an .npz archive into a plain dict, then close the file."""
    # np.load on an .npz returns a lazy NpzFile that keeps the file open.
    # dict() forces all arrays into memory, and the with-block releases the
    # file handle afterwards instead of leaking it.
    with np.load(path) as archive:
        return dict(archive)


raw_pions = _load_npz("pi0_40-250GeV_100k.npz")
raw_photons = _load_npz("gamma_40-250GeV_100k.npz")
raw_scalars = _load_npz("scalar1_40-250GeV_100k.npz")

In [5]:
def norm_coords(n):
    """
    Generate n evenly spaced coordinates, standardized to zero mean and unit
    (population) standard deviation.
        e.g. norm_coords(4) -> [-1.34, -0.45, 0.45, 1.34]

    For n < 2 there is no spread to normalize by, so an array of n zeros is
    returned instead of dividing by a zero standard deviation (which would
    produce NaN coordinates).
    """
    if n < 2:
        return np.zeros(n)
    x = np.arange(n)
    return (x - np.mean(x)) / np.std(x)

def to_cloud(arr, tag, n_tags=4):
    """
    Turn arr of shape (samples, rows, cols) into point clouds.

    Every cell of every image becomes one point, so the result has shape
    (samples, rows * cols, 3 + n_tags). The features of a point are
        (row coord, col coord, cell value, one-hot tag of length n_tags)
    which, with the default n_tags=4, is the 7-vector
        (row, col, value, isLayer0, isLayer1, isLayer2, isLayer3).
    NOTE(review): the original notes call these (eta, phi, energy); confirm
    which image axis is eta and which is phi.

    arr: array of shape (samples, rows, cols).
    tag: index of the one-hot slot to set for every point of this array.
    n_tags: width of the one-hot tag block (defaults to 4).

    Coordinates come from norm_coords and depend only on the image shape, so
    they are identical for every sample. No points are dropped or padded
    here: all rows * cols cells are kept.
    """
    n_samples, n_rows, n_cols = arr.shape
    img_shape = (n_rows, n_cols)
    n_points = n_rows * n_cols

    # Row coordinates vary down the rows and are repeated across columns;
    # column coordinates vary across columns and are repeated down the rows.
    row_coords = np.broadcast_to(norm_coords(n_rows)[:, None], img_shape)
    col_coords = np.broadcast_to(norm_coords(n_cols)[None, :], img_shape)

    # (n_rows, n_cols, 2) -> (1, n_points, 2), flattened in the same row-major
    # order as the cell values below so coordinates and values line up.
    coords = np.stack((row_coords, col_coords), axis=2).reshape((1, n_points, 2))
    # Repeat the shared coordinate grid for every sample.
    coords = np.broadcast_to(coords, (n_samples, n_points, 2))

    # Explicit dimensions (rather than -1) keep this working for empty input.
    values = np.reshape(arr, (n_samples, n_points, 1))

    # One-hot tag: the same slot is set for every point in this array.
    tag_arr = np.zeros((n_samples, n_points, n_tags))
    tag_arr[:, :, tag] = 1

    return np.concatenate((coords, values, tag_arr), axis=2)

In [19]:
def process_dataset(dataset):
    """
    Convert the four layer images of a dataset into one combined point cloud.

    dataset: mapping with keys "layer_0" ... "layer_3"; each value is passed
    to to_cloud together with its layer index as the tag.

    Returns the per-layer clouds concatenated along the points axis (axis 1).
    Prints a progress line for each layer.
    """
    clouds = []

    for tag in range(4):
        layer = f"layer_{tag}"
        print(f"Processing {layer}...")
        clouds.append(to_cloud(dataset[layer], tag=tag))

    return np.concatenate(clouds, axis=1)

In [20]:
# Convert each raw dataset into a point cloud. Each call prints one progress
# line per layer and returns the layers concatenated along the points axis.
point_pions = process_dataset(raw_pions)
point_photons = process_dataset(raw_photons)
point_scalars = process_dataset(raw_scalars)

Processing layer_0...
Processing layer_1...
Processing layer_2...
Processing layer_3...
Processing layer_0...
Processing layer_1...
Processing layer_2...
Processing layer_3...
Processing layer_0...
Processing layer_1...
Processing layer_2...
Processing layer_3...


In [33]:
# Nominal number of samples per class (the input file names say "100k").
# Kept for reference; the labels below no longer depend on it.
N = 100000

# Stack the three classes along the sample axis: pions, then photons, then scalars.
all_jets = np.concatenate([point_pions, point_photons, point_scalars], axis=0)

# Integer class labels (0 = pion, 1 = photon, 2 = scalar), built from the
# actual number of samples in each class so that they stay aligned with
# all_jets even if a file does not contain exactly N samples.
labels = np.repeat(
    np.arange(3),
    [len(point_pions), len(point_photons), len(point_scalars)],
)

In [34]:
# Shuffle samples and labels with one shared random permutation so that each
# sample keeps its label.
# NOTE: the permutation is unseeded, so the order differs from run to run.
assert len(labels) == len(all_jets)
order = np.random.permutation(labels.shape[0])

all_jets, labels = all_jets[order], labels[order]

In [35]:
# Report the in-memory size of the combined array in human-readable units.
print(f"All jets take up {convert_size(all_jets.nbytes)}")

All jets take up 15.02 GB


In [39]:
# Write the full dataset (features as X, labels as y) to an uncompressed
# .npz archive in the current working directory. Takes ~10 sec.
np.savez("all_jets_7_feature_point_cloud.npz", X=all_jets, y=labels)

In [37]:
# Write a smaller archive holding only the first N1 samples and labels.
# Takes ~10 sec.
# NOTE(review): this is a random subset only if the shuffle cell has already
# been run; confirm the cell execution order.
N1 = 30000
np.savez("30k_jets_7_feature_point_cloud.npz", X=all_jets[:N1], y=labels[:N1])