In [5]:
# System and coding related packages
import os
import glob
from typing import List, Dict, Union

# Essential analysis packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
def load_data(file_path: str) -> Union[pd.DataFrame, None]:
    """
    Load inner data from a npz file and return a DataFrame.
    
    :param file_path: Single npz file path.
    """
    try:
        with np.load(file_path, allow_pickle=True) as f:
            data_dict = {name: f[name] for name in f.files}
            return pd.DataFrame(data_dict)
    except FileNotFoundError:
        print(f'File not found: {file_path}')
        raise
        return None

def load_all_data(file_paths: List[str]) -> Dict[str, Union[pd.DataFrame, None]]:
    """
    Load outer data from multiple npz files and return a dictionary of DataFrames.
    
    :param file_pathes: List of multiple npz files pathes.
    """
    return {name: load_data(file_path) for name, file_path in zip(DATA_NAMES, file_paths)}

In [11]:
# Specify the path
path = './*npz'

# Retrieve data name and define it as a global variable
file_pathes = glob.glob(path)
DATA_NAMES = [os.path.splitext(os.path.basename(path))[0] for path in file_pathes]
print(DATA_NAMES) # check the retrieved data names

# Load data into a dictionary using a list comprehension
data = load_all_data(file_pathes)

# Delete variables no longer used
del (path, file_pathes, DATA_NAMES)

['dnn_w_lead_pz_data', 'dnn_w_lead_py_data', 'dnn_w_lead_px_data', 'dnn_w_lead_e_data']


In [32]:
data['dnn_w_lead_e_data']['pred_y']

0         140.944946
1         110.683327
2         115.407661
3         159.894806
4         193.133362
             ...    
413548    271.059052
413549     95.519676
413550     83.815125
413551    101.221085
413552    189.987198
Name: pred_y, Length: 413553, dtype: float32

In [36]:
data['dnn_w_lead_e_data'].iloc[:,1]

0         107.617617
1          87.921328
2         118.118008
3         140.543531
4         217.772656
             ...    
413548    201.216547
413549     90.055539
413550     87.029477
413551     73.877992
413552    229.937563
Name: test_y, Length: 413553, dtype: float64

In [53]:
e_pred, e_true = data['dnn_w_lead_e_data'].iloc[:,0], data['dnn_w_lead_e_data'].iloc[:,1]
px_pred, px_true = data['dnn_w_lead_px_data'].iloc[:,0], data['dnn_w_lead_px_data'].iloc[:,1]
py_pred, py_true = data['dnn_w_lead_py_data'].iloc[:,0], data['dnn_w_lead_py_data'].iloc[:,1] 
pz_pred, pz_true = data['dnn_w_lead_pz_data'].iloc[:,0], data['dnn_w_lead_pz_data'].iloc[:,1]

def w_mass(e, px, py, pz):
    momenta = [px, py, pz]
    p_all = np.sqrt(np.sum([np.square(p) for p in momenta], axis=0))
    print(p_all.shape)
    print(e.shape)
    return np.sqrt(np.square(e) - np.square(p_all))

w_mass(e_pred, px_pred, py_pred, pz_pred)


(275702,)
(413553,)


ValueError: operands could not be broadcast together with shapes (413553,) (275702,) 