In [None]:
import uproot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm.auto import tqdm
from helper_functions.feature_engineering import create_features

# Plot appearance: reset to matplotlib's default style first, then override
# individual rcParams in a single update.
plt.style.use("default")
plt.rcParams.update(
    {
        "text.usetex": True,  # typeset all figure text through LaTeX
        "font.family": "cm",
        "grid.color": (0.5, 0.5, 0.5, 0.2),  # 4-tuple colour (grey with 0.2 alpha)
    }
)

In [None]:
# Load the lookup tables: raw-ID -> standard-ID conversions and PMT positions,
# one pair of tables for the "PMT" set and one for the "SPMT" set.
df_pmt_id_conversion = pd.read_csv("/home/ferracci/new_dataset/utils/PMT_ID_conversion.csv")
df_pmt_position = pd.read_csv("/home/ferracci/new_dataset/utils/PMTPos_CD_LPMT.csv")
df_spmt_id_conversion = pd.read_csv("/home/ferracci/new_dataset/utils/SPMT_ID_conversion.csv")
df_spmt_position = pd.read_csv("/home/ferracci/new_dataset/utils/PMTPos_CD_SPMT.csv")

# PMT positions: every coordinate column is divided by 1000 and kept as an
# (n, 1) column vector; the three columns are then joined into an (n, 3) array.
# NOTE(review): the /1000 looks like a mm -> m conversion -- confirm against the CSV units.
x, y, z = [
    (df_pmt_position[axis] / 1000).to_numpy().reshape(-1, 1)
    for axis in ("x", "y", "z")
]
pos = np.concatenate([x, y, z], axis=1)

# Same construction for the SPMT position table.
x_s, y_s, z_s = [
    (df_spmt_position[axis] / 1000).to_numpy().reshape(-1, 1)
    for axis in ("x", "y", "z")
]
pos_s = np.concatenate([x_s, y_s, z_s], axis=1)

# Lookup dictionaries: raw id (CdID) -> standard id, and standard id -> (x, y, z) row.
pmt_id_raw_to_id_map = {
    raw_id: std_id
    for raw_id, std_id in zip(df_pmt_id_conversion['CdID'], df_pmt_id_conversion['PMTID'])
}
pmt_id_to_pos_map = {
    std_id: xyz for std_id, xyz in zip(df_pmt_position['PMTID'], pos)
}

spmt_id_raw_to_id_map = {
    raw_id: std_id
    for raw_id, std_id in zip(df_spmt_id_conversion['CdID'], df_spmt_id_conversion['SPMTID'])
}
spmt_id_to_pos_map = {
    std_id: xyz for std_id, xyz in zip(df_spmt_position['SPMTID'], pos_s)
}

### Training Data

In [None]:
# Build the training set: compute features and targets for every file in the
# processed training directory, apply the fiducial volume cut, and save the result.
files = list(Path("/mnt/data/train_flat_dataset_processed/").glob("*"))
features, features_dataframe, targets_dataframe = [], [], []

for filename in tqdm(files):
    # Open the archive once and release its file handle as soon as both arrays
    # have been read (np.load on an archive keeps the file open until closed).
    # NOTE(review): allow_pickle=True unpickles stored objects -- only load trusted files.
    with np.load(filename, allow_pickle=True) as archive:
        hits = archive["hits"]
        targets = archive["primaries"]

    # rows 0-2 of `hits` feed the PMT inputs, rows 3-5 the SPMT inputs
    pmt_id_raw, charge, fht = hits[0, :], hits[1, :], hits[2, :]
    spmt_id_raw, charge_s, fht_s = hits[3, :], hits[4, :], hits[5, :]

    # maps raw ids (CdID) to standard ids (PMTID); events are kept as an object
    # array of per-event arrays
    pmt_id = np.array([np.array([pmt_id_raw_to_id_map[n] for n in event]) for event in pmt_id_raw], dtype=object)
    spmt_id = np.array([np.array([spmt_id_raw_to_id_map[n] for n in event]) for event in spmt_id_raw], dtype=object)

    # maps standard ids (PMTID) to PMTs positions
    pmt_pos = np.array([np.array([pmt_id_to_pos_map[n] for n in event]) for event in pmt_id], dtype=object)
    spmt_pos = np.array([np.array([spmt_id_to_pos_map[n] for n in event]) for event in spmt_id], dtype=object)

    # compute features on preprocessed data (first return value is not used here)
    _, f, f_df = create_features(pmt_pos, charge, fht, spmt_pos, charge_s, fht_s, return_dataframe=True)

    # targets: column 1 of `primaries` is stored as Qedep, column 5 as Redep
    t_df = pd.DataFrame({"Qedep": targets[:, 1], "Redep": targets[:, 5]})

    # implement fiducial volume cut at 17.2 m; the mask is computed once and
    # applied to the array, the feature frame and the target frame alike
    inside_fv = t_df["Redep"] < 17.2
    f = f[inside_fv]
    f_df = f_df[inside_fv]
    t_df = t_df[inside_fv]

    features.append(f)
    features_dataframe.append(f_df)
    targets_dataframe.append(t_df)

# merge the per-file pieces (raises if no files were found)
features = np.vstack(features)
features_dataframe = pd.concat(features_dataframe, axis=0, ignore_index=True)
targets_dataframe = pd.concat(targets_dataframe, axis=0, ignore_index=True)

# save compressed features and targets
np.savez_compressed("/mnt/ferracci/features_new", a=features)
features_dataframe.to_csv("/mnt/ferracci/features_dataframe_new.csv.gz", index=False, compression='gzip')
# NOTE(review): unlike the features CSV, the targets CSV is written WITH its index
# column; kept as-is so existing readers of this file are unaffected.
targets_dataframe.to_csv("/mnt/ferracci/targets_dataframe_new.csv.gz", compression="gzip")

### Testing Data

In [None]:
# Build one test set per selected sub-directory of the processed test dataset.
directories = list(Path("/mnt/data/test_dataset_processed/").glob("*"))

# only these sample directories are processed; every other directory is skipped
selected_directories = {
    Path("/mnt/data/test_dataset_processed/e+_0_3"),
    Path("/mnt/data/test_dataset_processed/e+_0_6"),
}

for directory in tqdm(directories):
    if directory not in selected_directories:
        continue

    files = list(directory.glob("*"))
    features, features_dataframe, targets_dataframe = [], [], []

    for filename in files:
        # Open the archive once and release its file handle as soon as both
        # arrays have been read (np.load on an archive keeps the file open until closed).
        # NOTE(review): allow_pickle=True unpickles stored objects -- only load trusted files.
        with np.load(filename, allow_pickle=True) as archive:
            hits = archive["hits"]
            targets = archive["primaries"]

        # rows 0-2 of `hits` feed the PMT inputs, rows 3-5 the SPMT inputs
        pmt_id_raw, charge, fht = hits[0, :], hits[1, :], hits[2, :]
        spmt_id_raw, charge_s, fht_s = hits[3, :], hits[4, :], hits[5, :]

        # maps raw ids (CdID) to standard ids (PMTID)
        pmt_id = np.array([np.array([pmt_id_raw_to_id_map[n] for n in event]) for event in pmt_id_raw], dtype=object)
        spmt_id = np.array([np.array([spmt_id_raw_to_id_map[n] for n in event]) for event in spmt_id_raw], dtype=object)

        # maps standard ids (PMTID) to PMTs positions
        pmt_pos = np.array([np.array([pmt_id_to_pos_map[n] for n in event]) for event in pmt_id], dtype=object)
        spmt_pos = np.array([np.array([spmt_id_to_pos_map[n] for n in event]) for event in spmt_id], dtype=object)

        # compute features on preprocessed data (first return value is not used here)
        _, f, f_df = create_features(pmt_pos, charge, fht, spmt_pos, charge_s, fht_s, return_dataframe=True)

        # targets: column 1 of `primaries` is stored as Qedep, column 5 as Redep
        t_df = pd.DataFrame({"Qedep": targets[:, 1], "Redep": targets[:, 5]})

        # implement fiducial volume cut at 17.2 m; the mask is computed once and
        # applied to the array, the feature frame and the target frame alike
        inside_fv = t_df["Redep"] < 17.2
        f = f[inside_fv]
        f_df = f_df[inside_fv]
        t_df = t_df[inside_fv]

        features.append(f)
        features_dataframe.append(f_df)
        targets_dataframe.append(t_df)

    # merge the per-file pieces (raises if the directory contains no files)
    features = np.vstack(features)
    features_dataframe = pd.concat(features_dataframe, axis=0, ignore_index=True)
    targets_dataframe = pd.concat(targets_dataframe, axis=0, ignore_index=True)

    # save compressed features and targets, one set of files per directory name
    np.savez_compressed(f"/mnt/ferracci/features_test_{directory.name}", a=features)
    features_dataframe.to_csv(f"/mnt/ferracci/features_dataframe_test_{directory.name}.csv.gz", index=False, compression='gzip')
    # NOTE(review): unlike the features CSV, the targets CSV is written WITH its index
    # column; kept as-is so existing readers of this file are unaffected.
    targets_dataframe.to_csv(f"/mnt/ferracci/targets_dataframe_test_{directory.name}.csv.gz", compression="gzip")