In [1]:
import h5py
import numpy as np
import pandas as pd
import vector
from sklearn.model_selection import train_test_split

In [2]:
def get_mjj(df):
    jet1 = vector.array(
        {
            "px": df["pxj1"],
            "py": df["pyj1"],
            "pz": df["pzj1"],
            "m": df["mj1"],
        }
    )
    
    jet2 = vector.array(
        {
            "px": df["pxj2"],
            "py": df["pyj2"],
            "pz": df["pzj2"],
            "m": df["mj2"],
        }
    )
    
    return (jet1 + jet2).mass

def get_vectors(df):
    jet1 = vector.array(
        {
            "px": df["pxj1"],
            "py": df["pyj1"],
            "pz": df["pzj1"],
            "m": df["mj1"],
        }
    )
    
    jet2 = vector.array(
        {
            "px": df["pxj2"],
            "py": df["pyj2"],
            "pz": df["pzj2"],
            "m": df["mj2"],
        }
    )
    
    return jet1, jet2


In [3]:
data = pd.read_hdf('./events_extratau.h5', key='df')
extrabg_data = pd.read_hdf('events_anomalydetection_DelphesPythia8_v2_qcd_extra_inneronly_combined_extratau_2_features.h5', key='df')

Correctly sort jets by mass (Jet 1 mass > Jet 2 mass)

In [4]:
for col in data.columns:
    data[col] = data[col].astype(np.float32)

wrong_data = data[(data["mj2"] > data["mj1"])]

tmp_wrong_data = pd.DataFrame()
for col in wrong_data.columns:
    if "j1" in col:
        tmp_wrong_data[col.replace("j1", "j2")] = wrong_data[col]
    elif "j2" in col:
        tmp_wrong_data[col.replace("j2", "j1")] = wrong_data[col]
    else:
        tmp_wrong_data[col] = wrong_data[col]

data = pd.concat([data[(data["mj2"] <= data["mj1"])], tmp_wrong_data], ignore_index=True)

In [5]:
# compute tauX(X-1) and tauX1 subjettiness ratios
for jet in range(1, 3):
    for beta in [1, 5, 2]:
        for tau in range(2, 10):
            data[f"tau{tau}{tau-1}j{jet}_{beta}"] = data[f"tau{tau}j{jet}_{beta}"] / data[f"tau{tau-1}j{jet}_{beta}"]
            data[f"tau{tau}{tau-1}j{jet}_{beta}"][data[f"tau{tau}{tau-1}j{jet}_{beta}"].isna()] = 0
            if tau == 2:
                continue
            else:
                data[f"tau{tau}1j{jet}_{beta}"] = data[f"tau{tau}j{jet}_{beta}"] / data[f"tau1j{jet}_{beta}"]
                data[f"tau{tau}1j{jet}_{beta}"][data[f"tau{tau}1j{jet}_{beta}"].isna()] = 0

  data[f"tau{tau}{tau-1}j{jet}_{beta}"] = data[f"tau{tau}j{jet}_{beta}"] / data[f"tau{tau-1}j{jet}_{beta}"]
  data[f"tau{tau}1j{jet}_{beta}"] = data[f"tau{tau}j{jet}_{beta}"] / data[f"tau1j{jet}_{beta}"]
  data[f"tau{tau}{tau-1}j{jet}_{beta}"] = data[f"tau{tau}j{jet}_{beta}"] / data[f"tau{tau-1}j{jet}_{beta}"]
  data[f"tau{tau}1j{jet}_{beta}"] = data[f"tau{tau}j{jet}_{beta}"] / data[f"tau1j{jet}_{beta}"]
  data[f"tau{tau}{tau-1}j{jet}_{beta}"] = data[f"tau{tau}j{jet}_{beta}"] / data[f"tau{tau-1}j{jet}_{beta}"]
  data[f"tau{tau}1j{jet}_{beta}"] = data[f"tau{tau}j{jet}_{beta}"] / data[f"tau1j{jet}_{beta}"]
  data[f"tau{tau}{tau-1}j{jet}_{beta}"] = data[f"tau{tau}j{jet}_{beta}"] / data[f"tau{tau-1}j{jet}_{beta}"]
  data[f"tau{tau}1j{jet}_{beta}"] = data[f"tau{tau}j{jet}_{beta}"] / data[f"tau1j{jet}_{beta}"]
  data[f"tau{tau}{tau-1}j{jet}_{beta}"] = data[f"tau{tau}j{jet}_{beta}"] / data[f"tau{tau-1}j{jet}_{beta}"]
  data[f"tau{tau}{tau-1}j{jet}_{beta}"] = data[f"tau{tau}j{jet}_{beta}"] / d

In [6]:
# Add delta mj
data["delta_mj"] = data["mj1"] - data["mj2"]

# Add mjj
mjj = get_mjj(data)
data["mjj"] = mjj

  data["delta_mj"] = data["mj1"] - data["mj2"]
  data["mjj"] = mjj


Repeat for extra background

In [7]:
for col in extrabg_data.columns:
    extrabg_data[col] = extrabg_data[col].astype(np.float32)

wrong_extrabg_data = extrabg_data[(extrabg_data["mj2"] > extrabg_data["mj1"])]

tmp_wrong_data = pd.DataFrame()
for col in wrong_extrabg_data.columns:
    if "j1" in col:
        tmp_wrong_data[col.replace("j1", "j2")] = wrong_extrabg_data[col]
    elif "j2" in col:
        tmp_wrong_data[col.replace("j2", "j1")] = wrong_extrabg_data[col]

extrabg_data = pd.concat([extrabg_data[(extrabg_data["mj2"] <= extrabg_data["mj1"])], tmp_wrong_data], ignore_index=True)

In [8]:
# compute tauX(X-1) and tauX1 subjettiness ratios
for jet in range(1, 3):
    for beta in [1, 5, 2]:
        for tau in range(2, 10):
            extrabg_data[f"tau{tau}{tau-1}j{jet}_{beta}"] = extrabg_data[f"tau{tau}j{jet}_{beta}"] / extrabg_data[f"tau{tau-1}j{jet}_{beta}"]
            extrabg_data[f"tau{tau}{tau-1}j{jet}_{beta}"][extrabg_data[f"tau{tau}{tau-1}j{jet}_{beta}"].isna()] = 0
            if tau == 2:
                continue
            else:
                extrabg_data[f"tau{tau}1j{jet}_{beta}"] = extrabg_data[f"tau{tau}j{jet}_{beta}"] / extrabg_data[f"tau1j{jet}_{beta}"]
                extrabg_data[f"tau{tau}1j{jet}_{beta}"][extrabg_data[f"tau{tau}1j{jet}_{beta}"].isna()] = 0

  extrabg_data[f"tau{tau}1j{jet}_{beta}"] = extrabg_data[f"tau{tau}j{jet}_{beta}"] / extrabg_data[f"tau1j{jet}_{beta}"]
  extrabg_data[f"tau{tau}{tau-1}j{jet}_{beta}"] = extrabg_data[f"tau{tau}j{jet}_{beta}"] / extrabg_data[f"tau{tau-1}j{jet}_{beta}"]
  extrabg_data[f"tau{tau}1j{jet}_{beta}"] = extrabg_data[f"tau{tau}j{jet}_{beta}"] / extrabg_data[f"tau1j{jet}_{beta}"]
  extrabg_data[f"tau{tau}{tau-1}j{jet}_{beta}"] = extrabg_data[f"tau{tau}j{jet}_{beta}"] / extrabg_data[f"tau{tau-1}j{jet}_{beta}"]
  extrabg_data[f"tau{tau}1j{jet}_{beta}"] = extrabg_data[f"tau{tau}j{jet}_{beta}"] / extrabg_data[f"tau1j{jet}_{beta}"]
  extrabg_data[f"tau{tau}{tau-1}j{jet}_{beta}"] = extrabg_data[f"tau{tau}j{jet}_{beta}"] / extrabg_data[f"tau{tau-1}j{jet}_{beta}"]
  extrabg_data[f"tau{tau}1j{jet}_{beta}"] = extrabg_data[f"tau{tau}j{jet}_{beta}"] / extrabg_data[f"tau1j{jet}_{beta}"]
  extrabg_data[f"tau{tau}{tau-1}j{jet}_{beta}"] = extrabg_data[f"tau{tau}j{jet}_{beta}"] / extrabg_data[f"tau{tau-1}j{jet}_{

In [9]:
# Add delta mj
extrabg_data["delta_mj"] = extrabg_data["mj1"] - extrabg_data["mj2"]

# Add mjj
extrabg_mjj = get_mjj(extrabg_data)
extrabg_data["mjj"] = extrabg_mjj

  extrabg_data["delta_mj"] = extrabg_data["mj1"] - extrabg_data["mj2"]
  extrabg_data["mjj"] = extrabg_mjj


In [10]:
mjj_mask = (mjj > 3300) & (mjj < 3700)
sig_mask = (data['label'] == 1)
bg_data = data[~sig_mask & mjj_mask]
sig_full_data = data[sig_mask]
sig_data = data[sig_mask & mjj_mask]

# deal with signal first: select 1000 events from full signal dataset,
# then apply mjj mask
n_sig = 1000
rng = np.random.default_rng(42)
idx_arr = np.arange(sig_full_data.shape[0])
rng.shuffle(idx_arr)
selected_sig = sig_full_data.iloc[idx_arr[:n_sig]]
extra_sig = sig_full_data.iloc[idx_arr[n_sig:]]
inner_sig_mask = (selected_sig['mjj'] > 3300) & (selected_sig['mjj'] < 3700)
trainval_sig = selected_sig[inner_sig_mask]
inner_extrasig_mask = (extra_sig['mjj'] > 3300) & (extra_sig['mjj'] < 3700)
inner_extra_sig = extra_sig[inner_extrasig_mask]
inner_test_sig = inner_extra_sig[:20000]
inner_extra_sig = inner_extra_sig[20000:]

#save extra signal
inner_extra_sig.to_hdf("./events_subjettiness_features/innerdata_extrasig.h5", key="df", mode="w")

# split signal in train, val sets
train_sig, val_sig = train_test_split(trainval_sig, test_size=0.5, random_state=42)

train_bg, val_bg = train_test_split(bg_data, test_size=0.5, random_state=42)

innerdata_train = pd.concat([train_bg, train_sig], ignore_index=True)
innerdata_val = pd.concat([val_bg, val_sig], ignore_index=True)

# save inner data
innerdata_train.to_hdf("./events_subjettiness_features/innerdata_train.h5", key="df", mode="w")
innerdata_val.to_hdf("./events_subjettiness_features/innerdata_val.h5", key="df", mode="w")

Deal with extra background events

In [11]:
mjj_mask = (extrabg_mjj > 3300) & (extrabg_mjj < 3700)

extrabg_data = extrabg_data[mjj_mask]

In [12]:
trainval_extrabg, test_extrabg = train_test_split(extrabg_data, test_size=340_000, random_state=42)
train_extrabg, val_extrabg = train_test_split(trainval_extrabg, test_size=0.5, random_state=42)

# concatenate test set with extra signal
test_data = pd.concat([test_extrabg, inner_test_sig], ignore_index=True)

# save inner extra background
train_extrabg.to_hdf("./events_subjettiness_features/innerdata_extrabkg_train.h5", key="df", mode="w")
val_extrabg.to_hdf("./events_subjettiness_features/innerdata_extrabkg_val.h5", key="df", mode="w")
test_data.to_hdf("./events_subjettiness_features/innerdata_test.h5", key="df", mode="w")
