In [1]:
import os
import re
import pandas as pd
import uproot
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_curve, auc, accuracy_score
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
import seaborn as sns

In [2]:
# Signal inputs: one merged "nominal" parquet file per NMSSM (X, Y) point.
# All files live under the same production directory and differ only in the
# X/Y numbers embedded in the directory name, so the list is generated from
# the table of mass points below (order: increasing X, then increasing Y).
signal_files = [
    f"../../../output_parquet/final_production_Syst/merged/NMSSM_X{x_point}_Y{y_point}/nominal/NOTAG_merged.parquet"
    for x_point, y_points in (
        (300, (100,)),
        (400, (100, 125, 150, 200)),
        (500, (100, 125, 150, 200, 300)),
        (550, (100, 125, 150, 200, 300)),
        (600, (100, 125, 150, 200, 300, 400)),
    )
    for y_point in y_points
]

In [3]:
# Load every signal parquet file into one DataFrame (signal_df).
#
# Each file's events are tagged with:
#   mass    - the X value parsed from the "NMSSM_X<..>_Y<..>" part of the path
#   y_value - the Y value parsed from the same part of the path
#   label   - 1 (signal)
#
# Loading is best-effort: a file that is missing, unreadable, or whose path
# does not contain the X/Y pattern is reported with a warning and skipped.
signal_data = []

for file_path in signal_files:
    if not os.path.exists(file_path):
        print(f"Warning: File does not exist: {file_path}")
        continue

    # Extract mass and y_value using regex
    match = re.search(r'NMSSM_X(\d+)_Y(\d+)', file_path)
    if match is None:
        print(f"Warning: Could not extract mass/Y from path: {file_path}")
        continue

    mass = int(match.group(1))
    # Named y_value (not y) so it cannot be confused with the label vector `y`
    # that the notebook builds later.
    y_value = int(match.group(2))

    try:
        df = pd.read_parquet(file_path).assign(mass=mass, y_value=y_value, label=1)
    except Exception as e:
        print(f"Warning: Could not read {file_path}. Error: {e}")
        continue

    signal_data.append(df)

# Combine all signal data into one DataFrame. Only claim success when at least
# one file was actually loaded; otherwise fall back to an empty frame and say so.
if signal_data:
    signal_df = pd.concat(signal_data, ignore_index=True)
    print(f"Loaded signal samples successfully :-): {len(signal_df)} rows")
else:
    signal_df = pd.DataFrame()
    print("Warning: No signal samples were loaded; signal_df is empty.")

Loaded signal samples successfully :-): 85495 rows


In [4]:
# Sanity check: (rows, columns) of the concatenated signal DataFrame
signal_df.shape

(85495, 853)

In [5]:
# Load background data from ROOT files.
#
# Each entry is (ROOT file path, tree path inside that file). Every tree is
# read in full into a pandas DataFrame and tagged with label = 0 (background).
# Unlike the signal frames, no "mass" / "y_value" columns are added here.
#
# Loading is best-effort: a tree that cannot be read is reported with a warning
# and skipped.
background_files = [
    ("../../outputfiles/hhbbgg_analyzer-v2-trees.root", "/GGJets/preselection"),
    ("../../outputfiles/hhbbgg_analyzer-v2-trees.root", "/GJetPt20To40/preselection"),
    ("../../outputfiles/hhbbgg_analyzer-v2-trees.root", "/GJetPt40/preselection"),
    ("../../outputfiles/hhbbgg_analyzer-v2-trees.root", "/ttHToGG/preselection"),
]

background_data = []
for file_path, tree_name in background_files:
    try:
        with uproot.open(file_path) as file:
            tree = file[tree_name]
            df = tree.arrays(library="pd")
            df["label"] = 0  # background
            background_data.append(df)
    except Exception as e:
        # All entries share one ROOT file, so the tree name is what identifies
        # the failing sample.
        print(f"Warning: Could not read {tree_name} from {file_path}. Error: {e}")

if not background_data:
    print("Warning: No background samples were loaded; df_background is empty.")

df_background = pd.concat(background_data, ignore_index=True) if background_data else pd.DataFrame()

In [6]:
# Input columns for the classifier.
# The order of this list fixes the column order of the feature matrix that is
# later built with df_combined[features].
# NOTE(review): 'lead_pho_eta' is not listed although 'sublead_pho_eta' is -
# confirm this is intended.
features = [
    # bbgg_* columns
    "bbgg_eta",
    "bbgg_phi",
    # lead_pho_* / sublead_pho_* angular columns
    "lead_pho_phi",
    "sublead_pho_eta",
    "sublead_pho_phi",
    # diphoton_* and dibjet_* columns
    "diphoton_eta",
    "diphoton_phi",
    "dibjet_eta",
    "dibjet_phi",
    # lead_bjet_* / sublead_bjet_* columns
    "lead_bjet_pt",
    "sublead_bjet_pt",
    "lead_bjet_eta",
    "lead_bjet_phi",
    "sublead_bjet_eta",
    "sublead_bjet_phi",
    "sublead_bjet_PNetB",
    "lead_bjet_PNetB",
    # CosThetaStar_* and DeltaR columns
    "CosThetaStar_gg",
    "CosThetaStar_jj",
    "CosThetaStar_CS",
    "DeltaR_jg_min",
    # *_PtOverM and *_mvaID photon columns
    "pholead_PtOverM",
    "phosublead_PtOverM",
    "lead_pho_mvaID",
    "sublead_pho_mvaID",
]
# The per-sample "mass" / "y_value" tags are currently not used as inputs:
# features.extend(["mass", "y_value"])

In [7]:

# Build the model inputs: all signal events plus a random subset of the
# background, restricted to the columns listed in `features`.

# Reduce background dataset size by random sampling (fixed seed).
# NOTE: df_background is overwritten with the sampled frame, so re-running this
# cell without first re-running the background-loading cell shrinks the sample
# again.
background_fraction = 0.1  # keep 10% of the background
df_background = df_background.sample(frac=background_fraction, random_state=42)

# Combine signal and background
df_combined = pd.concat([signal_df, df_background], ignore_index=True)

# Ensure df_combined is not empty
if df_combined.empty:
    raise ValueError("Error: Combined DataFrame is empty. Check input files.")

# A feature column that exists in only one of the two samples is NaN for every
# event of the other sample after the concat, and would then be silently
# replaced by the column mean below. Report that instead of hiding it.
for sample_name, sample_df in (("signal", signal_df), ("background", df_background)):
    missing_features = [col for col in features if col not in sample_df.columns]
    if missing_features:
        print(f"Warning: {sample_name} sample has no column(s) {missing_features}; "
              "these will be mean-imputed for all of its events.")

# Keep only the input feature columns
df_features = df_combined[features]

# Fill missing values with column mean
df_features = df_features.fillna(df_features.mean())

# Extract features (X) and labels (y)
X = df_features.values
y = df_combined["label"].values

In [8]:
# (rows, columns) of the background after the random down-sampling
# (frac=background_fraction) applied in the previous cell
df_background.shape

(359150, 82)

In [9]:
# (rows, columns) of the feature table: one row per combined signal/background
# event, one column per entry in `features`
df_features.shape

(444645, 25)

**TODO:** Check the ttH killer by Thomas (repository linked below):

https://gitlab.cern.ch/hhbbgg/tth-killer-eval