In [11]:
import os
import re
import pandas as pd
import uproot
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_curve, auc, accuracy_score
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
import seaborn as sns
from sklearn.metrics import roc_auc_score, accuracy_score


In [2]:
signal_files = [
    ("../../../output_parquet/final_production_Syst/merged/NMSSM_X300_Y100/nominal/NOTAG_merged.parquet"),
    ("../../../output_parquet/final_production_Syst/merged/NMSSM_X400_Y100/nominal/NOTAG_merged.parquet"),
    ("../../../output_parquet/final_production_Syst/merged/NMSSM_X400_Y125/nominal/NOTAG_merged.parquet"),
    ("../../../output_parquet/final_production_Syst/merged/NMSSM_X400_Y150/nominal/NOTAG_merged.parquet"),
    ("../../../output_parquet/final_production_Syst/merged/NMSSM_X400_Y200/nominal/NOTAG_merged.parquet"),
    ("../../../output_parquet/final_production_Syst/merged/NMSSM_X500_Y100/nominal/NOTAG_merged.parquet"),
    ("../../../output_parquet/final_production_Syst/merged/NMSSM_X500_Y125/nominal/NOTAG_merged.parquet"),
    ("../../../output_parquet/final_production_Syst/merged/NMSSM_X500_Y150/nominal/NOTAG_merged.parquet"),
    ("../../../output_parquet/final_production_Syst/merged/NMSSM_X500_Y200/nominal/NOTAG_merged.parquet"),
    ("../../../output_parquet/final_production_Syst/merged/NMSSM_X500_Y300/nominal/NOTAG_merged.parquet"),
    ("../../../output_parquet/final_production_Syst/merged/NMSSM_X550_Y100/nominal/NOTAG_merged.parquet"),
    ("../../../output_parquet/final_production_Syst/merged/NMSSM_X550_Y125/nominal/NOTAG_merged.parquet"),
    ("../../../output_parquet/final_production_Syst/merged/NMSSM_X550_Y150/nominal/NOTAG_merged.parquet"),
    ("../../../output_parquet/final_production_Syst/merged/NMSSM_X550_Y200/nominal/NOTAG_merged.parquet"),
    ("../../../output_parquet/final_production_Syst/merged/NMSSM_X550_Y300/nominal/NOTAG_merged.parquet"),
    ("../../../output_parquet/final_production_Syst/merged/NMSSM_X600_Y100/nominal/NOTAG_merged.parquet"),
    ("../../../output_parquet/final_production_Syst/merged/NMSSM_X600_Y125/nominal/NOTAG_merged.parquet"),
    ("../../../output_parquet/final_production_Syst/merged/NMSSM_X600_Y150/nominal/NOTAG_merged.parquet"),
    ("../../../output_parquet/final_production_Syst/merged/NMSSM_X600_Y200/nominal/NOTAG_merged.parquet"),
    ("../../../output_parquet/final_production_Syst/merged/NMSSM_X600_Y300/nominal/NOTAG_merged.parquet"),
    ("../../../output_parquet/final_production_Syst/merged/NMSSM_X600_Y400/nominal/NOTAG_merged.parquet"),
]

In [3]:
signal_data = []

for file_path in signal_files:
    if os.path.exists(file_path):
        try:
            # Extract mass and y_value using regex
            match = re.search(r'NMSSM_X(\d+)_Y(\d+)', file_path)
            if match:
                mass = int(match.group(1))
                y = int(match.group(2))

                df = pd.read_parquet(file_path)
                df["mass"] = mass
                df["y_value"] = y
                df["label"] = 1  # signal

                signal_data.append(df)
            else:
                print(f"Warning: Could not extract mass/Y from path: {file_path}")
        except Exception as e:
            print(f"Warning: Could not read {file_path}. Error: {e}")
    else:
        print(f"Warning: File does not exist: {file_path}")

# Combine all signal data into one DataFrame
signal_df = pd.concat(signal_data, ignore_index=True) if signal_data else pd.DataFrame()
print(f"Loaded signal samples successfully :-): {len(signal_df)} rows")

Loaded signal samples successfully :-): 85495 rows


In [4]:
signal_df.shape

(85495, 853)

In [5]:
# Load background data from ROOT files
background_files = [
    ("../../outputfiles/hhbbgg_analyzer-v2-trees.root", "/GGJets/preselection"),
    ("../../outputfiles/hhbbgg_analyzer-v2-trees.root", "/GJetPt20To40/preselection"),
    ("../../outputfiles/hhbbgg_analyzer-v2-trees.root", "/GJetPt40/preselection"),
    ("../../outputfiles/hhbbgg_analyzer-v2-trees.root", "/ttHToGG/preselection"),
#     ("../../outputfiles/hhbbgg_analyzer-v2-trees.root", "/GJetPt20To40/preselection"),
#     ("../../outputfiles/hhbbgg_analyzer-v2-trees.root", "/GJetPt40/preselection"),
]
background_data = []
for file_path, tree_name in background_files:
    try:
        with uproot.open(file_path) as file:
            tree = file[tree_name]
            df = tree.arrays(library="pd")
#             df["mass"] = np.random.choice(mass_points, len(df))  # Random mass assignment
            df["label"] = 0
            background_data.append(df)
    except Exception as e:
        print(f"Warning: Could not read {file_path}. Error: {e}")

df_background = pd.concat(background_data, ignore_index=True) if background_data else pd.DataFrame()
print("background shape", df_background)

background shape          run  lumi     event  puppiMET_pt  puppiMET_phi  puppiMET_phiJERDown  \
0          1    13    8457.0    28.794365     -2.702637            -2.724609   
1          1    13    8485.0    28.510080     -1.231934            -1.179688   
2          1    13    8508.0    20.095448      1.424805             0.713379   
3          1    13    8522.0    19.977036     -0.033577            -0.047729   
4          1    13    8520.0    41.458942      2.011719             2.269531   
...      ...   ...       ...          ...           ...                  ...   
3591490    1   619  764410.0    42.836834      0.463562             0.482666   
3591491    1   619  764414.0    41.670010      3.051270             2.968750   
3591492    1   619  764429.0    44.757793     -0.725220            -0.732910   
3591493    1   619  764440.0     4.608137     -1.227539            -1.425781   
3591494    1   619  764431.0   101.927986     -2.302734            -2.382812   

         puppiMET_phiJ

In [6]:
# Define features and labels
features = [
    'bbgg_eta', 'bbgg_phi', 'lead_pho_phi', 'sublead_pho_eta', 
    'sublead_pho_phi', 'diphoton_eta', 'diphoton_phi', 'dibjet_eta', 'dibjet_phi', 
    'lead_bjet_pt', 'sublead_bjet_pt', 'lead_bjet_eta', 'lead_bjet_phi', 'sublead_bjet_eta', 
    'sublead_bjet_phi', 'sublead_bjet_PNetB', 'lead_bjet_PNetB', 'CosThetaStar_gg', 
    'CosThetaStar_jj', 'CosThetaStar_CS', 'DeltaR_jg_min', 'pholead_PtOverM', 
    'phosublead_PtOverM', 'lead_pho_mvaID', 'sublead_pho_mvaID',
    
]
# features.extend(["mass", "y_value"])

In [8]:
# Intersect features
common_features = list(set(signal_df.columns) & set(df_background.columns))
exclude = ['mass', 'label', 'y_value']
features = [f for f in common_features if f not in exclude]

df_all = pd.concat([signal_df, df_background], ignore_index=True)
X = df_all[features]
y = df_all["label"].values

# Impute missing and scale
imputer = SimpleImputer(strategy="mean")
scaler = StandardScaler()

X = imputer.fit_transform(X)
X = scaler.fit_transform(X)

X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32).view(-1, 1)

train_X, test_X, train_y, test_y = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

train_ds = TensorDataset(train_X, train_y)
test_ds = TensorDataset(test_X, test_y)
train_loader = DataLoader(train_ds, batch_size=1024, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=1024)


In [9]:
class DNN(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

model = DNN(input_dim=X.shape[1])
criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr=1e-3)


In [None]:
epochs = 10

for epoch in range(epochs):
    model.train()
    train_loss = 0
    for xb, yb in train_loader:
        preds = model(xb)
        loss = criterion(preds, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    model.eval()
    with torch.no_grad():
        preds = model(test_X)
        auc_score = roc_auc_score(test_y, preds)
        acc = accuracy_score(test_y, preds.round())
    
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Test AUC: {auc_score:.4f}, Accuracy: {acc:.4f}")


Epoch 1/10, Train Loss: 115.0589, Test AUC: 0.9922, Accuracy: 0.9828
Epoch 2/10, Train Loss: 103.3632, Test AUC: 0.9936, Accuracy: 0.9847
Epoch 3/10, Train Loss: 98.7702, Test AUC: 0.9951, Accuracy: 0.9867
Epoch 4/10, Train Loss: 95.7061, Test AUC: 0.9934, Accuracy: 0.9842
Epoch 5/10, Train Loss: 92.6203, Test AUC: 0.9961, Accuracy: 0.9844


In [None]:
df_all["score"] = model(torch.tensor(imputer.transform(scaler.inverse_transform(X)), dtype=torch.float32)).detach().numpy()

for mass in sorted(signal_df["mass"].unique()):
    subset = df_all[df_all["mass"] == mass]
    if len(subset) > 0:
        try:
            auc_m = roc_auc_score(subset["label"], subset["score"])
            print(f"AUC for mass {mass} GeV: {auc_m:.4f}")
        except:
            pass


In [7]:

# Reduce background dataset size by random sampling
background_fraction = 0.1 #  20% of the background
df_background = df_background.sample(frac=background_fraction, random_state=42)

# Combine signal and background
df_combined = pd.concat([signal_df, df_background], ignore_index=True)

# Ensure df_combined is not empty
if df_combined.empty:
    raise ValueError("Error: Combined DataFrame is empty. Check input files.")

# Convert feature data to DataFrame to prevent AttributeError
df_features = df_combined[features]

# Fill missing values with column mean
df_features = df_features.fillna(df_features.mean())

# Extract features (X) and labels (y)
X = df_features.values
y = df_combined["label"].values

In [8]:
df_background.shape

(359150, 82)

In [9]:
df_features.shape

(444645, 25)

Check the ttH killer by Thomas

https://gitlab.cern.ch/hhbbgg/tth-killer-eval