# Parametrized DNN
parametrized DNN 
1. Include all signals
2. check the backgrounds.
3. Check the AUC score as this is coming 1
4. Improve the training
5. include the weight of preselection

In [1]:
import os
import pandas as pd
import uproot
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam

# Taking mass X and corresponding Y mass points
# mass_points = [300, 400, 500, 550, 600, 650, 700, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000, 3500, 4000]  # Example mass points
# y_values = [ 60, 70, 80, 90, 95, 100, 125, 150, 200, 300, 400, 500, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000, 2600, 3000, 3500]  # Example Y values

mass_points = [300, 400, 500, 550, 600, 650, 700, 900, 1000]  # Example mass points
y_values = [ 100, 125, 150, 200, 300, 400, 500]  # Example Y values



# Load signal data from Parquet files
signal_data = []
for mass in mass_points:
    for y in y_values:
        file_path = f"../../../output_parquet/final_production_Syst/merged/NMSSM_X{mass}_Y{y}/nominal/NOTAG_merged.parquet"
        
        if os.path.exists(file_path):  # Check if file exists
            try:
                df = pd.read_parquet(file_path)  # Load the Parquet file
                df["mass"] = mass  
                df["y_value"] = y  # Store Y value if needed
                df["label"] = 1  # Assuming signal label
                signal_data.append(df)
            except Exception as e:
                print(f"Warning: Could not read {file_path}. Error: {e}")
        else:
            print(f"Warning: File {file_path} does not exist.")

# Combine all signal data into a single DataFrame
signal_df = pd.concat(signal_data, ignore_index=True) if signal_data else pd.DataFrame()



In [2]:
signal_df.shape

(346281, 853)

In [3]:
# Load background data from ROOT files
background_files = [
    ("../../outputfiles/hhbbgg_analyzer-v2-trees.root", "/GGJets/preselection"),
    ("../../outputfiles/hhbbgg_analyzer-v2-trees.root", "/GJetPt20To40/preselection"),
    ("../../outputfiles/hhbbgg_analyzer-v2-trees.root", "/GJetPt40/preselection"),
#     ("../../outputfiles/hhbbgg_analyzer-v2-trees.root", "/GGJets/preselection"),
#     ("../../outputfiles/hhbbgg_analyzer-v2-trees.root", "/GJetPt20To40/preselection"),
#     ("../../outputfiles/hhbbgg_analyzer-v2-trees.root", "/GJetPt40/preselection"),
]
background_data = []
for file_path, tree_name in background_files:
    try:
        with uproot.open(file_path) as file:
            tree = file[tree_name]
            df = tree.arrays(library="pd")
            df["mass"] = np.random.choice(mass_points, len(df))  # Random mass assignment
            df["label"] = 0
            background_data.append(df)
    except Exception as e:
        print(f"Warning: Could not read {file_path}. Error: {e}")

df_background = pd.concat(background_data, ignore_index=True) if background_data else pd.DataFrame()

In [4]:
# Define features and labels
features = [
    'bbgg_eta', 'bbgg_phi', 'lead_pho_phi', 'sublead_pho_eta', 
    'sublead_pho_phi', 'diphoton_eta', 'diphoton_phi', 'dibjet_eta', 'dibjet_phi', 
    'lead_bjet_pt', 'sublead_bjet_pt', 'lead_bjet_eta', 'lead_bjet_phi', 'sublead_bjet_eta', 
    'sublead_bjet_phi', 'sublead_bjet_PNetB', 'lead_bjet_PNetB', 'CosThetaStar_gg', 
    'CosThetaStar_jj', 'CosThetaStar_CS', 'DeltaR_jg_min', 'pholead_PtOverM', 
    'phosublead_PtOverM', 'lead_pho_mvaID', 'sublead_pho_mvaID'
]
features.extend(["mass", "y_value"])


In [5]:
# Random mass + y_value assignment for backgrounds (ensure this was done earlier!)
df_background["mass"] = np.random.choice(mass_points, len(df_background))
df_background["y_value"] = np.random.choice(y_values, len(df_background))


In [17]:

# Reduce background dataset size by random sampling
background_fraction = 0.6 #  20% of the background
df_background = df_background.sample(frac=background_fraction, random_state=42)

# Combine signal and background
df_combined = pd.concat([signal_df, df_background], ignore_index=True)
df_combined = df_combined.dropna(subset=['weight_preselection'])  # ✅ drop rows with bad weights
print("Class balance after dropping NaNs:", df_combined['label'].value_counts())


# checking df_combined is not empty
if df_combined.empty:
    raise ValueError("Error: Combined DataFrame is empty. Check input files.")

# Convert feature data to DataFrame to prevent AttributeError
df_features = df_combined[features]

# Fill missing values with column mean
df_features = df_features.fillna(df_features.mean())

# Extract features (X) and labels (y)
X = df_features.values
y = df_combined["label"].values

Class balance after dropping NaNs: 0    1199138
Name: label, dtype: int64


In [7]:
# class balance
print("signal shape", signal_df.shape, "background.shape", df_background.shape)


signal shape (346281, 853) background.shape (1998563, 100)


In [8]:
weights = ['weight_preselection']


In [9]:
from sklearn.model_selection import train_test_split

# Split into 80% train, 20% test (stratified to maintain label distribution)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import TensorDataset, DataLoader
import numpy as np

# Start fresh from combined_df
X_with_meta = df_combined[features + ['label', 'weight_preselection']].copy()

# Split using stratification
train_df, test_df = train_test_split(
    X_with_meta, test_size=0.2, random_state=42, stratify=X_with_meta['label']
)

# Extract arrays
X_train = train_df[features].values
y_train = train_df['label'].values
# w_train = train_df['weight_preselection'].values

X_test = test_df[features].values
y_test = test_df['label'].values
# w_test = test_df['weight_preselection'].values

def normalize_weights(weights, labels):
    sig_mask = labels == 1
    bkg_mask = labels == 0
    weights = np.array(weights, dtype=np.float32)

    norm_weights = np.zeros_like(weights)
    norm_weights[sig_mask] = 0.5 * weights[sig_mask] / np.sum(weights[sig_mask])
    norm_weights[bkg_mask] = 0.5 * weights[bkg_mask] / np.sum(weights[bkg_mask])
    return norm_weights

# Start with raw preselections
raw_w_train = train_df['weight_preselection'].values
raw_w_test = test_df['weight_preselection'].values

# Normalize to 0.5 sum per class
w_train = normalize_weights(raw_w_train, y_train)
w_test = normalize_weights(raw_w_test, y_test)
assert not np.isnan(w_train).any(), "Still NaNs in train weights!"



# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to torch tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
train_weights_tensor = torch.tensor(w_train, dtype=torch.float32)

X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)
test_weights_tensor = torch.tensor(w_test, dtype=torch.float32)

# Create PyTorch datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor, train_weights_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor, test_weights_tensor)

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)


In [11]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np
import os

# Define the Parametric DNN
class ParameterizedDNN(nn.Module):
    def __init__(self, input_dim):
        super(ParameterizedDNN, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(0.3),

            nn.Linear(64, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.Dropout(0.3),

            nn.Linear(32, 16),
            nn.ReLU(),
            nn.BatchNorm1d(16),
            nn.Dropout(0.3),

            nn.Linear(16, 1)  # Logits
        )

    def forward(self, x):
        return self.model(x)



In [12]:
# Initialize model
input_dim = X_train_tensor.shape[1]
model = ParameterizedDNN(input_dim)
criterion = nn.BCEWithLogitsLoss(reduction='none')  # Enable per-sample loss
optimizer = Adam(model.parameters(), lr=0.001, weight_decay=1e-3)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



ParameterizedDNN(
  (model): Sequential(
    (0): Linear(in_features=27, out_features=64, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): ReLU()
    (6): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=32, out_features=16, bias=True)
    (9): ReLU()
    (10): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.3, inplace=False)
    (12): Linear(in_features=16, out_features=1, bias=True)
  )
)

In [13]:
# loss function
def weighted_bce_loss(logits, targets, weights):
    # logits: raw model outputs
    # targets: binary labels (0 or 1)
    # weights: per-sample weights (e.g. preselection * normalization)
    loss = torch.nn.functional.binary_cross_entropy_with_logits(
        logits, targets, weight=weights, reduction='none'
    )
    return loss.mean()


In [14]:
# Training settings
num_epochs = 50
patience = 5
best_auc = 0.0
patience_counter = 0
save_path = "best_parametric_model.pt"

train_losses, train_accuracies, train_aucs = [], [], []

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    y_true_train, y_pred_train = [], []

    for X_batch, y_batch, w_batch in train_dataloader:
        X_batch, y_batch, w_batch = X_batch.to(device), y_batch.to(device), w_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch).squeeze()
        outputs = torch.clamp(outputs, min=-50.0, max=50.0)  # Prevent sigmoid overflow

#         loss = criterion(outputs, y_batch)
#         weighted_loss = (loss * w_batch).mean()
        weighted_loss = weighted_bce_loss(outputs, y_batch, w_batch)
        weighted_loss.backward()
        optimizer.step()

        epoch_loss += weighted_loss.item()

        probs = torch.sigmoid(outputs).detach().cpu().numpy()
        probs = np.nan_to_num(probs, nan=0.0, posinf=1.0, neginf=0.0)
        y_pred_train.extend(probs)
        y_true_train.extend(y_batch.cpu().numpy())

    # Metrics
    y_pred_train_binary = (np.array(y_pred_train) > 0.5).astype(int)
    train_accuracy = accuracy_score(y_true_train, y_pred_train_binary)
    train_auc = roc_auc_score(y_true_train, y_pred_train)

    # Validation
    model.eval()
    with torch.no_grad():
        outputs_test = model(X_test_tensor.to(device)).squeeze()
        outputs_test = torch.clamp(outputs_test, min=-50.0, max=50.0)
        probs_test = torch.sigmoid(outputs_test).cpu().numpy()
        probs_test = np.nan_to_num(probs_test, nan=0.0, posinf=1.0, neginf=0.0)

#         loss_test = criterion(outputs_test, y_test_tensor.to(device))
#         test_loss = (loss_test * test_weights_tensor.to(device)).mean().item()
        test_loss = weighted_bce_loss(outputs_test, y_test_tensor.to(device), test_weights_tensor.to(device)).item()


        y_pred_test_binary = (probs_test > 0.5).astype(int)
        test_accuracy = accuracy_score(y_test, y_pred_test_binary)
        test_auc = roc_auc_score(y_test, probs_test)

    train_losses.append(epoch_loss / len(train_dataloader))
    train_accuracies.append(train_accuracy)
    train_aucs.append(train_auc)

    print(f"\nEpoch [{epoch+1}/{num_epochs}]")
    print(f"Train Loss: {train_losses[-1]:.4f} | Train Acc: {train_accuracy:.4f} | Train AUC: {train_auc:.4f}")
    print(f"Test  Loss: {test_loss:.4f} | Test  Acc: {test_accuracy:.4f} | Test  AUC: {test_auc:.4f}")

    # Early Stopping
    if test_auc > best_auc:
        best_auc = test_auc
        patience_counter = 0
        torch.save(model.state_dict(), save_path)
        print(f"✅ Model improved. Saved to: {save_path}")
    else:
        patience_counter += 1
        print(f"No improvement in AUC for {patience_counter} epoch(s).")

    if patience_counter >= patience:
        print("⛔ Early stopping triggered.")
        break

# Load best model
model.load_state_dict(torch.load(save_path))
print("✅ Best model loaded from checkpoint.")


ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [15]:
print("Any NaNs in weights?", np.isnan(w_train).any(), np.isnan(w_test).any())
print("Any Infs in weights?", np.isinf(w_train).any(), np.isinf(w_test).any())
print("Weight stats:", np.min(w_train), np.max(w_train), np.mean(w_train))
print("Label distribution:", np.bincount(y_train))


Any NaNs in weights? False False
Any Infs in weights? False False
Weight stats: -1.410449e-07 7.8918065e-06 3.1272472e-07
Label distribution: [1598850]


In [16]:
for X_batch, y_batch, w_batch in train_dataloader:
    print("X_batch shape:", X_batch.shape)
    print("y_batch min/max:", y_batch.min().item(), y_batch.max().item())
    print("w_batch min/max:", w_batch.min().item(), w_batch.max().item())
    print("X_batch NaNs:", torch.isnan(X_batch).any().item())
    print("First 5 rows of X:", X_batch[:5])
    break


X_batch shape: torch.Size([64, 27])
y_batch min/max: 0.0 0.0
w_batch min/max: 7.05224465491483e-08 7.891806490079034e-06
X_batch NaNs: False
First 5 rows of X: tensor([[ 0.0697,  1.5136,  1.4884,  0.4126,  0.0128,  0.0348,  1.3392,  0.0653,
         -0.5124, -0.1782, -0.1780,  0.0174, -0.2371,  0.1167, -1.4339, -0.3776,
         -0.2196, -0.5763, -0.0898, -0.0410, -1.1316,  0.3156, -0.6895,  0.4607,
          0.5037,  1.7881,  0.3299],
        [ 0.7992, -1.3368,  1.4368, -0.0648, -0.2400,  0.1234,  1.3410,  1.1913,
         -1.1651, -0.5242, -0.5191,  1.9290, -1.2705, -0.7345, -0.9372, -0.3659,
         -0.3811,  0.2713,  1.5017, -1.2542,  0.2397, -0.2299,  0.1994,  0.5972,
          0.5811,  0.3680, -1.0941],
        [-0.6312,  0.8134,  0.1802, -0.7195, -1.5005, -0.3054,  0.1500, -0.1920,
          1.6694, -0.0154,  1.0595, -0.0382, -1.1588, -0.1171,  0.9380, -0.1586,
          3.8482,  0.6326,  0.0817, -0.3181, -0.4692,  0.2216, -0.9589,  0.6778,
         -1.3542, -0.5788, -1.0941],
