In [1]:
import os
import pandas as pd
import uproot
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam

In [2]:
# Mass values substituted into the NMSSM_X{mass}_Y100 sample names and stored
# as the per-event "mass" feature (units are not stated here - presumably GeV).
mass_points = [
    400,
    500,
]


In [3]:
# Load the signal trees, one per mass point, and tag every event with the mass
# of its sample (used as an input feature) and the signal label 1.
#
# uproot.open("file.root:obj/path") already returns the object at that path,
# so indexing its result with ["preselection"] again would be a branch lookup
# on the tree rather than a tree lookup in the file. Open the file itself and
# fetch each tree by its in-file path, the same way the background loader does.
signal_file_path = "../../outputfiles/hhbbgg_analyzer-v2-trees.root"
signal_data = []
with uproot.open(signal_file_path) as file:
    for mass in mass_points:
        tree = file[f"/NMSSM_X{mass}_Y100/preselection"]
        df = tree.arrays(library="pd")
        df["mass"] = mass  # Assign mass as a feature
        df["label"] = 1
        signal_data.append(df)

In [4]:
# Stack the per-mass signal frames row-wise and rebuild the index from 0.
df_signal = pd.concat(objs=signal_data, axis=0, ignore_index=True)


In [5]:
# (file path, in-file tree path) pairs for the background samples.
# All samples live in the same ROOT file; only the sample directory differs.
background_files = [
    ("../../outputfiles/hhbbgg_analyzer-v2-trees.root", f"/{sample}/preselection")
    for sample in (
        # "GGJets",  # currently left out of the background mix
        "GJetPt20To40",
        "GJetPt40",
    )
]

In [6]:
# Load every background tree, label its events 0, and give each event a mass
# drawn uniformly from mass_points so the "mass" feature is populated for
# background as well.
#
# The draw uses a seeded generator so that the assignment is identical after a
# kernel restart (the unseeded np.random.choice produced a different dataset
# on every run, while the later down-sampling step pins random_state=42).
mass_rng = np.random.default_rng(42)
background_data = []
for file_path, tree_name in background_files:
    with uproot.open(file_path) as file:
        tree = file[tree_name]
        df = tree.arrays(library="pd")
        df["mass"] = mass_rng.choice(mass_points, size=len(df))  # Random mass assignment
        df["label"] = 0
        background_data.append(df)

# One frame for all background samples, with a fresh 0..N-1 index.
df_background = pd.concat(background_data, ignore_index=True)


In [7]:
# Down-sample the background: keep a random subset of the rows.
# background_fraction is the share of rows kept (0.2 -> 20%); the fixed
# random_state makes the selected subset repeatable.
background_fraction = 0.2
df_background = df_background.sample(
    frac=background_fraction,
    random_state=42,
)


In [8]:
# Merge both classes into one frame; the index is rebuilt from 0.
df_combined = pd.concat((df_signal, df_background), axis=0, ignore_index=True)

In [9]:
# Impute missing entries with the per-column mean of the combined sample.
# (Alternative considered: impute a constant instead, i.e. fillna(0).)
df_combined = df_combined.fillna(value=df_combined.mean(axis=0))


In [10]:
# Every column except the target "label" is used as an input feature
# (this includes the "mass" column added while loading).
features = df_combined.columns[df_combined.columns != "label"].tolist()
X = df_combined[features].to_numpy()
y = df_combined["label"].to_numpy()


In [None]:
# Scale each feature column to zero mean and unit variance.
# The scaler is fitted on the full sample that is also used for training.
scaler = StandardScaler()
X = scaler.fit(X).transform(X)

# Wrap features and labels as float32 tensors.
X_tensor, y_tensor = (torch.tensor(array, dtype=torch.float32) for array in (X, y))

# Serve the data as shuffled mini-batches of 64 events.
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, shuffle=True, batch_size=64)



In [None]:
class ParameterizedDNN(nn.Module):
    """Fully connected binary classifier: input_dim -> 256 -> 128 -> 1.

    The hidden layers use ReLU activations. ``forward`` applies a sigmoid to
    the final linear layer, so the module returns values in (0, 1) with shape
    ``(batch, 1)`` rather than raw logits.
    """

    def __init__(self, input_dim):
        super().__init__()
        layers = []
        in_features = input_dim
        # Hidden stack: Linear + ReLU for each width, created in order.
        for width in (256, 128):
            layers += [nn.Linear(in_features, width), nn.ReLU()]
            in_features = width
        # Output layer produces one logit per event; the sigmoid is in forward().
        layers.append(nn.Linear(in_features, 1))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        logits = self.model(x)
        return torch.sigmoid(logits)


In [None]:
# Initialize the model, the loss and the optimizer.
input_dim = X.shape[1]  # number of input features (columns of X)
model = ParameterizedDNN(input_dim)
# ParameterizedDNN.forward already applies a sigmoid, so the loss has to take
# probabilities. BCEWithLogitsLoss applies its own sigmoid internally, which
# would squash the outputs a second time and distort the loss and gradients;
# BCELoss is the criterion that matches a sigmoid-terminated model.
criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr=0.001)


In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Train for num_epochs passes over the dataloader and record per-epoch metrics.
# The metrics are computed from the predictions made during the training pass
# itself (same events the model is being fitted on), not from a held-out set.
num_epochs = 10
train_losses = []       # mean batch loss per epoch
train_accuracies = []   # accuracy at a 0.5 threshold per epoch
train_aucs = []         # ROC AUC per epoch
fpr_all, tpr_all, thresholds_all = [], [], []  # ROC curve points per epoch

for epoch in range(num_epochs):
    epoch_loss = 0.0
    true_batches = []
    pred_batches = []

    for X_batch, y_batch in dataloader:
        optimizer.zero_grad()
        # squeeze(-1) removes only the trailing unit dimension: (B, 1) -> (B,).
        # A bare squeeze() would also collapse a final batch of size 1 into a
        # 0-d tensor, which no longer matches y_batch's shape and cannot be
        # iterated when collecting the predictions.
        outputs = model(X_batch).squeeze(-1)

        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        # Keep labels and model outputs of this batch for the epoch metrics.
        true_batches.append(y_batch.cpu().numpy())
        pred_batches.append(outputs.detach().cpu().numpy())

    # Compute metrics over all events seen in this epoch.
    y_true = np.concatenate(true_batches)
    y_pred = np.concatenate(pred_batches)
    avg_loss = epoch_loss / len(dataloader)
    # The model's forward() ends in a sigmoid, so 0.5 is the decision boundary.
    y_pred_binary = (y_pred > 0.5).astype(int)
    accuracy = accuracy_score(y_true, y_pred_binary)
    # Named auc_value (not auc) so it cannot shadow sklearn.metrics.auc,
    # which a later cell of this notebook imports under that name.
    auc_value = roc_auc_score(y_true, y_pred)

    # Store metrics
    train_losses.append(avg_loss)
    train_accuracies.append(accuracy)
    train_aucs.append(auc_value)

    # ROC curve of the current epoch (kept for plotting)
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    fpr_all.append(fpr)
    tpr_all.append(tpr)
    thresholds_all.append(thresholds)

    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}, AUC: {auc_value:.4f}")




In [None]:
# Plot the training loss per epoch on its own figure.
# The original placed this in slot 1 of a 1x3 subplot grid and then called
# plt.show(); the remaining slots were never filled in this figure, so two
# thirds of it stayed empty. A single-axes figure avoids that.
fig, ax = plt.subplots(figsize=(6, 4))
# x-axis derived from the recorded values so x and y always have equal length.
ax.plot(range(1, len(train_losses) + 1), train_losses, marker='o', linestyle='-', color='blue')
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Loss vs. Epochs")

fig.tight_layout()
plt.show()

In [None]:
# Plot the training accuracy per epoch on its own figure.
# The original drew into slot 2 of a 1x3 subplot grid without creating a
# figure for it; the other slots were never filled, so the plot occupied only
# a narrow strip of an otherwise empty default-sized figure.
fig, ax = plt.subplots(figsize=(6, 4))
# x-axis derived from the recorded values so x and y always have equal length.
ax.plot(range(1, len(train_accuracies) + 1), train_accuracies, marker='o', linestyle='-', color='green')
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy vs. Epochs")

fig.tight_layout()
plt.show()


In [None]:
# Plot the training AUC per epoch on its own figure.
# The original drew into slot 3 of a 1x3 subplot grid without creating a
# figure for it; the other slots were never filled, so the plot occupied only
# a narrow strip of an otherwise empty default-sized figure.
fig, ax = plt.subplots(figsize=(6, 4))
# x-axis derived from the recorded values so x and y always have equal length.
ax.plot(range(1, len(train_aucs) + 1), train_aucs, marker='o', linestyle='-', color='red')
ax.set_xlabel("Epoch")
ax.set_ylabel("AUC")
ax.set_title("AUC vs. Epochs")

fig.tight_layout()
plt.show()

In [None]:
# AUC per epoch, drawn on a dedicated 10x6 figure with grid and legend.
fig, ax = plt.subplots(figsize=(10, 6))
epoch_axis = range(1, num_epochs+1)
ax.plot(epoch_axis, train_aucs, label="AUC", color='blue', marker='o')
ax.set_xlabel('Epoch')
ax.set_ylabel('AUC')
ax.set_title('AUC Score over Epochs')
ax.grid(True)
ax.legend()
plt.show()

In [None]:


# ROC curve of the last recorded epoch, with the chance diagonal for reference.
fpr_last, tpr_last = fpr_all[-1], tpr_all[-1]

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr_last, tpr_last, color='darkorange', lw=2, label=f'ROC curve (AUC = {train_aucs[-1]:.2f})')
# Diagonal = performance of a random classifier.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title(f'Final ROC Curve (AUC = {train_aucs[-1]:.2f})')
ax.legend(loc="lower right")
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Run num_epochs further training passes and record per-epoch metrics plus the
# per-class scores of the last epoch. The model and optimizer are not
# re-created in this cell, so training continues from their current state.
num_epochs = 10
train_losses = []
train_accuracies = []
train_aucs = []
fpr_all, tpr_all, thresholds_all = [], [], []

# Labels and scores of the most recent epoch, split by class
# (1 = signal, 0 = background).
signal_true = []
signal_pred = []
background_true = []
background_pred = []

for epoch in range(num_epochs):
    epoch_loss = 0.0
    true_batches = []
    pred_batches = []

    for X_batch, y_batch in dataloader:
        optimizer.zero_grad()

        # The model's forward() already ends in a sigmoid. Applying
        # torch.sigmoid again would squash every score into (0.5, 0.73), so
        # the 0.5 threshold below would label every event as signal.
        # squeeze(-1) drops only the trailing unit dimension, so a final
        # batch of size 1 keeps shape (1,) and still matches y_batch.
        outputs = model(X_batch).squeeze(-1)

        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        # Keep labels and model outputs of this batch for the epoch metrics.
        true_batches.append(y_batch.cpu().numpy())
        pred_batches.append(outputs.detach().cpu().numpy())

    # Compute metrics over all events seen in this epoch.
    y_true = np.concatenate(true_batches)
    y_pred = np.concatenate(pred_batches)
    avg_loss = epoch_loss / len(dataloader)
    y_pred_binary = (y_pred > 0.5).astype(int)  # Convert to 0/1 labels
    accuracy = accuracy_score(y_true, y_pred_binary)
    auc_value = roc_auc_score(y_true, y_pred)

    # Store metrics
    train_losses.append(avg_loss)
    train_accuracies.append(accuracy)
    train_aucs.append(auc_value)

    # Split the current epoch by class. The containers are overwritten every
    # epoch, so after the loop they describe the final epoch only; appending
    # across epochs would mix scores from early, less trained epochs into the
    # ROC curve and score histogram drawn afterwards. They stay plain lists
    # because the ROC plot joins them with `+` (list concatenation).
    is_signal = y_true == 1
    signal_true = y_true[is_signal].tolist()
    signal_pred = y_pred[is_signal].tolist()
    background_true = y_true[~is_signal].tolist()
    background_pred = y_pred[~is_signal].tolist()

    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}, AUC: {auc_value:.4f}")

# AUC per epoch, drawn on a dedicated 10x6 figure with grid and legend.
fig, ax = plt.subplots(figsize=(10, 6))
epoch_axis = range(1, num_epochs+1)
ax.plot(epoch_axis, train_aucs, label="AUC", color='blue', marker='o')
ax.set_xlabel('Epoch')
ax.set_ylabel('AUC')
ax.set_title('AUC Score over Epochs')
ax.grid(True)
ax.legend()
plt.show()

# ROC curve built from the collected per-class labels and scores:
# signal entries first, background entries appended after them.
fpr, tpr, thresholds = roc_curve(
    signal_true + background_true,
    signal_pred + background_pred,
)
roc_auc = auc(fpr, tpr)  # area under the curve computed from its points

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Diagonal = performance of a random classifier.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title(f'ROC Curve (AUC = {roc_auc:.2f})')
ax.legend(loc="lower right")
plt.show()

# Overlay the model-score distributions of the two classes (raw counts,
# 50 bins each, semi-transparent so the overlap stays visible).
fig, ax = plt.subplots(figsize=(10, 6))

for scores, class_name, colour in (
    (signal_pred, 'Signal', 'green'),
    (background_pred, 'Background', 'red'),
):
    ax.hist(scores, bins=50, alpha=0.7, label=class_name, color=colour)

ax.set_xlabel('Predicted Probability')
ax.set_ylabel('Frequency')
ax.set_title('Signal vs Background Prediction Distribution')
ax.legend()
ax.grid(True)
plt.show()
