In [1]:
import os
import pandas as pd
import uproot
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_curve, auc, accuracy_score
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
import seaborn as sns


In [2]:

# File paths
# Each entry is a (ROOT file, tree path) pair; every sample is read from the same file.
# Signal points X550_Y80, X550_Y90, X550_Y95 and X550_Y100 are currently disabled.
signal_files_lowX_lowY = [
    ("../../outputfiles/hhbbgg_analyzerNMSSM-trees.root", f"/NMSSM_X{x_point}_Y{y_point}/preselection")
    for x_point in (300, 400)
    for y_point in (60, 70, 80, 90)
]

background_files = [
    ("../../outputfiles/hhbbgg_analyzerNMSSM-trees.root", f"/{sample}/preselection")
    for sample in ("GGJets", "GJetPt20To40", "GJetPt40")
]

In [3]:
# Columns to be loaded
# These names are passed to `.arrays(...)` for every tree; the order below is kept as-is.
keys = """
    dibjet_pt diphoton_pt bbgg_pt bbgg_eta bbgg_phi
    lead_pho_phi sublead_pho_eta sublead_pho_phi
    diphoton_eta diphoton_phi dibjet_eta dibjet_phi
    lead_bjet_pt sublead_bjet_pt
    lead_bjet_eta lead_bjet_phi sublead_bjet_eta sublead_bjet_phi
    sublead_bjet_PNetB lead_bjet_PNetB
    CosThetaStar_gg CosThetaStar_jj CosThetaStar_CS
    DeltaR_jg_min pholead_PtOverM phosublead_PtOverM
    weight_preselection
""".split()
# Variables removed: 'bbgg_mass', 'FirstJet_PtOverM', 'SecondJet_PtOverM',
# 'diphoton_bbgg_mass', 'dibjet_bbgg_mass', 'lead_pho_eta'


In [4]:
# Load DataFrames
# Maps each tree path to the DataFrame read from it.
dfs = {}

# Signal and background trees are read the same way, so one loop covers both lists
# (signal entries first, then background, as before).
for file, key in signal_files_lowX_lowY + background_files:
    try:
        with uproot.open(file) as f:
            dfs[key] = f[key].arrays(keys, library="pd")
    except Exception as e:
        # Best effort: report the tree that failed and continue with the others.
        print(f"Error loading {file} with key {key}: {e}")

# Pick the frames by the sample name embedded in the tree path.
signal_frames = [frame for key, frame in dfs.items() if 'NMSSM' in key]
background_frames = [frame for key, frame in dfs.items() if 'GJet' in key or 'GGJets' in key]

# pd.concat on an empty list fails with "No objects to concatenate", which hides the
# real cause (every read above failed). Stop here with an explicit message instead.
if not signal_frames or not background_frames:
    raise RuntimeError(
        f"No trees could be loaded for: "
        f"{'signal ' if not signal_frames else ''}{'background' if not background_frames else ''}".rstrip()
        + ". See the load errors printed above."
    )

# Combine signal DataFrames
signal_df = pd.concat(signal_frames, ignore_index=True)
background_df = pd.concat(background_frames, ignore_index=True)

# Print combined sample sizes
print(f'Total Signal Shape: {signal_df.shape}')
print(f'Total Background Shape: {background_df.shape}')


Total Signal Shape: (170942, 27)
Total Background Shape: (98635, 27)


In [5]:
# Check if 'weight_preselection' exists in all DataFrames
# Raise rather than print + exit(): exit() ends the whole session instead of just
# failing this cell, and the original message did not say which frame was at fault.
frames_missing_weight = [
    frame_name
    for frame_name, frame in (('signal_df', signal_df), ('background_df', background_df))
    if 'weight_preselection' not in frame.columns
]
if frames_missing_weight:
    raise KeyError(
        f"'weight_preselection' column missing in: {', '.join(frames_missing_weight)}"
    )

# Assign labels
signal_df['label'] = 1
background_df['label'] = 0

In [6]:
# Define features and labels
# Columns fed to the classifier; this list does not include 'label' or 'weight_preselection'.
features = [
    # bbgg system
    'bbgg_eta', 'bbgg_phi',
    # photons
    'lead_pho_phi', 'sublead_pho_eta', 'sublead_pho_phi',
    # diphoton and dibjet systems
    'diphoton_eta', 'diphoton_phi',
    'dibjet_eta', 'dibjet_phi',
    # b-jets
    'lead_bjet_pt', 'sublead_bjet_pt',
    'lead_bjet_eta', 'lead_bjet_phi',
    'sublead_bjet_eta', 'sublead_bjet_phi',
    'sublead_bjet_PNetB', 'lead_bjet_PNetB',
    # angular / derived quantities
    'CosThetaStar_gg', 'CosThetaStar_jj', 'CosThetaStar_CS',
    'DeltaR_jg_min',
    'pholead_PtOverM', 'phosublead_PtOverM',
]
# variables removed: 'bbgg_mass', 'lead_pho_eta', 'FirstJet_PtOverM', 'SecondJet_PtOverM',
# 'diphoton_bbgg_mass', 'dibjet_bbgg_mass'

# Combine signal and background data
combined_df = pd.concat((signal_df, background_df), ignore_index=True)
print(f'Combined DataFrame Shape: {combined_df.shape}')

Combined DataFrame Shape: (269577, 28)


In [7]:

# Feature matrix, labels and the weight column taken from the combined frame
X = combined_df[features]
y = combined_df['label']
weights = combined_df['weight_preselection']

# Split into train and test sets. The weights go through the same call so that they
# stay row-aligned with X and y (same rows as indexing combined_df by X_train.index).
X_train, X_test, y_train, y_test, X_train_weights, X_test_weights = train_test_split(
    X, y, weights, test_size=0.2, random_state=42, stratify=y
)

# Impute missing values with statistics learned from the training split only.
# The earlier fit on the full dataset was removed: its outputs were never used, both
# transformers were refit afterwards, and fitting before the split exposes test-set
# statistics to anything that would have consumed those outputs.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Created here; fitted on the imputed training data in the following cell.
scaler = StandardScaler()

In [8]:
# Standardise: statistics come from the training split and are reused for the test split.
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)


def _float_tensor(array_like):
    """Copy array-like data into a float32 torch tensor."""
    return torch.tensor(array_like, dtype=torch.float32)


# Convert data to torch tensors (features, labels, weights for each split)
X_train_tensor = _float_tensor(X_train_scaled)
y_train_tensor = _float_tensor(y_train.values)
X_train_weights_tensor = _float_tensor(X_train_weights.values)

X_test_tensor = _float_tensor(X_test_scaled)
y_test_tensor = _float_tensor(y_test.values)
X_test_weights_tensor = _float_tensor(X_test_weights.values)

# Create TensorDataset and DataLoader
# Each dataset item is a (features, label, weight) triple.
train_data = TensorDataset(X_train_tensor, y_train_tensor, X_train_weights_tensor)
test_data = TensorDataset(X_test_tensor, y_test_tensor, X_test_weights_tensor)

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(train_data, shuffle=True, batch_size=32)
test_loader = DataLoader(test_data, shuffle=False, batch_size=32)



In [10]:
import random
import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adam
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import pandas as pd

# Define the neural network model
class SimpleDNN(nn.Module):
    """Fully connected binary classifier with equal-width hidden layers.

    Every hidden layer is ``Linear -> ReLU -> Dropout``; a final ``Linear`` maps to a
    single unit and a sigmoid turns that into a value in (0, 1).

    Args:
        input_dim: Number of input features.
        n_hidden_layers: Number of hidden layers (must be at least 1).
        total_neurons: Neuron budget shared by the hidden layers; each layer gets
            ``total_neurons // n_hidden_layers`` units.
        dropout_rate: Dropout probability applied after every hidden activation.
            Defaults to 0.3, the value that used to be hard-coded.

    Raises:
        ValueError: If ``n_hidden_layers`` is below 1, or the per-layer width
            works out to less than one unit.
    """

    def __init__(self, input_dim, n_hidden_layers, total_neurons, dropout_rate=0.3):
        super().__init__()
        # Guard the integer division below, which previously failed with a bare
        # ZeroDivisionError for n_hidden_layers == 0.
        if n_hidden_layers < 1:
            raise ValueError(f"n_hidden_layers must be >= 1, got {n_hidden_layers}")
        hidden_width = total_neurons // n_hidden_layers
        # A zero-width layer would be built without complaint but carries no signal.
        if hidden_width < 1:
            raise ValueError(
                f"total_neurons={total_neurons} is too small for "
                f"n_hidden_layers={n_hidden_layers}"
            )

        layers = []
        in_features = input_dim
        for _ in range(n_hidden_layers):
            layers.append(nn.Linear(in_features, hidden_width))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout_rate))  # Dropout for regularization
            in_features = hidden_width
        layers.append(nn.Linear(in_features, 1))

        self.net = nn.Sequential(*layers)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid of the network output for input batch ``x``."""
        return self.sigmoid(self.net(x))


# Training and evaluation function
def train_and_evaluate_model(learning_rate, n_hidden_layers, total_neurons, epochs=20):
    """Train a fresh SimpleDNN and return its accuracy on the test loader.

    Reads the module-level ``X_train_tensor``, ``train_loader`` and ``test_loader``.

    Args:
        learning_rate: Learning rate passed to Adam.
        n_hidden_layers: Number of hidden layers of the model.
        total_neurons: Neuron budget shared by the hidden layers.
        epochs: Number of passes over ``train_loader`` (default 20, the value that
            used to be hard-coded).

    Returns:
        Fraction of test samples whose thresholded output (> 0.5) equals the label.
    """
    input_dim = X_train_tensor.shape[1]

    # Initialize the model
    model = SimpleDNN(input_dim, n_hidden_layers, total_neurons)
    criterion = nn.BCELoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    # Train the model
    for _ in range(epochs):
        model.train()
        # NOTE(review): the loader also yields the sample weights, but the loss is
        # unweighted. Confirm whether weighted training was intended.
        for X_batch, y_batch, _weight_batch in train_loader:
            optimizer.zero_grad()
            # squeeze(-1) removes only the output dimension. A bare squeeze() would also
            # drop the batch dimension of a one-sample batch, and BCELoss requires the
            # prediction and label tensors to have the same shape.
            outputs = model(X_batch).squeeze(-1)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

    # Evaluate on the test set
    model.eval()
    test_outputs = []
    test_labels = []
    with torch.no_grad():
        for X_batch, y_batch, _weight_batch in test_loader:
            # Keep every chunk 1-D so np.concatenate also accepts a one-sample batch.
            test_outputs.append(model(X_batch).squeeze(-1).numpy())
            test_labels.append(y_batch.numpy())

    # Combine lists into numpy arrays
    test_outputs = np.concatenate(test_outputs)
    test_labels = np.concatenate(test_labels)

    # Calculate binary accuracy
    preds = (test_outputs > 0.5).astype(np.float32)
    accuracy = np.mean(preds == test_labels)

    return accuracy


# Random search setup
def random_search_with_logging(num_trials=20, seed=None):
    """Randomly sample hyperparameters, train one model per sample and log accuracy.

    Args:
        num_trials: Number of configurations to try.
        seed: Optional seed for the hyperparameter sampling. When given, a private
            generator is used so the global ``random`` state is left untouched.
            It does not seed model initialisation or batch shuffling.

    Returns:
        List of ``(learning_rate, n_hidden_layers, total_neurons, accuracy)`` tuples,
        one per trial, in trial order. Empty when ``num_trials`` is 0.
    """
    learning_rates = np.logspace(-4, -1, num=20)  # log-scaled learning rates between 1e-4 and 1e-1
    n_hidden_layers_list = [2, 3, 4]  # Number of hidden layers
    total_neurons_list = [64, 128, 256, 512]  # Total neurons in the hidden layers

    sampler = random.Random(seed) if seed is not None else random

    # Start below any reachable accuracy so that the first completed trial always
    # becomes the current best (an accuracy of exactly 0.0 included).
    best_accuracy = float('-inf')
    best_params = None
    results = []

    for trial in range(num_trials):
        # Randomly sample hyperparameters
        learning_rate = sampler.choice(learning_rates)
        n_hidden_layers = sampler.choice(n_hidden_layers_list)
        total_neurons = sampler.choice(total_neurons_list)

        # Train and evaluate the model
        accuracy = train_and_evaluate_model(learning_rate, n_hidden_layers, total_neurons)
        results.append((learning_rate, n_hidden_layers, total_neurons, accuracy))

        # Log the best accuracy
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = (learning_rate, n_hidden_layers, total_neurons)

        print(f'Trial {trial+1}/{num_trials}: LR = {learning_rate:.4e}, '
              f'Hidden Layers = {n_hidden_layers}, Total Neurons = {total_neurons}, '
              f'Accuracy = {accuracy:.4f}')

    # Without a best configuration the summary below would index into None.
    if best_params is None:
        print('No best configuration to report (no completed trials).')
        return results

    print(f'Best Accuracy: {best_accuracy:.4f}')
    print(f'Best Hyperparameters: LR = {best_params[0]:.4e}, '
          f'Hidden Layers = {best_params[1]}, Total Neurons = {best_params[2]}')

    return results


# Plotting function
def plot_search_results(results):
    """Draw accuracy against each searched hyperparameter in a 1x3 figure.

    Args:
        results: Rows of (learning_rate, n_hidden_layers, total_neurons, accuracy).
    """
    column_names = ["learning_rate", "n_hidden_layers", "total_neurons", "accuracy"]
    trials = pd.DataFrame(results, columns=column_names)

    _, (lr_ax, layers_ax, neurons_ax) = plt.subplots(1, 3, figsize=(10, 6))

    # Learning rate: individual trials on a logarithmic x axis
    lr_ax.plot(trials['learning_rate'], trials['accuracy'], 'bo')
    lr_ax.set_xscale('log')
    lr_ax.set_xlabel('Learning Rate')
    lr_ax.set_ylabel('Accuracy')
    lr_ax.set_title('Learning Rate vs Accuracy')

    # Number of hidden layers: accuracy distribution per value
    sns.boxplot(x='n_hidden_layers', y='accuracy', data=trials, ax=layers_ax)
    layers_ax.set_xlabel('Number of Hidden Layers')
    layers_ax.set_ylabel('Accuracy')
    layers_ax.set_title('Hidden Layers vs Accuracy')

    # Total neurons: accuracy distribution per value
    sns.boxplot(x='total_neurons', y='accuracy', data=trials, ax=neurons_ax)
    neurons_ax.set_xlabel('Total Neurons')
    neurons_ax.set_ylabel('Accuracy')
    neurons_ax.set_title('Total Neurons vs Accuracy')

    plt.tight_layout()
    plt.show()


# Run the random search
# 100 sampled configurations; every trial builds and trains a new model, so this
# cell takes a long time to finish. `results` holds one
# (learning_rate, n_hidden_layers, total_neurons, accuracy) tuple per trial.
results = random_search_with_logging(num_trials=100)

# Plot the results
# Accuracy against learning rate, number of hidden layers and total neurons.
plot_search_results(results)


KeyboardInterrupt: 

## Moving to GPU

In [None]:
import random
import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adam
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import pandas as pd

# Check for GPU
# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Define the neural network model
class SimpleDNN(nn.Module):
    """Feed-forward binary classifier whose hidden layers all share one width.

    The stack is ``n_hidden_layers`` repetitions of ``Linear -> ReLU -> Dropout``
    followed by a single-unit ``Linear``; ``forward`` applies a sigmoid on top.

    Args:
        input_dim: Number of input features.
        n_hidden_layers: Number of hidden layers (must be at least 1).
        total_neurons: Neuron budget split evenly (integer division) across the
            hidden layers.
        dropout_rate: Dropout probability after each hidden activation. Defaults
            to 0.3, the previously hard-coded value.

    Raises:
        ValueError: If ``n_hidden_layers`` is below 1, or the resulting hidden
            width is below 1.
    """

    def __init__(self, input_dim, n_hidden_layers, total_neurons, dropout_rate=0.3):
        super().__init__()
        # Validate before dividing: n_hidden_layers == 0 used to surface as a bare
        # ZeroDivisionError.
        if n_hidden_layers < 1:
            raise ValueError(f"n_hidden_layers must be >= 1, got {n_hidden_layers}")
        hidden_width = total_neurons // n_hidden_layers
        # Reject zero-width hidden layers, which would otherwise be built silently.
        if hidden_width < 1:
            raise ValueError(
                f"total_neurons={total_neurons} is too small for "
                f"n_hidden_layers={n_hidden_layers}"
            )

        layer_sizes = [input_dim] + [hidden_width] * n_hidden_layers + [1]
        layers = []
        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            # Every Linear except the output one is followed by ReLU and Dropout.
            if fan_out != 1 or len(layers) < 3 * n_hidden_layers:
                layers.append(nn.ReLU())
                layers.append(nn.Dropout(dropout_rate))  # Dropout for regularization
        self.net = nn.Sequential(*layers)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid of the network output for input batch ``x``."""
        return self.sigmoid(self.net(x))

# Training and evaluation function
def train_and_evaluate_model(learning_rate, n_hidden_layers, total_neurons, epochs=20):
    """Train a fresh SimpleDNN on ``device`` and return its test accuracy.

    Reads the module-level ``X_train_tensor``, ``train_loader``, ``test_loader``
    and ``device``.

    Args:
        learning_rate: Learning rate passed to Adam.
        n_hidden_layers: Number of hidden layers of the model.
        total_neurons: Neuron budget shared by the hidden layers.
        epochs: Number of passes over ``train_loader`` (default 20, the value that
            used to be hard-coded).

    Returns:
        Fraction of test samples whose thresholded output (> 0.5) equals the label.
    """
    input_dim = X_train_tensor.shape[1]

    # Initialize the model and move it to GPU
    model = SimpleDNN(input_dim, n_hidden_layers, total_neurons).to(device)
    criterion = nn.BCELoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    # Train the model
    for _ in range(epochs):
        model.train()
        # NOTE(review): the loader also yields the sample weights, but the loss is
        # unweighted. Confirm whether weighted training was intended. Since they are
        # unused, they are no longer copied to the device.
        for X_batch, y_batch, _weight_batch in train_loader:
            # Move data to GPU
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            # squeeze(-1) removes only the output dimension. A bare squeeze() would also
            # drop the batch dimension of a one-sample batch, and BCELoss requires the
            # prediction and label tensors to have the same shape.
            outputs = model(X_batch).squeeze(-1)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

    # Evaluate on the test set
    model.eval()
    test_outputs = []
    test_labels = []
    with torch.no_grad():
        for X_batch, y_batch, _weight_batch in test_loader:
            outputs = model(X_batch.to(device)).squeeze(-1)
            test_outputs.append(outputs.cpu().numpy())  # Move back to CPU for numpy conversion
            # Labels are only compared on the host, so they skip the device round trip.
            test_labels.append(y_batch.numpy())

    # Combine lists into numpy arrays
    test_outputs = np.concatenate(test_outputs)
    test_labels = np.concatenate(test_labels)

    # Calculate binary accuracy
    preds = (test_outputs > 0.5).astype(np.float32)
    accuracy = np.mean(preds == test_labels)

    return accuracy

# Random search setup
def random_search_with_logging(num_trials=20, seed=None):
    """Run a random hyperparameter search and print a line per trial.

    Args:
        num_trials: Number of configurations to try.
        seed: Optional seed for the hyperparameter sampling only. When given, a
            private generator is used and the global ``random`` state is not
            touched. Model initialisation and batch shuffling are not seeded here.

    Returns:
        List of ``(learning_rate, n_hidden_layers, total_neurons, accuracy)`` tuples
        in trial order. Empty when ``num_trials`` is 0.
    """
    learning_rates = np.logspace(-4, -1, num=20)  # log-scaled learning rates between 1e-4 and 1e-1
    n_hidden_layers_list = [2, 3, 4]  # Number of hidden layers
    total_neurons_list = [64, 128, 256, 512]  # Total neurons in the hidden layers

    sampler = random if seed is None else random.Random(seed)

    # -inf rather than 0.0: the first completed trial must always register as best,
    # otherwise best_params can stay None and the summary below fails.
    best_accuracy = float('-inf')
    best_params = None
    results = []

    for trial in range(num_trials):
        # Randomly sample hyperparameters
        learning_rate = sampler.choice(learning_rates)
        n_hidden_layers = sampler.choice(n_hidden_layers_list)
        total_neurons = sampler.choice(total_neurons_list)

        # Train and evaluate the model
        accuracy = train_and_evaluate_model(learning_rate, n_hidden_layers, total_neurons)
        results.append((learning_rate, n_hidden_layers, total_neurons, accuracy))

        # Log the best accuracy
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = (learning_rate, n_hidden_layers, total_neurons)

        print(f'Trial {trial+1}/{num_trials}: LR = {learning_rate:.4e}, '
              f'Hidden Layers = {n_hidden_layers}, Total Neurons = {total_neurons}, '
              f'Accuracy = {accuracy:.4f}')

    # Nothing to summarise when no trial completed (e.g. num_trials == 0).
    if best_params is None:
        print('No best configuration to report (no completed trials).')
        return results

    print(f'Best Accuracy: {best_accuracy:.4f}')
    print(f'Best Hyperparameters: LR = {best_params[0]:.4e}, '
          f'Hidden Layers = {best_params[1]}, Total Neurons = {best_params[2]}')

    return results

# Plotting function
def plot_search_results(results):
    """Show accuracy versus learning rate, hidden-layer count and neuron budget.

    Args:
        results: Rows of (learning_rate, n_hidden_layers, total_neurons, accuracy).
    """
    table = pd.DataFrame(
        results,
        columns=["learning_rate", "n_hidden_layers", "total_neurons", "accuracy"],
    )

    plt.figure(figsize=(10, 6))

    # Panel 1: one marker per trial, learning rate on a log axis
    scatter_panel = plt.subplot(1, 3, 1)
    scatter_panel.plot(table['learning_rate'], table['accuracy'], 'bo')
    scatter_panel.set_xscale('log')
    scatter_panel.set_xlabel('Learning Rate')
    scatter_panel.set_ylabel('Accuracy')
    scatter_panel.set_title('Learning Rate vs Accuracy')

    # Panels 2 and 3: box plots of accuracy grouped by a discrete hyperparameter
    box_panels = (
        (2, 'n_hidden_layers', 'Number of Hidden Layers', 'Hidden Layers vs Accuracy'),
        (3, 'total_neurons', 'Total Neurons', 'Total Neurons vs Accuracy'),
    )
    for position, column, x_label, panel_title in box_panels:
        panel = plt.subplot(1, 3, position)
        sns.boxplot(x=column, y='accuracy', data=table, ax=panel)
        panel.set_xlabel(x_label)
        panel.set_ylabel('Accuracy')
        panel.set_title(panel_title)

    plt.tight_layout()
    plt.show()

# Run the random search
# 100 sampled configurations, each trained from scratch on `device`. `results` holds
# one (learning_rate, n_hidden_layers, total_neurons, accuracy) tuple per trial.
results = random_search_with_logging(num_trials=100)

# Plot the results
# Accuracy against learning rate, number of hidden layers and total neurons.
plot_search_results(results)


Trial 1/100: LR = 5.4556e-03, Hidden Layers = 4, Total Neurons = 512, Accuracy = 0.9061
Trial 2/100: LR = 2.6367e-03, Hidden Layers = 4, Total Neurons = 64, Accuracy = 0.9116
Trial 3/100: LR = 5.4556e-03, Hidden Layers = 4, Total Neurons = 512, Accuracy = 0.9128
Trial 4/100: LR = 1.0000e-04, Hidden Layers = 4, Total Neurons = 64, Accuracy = 0.9143
Trial 5/100: LR = 4.2813e-04, Hidden Layers = 2, Total Neurons = 64, Accuracy = 0.9185
Trial 6/100: LR = 3.3598e-02, Hidden Layers = 2, Total Neurons = 256, Accuracy = 0.8685
Trial 7/100: LR = 2.9764e-04, Hidden Layers = 3, Total Neurons = 256, Accuracy = 0.9207
Trial 8/100: LR = 4.2813e-04, Hidden Layers = 2, Total Neurons = 64, Accuracy = 0.9183
Trial 9/100: LR = 2.0691e-04, Hidden Layers = 3, Total Neurons = 64, Accuracy = 0.9150
Trial 10/100: LR = 1.0000e-04, Hidden Layers = 2, Total Neurons = 64, Accuracy = 0.9173
Trial 11/100: LR = 7.8476e-03, Hidden Layers = 2, Total Neurons = 64, Accuracy = 0.9153
Trial 12/100: LR = 6.9519e-02, Hidden

In [11]:
import matplotlib.pyplot as plt
import pandas as pd

# Modify random_search to collect data for plotting
def random_search_with_logging(num_trials=100):
    """Random search over learning rate, dropout and four hidden-layer widths.

    Unlike the earlier definitions of this name, the return value is a dict of
    parallel lists rather than a list of tuples.

    NOTE(review): this function uses a module-level ``input_dim``, a ``train_model``
    helper and a ``SimpleDNN(input_dim, hidden_dims, dropout_rate)`` constructor.
    None of these is defined in the visible cells, and the visible ``SimpleDNN``
    takes ``(input_dim, n_hidden_layers, total_neurons)``. Presumably they came
    from a cell that is no longer in the notebook (TODO confirm and restore).

    Args:
        num_trials: Number of configurations to try.

    Returns:
        Dict with keys ``'lr'``, ``'dropout_rate'``, ``'n_hidden'``,
        ``'total_neurons'`` and ``'accuracy'``, each a list with one entry per trial.
    """
    results = {
        'lr': [],
        'dropout_rate': [],
        'n_hidden': [],
        'total_neurons': [],
        'accuracy': []
    }

    # -inf rather than 0.0 so the first completed trial always becomes the best;
    # otherwise best_params could remain None and the summary below would fail.
    best_accuracy = float('-inf')
    best_params = None

    for trial in range(num_trials):
        # Randomly sample hyperparameters
        lr = 10 ** random.uniform(-4, -2)  # Learning rate between 0.0001 and 0.01
        dropout_rate = random.uniform(0.1, 0.5)  # Dropout between 0.1 and 0.5
        hidden_dims = [random.randint(64, 256) for _ in range(4)]  # Layer sizes between 64 and 256

        # Initialize model, criterion, and optimizer
        model = SimpleDNN(input_dim, hidden_dims, dropout_rate)
        criterion = nn.BCELoss()
        optimizer = Adam(model.parameters(), lr=lr)

        # Train and evaluate the model
        accuracy = train_model(model, criterion, optimizer, train_loader, test_loader, epochs=50)

        print(f'Trial {trial + 1}: LR={lr:.5f}, Dropout={dropout_rate:.2f}, Hidden Dims={hidden_dims}, Validation Accuracy={accuracy:.4f}')

        # Log trial results
        results['lr'].append(lr)
        results['dropout_rate'].append(dropout_rate)
        results['n_hidden'].append(len(hidden_dims))  # Number of hidden layers (always 4 here)
        results['total_neurons'].append(sum(hidden_dims))  # Total number of neurons across layers
        results['accuracy'].append(accuracy)

        # Update the best parameters
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = (lr, dropout_rate, hidden_dims)

    # Nothing to summarise when no trial completed (e.g. num_trials == 0).
    if best_params is None:
        print('No best configuration to report (no completed trials).')
        return results

    # Print best hyperparameters
    print(f'Best Validation Accuracy: {best_accuracy:.4f}')
    print(f'Best Hyperparameters: LR={best_params[0]}, Dropout={best_params[1]}, Hidden Dims={best_params[2]}')

    return results

# Perform random search with logging
results = random_search_with_logging(num_trials=100)

# Convert results to DataFrame for easier analysis
results_df = pd.DataFrame(results)

# Plot mean test score vs learning rate, n_hidden, total_neurons
fig, ax = plt.subplots(1, 3, figsize=(18, 5))

# One scatter panel per hyperparameter: (column, x label, title, log-scaled x axis?)
panel_specs = [
    ('lr', 'Learning Rate', 'Mean Test Score vs Learning Rate', True),
    ('n_hidden', 'Number of Hidden Layers', 'Mean Test Score vs Number of Hidden Layers', False),
    ('total_neurons', 'Total Neurons in Hidden Layers', 'Mean Test Score vs Total Neurons', False),
]
for panel, (column, x_label, panel_title, log_x) in zip(ax, panel_specs):
    panel.scatter(results_df[column], results_df['accuracy'], alpha=0.5)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Mean Test Score (Accuracy)')
    if log_x:
        panel.set_xscale('log')
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()


KeyboardInterrupt: 