In [1]:
import math
import os

import matplotlib
from matplotlib import pyplot as plt
import torch
import numpy as np
import scipy
import sklearn
import sklearn.metrics
import uproot
import xgboost as xgb
# Render every figure in this notebook with a serif face, preferring Times New Roman.
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = ['Times New Roman']


  from pandas import MultiIndex, Int64Index


In [2]:
# Location of the input ROOT file.
# The default is the original author's local path; set the TTBAR_ROOT_FILE
# environment variable to point at the file on any other machine instead of
# editing the notebook (the hard-coded path raises FileNotFoundError elsewhere).
ROOT_FILE_PATH = os.environ.get(
    "TTBAR_ROOT_FILE",
    "/Users/vinayakkapoor/Downloads/ee_modified_root_1_allorentz_gen.root",
)
data = uproot.open(ROOT_FILE_PATH)

FileNotFoundError: file not found

    '/Users/vinayakkapoor/Downloads/ee_modified_root_1_allorentz_gen.root'

Files may be specified as:
   * str/bytes: relative or absolute filesystem path or URL, without any colons
         other than Windows drive letter or URL schema.
         Examples: "rel/file.root", "C:\abs\file.root", "http://where/what.root"
   * str/bytes: same with an object-within-ROOT path, separated by a colon.
         Example: "rel/file.root:tdirectory/ttree"
   * pathlib.Path: always interpreted as a filesystem path or URL only (no
         object-within-ROOT path), regardless of whether there are any colons.
         Examples: Path("rel:/file.root"), Path("/abs/path:stuff.root")

Functions that accept many files (uproot.iterate, etc.) also allow:
   * glob syntax in str/bytes and pathlib.Path.
         Examples: Path("rel/*.root"), "/abs/*.root:tdirectory/ttree"
   * dict: keys are filesystem paths, values are objects-within-ROOT paths.
         Example: {"/data_v1/*.root": "ttree_v1", "/data_v2/*.root": "ttree_v2"}
   * already-open TTree objects.
   * iterables of the above.


In [None]:
# Key of the object holding the event variables inside the opened ROOT file.
# NOTE(review): the key carries two ';'-separated suffixes (";21;1") - confirm
# it matches an entry of data.keys() for the file being analysed.
TREE_KEY = 'ttBar_treeVariables_step8;21;1'
tree = data[TREE_KEY]

In [None]:
# Read the class-label branch from the file once and derive both boolean
# event masks from the same in-memory array (previously it was read twice).
production_mode = tree['production_mode'].array(library='numpy')
ones_mask = production_mode == 1   # events labelled as class 1
zeros_mask = production_mode == 0  # events labelled as class 0

In [None]:
# Branches that must not be used as model inputs.
NON_FEATURE_KEYS = ('production_mode', 'eventWeight', '__index__')

# Read every feature branch from the file exactly once; both class matrices
# below are built from these in-memory arrays (previously each branch was
# read twice, once per class).
feature_keys = [key for key in tree.keys() if key not in NON_FEATURE_KEYS]
feature_columns = [tree[key].array(library='numpy') for key in feature_keys]


def _select_events(event_mask):
    """Return a float32 tensor with one row per selected event and one column per feature.

    Columns follow the order of ``feature_keys``; ``event_mask`` is a boolean
    array with one entry per event in the tree.
    """
    return torch.cat(
        [torch.from_numpy(column[event_mask][:, None]).float() for column in feature_columns],
        dim=1,
    )


X_ones = _select_events(ones_mask)
X_zeros = _select_events(zeros_mask)


In [None]:
train_split = 0.7  # 70% of the data is training
valid_split = 0.2  # 20% of the data is validation
test_split = 0.1  # 10% of the data is testing (whatever remains after the two cuts below)

num_ones = len(X_ones)  # total number of data points with class label 1
num_zeros = len(X_zeros)  # total number of data points with class label 0


def _split_boundaries(num_events):
    """Return the (train_end, valid_end) row indices for a class with ``num_events`` rows."""
    train_end = math.ceil(num_events * train_split)
    valid_end = math.ceil(num_events * (train_split + valid_split))
    return train_end, valid_end


def _labelled(ones_part, zeros_part):
    """Stack the two class slices (ones first) and build labels matching their actual lengths."""
    features = torch.cat((ones_part, zeros_part))
    labels = torch.cat((torch.ones(len(ones_part)), torch.zeros(len(zeros_part))))
    return features, labels


ones_train_end, ones_valid_end = _split_boundaries(num_ones)
zeros_train_end, zeros_valid_end = _split_boundaries(num_zeros)

# Each class is cut at the same fractional boundaries, in stored order.
# Labels are sized from the slices themselves: ceil(n * valid_split) is not
# always equal to ceil(n * 0.9) - ceil(n * 0.7), so computing the label count
# separately could leave data and labels with different lengths.
training_data, training_labels = _labelled(
    X_ones[:ones_train_end], X_zeros[:zeros_train_end]
)
validation_data, validation_labels = _labelled(
    X_ones[ones_train_end:ones_valid_end], X_zeros[zeros_train_end:zeros_valid_end]
)
test_data, test_labels = _labelled(
    X_ones[ones_valid_end:], X_zeros[zeros_valid_end:]
)

# Every data set must have exactly one label per row.
assert len(training_data) == len(training_labels)
assert len(validation_data) == len(validation_labels)
assert len(test_data) == len(test_labels)

In [None]:
# Count the classes from the labels that are actually in the training set.
# (Recomputing them as int(train_split * n) floors, while the split used
# math.ceil, so the two could disagree by one.)
num_ones_in_training = int((training_labels == 1).sum())
num_zeros_in_training = int((training_labels == 0).sum())

training_ones = training_data[training_labels == 1]
training_zeros = training_data[training_labels == 0]

# upsample the training data: tile the minority class and truncate it to the
# size of the majority class so both classes contribute the same number of rows
if num_ones_in_training > num_zeros_in_training:  # need to upsample the zeros
    multiplicity = math.ceil(num_ones_in_training / num_zeros_in_training)
    training_zeros = training_zeros.repeat(multiplicity, 1)[:num_ones_in_training]
elif num_zeros_in_training > num_ones_in_training:  # need to upsample the ones
    multiplicity = math.ceil(num_zeros_in_training / num_ones_in_training)
    training_ones = training_ones.repeat(multiplicity, 1)[:num_zeros_in_training]

# Rebuild data and labels together (ones first, then zeros) so that they always
# have the same length; previously the majority-class labels were truncated
# while the corresponding data rows were all kept.
training_data = torch.cat((training_ones, training_zeros))
training_labels = torch.cat((torch.ones(len(training_ones)), torch.zeros(len(training_zeros))))

In [None]:
# Class counts taken from the validation labels themselves.
num_ones_in_validation = int((validation_labels == 1).sum())
num_zeros_in_validation = int((validation_labels == 0).sum())

validation_ones = validation_data[validation_labels == 1]
validation_zeros = validation_data[validation_labels == 0]

# upsample the validation data: tile the minority class and truncate it to the
# size of the majority class so both classes contribute the same number of rows
if num_ones_in_validation > num_zeros_in_validation:  # need to upsample the zeros
    multiplicity = math.ceil(num_ones_in_validation / num_zeros_in_validation)
    validation_zeros = validation_zeros.repeat(multiplicity, 1)[:num_ones_in_validation]
elif num_zeros_in_validation > num_ones_in_validation:  # need to upsample the ones
    multiplicity = math.ceil(num_zeros_in_validation / num_ones_in_validation)
    validation_ones = validation_ones.repeat(multiplicity, 1)[:num_zeros_in_validation]

# Rebuild data and labels together (ones first, then zeros) so that they always
# have the same length; previously the majority-class labels were truncated
# while the corresponding data rows were all kept.
validation_data = torch.cat((validation_ones, validation_zeros))
validation_labels = torch.cat((torch.ones(len(validation_ones)), torch.zeros(len(validation_zeros))))

In [None]:
# standardize the data
# The per-feature mean and (unbiased) standard deviation are estimated on the
# training set only and then applied to all three sets, so that the model sees
# validation/test features on exactly the same scale it was trained on.
training_mean = torch.mean(training_data, 0)
training_std = torch.std(training_data, 0, True)


def _standardize(features):
    """Scale ``features`` with the training statistics; non-finite results become 0."""
    scaled = (features - training_mean) / training_std
    # replace nans/infs with 0 for zero width values (features whose training std is 0)
    return torch.nan_to_num(scaled, nan=0.0, posinf=0.0, neginf=0.0)


standardized_training_data = _standardize(training_data)
standardized_validation_data = _standardize(validation_data)
standardized_testing_data = _standardize(test_data)

In [None]:
# Display the standardized training feature tensor as the cell output.
standardized_training_data

In [None]:
# Display the standardized training feature tensor again.
# NOTE(review): this cell shows the same variable as the cell before it - confirm whether it is still needed.
standardized_training_data

In [None]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score,roc_curve
import matplotlib.pyplot as plt


# Hand-picked hyperparameters for the boosted-tree classifier.
xgb_hyperparameters = dict(
    objective="binary:logistic",
    random_state=0,
    max_depth=5,
    reg_alpha=10,
    n_estimators=300,
    learning_rate=0.01,
)

# Build the classifier and fit it on the standardized training set.
xgb_model = xgb.XGBClassifier(**xgb_hyperparameters)
xgb_model.fit(standardized_training_data, training_labels)

In [None]:

# Score the test set with the fitted classifier and keep column 1 of the
# predicted class probabilities.
test_class_probabilities = xgb_model.predict_proba(standardized_testing_data)
y_pred_proba = test_class_probabilities[:, 1]

# Area under the ROC curve of those scores against the test labels.
auc = roc_auc_score(y_true=test_labels, y_score=y_pred_proba)

In [None]:
# ROC curve of the test-set scores.
fpr, tpr, thresholds = roc_curve(test_labels, y_pred_proba)

fig, ax = plt.subplots()

# Curve in black on a light dashed grid.
ax.plot(fpr, tpr, 'black')
ax.grid(linestyle='--', linewidth='0.5', color='gray', alpha=0.7)

# Title and axis labels.
ax.set_title('ROC Curve', fontsize=14)
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)

# Render the figure.
plt.show()


In [None]:
# Display the ROC AUC value computed from the test-set predictions.
auc

In [30]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Hyperparameter values to be explored by the grid search
# (3 x 3 x 3 x 3 = 81 combinations).
param_grid = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.05, 0.01],
    # author's note: learning rate increase exponentially
    n_estimators=[100, 200, 300],
    # 'subsample': [0.8, 1],
    # 'colsample_bytree': [0.8, 1],
    reg_alpha=[0.1, 1, 10],  # reg_alpha is XGBoost's L1 (lasso-style) penalty
)

# Base estimator handed to the search; only the objective is set explicitly.
# This rebinds the name xgb_model to a new, default-parameter classifier.
xgb_model = xgb.XGBClassifier(objective="binary:logistic")

# 5-fold cross-validated search over param_grid, scored by ROC AUC, using all
# available cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)

# NOTE: this line fits the default-parameter classifier itself, NOT the grid
# search; grid_search is still unfitted after this cell and has to be fitted
# separately with grid_search.fit(...).
xgb_model.fit(standardized_training_data, training_labels)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [None]:
# scikit-learn slices its inputs into cross-validation folds with NumPy-style
# indexing, so hand it NumPy arrays rather than torch tensors.
# NOTE(review): the warnings recorded under this cell point at scikit-learn's
# array-indexing line and presumably come from indexing the tensors - confirm
# they disappear with NumPy inputs.
grid_search_features = standardized_training_data.detach().cpu().numpy()
grid_search_labels = training_labels.detach().cpu().numpy()

# Run the cross-validated hyperparameter search and report the best setting.
grid_search.fit(grid_search_features, grid_search_labels)
print("Best parameters:", grid_search.best_params_)
print("Best AUC-ROC score:", grid_search.best_score_)

  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  return array[key] if axis == 0 else array[:, key]
  return array[key] if axis == 0 else array[:, key]
  return array[key] if axis == 0 else array[:, key]
  return array[key] if axis == 0 else array[:, key]
  return array[key] if axis == 0 else array[:, key]
  return array[key] if axis == 0 else array[:, key]
  return array[key] if axis == 0 else array[:, key]
  return array[key] if axis == 0 else array[:, key]




In [None]:
# TODO: save intermediate results
# TODO: save the model and its predictions
# TODO: save the training loss
# TODO: repeat the grid search
