In [1]:
%load_ext autoreload
%autoreload 2

import time
import os
import math
import copy
import torch
from torch import nn, optim
import numpy as np
import awkward as ak
import uproot
import pandas as pd
import dask
import vector
import particle
import hepunits

import zuko
import torch
from torch import nn, optim
import lightning as L
from lightning.pytorch import loggers as pl_loggers

from torch.utils.data import DataLoader

from memflow.dataset.data import RootData,ParquetData
from dataset2 import AcceptanceDataset
from memflow.ttH.classifier.classifier_models import *

from ttH_dataclasses_acceptance import ttHHardDataset, ttHRecoDataset

from memflow.ttH.classifier.classifier_callbacks import *
from memflow.ttH.models.callbacks import ModelCheckpoint

from matplotlib import pyplot as plt

# Raise the number of simultaneously open figures allowed before matplotlib warns.
plt.rcParams.update({'figure.max_open_warning': 100})

# Register the vector library with awkward.
vector.register_awkward()

# Expose only device "3" to CUDA for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
print (f"Running on GPU : {torch.cuda.is_available()}")

# Pick the Lightning accelerator from CUDA availability.
if torch.cuda.is_available():
    accelerator = 'gpu'
else:
    accelerator = 'cpu'
print (f"Accelerator : {accelerator}")

# Allow reduced internal precision for float32 matrix multiplications.
torch.set_float32_matmul_precision('medium')

# On GPU, release cached allocator blocks and report the current memory state.
if accelerator == 'gpu':
    torch.cuda.empty_cache()
    print (torch.cuda.memory_summary(device=None, abbreviated=True))

In [2]:
# Input parquet files: one sample from the "hard" directory and the matching
# sample from the "reco" directory (same process, same year).
hard_files = [
    '/cephfs/dice/users/sa21722/datasets/MEM_data/ttH/TF_v6/hard/2018/ttH/ttH_HToInvisible_M125.parquet',
]
reco_files = [
    '/cephfs/dice/users/sa21722/datasets/MEM_data/ttH/TF_v6/reco/2018/ttH/ttH_HToInvisible_M125.parquet',
]

# NOTE(review): lazy=True presumably defers reading the file content — confirm in memflow.dataset.data.
data_hard = ParquetData(files = hard_files, lazy = True)
data_reco = ParquetData(files = reco_files, lazy = True)

print(data_hard)

In [3]:
# Keyword options that are identical for the hard and the reco dataset.
shared_dataset_options = dict(
    build = False,
    fit = True,
    coordinates = 'cylindrical',
    apply_preprocessing = True,
    apply_boost = False,
    dtype = torch.float32,
)

# Hard-level dataset.
# Particle types left out of the selection (commented out in the original cell):
# 'higgs', 'tops', 'Ws', 'Zs'.
hard_dataset = ttHHardDataset(
    data = data_hard,
    selection = ['bottoms', 'quarks', 'neutrinos'],
    **shared_dataset_options,
)
print(hard_dataset)

# Reco-level dataset: jets and MET.
reco_dataset = ttHRecoDataset(
    data = data_reco,
    selection = ['jets', 'met'],
    **shared_dataset_options,
)
print(reco_dataset)

In [4]:
# Combine the hard and reco datasets into the acceptance dataset.
dataset = AcceptanceDataset(hard_dataset = hard_dataset, reco_dataset = reco_dataset)

# Sequential split (no index shuffling): the first `train_frac` of the events
# are used for training, the remainder for validation.
train_frac = 0.8 # previously 0.7
sep = int(train_frac*len(dataset))
indices = torch.arange(len(dataset))
train_indices, valid_indices = indices[:sep], indices[sep:]

dataset_train = torch.utils.data.Subset(dataset, train_indices)
dataset_valid = torch.utils.data.Subset(dataset, valid_indices)

print (f'Dataset : training {len(dataset_train)} / validation {len(dataset_valid)}')

batch_size = 1024

# Training batches are reshuffled by the loader; validation uses larger
# batches in a fixed order.
loader_train = DataLoader(dataset_train, batch_size = batch_size, shuffle = True)
loader_valid = DataLoader(dataset_valid, batch_size = 10000, shuffle = False)
print (f'Batching {len(loader_train)} / Validation {len(loader_valid)}')

In [5]:
# Plot the hard-level dataset twice so the effect of the preprocessing can be compared.
# NOTE(review): raw=True presumably plots the values without preprocessing applied — confirm in the dataset class.
print("Before Preprocessing")
dataset.hard_dataset.plot(selection=True, raw=True,log=False)
# Typo in the printed label fixed ("prerocessing" -> "preprocessing").
print("After preprocessing")
dataset.hard_dataset.plot(selection=True, raw=False,log=False,fields_to_plot=['pt','eta','phi','mass','pdgid'])

# ttH Model

In [6]:
# We want a model that can classify between selected (1) and not selected (0).
#
# Option 1 (disabled): fully connected network
# backbone : DNN
# head : DNN

# backbone = BaseMLP(
#     dim_in = dataset.flatten_dim,
#     dim_out = None,
#     neurons = [64]*7, # changed from [64]*3
#     hidden_activation = nn.GELU,
#     batch_norm = True,
# )
# head = BaseMLP(
#     dim_in = 64, # changed from 64 to match the last layer of the backbone
#     dim_out = 1,
#     output_activation = nn.Sigmoid,
#     batch_norm = True,
# )

# Option 2 (active):
# backbone : transformer (+ mean pooling)
# head : DNN

backbone = BaseTransformerEncoder(
    n_particles_per_type = dataset.hard_dataset.number_particles_per_type,
    particle_type_names = dataset.hard_dataset.selection,
    input_features_per_type = dataset.hard_dataset.input_features,
    embed_dims = [64,256],
    activation = nn.GELU,
    num_layers = 6, # From 4
    nhead = 8, # From 4
    dim_feedforward = 512, # From 256
    layer_norm = True,
    dropout = 0.2, # From 0.1
)
head = BaseMLP(
    dim_in = 256, # presumably has to match the last entry of embed_dims above — confirm
    neurons = [128, 64, 64, 32], # changed from [128, 64, 32]
    dim_out = 1,
    output_activation = nn.Sigmoid,
    batch_norm = True,
)

# Combine the backbone and head into classifier
# Optional class weighting (disabled): makes mistakes on the minority class (selected) more expensive.
#weights = torch.tensor([2.0])
#   -> loss_function = nn.BCELoss(weight=weights, reduction='none')
model = Classifier(
    backbone = backbone,
    head = head,
    # `reduce` is a deprecated boolean flag of the torch loss classes: the truthy
    # string 'none' was silently resolved to reduction='mean' (with a deprecation
    # warning). The per-element loss that was intended is requested via `reduction`.
    # NOTE(review): assumes Classifier reduces the per-element loss itself — confirm in classifier_models.
    loss_function = nn.BCELoss(reduction='none'),
)
print (model)

# Smoke test: run one training batch through the model and the loss computation.
batch = next(iter(loader_train))

out = model(batch)
print (out.shape)

loss = model.shared_eval(batch,0,'test')
print(loss)

# Training

In [7]:
# Rebinning option to make plots nicer (optional): minimum number of selected
# events per bin for each feature, None where no threshold is set.
min_selected_events_per_bin = {
    'pt'   : 10,
    'eta'  : 10,
    'phi'  : None,
    'mass' : 10,
    'pdgid': None,
}

# Rename option to make plot labels nicer (optional).
label_names = {
    'pt'   : r'$p_T$',
    'eta'  : r'$\eta$',
    'phi'  : r'$\phi$',
    'mass' : r'$M$',
    'pdgid': r'PDG ID',
}

# Callback producing acceptance plots on the validation subset.
# An optional `N_batch` argument (not set here) cuts down the number of batches
# used, to make the plotting faster.
acceptance = AcceptanceCallback(
    # Boilerplate arguments #
    dataset = dataset_valid,
    selection = dataset.hard_dataset.selection,
    features_per_type = dataset.hard_dataset.input_features,
    preprocessing = dataset.hard_dataset.preprocessing,
    # Plotting arguments #
    frequency = 20,      # frequency of plotting as callback
    raw = True,          # undo preprocessing (see raw values)
    bins = 50,           # number of bins for histograms
    batch_size = 10000,  # number of events per batch for model evaluation
    min_selected_events_per_bin = min_selected_events_per_bin,
    label_names = label_names,
)

In [8]:
##### Parameters #####
epochs = 200
steps_per_epoch_train = math.ceil(len(dataset_train)/loader_train.batch_size)

print (f'Training   : Batch size = {loader_train.batch_size} => {steps_per_epoch_train} steps per epoch')

##### Optimizer #####
optimizer = optim.RAdam(model.parameters(), lr=1e-5, weight_decay=1e-5)
model.set_optimizer(optimizer)

##### Scheduler #####
# Halve the learning rate when the monitored quantity has not decreased for
# 10 scheduler steps, never going below 1e-7.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer = optimizer,
    mode = 'min',
    factor = 0.5,
    patience = 10,
    threshold = 0.,
    threshold_mode = 'rel',
    cooldown = 0,
    min_lr = 1e-7,
)

# OneCycleLR is stepped every batch; any other scheduler once per epoch.
if isinstance(scheduler,optim.lr_scheduler.OneCycleLR):
    scheduler_interval = 'step'
else:
    scheduler_interval = 'epoch'

scheduler_config = {
    'scheduler' : scheduler,
    'interval' : scheduler_interval,
    'frequency' : 1,
    'monitor' : 'val/loss_tot',
    'strict' : True,
    'name' : 'scheduler',
}
model.set_scheduler_config(scheduler_config)

##### Callbacks #####
lr_monitor = L.pytorch.callbacks.LearningRateMonitor(logging_interval='epoch')
model_summary = L.pytorch.callbacks.ModelSummary(max_depth=2)
checkpointing = ModelCheckpoint(
    save_every_n_epochs = 5,
    save_dir = "trained_model_checkpoints/acceptance_checkpoints",
)
callbacks = [lr_monitor, model_summary, acceptance, checkpointing]

##### Logger #####
logger = pl_loggers.CometLogger(
    save_dir = '../comet_logs',
    project_name = 'mem-flow-ttH',
    experiment_name = 'Acceptance',
    offline = False,
)
logger.log_graph(model)
# logger.log_hyperparams()
# logger.experiment.log_code(folder='../src/')
logger.experiment.log_notebook(filename='acceptance_process_DL.ipynb',overwrite=True)

##### Trainer #####
# log_every_n_steps equals the number of training steps per epoch.
trainer = L.Trainer(
    accelerator = accelerator,
    devices = 'auto',
    min_epochs = 5,
    max_epochs = epochs,
    callbacks = callbacks,
    logger = logger,
    log_every_n_steps = steps_per_epoch_train,
)

##### Fit #####
# Use ckpt_path to resume training from a checkpoint.
trainer.fit(
    model = model,
    train_dataloaders = loader_train,
    val_dataloaders = loader_valid,
    ckpt_path = "trained_model_checkpoints/acceptance_checkpoints/model_epoch_95.ckpt",
)

In [9]:
# Rebuild the classifier from the checkpoint saved at epoch 95.
acceptance_model = Classifier.load_from_checkpoint(
    checkpoint_path = "trained_model_checkpoints/acceptance_checkpoints/model_epoch_95.ckpt",
)
# Switch to evaluation mode; as the last expression, the returned module is shown as the cell output.
acceptance_model.eval()

In [None]:
# For the final plots, lift the cap on the number of batches so as much statistics as possible is used.
acceptance.N_batch = np.inf
# Use the GPU when one is available, otherwise stay on CPU: an unconditional
# .cuda() raises on CPU-only hosts, which the first cell explicitly supports.
plot_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
figs = acceptance.make_plots(model=acceptance_model.to(plot_device),show=True)