In [1]:
from data_processing import Dataset
from noise import NoiseScheduler
import matplotlib.pyplot as plt
import numpy as np
from torchvision import transforms  #provides common image transformations for data preprocessing
import torch    #used for tensor computations and building neural networks
from torch.utils.data import TensorDataset, DataLoader  #TensorDataset wraps tensors. DataLoader creates iterable dataloaders for datasets.
from diffusers.optimization import get_cosine_schedule_with_warmup # This function creates a learning rate scheduler with a cosine decay and warmup period
import torch.nn.functional as F
from tqdm import tqdm # Library used to display progress bars for loops, making it easy to track the progress of an iteration
from torch.optim import Adam # Popular optimization algorithm used in training neural networks.
from pathlib import Path # Object-oriented interface for filesystem paths.
import os
import numpy as np
import random
random.seed(10)

if torch.cuda.is_available():
    print("CUDA is available!")
    print("Number of available GPUs:", torch.cuda.device_count())
    print("Current GPU:", torch.cuda.current_device())
else:
    print("CUDA is not available. Running on CPU.")

CUDA is available!
Number of available GPUs: 1
Current GPU: 0


In [2]:
data_dir = "Datasets"       # Set to directory where data is stored

dataset = Dataset(10000, (120, 72), signal_file=f"{data_dir}/CaloImages_signal.root", pile_up_file=f"{data_dir}/CaloImages_bkg.root", save=False) # Can set to 10000
# 1000: number of samples in dataset
# (120, 72): Shape of each data sample (eg. image with dimensions 120x72)
# signal_file: Signal file for the dataset
# pile_up_file: This file contains background/ pileup data for the dataset
# save=False means the dataset should not be saved to disk after creation


In [3]:
dataset() # once this is cached, you don't have to re-load

INFO:root:loading file Datasets/CaloImages_signal.root
loading file Datasets/CaloImages_signal.root
100%|██████████| 10000/10000 [00:08<00:00, 1173.75it/s]
100%|██████████| 10000/10000 [00:08<00:00, 1140.96it/s]


In [4]:
new_dim=(64,64) #resize each data sample image into 64x64 resolution

In [5]:
dataset.preprocess(16, new_dim)   # Adjust the saturation energy and image dimensions here
# Pixels with an energy greater than the first number (eg.16 or 64 etc) will be clipped and shown as this number

INFO:root:scaling
scaling
INFO:root:re-sizing
re-sizing


In [6]:
# Converting from image from dataset from original format into PyTorch tensor
preprocess = transforms.Compose( # Chain together multiple image transformations. The transformations are applied sequentially in the order they are specified within the list.
        [   
            transforms.ToTensor()   # This is a transformation that converts an image (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0].
                                    # H stands for height, W stands for width, and C stands for the number of channels (e.g., 3 for RGB images).
        ]
)

MOVING DATA TO GPU

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
# Apply preprocess tranformation from prev cell to signal data and pileup data from dataset
clean_frames = preprocess(dataset.signal).float().permute(1, 2, 0).to(device) #pytorch semantics
pile_up = preprocess(dataset.pile_up).float().permute(1, 2, 0).to(device)

# Permute changes the order from (C, H, W) (default for PyTorch tensors after ToTensor) to (H, W, C)
# This is done to match the common image representation format where the last dimension is the number of channels (e.g., RGB)


In [9]:
# Creating a DataLoader object for the clean_frames dataset

# batch_size determines how many samples will be processed together in each iteration during training or evaluation.
batch_size = 16  # Adjust as needed, DataLoader will return batches of this many samples at a time

dataloader = DataLoader(clean_frames.unsqueeze(1), batch_size=batch_size, shuffle=False)
# clean_frames.unsqueeze(1) adds an extra dimension to the tensor, which is necessary to match the expected input shape for the DataLoader.

In [10]:
#check tensor shape

for batch in dataloader:
    for tensor in batch:
        print(tensor.shape)
        break
    break

torch.Size([1, 64, 64])


In [11]:
from models_stripped_kernels import Model, TrainingConfig, UNetLite_hls #see models.py file

modtype = 'UNet_lite' # Change Model type here

if modtype == 'UNet2d':
    model = Model('UNet', new_dim)
    model = model.__getitem__()
    config = TrainingConfig(output_dir='retrained_models_UNet2d') # Holds parameters used for training the model eg. learning rate, image size, number of epochs ....

elif modtype == 'UNet_lite':
    model = UNetLite_hls()
    config = TrainingConfig(output_dir='trained_models_lite/temp') # Holds parameters used for training the model eg. learning rate, image size, number of epochs ....

print('Number of learnable params: ', sum(p.numel() for p in model.parameters() if p.requires_grad)) #number of learnable params

model = model.to(device)

  deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)


Number of learnable params:  29654


In [12]:
# torch.optim.AdamW: variant of the Adam optimizer that incorporates weight decay (L2 regularization) to help prevent overfitting
# model.parameters(): This passes all the parameters of the model that should be optimized
# lr=config.learning_rate: This sets the learning rate for the optimizer, which controls how much to adjust the model parameters at each step of the training process
optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate) #optimizer updates model parameters to minimise the loss function

# get_cosine_schedule_with_warmup: This function creates a learning rate scheduler with a warmup period followed by a cosine decay.
# Warmup Period: The learning rate starts at a lower value and gradually increases to the initial learning rate over a specified number of steps (num_warmup_steps).
# This helps in stabilizing the training process at the beginning.
lr_scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=config.lr_warmup_steps, # Sets number of warmup steps
    num_training_steps=len(dataloader) * config.num_epochs # Calculates total number of training steps based on num of batches per epoch and total num of epochs
)

In [13]:
def train_loop(config, model, noise_sample, optimizer, train_dataloader, lr_scheduler,noise_scheduler, n_events):

#This function defines the main training loop for the model

# config: Configuration object containing training settings.
# model: The neural network model to be trained.
# noise_sample: Noise data used for augmenting the images.
# optimizer: Optimizer for updating the model parameters.
# train_dataloader: DataLoader providing batches of training data.
# lr_scheduler: Learning rate scheduler.
# noise_scheduler: Scheduler for adding noise to the images.
# n_events: Number of events, possibly related to the dataset size or specific augmentations.
###

    global_step = torch.tensor(0) # Tensor to keep track of the number of steps taken during training
    # Now you train the model
    for epoch in range(10):
        
        # Just creating the progress bar
        progress_bar = tqdm(total=len(train_dataloader))
        progress_bar.set_description(f"Epoch {epoch}")

        # Iterate over each batch in the training DataLoader
        for step, batch in enumerate(train_dataloader):

            clean_images = batch
            
            # Sample noise to add to the images
            bs = clean_images[0].shape[0] # Batch Size: Extracts the batch size from the first dimension of clean_images.
            timesteps = torch.randint(
                0, config.num_train_timesteps, (bs,), device=clean_images.device
            ).long()

            random_seed = np.random.randint(0, n_events)

            noisy_images, noise_added = noise_scheduler.add_noise(clean_frame=clean_images, noise_sample=noise_sample, timestep=timesteps, random_seed=random_seed, n_events = n_events)

            noisy_images = noisy_images.to(device)
            noise_added = noise_added.to(device)

            # Predict the noise residual
            noise_pred = model(noisy_images, timesteps)[0] # The model takes the noisy images and timesteps as input and outputs predictions.
            loss = F.mse_loss(noise_pred, noise_added.float()) # Calculates the mean squared error loss between the predicted noise and the added noise

            optimizer.zero_grad() # Clears the old gradients from the last step by setting them to zero. This is necessary because by default, gradients are accumulated in PyTorch.
            loss.backward() # Computes the gradient of the loss with respect to the model parameters
            optimizer.step() # Updates the model parameters based on the computed gradients
            lr_scheduler.step() # Updates the learning rate according to the scheduler's policy

            progress_bar.update(1)
            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step}
            progress_bar.set_postfix(**logs)
            global_step += 1

            torch.save(model.state_dict(), os.path.join(config.output_dir, f"model_epoch_{epoch}.pt"))

In [14]:
from accelerate import notebook_launcher

args = (config, model, pile_up, optimizer, dataloader, lr_scheduler, NoiseScheduler('pile-up'), torch.tensor(1000))

notebook_launcher(train_loop, args, num_processes=1) #will port to GPU if availible (can't train on mutli-GPU at Bristol) 

Launching training on one GPU.


  loss = F.mse_loss(noise_pred, noise_added.float()) # Calculates the mean squared error loss between the predicted noise and the added noise
  loss = F.mse_loss(noise_pred, noise_added.float()) # Calculates the mean squared error loss between the predicted noise and the added noise
Epoch 0: 100%|██████████| 625/625 [00:10<00:00, 59.98it/s, loss=0.000762, lr=9.99e-5, step=tensor(624)]
Epoch 0: 100%|██████████| 625/625 [00:10<00:00, 60.96it/s, loss=0.000762, lr=9.99e-5, step=tensor(624)]

Epoch 1:   0%|          | 0/625 [00:00<?, ?it/s]
Epoch 1:   0%|          | 1/625 [00:00<00:20, 30.30it/s, loss=0.00271, lr=9.99e-5, step=tensor(625)]
Epoch 1:   0%|          | 2/625 [00:00<00:20, 30.30it/s, loss=0.000597, lr=9.99e-5, step=tensor(626)]
Epoch 1:   0%|          | 3/625 [00:00<00:20, 30.61it/s, loss=0.00136, lr=9.99e-5, step=tensor(627)] 
Epoch 1:   1%|          | 4/625 [00:00<00:20, 30.53it/s, loss=0.00136, lr=9.99e-5, step=tensor(627)]
Epoch 1:   1%|          | 4/625 [00:00<00:20, 30.53i