# Graph Neural Network Training

The graphs were created with 
```tools/training/GraphCreationModel.py```
Files can be found in: 
```/eos/cms/store/user/folguera/L1TMuon/INTREPID/Graphs_v240725_241015/```
in two flavours: with "all" layers connected, and with layer connections restricted to at most "3-neighbour" layers. 

In [1]:
from TrainModelFromGraph import TrainModelFromGraph

## Load data

In [2]:
import os
# Input graphs: prefer the shared EOS area and fall back to a local copy
# when that folder is not reachable from this machine.
GraphDIR = "/eos/cms/store/user/folguera/L1TMuon/INTREPID/Graphs_v240725_241015/"
if not os.path.exists(GraphDIR):
    GraphDIR = "../../graph_folder/"

# Training configuration handed to the trainer below.
ModelOutDIR = "Bsize64_lr5e-4_NOnormNodes/"
BatchSize = 64
LearningRate = 0.0005
Epochs = 500
UsingOnly = 5

# NOTE(review): the output recorded for this cell is a TypeError saying
# TrainModelFromGraph.__init__() does not accept 'UsingOnly' -- confirm that
# the TrainModelFromGraph module on the path matches this call.
trainer = TrainModelFromGraph(
    Graph_path=GraphDIR,
    Out_path=ModelOutDIR,
    BatchSize=BatchSize,
    LearningRate=LearningRate,
    Epochs=Epochs,
    UsingOnly=UsingOnly,
)
trainer.activate_debug()


TypeError: TrainModelFromGraph.__init__() got an unexpected keyword argument 'UsingOnly'

In [3]:
# Load the input graphs through the trainer; the recorded output of this cell
# lists the .pkl graph files that were picked up ("Using files: [...]").
trainer.load_data()

Using files: ['vix_graph_3_15Oct_onlypt_001.pkl', 'vix_graph_3_15Oct_onlypt_002.pkl', 'vix_graph_3_15Oct_onlypt_003.pkl', 'vix_graph_3_15Oct_onlypt_004.pkl', 'vix_graph_3_15Oct_onlypt_005.pkl', 'vix_graph_3_15Oct_onlypt_006.pkl', 'vix_graph_3_15Oct_onlypt_007.pkl', 'vix_graph_3_15Oct_onlypt_008.pkl', 'vix_graph_3_15Oct_onlypt_009.pkl', 'vix_graph_3_15Oct_onlypt_010.pkl', 'vix_graph_3_15Oct_onlypt_011.pkl', 'vix_graph_3_15Oct_onlypt_012.pkl', 'vix_graph_3_15Oct_onlypt_013.pkl', 'vix_graph_3_15Oct_onlypt_014.pkl', 'vix_graph_3_15Oct_onlypt_015.pkl', 'vix_graph_3_15Oct_onlypt_016.pkl', 'vix_graph_3_15Oct_onlypt_017.pkl', 'vix_graph_3_15Oct_onlypt_018.pkl', 'vix_graph_3_15Oct_onlypt_019.pkl', 'vix_graph_3_15Oct_onlypt_020.pkl', 'vix_graph_3_15Oct_onlypt_021.pkl', 'vix_graph_3_15Oct_onlypt_022.pkl', 'vix_graph_3_15Oct_onlypt_023.pkl', 'vix_graph_3_15Oct_onlypt_024.pkl', 'vix_graph_3_15Oct_onlypt_025.pkl', 'vix_graph_3_15Oct_onlypt_026.pkl', 'vix_graph_3_15Oct_onlypt_027.pkl', 'vix_graph_3_1

In [7]:
# Build the model through the trainer. Per the recorded output this reports
# the device in use, prints the GATRegressor layers and attempts to compile it.
trainer.initialize_model()

Using device: cuda
Model initialized
GATRegressor(
  (conv1): GATConv(5, 64, heads=1)
  (conv2): GATConv(64, 64, heads=1)
  (fc1): Linear(in_features=128, out_features=1, bias=True)
)
Try to compile model...
Model compiled


In [9]:
import torch

# Report the CUDA setup. The per-device queries are only made when a CUDA
# device is actually present: torch.cuda.current_device() raises on CPU-only
# hosts, which the rest of this notebook otherwise supports via a CPU fallback.
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    current_cuda_device = torch.cuda.current_device()
    print("Current CUDA device:", current_cuda_device)
    print("CUDA device name:", torch.cuda.get_device_name(current_cuda_device))
else:
    print("No CUDA device available; running on CPU")

CUDA available: True
CUDA device count: 1
Current CUDA device: 0
CUDA device name: Tesla T4


In [None]:
# Clear the trainer's Debug attribute before the full run -- presumably the
# flag that activate_debug() set earlier; confirm in TrainModelFromGraph.
trainer.Debug=False
# Run the trainer's epoch loop; the recorded output shows the per-epoch
# train and test losses it prints.
trainer.loop_over_epochs()

Start training...
Epoch: 01, Train loss: 10.1278, Test loss: 8.8727


# Other (older) tests

In [None]:
# Mini-batch size for the stand-alone tests below; it is also used further
# down to round the training split to a whole number of batches.
BatchSize=64

from torch_geometric.transforms import BaseTransform
import torch_geometric.transforms as T

# Data normalisation
class NormalizeEdgeFeatures(object):
    """Standardise the ``deltaPhi`` / ``deltaEta`` attributes of one graph.

    Each of the two attributes is shifted to zero mean and scaled to unit
    standard deviation, computed over the entries of that single graph.
    The object is modified in place and also returned.
    """

    def __call__(self, data):
        """Normalise ``data.deltaPhi`` and ``data.deltaEta`` and return ``data``."""
        # Stack the two edge quantities as columns so both are handled at once.
        edge_attr = torch.stack([data.deltaPhi, data.deltaEta], dim=1)
        centred = edge_attr - edge_attr.mean(dim=0)

        # The (unbiased) std is NaN for a single entry and 0 for a constant
        # column; dividing by it would fill the attribute with NaN. Use a
        # unit scale in those cases so the centred values (all zero) are kept.
        spread = torch.nan_to_num(edge_attr.std(dim=0), nan=0.0)
        spread = torch.where(spread > 0, spread, torch.ones_like(spread))

        edge_attr = centred / spread
        data.deltaPhi, data.deltaEta = edge_attr[:, 0], edge_attr[:, 1]

        return data

class NormalizeNodeAndEdgeFeatures(BaseTransform):
    """Standardise node features and the ``deltaPhi`` / ``deltaEta`` attributes.

    ``data.x`` and the two edge quantities are each shifted to zero mean and
    scaled to unit standard deviation column by column, using statistics of
    the single graph being transformed. The object is modified in place and
    also returned.
    """

    @staticmethod
    def _standardize(values):
        """Return ``values`` with every column at zero mean and unit std.

        The (unbiased) std is NaN for a single row and 0 for a constant
        column; a unit scale is used in those cases so the result is zeros
        instead of NaN.
        """
        spread = torch.nan_to_num(values.std(dim=0), nan=0.0)
        spread = torch.where(spread > 0, spread, torch.ones_like(spread))
        return (values - values.mean(dim=0)) / spread

    def __call__(self, data):
        """Normalise ``data.x``, ``data.deltaPhi`` and ``data.deltaEta``; return ``data``."""
        # Normalise the node features
        data.x = self._standardize(data.x)

        # Normalise the edge features (stacked as two columns)
        edge_attr = self._standardize(torch.stack([data.deltaPhi, data.deltaEta], dim=1))
        data.deltaPhi, data.deltaEta = edge_attr[:, 0], edge_attr[:, 1]

        return data


# Apply the normalisation transforms
#transform = T.Compose([T.RemoveIsolatedNodes(),T.NormalizeFeatures()]) #NormalizeEdgeFeatures()])
transform = T.Compose([T.RemoveIsolatedNodes(), NormalizeNodeAndEdgeFeatures()])


def _print_graph_example(graph):
    """Print the tensors of one graph that are inspected before/after normalisation."""
    for attribute in ("x", "edge_index", "edge_attr", "deltaPhi", "deltaEta", "y"):
        print(getattr(graph, attribute))


# NOTE(review): `Allgraphs` is not defined in any visible cell of this
# notebook -- it has to exist in the kernel before this cell is run.
# Flatten the list of per-file graph lists in linear time
# (sum(list_of_lists, []) copies the accumulated list at every step).
Graphs_for_training = [graph for graphs in Allgraphs for graph in graphs]
Graphs_for_training_reduced = Graphs_for_training
# Remove empty graphs (no edges)
Graphs_for_training_filtered = [g for g in Graphs_for_training_reduced if g.edge_index.size(1) > 0]

# Remove the extra dimension in y by averaging over dim 0
print(f"Total Graphs: {len(Graphs_for_training)}")
for graph in Graphs_for_training_filtered:
    graph.y = graph.y.mean(dim=0)

print("====================================")
print("Example of data (before normalization):")
_print_graph_example(Graphs_for_training_filtered[0])

# Normalize the data...
#Graphs_for_training_filtered = normalize_graphs(Graphs_for_training_filtered)

Graphs_for_training_filtered = [transform(g) for g in Graphs_for_training_filtered]

print(f"Total Graphs after filtering: {len(Graphs_for_training_filtered)}")

# Train and test split:
events = len(Graphs_for_training_filtered)
ntrain = int((events * 0.7) / BatchSize) * BatchSize  # to have full batches
print(f"Training events: {ntrain}")
train_dataset = Graphs_for_training_filtered[:ntrain]
# Everything not used for training is used for testing. The previous upper
# bound (ntrain * 2) dropped graphs, or left the test set empty, whenever
# fewer than 2 * ntrain graphs were available.
test_dataset = Graphs_for_training_filtered[ntrain:]

print("====================================")
print("Example of data (after normalization):")
if train_dataset:  # the training split is empty when there is less than one full batch
    _print_graph_example(train_dataset[0])
print("====================================")


In [None]:
# Build the mini-batch loaders.
# `DataLoader` is not imported anywhere else in this notebook, so it is
# imported here (the graph-aware loader from PyTorch Geometric) to keep the
# cell runnable on a fresh kernel.
from torch_geometric.loader import DataLoader

# Training: shuffled mini-batches. Testing: one graph at a time, in order.
train_loader = DataLoader(train_dataset, batch_size=BatchSize, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

### Training loop

In [None]:
from models import GATRegressor

num_node_features = 5
# Width of the hidden layers. This used to be set to `BatchSize` (64), which
# silently changed the architecture whenever the batch size was tuned.
hidden_dim = 64
output_dim = 1
LearningRate = 0.0005
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Using device: {device}")
model = GATRegressor(num_node_features, hidden_dim, output_dim).to(device)
# NOTE: torch.compile() returns a new wrapper and leaves `model` untouched.
# The previous call here discarded that return value, so training always ran
# the eager module; the no-op call has been removed. The eager module is kept
# so that torch.save(model, ...) in the training loop pickles a plain
# nn.Module.

# NOTE(review): weight_decay=0.75 is a very strong L2 penalty for Adam --
# confirm this value is intended.
optimizer = torch.optim.Adam(model.parameters(), lr=LearningRate, weight_decay=0.75)
if torch.cuda.device_count() > 1:
    # The old branch assigned to `self.model`, which is a NameError at module
    # scope. The model is left on the single `device` chosen above;
    # multi-GPU training of graph batches would also need a matching loader.
    print(f"{torch.cuda.device_count()} GPUs detected; training on {device} only")

loss_fn = torch.nn.MSELoss()
print("Model initialized")
print(model)


In [None]:
# Per-epoch loss history, filled by the training loop.
train_losses = []
test_losses = []
#
#path = "/eos/cms/store/user/folguera/L1TMuon/INTREPID/Model_v240725_241022/"
# Output folder for checkpoints, saved losses and the loss plot.
path = "../model_folder_v2/"
# exist_ok avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(path, exist_ok=True)

In [None]:
def train():
    """Run one optimisation pass over ``train_loader``.

    Uses the module-level ``model``, ``optimizer``, ``loss_fn``, ``device``
    and ``train_loader``.

    Returns:
        The mean loss per graph over the pass. ``loss_fn`` averages within a
        batch, so every batch loss is weighted by the number of graphs in
        that batch before dividing by the dataset size. Summing the batch
        means directly (as was done before) under-reports the loss by a
        factor of the batch size and makes it incomparable with ``test()``.
    """
    model.train()
    total_loss = 0.0
    for data in train_loader:
        data = data.to(device)   # move the batch to the training device
        data.y = data.y.float()  # make sure targets are float32
        data.x = data.x.float()  # make sure node features are float32
        optimizer.zero_grad()
        out = model(data)
        loss = loss_fn(out, data.y.view(out.size()))
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * data.num_graphs
    return total_loss / len(train_loader.dataset)

def test():
    with torch.no_grad():
        model.eval()
        total_loss = 0
        for data in test_loader:
            data = data.to(device)
            data.x = data.x.float()  # Asegurarse de que los datos sean float32
            data.y = data.y.float()  # Asegurarse de que los datos sean float32
            out = model(data)
            loss = loss_fn(out, data.y.view(out.size()))
            total_loss += float(loss)
    return total_loss / len(test_loader.dataset)


# `plt` is used below but is not imported anywhere else in this notebook.
import matplotlib.pyplot as plt

print("Start training...")
for epoch in range(100):
    train_loss = train()
    test_loss = test()
    train_losses.append(train_loss)
    test_losses.append(test_loss)

    epoch_number = epoch + 1
    if epoch == 0:
        # First epoch: report and store the losses only (no checkpoint yet).
        print(f'Epoch: {epoch_number:02d}, Train loss: {train_loss:.4f}, Test loss: {test_loss:.4f}')
        torch.save(test_loss, f"{path}/testloss_{epoch_number}.pt")
        torch.save(train_loss, f"{path}/trainloss_{epoch_number}.pt")
    elif epoch_number % 10 == 0:
        # Every 10th epoch: report, checkpoint the whole model, store the
        # losses and refresh the loss plot.
        print(f'Epoch: {epoch_number:02d}, Train loss: {train_loss:.4f}, Test loss: {test_loss:.4f}')
        torch.save(model, f"{path}/model_{epoch_number}.pth")
        torch.save(test_loss, f"{path}/testloss_{epoch_number}.pt")
        torch.save(train_loss, f"{path}/trainloss_{epoch_number}.pt")

        # Clear the current figure first: otherwise every checkpoint draws the
        # full history again on top of the curves from earlier checkpoints.
        plt.clf()
        plt.plot(train_losses, "b", label="Train loss")
        plt.plot(test_losses, "k", label="Test loss")
        plt.yscale('log')
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.legend()  # the labels above are only shown once a legend is drawn
        plt.savefig(f"{path}/loss_plot.png")


## Plotting and checking
Now we need to plot everything and check the predicted momentum

In [None]:
class PlotRegression:
    """Collect model predictions on a loader and save truth-vs-prediction plots.

    Call ``evaluate()`` to fill ``pt_pred_arr`` / ``pt_truth_arr`` and then
    ``plot_regression(output_dir)`` to write the figures.
    """

    def __init__(self, model, test_loader, batch_size):
        """Store the model and loader and start with empty result lists.

        Args:
            model: module called as ``model(batch)`` for every batch.
            test_loader: iterable of batches exposing ``x``, ``y`` and ``to()``.
            batch_size: stored on the instance; not used by the methods below.
        """
        self.model = model
        self.test_loader = test_loader
        self.batch_size = batch_size
        self.pt_pred_arr = []
        self.pt_truth_arr = []

    def evaluate(self):
        """Run the model over ``test_loader`` and append predictions and targets.

        Batches are moved to the device holding the model parameters and cast
        to float32, matching what the training and test functions of this
        notebook do. Results are appended, so repeated calls accumulate.
        """
        # Use the device the model lives on; fall back to CPU for a model
        # without parameters.
        first_param = next(self.model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

        self.model.eval()
        with torch.no_grad():
            for data in self.test_loader:
                data = data.to(device)
                data.x = data.x.float()
                out = self.model(data)
                # Align targets with the predictions (one value per graph),
                # then flatten both to plain Python floats.
                truth = data.y.float().view(out.size())
                self.pt_pred_arr.extend(out.reshape(-1).tolist())
                self.pt_truth_arr.extend(truth.reshape(-1).tolist())

    def plot_regression(self, output_dir):
        """Save three figures into ``output_dir`` (created if missing).

        Written files: ``pt_regression.png`` (overlaid histograms),
        ``pt_regression_scatter.png`` (prediction vs truth) and
        ``pt_regression_diff.png`` (histogram of truth minus prediction).
        """
        os.makedirs(output_dir, exist_ok=True)
        plt.clf()
        print(f"Plotting regression in {output_dir}")
        plt.hist(self.pt_truth_arr, bins=100, color='skyblue', alpha=0.5, label="truth")
        plt.hist(self.pt_pred_arr, bins=100, color='g', alpha=0.5, label="prediction")
        plt.legend()
        plt.savefig(os.path.join(output_dir, "pt_regression.png"))
        plt.clf()

        print(f"Plotting scatter in {output_dir}")
        plt.plot(self.pt_truth_arr, self.pt_pred_arr, 'o')
        plt.xlabel("Truth")
        plt.ylabel("Prediction")
        plt.savefig(os.path.join(output_dir, "pt_regression_scatter.png"))
        plt.clf()

        print(f"Plotting difference in {output_dir}")
        # Difference between truth and prediction, entry by entry
        diff = [truth - pred for truth, pred in zip(self.pt_truth_arr, self.pt_pred_arr)]
        plt.hist(diff, bins=100, color='r', alpha=0.5, label="difference")
        plt.legend()
        plt.savefig(os.path.join(output_dir, "pt_regression_diff.png"))
        plt.clf()
        



In [None]:
# Reload a saved checkpoint and produce the regression plots.
# NOTE(review): this reads from "../model_folder/" while the training loop
# above writes to "../model_folder_v2/" -- confirm which checkpoint is meant.
checkpoint_dir = "../model_folder/"

# The checkpoint is a fully pickled model (written with torch.save(model, ...)),
# so weights_only=False is needed on PyTorch versions where torch.load defaults
# to weights-only loading. Unpickling executes code: only load trusted files.
# map_location lets a GPU-trained checkpoint load on a CPU-only machine.
model = torch.load(
    os.path.join(checkpoint_dir, "model_100.pth"),
    map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    weights_only=False,
)
model.eval()  # inference mode (disables dropout and similar layers)

evaluator = PlotRegression(model, test_loader, batch_size=BatchSize)
evaluator.evaluate()
evaluator.plot_regression(output_dir=checkpoint_dir)
