In [194]:
!pip install dgl



In [195]:
%matplotlib inline
import os

os.environ["DGLBACKEND"] = "pytorch"
import dgl
import numpy as np
import networkx as nx
import torch
import torch.nn as nn
import dgl.function as fn
import torch.nn.functional as F
import shutil
from torch.utils.data import DataLoader
import cloudpickle
from dgl.nn import GraphConv

##Set Path

this segment involves directory and file operations. It creates directories, deletes directories, and extracts the contents of a ZIP file.

In [196]:
current_dir = "./"
checkpoint_path = current_dir + "save_models/model_checkpoints/" + "checkpoint"
os.makedirs(checkpoint_path, exist_ok=True)

best_model_path = current_dir + "save_models/best_model/"

folder_data_temp = current_dir +"data_temp/"
shutil.rmtree(folder_data_temp, ignore_errors=True)

path_save = current_dir + "free.zip"
shutil.unpack_archive(path_save, folder_data_temp)

##Custom PyTorch Datasets

The `DGLDatasetReg` class is a custom dataset class specifically designed for regression tasks using Deep Graph Library (DGL). the `DGLDatasetReg` class provides a convenient way to handle regression datasets in DGL. It supports optional feature scaling and allows for customization through the `transform` parameter.

In [197]:
from sklearn import preprocessing

In [198]:
# """ Regression Dataset """
# class DGLDatasetReg(torch.utils.data.Dataset):
#     def __init__(self, address, transform=None, train=False, scaler=None):
#             self.transform=transform
#             self.train = train
#             self.scaler = scaler
#             self.data_set, train_labels_masks_globals = dgl.load_graphs(address+".bin")
#             num_graphs = len(self.data_set)
#             self.labels = train_labels_masks_globals["labels"].view(num_graphs,-1)
#             self.masks = train_labels_masks_globals["masks"].view(num_graphs,-1)
#             self.globals = train_labels_masks_globals["globals"].view(num_graphs,-1)

#     def scaler_method(self):
#         if self.train:
#             scaler =preprocessing.StandardScaler().fit(self.labels)
#             self.scaler = scaler
#         return self.scaler
#     #  def scaler_method(self):
#     #       if self.train:
#     #           scaler = preprocessing.StandardScaler().fit(self.labels)
#     #           self.scaler = scaler  # Assign scaler to self.scaler
#     #       return self.scaler

#     def __len__(self):
#         return len(self.data_set)
#     def __getitem__(self, idx):
#             return  self.data_set[idx], torch.tensor(self.scaler.transform(self.labels)[idx]).float(), self.masks[idx], self.globals[idx]



In [199]:
class DGLDatasetReg(torch.utils.data.Dataset):
    def __init__(self, address, transform=None, train=False, scaler=None):
        self.transform = transform
        self.train = train
        self.data_set, train_labels_masks_globals = dgl.load_graphs(address + ".bin")
        num_graphs = len(self.data_set)
        self.labels = train_labels_masks_globals["labels"].view(num_graphs, -1)
        self.masks = train_labels_masks_globals["masks"].view(num_graphs, -1)
        self.globals = train_labels_masks_globals["globals"].view(num_graphs, -1)
        if scaler is None:
            self.scaler = self.scaler_method()
        else:
            self.scaler = scaler

    def scaler_method(self):
        if self.train:
            scaler = preprocessing.StandardScaler().fit(self.labels)
            return scaler
        return None

    def __len__(self):
        return len(self.data_set)

    def __getitem__(self, idx):
        labels = self.labels
        if self.scaler is not None:
            labels = torch.tensor(self.scaler.transform(self.labels)[idx]).float()
        return (
            self.data_set[idx],
            labels,
            self.masks[idx],
            self.globals[idx],
        )



##Defining Train, Validation, and Test Set

this code snippet demonstrates the construction of training, validation, and test datasets using the DGLDatasetReg class. It also shows the use of a scaler to preprocess the data, ensuring that the validation and test sets are scaled consistently with the training set.

In [200]:
path_data_temp = folder_data_temp + "scaffold" + "_" + str(0)
train_set = DGLDatasetReg(address=path_data_temp + "_train", train=True)
scaler = train_set.scaler_method()
val_set = DGLDatasetReg(address=path_data_temp + "_val", scaler=scaler)
test_set = DGLDatasetReg(address=path_data_temp + "_test", scaler=scaler)


print(len(train_set), len(val_set), len(test_set))

513 64 65


In [201]:
train_set = DGLDatasetReg(address=path_data_temp + "_train", train=True)
scaler = train_set.scaler_method()
val_set = DGLDatasetReg(address=path_data_temp + "_val", train=False, scaler=scaler)  # Set train=False
test_set = DGLDatasetReg(address=path_data_temp + "_test", train=False, scaler=scaler)  # Set train=False
print(len(train_set), len(val_set), len(test_set))

513 64 65


this code snippet loads the train, validation, and test datasets, concatenates them into a single dataset, and then analyzes the graph properties such as the number of vertices, edges, and graphs. It also computes and prints the shape of the adjacency matrices for each graph.

In [202]:

# Concatenate the datasets
dataset = torch.utils.data.ConcatDataset([train_set, val_set, test_set])

# Get all the graphs in the dataset
graphs = [data[0] for data in dataset]

# Get the number of vertices, edges, and graphs
num_vertices = [graph.number_of_nodes() for graph in graphs]
num_edges = [graph.number_of_edges() for graph in graphs]
num_graphs = len(dataset)

print("Number of vertices:", num_vertices)
print("Number of edges:", num_edges)
print("Number of graphs:", num_graphs)

# Get the adjacency matrix shape
adj_matrix = [graph.adjacency_matrix() for graph in graphs]
adj_shape = [matrix.shape for matrix in adj_matrix]

print("Adjacency matrix shape:", adj_shape)


Number of vertices: [2, 2, 2, 1, 2, 2, 3, 2, 3, 8, 1, 5, 5, 2, 2, 3, 2, 3, 3, 3, 2, 2, 3, 1, 3, 2, 7, 3, 3, 5, 2, 2, 1, 2, 1, 3, 1, 4, 2, 2, 3, 3, 2, 3, 2, 1, 2, 2, 3, 1, 3, 4, 2, 3, 6, 1, 2, 2, 3, 2, 2, 9, 3, 3, 2, 3, 3, 3, 3, 5, 2, 1, 1, 2, 6, 2, 1, 2, 1, 2, 3, 7, 2, 2, 3, 2, 2, 5, 4, 2, 1, 4, 3, 1, 5, 3, 4, 3, 3, 2, 1, 3, 3, 2, 2, 2, 2, 4, 4, 2, 8, 5, 2, 3, 3, 3, 3, 6, 3, 3, 3, 2, 1, 3, 3, 5, 3, 2, 3, 2, 7, 4, 1, 5, 1, 2, 4, 1, 3, 2, 8, 8, 1, 3, 3, 2, 4, 2, 2, 2, 4, 1, 2, 2, 5, 3, 4, 3, 1, 7, 5, 2, 3, 2, 3, 3, 2, 2, 3, 3, 1, 3, 3, 2, 2, 3, 2, 2, 5, 2, 2, 2, 2, 2, 2, 3, 2, 2, 1, 2, 7, 2, 3, 5, 2, 2, 6, 2, 5, 3, 3, 3, 1, 3, 3, 4, 1, 2, 2, 2, 2, 2, 5, 3, 7, 3, 1, 3, 6, 3, 2, 3, 3, 2, 2, 2, 2, 3, 5, 2, 2, 2, 2, 3, 3, 4, 3, 3, 2, 2, 1, 7, 3, 3, 1, 6, 3, 1, 2, 1, 5, 3, 2, 3, 2, 4, 2, 3, 3, 3, 2, 2, 2, 1, 4, 2, 4, 2, 4, 5, 3, 4, 2, 3, 4, 3, 5, 2, 2, 2, 2, 3, 2, 2, 1, 8, 3, 3, 1, 3, 4, 2, 2, 4, 1, 2, 4, 2, 2, 3, 1, 4, 2, 1, 3, 3, 4, 5, 4, 2, 1, 2, 2, 2, 5, 2, 2, 2, 3, 2, 6, 2, 2, 3, 3, 2, 1

This  code focuses on analyzing the properties of a single graph rather than analyzing properties for all graphs in the dataset. It retrieves the first graph, calculates the number of vertices and edges, and computes the shape of the adjacency matrix.

In [203]:
# Concatenate the datasets
dataset = torch.utils.data.ConcatDataset([train_set, val_set, test_set])

# Get the first graph in the dataset
graph = dataset[0][0]

# Get the number of vertices, edges, and graphs
num_vertices = graph.number_of_nodes()
num_edges = graph.number_of_edges()
num_graphs = len(dataset)

print("Number of vertices:", num_vertices)
print("Number of edges:", num_edges)
print("Number of graphs:", num_graphs)

# Get the adjacency matrix shape
adj_matrix = graph.adjacency_matrix()
adj_shape = adj_matrix.shape

print("Adjacency matrix shape:", adj_shape)


Number of vertices: 2
Number of edges: 2
Number of graphs: 642
Adjacency matrix shape: (2, 2)


In this code snippet, the cumulative counts of vertices and edges are calculated for all graphs in the dataset. Additionally, the shape of the adjacency matrix is determined

In [204]:
# Concatenate the datasets
dataset = torch.utils.data.ConcatDataset([train_set, val_set, test_set])

# Get the total number of graphs
num_graphs = len(dataset)

# Initialize variables to store cumulative counts
total_vertices = 0
total_edges = 0
adj_shape = None

# Iterate over all graphs in the dataset
for i in range(num_graphs):
    graph = dataset[i][0]  # Get the graph
    num_vertices = graph.number_of_nodes()  # Get the number of vertices
    num_edges = graph.number_of_edges()  # Get the number of edges

    # Update cumulative counts
    total_vertices += num_vertices
    total_edges += num_edges

    if adj_shape is None:
        adj_matrix = graph.adjacency_matrix()  # Get the adjacency matrix
        adj_shape = adj_matrix.shape  # Get the shape of the adjacency matrix

# Print the results
print("Number of vertices:", total_vertices)
print("Number of edges:", total_edges)
print("Adjacency matrix shape:", adj_shape)
print("Number of graphs:", num_graphs)


Number of vertices: 2111
Number of edges: 3030
Adjacency matrix shape: (2, 2)
Number of graphs: 642


In this code snippet, the focus is on a single dataset rather than concatenating multiple datasets.
The code focuses on a single dataset and extracts information from the first graph in that dataset, including the number of vertices, edges, and the shape of the adjacency matrix. The results are then printed.

In [205]:
# Get the dataset
dataset =DGLDatasetReg(address=path_data_temp + "_train", train=True)
scaler = train_set.scaler_method()

# Get the first graph in the dataset
graph = dataset[0][0]

# Get the number of vertices, edges, and graphs
num_vertices = graph.number_of_nodes()
num_edges = graph.number_of_edges()
num_graphs = len(dataset)

print("Number of vertices:", num_vertices)
print("Number of edges:", num_edges)
print("Number of graphs:", num_graphs)

# Get the adjacency matrix shape
adj_matrix = graph.adjacency_matrix()
adj_shape = adj_matrix.shape

print("Adjacency matrix shape:", adj_shape)



Number of vertices: 2
Number of edges: 2
Number of graphs: 513
Adjacency matrix shape: (2, 2)


In [206]:
# Get the dataset
dataset = DGLDatasetReg(address=path_data_temp + "_val", train=False, scaler=scaler)

# Get the first graph in the dataset
graph = dataset[0][0]

# Get the number of vertices, edges, and graphs
num_vertices = graph.number_of_nodes()
num_edges = graph.number_of_edges()
num_graphs = len(dataset)

print("Number of vertices:", num_vertices)
print("Number of edges:", num_edges)
print("Number of graphs:", num_graphs)

# Get the adjacency matrix shape
adj_matrix = graph.adjacency_matrix()
adj_shape = adj_matrix.shape

print("Adjacency matrix shape:", adj_shape)


Number of vertices: 2
Number of edges: 2
Number of graphs: 64
Adjacency matrix shape: (2, 2)


By loading the DGL dataset, accessing the first graph, and retrieving information such as the number of vertices, edges, and the shape of the adjacency matrix, the code provides insights into the structure of the graph stored in the dataset.

In [207]:
# Get the dataset
dataset =DGLDatasetReg(address=path_data_temp + "_test", train=False, scaler=scaler)

# Get the first graph in the dataset
graph = dataset[0][0]

# Get the number of vertices, edges, and graphs
num_vertices = graph.number_of_nodes()
num_edges = graph.number_of_edges()
num_graphs = len(dataset)

print("Number of vertices:", num_vertices)
print("Number of edges:", num_edges)
print("Number of graphs:", num_graphs)

# Get the adjacency matrix shape
adj_matrix = graph.adjacency_matrix()
adj_shape = adj_matrix.shape

print("Adjacency matrix shape:", adj_shape)


Number of vertices: 2
Number of edges: 2
Number of graphs: 65
Adjacency matrix shape: (2, 2)


The code accesses the global features from the train, validation, and test sets and prints them. Here's how it works:

1. `train_globals = [globals for _, _, _, globals in train_set]`: This line retrieves the global features from the train set and assigns them to the `train_globals` list. Each global feature corresponds to a molecule in the train set.
2. `val_globals = [globals for _, _, _, globals in val_set]`: This line retrieves the global features from the validation set and assigns them to the `val_globals` list. Each global feature corresponds to a molecule in the validation set.
3. `test_globals = [globals for _, _, _, globals in test_set]`: This line retrieves the global features from the test set and assigns them to the `test_globals` list. Each global feature corresponds to a molecule in the test set.

4. The following lines print the global features:
   - `print("Global Features for Train Set:")`: Prints a header indicating the train set global features are being displayed.
   - `for i, globals in enumerate(train_globals):`: Iterates over the train_globals list.
     - `print(f"Molecule {i+1}: {globals}")`: Prints the global features for each molecule in the train set.
   - `print()`: Prints an empty line for separation.
   - `print("Global Features for Validation Set:")`: Prints a header indicating the validation set global features are being displayed.
   - `for i, globals in enumerate(val_globals):`: Iterates over the val_globals list.
     - `print(f"Molecule {i+1}: {globals}")`: Prints the global features for each molecule in the validation set.
   - `print()`: Prints an empty line for separation.
   - `print("Global Features for Test Set:")`: Prints a header indicating the test set global features are being displayed.
   - `for i, globals in enumerate(test_globals):`: Iterates over the test_globals list.
     - `print(f"Molecule {i+1}: {globals}")`: Prints the global features for each molecule in the test set.

By accessing the global features using the tuple unpacking syntax (`for _, _, _, globals in ...`), the code retrieves and displays the global features for each molecule in the train, validation, and test sets.

In [208]:
# Access the global features
train_globals = [globals for _, _, _, globals in train_set]
val_globals = [globals for _, _, _, globals in val_set]
test_globals = [globals for _, _, _, globals in test_set]

# Print the global features
print("Global Features for Train Set:")
for i, globals in enumerate(train_globals):
    print(f"Molecule {i+1}: {globals}")
print()

print("Global Features for Validation Set:")
for i, globals in enumerate(val_globals):
    print(f"Molecule {i+1}: {globals}")
print()

print("Global Features for Test Set:")
for i, globals in enumerate(test_globals):
    print(f"Molecule {i+1}: {globals}")
print()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        1.4939e-01, 2.5099e-11, 6.0925e-01, 3.4341e-01, 1.0887e-06, 2.3915e-11,
        7.5225e-13, 9.8121e-03, 7.9826e-01, 4.9928e-01, 1.9227e-01, 2.4204e-01,
        8.7687e-01, 9.1700e-03, 1.3219e-02, 1.0000e+00, 9.7380e-03, 2.9713e-03,
        2.0128e-01, 3.7172e-03, 1.8981e-01, 3.4348e-01, 1.8981e-01, 9.6218e-01,
        8.9069e-01, 7.9607e-01, 3.2009e-02, 8.7581e-01, 3.7825e-02, 1.4199e-03,
        9.6265e-03, 7.0728e-01, 1.9367e-01, 9.9386e-22, 1.4249e-01, 6.2963e-02,
        3.4723e-02, 8.6504e-01, 8.3758e-02, 8.4739e-02, 7.3166e-01, 4.9077e-01,
        5.6952e-08, 2.1523e-03, 0.0000e+00, 1.1424e-21, 2.4025e-23, 4.8172e-02,
        1.2694e-01, 4.5625e-01, 3.6088e-15, 1.4598e-01, 9.7548e-01, 9.6076e-01,
        6.2830e-01, 6.3417e-01, 9.9050e-01, 1.5707e-01, 6.0306e-26, 2.3724e-02,
        1.5012e-01, 2.7501e-02, 2.6103e-02, 4.9837e-01, 7.3182e-02, 1.6710e-01,
        4.1633e-01, 8.7862e-24, 1.0863e-01, 3.0449e-02,

The code accesses the output masks for the train, validation, and test sets, and then prints the masks.As you can see, there is no empty cell and values of 1 are given.

In [209]:
# Access the output masks
train_masks = train_set.masks
val_masks = val_set.masks
test_masks = test_set.masks

# Print the output masks
print("Train Set Output Masks:")
print(train_masks)
print()

print("Validation Set Output Masks:")
print(val_masks)
print()

print("Test Set Output Masks:")
print(test_masks)


Train Set Output Masks:
tensor([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1

This code checks if there are any empty cells (cells with NaN values) in the train, validation, and test sets.
By iterating over the samples and checking for NaN values in the labels tensor, the code determines if there are any empty cells present in the train, validation, and test sets.

In [210]:
# Check if there is an empty cell in the train_set
empty_cells_train = any(torch.isnan(labels).any() for _, labels, _, _ in train_set)
if empty_cells_train:
    print("Train set contains empty cells.")
else:
    print("Train set does not contain empty cells.")

# Check if there is an empty cell in the val_set
empty_cells_val = any(torch.isnan(labels).any() for _, labels, _, _ in val_set)
if empty_cells_val:
    print("Validation set contains empty cells.")
else:
    print("Validation set does not contain empty cells.")

# Check if there is an empty cell in the test_set
empty_cells_test = any(torch.isnan(labels).any() for _, labels, _, _ in test_set)
if empty_cells_test:
    print("Test set contains empty cells.")
else:
    print("Test set does not contain empty cells.")


Train set does not contain empty cells.
Validation set does not contain empty cells.
Test set does not contain empty cells.


In [211]:
# Check if there is an empty cell in the train_set
empty_cells_train = any(torch.isnan(labels).any() for _, labels, _, _ in train_set)
if empty_cells_train:
    print("Train set contains empty cells.")
else:
    print("Train set does not contain empty cells.")

# Check if there is an empty cell in the val_set
empty_cells_val = any(torch.isnan(labels).any() for _, labels, _, _ in val_set)
if empty_cells_val:
    print("Validation set contains empty cells.")
else:
    print("Validation set does not contain empty cells.")

# Check if there is an empty cell in the test_set
empty_cells_test = any(torch.isnan(labels).any() for _, labels, _, _ in test_set)
if empty_cells_test:
    print("Test set contains empty cells.")
else:
    print("Test set does not contain empty cells.")


Train set does not contain empty cells.
Validation set does not contain empty cells.
Test set does not contain empty cells.


In [212]:
# Access globals from the train_set
train_globals = [globals for _, _, _, globals in train_set]
print(train_globals)

# Access globals from the val_set
val_globals = [globals for _, _, _, globals in val_set]
print(val_globals)

# Access globals from the test_set
test_globals = [globals for _, _, _, globals in test_set]
print(test_globals)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        5.7087e-19, 3.3236e-10, 9.6420e-10, 7.1054e-15, 5.8371e-13, 9.0122e-01,
        1.0000e+00, 1.6704e-01, 1.6650e-01, 1.6649e-01, 2.0286e-01, 6.9366e-02,
        7.1054e-15, 1.6835e-01, 1.6798e-01, 6.8719e-10, 8.3841e-01, 1.6433e-01,
        8.3778e-04, 1.6633e-01, 1.6303e-01, 1.0000e+00, 9.5697e-08, 3.4971e-08,
        1.6821e-01, 1.6581e-01, 1.6735e-01, 7.1396e-07, 2.6412e-12, 9.9913e-02,
        2.8681e-10, 3.7774e-01, 4.5062e-03, 1.3325e-01, 3.4730e-02, 1.6148e-09,
        1.8752e-18, 2.0941e-07, 7.1054e-15, 4.9926e-01, 1.6493e-01, 1.3174e-17,
        2.1116e-16, 1.1682e-09, 9.0951e-01, 6.2460e-10, 1.6815e-01, 1.6545e-01,
        1.1711e-13, 0.0000e+00, 1.6467e-01, 1.6692e-01, 0.0000e+00, 5.1007e-08,
        7.1054e-15, 1.5465e-01, 2.7942e-22, 0.0000e+00, 1.6764e-01, 6.3150e-25,
        1.6819e-01, 9.0885e-03, 1.6836e-01, 8.2654e-11, 1.5635e-01, 0.0000e+00,
        0.0000e+00, 2.1135e-02, 2.1135e-02, 2.3882e-20,

In [213]:
# Access masks from the train_set
train_masks = [masks for _, _, masks, _ in train_set]
print(train_masks)

# Access masks from the val_set
val_masks = [masks for _, _, masks, _ in val_set]
print(val_masks)

# Access masks from the test_set
test_masks = [masks for _, _, masks, _ in test_set]
print(test_masks)


[tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tenso

In [214]:
# Access the output masks
train_masks = train_set.masks
val_masks = val_set.masks
test_masks = test_set.masks

# Print the output masks
print("Train Set Output Masks:")
print(train_masks)
print()

print("Validation Set Output Masks:")
print(val_masks)
print()

print("Test Set Output Masks:")
print(test_masks)


Train Set Output Masks:
tensor([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1

In [215]:
# Access the global features
train_globals = [globals for _, _, _, globals in train_set]
val_globals = [globals for _, _, _, globals in val_set]
test_globals = [globals for _, _, _, globals in test_set]

# Print the global features
print("Global Features for Train Set:")
for i, globals in enumerate(train_globals):
    print(f"Molecule {i+1}: {globals}")
print()

print("Global Features for Validation Set:")
for i, globals in enumerate(val_globals):
    print(f"Molecule {i+1}: {globals}")
print()

print("Global Features for Test Set:")
for i, globals in enumerate(test_globals):
    print(f"Molecule {i+1}: {globals}")
print()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        1.4939e-01, 2.5099e-11, 6.0925e-01, 3.4341e-01, 1.0887e-06, 2.3915e-11,
        7.5225e-13, 9.8121e-03, 7.9826e-01, 4.9928e-01, 1.9227e-01, 2.4204e-01,
        8.7687e-01, 9.1700e-03, 1.3219e-02, 1.0000e+00, 9.7380e-03, 2.9713e-03,
        2.0128e-01, 3.7172e-03, 1.8981e-01, 3.4348e-01, 1.8981e-01, 9.6218e-01,
        8.9069e-01, 7.9607e-01, 3.2009e-02, 8.7581e-01, 3.7825e-02, 1.4199e-03,
        9.6265e-03, 7.0728e-01, 1.9367e-01, 9.9386e-22, 1.4249e-01, 6.2963e-02,
        3.4723e-02, 8.6504e-01, 8.3758e-02, 8.4739e-02, 7.3166e-01, 4.9077e-01,
        5.6952e-08, 2.1523e-03, 0.0000e+00, 1.1424e-21, 2.4025e-23, 4.8172e-02,
        1.2694e-01, 4.5625e-01, 3.6088e-15, 1.4598e-01, 9.7548e-01, 9.6076e-01,
        6.2830e-01, 6.3417e-01, 9.9050e-01, 1.5707e-01, 6.0306e-26, 2.3724e-02,
        1.5012e-01, 2.7501e-02, 2.6103e-02, 4.9837e-01, 7.3182e-02, 1.6710e-01,
        4.1633e-01, 8.7862e-24, 1.0863e-01, 3.0449e-02,

In [216]:
# Retrieve the global features for the first molecule in the training set
first_mol_globals = train_set.globals[0]

# Print the global features
print("Global Features for the First Molecule:")
print(first_mol_globals)


Global Features for the First Molecule:
tensor([9.9540e-01, 2.3695e-03, 1.7299e-04, 2.2098e-05, 3.6436e-04, 3.8349e-05,
        1.1994e-05, 6.1438e-03, 9.0236e-05, 3.1133e-02, 6.3535e-07, 3.0868e-06,
        1.7034e-05, 1.3407e-05, 7.1832e-01, 4.5629e-01, 6.8936e-07, 1.0022e-01,
        1.9674e-02, 1.6590e-01, 9.2584e-11, 5.8561e-17, 1.0887e-06, 3.3966e-01,
        7.5225e-13, 6.6737e-04, 9.8909e-01, 1.7764e-01, 1.6418e-02, 9.9939e-01,
        9.9180e-01, 7.6972e-05, 9.2296e-04, 1.0000e+00, 3.7972e-04, 3.9032e-05,
        1.0000e+00, 1.2925e-04, 4.4329e-02, 6.0509e-02, 4.4329e-02, 2.8397e-01,
        9.9396e-01, 2.0284e-01, 4.9500e-02, 9.4105e-01, 4.1243e-02, 1.2130e-04,
        6.9029e-04, 6.1316e-02, 7.9832e-02, 9.9386e-22, 1.4249e-01, 6.2963e-02,
        3.4723e-02, 4.8299e-15, 1.1178e-02, 8.4739e-02, 5.5287e-02, 9.8237e-02,
        5.6952e-08, 2.1523e-03, 0.0000e+00, 1.1424e-21, 2.4025e-23, 2.1217e-02,
        8.7215e-03, 5.7614e-21, 3.6088e-15, 8.9266e-01, 1.7356e-22, 1.1809e-10,


##Data Loader

this code snippet defines a collate function that concatenates the graphs, labels, masks, and globals in a batch of data. It also defines a loader function that creates data loaders for the training, validation, and test sets using the collate function. The data loaders can be used to iterate over the data in batches during training and evaluation.

In [217]:
def collate(batch):
    # batch is a list of tuples (graphs, labels, masks, globals)
    # Concatenate a sequence of graphs
    graphs = [e[0] for e in batch]
    g = dgl.batch(graphs)

    # Concatenate a sequence of tensors (labels) along a new dimension
    labels = [e[1] for e in batch]
    labels = torch.stack(labels, 0)

    # Concatenate a sequence of tensors (masks) along a new dimension
    masks = [e[2] for e in batch]
    masks = torch.stack(masks, 0)

    # Concatenate a sequence of tensors (globals) along a new dimension
    globals = [e[3] for e in batch]
    globals = torch.stack(globals, 0)

    return g, labels, masks, globals


def loader(batch_size=64):
    train_dataloader = DataLoader(train_set,
                              batch_size=batch_size,
                              collate_fn=collate,
                              drop_last=False,
                              shuffle=True,
                              num_workers=1)

    val_dataloader =  DataLoader(val_set,
                             batch_size=batch_size,
                             collate_fn=collate,
                             drop_last=False,
                             shuffle=False,
                             num_workers=1)

    test_dataloader = DataLoader(test_set,
                             batch_size=batch_size,
                             collate_fn=collate,
                             drop_last=False,
                             shuffle=False,
                             num_workers=1)
    return train_dataloader, val_dataloader, test_dataloader

In [218]:
train_dataloader, val_dataloader, test_dataloader = loader(batch_size=64)

 this code snippet iterate over the training data batch by batch, where each batch contains a graph g and other associated information (labels, masks, globals) that can be used for training model.

In [219]:
g,_,_,_=next(iter(train_dataloader))
g

Graph(num_nodes=233, num_edges=340,
      ndata_schemes={'v': Scheme(shape=(128,), dtype=torch.float32)}
      edata_schemes={'e': Scheme(shape=(13,), dtype=torch.float32)})

##Defining A GNN

###Some Variables

In [220]:
#Bace dataset has 1 task. Some other datasets may have some more number of tasks, e.g., tox21 has 12 tasks.
num_tasks = 1

# Size of global feature of each graph
global_size = 200

# Number of epochs to train the model
num_epochs = 100

# Number of steps to wait if the model performance on the validation set does not improve
patience = 10

#Configurations to instantiate the model
config = {"node_feature_size":127, "edge_feature_size":12, "hidden_size":100}


the GNN class defines a GNN model with two graph convolution layers and a fully connected layer. It processes the input graph and produces predictions based on the node and global features. The model is designed for regression tasks andcan handle graphs with varying sizes and edge features.

In [221]:
class GNN(nn.Module):
    def __init__(self, config, global_size=200):
        super().__init__()
        self.config = config


        # Node feature size
        self.node_feature_size = self.config.get('node_feature_size', 127)

        # Edge feature size
        self.edge_feature_size = self.config.get('edge_feature_size', 12)

        # Hidden size
        self.hidden_size = self.config.get('hidden_size', 100)

        self.conv1 = GraphConv(self.node_feature_size, self.hidden_size,allow_zero_in_degree=True)
        self.conv2 = GraphConv(self.hidden_size, self.hidden_size,allow_zero_in_degree=True)  # Adjusted to have the same hidden size
        self.fc = nn.Linear(self.hidden_size,1)  # Added fully connected layer for regression

    def forward(self, mol_dgl_graph, globals):
        mol_dgl_graph.ndata["v"] = mol_dgl_graph.ndata["v"][:, :self.node_feature_size]
        mol_dgl_graph.edata["e"] = mol_dgl_graph.edata["e"][:, :self.edge_feature_size]
        h = self.conv1(mol_dgl_graph, mol_dgl_graph.ndata["v"])
        h = F.relu(h)
        h = self.conv2(mol_dgl_graph, h)
        h = F.relu(h)  # Added ReLU activation after the second convolution
        mol_dgl_graph.ndata["h"] = h
        hg = dgl.mean_nodes(mol_dgl_graph, "h")
        output = self.fc(hg)  # Apply fully connected layer for regression
        return output


##Function to Compute Score of the Model

 the compute_score function evaluates the trained GNN model on a given dataset, calculates the RMSE score for the prediction tasks, and returns the score as a measure of the model's performance.

In [222]:
def compute_score(model, data_loader, val_size):
    model.eval()
    loss_sum=nn.MSELoss(reduction='sum')
    final_loss=0
    with torch.no_grad():
        for i, (mol_dgl_graph, labels, masks, globals) in enumerate(data_loader):
            prediction = model(mol_dgl_graph, globals)
            prediction = torch.tensor(scaler.inverse_transform(prediction.detach().cpu()))
            labels = torch.tensor(scaler.inverse_transform(labels.cpu()))
            loss=loss_sum(prediction,labels)
            final_loss+=loss.item()
        final_loss /=val_size
        final_loss=np.sqrt(final_loss)


    return (final_loss) ** 0.5  # Return the RMSE score


##Loss Function

 the `torch.nn.functional` module provides a collection of utility functions for neural network operations, and the `loss_func` function uses the `torch.nn.MSELoss` function from that module to calculate the mean squared error loss between the `output` and `label` tensors.

In [223]:
import torch.nn.functional as F

def loss_func(output, label):
    criterion = torch.nn.MSELoss(reduction='mean')
    loss = criterion(output, label)
    return loss


##Training and Evaluation

###Training Function

The train_epoch function encapsulates the training loop for one epoch. It performs the forward pass, loss calculation, backpropagation, and optimization steps for each batch in the training data. The average training loss is then returned, which can be used for monitoring the training progress.

In [224]:
def train_epoch(train_dataloader, model, optimizer):
    epoch_train_loss = 0
    iterations = 0
    model.train()  # Prepare model for training
    for i, (mol_dgl_graph, labels, masks, globals) in enumerate(train_dataloader):
        prediction = model(mol_dgl_graph, globals)
        loss_train = loss_func(prediction, labels)
        optimizer.zero_grad(set_to_none=True)
        loss_train.backward()
        optimizer.step()
        epoch_train_loss += loss_train.detach().item()
        iterations += 1
    epoch_train_loss /= iterations
    return epoch_train_loss


the code performs training and evaluation for a GNN model. It iterates over multiple epochs, updating the model based on the training loss and evaluating its performance on the validation set. The best model is saved based on the validation score, and the final results are printed.

In [225]:
def train_evaluate():

    model = GNN(config, global_size)
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

    best_val = np.Inf
    patience_count = 1
    epoch = 1

    while epoch <= num_epochs:
        if patience_count <= patience:
            model.train()
            loss_train = train_epoch(train_dataloader, model, optimizer)
            model.eval()
            score_val = compute_score(model, val_dataloader, len(val_set))
            if score_val < best_val:
                best_val = score_val
                print("Save checkpoint")
                path = os.path.join(checkpoint_path, 'checkpoint.pth')
                dict_checkpoint = {"score_val": score_val}
                dict_checkpoint.update({"model_state_dict": model.state_dict(), "optimizer_state": optimizer.state_dict()})
                with open(path, "wb") as outputfile:
                    cloudpickle.dump(dict_checkpoint, outputfile)
                patience_count = 1
            else:
                print("Patience", patience_count)
                patience_count += 1

            print("Epoch: {}/{} | Training Loss: {:.3f} | Valid Score: {:.3f}".format(
            epoch, num_epochs, loss_train, score_val))

            print(" ")
            print("Epoch: {}/{} | Best Valid Score Until Now: {:.3f}".format(epoch, num_epochs, best_val), "\n")
        epoch += 1

    # best model save
    shutil.rmtree(best_model_path, ignore_errors=True)
    shutil.copytree(checkpoint_path, best_model_path)

    print("Final results:")
    print("Average Valid Score: {:.3f}".format(np.mean(best_val)), "\n")


In [226]:
def train_evaluate():

    model = GNN(config, global_size)
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

    best_val = np.Inf
    patience_count = 1
    epoch = 1

    while epoch <= num_epochs:
        if patience_count <= patience:
            model.train()
            loss_train = train_epoch(train_dataloader, model, optimizer)
            model.eval()
            score_val = compute_score(model, val_dataloader, len(val_set))
            if score_val < best_val:
                best_val = score_val
                print("Save checkpoint")
                path = os.path.join(checkpoint_path, 'checkpoint.pth')
                dict_checkpoint = {"score_val": score_val}
                dict_checkpoint.update({"model_state_dict": model.state_dict(), "optimizer_state": optimizer.state_dict()})
                with open(path, "wb") as outputfile:
                    cloudpickle.dump(dict_checkpoint, outputfile)
                patience_count = 1
            else:
                print("Patience", patience_count)
                patience_count += 1

            print("Epoch: {}/{} | Training Loss: {:.3f} | Valid Score: {:.3f}".format(
            epoch, num_epochs, loss_train, score_val))

            print(" ")
            print("Epoch: {}/{} | Best Valid Score Until Now: {:.3f}".format(epoch, num_epochs, best_val), "\n")
        epoch += 1

    # best model save
    shutil.rmtree(best_model_path, ignore_errors=True)
    shutil.copytree(checkpoint_path, best_model_path)

    print("Final results:")
    print("Average Valid Score: {:.3f}".format(np.mean(best_val)), "\n")


##Function to compute test set score of the final saved model

the test_evaluate function loads the best model checkpoint, evaluates the model on the test dataset, and reports the test score along with the execution time.

In [227]:
def test_evaluate():
    final_model = GNN(config, global_size)
    path = os.path.join(best_model_path, 'checkpoint.pth')
    with open(path, 'rb') as f:
        checkpoint = cloudpickle.load(f)
    final_model.load_state_dict(checkpoint["model_state_dict"])
    final_model.eval()
    test_score = compute_score(final_model, test_dataloader, len(test_set))

    print("Test Score: {:.3f}".format(test_score), "\n")
    print("Execution time: {:.3f} seconds".format(time.time() - start_time))


##Train the model and evaluate its performance

 By calling train_evaluate() and test_evaluate() one after the other, the code performs both training and testing of the graph-based model. The start_time variable is used to calculate and print the total execution time for both operations.

In [228]:
import time
start_time = time.time()

train_evaluate()
test_evaluate()


Save checkpoint
Epoch: 1/100 | Training Loss: 0.892 | Valid Score: 2.058
 
Epoch: 1/100 | Best Valid Score Until Now: 2.058 

Save checkpoint
Epoch: 2/100 | Training Loss: 0.906 | Valid Score: 2.057
 
Epoch: 2/100 | Best Valid Score Until Now: 2.057 

Save checkpoint
Epoch: 3/100 | Training Loss: 0.888 | Valid Score: 2.056
 
Epoch: 3/100 | Best Valid Score Until Now: 2.056 

Save checkpoint
Epoch: 4/100 | Training Loss: 0.927 | Valid Score: 2.055
 
Epoch: 4/100 | Best Valid Score Until Now: 2.055 

Save checkpoint
Epoch: 5/100 | Training Loss: 0.862 | Valid Score: 2.054
 
Epoch: 5/100 | Best Valid Score Until Now: 2.054 

Save checkpoint
Epoch: 6/100 | Training Loss: 0.860 | Valid Score: 2.051
 
Epoch: 6/100 | Best Valid Score Until Now: 2.051 

Save checkpoint
Epoch: 7/100 | Training Loss: 0.910 | Valid Score: 2.047
 
Epoch: 7/100 | Best Valid Score Until Now: 2.047 

Save checkpoint
Epoch: 8/100 | Training Loss: 2.125 | Valid Score: 2.043
 
Epoch: 8/100 | Best Valid Score Until Now: 

##SAGEConv

In [229]:

import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
from dgl.nn import GraphConv
import dgl.function as fn
from dgl.nn import SAGEConv


class SAGEConv(nn.Module):

    def __init__(self, in_feat, out_feat, aggregator_type='mean'):
        super(SAGEConv, self).__init__()
        self.aggregator_type = aggregator_type
        self.linear = nn.Linear(in_feat * 2, out_feat)

    def forward(self, g, h):
        with g.local_scope():
            g.ndata["h"] = h
            g.update_all(
                message_func=fn.copy_u("h", "m"),
                reduce_func=getattr(fn, self.aggregator_type)("m", "h_N"),
            )
            h_N = g.ndata["h_N"]
            h_total = torch.cat([h, h_N], dim=1)
            return self.linear(h_total)

"""this code defines a GNN model that performs graph convolution operations using the SAGEConv module. It applies two SAGEConv layers to obtain node representations and aggregates them to compute the final output of the model."""

class GNN(nn.Module):
    def __init__(self, config, global_size=200):
        super().__init__()
        self.config = config


        # Node feature size
        self.node_feature_size = self.config.get('node_feature_size', 127)

        # Edge feature size
        self.edge_feature_size = self.config.get('edge_feature_size', 12)

        # Hidden size
        self.hidden_size = self.config.get('hidden_size', 100)

        self.conv1 = SAGEConv(self.node_feature_size, self.hidden_size)
        self.conv2 = SAGEConv(self.hidden_size, 1)

    def forward(self, mol_dgl_graph, globals):
        mol_dgl_graph.ndata["v"] = mol_dgl_graph.ndata["v"][:, :self.node_feature_size]
        mol_dgl_graph.edata["e"] = mol_dgl_graph.edata["e"][:, :self.edge_feature_size]
        h = self.conv1(mol_dgl_graph, mol_dgl_graph.ndata["v"])
        h = F.relu(h)
        h = self.conv2(mol_dgl_graph, h)
        mol_dgl_graph.ndata["h"] = h
        return dgl.mean_nodes(mol_dgl_graph, "h")

import time
start_time = time.time()

train_evaluate()
test_evaluate()


Save checkpoint
Epoch: 1/100 | Training Loss: 0.886 | Valid Score: 2.140
 
Epoch: 1/100 | Best Valid Score Until Now: 2.140 

Save checkpoint
Epoch: 2/100 | Training Loss: 0.948 | Valid Score: 2.133
 
Epoch: 2/100 | Best Valid Score Until Now: 2.133 

Save checkpoint
Epoch: 3/100 | Training Loss: 0.861 | Valid Score: 2.121
 
Epoch: 3/100 | Best Valid Score Until Now: 2.121 

Save checkpoint
Epoch: 4/100 | Training Loss: 1.159 | Valid Score: 2.112
 
Epoch: 4/100 | Best Valid Score Until Now: 2.112 

Save checkpoint
Epoch: 5/100 | Training Loss: 0.896 | Valid Score: 2.105
 
Epoch: 5/100 | Best Valid Score Until Now: 2.105 

Save checkpoint
Epoch: 6/100 | Training Loss: 0.877 | Valid Score: 2.097
 
Epoch: 6/100 | Best Valid Score Until Now: 2.097 

Save checkpoint
Epoch: 7/100 | Training Loss: 0.872 | Valid Score: 2.093
 
Epoch: 7/100 | Best Valid Score Until Now: 2.093 

Save checkpoint
Epoch: 8/100 | Training Loss: 0.829 | Valid Score: 2.090
 
Epoch: 8/100 | Best Valid Score Until Now: 

In [230]:

"""##message add and reduce sum

The `SAGEConv1` class extends the `nn.Module` class and defines a SAGEConv convolutional layer. `SAGEConv1` applies a linear transformation to the concatenation of the node features and the sum of their neighbors' features.

The `forward` method of `SAGEConv1` takes a DGL graph `g` and a tensor of node features `h` as inputs. It first sets the node features in the graph `g`, then performs message passing using the `u_add_v` and `sum` built-in DGL functions to compute the sum of the neighbor features for each node. Finally, it concatenates the original node features with the sum of neighbor features, applies a linear transformation, and returns the result.
"""

class SAGEConv1(nn.Module):

    def __init__(self, in_feat, out_feat):
        super(SAGEConv1, self).__init__()

        self.linear = nn.Linear(in_feat * 2, out_feat)

        #The forward method of SAGEConv1 takes a DGL graph g and a tensor of node features h as inputs.
        # It first sets the node features in the graph g, then performs message passing using the u_add_v and sum built-in DGL functions to compute the sum of the neighbor features for each node.
        # Finally, it concatenates the original node features with the sum of neighbor features, applies a linear transformation, and returns the result.


    def forward(self, g, h):
        with g.local_scope():
            g.ndata["h"] = h
            g.update_all(
                message_func=fn.u_add_v("h","h", "m"),
                reduce_func=fn.sum("m", "h_N"),
            )
            h_N = g.ndata["h_N"]
            h_total = torch.cat([h, h_N], dim=1)
            return self.linear(h_total)

"""
The `GNN` class extends the `nn.Module` class and initializes a GNN model. It takes a configuration dictionary (`config`), global size, and number of tasks as arguments. The model has two SAGEConv1 layers (`self.conv1` and `self.conv2`) for performing graph convolution operations.
The `forward` method performs the forward pass of the GNN model. It takes a DGL graph (`mol_dgl_graph`) and global features (`globals`) as inputs. The method first restricts the node features and edge features to their respective sizes by slicing the tensors.

Then, it applies the first SAGEConv1 layer (`self.conv1`) to the node features, followed by a ReLU activation function (`F.relu`). Next, it applies the second SAGEConv1 layer (`self.conv2`) to obtain the final node representations.

The final node representations are stored in the graph with `mol_dgl_graph.ndata["h"] = h`. Finally, the method computes the mean of the node representations using `dgl.mean_nodes` with the feature name "h" and returns the result.

"""

class GNN(nn.Module):
    def __init__(self, config, global_size=200):
        super().__init__()
        self.config = config


        # Node feature size
        self.node_feature_size = self.config.get('node_feature_size', 127)

        # Edge feature size
        self.edge_feature_size = self.config.get('edge_feature_size', 12)

        # Hidden size
        self.hidden_size = self.config.get('hidden_size', 100)

        self.conv1 = SAGEConv1(self.node_feature_size, self.hidden_size)
        self.conv2 = SAGEConv1(self.hidden_size,1)

    def forward(self, mol_dgl_graph, globals):
        mol_dgl_graph.ndata["v"] = mol_dgl_graph.ndata["v"][:, :self.node_feature_size]
        mol_dgl_graph.edata["e"] = mol_dgl_graph.edata["e"][:, :self.edge_feature_size]
        h = self.conv1(mol_dgl_graph, mol_dgl_graph.ndata["v"])
        h = F.relu(h)
        h = self.conv2(mol_dgl_graph, h)
        mol_dgl_graph.ndata["h"] = h
        return dgl.mean_nodes(mol_dgl_graph, "h")

import time
start_time = time.time()

train_evaluate()
test_evaluate()



Save checkpoint
Epoch: 1/100 | Training Loss: 1.037 | Valid Score: 2.024
 
Epoch: 1/100 | Best Valid Score Until Now: 2.024 

Save checkpoint
Epoch: 2/100 | Training Loss: 0.874 | Valid Score: 1.957
 
Epoch: 2/100 | Best Valid Score Until Now: 1.957 

Save checkpoint
Epoch: 3/100 | Training Loss: 0.875 | Valid Score: 1.907
 
Epoch: 3/100 | Best Valid Score Until Now: 1.907 

Save checkpoint
Epoch: 4/100 | Training Loss: 0.769 | Valid Score: 1.894
 
Epoch: 4/100 | Best Valid Score Until Now: 1.894 

Save checkpoint
Epoch: 5/100 | Training Loss: 0.742 | Valid Score: 1.884
 
Epoch: 5/100 | Best Valid Score Until Now: 1.884 

Save checkpoint
Epoch: 6/100 | Training Loss: 0.753 | Valid Score: 1.879
 
Epoch: 6/100 | Best Valid Score Until Now: 1.879 

Patience 1
Epoch: 7/100 | Training Loss: 0.692 | Valid Score: 1.881
 
Epoch: 7/100 | Best Valid Score Until Now: 1.879 

Save checkpoint
Epoch: 8/100 | Training Loss: 0.650 | Valid Score: 1.870
 
Epoch: 8/100 | Best Valid Score Until Now: 1.870

In [231]:
"""##message add and reduce sum"""

class SAGEConv2(nn.Module):
    def __init__(self, in_feat, out_feat):
        super(SAGEConv2, self).__init__()
        self.linear = nn.Linear(in_feat * 2, out_feat)

    def forward(self, g, h):
        with g.local_scope():
            g.ndata["h"] = h
            g.update_all(fn.u_add_v("h", "h", "m"), fn.sum("m", "h_N"))

            h_N = g.ndata.pop("h_N")  # Retrieve the "h_N" feature and remove it from the graph's node data
            h_total = torch.cat([h, h_N], dim=1)
            return self.linear(h_total)


class GNN(nn.Module):
    def __init__(self, config, global_size=200):
        super().__init__()
        self.config = config


        # Node feature size
        self.node_feature_size = self.config.get('node_feature_size', 127)

        # Edge feature size
        self.edge_feature_size = self.config.get('edge_feature_size', 12)

        # Hidden size
        self.hidden_size = self.config.get('hidden_size', 100)

        self.conv1 = SAGEConv2(self.node_feature_size, self.hidden_size)
        self.conv2 = SAGEConv2(self.hidden_size, 1)
#The forward method of SAGEConv2 takes a DGL graph g and a tensor of node features h as inputs.
# It first sets the node features in the graph g, then performs message passing and aggregation using the send_and_recv function.
#It sends messages by applying the "add" message function fn.u_add_v("h", "h", "m") to each edge of the graph.
#The messages are then received and aggregated using the "sum" reduce function fn.sum("m", "h_N") to compute the sum of the neighbor features for each node.
#After the message passing and aggregation step, it retrieves the aggregated neighbor features h_N from the graph data. It concatenates the original node features h with the aggregated neighbor features h_N, applies a linear transformation, and returns the result.
    def forward(self, mol_dgl_graph, globals):
        mol_dgl_graph.ndata["v"] = mol_dgl_graph.ndata["v"][:, :self.node_feature_size]
        mol_dgl_graph.edata["e"] = mol_dgl_graph.edata["e"][:, :self.edge_feature_size]
        h = self.conv1(mol_dgl_graph, mol_dgl_graph.ndata["v"])
        h = F.relu(h)
        h = self.conv2(mol_dgl_graph, h)
        mol_dgl_graph.ndata["h"] = h
        return dgl.mean_nodes(mol_dgl_graph, "h")

#

start_time = time.time()

train_evaluate()
test_evaluate()



Save checkpoint
Epoch: 1/100 | Training Loss: 1.114 | Valid Score: 2.022
 
Epoch: 1/100 | Best Valid Score Until Now: 2.022 

Save checkpoint
Epoch: 2/100 | Training Loss: 0.850 | Valid Score: 2.009
 
Epoch: 2/100 | Best Valid Score Until Now: 2.009 

Save checkpoint
Epoch: 3/100 | Training Loss: 0.874 | Valid Score: 2.001
 
Epoch: 3/100 | Best Valid Score Until Now: 2.001 

Save checkpoint
Epoch: 4/100 | Training Loss: 0.824 | Valid Score: 1.993
 
Epoch: 4/100 | Best Valid Score Until Now: 1.993 

Save checkpoint
Epoch: 5/100 | Training Loss: 0.814 | Valid Score: 1.991
 
Epoch: 5/100 | Best Valid Score Until Now: 1.991 

Save checkpoint
Epoch: 6/100 | Training Loss: 0.995 | Valid Score: 1.975
 
Epoch: 6/100 | Best Valid Score Until Now: 1.975 

Save checkpoint
Epoch: 7/100 | Training Loss: 0.711 | Valid Score: 1.958
 
Epoch: 7/100 | Best Valid Score Until Now: 1.958 

Patience 1
Epoch: 8/100 | Training Loss: 0.725 | Valid Score: 1.961
 
Epoch: 8/100 | Best Valid Score Until Now: 1.958

In [232]:
"""##message div and reduce max

this code defines a SAGEConv3 module for graph convolutional operations, which uses "div" as the message function to compute element-wise division of node features and "mean" as the reduce function to compute the mean of the aggregated messages from neighboring nodes.
"""

class SAGEConv3(nn.Module):

    def __init__(self, in_feat, out_feat):
        super(SAGEConv3, self).__init__()

        self.linear = nn.Linear(in_feat * 2, out_feat)

    def forward(self, g, h):
        with g.local_scope():
            g.ndata["h"] = h
            g.update_all(
                message_func=fn.u_div_v("h","h", "m"),
                reduce_func=fn.mean("m", "h_N"),
            )
            h_N = g.ndata["h_N"]
            h_total = torch.cat([h, h_N], dim=1)
            return self.linear(h_total)

class GNN(nn.Module):
    def __init__(self, config, global_size=200):
        super().__init__()
        self.config = config


        # Node feature size
        self.node_feature_size = self.config.get('node_feature_size', 127)

        # Edge feature size
        self.edge_feature_size = self.config.get('edge_feature_size', 12)

        # Hidden size
        self.hidden_size = self.config.get('hidden_size', 100)

        self.conv1 = SAGEConv3(self.node_feature_size, self.hidden_size)
        self.conv2 = SAGEConv3(self.hidden_size, 1)

    def forward(self, mol_dgl_graph, globals):
        mol_dgl_graph.ndata["v"] = mol_dgl_graph.ndata["v"][:, :self.node_feature_size]
        mol_dgl_graph.edata["e"] = mol_dgl_graph.edata["e"][:, :self.edge_feature_size]
        h = self.conv1(mol_dgl_graph, mol_dgl_graph.ndata["v"])
        h = F.relu(h)
        h = self.conv2(mol_dgl_graph, h)
        mol_dgl_graph.ndata["h"] = h
        return dgl.mean_nodes(mol_dgl_graph, "h")

start_time = time.time()

train_evaluate()
test_evaluate()



Patience 1
Epoch: 1/100 | Training Loss: nan | Valid Score: nan
 
Epoch: 1/100 | Best Valid Score Until Now: inf 

Patience 2
Epoch: 2/100 | Training Loss: nan | Valid Score: nan
 
Epoch: 2/100 | Best Valid Score Until Now: inf 

Patience 3
Epoch: 3/100 | Training Loss: nan | Valid Score: nan
 
Epoch: 3/100 | Best Valid Score Until Now: inf 

Patience 4
Epoch: 4/100 | Training Loss: nan | Valid Score: nan
 
Epoch: 4/100 | Best Valid Score Until Now: inf 

Patience 5
Epoch: 5/100 | Training Loss: nan | Valid Score: nan
 
Epoch: 5/100 | Best Valid Score Until Now: inf 

Patience 6
Epoch: 6/100 | Training Loss: nan | Valid Score: nan
 
Epoch: 6/100 | Best Valid Score Until Now: inf 

Patience 7
Epoch: 7/100 | Training Loss: nan | Valid Score: nan
 
Epoch: 7/100 | Best Valid Score Until Now: inf 

Patience 8
Epoch: 8/100 | Training Loss: nan | Valid Score: nan
 
Epoch: 8/100 | Best Valid Score Until Now: inf 

Patience 9
Epoch: 9/100 | Training Loss: nan | Valid Score: nan
 
Epoch: 9/100 |

In [233]:
"""##message sub and reduce mean

this code defines a SAGEConv4 module for graph convolutional operations, which uses "sub" as the message function to compute element-wise subtraction of node features and "mean" as the reduce function to compute the mean of the aggregated messages from neighboring nodes.
"""

class SAGEConv4(nn.Module):

    def __init__(self, in_feat, out_feat):
        super(SAGEConv4, self).__init__()

        self.linear = nn.Linear(in_feat * 2, out_feat)
#The forward method of SAGEConv4 takes a DGL graph g and a tensor of node features h as inputs. It first sets the node features in the graph g, then performs message passing and aggregation using the update_all function. It sends messages by applying the "sub" message function fn.u_sub_v("h", "h", "m") to each edge of the graph. The messages are then received and aggregated using the "mean" reduce function fn.mean("m", "h_N") to compute the mean of the neighbor features for each node.

    def forward(self, g, h):
        with g.local_scope():
            g.ndata["h"] = h
            g.update_all(
                message_func = fn.u_sub_v('h', 'h', 'm'),
                reduce_func=fn.mean("m", "h_N"),
            )
            h_N = g.ndata["h_N"]
            h_total = torch.cat([h, h_N], dim=1)
            return self.linear(h_total)

class GNN(nn.Module):
    def __init__(self, config, global_size=200):
        super().__init__()
        self.config = config


        # Node feature size
        self.node_feature_size = self.config.get('node_feature_size', 127)

        # Edge feature size
        self.edge_feature_size = self.config.get('edge_feature_size', 12)

        # Hidden size
        self.hidden_size = self.config.get('hidden_size', 100)

        self.conv1 = SAGEConv4(self.node_feature_size, self.hidden_size)
        self.conv2 = SAGEConv4(self.hidden_size, 1)

    def forward(self, mol_dgl_graph, globals):
        mol_dgl_graph.ndata["v"] = mol_dgl_graph.ndata["v"][:, :self.node_feature_size]
        mol_dgl_graph.edata["e"] = mol_dgl_graph.edata["e"][:, :self.edge_feature_size]
        h = self.conv1(mol_dgl_graph, mol_dgl_graph.ndata["v"])
        h = F.relu(h)
        h = self.conv2(mol_dgl_graph, h)
        mol_dgl_graph.ndata["h"] = h
        return dgl.mean_nodes(mol_dgl_graph, "h")

start_time = time.time()

train_evaluate()
test_evaluate()



Save checkpoint
Epoch: 1/100 | Training Loss: 0.942 | Valid Score: 2.071
 
Epoch: 1/100 | Best Valid Score Until Now: 2.071 

Patience 1
Epoch: 2/100 | Training Loss: 0.934 | Valid Score: 2.072
 
Epoch: 2/100 | Best Valid Score Until Now: 2.071 

Patience 2
Epoch: 3/100 | Training Loss: 0.960 | Valid Score: 2.072
 
Epoch: 3/100 | Best Valid Score Until Now: 2.071 

Save checkpoint
Epoch: 4/100 | Training Loss: 0.871 | Valid Score: 2.070
 
Epoch: 4/100 | Best Valid Score Until Now: 2.070 

Save checkpoint
Epoch: 5/100 | Training Loss: 0.898 | Valid Score: 2.069
 
Epoch: 5/100 | Best Valid Score Until Now: 2.069 

Save checkpoint
Epoch: 6/100 | Training Loss: 0.935 | Valid Score: 2.069
 
Epoch: 6/100 | Best Valid Score Until Now: 2.069 

Patience 1
Epoch: 7/100 | Training Loss: 0.854 | Valid Score: 2.069
 
Epoch: 7/100 | Best Valid Score Until Now: 2.069 

Save checkpoint
Epoch: 8/100 | Training Loss: 0.850 | Valid Score: 2.069
 
Epoch: 8/100 | Best Valid Score Until Now: 2.069 

Save ch

In [234]:
"""##message mul and reduce mean

this code defines a SAGEConv5 module for graph convolutional operations, which uses "mul" as the message function to compute element-wise multiplication of node features and "mean" as the reduce function to compute the mean of the aggregated messages from neighboring nodes.
"""

class SAGEConv5(nn.Module):

    def __init__(self, in_feat, out_feat):
        super(SAGEConv5, self).__init__()

        self.linear = nn.Linear(in_feat * 2, out_feat)
#It performs message forwarding and aggregation using the ``update_all'' function. Sends messages by applying the "mul" message function "fn.u_mul_v("h", "h", "m")" to each edge of the graph. The messages are then received and summed using the mean reduction function "fn.mean("m", "h_N")" to calculate the average of the element-wise multiplied messages from the neighboring nodes.
    def forward(self, g, h):
        with g.local_scope():
            g.ndata["h"] = h
            g.update_all(
                message_func = fn.u_mul_v('h', 'h', 'm'),
                reduce_func=fn.mean("m", "h_N"),
            )
            h_N = g.ndata["h_N"]
            h_total = torch.cat([h, h_N], dim=1)
            return self.linear(h_total)

class GNN(nn.Module):
    def __init__(self, config, global_size=200):
        super().__init__()
        self.config = config


        # Node feature size
        self.node_feature_size = self.config.get('node_feature_size', 127)

        # Edge feature size
        self.edge_feature_size = self.config.get('edge_feature_size', 12)

        # Hidden size
        self.hidden_size = self.config.get('hidden_size', 100)

        self.conv1 = SAGEConv5(self.node_feature_size, self.hidden_size)
        self.conv2 = SAGEConv5(self.hidden_size,1)

    def forward(self, mol_dgl_graph, globals):
        mol_dgl_graph.ndata["v"] = mol_dgl_graph.ndata["v"][:, :self.node_feature_size]
        mol_dgl_graph.edata["e"] = mol_dgl_graph.edata["e"][:, :self.edge_feature_size]
        h = self.conv1(mol_dgl_graph, mol_dgl_graph.ndata["v"])
        h = F.relu(h)
        h = self.conv2(mol_dgl_graph, h)
        mol_dgl_graph.ndata["h"] = h
        return dgl.mean_nodes(mol_dgl_graph, "h")

start_time = time.time()

train_evaluate()
test_evaluate()



Save checkpoint
Epoch: 1/100 | Training Loss: 0.923 | Valid Score: 2.120
 
Epoch: 1/100 | Best Valid Score Until Now: 2.120 

Save checkpoint
Epoch: 2/100 | Training Loss: 0.884 | Valid Score: 2.117
 
Epoch: 2/100 | Best Valid Score Until Now: 2.117 

Save checkpoint
Epoch: 3/100 | Training Loss: 0.878 | Valid Score: 2.113
 
Epoch: 3/100 | Best Valid Score Until Now: 2.113 

Save checkpoint
Epoch: 4/100 | Training Loss: 1.422 | Valid Score: 2.108
 
Epoch: 4/100 | Best Valid Score Until Now: 2.108 

Save checkpoint
Epoch: 5/100 | Training Loss: 0.868 | Valid Score: 2.102
 
Epoch: 5/100 | Best Valid Score Until Now: 2.102 

Save checkpoint
Epoch: 6/100 | Training Loss: 0.905 | Valid Score: 2.098
 
Epoch: 6/100 | Best Valid Score Until Now: 2.098 

Save checkpoint
Epoch: 7/100 | Training Loss: 0.864 | Valid Score: 2.094
 
Epoch: 7/100 | Best Valid Score Until Now: 2.094 

Save checkpoint
Epoch: 8/100 | Training Loss: 0.855 | Valid Score: 2.090
 
Epoch: 8/100 | Best Valid Score Until Now: 

In [235]:
"""##message mul and reduce sum

this code defines a SAGEConv6 module for graph convolutional operations, which uses "mul" as the message function to compute element-wise multiplication of node features and "sum" as the reduce function to compute the sum of the aggregated messages from neighboring nodes.
"""

class SAGEConv6(nn.Module):

    def __init__(self, in_feat, out_feat):
        super(SAGEConv6, self).__init__()

        self.linear = nn.Linear(in_feat * 2, out_feat)

    def forward(self, g, h):
        with g.local_scope():
            g.ndata["h"] = h
            g.update_all(
                message_func = fn.u_mul_v('h', 'h', 'm'),
                reduce_func=fn.sum("m", "h_N"),
            )
            h_N = g.ndata["h_N"]
            h_total = torch.cat([h, h_N], dim=1)
            return self.linear(h_total)

class GNN(nn.Module):
    def __init__(self, config, global_size=200):
        super().__init__()
        self.config = config


        # Node feature size
        self.node_feature_size = self.config.get('node_feature_size', 127)

        # Edge feature size
        self.edge_feature_size = self.config.get('edge_feature_size', 12)

        # Hidden size
        self.hidden_size = self.config.get('hidden_size', 100)

        self.conv1 = SAGEConv6(self.node_feature_size, self.hidden_size)
        self.conv2 = SAGEConv6(self.hidden_size, 1)

    def forward(self, mol_dgl_graph, globals):
        mol_dgl_graph.ndata["v"] = mol_dgl_graph.ndata["v"][:, :self.node_feature_size]
        mol_dgl_graph.edata["e"] = mol_dgl_graph.edata["e"][:, :self.edge_feature_size]
        h = self.conv1(mol_dgl_graph, mol_dgl_graph.ndata["v"])
        h = F.relu(h)
        h = self.conv2(mol_dgl_graph, h)
        mol_dgl_graph.ndata["h"] = h
        return dgl.mean_nodes(mol_dgl_graph, "h")

start_time = time.time()

train_evaluate()
test_evaluate()



Save checkpoint
Epoch: 1/100 | Training Loss: 0.974 | Valid Score: 2.146
 
Epoch: 1/100 | Best Valid Score Until Now: 2.146 

Save checkpoint
Epoch: 2/100 | Training Loss: 0.877 | Valid Score: 2.135
 
Epoch: 2/100 | Best Valid Score Until Now: 2.135 

Save checkpoint
Epoch: 3/100 | Training Loss: 1.025 | Valid Score: 2.127
 
Epoch: 3/100 | Best Valid Score Until Now: 2.127 

Save checkpoint
Epoch: 4/100 | Training Loss: 0.859 | Valid Score: 2.122
 
Epoch: 4/100 | Best Valid Score Until Now: 2.122 

Save checkpoint
Epoch: 5/100 | Training Loss: 0.935 | Valid Score: 2.116
 
Epoch: 5/100 | Best Valid Score Until Now: 2.116 

Save checkpoint
Epoch: 6/100 | Training Loss: 1.212 | Valid Score: 2.107
 
Epoch: 6/100 | Best Valid Score Until Now: 2.107 

Save checkpoint
Epoch: 7/100 | Training Loss: 0.843 | Valid Score: 2.098
 
Epoch: 7/100 | Best Valid Score Until Now: 2.098 

Save checkpoint
Epoch: 8/100 | Training Loss: 0.857 | Valid Score: 2.090
 
Epoch: 8/100 | Best Valid Score Until Now: 

`SAGEConv7`: This class extends the `nn.Module` base class from PyTorch and defines a custom graph convolutional layer using the SAGE (GraphSAGE) algorithm. It takes the input feature size `in_feat` and output feature size `out_feat` as arguments. The `forward` method performs the graph convolution operation using element-wise multiplication as the message function (`fn.u_mul_v`) and sum reduction (`fn.sum`) to aggregate messages from neighboring nodes. The resulting node features are concatenated with the original node features, passed through a linear transformation, and returned.

- `GNN`: This class extends the `nn.Module` base class and implements a Graph Neural Network model. It takes a `config` dictionary, `global_size`, and `num_tasks` as arguments. The class defines the architecture of the GNN model, which includes three instances of the `SAGEConv9` layer (`conv1`, `conv2`, `conv3`). It also includes batch normalization layers (`bn1`, `bn2`) between the convolutional layers. The `forward` method performs the forward pass of the model, applying the graph convolutional layers on the input graph `mol_dgl_graph`. It first truncates the node and edge features to the desired sizes, applies the convolutional layers with batch normalization and ReLU activation, and computes the mean of the resulting node features.
At first we work with 3 layers

In [236]:
#add layers

import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
from dgl.nn import GraphConv
import dgl.function as fn
from dgl.nn import SAGEConv
import time


class SAGEConv7(nn.Module):
    def __init__(self, in_feat, out_feat):
        super(SAGEConv9, self).__init__()
        self.linear = nn.Linear(in_feat * 2, out_feat)

    def forward(self, g, h):
        with g.local_scope():
            g.ndata["h"] = h
            g.update_all(
                message_func=fn.u_mul_v('h', 'h', 'm'),
                reduce_func=fn.sum("m", "h_N"),
            )
            h_N = g.ndata["h_N"]
            h_total = torch.cat([h, h_N], dim=1)
            return self.linear(h_total)


class GNN(nn.Module):
    def __init__(self, config, global_size=200, num_tasks=1):
        super().__init__()
        self.config = config
        self.num_tasks = num_tasks

        # Node feature size
        self.node_feature_size = self.config.get('node_feature_size', 127)

        # Edge feature size
        self.edge_feature_size = self.config.get('edge_feature_size', 12)

        # Hidden size
        self.hidden_size = self.config.get('hidden_size', 100)

        self.conv1 = SAGEConv9(self.node_feature_size, self.hidden_size)
        self.bn1 = nn.BatchNorm1d(self.hidden_size)  # Define batch normalization layer bn1
        self.conv2 = SAGEConv9(self.hidden_size, self.hidden_size)
        self.bn2 = nn.BatchNorm1d(self.hidden_size)  # Define batch normalization layer bn2
        self.conv3 = SAGEConv9(self.hidden_size, self.num_tasks)

    def forward(self, mol_dgl_graph, globals):
        mol_dgl_graph.ndata["v"] = mol_dgl_graph.ndata["v"][:, :self.node_feature_size]
        mol_dgl_graph.edata["e"] = mol_dgl_graph.edata["e"][:, :self.edge_feature_size]
        h = self.conv1(mol_dgl_graph, mol_dgl_graph.ndata["v"])
        h = self.bn1(h)
        h = F.relu(h)
        h = self.conv2(mol_dgl_graph, h)
        h = self.bn2(h)
        h = F.relu(h)
        h = self.conv3(mol_dgl_graph, h)
        mol_dgl_graph.ndata["h"] = h
        return dgl.mean_nodes(mol_dgl_graph, "h")

config = {}  # Replace with your config
gnn = GNN(config, num_tasks=1)  # Set num_tasks to 1 for binary classification

start_time = time.time()

train_evaluate()
test_evaluate()


Save checkpoint
Epoch: 1/100 | Training Loss: 2.494 | Valid Score: 2.075
 
Epoch: 1/100 | Best Valid Score Until Now: 2.075 

Save checkpoint
Epoch: 2/100 | Training Loss: 1.321 | Valid Score: 2.063
 
Epoch: 2/100 | Best Valid Score Until Now: 2.063 

Save checkpoint
Epoch: 3/100 | Training Loss: 1.023 | Valid Score: 2.049
 
Epoch: 3/100 | Best Valid Score Until Now: 2.049 

Save checkpoint
Epoch: 4/100 | Training Loss: 1.038 | Valid Score: 2.040
 
Epoch: 4/100 | Best Valid Score Until Now: 2.040 

Patience 1
Epoch: 5/100 | Training Loss: 0.763 | Valid Score: 2.101
 
Epoch: 5/100 | Best Valid Score Until Now: 2.040 

Save checkpoint
Epoch: 6/100 | Training Loss: 0.800 | Valid Score: 2.036
 
Epoch: 6/100 | Best Valid Score Until Now: 2.036 

Patience 1
Epoch: 7/100 | Training Loss: 0.701 | Valid Score: 2.100
 
Epoch: 7/100 | Best Valid Score Until Now: 2.036 

Patience 2
Epoch: 8/100 | Training Loss: 0.745 | Valid Score: 2.224
 
Epoch: 8/100 | Best Valid Score Until Now: 2.036 

Patienc