In [174]:
!pip install dgl



In [175]:
%matplotlib inline
import os
import shutil
import time

# DGL reads this variable at import time, so it must be set before `import dgl`.
os.environ["DGLBACKEND"] = "pytorch"

import cloudpickle
import dgl
import dgl.function as fn
import networkx as nx
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn import GraphConv
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader

## Set Path

This cell sets up the working directories and data. It creates the checkpoint directory, removes any previously extracted data directory, and extracts the contents of the dataset ZIP archive into a fresh one.

In [176]:
# Root directory for everything this notebook reads and writes.
current_dir = "./"
# Directory holding the running checkpoint written during training.
checkpoint_path = current_dir + "save_models/model_checkpoints/" + "checkpoint"
os.makedirs(checkpoint_path, exist_ok=True)

# Destination the checkpoint directory is copied to once training has finished.
best_model_path = current_dir + "save_models/best_model/"

# Start from a clean extraction directory; a missing directory is not an error.
folder_data_temp = current_dir +"data_temp/"
shutil.rmtree(folder_data_temp, ignore_errors=True)

# Unpack the dataset archive into the freshly cleared directory.
path_save = current_dir + "free.zip"
shutil.unpack_archive(path_save, folder_data_temp)

## Custom PyTorch Datasets

The `DGLDatasetReg` class is a custom dataset for regression tasks built on the Deep Graph Library (DGL). It loads the graphs together with their labels, masks, and global features from a DGL binary file, and it supports optional scaling of the labels through a scaler fitted on the training split. A `transform` argument is accepted and stored for later customization.

In [177]:
class DGLDatasetReg(torch.utils.data.Dataset):
    """Regression dataset backed by a DGL graph file.

    Loads ``address + ".bin"`` with ``dgl.load_graphs`` and serves each sample as
    the tuple ``(graph, label, mask, globals)``.

    Parameters
    ----------
    address : str
        Path of the graph file without the ``.bin`` extension.
    transform : optional
        Stored on the instance; it is not applied anywhere in this class.
    train : bool
        When true, ``scaler_method`` fits a new ``StandardScaler`` on the labels.
    scaler : optional
        Already-fitted scaler (anything with a ``transform`` method), typically
        the one returned by the training split's ``scaler_method``.
    scaler_regression : optional
        When truthy, ``__getitem__`` returns labels passed through ``scaler``;
        otherwise the raw labels are returned.
    """

    def __init__(self, address, transform=None, train=False, scaler=None, scaler_regression=None):
        self.train = train
        self.scaler = scaler
        self.data_set, train_labels_masks_globals = dgl.load_graphs(address + ".bin")
        num_graphs = len(self.data_set)
        # Flatten every per-graph tensor to one row per graph.
        self.labels = train_labels_masks_globals["labels"].view(num_graphs, -1)
        self.masks = train_labels_masks_globals["masks"].view(num_graphs, -1)
        self.globals = train_labels_masks_globals["globals"].view(num_graphs, -1)
        self.transform = transform
        self.scaler_regression = scaler_regression

    def scaler_method(self):
        """Return the label scaler, fitting a fresh one first on a training split."""
        if self.train:
            self.scaler = preprocessing.StandardScaler().fit(self.labels)
        return self.scaler

    def __len__(self):
        """Number of graphs in the dataset."""
        return len(self.data_set)

    def __getitem__(self, idx):
        """Return ``(graph, label, mask, globals)`` for sample ``idx``.

        Raises
        ------
        ValueError
            If ``scaler_regression`` is set but no scaler is available.
        """
        if self.scaler_regression:
            if self.scaler is None:
                raise ValueError(
                    "scaler_regression is set but no scaler is available; pass "
                    "`scaler` or call scaler_method() on a training dataset first"
                )
            # Scale only the requested row instead of the whole label matrix,
            # which the previous version recomputed on every single access.
            row = self.labels[idx].reshape(1, -1).detach().cpu().numpy()
            label = torch.tensor(self.scaler.transform(row)[0]).float()
        else:
            label = self.labels[idx].float()
        return self.data_set[idx], label, self.masks[idx], self.globals[idx]


## Defining Train, Validation, and Test Set

This code constructs the training, validation, and test datasets with the `DGLDatasetReg` class. A scaler is fitted on the training labels and handed to the validation and test sets, so that all three splits can be scaled consistently with the training set.

In [178]:
from sklearn import preprocessing

In [179]:
# Common prefix of the three split files: <folder_data_temp>scaffold_0
path_data_temp = folder_data_temp + "scaffold" + "_" + str(0)
# train=True makes scaler_method() fit a StandardScaler on the training labels.
train_set = DGLDatasetReg(address=path_data_temp + "_train", train=True)
scaler = train_set.scaler_method()
# Validation and test receive the scaler fitted on the training labels.
# NOTE: scaler_regression is left at its default (None) for all three splits,
# so __getitem__ returns the unscaled labels.
val_set = DGLDatasetReg(address=path_data_temp + "_val", scaler=scaler)
test_set = DGLDatasetReg(address=path_data_temp + "_test", scaler=scaler)

print(len(train_set), len(val_set), len(test_set))

513 64 65


In [180]:
# path_data_temp = folder_data_temp + "scaffold"+"_"+str(0)
# train_data =DGLDatasetReg(address=path_data_temp+"_train")
# val_data = DGLDatasetReg(address=path_data_temp+"_val")
# test_data = DGLDatasetReg(address=path_data_temp+"_test")

# print(len(train_data), len(val_data), len(test_data))


In [181]:
# def scale_data(train_data, val_data, test_data, scaler):
#     # Fit scaler on training data and transform training, validation, and test data
#     train_features, train_labels = train_data
#     val_features, val_labels = val_data
#     test_features, test_labels = test_data

#     scaler.fit(train_features)

#     train_features_scaled = scaler.transform(train_features)
#     val_features_scaled = scaler.transform(val_features)
#     test_features_scaled = scaler.transform(test_features)

#     train_data_scaled = (train_features_scaled, train_labels)
#     val_data_scaled = (val_features_scaled, val_labels)
#     test_data_scaled = (test_features_scaled, test_labels)

#     return train_data_scaled, val_data_scaled, test_data_scaled

## Data Loader

This code defines a `collate` function that merges the graphs of a batch into one batched graph and stacks the labels, masks, and global features into tensors. It also defines a `loader` function that uses `collate` to create data loaders for the training, validation, and test sets. The data loaders are used to iterate over the data in batches during training and evaluation.

In [182]:
from sklearn import preprocessing

In [183]:
def collate(batch):
    """Merge a list of ``(graph, label, mask, globals)`` samples into one batch.

    Returns the batched DGL graph followed by the stacked label, mask and
    global-feature tensors; the sample index is the new leading dimension.
    """
    graph_list, label_rows, mask_rows, global_rows = [], [], [], []
    for sample in batch:
        graph_list.append(sample[0])
        label_rows.append(sample[1])
        mask_rows.append(sample[2])
        global_rows.append(sample[3])

    # One large graph that holds every sample's graph.
    batched_graph = dgl.batch(graph_list)

    # Labels are cast to float after stacking.
    stacked_labels = torch.stack(label_rows, 0).float()
    stacked_masks = torch.stack(mask_rows, 0)
    stacked_globals = torch.stack(global_rows, 0)

    return batched_graph, stacked_labels, stacked_masks, stacked_globals


def loader(batch_size=64):
    """Create the train, validation and test ``DataLoader`` objects.

    The loaders wrap the module-level ``train_set``, ``val_set`` and
    ``test_set``. Only the training loader shuffles its samples.
    """
    def _build(dataset, shuffle):
        # Every setting except the dataset and the shuffle flag is shared.
        return DataLoader(dataset,
                          batch_size=batch_size,
                          collate_fn=collate,
                          drop_last=False,
                          shuffle=shuffle,
                          num_workers=1)

    train_dataloader = _build(train_set, True)
    val_dataloader = _build(val_set, False)
    test_dataloader = _build(test_set, False)
    return train_dataloader, val_dataloader, test_dataloader


In [184]:
# Build the three loaders once; train_evaluate() and test_evaluate() read them
# as module-level globals.
train_dataloader, val_dataloader, test_dataloader = loader(batch_size=64)

## Defining A GNN

### Some Variables

In [185]:
# Number of prediction targets per graph. The original note says the Bace
# dataset has 1 task, while other datasets may have more (e.g. tox21 has 12).
# NOTE(review): the archive loaded above is "free.zip" -- confirm which dataset
# this note should name.
num_tasks = 1

# Size of the global feature vector of each graph. It is passed to GNN, whose
# layers as defined in this notebook do not use it.
global_size = 200

# Maximum number of epochs to train the model
num_epochs = 100

# Number of epochs to wait when the validation score does not improve
patience = 10

# Configuration read by GNN through config.get(...)
config = {"node_feature_size":127, "edge_feature_size":12, "hidden_size":100}


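The `GNN` class defines a model that stacks four `SAGEConv` graph-convolution layers with ReLU activations between them. The last layer outputs `num_tasks` values per node, and these are averaged over the nodes of each graph to produce the graph-level prediction. The model is designed for regression tasks and can handle graphs of varying sizes. Note that the global features and the edge features are passed to the model but are not used by its layers.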

In [186]:
import torch
import torch.nn as nn
import dgl.function as fn

class SAGEConv(nn.Module):
    """GraphSAGE-style convolution layer.

    Each node's own features are concatenated with an aggregate of the
    features sent along its incoming edges, and the result goes through a
    single linear layer.

    Parameters
    ----------
    in_feat : int
        Input feature size.
    out_feat : int
        Output feature size.
    aggregator_type : str
        Name of the reducer looked up in ``dgl.function``, e.g. 'mean',
        'max', 'sum'.
    """

    def __init__(self, in_feat, out_feat, aggregator_type='mean'):
        super().__init__()
        self.aggregator_type = aggregator_type
        # Input width doubles: [own features ; aggregated neighbour features].
        self.linear = nn.Linear(2 * in_feat, out_feat)

    def forward(self, g, h):
        """Apply the layer to node features ``h`` on graph ``g``."""
        reducer = getattr(fn, self.aggregator_type)
        # local_scope keeps the temporary "h"/"h_N" fields off the caller's graph.
        with g.local_scope():
            g.ndata["h"] = h
            g.update_all(fn.copy_u("h", "m"), reducer("m", "h_N"))
            neighbour_summary = g.ndata["h_N"]
            combined = torch.cat((h, neighbour_summary), dim=1)
            return self.linear(combined)


In [187]:
class GNN(nn.Module):
    """Graph-level regressor built from four ``SAGEConv`` layers.

    Parameters
    ----------
    config : dict
        May provide ``node_feature_size`` (default 127), ``edge_feature_size``
        (default 12) and ``hidden_size`` (default 100).
    global_size : int
        Accepted for interface compatibility; not used by the layers.
    num_tasks : int
        Number of values predicted per graph.
    """

    def __init__(self, config, global_size=200, num_tasks=1):
        super().__init__()
        self.config = config
        self.num_tasks = num_tasks

        self.node_feature_size = self.config.get('node_feature_size', 127)
        self.edge_feature_size = self.config.get('edge_feature_size', 12)
        self.hidden_size = self.config.get('hidden_size', 100)

        self.conv1 = SAGEConv(self.node_feature_size, self.hidden_size)
        self.conv2 = SAGEConv(self.hidden_size, self.hidden_size)
        self.conv3 = SAGEConv(self.hidden_size, self.hidden_size)
        self.conv4 = SAGEConv(self.hidden_size, self.num_tasks)

    def forward(self, mol_dgl_graph, globals):
        """Return a ``(num_graphs, num_tasks)`` prediction for a batched graph.

        ``globals`` is accepted for interface compatibility and is not used.
        The input graph is left unmodified: node features are read into a
        local tensor and the temporary "h" field lives only inside
        ``local_scope``. Earlier versions overwrote ``ndata["v"]`` and
        ``edata["e"]`` and left ``ndata["h"]`` on the caller's graph.
        """
        with mol_dgl_graph.local_scope():
            # Keep only the first node_feature_size columns of the node features.
            h = mol_dgl_graph.ndata["v"][:, :self.node_feature_size]
            h = F.relu(self.conv1(mol_dgl_graph, h))
            h = F.relu(self.conv2(mol_dgl_graph, h))
            h = F.relu(self.conv3(mol_dgl_graph, h))
            # No activation after the last layer: its output is the per-node
            # prediction that gets averaged into the graph-level readout.
            mol_dgl_graph.ndata["h"] = self.conv4(mol_dgl_graph, h)
            return dgl.mean_nodes(mol_dgl_graph, "h")


## Function to Compute Score of the Model

The `compute_score` function evaluates the GNN model on a given dataset, computes the root-mean-square error (RMSE) over the prediction tasks using only the entries marked valid by the masks, and returns it as the measure of the model's performance (lower is better).

In [188]:
from sklearn.metrics import mean_squared_error

def compute_score(model, data_loader, val_size, num_tasks, scaler=None):
    """Return the RMSE of ``model`` over every batch of ``data_loader``.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(graph, globals)``; switched to eval mode here.
    data_loader : iterable
        Yields ``(graph, labels, masks, globals)`` batches.
    val_size : int
        Unused; kept so existing callers keep working.
    num_tasks : int
        Number of label columns to score.
    scaler : optional
        Fitted label scaler. When given, predictions and labels are mapped back
        to the original label scale with ``inverse_transform`` before scoring.
        The earlier implementation applied it to ``ndata['feat']``, which
        cannot work with a scaler fitted on the labels.

    Returns
    -------
    float
        Square root of the per-task mean squared error averaged over tasks.
        Only entries whose mask equals 1 are scored.

    Raises
    ------
    ValueError
        If ``data_loader`` yields no batches.
    """
    model.eval()
    prediction_batches, label_batches, mask_batches = [], [], []
    with torch.no_grad():
        for mol_dgl_graph, labels, masks, global_feats in data_loader:
            prediction_batches.append(model(mol_dgl_graph, global_feats).cpu())
            label_batches.append(labels.cpu())
            mask_batches.append(masks.cpu())

    if not prediction_batches:
        raise ValueError("data_loader yielded no batches; nothing to score")

    # Concatenate once at the end instead of growing tensors inside the loop.
    prediction_all = torch.cat(prediction_batches, 0)
    labels_all = torch.cat(label_batches, 0)
    masks_all = torch.cat(mask_batches, 0)

    if scaler is not None:
        # Report the error in the original label units.
        prediction_all = torch.as_tensor(
            scaler.inverse_transform(prediction_all.numpy()), dtype=torch.float32)
        labels_all = torch.as_tensor(
            scaler.inverse_transform(labels_all.numpy()), dtype=torch.float32)

    total_mse = 0.0
    for task in range(num_tasks):
        valid = masks_all[:, task] == 1
        y_pred = prediction_all[:, task][valid].numpy()
        y_true = labels_all[:, task][valid].numpy()
        total_mse += float(mean_squared_error(y_true, y_pred))
    return (total_mse / num_tasks) ** 0.5  # Return the RMSE score


## Loss Function

The `loss_func` function calculates the element-wise MSE loss between the model's output and the target labels and multiplies it by the mask, so that missing or padded values do not contribute. The masked loss is then summed and divided by the number of valid elements in the batch, and the result is returned.

In [189]:
import torch.nn.functional as F

def loss_func(output, label, mask, num_tasks):
    """Masked mean-squared-error loss.

    Parameters
    ----------
    output, label : torch.Tensor
        Predictions and targets of the same shape.
    mask : torch.Tensor
        Weight per element; entries with mask 0 do not contribute.
    num_tasks : int
        Unused; kept so existing callers keep working.

    Returns
    -------
    torch.Tensor
        Scalar: the sum of the masked squared errors divided by ``mask.sum()``.
        When the mask is all zeros, the (zero) masked sum is returned instead
        of dividing by zero, which previously produced NaN.
    """
    criterion = torch.nn.MSELoss(reduction='none')
    masked_total = (mask * criterion(output, label)).sum()
    n_valid = mask.sum()
    if n_valid == 0:
        # Nothing to average over; returning the sum keeps the autograd graph.
        return masked_total
    return masked_total / n_valid


## Training and Evaluation

### Training Function

The train_epoch function encapsulates the training loop for one epoch. It performs the forward pass, loss calculation, backpropagation, and optimization steps for each batch in the training data. The average training loss is then returned, which can be used for monitoring the training progress.

In [190]:
def train_epoch(train_dataloader, model, optimizer):
    """Run one optimisation pass over ``train_dataloader``.

    For every batch: forward pass, masked loss, backward pass, optimizer step.
    Returns the mean of the per-batch losses.
    """
    model.train()  # Prepare model for training
    batch_losses = []
    for mol_dgl_graph, labels, masks, global_feats in train_dataloader:
        prediction = model(mol_dgl_graph, global_feats)
        batch_loss = loss_func(prediction, labels, masks, num_tasks)
        # Clear the previous step's gradients before back-propagating this batch.
        optimizer.zero_grad(set_to_none=True)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.detach().item())
    return sum(batch_losses) / len(batch_losses)


This code performs training and evaluation of the GNN model. It iterates over the epochs, updating the model on the training data and evaluating it on the validation set after each epoch. A checkpoint is saved whenever the validation score improves, training stops once the score has not improved for `patience` epochs, and the best validation score is printed at the end.

In [191]:
from sklearn.preprocessing import StandardScaler

# Re-states the epoch budget from the configuration cell (same value, 100).
num_epochs = 100

def train_evaluate():
    """Train a fresh GNN with early stopping and keep the best checkpoint.

    Uses the module-level ``train_dataloader`` and ``val_dataloader``. After
    each epoch the validation RMSE is computed, and whenever it improves the
    model and optimizer state are written to ``checkpoint_path``. Training
    stops after ``num_epochs`` epochs or once the score has failed to improve
    for ``patience`` epochs in a row. Finally the checkpoint directory is
    copied to ``best_model_path``.

    The datasets are not reloaded here: the loaders already wrap them, and the
    copies the previous version loaded from disk were never used for training.
    """
    model = GNN(config, global_size, num_tasks)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    path = os.path.join(checkpoint_path, 'checkpoint.pth')

    best_val = float('inf')  # Initialize with infinity for regression
    patience_count = 1

    for epoch in range(1, num_epochs + 1):
        if patience_count > patience:
            # Patience exhausted: stop rather than idle through the remaining epochs.
            break

        loss_train = train_epoch(train_dataloader, model, optimizer)
        score_val = compute_score(model, val_dataloader, len(val_dataloader.dataset), num_tasks)

        if score_val < best_val:  # Lower is better for regression
            best_val = score_val
            print("Save checkpoint")
            dict_checkpoint = {
                "score_val": score_val,
                "model_state_dict": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
            }
            with open(path, "wb") as outputfile:
                cloudpickle.dump(dict_checkpoint, outputfile)
            patience_count = 1
        else:
            print("Patience", patience_count)
            patience_count += 1

        print("Epoch: {}/{} | Training Loss: {:.3f} | Valid Score: {:.3f}".format(
            epoch, num_epochs, loss_train, score_val))

        print(" ")
        print("Epoch: {}/{} | Best Valid Score Until Now: {:.3f}".format(epoch, num_epochs, best_val), "\n")

    # Best model save
    shutil.rmtree(best_model_path, ignore_errors=True)
    shutil.copytree(checkpoint_path, best_model_path)

    print("Final results:")
    print("Average Valid Score: {:.3f}".format(best_val), "\n")




## Function to compute test set score of the final saved model

The `test_evaluate` function loads the best model checkpoint, evaluates the model on the test dataset, and reports the test score along with the execution time.

In [192]:
def test_evaluate():
    """Score the saved best model on the test set and print the result.

    Rebuilds the GNN, restores its weights from ``checkpoint.pth`` under
    ``best_model_path`` and prints the test RMSE. The elapsed time since the
    module-level ``start_time`` is printed as well when that variable exists;
    if it was never defined, the timing line is skipped instead of raising
    ``NameError`` after the evaluation has already succeeded.
    """
    final_model = GNN(config, global_size, num_tasks)
    path = os.path.join(best_model_path, 'checkpoint.pth')
    # NOTE: unpickling executes code stored in the file; only load checkpoints
    # written by this notebook.
    with open(path, 'rb') as f:
        checkpoint = cloudpickle.load(f)
    final_model.load_state_dict(checkpoint["model_state_dict"])
    final_model.eval()
    test_score = compute_score(final_model, test_dataloader, len(test_set), num_tasks)

    print("Test Score: {:.3f}".format(test_score), "\n")
    try:
        elapsed = time.time() - start_time
    except NameError:
        # The timer is set by the cell that launches training; without it
        # there is nothing meaningful to report.
        elapsed = None
    if elapsed is not None:
        print("Execution time: {:.3f} seconds".format(elapsed))


## Train the model and evaluate its performance

By calling `train_evaluate()` and `test_evaluate()` one after the other, the code performs both training and testing of the graph-based model. The `start_time` variable is used to calculate and print the total execution time of both operations.

In [193]:
import time
# Wall-clock reference that test_evaluate() reads when it reports the execution time.
start_time = time.time()

# Train (saving the best checkpoint), then score the saved model on the test set.
train_evaluate()
test_evaluate()


Save checkpoint
Epoch: 1/100 | Training Loss: 23.449 | Valid Score: 6.827
 
Epoch: 1/100 | Best Valid Score Until Now: 6.827 

Save checkpoint
Epoch: 2/100 | Training Loss: 27.452 | Valid Score: 6.796
 
Epoch: 2/100 | Best Valid Score Until Now: 6.796 

Save checkpoint
Epoch: 3/100 | Training Loss: 30.655 | Valid Score: 6.758
 
Epoch: 3/100 | Best Valid Score Until Now: 6.758 

Save checkpoint
Epoch: 4/100 | Training Loss: 23.325 | Valid Score: 6.706
 
Epoch: 4/100 | Best Valid Score Until Now: 6.706 

Save checkpoint
Epoch: 5/100 | Training Loss: 21.145 | Valid Score: 6.632
 
Epoch: 5/100 | Best Valid Score Until Now: 6.632 

Save checkpoint
Epoch: 6/100 | Training Loss: 20.395 | Valid Score: 6.525
 
Epoch: 6/100 | Best Valid Score Until Now: 6.525 

Save checkpoint
Epoch: 7/100 | Training Loss: 25.025 | Valid Score: 6.364
 
Epoch: 7/100 | Best Valid Score Until Now: 6.364 

Save checkpoint
Epoch: 8/100 | Training Loss: 19.547 | Valid Score: 6.124
 
Epoch: 8/100 | Best Valid Score Unt