In [164]:
!pip install dgl



In [165]:
%matplotlib inline
import os

os.environ["DGLBACKEND"] = "pytorch"
import dgl
import numpy as np
import networkx as nx
import torch
import torch.nn as nn
import dgl.function as fn
import torch.nn.functional as F
import shutil
from torch.utils.data import DataLoader
import cloudpickle
from dgl.nn import GraphConv

#### Set Path

In [166]:
current_dir = "./"
checkpoint_path = current_dir + "save_models/model_checkpoints/" + "checkpoint"
os.makedirs(checkpoint_path, exist_ok=True)

best_model_path = current_dir + "save_models/best_model/"

folder_data_temp = current_dir +"data_temp/"
shutil.rmtree(folder_data_temp, ignore_errors=True)

path_save = current_dir + "free.zip"
shutil.unpack_archive(path_save, folder_data_temp)

#### Custom PyTorch Datasets

In [167]:
""" Regression Dataset """
class DGLDatasetReg(torch.utils.data.Dataset):
    def __init__(self, address, transform=None, train=False, scaler=None, scaler_regression=None):
            self.train = train
            self.scaler = scaler
            self.data_set, train_labels_masks_globals = dgl.load_graphs(address+".bin")
            num_graphs = len(self.data_set)
            self.labels = train_labels_masks_globals["labels"].view(num_graphs,-1)
            self.masks = train_labels_masks_globals["masks"].view(num_graphs,-1)
            self.globals = train_labels_masks_globals["globals"].view(num_graphs,-1)
            self.transform = transform
            self.scaler_regression = scaler_regression
    def scaler_method(self):
        if self.train:
            scaler = preprocessing.StandardScaler().fit(self.labels)
            self.scaler = scaler
        return self.scaler
    def __len__(self):
        return len(self.data_set)
    def __getitem__(self, idx):
        if self.scaler_regression:
            """ With Scaler"""
            return  self.data_set[idx], torch.tensor(self.scaler.transform(self.labels)[idx]).float(), self.masks[idx], self.globals[idx]
        else:
            """ Without Scaler """
            return  self.data_set[idx], self.labels[idx].float(), self.masks[idx], self.globals[idx]



#### Defining Train, Validation, and Test Set

In [168]:
from sklearn import preprocessing

path_data_temp = folder_data_temp + "scaffold" + "_" + str(0)
train_set = DGLDatasetReg(address=path_data_temp + "_train", train=True)

# Build scaler on the training set
scaler = preprocessing.StandardScaler()
scaler.fit(train_set.labels)

# Apply scaler to validation and test sets
val_set = DGLDatasetReg(address=path_data_temp + "_val", scaler=scaler, scaler_regression=True)
test_set = DGLDatasetReg(address=path_data_temp + "_test", scaler=scaler, scaler_regression=True)

print(len(train_set), len(val_set), len(test_set))



513 64 65


#### Data Loader

In [169]:
def collate(batch):
    # batch is a list of tuples (graphs, labels, masks, globals)
    # Concatenate a sequence of graphs
    graphs = [e[0] for e in batch]
    g = dgl.batch(graphs)

    # Concatenate a sequence of tensors (labels) along a new dimension
    labels = [e[1] for e in batch]
    labels = torch.stack(labels, 0).float()  # Convert labels to float

    # Concatenate a sequence of tensors (masks) along a new dimension
    masks = [e[2] for e in batch]
    masks = torch.stack(masks, 0)

    # Concatenate a sequence of tensors (globals) along a new dimension
    globals = [e[3] for e in batch]
    globals = torch.stack(globals, 0)

    return g, labels, masks, globals


def loader(batch_size=64):
    train_dataloader = DataLoader(train_set,
                                  batch_size=batch_size,
                                  collate_fn=collate,
                                  drop_last=False,
                                  shuffle=True,
                                  num_workers=1)

    val_dataloader = DataLoader(val_set,
                                batch_size=batch_size,
                                collate_fn=collate,
                                drop_last=False,
                                shuffle=False,
                                num_workers=1)

    test_dataloader = DataLoader(test_set,
                                 batch_size=batch_size,
                                 collate_fn=collate,
                                 drop_last=False,
                                 shuffle=False,
                                 num_workers=1)
    return train_dataloader, val_dataloader, test_dataloader


In [170]:
train_dataloader, val_dataloader, test_dataloader = loader(batch_size=64)

#### Defining A GNN

##### Some Variables

In [171]:
#Bace dataset has 1 task. Some other datasets may have some more number of tasks, e.g., tox21 has 12 tasks.
num_tasks = 1

# Size of global feature of each graph
global_size = 200

# Number of epochs to train the model
num_epochs = 100

# Number of steps to wait if the model performance on the validation set does not improve
patience = 10

#Configurations to instantiate the model
config = {"node_feature_size":127, "edge_feature_size":12, "hidden_size":100}


In [172]:
import dgl

class GNN(nn.Module):
    def __init__(self, config, global_size=200, num_tasks=1):
        super().__init__()
        self.config = config
        self.num_tasks = num_tasks
        self.node_feature_size = self.config.get('node_feature_size', 127)
        self.edge_feature_size = self.config.get('edge_feature_size', 12)
        self.hidden_size = self.config.get('hidden_size', 100)
        self.conv1 = GraphConv(self.node_feature_size, self.hidden_size)
        self.conv2 = GraphConv(self.hidden_size, self.num_tasks)

    def forward(self, mol_dgl_graph, globals):
        mol_dgl_graph = dgl.add_self_loop(mol_dgl_graph)  # Add self-loops
        mol_dgl_graph.ndata["v"] = mol_dgl_graph.ndata["v"][:, :self.node_feature_size]
        mol_dgl_graph.edata["e"] = mol_dgl_graph.edata["e"][:, :self.edge_feature_size]
        h = self.conv1(mol_dgl_graph, mol_dgl_graph.ndata["v"])
        h = F.relu(h)
        h = self.conv2(mol_dgl_graph, h)
        mol_dgl_graph.ndata["h"] = h
        return dgl.mean_nodes(mol_dgl_graph, "h")


#### Function to Compute Score of the Model

In [173]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [174]:
# def compute_score(model, data_loader, val_size, num_tasks):
#     model.eval()
#     prediction_all = torch.empty(0, num_tasks)
#     labels_all = torch.empty(0, num_tasks)
#     masks_all = torch.empty(0, *data_loader.dataset.masks.shape[2:])  # Adjust the dimensions

#     with torch.no_grad():
#         for batch_data in data_loader:
#             graphs, labels, masks, globals = batch_data
#             graphs = graphs.to(device)
#             labels = labels.to(device)
#             masks = masks.to(device)
#             globals = globals.to(device)

#             # Forward pass
#             outputs = model(graphs, globals)
#             prediction = outputs.squeeze(dim=1)  # Remove the extra dimension

#             # Apply mask to the prediction tensor
#             masked_prediction = torch.masked_select(prediction, masks.bool())

#             # Reshape the masked_prediction tensor
#             masked_prediction = masked_prediction.view(-1, num_tasks)

#             # Concatenate predictions and labels
#             prediction_all = torch.cat((prediction_all, masked_prediction), 0)
#             labels_all = torch.cat((labels_all, labels[masks.bool()].unsqueeze(1)), 0)  # Unsqueeze the tensor

#             # Expand mask dimensions
#             masks_all = torch.cat((masks_all, masks.expand_as(masked_prediction)), 0)

#         # Compute RMSE for each task
#         rmse = torch.sqrt(F.mse_loss(prediction_all, labels_all, reduction='mean'))

#     return rmse.item()



In [175]:
# def compute_score(model, data_loader):
#     """
#     Compute the RMSE score for the regression task in the FreeSolv dataset.

#     Args:
#         model (nn.Module): The trained model.
#         data_loader (DataLoader): The data loader for the dataset.

#     Returns:
#         float: The RMSE score.
#     """
#     model.eval()
#     prediction_all = []
#     labels_all = []

#     with torch.no_grad():
#         for batch_data in data_loader:
#             graphs, labels, masks, globals = batch_data
#             graphs = graphs.to(device)
#             labels = labels.to(device)

#             # Forward pass
#             outputs = model(graphs)
#             prediction = outputs.squeeze(dim=1)

#             # Append predictions and labels to lists
#             prediction_all.append(prediction)
#             labels_all.append(labels)

#         # Concatenate predictions and labels
#         prediction_all = torch.cat(prediction_all, dim=0)
#         labels_all = torch.cat(labels_all, dim=0)

#         # Compute RMSE for the regression task
#         rmse = torch.sqrt(F.mse_loss(prediction_all, labels_all, reduction='mean'))

#     return rmse.item()



In [176]:
def compute_score(model, data_loader, val_size=None, num_tasks=None):
    """
    Compute the RMSE score for the regression task.

    Args:
        model (nn.Module): The trained model.
        data_loader (DataLoader): The data loader for the dataset.
        val_size (int, optional): The size of the validation set. Default is None.
        num_tasks (int, optional): The number of regression tasks. Default is None.

    Returns:
        float: The RMSE score.
    """
    model.eval()
    prediction_all = []
    labels_all = []
    masks_all = []

    with torch.no_grad():
        for batch_data in data_loader:
            graphs, labels, masks, globals = batch_data
            graphs = graphs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)
            globals = globals.to(device)

            # Forward pass
            outputs = model(graphs, globals)
            prediction = outputs.squeeze(dim=1)

            # Apply mask to the prediction tensor
            masked_prediction = torch.masked_select(prediction, masks.bool())

            # Reshape the masked_prediction tensor
            masked_prediction = masked_prediction.view(-1, num_tasks)

            # Append predictions, labels, and masks to lists
            prediction_all.append(masked_prediction)
            labels_all.append(labels)
            masks_all.append(masks)

        # Concatenate predictions, labels, and masks
        prediction_all = torch.cat(prediction_all, dim=0)
        labels_all = torch.cat(labels_all, dim=0)
        masks_all = torch.cat(masks_all, dim=0)

        # Compute RMSE for each task or overall RMSE if val_size and num_tasks are not provided
        if val_size is not None and num_tasks is not None:
            rmse = torch.sqrt(F.mse_loss(prediction_all[:val_size], labels_all[:val_size], reduction='mean'))
        else:
            rmse = torch.sqrt(F.mse_loss(prediction_all, labels_all, reduction='mean'))

    return rmse.item()


#### Loss Function

In [177]:
def loss_func(output, label, mask, num_tasks):
    criterion = torch.nn.MSELoss(reduction='none')
    loss = mask * criterion(output, label)
    loss = loss.sum() / mask.sum()
    return loss


#### Training and Evaluation

##### Training Function

In [178]:
def train_epoch(train_dataloader, model, optimizer):
    epoch_train_loss = 0
    iterations = 0
    model.train() # Prepare model for training
    for i, (mol_dgl_graph, labels, masks, globals) in enumerate(train_dataloader):
        prediction = model(mol_dgl_graph, globals)
        loss_train = loss_func(prediction, labels, masks, num_tasks)
        optimizer.zero_grad(set_to_none=True)
        loss_train.backward()
        optimizer.step()
        epoch_train_loss += loss_train.detach().item()
        iterations += 1
    epoch_train_loss /= iterations
    return epoch_train_loss

In [179]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()

def train_evaluate():
    model = GNN(config, global_size, num_tasks)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

    best_val = 0
    patience_count = 1
    epoch = 1

    while epoch <= num_epochs:

        if patience_count <= patience:
            model.train()
            loss_train = train_epoch(train_dataloader, model, optimizer)
            model.eval()
            score_val = compute_score(model, val_dataloader, len(val_set), num_tasks)
            if score_val > best_val:
                best_val = score_val
                print("Save checkpoint")
                path = os.path.join(checkpoint_path, 'checkpoint.pth')
                dict_checkpoint = {"score_val": score_val}
                dict_checkpoint.update({"model_state_dict": model.state_dict(), "optimizer_state": optimizer.state_dict()})
                with open(path, "wb") as outputfile:
                    cloudpickle.dump(dict_checkpoint, outputfile)
                patience_count = 1
            else:
                print("Patience", patience_count)
                patience_count += 1

            print("Epoch: {}/{} | Training Loss: {:.3f} | Valid Score: {:.3f}".format(
            epoch, num_epochs, loss_train, score_val))

            print(" ")
            print("Epoch: {}/{} | Best Valid Score Until Now: {:.3f}".format(epoch, num_epochs, best_val), "\n")
        epoch += 1

    # best model save
    shutil.rmtree(best_model_path, ignore_errors=True)
    shutil.copytree(checkpoint_path, best_model_path)

    print("Final results:")
    print("Average Valid Score: {:.3f}".format(np.mean(best_val)), "\n")


##### Function to compute test set score of the final saved model

In [180]:
def test_evaluate():
    final_model = GNN(config, global_size, num_tasks)
    path = os.path.join(best_model_path, 'checkpoint.pth')
    with open(path, 'rb') as f:
        checkpoint = cloudpickle.load(f)
    final_model.load_state_dict(checkpoint["model_state_dict"])
    final_model.eval()

    test_labels_scaled = scaler.transform(test_set.labels)
    test_set.labels = torch.tensor(test_labels_scaled).float()

    test_score = compute_score(final_model, test_dataloader, len(test_set), num_tasks)

    print("Test Score: {:.3f}".format(test_score), "\n")
    print("Execution time: {:.3f} seconds".format(time.time() - start_time))




##### Train the model and evaluate its performance

In [181]:
# Fit the scaler on the training set labels
train_labels_scaled = scaler.fit_transform(train_set.labels)
train_set.labels = torch.tensor(train_labels_scaled).float()

train_evaluate()
test_evaluate()

  return F.mse_loss(input, target, reduction=self.reduction)


Save checkpoint
Epoch: 1/100 | Training Loss: 0.951 | Valid Score: 1.202
 
Epoch: 1/100 | Best Valid Score Until Now: 1.202 

Patience 1
Epoch: 2/100 | Training Loss: 0.896 | Valid Score: 1.194
 
Epoch: 2/100 | Best Valid Score Until Now: 1.202 

Patience 2
Epoch: 3/100 | Training Loss: 0.907 | Valid Score: 1.191
 
Epoch: 3/100 | Best Valid Score Until Now: 1.202 

Patience 3
Epoch: 4/100 | Training Loss: 0.997 | Valid Score: 1.190
 
Epoch: 4/100 | Best Valid Score Until Now: 1.202 

Patience 4
Epoch: 5/100 | Training Loss: 0.894 | Valid Score: 1.198
 
Epoch: 5/100 | Best Valid Score Until Now: 1.202 

Patience 5
Epoch: 6/100 | Training Loss: 1.057 | Valid Score: 1.198
 
Epoch: 6/100 | Best Valid Score Until Now: 1.202 

Save checkpoint
Epoch: 7/100 | Training Loss: 0.901 | Valid Score: 1.204
 
Epoch: 7/100 | Best Valid Score Until Now: 1.204 

Patience 1
Epoch: 8/100 | Training Loss: 1.119 | Valid Score: 1.201
 
Epoch: 8/100 | Best Valid Score Until Now: 1.204 

Patience 2
Epoch: 9/10