# Project 3
The prediction of molecular properties is an important task in drug discovery. The molecules' atomic composition and arrangement can already tell us a lot about their biological behavior. Each 2D molecule can be represented as a graph, where the nodes are atoms connected by edges corresponding to chemical bonds. The prediction of molecular properties can be formulized as a graph classification task, and graph neural network is usually applied for making graph-level prediction.

In this project, you need develop a model for predicting the toxicity of new molecules. This notebook provides a sample pipeline that establishes a baseline. It is expected that your methods should outperform this baseline. You are strongly encouraged to think about designing more powerful models, finetuning hyperparameters, developing better training strategies, etc.

# Install package

In [1]:
# New these two packages
!pip install torch_geometric
!pip install rdkit-pypi




[notice] A new release of pip is available: 23.1.2 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Dataset preparation and train-valid splitting.

In [2]:
import torch
import torch_geometric
import numpy as np
from torch_geometric.data import Data
from torch_geometric.data import DataLoader
from torch_geometric.datasets import MoleculeNet
import pickle

# Load datasets. The training and validation sets contain both molecules and their property labels. The test set only contain molecules.
# There are 12 property tasks for prediction. Some properties labels are missing (i.e., nan). You can ignore them.
train_dataset = torch.load("train_data.pt")
valid_dataset = torch.load("valid_data.pt")
test_dataset = torch.load("test_data.pt")

print(f'Size of training set: {len(train_dataset)}')
print(f'Size of validation set: {len(valid_dataset)}')
print(f'Size of test set: {len(test_dataset)}')

  from .autonotebook import tqdm as notebook_tqdm


Size of training set: 6264
Size of validation set: 783
Size of test set: 784


## New Features
First, we get features that are not included and add them to the original feature datasets.<br>
Below is some testing for each of the Descriptors in the rdkit Chem module

In [3]:
from rdkit import Chem
from rdkit.Chem import Descriptors as desc

print(desc.FpDensityMorgan1(Chem.MolFromSmiles(train_dataset[1].smiles)))
print(desc.FpDensityMorgan2(Chem.MolFromSmiles(train_dataset[1].smiles)))
print(desc.FpDensityMorgan3(Chem.MolFromSmiles(train_dataset[1].smiles)))

0.45
0.7
0.9


## Build Hybrid Model

In [100]:
from torch_geometric.nn import GCNConv, global_mean_pool as gap, BatchNorm, Linear
import torch.nn.functional as F 
from torch.nn import Linear

def reshape_tensor(tensor):
    reshaped_tensor = tensor.view(16, 32)
    return reshaped_tensor


class AtomEncoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(AtomEncoder, self).__init__()

        self.embeddings = torch.nn.ModuleList()

        for i in range(9):
            self.embeddings.append(torch.nn.Embedding(100, hidden_channels))

    def reset_parameters(self):
        for embedding in self.embeddings:
            embedding.reset_parameters()

    def forward(self, x):
        if x.dim() == 1:
            x = x.unsqueeze(1)

        out = 0
        for i in range(x.size(1)):
            out += self.embeddings[i](x[:, i])
        return out

# Our Encoder (does not use the edge_index or embeddings, can get about 65% accuracy)
class ModifiedAtomEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, num_node_features):
        super(ModifiedAtomEncoder, self).__init__()

        self.linear_layers = torch.nn.ModuleList()

        for i in range(num_node_features):
            self.linear_layers.append(torch.nn.Linear(1, hidden_channels))

    def reset_parameters(self):
        for linear in self.linear_layers:
            torch.nn.init.xavier_uniform_(linear.weight)
            torch.nn.init.zeros_(linear.bias)

    def forward(self, x):
        if x.dim() == 1:
            x = x.unsqueeze(1)

        out = 0
        for i in range(x.size(1)):
            out += self.linear_layers[i](x[:, i:i+1])
        return out


class GCN_noGraph(torch.nn.Module):
    def __init__(self, hidden_channels, num_node_features, num_classes):
        super(GCN_noGraph, self).__init__()
        torch.manual_seed(42)
        self.emb = AtomEncoder(hidden_channels=32, num_node_features=num_node_features)
        self.lin = Linear(hidden_channels, num_classes)

    def forward(self, batch):
        x, batch_size = batch.descriptors, batch.batch
        x = self.emb(x)

        # 2. Readout layer
        x = gap(x, batch_size)  # [batch_size, hidden_channels]
        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
        return x
    
    
  
# The given model (uses the edge_index, can get about 74% accuracy)
class GCN(torch.nn.Module):
    def __init__(self, hidden_channels, num_node_features, num_classes):
        super(GCN, self).__init__()
        torch.manual_seed(42)
        self.emb = AtomEncoder(hidden_channels=32)
        self.conv1 = GCNConv(hidden_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, num_classes)

    def forward(self, batch):
        x, edge_index, batch_size = batch.x, batch.edge_index, batch.batch
        x = self.emb(x)
        
        # 1. Obtain node embeddings
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = gap(x, batch_size)  # [batch_size, hidden_channels]
        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
        return x
  
# Hybrid model (uses both the edge_index and the non-graph features)

class HybridModel(torch.nn.Module):
    def __init__(self, hidden_channels, num_node_features, num_classes, num_features_nongraph):
        super(HybridModel, self).__init__()

        # Graph-based Classifier
        self.emb = AtomEncoder(hidden_channels=hidden_channels)
        self.non_graph_emb = ModifiedAtomEncoder(hidden_channels=hidden_channels, num_node_features=num_features_nongraph)
        self.conv1 = GCNConv(hidden_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin_graph = Linear(hidden_channels, hidden_channels)

        # Non-graph Classifier
        self.fc1 = torch.nn.Linear(hidden_channels, hidden_channels)
        self.fc2 = torch.nn.Linear(hidden_channels, hidden_channels)

        # Combined layers
        self.fc3 = torch.nn.Linear(2 * hidden_channels, hidden_channels)  # Combining graph and non-graph outputs
        self.fc4 = torch.nn.Linear(hidden_channels, num_classes)

    def forward(self, batch):
        # Graph component (x is the graph features, y is the non-graph features)
        x, edge_index, batch_index, y = batch.x, batch.edge_index, batch.batch, batch.descriptors

        x = self.emb(x)
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = F.relu(self.conv3(x, edge_index))
        print('1', x.shape)
        x = gap(x, batch_index)  # [batch_size, hidden_channels]
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin_graph(x)
        x = x.float() # Convert to float for concatenation

        # Non-graph component
        y = self.non_graph_emb(y)
        print('2', y.shape)
        y = F.relu(self.fc1(y))
        y = gap(y, batch_index)  # [batch_size, hidden_channels]
        # 3. Apply a final classifier
        y = F.dropout(y, p=0.5, training=self.training)
        y = self.fc2(y)

        # Combine
        z = torch.cat([x, y], dim=1)
        z = F.relu(self.fc3(z))
        z = self.fc4(z)

        return z

def reshape_tensor(tensor):
    reshaped_tensor = tensor.view(16, 32)
    return reshaped_tensor


## Data 

In [5]:
# Put all of the datasets into one list
datasets = [train_dataset, valid_dataset, test_dataset]

for dataset in datasets:
    for graph in dataset:
        mol = Chem.MolFromSmiles(graph.smiles)
        new_features = torch.tensor([
            desc.FpDensityMorgan1(mol),
            desc.FpDensityMorgan2(mol),
            desc.FpDensityMorgan3(mol),
            desc.HeavyAtomMolWt(mol),
            desc.NumHAcceptors(mol),
            desc.NHOHCount(mol),
            desc.Kappa3(mol),
            desc.NOCount(mol),
            desc.HallKierAlpha(mol),
            desc.MinEStateIndex(mol),
            desc.MolWt(mol),
            desc.BalabanJ(mol),
            desc.MolLogP(mol),
            desc.PEOE_VSA6(mol),
            desc.SlogP_VSA2(mol),
            desc.SMR_VSA7(mol),
        ])
        new_features = torch.tensor(new_features).to(torch.float32)
        graph.descriptors = new_features

  new_features = torch.tensor(new_features).to(torch.float32)


In [36]:
print(train_dataset[0].descriptors)
print(train_dataset[1].descriptors.shape)

tensor([  1.0000,   1.1818,   1.1818, 197.9630,   3.0000,   5.0000,   2.0995,
          7.0000,   0.2600,  -5.1977, 206.0270,   5.0795,  -0.9922,   0.0000,
         29.7634,   0.0000])
torch.Size([16])


In [101]:
# create a model
num_node_features = train_dataset[0].x.shape[0]
num_features_nongraph = train_dataset[0].descriptors.shape[0]
model = HybridModel(32, num_node_features, 12, num_features_nongraph)

# loss function and optimizer
import torch.optim as optim
optimizer = optim.Adam(model.parameters(), lr=0.001)

criterion = torch.nn.BCEWithLogitsLoss(reduction = "none")

# Create DataLoaders
from torch_geometric.data import DataLoader

batch_size=32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Start training

In [12]:
# train and eval function
from sklearn.metrics import roc_auc_score

def train(model, device, loader, optimizer):
    model.train()

    for step, batch in enumerate(loader):
        batch = batch.to(device)
        pred = model(batch)
        y = batch.y.view(pred.shape).to(torch.float64)

        optimizer.zero_grad()
        ## ignore nan targets (unlabeled) when computing training loss.
        is_labeled = batch.y == batch.y
        loss = criterion(pred.to(torch.float32)[is_labeled], batch.y.to(torch.float32)[is_labeled]).mean()
        loss.backward()
        optimizer.step()


def eval(model, device, loader):
    model.eval()
    y_true = []
    y_pred = []
    # For every batch in test loader
    for batch in loader:

        batch = batch.to(device)
        if batch.x.shape[0] == 1:
            pass
        else:
            with torch.no_grad():
                pred = model(batch)

            y_true.append(batch.y.view(pred.shape))
            y_pred.append(pred)

    y_true = torch.cat(y_true, dim = 0).numpy()
    y_pred = torch.cat(y_pred, dim = 0).numpy()
    # Compute the ROC - AUC score and store as history
    rocauc_list = []

    for i in range(y_true.shape[1]):
        #AUC is only defined when there is at least one positive data.
        if np.sum(y_true[:,i] == 1) > 0 and np.sum(y_true[:,i] == 0) > 0:
            # ignore nan values
            is_labeled = y_true[:,i] == y_true[:,i]
            rocauc_list.append(roc_auc_score(y_true[is_labeled,i], y_pred[is_labeled,i]))

    if len(rocauc_list) == 0:
        raise RuntimeError('No positively labeled data available. Cannot compute ROC-AUC.')

    return {'rocauc': sum(rocauc_list)/len(rocauc_list)}


In [102]:
# Training
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Start training...")
print(type(model))
for epoch in range(1, 5):
    print("====epoch " + str(epoch))

    # training
    train(model, device, train_loader, optimizer)

    # evaluating
    train_acc = eval(model, device, train_loader)
    val_acc = eval(model, device, val_loader)
    print({'Train': train_acc, 'Validation': val_acc})


Start training...
<class '__main__.HybridModel'>
====epoch 1
2 torch.Size([512, 32])


RuntimeError: Expected index [590] to be smaller than self [32] apart from dimension 0 and to be smaller size than src [512]