<a href="https://colab.research.google.com/github/williamtbarker/ML4Molecules/blob/main/exercise_5_complete.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Task

For this task you will use the QM9 dataset with HOMO as the target value. Perform the following -

1. use the code from the GNN lesson and replace the model with the follwoing. You can find the detail in the [documentation](https://lifesci.dgl.ai/api/model.zoo.html#molecular-property-prediction)
  1. Graph convolution network (GCNPredictor)
  2. SchNet
  3. Graph attention network (GATPredictor)

Use the Canonical atom and bond featurizers


In [2]:
# 1. download QM9 and split dataset, use graph featurizer
!pip install deepchem
import deepchem as dc

# Load QM9 dataset
tasks, datasets, transformers = dc.molnet.load_qm9(featurizer='GraphConv')
train_dataset, valid_dataset, test_dataset = datasets

# Print out the sizes of the datasets
print("Train dataset size:", len(train_dataset))
print("Validation dataset size:", len(valid_dataset))
print("Test dataset size:", len(test_dataset))


Collecting deepchem
  Downloading deepchem-2.7.1-py3-none-any.whl (693 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m693.2/693.2 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting scipy<1.9 (from deepchem)
  Downloading scipy-1.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (42.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rdkit (from deepchem)
  Downloading rdkit-2023.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scipy, rdkit, deepchem
  Attempting uninstall: scipy
    Found existing installation: scipy 1.11.4
    Uninstalling scipy-1.11.4:
      Successfully uninstalled scipy-1.11.4
[31mERROR: pip's dependency resolver does not currently take into account all the pack

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead
[18:39:58] Explicit valence for atom # 1 C, 5, is greater than permitted
[18:39:58] ERROR: Could not sanitize molecule ending on line 2704
[18:39:58] ERROR: Explicit valence for atom # 1 C, 5, is greater than permitted
[18:39:58] Explicit valence for atom # 1 C, 5, is greater than permitted
[18:39:58] ERROR: Could not sanitize molecule ending on line 9097
[18:39:58] ERROR: Explicit valence for atom # 1 C, 5, is greater than permitted
[18:39:58] Explicit valence for atom # 2 C, 5, is greater than permitted
[18:39:58] ERROR: Could not sanitize molecule ending on line 19803
[18:39:58] ERROR: Explicit valence for atom # 2 C, 5, is greater than permitted
[18:39:58] Explicit valence for atom # 2 C, 5, is greater than permitted
[18:39:58] ERROR: Could not sanitize molecule ending on line 19870
[18:39:58] ERROR: Explicit valence for atom # 2 C, 5, is greater than permitted
[18:39:58] Explicit valen

Train dataset size: 105984
Validation dataset size: 13248
Test dataset size: 13248


In [4]:
import deepchem as dc
from deepchem.models import GraphConvModel

# Initialize GraphConvModel
model = GraphConvModel(
    n_tasks=len(tasks),
    graph_conv_layers=[128, 128],
    dropout=0.2,
    mode='regression'
)

# Train the model
for epoch in range(30):
    loss = model.fit(train_dataset, nb_epoch=1)
    print(f"Epoch {epoch+1}, Loss: {loss}")

# Evaluate the model
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
print("Training Score:", train_scores)
print("Validation Score:", valid_scores)



Epoch 1, Loss: 1.0240298589070638
Epoch 2, Loss: 1.0138044357299805
Epoch 3, Loss: 0.991546630859375
Epoch 4, Loss: 0.9893701553344727
Epoch 5, Loss: 0.9812606048583984
Epoch 6, Loss: 0.9870753606160482
Epoch 7, Loss: 1.0031274795532226
Epoch 8, Loss: 0.9684892654418945
Epoch 9, Loss: 0.9921380996704101
Epoch 10, Loss: 0.9824989318847657
Epoch 11, Loss: 1.0041276295979817
Epoch 12, Loss: 0.9924894332885742
Epoch 13, Loss: 0.9741585731506348
Epoch 14, Loss: 0.9599343299865722
Epoch 15, Loss: 0.9653359985351563
Epoch 16, Loss: 0.9735292434692383
Epoch 17, Loss: 0.9964990615844727
Epoch 18, Loss: 0.986782455444336
Epoch 19, Loss: 0.9598237991333007
Epoch 20, Loss: 0.9574346160888672
Epoch 21, Loss: 0.957727559407552
Epoch 22, Loss: 0.9577908515930176
Epoch 23, Loss: 0.9585564613342286
Epoch 24, Loss: 0.9635130882263183
Epoch 25, Loss: 0.9542188262939453
Epoch 26, Loss: 0.9436330159505208
Epoch 27, Loss: 0.973942756652832
Epoch 28, Loss: 0.9587184906005859
Epoch 29, Loss: 0.944136142730712

In [17]:
! pip install schnetpack ase





In [42]:
import torch
import schnetpack as spk
from schnetpack.datasets import QM9
from torch.utils.data import DataLoader, Subset
from schnetpack.representation import SchNet
from schnetpack.atomistic import Atomwise
from schnetpack.nn.cutoff import CosineCutoff

# Define the path to store the dataset
dataset_path = './qm9_dataset'

# Load the QM9 dataset
qm9_data = QM9(dataset_path, batch_size=batch_size)

# Manually define the size of the dataset
dataset_size = 134000

# Define sizes for train, val, and test sets
num_train = 100000
num_val = 10000
num_test = dataset_size - num_train - num_val

# Create indices for train, val, and test sets
indices = torch.randperm(dataset_size).tolist()
train_indices = indices[:num_train]
val_indices = indices[num_train:num_train + num_val]
test_indices = indices[num_train + num_val:]

# Create subsets for train, val, and test sets
train_dataset = Subset(qm9_data, train_indices)
val_dataset = Subset(qm9_data, val_indices)
test_dataset = Subset(qm9_data, test_indices)

# Create data loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define the SchNet model using default or placeholder settings
n_atom_basis = 128  # number of features per atom
n_filters = 128     # number of filters in convolutions
n_interactions = 3  # number of interaction blocks
cutoff = 5.0        # cutoff distance in angstroms

# Placeholder radial basis function and cutoff function
# Replace with actual implementations from SchNetPack
radial_basis = spk.nn.basis.GaussianRBF(n_rbf=50, cutoff=cutoff)
cutoff_fn = CosineCutoff(cutoff)

model = SchNet(
    n_atom_basis=n_atom_basis,
    n_filters=n_filters,
    n_interactions=n_interactions,
    radial_basis=radial_basis,
    cutoff_fn=cutoff_fn
)

# Set up the optimizer and loss
optimizer = torch.optim.Adam(schnet_model.parameters(), lr=1e-4)
loss = spk.train.build_mse_loss(output_module)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
schnet_model.to(device)

for epoch in range(30):
    schnet_model.train()
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        pred = schnet_model(batch)
        batch_loss = loss(pred, batch['homo'])

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {batch_loss.item()}")

# Evaluate the model
schnet_model.eval()
test_scores = trainer.evaluate(test_loader)
print("Test Score:", test_scores)



AttributeError: module 'schnetpack.nn' has no attribute 'basis'

In [None]:
# 1a. Train GATPredictor

In [None]:
import deepchem as dc
import torch
from torch.utils.data import DataLoader

# Load the QM9 dataset with DeepChem and featurize it for graph models
tasks, datasets, transformers = dc.molnet.load_qm9(featurizer='GraphConv')
train_dataset, valid_dataset, test_dataset = datasets

# Convert DeepChem datasets to PyTorch tensors
def convert_to_torch(dataset):
    return [(torch.tensor(x), torch.tensor(y), torch.tensor(w), torch.tensor(ids))
            for x, y, w, ids in dataset.itersamples()]

train_dataset = convert_to_torch(train_dataset)
valid_dataset = convert_to_torch(valid_dataset)
test_dataset = convert_to_torch(test_dataset)

# Create data loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Initialize GATPredictor
gat_model = GATPredictor(
    n_atom_basis=128,
    n_heads=4,
    n_layers=3,
    n_hidden=128,
    output_modules=['homo']
)

# Set up the optimizer
optimizer = torch.optim.Adam(gat_model.parameters(), lr=1e-4)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gat_model.to(device)

for epoch in range(30):
    gat_model.train()
    for batch in train_loader:
        # Move batch to device
        batch_x, batch_y, _, _ = batch
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        # Forward pass
        pred = gat_model(batch_x)

        # Compute loss
        loss = torch.nn.functional.mse_loss(pred, batch_y)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

# Evaluate the model
# ...



[04:43:58] Explicit valence for atom # 1 C, 5, is greater than permitted
[04:43:58] ERROR: Could not sanitize molecule ending on line 2704
[04:43:58] ERROR: Explicit valence for atom # 1 C, 5, is greater than permitted
[04:43:58] Explicit valence for atom # 1 C, 5, is greater than permitted
[04:43:58] ERROR: Could not sanitize molecule ending on line 9097
[04:43:58] ERROR: Explicit valence for atom # 1 C, 5, is greater than permitted
[04:43:58] Explicit valence for atom # 2 C, 5, is greater than permitted
[04:43:58] ERROR: Could not sanitize molecule ending on line 19803
[04:43:58] ERROR: Explicit valence for atom # 2 C, 5, is greater than permitted
[04:43:58] Explicit valence for atom # 2 C, 5, is greater than permitted
[04:43:58] ERROR: Could not sanitize molecule ending on line 19870
[04:43:58] ERROR: Explicit valence for atom # 2 C, 5, is greater than permitted
[04:43:58] Explicit valence for atom # 2 C, 5, is greater than permitted
[04:43:58] ERROR: Could not sanitize molecule end