# Check GPUs

In [2]:
# Shell escape: print the nvidia-smi report (driver/CUDA versions, per-GPU memory and utilisation).
!nvidia-smi

Wed Oct  2 14:49:15 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06             Driver Version: 535.183.06   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA RTX A6000               On  | 00000000:1B:00.0 Off |                  Off |
| 30%   23C    P8              30W / 300W |    271MiB / 49140MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX A6000               On  | 00000000:1C:00.0 Off |  

# Imports

In [3]:
# Standard library imports
import os
import math
from pathlib import Path

# Third-party imports
import pandas as pd
from lightning import pytorch as pl
import torch
from chemprop import data, featurizers, models, nn

# Local imports

# CUDA
# Pin this process to one physical GPU. CUDA_VISIBLE_DEVICES is only honoured
# if it is set before CUDA is first queried/initialised in the process, so the
# assignment must come before any torch.cuda call (including is_available()).
os.environ['CUDA_VISIBLE_DEVICES'] = '7'
print(f"CUDA available: {torch.cuda.is_available()}")

CUDA available: True


# Chemprop Model

## Load Chemprop data & details

In [2]:
# Input location: <current working directory>/data/regression.csv
chemprop_dir = Path.cwd()
input_path = chemprop_dir.joinpath('data', 'regression.csv')

# Column with the SMILES strings and the column(s) used as regression targets
smiles_column, target_columns = 'smiles', ['log_value']

# Number of worker processes handed to the DataLoaders
num_workers = 8

# Read the table (first CSV column becomes the index) and preview the first rows
df_input = pd.read_csv(input_path, index_col=0)
df_input.head()

Unnamed: 0,molregno,relation,value,unit,standard_type,compound_name,smiles,original_index,activity_index,assay,target,target_id,num_activities,doi,pmid,journal,abstract,stupid_response,log_value
0,1353266,=,0.084,uM,IC50,"Thiazol-5-ylmethyl(2S,3R)-3-hydroxy-4-(N-methy...",CN(C[C@@H](O)[C@H](Cc1ccccc1)NC(=O)OCc1cncs1)C...,0,0,828928,Cytochrome P450 3A4,17045,52,10.1016/j.bmcl.2012.06.022,22765892.0,Bioorg Med Chem Lett,A new class of benzoxazole and benzothiazole a...,There is no explicit description of the experi...,-1.075721
1,1353267,=,0.2,uM,IC50,"thiazol-5-ylmethyl(2S,3R)-4-(2-(3-(dimethylami...",CC(C)CN(C[C@@H](O)[C@H](Cc1ccccc1)NC(=O)OCc1cn...,0,1,828928,Cytochrome P450 3A4,17045,52,10.1016/j.bmcl.2012.06.022,22765892.0,Bioorg Med Chem Lett,A new class of benzoxazole and benzothiazole a...,There is no explicit description of the experi...,-0.69897
2,1353268,=,0.18,uM,IC50,"thiazol-5-ylmethyl(2S,3R)-4-(2-((2-(dimethylam...",CC(C)CN(C[C@@H](O)[C@H](Cc1ccccc1)NC(=O)OCc1cn...,0,2,828928,Cytochrome P450 3A4,17045,52,10.1016/j.bmcl.2012.06.022,22765892.0,Bioorg Med Chem Lett,A new class of benzoxazole and benzothiazole a...,There is no explicit description of the experi...,-0.744727
3,1353269,=,0.041,uM,IC50,"Thiazol-5-ylmethyl(2S,3R)-3-hydroxy-4-(N-isobu...",CNc1nc2ccc(C(=O)N(CC(C)C)C[C@@H](O)[C@H](Cc3cc...,0,3,828928,Cytochrome P450 3A4,17045,52,10.1016/j.bmcl.2012.06.022,22765892.0,Bioorg Med Chem Lett,A new class of benzoxazole and benzothiazole a...,There is no explicit description of the experi...,-1.387216
4,1353270,=,0.028,uM,IC50,"Thiazol-5-ylmethyl(2S,3R)-4-(2-(ethylamino)-N-...",CCNc1nc2ccc(C(=O)N(CC(C)C)C[C@@H](O)[C@H](Cc3c...,0,4,828928,Cytochrome P450 3A4,17045,52,10.1016/j.bmcl.2012.06.022,22765892.0,Bioorg Med Chem Lett,A new class of benzoxazole and benzothiazole a...,There is no explicit description of the experi...,-1.552842


In [3]:
# The paths, column names and `df_input` are already defined by the preceding
# cell; re-reading the CSV here was an exact duplicate of that cell.
# Instead, fail early with a clear message if the configured columns are absent,
# because every later cell selects them by name.
required_columns = [smiles_column, *target_columns]
missing_columns = [col for col in required_columns if col not in df_input.columns]
if missing_columns:
    raise KeyError(f"{input_path} is missing required columns: {missing_columns}")

# Preview the first rows of the loaded table
df_input.head()

Unnamed: 0,molregno,relation,value,unit,standard_type,compound_name,smiles,original_index,activity_index,assay,target,target_id,num_activities,doi,pmid,journal,abstract,stupid_response,log_value
0,1353266,=,0.084,uM,IC50,"Thiazol-5-ylmethyl(2S,3R)-3-hydroxy-4-(N-methy...",CN(C[C@@H](O)[C@H](Cc1ccccc1)NC(=O)OCc1cncs1)C...,0,0,828928,Cytochrome P450 3A4,17045,52,10.1016/j.bmcl.2012.06.022,22765892.0,Bioorg Med Chem Lett,A new class of benzoxazole and benzothiazole a...,There is no explicit description of the experi...,-1.075721
1,1353267,=,0.2,uM,IC50,"thiazol-5-ylmethyl(2S,3R)-4-(2-(3-(dimethylami...",CC(C)CN(C[C@@H](O)[C@H](Cc1ccccc1)NC(=O)OCc1cn...,0,1,828928,Cytochrome P450 3A4,17045,52,10.1016/j.bmcl.2012.06.022,22765892.0,Bioorg Med Chem Lett,A new class of benzoxazole and benzothiazole a...,There is no explicit description of the experi...,-0.69897
2,1353268,=,0.18,uM,IC50,"thiazol-5-ylmethyl(2S,3R)-4-(2-((2-(dimethylam...",CC(C)CN(C[C@@H](O)[C@H](Cc1ccccc1)NC(=O)OCc1cn...,0,2,828928,Cytochrome P450 3A4,17045,52,10.1016/j.bmcl.2012.06.022,22765892.0,Bioorg Med Chem Lett,A new class of benzoxazole and benzothiazole a...,There is no explicit description of the experi...,-0.744727
3,1353269,=,0.041,uM,IC50,"Thiazol-5-ylmethyl(2S,3R)-3-hydroxy-4-(N-isobu...",CNc1nc2ccc(C(=O)N(CC(C)C)C[C@@H](O)[C@H](Cc3cc...,0,3,828928,Cytochrome P450 3A4,17045,52,10.1016/j.bmcl.2012.06.022,22765892.0,Bioorg Med Chem Lett,A new class of benzoxazole and benzothiazole a...,There is no explicit description of the experi...,-1.387216
4,1353270,=,0.028,uM,IC50,"Thiazol-5-ylmethyl(2S,3R)-4-(2-(ethylamino)-N-...",CCNc1nc2ccc(C(=O)N(CC(C)C)C[C@@H](O)[C@H](Cc3c...,0,4,828928,Cytochrome P450 3A4,17045,52,10.1016/j.bmcl.2012.06.022,22765892.0,Bioorg Med Chem Lett,A new class of benzoxazole and benzothiazole a...,There is no explicit description of the experi...,-1.552842


In [4]:
# Total number of rows, then the number left after dropping exact duplicate rows
n_rows = len(df_input.index)
n_unique_rows = len(df_input.drop_duplicates().index)
print(n_rows, n_unique_rows, sep='\n')

5118
5118


## Get molecule data and split

In [8]:
# Pull the SMILES strings and target values out of the frame and pair them up
# as one MoleculeDatapoint per row
smis = df_input[smiles_column].values
ys = df_input[target_columns].values
all_data = list(map(data.MoleculeDatapoint.from_smi, smis, ys))

# Compute 80/10/10 'scaffold_balanced' split indices from the molecules,
# then slice the datapoints into train / val / test accordingly
mols = [datapoint.mol for datapoint in all_data]
train_indices, val_indices, test_indices = data.make_split_indices(
    mols, 'scaffold_balanced', (0.8, 0.1, 0.1)
)
train_data, val_data, test_data = data.split_data_by_indices(
    all_data, train_indices, val_indices, test_indices
)

In [None]:
# NOTE(review): this cell has no execution count, and `copy`, `np`,
# `mol_split_fun`, `astartes_kwargs`, `_unpack_astartes_result` and
# `include_val` are neither imported nor defined in the visible part of this
# notebook, so it would presumably raise NameError under "Restart & Run All".
# Its results (`train`, `val`, `test`) are not used by the visible cells below —
# confirm whether this scratch cell should be completed or removed.
#
# Deep-copy every molecule and set each atom's map number to 0 on the copy,
# so the originals in `mols` are not modified.
mols_without_atommaps = []
for mol in mols:
    copied_mol = copy.deepcopy(mol)
    for atom in copied_mol.GetAtoms():
        atom.SetAtomMapNum(0)
    mols_without_atommaps.append(copied_mol)
# Run the splitter on the copies with sampler="scaffold", then unpack the
# result into train / val / test.
result = mol_split_fun(
    np.array(mols_without_atommaps), sampler="scaffold", **astartes_kwargs
)
train, val, test = _unpack_astartes_result(result, include_val)

## Retrieve MoleculeDataset and DataLoader

In [9]:
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

# Wrap each split in a MoleculeDataset that shares the same featurizer
train_dset, val_dset, test_dset = (
    data.MoleculeDataset(split, featurizer)
    for split in (train_data, val_data, test_data)
)

# Normalize the training targets and reuse the resulting scaler for validation;
# the test targets are left as they are
scaler = train_dset.normalize_targets()
val_dset.normalize_targets(scaler)

# DataLoaders: default settings for training, shuffle disabled for val and test
train_loader = data.build_dataloader(train_dset, num_workers=num_workers)
val_loader, test_loader = (
    data.build_dataloader(dset, num_workers=num_workers, shuffle=False)
    for dset in (val_dset, test_dset)
)

## Create MPNN

In [10]:
# Encoder pieces: message passing and aggregation layers
mp, agg = nn.BondMessagePassing(), nn.MeanAggregation()

# Prediction head: regression FFN whose output transform is built from the
# scaler returned when the training targets were normalized
output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)
ffn = nn.RegressionFFN(output_transform=output_transform)

# Enable batch normalization in the model
batch_norm = True

# Metrics to report; only the first metric is used for training and early stopping
metric_list = [
    nn.metrics.RMSEMetric(),
    nn.metrics.MAEMetric(),
]

# Assemble the model and display its layer summary
mpnn = models.MPNN(mp, agg, ffn, batch_norm=batch_norm, metrics=metric_list)
mpnn

MPNN(
  (message_passing): BondMessagePassing(
    (W_i): Linear(in_features=86, out_features=300, bias=False)
    (W_h): Linear(in_features=300, out_features=300, bias=False)
    (W_o): Linear(in_features=372, out_features=300, bias=True)
    (dropout): Dropout(p=0.0, inplace=False)
    (tau): ReLU()
    (V_d_transform): Identity()
    (graph_transform): Identity()
  )
  (agg): MeanAggregation()
  (bn): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (predictor): RegressionFFN(
    (ffn): MLP(
      (0): Sequential(
        (0): Linear(in_features=300, out_features=300, bias=True)
      )
      (1): Sequential(
        (0): ReLU()
        (1): Dropout(p=0.0, inplace=False)
        (2): Linear(in_features=300, out_features=1, bias=True)
      )
    )
    (criterion): MSELoss(task_weights=[[1.0]])
    (output_transform): UnscaleTransform()
  )
  (X_d_transform): Identity()
)

## Train Model

In [11]:
# Trainer configuration: no logger, checkpointing and progress bar enabled,
# one automatically selected accelerator device, at most 50 epochs
trainer_kwargs = {
    'logger': False,
    'enable_checkpointing': True,
    'enable_progress_bar': True,
    'accelerator': 'auto',
    'devices': 1,
    'max_epochs': 50,
}
trainer = pl.Trainer(**trainer_kwargs)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [12]:
# Train the model on the training loader, validating on the validation loader
trainer.fit(mpnn, train_dataloaders=train_loader, val_dataloaders=val_loader)

You are using a CUDA device ('NVIDIA RTX A6000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [7]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name            | Type               | Params
-------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K 
1 | agg             | MeanAggregation    | 0     
2 | bn              | BatchNorm1d        | 600   
3 | predictor       | RegressionFFN      | 90.6 K
4 | X_d_transform   | Identity           | 0     
-------------------------------------------------------
318 K     Trainable params
0         Non-tra

Epoch 0: 100%|██████████████████████████████████████████| 64/64 [00:12<00:00,  5.02it/s, train_loss=0.598]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                                                   | 0/8 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                      | 0/8 [00:00<?, ?it/s][A
Validation DataLoader 0:  12%|█████▊                                        | 1/8 [00:00<00:00, 21.74it/s][A
Validation DataLoader 0:  25%|███████████▌                                  | 2/8 [00:00<00:00, 23.41it/s][A
Validation DataLoader 0:  38%|█████████████████▎                            | 3/8 [00:00<00:00, 16.85it/s][A
Validation DataLoader 0:  50%|███████████████████████                       | 4/8 [00:00<00:00, 18.21it/s][A
Validation DataLoader 0:  62%|████████████████████████████▊                 | 5/8 [00:00<00:00, 21.63it/s][A
Validation DataLoader 0:  75%|██████████████████████████████████▌           | 6/8 [00:00<

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|████████████████████████| 64/64 [00:13<00:00,  4.71it/s, train_loss=0.0893, val_loss=0.739]


## Get Model Results

In [13]:
# Evaluate the trained model on the held-out test loader and keep the returned metrics
results = trainer.test(mpnn, dataloaders=test_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [7]


Testing DataLoader 0: 100%|█████████████████████████████████████████████████| 8/8 [00:00<00:00, 70.23it/s]
