# Imports

In [1]:
# Standard library imports
import math
import os
from pathlib import Path

# Third-party imports
import numpy as np
import pandas as pd
import torch
from chemprop import data, featurizers, models, nn
from lightning import pytorch as pl

# Local imports

# CUDA
# Pin the visible GPU *before* the first CUDA query. CUDA_VISIBLE_DEVICES is
# read when CUDA is initialised in this process, so assigning it after
# torch.cuda.is_available() has run may be silently ignored.
# NOTE(review): index '1' selects the second physical GPU, which assumes the
# machine has at least two — confirm for the target host.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
print(f"CUDA available: {torch.cuda.is_available()}")

CUDA available: True


# Chemprop Model

## Load Chemprop data & details

In [2]:
# NOTE(review): the next cell repeats this one and additionally creates the
# 'log_value' column — consider keeping only one of the two.

# Column roles in the input table
smiles_column = 'smiles'
target_columns = ['log_value']

# Number of worker processes handed to the data loaders built later on
num_workers = 8

# The input CSV is resolved relative to the current working directory
chemprop_dir = Path.cwd()
input_path = chemprop_dir.joinpath('data', 'regression.csv')

# Read the table, using the first CSV column as the row index, and preview it
df_input = pd.read_csv(filepath_or_buffer=input_path, index_col=0)
df_input.head()

NameError: name 'np' is not defined

In [27]:
# Paths and loader settings (CSV is resolved relative to the working directory)
chemprop_dir = Path.cwd()
input_path = chemprop_dir / 'data' / 'regression.csv'
num_workers = 8

# Column roles in the input table
smiles_column = 'smiles'
target_columns = ['log_value']

# Re-read the raw table so this cell is idempotent on re-run
df_input = pd.read_csv(input_path, index_col=0)

# log10 is undefined for values <= 0 and would silently yield -inf / NaN
# regression targets; fail loudly instead. Missing values (NaN) are let
# through unchanged, exactly as before.
assert not (df_input['value'] <= 0).any(), \
    "'value' contains non-positive entries; log10 is undefined for them"

# Vectorised transform (same result as the element-wise .apply(np.log10))
# NOTE(review): the saved output of this cell shows log_value ≈ ln(value)
# (e.g. 545.0 -> 6.30) rather than log10 (≈ 2.74). Re-run the cell and confirm
# which base is intended.
df_input['log_value'] = np.log10(df_input['value'])
df_input.head()

Unnamed: 0,smiles,value,log_value
8,Cn1c(N2CCO[C@@H]3COC[C@H]32)nc(-c2ccncc2F)cc1=O,545.0,6.300786
9,CC(C)C[C@H](C=O)NC(=O)[C@@H](NS(=O)(=O)c1ccc(F...,531.0,6.274762
10,C[C@@H]1[C@@H](C)OCCN1c1nc(-c2ccncc2F)cc(=O)n1C,485.0,6.184149
11,c1ccc2nc(COc3ccc(-c4n[nH]cc4Cc4ccncc4)cc3)ccc2c1,450.0,6.109248
12,Cn1c(N2CCO[C@@H]3CCC[C@H]32)nc(-c2ccncc2F)cc1=O,445.0,6.098074


## Get molecule data and split

In [28]:
# Pull the SMILES strings and the target matrix out of the frame, then build
# one MoleculeDatapoint per (SMILES, target-row) pair
smis = df_input[smiles_column].values
ys = df_input[target_columns].values
all_data = list(map(data.MoleculeDatapoint.from_smi, smis, ys))

# Random 80 / 10 / 10 split into train, validation and test subsets
mols = [datapoint.mol for datapoint in all_data]
train_indices, val_indices, test_indices = data.make_split_indices(
    mols,
    'random',
    (0.8, 0.1, 0.1),
)
train_data, val_data, test_data = data.split_data_by_indices(all_data, train_indices, val_indices, test_indices)

## Retrieve MoleculeDataset and DataLoader

In [29]:
# One featurizer instance is shared by all three datasets
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

# Datasets for each split
train_dset = data.MoleculeDataset(train_data, featurizer)
val_dset = data.MoleculeDataset(val_data, featurizer)
test_dset = data.MoleculeDataset(test_data, featurizer)

# Target scaling: the scaler comes from the training set and is re-used for
# the validation set; normalize_targets is not called on the test set
scaler = train_dset.normalize_targets()
val_dset.normalize_targets(scaler)

# DataLoaders: the training loader keeps build_dataloader's defaults, while
# validation and test are iterated without shuffling
train_loader = data.build_dataloader(train_dset, num_workers=num_workers)
val_loader, test_loader = (
    data.build_dataloader(val_dset, num_workers=num_workers, shuffle=False),
    data.build_dataloader(test_dset, num_workers=num_workers, shuffle=False),
)



## Create MPNN

In [30]:
# Seed the Python, NumPy and PyTorch RNGs before any layer is constructed so
# that a fresh Restart & Run All starts from the same initial weights;
# workers=True also derives seeds for the dataloader worker processes.
pl.seed_everything(0, workers=True)

# Message passing and aggregation
mp = nn.BondMessagePassing()
agg = nn.MeanAggregation()

# Feed-forward network; its output transform is built from the scaler that
# was fitted on the training targets
output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)
ffn = nn.RegressionFFN(output_transform=output_transform)

# Batch normalization
batch_norm = True

# Metrics — only the first metric is used for training and early stopping
metric_list = [nn.metrics.RMSEMetric(), nn.metrics.MAEMetric()]

# Create model and display its layer summary
mpnn = models.MPNN(mp, agg, ffn, batch_norm, metric_list)
mpnn

MPNN(
  (message_passing): BondMessagePassing(
    (W_i): Linear(in_features=86, out_features=300, bias=False)
    (W_h): Linear(in_features=300, out_features=300, bias=False)
    (W_o): Linear(in_features=372, out_features=300, bias=True)
    (dropout): Dropout(p=0.0, inplace=False)
    (tau): ReLU()
    (V_d_transform): Identity()
    (graph_transform): Identity()
  )
  (agg): MeanAggregation()
  (bn): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (predictor): RegressionFFN(
    (ffn): MLP(
      (0): Sequential(
        (0): Linear(in_features=300, out_features=300, bias=True)
      )
      (1): Sequential(
        (0): ReLU()
        (1): Dropout(p=0.0, inplace=False)
        (2): Linear(in_features=300, out_features=1, bias=True)
      )
    )
    (criterion): MSELoss(task_weights=[[1.0]])
    (output_transform): UnscaleTransform()
  )
  (X_d_transform): Identity()
)

## Train Model

In [31]:
# Configure the Lightning trainer that will fit and evaluate the model
trainer = pl.Trainer(
    accelerator='auto',         # hardware backend chosen by Lightning
    devices=1,                  # run on a single device
    max_epochs=50,              # upper bound on training epochs
    enable_checkpointing=True,
    enable_progress_bar=True,
    logger=False,               # no experiment logger attached
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [32]:
# Train model: fit on the training loader, validating on the validation loader
trainer.fit(mpnn, train_dataloaders=train_loader, val_dataloaders=val_loader)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name            | Type               | Params
-------------------------------------------------------
0 | message_passing | BondMessagePassing | 227 K 
1 | agg             | MeanAggregation    | 0     
2 | bn              | BatchNorm1d        | 600   
3 | predictor       | RegressionFFN      | 90.6 K
4 | X_d_transform   | Identity           | 0     
-------------------------------------------------------
318 K     Trainable params
0         Non-trainable params
318 K     Total params
1.276     Total estimated model params size (MB)


Epoch 0: 100%|██████████████████████████████████████████| 64/64 [00:06<00:00,  9.49it/s, train_loss=0.651]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                                                   | 0/8 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                      | 0/8 [00:00<?, ?it/s][A
Validation DataLoader 0:  12%|█████▊                                        | 1/8 [00:00<00:00, 85.88it/s][A
Validation DataLoader 0:  25%|███████████▌                                  | 2/8 [00:00<00:00, 58.02it/s][A
Validation DataLoader 0:  38%|█████████████████▎                            | 3/8 [00:00<00:00, 71.23it/s][A
Validation DataLoader 0:  50%|███████████████████████                       | 4/8 [00:00<00:00, 67.23it/s][A
Validation DataLoader 0:  62%|████████████████████████████▊                 | 5/8 [00:00<00:00, 75.87it/s][A
Validation DataLoader 0:  75%|██████████████████████████████████▌           | 6/8 [00:00<

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|█████████████████████████| 64/64 [00:09<00:00,  6.68it/s, train_loss=0.106, val_loss=0.742]


## Get Model Results

In [33]:
# Evaluate the trained model on the test loader and keep the returned metrics
results = trainer.test(mpnn, dataloaders=test_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]


Testing DataLoader 0: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 113.25it/s]
