In [3]:
cd ../../ez_chem/

/Users/dongdongzhang/Desktop/group/databases/EzChem/ez_chem


## Create molecular graphs for the Delaney dataset.

In [2]:
from featurization import *

In [3]:
import torch
import pandas as pd
import numpy as np

Load the original Delaney file.

In [5]:
delaney = pd.read_csv('../datasets/deepchem/delaney.csv')

In [6]:
delaney.head()

Unnamed: 0,smiles,logSolubility
0,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...,-0.77
1,Cc1occc1C(=O)Nc2ccccc2,-3.3
2,CC(C)=CCCC(C)=CC(=O),-2.06
3,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43,-7.87
4,c1ccsc1,-1.33


In [7]:
delaney.shape

(1128, 2)

We will define a function to process the compounds. This function is copied from the script file 'makeGraph.py'. 

In [8]:
def genGraphs(all_data, smilesColumn, targetColumn):
    """Turn every row of a table into a dictionary describing one molecular graph.

    Each SMILES string is featurized with ``MolGraph(smiles, '1-GNN')`` and the
    resulting attributes are packed into torch tensors.

    Args:
        all_data: Table indexable by column name (the notebook passes a
            pandas DataFrame).
        smilesColumn: Name of the column that holds the SMILES strings.
        targetColumn: Name of the column that holds the target value.

    Returns:
        A list with one dict per row, in row order, with the keys
        'x', 'edge_attr', 'edge_index', 'smiles', 'id' and 'y'.
    """
    smiles_target_pairs = zip(all_data[smilesColumn], all_data[targetColumn])
    graph_records = []
    for position, (smiles, target) in enumerate(smiles_target_pairs):
        featurized = MolGraph(smiles, '1-GNN')
        graph_records.append({
            'x': torch.FloatTensor(featurized.f_atoms),  # atom features
            'edge_attr': torch.FloatTensor(featurized.real_f_bonds),  # bond features
            # at_begin followed by at_end, folded into two rows
            # (assumes both are 1-D and of equal length -- TODO confirm).
            'edge_index': torch.LongTensor(
                np.concatenate([featurized.at_begin, featurized.at_end]).reshape(2, -1)
            ),
            'smiles': smiles,
            'id': torch.FloatTensor([position]),  # running row position, stored as a float
            'y': torch.FloatTensor([target]),
        })
        # Progress message on positions 0, 100, 200, ...
        if position % 100 == 0:
            print('Finish processing {} compounds'.format(position))
    print('Done.')

    return graph_records

In [9]:
graphs = genGraphs(delaney, 'smiles', 'logSolubility')

Finish processing 0 compounds
Finish processing 100 compounds
Finish processing 200 compounds
Finish processing 300 compounds
Finish processing 400 compounds
Finish processing 500 compounds
Finish processing 600 compounds
Finish processing 700 compounds
Finish processing 800 compounds
Finish processing 900 compounds
Finish processing 1000 compounds
Finish processing 1100 compounds
Done.


In [10]:
torch.save(graphs, '../examples/propertyPrediction/delaney/raw/temp.pt')

# Create data loader for training.

Once the file storing the molecular graphs for the dataset has been prepared, we can create the data loaders used in the next steps.

In [4]:
from data import *

In our framework, we need to define the sizes of the train and validation sets explicitly, because this is convenient for cross-validation (CV).

In [5]:
# Settings consumed when building the data loaders.
config = dict(
    dataset='deepchem/delaney',  # dataset name
    model='1-GNN',               # model
    train_type='from_scratch',
    normalize=False,
    train_size=902,              # size of the training split
    val_size=113,                # size of the validation split
    batch_size=32,
    data_path='../examples/propertyPrediction/delaney/',
)

In [6]:
loader = get_data_loader(config)

Here we can check whether the loaders for the train/valid sets were created correctly.

In [68]:
# Report the number of examples in each split, one count per line.
# Kept as two separate statements: joining the calls with a comma makes the
# cell's last expression a tuple, which Jupyter echoes as `(None, None)`.
print(len(loader.train_loader.dataset))
print(len(loader.val_loader.dataset))

902
113


(None, None)

# Prepare models for training

First, we need to declare a dictionary that controls how the model is built and trained.

In [10]:
from models import *

In [26]:
from trainer import *

In [30]:
from helper import *

In [37]:
# Settings that control model construction and training.
config_train = dict(
    num_layer=3,            # atom embedding layers
    emb_dim=64,             # embedding dimension
    NumOutLayers=3,         # number of read-out layers
    num_tasks=1,
    pooling='sum',
    gnn_type='gcn',
    optimizer='adam',
    lr=0.001,
    loss='l2',
    metrics='l2',
    weights='xavier_norm',  # weights initialization method
    taskType='single',
    device=torch.device('cpu'),
)

In [38]:
config.update(config_train)

In [39]:
config

{'dataset': 'deepchem/delaney',
 'model': '1-GNN',
 'train_type': 'from_scratch',
 'normalize': False,
 'train_size': 902,
 'val_size': 113,
 'batch_size': 32,
 'data_path': '../examples/propertyPrediction/delaney/',
 'num_layer': 3,
 'emb_dim': 64,
 'NumOutLayers': 3,
 'num_tasks': 1,
 'pooling': 'sum',
 'gnn_type': 'gcn',
 'optimizer': 'adam',
 'lr': 0.001,
 'loss': 'l2',
 'metrics': 'l2',
 'weights': 'xavier_norm',
 'device': device(type='cpu'),
 'taskType': 'single'}

Get the corresponding model: the base model is GCN and the overall model architecture is 1-GNN.

In [70]:
genModel = get_model(config)

In [72]:
genModel

models.GNN_1

In [14]:
args = objectview(config)

In [60]:
model = genModel(args.num_layer, args.emb_dim, args.NumOutLayers, args.num_tasks, graph_pooling=args.pooling, gnn_type=args.gnn_type)

In [61]:
model

GNN_1(
  (gnn): GNN(
    (x_embedding1): Sequential(
      (0): Linear(in_features=40, out_features=64, bias=True)
      (1): ReLU()
    )
    (gnns): ModuleList(
      (0): GCNConv(
        (edge_embedding1): Embedding(6, 64)
        (edge_embedding2): Embedding(3, 64)
      )
      (1): GCNConv(
        (edge_embedding1): Embedding(6, 64)
        (edge_embedding2): Embedding(3, 64)
      )
      (2): GCNConv(
        (edge_embedding1): Embedding(6, 64)
        (edge_embedding2): Embedding(3, 64)
      )
    )
    (batch_norms): ModuleList(
      (0): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (outLayers): ModuleList(
    (0): Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): ReLU()
    )
    (1): Sequential(
      (0): Lin

In [62]:
model = init_weights(model, config)

In [63]:
optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])

In [64]:
# Training loop: on every epoch, call train() on the training loader and then
# score the model on both the training and the validation loaders.
# range(1, 100) yields epochs 1 through 99 (99 iterations in total).
for epoch in range(1, 100):
    # The return value of train() is not used here, so it is discarded.
    # (presumably train() performs one pass over the loader -- confirm in trainer)
    _ = train(model, optimizer, loader.train_loader, config)
    train_error = test(model, loader.train_loader, config)
    # NOTE: this value is computed on loader.val_loader (the validation split),
    # even though the variable and the printed message call it "test".
    test_error = test(model, loader.val_loader, config)
    print("Train loss is {} and test loss is {} at epoch {}".format(train_error, test_error, epoch))
    

Train loss is 1.515937143804272 and test loss is 1.5047292860018413 at epoch 1
Train loss is 1.2208003867502462 and test loss is 1.6386668554145036 at epoch 2
Train loss is 1.0610275162949747 and test loss is 1.4435607191580964 at epoch 3
Train loss is 1.165612436998998 and test loss is 1.5613314264227747 at epoch 4
Train loss is 0.8835743680222897 and test loss is 1.2666348796336075 at epoch 5
Train loss is 0.831746621317948 and test loss is 1.1807890037896873 at epoch 6
Train loss is 0.7610444825965573 and test loss is 1.1733822602579473 at epoch 7
Train loss is 0.735870380945031 and test loss is 1.1663826982863277 at epoch 8
Train loss is 0.9094318241049489 and test loss is 1.3051373283067749 at epoch 9
Train loss is 0.7911411515466328 and test loss is 1.2335417144024596 at epoch 10
Train loss is 0.7085760532848445 and test loss is 1.1155068620806687 at epoch 11
Train loss is 0.6889282043793368 and test loss is 0.9550970032743296 at epoch 12
Train loss is 0.7626299762948007 and test