## Importing Libraries

In [1]:
import numpy as np
import pandas as pd

Importing DeepChem

In [2]:
# Download DeepChem's Colab install helper and run it. The captured log below
# shows it installing Miniconda plus rdkit, openmm and pdbfixer.
!curl -Lo conda_installer.py https://raw.githubusercontent.com/deepchem/deepchem/master/scripts/colab_install.py
import conda_installer
conda_installer.install()
# List the conda environments created by the installer.
!/root/miniconda/bin/conda info -e
# NOTE(review): `--pre` installs whichever pre-release is newest at run time
# (a 2.6.0.dev build in the captured log), so re-runs are not version-pinned.
!pip install --pre deepchem
import deepchem as dc

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  3501  100  3501    0     0  18721      0 --:--:-- --:--:-- --:--:-- 18721


add /root/miniconda/lib/python3.7/site-packages to PYTHONPATH
python version: 3.7.10
fetching installer from https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
done
installing miniconda to /root/miniconda
done
installing rdkit, openmm, pdbfixer
added omnia to channels
added conda-forge to channels
done
conda packages installation finished!


# conda environments:
#
base                  *  /root/miniconda

Collecting deepchem
[?25l  Downloading https://files.pythonhosted.org/packages/5d/6e/1b8a3295f9eac3da3813b44300a68b108ef2f2c8060a9ce12863b30c13a2/deepchem-2.6.0.dev20210520034915-py3-none-any.whl (564kB)
[K     |████████████████████████████████| 573kB 3.9MB/s 
Installing collected packages: deepchem
Successfully installed deepchem-2.6.0.dev20210520034915


In [3]:
from deepchem.models.layers import GraphConv, GraphGather
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from deepchem.feat.mol_graphs import ConvMol

# Model

In [4]:
class MyGraphConvModel(tf.keras.Model):
  """Graph-convolution regression network built from DeepChem and Keras layers.

  Layer stack, in call order:
    GraphConv(128) -> Dropout(0.2) -> GraphConv(128) -> GraphGather ->
    Dense(256) -> Dense(128) -> Dense(64) -> Dense(32) -> Dense(1)

  Every layer except the final Dense(1) is given a ReLU activation.

  The GraphGather layer is sized from the notebook-level name `batch_size`,
  which is looked up when the model is instantiated (not when the class is
  defined), so `batch_size` must be bound before `MyGraphConvModel()` is called.
  """

  def __init__(self):
    super().__init__()

    # Graph-convolution stage, with dropout between the two convolutions.
    self.gc1 = GraphConv(128, activation_fn=tf.nn.relu)
    self.drop = layers.Dropout(rate=0.2)
    self.gc2 = GraphConv(128, activation_fn=tf.nn.relu)

    # Readout layer; reads the global `batch_size`.
    self.readout = GraphGather(batch_size=batch_size, activation_fn=tf.nn.relu)

    # Fully connected head that narrows down to a single output unit.
    self.dense1 = layers.Dense(256, activation=tf.nn.relu)
    self.dense2 = layers.Dense(128, activation=tf.nn.relu)
    self.dense3 = layers.Dense(64, activation=tf.nn.relu)
    self.dense4 = layers.Dense(32, activation=tf.nn.relu)
    self.dense5 = layers.Dense(1)

  def call(self, inputs):
    """Run the forward pass and return the output of the final Dense(1) layer.

    Args:
      inputs: list whose first element is consumed by the first GraphConv and
        whose remaining elements are re-passed unchanged to the second
        GraphConv and to GraphGather. Presumably the `inputs` list yielded by
        `data_generator` -- confirm against the caller.
    """
    # Everything after the first element is forwarded as-is to the later graph layers.
    graph_structure = inputs[1:]

    hidden = self.gc1(inputs)
    hidden = self.drop(hidden)
    hidden = self.gc2([hidden] + graph_structure)
    hidden = self.readout([hidden] + graph_structure)

    for dense in (self.dense1, self.dense2, self.dense3, self.dense4):
      hidden = dense(hidden)

    return self.dense5(hidden)


# Preparing Data

Models expect arrays of numbers as their inputs, not Python objects. We must convert the ConvMol objects into the particular set of arrays expected by the GraphConv and GraphGather layers. Fortunately, the ConvMol class includes the code to do this, as well as to combine all the molecules in a batch into a single set of arrays.

The following code defines a Python generator that, for each batch of data, yields lists of inputs, labels, and weights whose values are NumPy arrays. The atom features (from `get_atom_features()`) hold a feature vector of length 75 for each atom. The other inputs are required to support minibatching in TensorFlow. `deg_slice` is an indexing convenience that makes it easy to locate atoms from all molecules with a given degree. `membership` determines the membership of atoms in molecules (atom `i` belongs to molecule `membership[i]`). The degree adjacency lists (from `get_deg_adjacency_lists()`) are adjacency lists grouped by atom degree.

In [5]:
def data_generator(dataset, epochs=1):
  """Yield `(inputs, labels, weights)` tuples, one per batch of `dataset`.

  Batches come from `dataset.iterbatches`, called with the notebook-level
  `batch_size`, the given `epochs`, `deterministic=True` and `pad_batches=True`.
  The ConvMol objects of each batch are merged with `ConvMol.agglomerate_mols`.

  Args:
    dataset: object exposing `iterbatches(batch_size, epochs, deterministic=...,
      pad_batches=...)` that yields `(X, y, w, ids)` tuples.
    epochs: number of passes over `dataset` requested from `iterbatches`.

  Yields:
    A tuple `(inputs, labels, weights)` where
      inputs  = [atom features, deg_slice, membership as an ndarray,
                 then every degree adjacency list from index 1 onwards],
      labels  = [y_b],
      weights = [w_b].
  """
  batches = dataset.iterbatches(batch_size, epochs,
                                deterministic=True, pad_batches=True)
  for X_b, y_b, w_b, _ in batches:
    multiConvMol = ConvMol.agglomerate_mols(X_b)
    # Fetch the adjacency lists once per batch rather than on every loop step.
    deg_adjacency_lists = multiConvMol.get_deg_adjacency_lists()

    inputs = [multiConvMol.get_atom_features(),
              multiConvMol.deg_slice,
              np.array(multiConvMol.membership)]
    # Index 0 is deliberately left out, as before (presumably the degree-0
    # entry -- confirm against ConvMol).
    inputs.extend(deg_adjacency_lists[1:])

    yield (inputs, [y_b], [w_b])

# Splitting Data

In [6]:
# Candidate settings. Only `splitters[0]` ('random') is used in this cell;
# `batch` and `learning` are not referenced by any code visible in this notebook.
splitters = ['random', 'scaffold', 'butina']
batch = [50,100]
learning = [0.0005, 0.0001]

# Load the BACE regression data with the 'GraphConv' featurizer and the
# selected splitter, then unpack the three splits.
tasks, datasets, transformers = dc.molnet.load_bace_regression(featurizer='GraphConv', splitter=splitters[0])
train_dataset, valid_dataset, test_dataset = datasets

# Read as a global by MyGraphConvModel.__init__ (GraphGather) and by
# data_generator (iterbatches) when they run, so this must be set before either is called.
batch_size = 50

# Fitting the model

In [7]:
import warnings
# Suppress every Python warning from here on.
warnings.filterwarnings('ignore')

metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
# NOTE(review): `callback` is built but never passed to fit_generator below, so
# no validation runs during training. It also wraps a one-shot generator rather
# than `valid_dataset` itself -- confirm what ValidationCallback expects before wiring it in.
callback =  dc.models.ValidationCallback(data_generator(valid_dataset), 1000, metrics=metric)

# Wrap the custom Keras model in a DeepChem KerasModel trained with an L2 loss.
model = dc.models.KerasModel(MyGraphConvModel(), loss=dc.models.losses.L2Loss())
# `losses` is handed to fit_generator through `all_losses`.
losses = []
# Train on 1000 epochs' worth of batches from the training split.
model.fit_generator(data_generator(train_dataset, epochs=1000), all_losses= losses)

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


0.02930114507675171

# Summary

In [8]:
# Keras' summary() prints the layer table itself and returns None, so call it
# directly; wrapping it in print() would emit a stray "None" after the table.
model.model.summary()

Model: "my_graph_conv_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
graph_conv (GraphConv)       multiple                  204288    
_________________________________________________________________
dropout (Dropout)            multiple                  0         
_________________________________________________________________
graph_conv_1 (GraphConv)     multiple                  346752    
_________________________________________________________________
graph_gather (GraphGather)   multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  65792     
_________________________________________________________________
dense_1 (Dense)              multiple                  32896     
_________________________________________________________________
dense_2 (Dense)              multiple          

# Metrics

In [9]:
from deepchem.metrics import to_one_hot

# One Metric wrapper per score reported below.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)  # square of Pearson correlation
metric2 = dc.metrics.Metric(dc.metrics.rms_score)        # root mean square error
metric3 = dc.metrics.Metric(dc.metrics.mae_score)        # mean absolute error


def _report(label, dataset, score):
  """Evaluate `model` on a fresh pass over `dataset` with `score` and print it after `label`."""
  print(label, model.evaluate_generator(data_generator(dataset), [score], transformers))


# Squared Pearson correlation on all three splits.
_report('training set score:', train_dataset, metric)
_report('test set score L2:', test_dataset, metric)
_report('valid set score L2:', valid_dataset, metric)
# Error metrics on the test split.
_report('test set score rms:', test_dataset, metric2)
_report('test set score mae:', test_dataset, metric3)
# Mean squared error on the training split.
_report('training set mse:', train_dataset, dc.metrics.Metric(dc.metrics.mean_squared_error))

training set score: {'pearson_r2_score': 0.988019369784596}
test set score L2: {'pearson_r2_score': 0.7288292436615328}
valid set score L2: {'pearson_r2_score': 0.46301749258593017}
test set score rms: {'rms_score': 0.895647689016439}
test set score mae: {'mae_score': 0.7028270603231244}
training set mse: {'mean_squared_error': 0.023010553676752327}


# Built-in Model

In [10]:
# Baseline for comparison: DeepChem's built-in GraphConvModel in regression
# mode (the first argument, 1, is presumably the number of tasks -- confirm).
# NOTE(review): this rebinds `model`, so the custom KerasModel trained earlier
# is no longer reachable under that name after this cell runs.
model = dc.models.GraphConvModel(1, mode='regression')
# Train on the training split for 100 epochs.
model.fit(train_dataset, nb_epoch=100)

0.038815505504608154

In [11]:
# Score the current `model` with squared Pearson correlation on two splits.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
for label, split in (('Training set score:', train_dataset),
                     ('Test set score:', test_dataset)):
  print(label, model.evaluate(split, [metric], transformers))

Training set score: {'pearson_r2_score': 0.9610046181403874}
Test set score: {'pearson_r2_score': 0.5832503339189358}
