In [1]:
# Import DeepChem and display the installed version so the results below can
# be tied to a specific release.
import deepchem as dc
dc.__version__

'2.5.0'

In [2]:
# Load the Delaney benchmark with graph-convolution featurization; the loader
# returns the task names, the dataset splits and the transformers it used.
tasks, datasets, transformers = dc.molnet.load_delaney(featurizer='GraphConv')
train_dataset, valid_dataset, test_dataset = datasets

# Single-task graph-convolution regressor with dropout.
model = dc.models.GraphConvModel(n_tasks=1, mode='regression', dropout=0.2)
model.fit(train_dataset, nb_epoch=100)

# Score the training and test splits with the squared Pearson correlation,
# handing the loader's transformers to evaluate().
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
train_score = model.evaluate(train_dataset, [metric], transformers)
print("Training set score:", train_score)
test_score = model.evaluate(test_dataset, [metric], transformers)
print("Test set score:", test_score)


Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.








Training set score: {'pearson_r2_score': 0.9190485185405177}
Test set score: {'pearson_r2_score': 0.6806435465990877}


In [3]:
# Predict on the first ten test molecules and print each prediction next to
# the dataset's label and the molecule id.
n_preview = 10
solubilities = model.predict_on_batch(test_dataset.X[:n_preview])
# zip() stops at the shortest input, so only the predicted rows are printed.
preview_rows = zip(test_dataset.ids, solubilities, test_dataset.y)
for molecule, solubility, test_solubility in preview_rows:
    print(solubility, test_solubility, molecule)

[-1.7491326] [-1.60114461] c1cc2ccc3cccc4ccc(c1)c2c34
[0.81845486] [0.20848251] Cc1cc(=O)[nH]c(=S)[nH]1
[-0.7372845] [-0.01602738] Oc1ccc(cc1)C2(OC(=O)c3ccccc23)c4ccc(O)cc4 
[-2.2143903] [-2.82191713] c1ccc2c(c1)cc3ccc4cccc5ccc2c3c45
[-1.2575278] [-0.52891635] C1=Cc2cccc3cccc1c23
[1.5497252] [1.10168349] CC1CO1
[-0.7153632] [-0.88987406] CCN2c1ccccc1N(C)C(=S)c3cccnc23 
[-1.0079273] [-0.52649706] CC12CCC3C(CCc4cc(O)ccc34)C2CCC1=O
[-0.990676] [-0.76358725] Cn2cc(c1ccccc1)c(=O)c(c2)c3cccc(c3)C(F)(F)F
[0.6259394] [-0.64020358] ClC(Cl)(Cl)C(NC=O)N1C=CN(C=C1)C(NC=O)C(Cl)(Cl)Cl 


In [None]:
def _load_fps(csv_file, mol_smi, pIC50):
  """Read one CSV file and return (fingerprint matrix, target column)."""
  data = pd.read_csv(csv_file)
  smiles = data[mol_smi]
  mols = [Chem.MolFromSmiles(smi) for smi in smiles]
  # MolFromSmiles returns None for strings it cannot parse; fail with a clear
  # message instead of an opaque error from the fingerprint call below.
  unparsed = [smi for smi, mol in zip(smiles, mols) if mol is None]
  if unparsed:
    raise ValueError(
        f"{len(unparsed)} SMILES in {csv_file!r} could not be parsed, e.g. {unparsed[:5]}")
  morgan_fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048) for mol in mols]
  maccs_fps = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]
  x = np.concatenate(
      [np.asarray(morgan_fps, dtype=float), np.asarray(maccs_fps, dtype=float)],
      axis=1)
  return x, data[pIC50]


def calculate_fps(train_file, test_file, mol_smi, pIC50):
  """Compute fingerprint features and targets for a train/test pair of CSV files.

  Each molecule is described by its Morgan bit vector (radius 2, 2048 bits)
  concatenated with its MACCS keys.

  Args:
    train_file: Path of the training CSV file.
    test_file: Path of the test CSV file.
    mol_smi: Name of the column holding SMILES strings.
    pIC50: Name of the column holding the target values.

  Returns:
    Tuple ``(train_x, train_y, test_x, test_y)``; the ``x`` entries are 2-D
    float arrays and the ``y`` entries are the target columns as read by
    pandas.

  Raises:
    ValueError: If any SMILES string cannot be parsed by RDKit.
  """
  # Both files go through the same helper so train and test features are
  # guaranteed to be built identically.
  train_x, train_y = _load_fps(train_file, mol_smi, pIC50)
  test_x, test_y = _load_fps(test_file, mol_smi, pIC50)
  return (train_x, train_y, test_x, test_y)

# model 1: keras

In [27]:
import numpy as np
import deepchem as dc
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys

def create_dataset(file_name, mol_smi, pIC50):
  """Build a DeepChem ``NumpyDataset`` of fingerprint features from a CSV file.

  Args:
    file_name: Path of the CSV file, read with ``pd.read_csv``.
    mol_smi: Name of the column holding SMILES strings.
    pIC50: Name of the column holding the target values.

  Returns:
    ``dc.data.NumpyDataset`` whose ``X`` concatenates, per molecule, the
    Morgan bit vector (radius 2, 2048 bits) with the MACCS keys, and whose
    ``y`` is the target column as a float array of shape ``(n, 1)``.

  Raises:
    ValueError: If any SMILES string cannot be parsed by RDKit.
  """
  data = pd.read_csv(file_name)
  data_x = data[mol_smi]
  mols = [Chem.MolFromSmiles(smi) for smi in data_x]
  # MolFromSmiles returns None for strings it cannot parse; report them
  # up front instead of failing inside the fingerprint call.
  unparsed = [smi for smi, mol in zip(data_x, mols) if mol is None]
  if unparsed:
    raise ValueError(
        f"{len(unparsed)} SMILES in {file_name!r} could not be parsed, e.g. {unparsed[:5]}")

  morgan_fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048) for mol in mols]
  morgan_fps_array = np.asarray(morgan_fps, dtype=float)
  maccs_fps = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]
  maccs_fps_array = np.asarray(maccs_fps, dtype=float)
  x = np.concatenate([morgan_fps_array, maccs_fps_array], axis=1)

  # A single target column, reshaped to a column vector.
  y = np.asarray(data[pIC50], dtype=float).reshape(-1, 1)
  return dc.data.NumpyDataset(X=x, y=y)


In [36]:
import os

# Directory holding the pre-split BACE CSV files. The default is the original
# absolute local path; set the BACE_DATA_DIR environment variable to run the
# notebook against a copy of the data stored elsewhere.
BACE_DATA_DIR = os.environ.get(
    'BACE_DATA_DIR',
    '/Users/yanlixu/Desktop/git_code/dataset/splitted_data_202112/molnet_bace')

# Fingerprint datasets with pIC50 as the regression target.
train_dataset = create_dataset(os.path.join(BACE_DATA_DIR, 'molnet_bace1_train_RandomSplitter.csv'), 'mol', 'pIC50')
test_dataset = create_dataset(os.path.join(BACE_DATA_DIR, 'molnet_bace1_test_RandomSplitter.csv'), 'mol', 'pIC50')




In [37]:
import tensorflow as tf


# Fully connected regressor: one wide ReLU layer, dropout, then a single
# linear output unit.
keras_model = tf.keras.Sequential()
keras_model.add(tf.keras.layers.Dense(1000, activation='relu'))
keras_model.add(tf.keras.layers.Dropout(rate=0.5))
keras_model.add(tf.keras.layers.Dense(1))

# Wrap the Keras network in a DeepChem model that uses an L2 loss.
model = dc.models.KerasModel(keras_model, dc.models.losses.L2Loss())

In [38]:
# Train the wrapped Keras regressor, then report the squared Pearson
# correlation on both splits.
model.fit(train_dataset, nb_epoch=50)
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print('training set score:', model.evaluate(train_dataset, [metric]))
# This call scores test_dataset, so it is labelled as the test-set score.
print('test set score:', model.evaluate(test_dataset, [metric]))

training set score: {'pearson_r2_score': 0.9542757017453345}
training set score: {'pearson_r2_score': 0.7087671060378062}


# model 2: torch

In [41]:
import torch

# Size the input layer from the training features instead of hard-coding the
# fingerprint length, so the network still matches the data if the
# featurization changes.
n_features = train_dataset.X.shape[1]

# Same architecture as the Keras model: wide ReLU layer, dropout, one output.
pytorch_model = torch.nn.Sequential(
    torch.nn.Linear(n_features, 1000),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.5),
    torch.nn.Linear(1000, 1)
)
model = dc.models.TorchModel(pytorch_model, dc.models.losses.L2Loss())

model.fit(train_dataset, nb_epoch=50)
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print('training set score:', model.evaluate(train_dataset, [metric]))
print('test set score:', model.evaluate(test_dataset, [metric]))

training set score: {'pearson_r2_score': 0.9490645749196438}
test set score: {'pearson_r2_score': 0.7010280841012867}


# model 3: keras (classification)

In [50]:
def create_dataset(file_name, mol_smi, cls):
  """Build a DeepChem ``NumpyDataset`` of fingerprint features from a CSV file.

  Args:
    file_name: Path of the CSV file, read with ``pd.read_csv``.
    mol_smi: Name of the column holding SMILES strings.
    cls: Name of the column holding the labels.

  Returns:
    ``dc.data.NumpyDataset`` whose ``X`` concatenates, per molecule, the
    Morgan bit vector (radius 2, 2048 bits) with the MACCS keys, and whose
    ``y`` is the label column as a float array of shape ``(n, 1)``.

  Raises:
    ValueError: If any SMILES string cannot be parsed by RDKit.
  """
  data = pd.read_csv(file_name)
  data_x = data[mol_smi]
  mols = [Chem.MolFromSmiles(smi) for smi in data_x]
  # MolFromSmiles returns None for strings it cannot parse; report them
  # up front instead of failing inside the fingerprint call.
  unparsed = [smi for smi, mol in zip(data_x, mols) if mol is None]
  if unparsed:
    raise ValueError(
        f"{len(unparsed)} SMILES in {file_name!r} could not be parsed, e.g. {unparsed[:5]}")

  morgan_fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048) for mol in mols]
  morgan_fps_array = np.asarray(morgan_fps, dtype=float)
  maccs_fps = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]
  maccs_fps_array = np.asarray(maccs_fps, dtype=float)
  x = np.concatenate([morgan_fps_array, maccs_fps_array], axis=1)

  # A single label column, reshaped to a column vector.
  y = np.asarray(data[cls], dtype=float).reshape(-1, 1)
  return dc.data.NumpyDataset(X=x, y=y)

In [51]:
import os

# Directory holding the pre-split BACE CSV files. The default is the original
# absolute local path; set the BACE_DATA_DIR environment variable to run the
# notebook against a copy of the data stored elsewhere.
BACE_DATA_DIR = os.environ.get(
    'BACE_DATA_DIR',
    '/Users/yanlixu/Desktop/git_code/dataset/splitted_data_202112/molnet_bace')

# Fingerprint datasets with the 'Class' column as the label.
train_dataset = create_dataset(os.path.join(BACE_DATA_DIR, 'molnet_bace1_train_RandomSplitter.csv'), 'mol', 'Class')
test_dataset = create_dataset(os.path.join(BACE_DATA_DIR, 'molnet_bace1_test_RandomSplitter.csv'), 'mol', 'Class')


In [53]:
class ClassificationModel(tf.keras.Model):
    """Two dense layers producing a single logit, returned with its sigmoid."""

    def __init__(self):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(1000, activation='relu')
        self.dense2 = tf.keras.layers.Dense(1)

    def call(self, inputs, training=False):
        """Run the forward pass.

        ``tf.nn.dropout(..., 0.5)`` is applied to the hidden activations only
        when ``training`` is truthy.

        Returns:
            A ``(output, logits)`` tuple: the sigmoid of the logits, followed
            by the raw logits from the final dense layer.
        """
        hidden = self.dense1(inputs)
        hidden = tf.nn.dropout(hidden, 0.5) if training else hidden
        logits = self.dense2(hidden)
        return tf.nn.sigmoid(logits), logits

keras_model = ClassificationModel()
# One entry per value returned by ClassificationModel.call, in the same
# order: (output, logits).
output_types = ['prediction', 'loss']
model = dc.models.KerasModel(
    keras_model,
    dc.models.losses.SigmoidCrossEntropy(),
    output_types=output_types,
)

# Train, then report ROC AUC on both splits.
model.fit(train_dataset, nb_epoch=100)
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
train_score = model.evaluate(train_dataset, [metric])
print('training set score:', train_score)
test_score = model.evaluate(test_dataset, [metric])
print('test set score:', test_score)

training set score: {'roc_auc_score': 0.9998816131447954}
test set score: {'roc_auc_score': 0.87855325495701}


# model 5: GraphConvModel

In [54]:
#train_data = pd.read_csv('/Users/yanlixu/Desktop/git_code/dataset/splitted_data_202112/molnet_bace/molnet_bace1_train_RandomSplitter.csv')
#test_data = pd.read_csv('/Users/yanlixu/Desktop/git_code/dataset/splitted_data_202112/molnet_bace/molnet_bace1_test_RandomSplitter.csv')





In [56]:
import deepchem as dc
from deepchem.models.graph_models import GraphConvModel

In [57]:
def featurize_data(file_name, mol, cls):
    """Build a DeepChem ``NumpyDataset`` of ``ConvMolFeaturizer`` features from a CSV file.

    Args:
        file_name: Path of the CSV file, read with ``pd.read_csv``.
        mol: Name of the column holding SMILES strings.
        cls: Name of the column holding the target values; the notebook calls
            this with both the 'Class' and the 'pIC50' columns.

    Returns:
        ``dc.data.NumpyDataset`` whose ``X`` is the featurizer output for
        each molecule and whose ``y`` is the target column as a float array
        of shape ``(n, 1)``.
    """
    data = pd.read_csv(file_name)
    # Use the caller-supplied column name rather than a hard-coded 'mol', so
    # the ``mol`` argument is actually honoured.
    mols = [Chem.MolFromSmiles(smi) for smi in data[mol]]
    featurizer = dc.feat.ConvMolFeaturizer()
    features = featurizer.featurize(mols)
    y = np.asarray(data[cls], dtype=float).reshape(-1, 1)
    return dc.data.NumpyDataset(X=features, y=y)
 

In [68]:
import os

# Directory holding the pre-split BACE CSV files. The default is the original
# absolute local path; set the BACE_DATA_DIR environment variable to run the
# notebook against a copy of the data stored elsewhere.
BACE_DATA_DIR = os.environ.get(
    'BACE_DATA_DIR',
    '/Users/yanlixu/Desktop/git_code/dataset/splitted_data_202112/molnet_bace')

# Graph-convolution datasets with the 'Class' column as the label.
train_dataset = featurize_data(os.path.join(BACE_DATA_DIR, 'molnet_bace1_train_RandomSplitter.csv'), 'mol', 'Class')

test_dataset = featurize_data(os.path.join(BACE_DATA_DIR, 'molnet_bace1_test_RandomSplitter.csv'), 'mol', 'Class')


In [69]:
batch_size = 50
# ROC AUC, averaged across tasks with np.mean.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, task_averager=np.mean, mode='classification')

# Single-task graph-convolution classifier.
model = dc.models.GraphConvModel(n_tasks=1, batch_size=batch_size, mode='classification')
model.fit(train_dataset, nb_epoch=10)

train_scores = model.evaluate(train_dataset, [metric])
print(train_scores)
test_scores = model.evaluate(test_dataset, [metric])
print(test_scores)







{'mean-roc_auc_score': 0.8685066186511607}
{'mean-roc_auc_score': 0.7954465695736095}


In [71]:
import os

# Directory holding the pre-split BACE CSV files. The default is the original
# absolute local path; set the BACE_DATA_DIR environment variable to run the
# notebook against a copy of the data stored elsewhere.
BACE_DATA_DIR = os.environ.get(
    'BACE_DATA_DIR',
    '/Users/yanlixu/Desktop/git_code/dataset/splitted_data_202112/molnet_bace')

# Graph-convolution datasets with pIC50 as the regression target.
train_dataset = featurize_data(os.path.join(BACE_DATA_DIR, 'molnet_bace1_train_RandomSplitter.csv'), 'mol', 'pIC50')

test_dataset = featurize_data(os.path.join(BACE_DATA_DIR, 'molnet_bace1_test_RandomSplitter.csv'), 'mol', 'pIC50')


In [76]:
batch_size = 50
# R^2 score, averaged across tasks with np.mean.
metric = dc.metrics.Metric(dc.metrics.r2_score, task_averager=np.mean, mode='regression')

# Single-task graph-convolution regressor.
model = dc.models.GraphConvModel(n_tasks=1, batch_size=batch_size, mode='regression')
model.fit(train_dataset, nb_epoch=50)

train_scores = model.evaluate(train_dataset, [metric])
print(train_scores)
test_scores = model.evaluate(test_dataset, [metric])
print(test_scores)







{'mean-r2_score': 0.8948328917173868}
{'mean-r2_score': 0.6413872676940158}
