In [1]:
! wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh
! chmod +x Miniconda3-py37_4.8.2-Linux-x86_64.sh
! bash ./Miniconda3-py37_4.8.2-Linux-x86_64.sh -b -f -p /usr/local
import sys
sys.path.append('/usr/local/lib/python3.7/site-packages/')
! conda install -c rdkit rdkit -y
!pip install --pre deepchem

--2021-12-31 09:05:12--  https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh
Resolving repo.anaconda.com (repo.anaconda.com)... 104.16.130.3, 104.16.131.3, 2606:4700::6810:8203, ...
Connecting to repo.anaconda.com (repo.anaconda.com)|104.16.130.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 85055499 (81M) [application/x-sh]
Saving to: ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh’


2021-12-31 09:05:13 (176 MB/s) - ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh’ saved [85055499/85055499]

PREFIX=/usr/local
Unpacking payload ...
Collecting package metadata (current_repodata.json): - \ | done
Solving environment: - done

## Package Plan ##

  environment location: /usr/local

  added / updated specs:
    - _libgcc_mutex==0.1=main
    - asn1crypto==1.3.0=py37_0
    - ca-certificates==2020.1.1=0
    - certifi==2019.11.28=py37_0
    - cffi==1.14.0=py37h2e261b9_0
    - chardet==3.0.4=py37_1003
    - conda-package-handling==1.6.0=py37h7b6447c_0
    

# deepchem keras

In [6]:
import numpy as np
import deepchem as dc
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys
import tensorflow as tf
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [3]:
def create_dataset(file_name, mol_smi, cls):
  data = pd.read_csv(file_name)
  data_x = data[mol_smi]
  mols = [Chem.MolFromSmiles(smi) for smi in data_x]
  morgan_fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, 2048) for mol in mols]
  morgan_fps_array = np.asarray(morgan_fps, dtype=float)
  maccs_fps = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]
  maccs_fps_array = np.asarray(maccs_fps, dtype=float)
  x = np.concatenate([morgan_fps_array, maccs_fps_array],axis=1)
  y = data[cls]
  y = np.asarray(y, dtype=float).reshape(-1, 1)
  dataset = dc.data.NumpyDataset(X=x, y=y)
  return dataset

In [4]:
class ClassificationModel(tf.keras.Model):   
    def __init__(self):
        super(ClassificationModel, self).__init__()
        self.dense1 = tf.keras.layers.Dense(1000, activation='relu')
        self.dense2 = tf.keras.layers.Dense(1)

    def call(self, inputs, training=False):
        y = self.dense1(inputs)
        if training:
            y = tf.nn.dropout(y, 0.5)
        logits = self.dense2(y)
        output = tf.nn.sigmoid(logits)
        return output, logits

In [9]:
def keras_cls(train_fin, test_fin, mol_smi, classification):
  train_dataset = create_dataset('molnet_bace1_train_RandomSplitter.csv', 'mol', 'Class')
  test_dataset = create_dataset('molnet_bace1_test_RandomSplitter.csv', 'mol', 'Class') 
  res_list = []
  for epoch in [10, 20]:
    keras_model = ClassificationModel()
    output_types = ['prediction', 'loss']
    model = dc.models.KerasModel(keras_model, dc.models.losses.SigmoidCrossEntropy(), output_types=output_types)
    model.fit(train_dataset, nb_epoch=10)
    # training result evaluation
    train_predict = model.predict(train_dataset)
    train_pred_prob_list = train_predict.reshape(-1).tolist()
    train_pred_class =  [1 if item > 0.5 else 0 for item in train_pred_prob_list]
    train_dataset_y = train_dataset.y.reshape(-1).tolist()
    train_roc_auc = roc_auc_score(train_dataset_y, train_pred_prob_list)
    train_acc = accuracy_score(train_dataset_y, train_pred_class)
    train_sen = recall_score(train_dataset_y, train_pred_class)
    tn, fp, fn, tp = confusion_matrix(train_dataset_y, train_pred_class).ravel()
    train_spc = tn/(tn+fp)
    # test result evaluation
    test_predict = model.predict(test_dataset)
    test_pred_prob_list = test_predict.reshape(-1).tolist()
    test_pred_class =  [1 if item > 0.5 else 0 for item in test_pred_prob_list]
    test_dataset_y = test_dataset.y.reshape(-1).tolist()
    test_roc_auc = roc_auc_score(test_dataset_y, test_pred_prob_list)
    test_acc = accuracy_score(test_dataset_y, test_pred_class)
    test_sen = recall_score(test_dataset_y, test_pred_class)
    tn, fp, fn, tp = confusion_matrix(test_dataset_y, test_pred_class).ravel()
    test_spc = tn/(tn+fp)

    keras_perf = {'roc_auc':[train_roc_auc, test_roc_auc],
                'acc':[train_acc, test_acc],
                'sen':[train_sen, test_sen],
                'spc':[train_spc, test_spc]}
    res_list.append(pd.DataFrame.from_dict(keras_perf))
  final_res = pd.concat(res_list, keys = [10, 20])
  return final_res



In [10]:
molnet_bace1 = keras_cls('molnet_bace1_train_RandomSplitter.csv','molnet_bace1_test_RandomSplitter.csv', 'mol', 'Class')

In [None]:
#molnet_bace1 = gnn_reg('molnet_bace1_train_RandomSplitter.csv','molnet_bace1_test_RandomSplitter.csv', 'mol', 'pIC50')
chembl_bace1 = keras_cls('chembl_bace1_train_RandomSplitter.csv', 'chembl_bace1_test_RandomSplitter.csv', 'canonical_smiles', 'Classification')
chembl_cdk2 = keras_cls('chembl_cdk2_train_RandomSplitter.csv', 'chembl_cdk2_test_RandomSplitter.csv',  'canonical_smiles', 'Classification')
chembl_ach = keras_cls('chembl_ach_train_RandomSplitter.csv', 'chembl_ach_test_RandomSplitter.csv',  'canonical_smiles', 'Classification')

In [None]:
with pd.ExcelWriter('deepchem_keras_cls.xlsx') as writer:  
    molnet_bace1.to_excel(writer, sheet_name='molnet_bace1')
    chembl_bace1.to_excel(writer, sheet_name='chembl_bace1')
    chembl_cdk2.to_excel(writer, sheet_name='chembl_cdk2')
    chembl_ach.to_excel(writer, sheet_name='chembl_ach')

# deepchem GNN

In [None]:
import deepchem as dc
from deepchem.models.graph_models import GraphConvModel
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [None]:
def featurize_data(file_name, mol, cls):
    data = pd.read_csv(file_name)
    mols = [Chem.MolFromSmiles(smi) for smi in data[mol]]
    feature = dc.feat.ConvMolFeaturizer()
    features = feature.featurize(mols)
    y = data[cls]
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    dataset = dc.data.NumpyDataset(X=features, y=y)
    return dataset

In [None]:
def gnn_cls(train_fin, test_fin, mol_smi, classification):
  train_dataset = featurize_data(train_fin, mol_smi, classification)
  test_dataset = featurize_data(train_fin, mol_smi, classification) 
  res_list = []
  for epoch in [10, 20, 30, 40, 50]:
    model = dc.models.GraphConvModel(1, batch_size=50,  mode='classification')
    model.fit(train_dataset, nb_epoch=epoch)
    # training result evaluation
    train_predict = model.predict(train_dataset)
    train_predict_new = train_predict.reshape(-1, 2)
    train_pred_prob_list = [item[1] for item in train_predict_new]
    train_pred_class = [1 if item > 0.5 else 0 for item in train_pred_prob_list]
    train_dataset_y = train_dataset.y.reshape(-1).tolist()
    train_roc_auc = roc_auc_score(train_dataset_y, train_pred_prob_list)
    train_acc = accuracy_score(train_dataset_y, train_pred_class)
    train_sen = recall_score(train_dataset_y, train_pred_class)
    tn, fp, fn, tp = confusion_matrix(train_dataset_y, train_pred_class).ravel()
    train_spc = tn/(tn+fp)
    # testing result evaluation
    test_predict = model.predict(test_dataset)
    test_predict_new = test_predict.reshape(-1, 2)
    test_pred_prob_list = [item[1] for item in test_predict_new]
    test_pred_class = [1 if item > 0.5 else 0 for item in test_pred_prob_list]
    test_dataset_y = test_dataset.y.reshape(-1).tolist()
    test_roc_auc = roc_auc_score(test_dataset_y, test_pred_prob_list)
    test_acc = accuracy_score(test_dataset_y, test_pred_class)
    test_sen = recall_score(test_dataset_y, test_pred_class)
    tn, fp, fn, tp = confusion_matrix(test_dataset_y, test_pred_class).ravel()
    test_spc = tn/(tn+fp)

    gnn_perf = {'roc_auc':[train_roc_auc, test_roc_auc],
                'acc':[train_acc, test_acc],
                'sen':[train_sen, test_sen],
                'spc':[train_spc, test_spc]}
    res_list.append(pd.DataFrame.from_dict(gnn_perf))
  final_res = pd.concat(res_list, keys = [10, 20, 30, 40, 50])
  return final_res

In [None]:
molnet_bace1 = gnn_cls('molnet_bace1_train_RandomSplitter.csv', 'molnet_bace1_test_RandomSplitter.csv', 'mol', 'Class')
chembl_bace1 = gnn_reg('chembl_bace1_train_RandomSplitter.csv', 'chembl_bace1_test_RandomSplitter.csv', 'mol', 'Class')
chembl_cdk2 = gnn_reg('chembl_cdk2_train_RandomSplitter.csv', 'chembl_cdk2_test_RandomSplitter.csv',  'mol', 'Class')
chembl_ach = gnn_reg('chembl_ach_train_RandomSplitter.csv', 'chembl_ach_test_RandomSplitter.csv',  'mol', 'Class')

[    roc_auc       acc       sen       spc
 0  0.863154  0.643802  0.233696  0.987842
 1  0.863154  0.643802  0.233696  0.987842]