In [1]:
!pip install --pre deepchem[tensorflow]
!pip install tf_keras

import os
os.environ['TF_USE_LEGACY_KERAS'] = 'True'

import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")



In [2]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
import deepchem as dc

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


In [20]:
class data:
  """Holds one featurized MoleculeNet dataset together with its splits.

  All dataset attributes start as ``None`` and are filled in by ``load_lipo``
  or ``load_sol``. ``kind`` records which dataset is currently held
  (``'lipo'`` or ``'sol'``).
  """

  # Dataset kind -> name of the loader function looked up on ``dc.molnet``.
  _LOADER_NAMES = {'lipo': 'load_lipo', 'sol': 'load_delaney'}

  def __init__(self,featurizer):
    """Remember the featurizer; nothing is loaded until a ``load_*`` call.

    Args:
      featurizer: forwarded unchanged as the ``featurizer`` argument of the
        ``dc.molnet`` loader (the notebook passes both featurizer objects and
        the string ``'GraphConv'``).
    """
    self.featurizer = featurizer
    self.datasets = None
    self.train_dataset = None
    self.valid_dataset = None
    self.test_dataset = None
    self.transformers = None
    self.kind = None

  def _load(self, kind):
    """Load dataset ``kind`` unless that same dataset is already held.

    State is only committed after the loader has returned and its result has
    been unpacked. If loading fails, ``kind`` and the dataset attributes keep
    describing the previously loaded data, so a later call cannot mistake the
    old data for the new kind.
    """
    if self.kind == kind and self.datasets is not None:
      return  # already loaded: skip the expensive featurization
    loader = getattr(dc.molnet, self._LOADER_NAMES[kind])
    # reload=False is forwarded to DeepChem; per its docs this disables reuse of
    # a previously cached featurized copy on disk.
    _tasks, datasets, transformers = loader(featurizer=self.featurizer,reload=False)
    train_dataset, valid_dataset, test_dataset = datasets
    # Commit everything together now that nothing above can fail any more.
    self.datasets = datasets
    self.train_dataset = train_dataset
    self.valid_dataset = valid_dataset
    self.test_dataset = test_dataset
    self.transformers = transformers
    self.kind = kind

  def load_lipo(self):
    """Load the dataset returned by ``dc.molnet.load_lipo`` (kind ``'lipo'``)."""
    self._load('lipo')

  def load_sol(self):
    """Load the dataset returned by ``dc.molnet.load_delaney`` (kind ``'sol'``)."""
    self._load('sol')

In [4]:
class modelRunner:
  """Trains one model on a prepared dataset and reports its R^2 scores."""

  def __init__(self,name,model,kind,data,nb_epoch=100):
    """Capture the model and the dataset splits it will be run on.

    Args:
      name: label printed alongside the scores.
      model: for ``kind == 'sk'`` an estimator that ``train`` wraps in
        ``dc.models.SklearnModel``; otherwise an object whose ``fit`` accepts
        ``nb_epoch`` and which provides ``evaluate``.
      kind: ``'sk'`` selects the scikit-learn path; any other value is treated
        as a model that is fitted directly.
      data: object exposing ``train_dataset``, ``valid_dataset``,
        ``test_dataset`` and ``transformers``. They are copied here, so the
        dataset must already be loaded when the runner is constructed.
      nb_epoch: number of epochs passed to ``fit`` on the non-``'sk'`` path.
    """
    self.name = name
    self.model = model
    self.kind = kind
    self.data = data
    self.nb_epoch = nb_epoch
    self.train_dataset = data.train_dataset
    self.valid_dataset = data.valid_dataset
    self.test_dataset = data.test_dataset
    self.transformers = data.transformers

  def train(self):
    """Fit the model on the training split and print a completion message."""
    if self.kind == 'sk':
      # Wrap the raw estimator only once; a repeated train() call re-fits the
      # existing wrapper instead of wrapping a wrapper.
      if not isinstance(self.model, dc.models.SklearnModel):
        self.model = dc.models.SklearnModel(self.model)
      self.model.fit(self.train_dataset)
    else:
      self.model.fit(self.train_dataset,nb_epoch=self.nb_epoch)
    print("Training Done")

  def evaluate(self):
    """Print and return the R^2 score on the training and test splits.

    For ``kind == 'sk'`` this requires ``train`` to have run first, because
    only then is ``self.model`` the wrapper that provides ``evaluate``.

    Returns:
      dict with keys ``'train'`` and ``'test'`` holding whatever
      ``self.model.evaluate`` returned for each split.
    """
    metric = dc.metrics.Metric(dc.metrics.r2_score)
    print("Model:",self.name)
    # The dataset transformers are handed to evaluate(); per the DeepChem docs
    # they are used to undo the transformations before the metric is computed.
    train_score = self.model.evaluate(self.train_dataset, [metric], self.transformers)
    print("Training set score:", train_score)
    test_score = self.model.evaluate(self.test_dataset, [metric], self.transformers)
    print("Test set score:", test_score)
    return {'train': train_score, 'test': test_score}

  def run(self):
    """Train, then evaluate. Returns None so a notebook cell shows no extra output."""
    self.train()
    self.evaluate()


In [5]:
# Spreadsheet whose 'descriptors' column is read later to choose the RDKit descriptors.
# NOTE(review): absolute Colab path; the file must be uploaded before this cell runs.
DESCRIPTORS_XLSX = "/content/reshma edited descriptors.xlsx"
descriptors = pd.read_excel(DESCRIPTORS_XLSX)

['MolWt',
 'NumValenceElectrons',
 'NumRadicalElectrons',
 'TPSA',
 'NHOHCount',
 'NOCount',
 'NumAliphaticCarbocycles',
 'NumAliphaticHeterocycles',
 'NumAliphaticRings',
 'NumAmideBonds',
 'NumAromaticCarbocycles',
 'NumAromaticHeterocycles',
 'NumAromaticRings',
 'NumAtomStereoCenters',
 'NumBridgeheadAtoms',
 'NumHAcceptors',
 'NumHDonors',
 'NumHeteroatoms',
 'NumHeterocycles',
 'NumRotatableBonds',
 'NumSaturatedCarbocycles',
 'NumSaturatedHeterocycles',
 'NumSaturatedRings',
 'NumSpiroAtoms',
 'NumUnspecifiedAtomStereoCenters',
 'RingCount',
 'MolLogP',
 'fr_Al_COO',
 'fr_Al_OH',
 'fr_Al_OH_noTert',
 'fr_ArN',
 'fr_Ar_COO',
 'fr_Ar_N',
 'fr_Ar_NH',
 'fr_Ar_OH',
 'fr_COO',
 'fr_COO2',
 'fr_C_O',
 'fr_C_O_noCOO',
 'fr_C_S',
 'fr_HOCCN',
 'fr_Imine',
 'fr_NH0',
 'fr_NH1',
 'fr_NH2',
 'fr_N_O',
 'fr_Ndealkylation1',
 'fr_Ndealkylation2',
 'fr_Nhpyrrole',
 'fr_SH',
 'fr_aldehyde',
 'fr_alkyl_carbamate',
 'fr_alkyl_halide',
 'fr_allylic_oxid',
 'fr_amide',
 'fr_amidine',
 'fr_aniline'

In [6]:
# Descriptor names come from the 'descriptors' column of the spreadsheet.
descriptor_names = descriptors['descriptors'].values.tolist()
rdkit_featurizer = dc.feat.RDKitDescriptors(descriptors=descriptor_names)

# Featurize the 'lipo' dataset with those descriptors.
rdKitData = data(featurizer=rdkit_featurizer)
rdKitData.load_lipo()

In [7]:
# Same 'lipo' dataset, this time with the featurizer given by name as the string
# 'GraphConv', which is forwarded as-is to the dc.molnet loader.
convData = data('GraphConv')
convData.load_lipo()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [8]:
# MultitaskRegressor must be built with the width of the descriptor feature matrix.
# Read it from the featurized data rather than hard-coding it, so editing the
# descriptor spreadsheet cannot silently desynchronise the two.
n_rdkit_features = rdKitData.train_dataset.X.shape[1]

# One runner per model: the scikit-learn models ('sk') and the MultitaskRegressor use
# the RDKit descriptor features; the graph convolution model uses the GraphConv features.
modelRunners = [
    modelRunner("Random Forest", RandomForestRegressor(n_estimators=100, random_state=42),'sk',rdKitData),
    modelRunner("Linear Regression", LinearRegression(),'sk',rdKitData),
    modelRunner("Support Vector Regression", SVR(),'sk',rdKitData),
    modelRunner("K-Nearest Neighbors", KNeighborsRegressor(),'sk',rdKitData),
    modelRunner("Gradient Boosting", GradientBoostingRegressor(n_estimators=100, random_state=42),'sk',rdKitData),
    modelRunner("Decision Tree", DecisionTreeRegressor(random_state=42),'sk',rdKitData),
    modelRunner("MultiTaskRegressor",dc.models.MultitaskRegressor(n_tasks=1,n_features=n_rdkit_features,layer_sizes=[50,100,50]),'dc',rdKitData),
    modelRunner("Graph Conv Model",dc.models.GraphConvModel(n_tasks=1, mode='regression', dropout=0.2,batch_normalize=False),'dc',convData)
]

In [11]:
# Train and evaluate every runner in list order; each one prints its own scores.
for mr in modelRunners:
  mr.run()

Training Done
Model: Random Forest
Training set score: {'r2_score': 0.9538693871726501}
Test set score: {'r2_score': 0.48751195622752486}
Training Done
Model: Linear Regression
Training set score: {'r2_score': 0.5428054792251551}
Test set score: {'r2_score': 0.2649714633964758}
Training Done
Model: Support Vector Regression
Training set score: {'r2_score': 0.13845380147392616}
Test set score: {'r2_score': 0.1787335384237525}
Training Done
Model: K-Nearest Neighbors
Training set score: {'r2_score': 0.457124466536714}
Test set score: {'r2_score': 0.01098930997450387}
Training Done
Model: Gradient Boosting
Training set score: {'r2_score': 0.6711846948270028}
Test set score: {'r2_score': 0.46140529895505167}
Training Done
Model: Decision Tree
Training set score: {'r2_score': 0.9957829422202072}
Test set score: {'r2_score': -0.03651116023745793}
Training Done
Model: MultiTaskRegressor
Training set score: {'r2_score': 0.5552623433287378}
Test set score: {'r2_score': 0.40159162143948013}
Trai

In [22]:
# Rebuild both datasets for the 'sol' task. This rebinds rdKitData and convData,
# which previously held the 'lipo' data.
descriptor_names = descriptors['descriptors'].values.tolist()
rdkit_featurizer = dc.feat.RDKitDescriptors(descriptors=descriptor_names)

rdKitData = data(featurizer=rdkit_featurizer)
rdKitData.load_sol()

convData = data('GraphConv')
convData.load_sol()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [24]:
# MultitaskRegressor must be built with the width of the descriptor feature matrix.
# Read it from the featurized data rather than hard-coding it, so editing the
# descriptor spreadsheet cannot silently desynchronise the two.
n_rdkit_features = rdKitData.train_dataset.X.shape[1]

# Fresh, untrained runners bound to whatever rdKitData / convData currently hold.
modelRunners = [
    modelRunner("Random Forest", RandomForestRegressor(n_estimators=100, random_state=42),'sk',rdKitData),
    modelRunner("Linear Regression", LinearRegression(),'sk',rdKitData),
    modelRunner("Support Vector Regression", SVR(),'sk',rdKitData),
    modelRunner("K-Nearest Neighbors", KNeighborsRegressor(),'sk',rdKitData),
    modelRunner("Gradient Boosting", GradientBoostingRegressor(n_estimators=100, random_state=42),'sk',rdKitData),
    modelRunner("Decision Tree", DecisionTreeRegressor(random_state=42),'sk',rdKitData),
    modelRunner("MultiTaskRegressor",dc.models.MultitaskRegressor(n_tasks=1,n_features=n_rdkit_features,layer_sizes=[50,100,50]),'dc',rdKitData),
    modelRunner("Graph Conv Model",dc.models.GraphConvModel(n_tasks=1, mode='regression', dropout=0.2,batch_normalize=False),'dc',convData)
]

In [23]:
# Sanity check: show which dataset kind rdKitData currently holds.
rdKitData.kind

'sol'

In [25]:
# Train and evaluate every runner in list order; each one prints its own scores.
for mr in modelRunners:
  mr.run()

Training Done
Model: Random Forest
Training set score: {'r2_score': 0.9847818730542178}
Test set score: {'r2_score': 0.8517107713678111}
Training Done
Model: Linear Regression
Training set score: {'r2_score': 0.8817561326144453}
Test set score: {'r2_score': 0.7066869813549574}
Training Done
Model: Support Vector Regression
Training set score: {'r2_score': 0.7780270395925207}
Test set score: {'r2_score': 0.6884686194794731}
Training Done
Model: K-Nearest Neighbors
Training set score: {'r2_score': 0.879101735400424}
Test set score: {'r2_score': 0.6904722294591406}
Training Done
Model: Gradient Boosting
Training set score: {'r2_score': 0.9563055752323223}
Test set score: {'r2_score': 0.8354158965805404}
Training Done
Model: Decision Tree
Training set score: {'r2_score': 0.9965769218860303}
Test set score: {'r2_score': 0.690769169626785}
Training Done
Model: MultiTaskRegressor
Training set score: {'r2_score': 0.8898947478141062}
Test set score: {'r2_score': 0.7972470886064216}
Training Don

In [27]:
# Featurize the 'lipo' dataset with a Weave featurizer instance.
# NOTE(review): this global is an *instance* but is named like the class
# dc.feat.WeaveFeaturizer; a snake_case name would be less confusing.
WeaveFeaturizer = dc.feat.WeaveFeaturizer()
weaveData = data(featurizer=WeaveFeaturizer)
weaveData.load_lipo()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [None]:
# Build the Weave regression model, wrap it in a runner on the weave-featurized
# data, then train and evaluate it.
weave_model = dc.models.WeaveModel(n_tasks=1, mode='regression', learning_rate=0.001)
weave_runner = modelRunner('WeaveModel', weave_model, 'dc', weaveData)
weave_runner.run()

