In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from rdkit import Chem
from rdkit import rdBase
from rdkit import DataStructs


from rdkit.Chem import AllChem
from rdkit.Chem import RDConfig
from rdkit import rdBase
from rdkit.Chem.Draw import IPythonConsole

In [2]:
import tensorflow as tf

In [31]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

In [None]:
#export DGLBACKEND=[tensorflow]

In [None]:
#import dgllife

In [None]:
#from dgllife.utils import mol_to_complete_graph
#from dgllife.utils import smiles_to_bigraph, CanonicalAtomFeaturizer, CanonicalBondFeaturizer

## Preprocess (select batches)

In [26]:
# Load the binarized assay table and keep only the SMILES string and the label.
ihbt = pd.read_csv('AID1706_binarized_sars.csv').loc[:, ['smiles', 'activity']]

In [48]:
# Preview the two-column frame (pandas abbreviates the middle rows in the display).
ihbt

Unnamed: 0,smiles,activity
0,CC1=CC=C(O1)C(C(=O)NCC2=CC=CO2)N(C3=CC=C(C=C3)...,1
1,CC1=CC=C(C=C1)S(=O)(=O)N2CCN(CC2)S(=O)(=O)C3=C...,1
2,CC1=CC2=C(C=C1)NC(=O)C(=C2)CN(CCC3=CC=CC=C3)CC...,1
3,CC1=CC=C(C=C1)CN(C(C2=CC=CS2)C(=O)NCC3=CC=CO3)...,1
4,CCN1C2=NC(=O)N(C(=O)C2=NC(=N1)C3=CC=CC=C3)C,1
...,...,...
290721,COCCN1C2=C(C=C(C=C2)C(=O)OC)SC1=NC(=O)CCS(=O)(...,0
290722,COC1=CC=C(C=C1)C2CC(=NN2C(=O)CSC3=NCCS3)C4=CC=CS4,0
290723,CC1CC(=O)N(C2=CC=CC=C2S1(=O)=O)CC(=O)NC3=CC=CC...,0
290724,COC1=C2C(=C(C=C1)OC)SC(=N2)NC(=O)C3=CC(=CC=C3)...,0


In [49]:
# Class balance: number of rows per activity label.
ihbt.groupby(by='activity').count()

Unnamed: 0_level_0,smiles
activity,Unnamed: 1_level_1
0,290321
1,405


In [51]:
# Split the table by label: rows with activity 1 and rows with activity 0.
ihbt_true = ihbt.loc[ihbt['activity'].eq(1)]
ihbt_false = ihbt.loc[ihbt['activity'].eq(0)]

In [52]:
# Preview the activity == 0 subset.
ihbt_false

Unnamed: 0,smiles,activity
405,CCOCCCNCC(=O)NC1=CC=C(C=C1)OC(F)(F)F.Cl,0
406,COCCN1C(=NN=N1)CN2CCC(CC2)CC3=CC=CC=C3.Cl,0
407,COCCN1C(=NN=N1)CN2CCC(CC2)(C3=CC(=CC=C3)C(F)(F...,0
408,C1CCCN(CC1)CC(=O)NCCC2=CC=C(C=C2)F.C(=O)(C(=O)O)O,0
409,COC1=CC=C(C=C1)C(=O)C(C2=CC=CC=C2)N3CCOCC3.Cl,0
...,...,...
290721,COCCN1C2=C(C=C(C=C2)C(=O)OC)SC1=NC(=O)CCS(=O)(...,0
290722,COC1=CC=C(C=C1)C2CC(=NN2C(=O)CSC3=NCCS3)C4=CC=CS4,0
290723,CC1CC(=O)N(C2=CC=CC=C2S1(=O)=O)CC(=O)NC3=CC=CC...,0
290724,COC1=C2C(=C(C=C1)OC)SC(=N2)NC(=O)C3=CC(=CC=C3)...,0


In [56]:
# Down-sample the inactive (majority) class to 800 rows without replacement.
# A fixed random_state makes the subset -- and everything derived from it --
# identical on every Restart & Run All; without it each run drew different rows.
ihbt_false_short = resample(ihbt_false, n_samples=800, replace=False, random_state=42)

In [64]:
# Stack the down-sampled inactives on top of the actives and renumber the rows from 0.
ihbt_ = pd.concat(
    [ihbt_false_short, ihbt_true],
    axis=0,
    ignore_index=True,
)

In [66]:
# Preview the combined frame (down-sampled inactives first, then the actives).
ihbt_

Unnamed: 0,smiles,activity
0,C1COCCN1C(=O)CSC2=NNC(=N2)C3=CC=CC=C3,0
1,C1=CC=C(C=C1)C2=CSC(=N2)SCC(=O)NC3=CC=CC=C3[N+...,0
2,CC1=CC2=C(C=C1)OC(=O)N2CC(=O)OC,0
3,C1CCN2C(C1)C(=O)N(C3=CC=CC=C3C2=O)CC(=O)N.C(=O...,0
4,C1CCN(CC1)CCN2C3=CC=CC=C3N4C2=NC(=O)C(=N4)CCC(...,0
...,...,...
1200,C1COC2=C(O1)C=CC(=C2)NC(=O)C3=C(OC=N3)C4=CC=CC=C4,1
1201,COC(=O)C1=CC=CC=C1NC(=O)C2=CC3=C(C=C2)OCCCO3,1
1202,COC1=CC=CC=C1CCNC(=O)C(=O)NCC2N(CCO2)S(=O)(=O)...,1
1203,CN(C)CCNC(=O)C(=O)NCC1N(CCO1)S(=O)(=O)C2=CC=C(...,1


## Train Test Split

In [68]:
def mol2arr(mol, radius=2, n_bits=2048):
    '''
    get the morgan finger print as an array for a rdkit molecule

    parameter mol: rdkit molecule format (e.g. the result of Chem.MolFromSmiles)
    parameter radius: Morgan fingerprint radius; the default 2 is the value this
        function previously hard-coded
    parameter n_bits: length of the bit vector; the default 2048 matches the
        width this function produced before the parameter existed
    return arr: 1-D float array for that molecule, containing its finger print
    raises ValueError: if mol is None, which is what Chem.MolFromSmiles returns
        for a SMILES string it cannot parse
    '''
    if mol is None:
        # Fail with a readable message instead of the opaque Boost.Python
        # argument error RDKit raises when handed None.
        raise ValueError('mol2arr received None: the SMILES string could not be parsed by RDKit')
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    # ConvertToNumpyArray resizes the destination in place to the fingerprint
    # length, so the initial size of the array is irrelevant.
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

In [76]:
# Parse each SMILES string of the balanced frame into an RDKit molecule,
# fingerprint every molecule into one row of X, and take the labels as y.
mols = list(map(Chem.MolFromSmiles, ihbt_['smiles']))
X = np.array(list(map(mol2arr, mols)))
y = ihbt_['activity']

In [78]:
# Hold out 30% for testing. stratify=y keeps the active/inactive ratio the same
# in both partitions (the classes are imbalanced), and the fixed random_state
# makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [85]:
# Sanity check: one row per molecule, one column per fingerprint bit.
X.shape

(1205, 2048)

In [79]:
def make_discriminator_model(n_bits=2048):
    '''
    Build the (uncompiled) discriminator network for fingerprint vectors.

    The earlier version passed an unknown keyword (es=) and a one-element
    kernel size to Conv2D, so it raised as soon as it was called. Fingerprints
    are 1-D bit vectors, so the convolution / LeakyReLU / dropout stack is
    expressed with Conv1D instead of Conv2D.

    parameter n_bits: length of each input fingerprint; the default 2048 is the
        width of the X array built in this notebook
    return model: tf.keras.Sequential that maps inputs of shape (batch, n_bits)
        to a single unbounded score per sample
    '''
    model = tf.keras.Sequential()
    model.add(tf.keras.Input(shape=(n_bits,)))
    # Conv1D expects (steps, channels): treat the bits as a one-channel sequence.
    model.add(tf.keras.layers.Reshape((n_bits, 1)))

    model.add(tf.keras.layers.Conv1D(64, 5, strides=2, padding='same'))
    model.add(tf.keras.layers.LeakyReLU())
    model.add(tf.keras.layers.Dropout(0.3))

    model.add(tf.keras.layers.Conv1D(128, 5, strides=2, padding='same'))
    model.add(tf.keras.layers.LeakyReLU())
    model.add(tf.keras.layers.Dropout(0.3))

    model.add(tf.keras.layers.Flatten())
    # No activation on the last layer: the output is a raw score.
    model.add(tf.keras.layers.Dense(1))

    return model

In [7]:
# Parse every SMILES string of the full `ihbt` table into an RDKit molecule.
# NOTE(review): this rebinds `mols`, the same name the balanced-subset cell
# assigns, so whichever cell ran last decides what `mols` holds.
mols = list(map(Chem.MolFromSmiles, ihbt['smiles']))
# Disabled DGL graph-featurisation experiment, kept for reference
# (`df` and `atom_featurizer` are not defined in the visible cells).
#node_featurizer = CanonicalAtomFeaturizer(atom_data_field='h')
#edge_featurizer = CanonicalBondFeaturizer(bond_data_field='h')
#_feats = atom_featurizer.feat_size('h')
#train_x= [mol_to_complete_graph(m, node_featurizer=node_featurizer) for m in mols]
#train_y = np.array(df['activity'])
#train_y = np.array(train_y, dtype=np.int64)



In [15]:
# Fingerprint every molecule currently held in `mols`, one row per molecule.
# NOTE(review): if `mols` is the full table this is a dense float array of
# roughly 290k x 2048 values -- confirm it fits in memory.
trainX = np.array(list(map(mol2arr, mols)))
# Disabled label/test-set code, presumably left over from another example:
# `sol_class`, `np_utils`, `train_mols` and `test_mols` are not defined in the
# visible cells.
#trainY = [sol_class[mol.GetProp("SOL_classification")] for mol in train_mols]
#trainY = np_utils.to_categorical(trainY)
 
#testX = np.array([mol2arr(mol) for mol in test_mols])
#testY = [sol_class[mol.GetProp("SOL_classification")] for mol in test_mols]
#testY = np_utils.to_categorical(testY)


In [None]:
pip uninstall dgl -y