In [1]:
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import tensorflow as tf
from sklearn.model_selection import KFold
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense,Dropout
# Load the raw dataset into the DataFrame `db`.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook will only run elsewhere after it is pointed at a local copy of
# amf.csv.
file=r"C:\Users\VISWAM\Downloads\amf.csv"
db=pd.read_csv(file)
# SMILES -> ECFP (Morgan) fingerprint conversion
def get_fingerprint(smiles, size=8192):
  """Return the Morgan bit-vector fingerprint of a SMILES string as an array.

  Args:
    smiles: SMILES string. ``None`` and pandas-null values are accepted.
    size: Number of bits requested for the fingerprint (default 8192).

  Returns:
    The array filled in by ``DataStructs.ConvertToNumpyArray`` for a Morgan
    fingerprint of radius 2. If ``smiles`` is missing, or RDKit returns
    ``None`` when parsing it, an all-zero array of shape ``(size,)`` is
    returned instead.
  """
  # Only hand the value to RDKit when there is something to parse.
  is_missing = smiles is None or pd.isnull(smiles)
  mol = None if is_missing else Chem.MolFromSmiles(smiles)
  if mol is None:
    return np.zeros((size,))

  bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, 2, size)
  # Placeholder buffer; ConvertToNumpyArray is expected to resize it to the
  # fingerprint length (RDKit behaviour, not visible in this file).
  out = np.zeros((1,))
  DataStructs.ConvertToNumpyArray(bit_vect, out)
  return out
# ---- Feature construction ----
# Columns to fingerprint, picked by position in `db`.
cols = db.columns[[0, 2, 4, 6, 8, 12]]
col = db.columns[[1, 3, 5, 7]]  # not referenced again in this cell
array = []                      # not referenced again in this cell

# Replace every entry of the selected columns by its integer fingerprint array.
for smiles_col in cols:
    db[smiles_col] = [get_fingerprint(entry).astype(int) for entry in db[smiles_col]]
db.head()  # result is discarded: this is not the last expression of the cell
db.fillna(0, inplace=True)

# Expand each fingerprint column into one DataFrame column per bit.
fp_frames = {
    name: pd.DataFrame(db[name].values.tolist())
    for name in ("R", "r1", "r2", "c1", "c2", "p")
}

# Feature matrix: fingerprint blocks interleaved with the scalar columns.
# NOTE(review): every fingerprint block carries the same integer column labels
# (0, 1, 2, ...), so integer labels in db2 are not unique; the string-labelled
# columns selected below are unaffected.
db2 = pd.concat(
    [
        fp_frames["R"],
        fp_frames["r1"],
        fp_frames["r2"],
        fp_frames["c1"],
        db["c1c"],
        fp_frames["c2"],
        db["c2c"],
        fp_frames["p"],
        db["t"],
        db["T"],
        db["m"],
        db["am"],
        db["l"],
        db["s"],
    ],
    axis=1,
)

# Z-score the selected scalar columns using statistics over all rows.
co = ["c1c", "c2c", "t", "T"]
to_scale = db2[co]
db2[co] = (to_scale - to_scale.mean()) / to_scale.std()

# Regression target.
db3 = db["y"]



In [None]:
# K-fold validation with k=10: a fresh dense regression network is trained
# on each split of (db2, db3) and scored on both the training and held-out rows.
# Per-fold results are collected in these lists (one entry per fold):
train = []    # mean absolute error on the training rows
trains = []   # standard deviation of the absolute errors on the training rows
test = []     # mean absolute error on the held-out rows
tests = []    # standard deviation of the absolute errors on the held-out rows
trainr = []   # R2 score on the training rows
testr = []    # R2 score on the held-out rows
testd = []    # never filled in this cell; kept in case other cells refer to it

# NOTE(review): the fold assignment is seeded (random_state=35), but no
# TensorFlow seed is set in this cell, so the metrics can differ between runs.
kf = KFold(n_splits=10, random_state=35, shuffle=True)
for tr, te in kf.split(db2, db3):
    # A new model is built on every iteration; reset Keras' global state first
    # so the ten models do not accumulate memory across folds.
    tf.keras.backend.clear_session()

    # Slice the fold once instead of re-indexing the wide frame on every use.
    x_tr, x_te = db2.iloc[tr], db2.iloc[te]
    y_tr, y_te = db3.iloc[tr], db3.iloc[te]

    # Network: 256 -> 384 -> 384 -> 512 -> 1, with dropout 0.4 after every
    # hidden layer and the same L1/L2 kernel regularizer on every hidden layer.
    model = Sequential()
    reg = tf.keras.regularizers.l1_l2(l1=0.0001, l2=0.008)
    model.add(Dense(256, input_dim=x_tr.shape[1], kernel_initializer='normal', kernel_regularizer=reg, activation='relu'))
    model.add(Dropout(0.4))
    for units in (384, 384, 512):
        model.add(Dense(units, kernel_initializer='normal', kernel_regularizer=reg, activation='relu'))
        model.add(Dropout(0.4))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')

    # The held-out fold is passed as validation data, so its loss is recorded
    # in `history`; nothing in this cell uses it for model selection.
    history = model.fit(x_tr, y_tr, validation_data=(x_te, y_te), epochs=100, verbose=0)

    # Predictions and matching ground truth as (n_rows, 1) column vectors.
    yp = model.predict(x_te)
    xp = model.predict(x_tr)
    ya = y_te.values.reshape((-1, 1))
    xa = y_tr.values.reshape((-1, 1))

    abs_err_te = np.abs(yp - ya)
    abs_err_tr = np.abs(xp - xa)
    test.append(np.mean(abs_err_te))
    train.append(np.mean(abs_err_tr))
    tests.append(np.std(abs_err_te))
    trains.append(np.std(abs_err_tr))
    testr.append(r2_score(ya, yp))
    trainr.append(r2_score(xa, xp))

In [None]:
# Average of the per-fold mean absolute errors: (train, test).
print(np.mean(train), np.mean(test))

# Fold numbers 1..k, derived from the number of recorded folds so the plot
# stays correct if the number of splits changes.
n = np.arange(1, len(test) + 1)
t = np.asarray(test)
t1 = np.asarray(train)

# Create exactly one figure and keep its handle: the legend below is anchored
# in this figure's coordinates. (Calling plt.figure() twice left a stray empty
# figure and tied the legend transform to it rather than to the plotted one.)
fig = plt.figure(figsize=(10, 6))
plt.title("Kfold-Error")
plt.bar(n, t1, .4, label='Train set', color='darkkhaki')
plt.bar(n + .3, t, .4, label='Test set')
plt.xticks(np.arange(len(n) + 1))
plt.xlabel("Iteration number")
plt.ylabel("Error")
plt.ylim(0, 18)  # fixed axis range: bars taller than 18 are cut off
# Legend pinned to the lower-right corner of the figure.
plt.legend(bbox_to_anchor=(1, 0), loc="lower right",
           bbox_transform=fig.transFigure, ncol=2)
plt.savefig('regression.png')
plt.show()

In [None]:
# Average of the per-fold R2 scores: (train, test).
print(np.mean(trainr), np.mean(testr))

# Fold numbers 1..k, derived from the number of recorded folds so the plot
# stays correct if the number of splits changes.
n = np.arange(1, len(testr) + 1)
p = np.asarray(testr)
p1 = np.asarray(trainr)

# Create exactly one figure and keep its handle: the legend below is anchored
# in this figure's coordinates. (Calling plt.figure() twice left a stray empty
# figure and tied the legend transform to it rather than to the plotted one.)
fig = plt.figure(figsize=(10, 6))
plt.title("Kfold-R2score")
plt.bar(n, p1, .4, label='Train set', color='darkkhaki')
plt.bar(n + .3, p, .4, label='Test set')
plt.xticks(np.arange(len(n) + 1))
plt.xlabel("Iteration number")
plt.ylabel("R2 Score")
plt.ylim(0, 1)  # fixed axis range: a fold with a negative R2 shows no bar
# Legend pinned to the lower-right corner of the figure.
plt.legend(bbox_to_anchor=(1, 0), loc="lower right",
           bbox_transform=fig.transFigure, ncol=2)
plt.savefig('regression1.png')
plt.show()