<a href="https://colab.research.google.com/github/vinodmp4/AIinDrugDiscovery/blob/main/Solubility_Model_Builder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

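Reference: Delaney, J. S. "ESOL: Estimating Aqueous Solubility Directly from Molecular Structure." *J. Chem. Inf. Comput. Sci.* 2004, 44 (3), 1000–1005. https://pubs.acs.org/doi/10.1021/ci034243x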

In [None]:
!pip install rdkit
!pip install xgboost

In [6]:
import xgboost as xgb
import pandas as pd
import numpy as np
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
!wget https://raw.githubusercontent.com/vinodmp4/AIinDrugDiscovery/refs/heads/main/solubility.csv

In [None]:
# Read the downloaded CSV from the working directory into a DataFrame.
inputfile = pd.read_csv('solubility.csv')

# Preview the first rows; head() defaults to n=5.
inputfile.head()

In [9]:
# Parse every SMILES string of the dataset into an RDKit molecule.
smiles = list(inputfile['smiles'])
mols = [Chem.MolFromSmiles(smi) for smi in smiles]

# MolFromSmiles returns None instead of raising when a string cannot be parsed.
# Fail here with the offending inputs rather than with an opaque error inside a descriptor call.
invalid_smiles = [smi for smi, mol in zip(smiles, mols) if mol is None]
if invalid_smiles:
    raise ValueError(
        "{} SMILES could not be parsed by RDKit, e.g. {}".format(
            len(invalid_smiles), invalid_smiles[:5]
        )
    )

# One descriptor value per molecule.
LogP = [Descriptors.MolLogP(mol) for mol in mols]
MW = [Descriptors.MolWt(mol) for mol in mols]
RB = [Descriptors.NumRotatableBonds(mol) for mol in mols]
#------------------------
# Aromatic proportion = number of aromatic atoms / number of heavy atoms.
HeavyAtoms = [Descriptors.HeavyAtomCount(mol) for mol in mols]
AROM_ATOM_COUNTS = [sum(atom.GetIsAromatic() for atom in mol.GetAtoms()) for mol in mols]
# A molecule without heavy atoms (e.g. a bare proton "[H+]") would divide by zero; use 0.0.
APRatio = [AR / HA if HA else 0.0 for AR, HA in zip(AROM_ATOM_COUNTS, HeavyAtoms)]
#------------------------
# Attach the descriptors as feature columns, in the same order getDescriptors returns them.
inputfile['LogP'] = LogP
inputfile['MW'] = MW
inputfile['RB'] = RB
inputfile['APRatio'] = APRatio

def getDescriptors(smilestext):
    """Compute the four model input descriptors for a single SMILES string.

    Parameters
    ----------
    smilestext : str
        SMILES representation of the molecule.

    Returns
    -------
    list
        ``[LogP, molecular weight, rotatable-bond count, aromatic proportion]``,
        in that order. Aromatic proportion is the number of aromatic atoms divided
        by the heavy-atom count; it is 0.0 for a molecule without heavy atoms.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smilestext``.
    """
    thismol = Chem.MolFromSmiles(smilestext)
    if thismol is None:
        # MolFromSmiles signals a parse failure by returning None rather than raising.
        raise ValueError("Could not parse SMILES: {!r}".format(smilestext))

    heavy_atoms = Descriptors.HeavyAtomCount(thismol)
    aromatic_atoms = sum(atom.GetIsAromatic() for atom in thismol.GetAtoms())
    # Avoid ZeroDivisionError for inputs that contain no heavy atoms.
    aromatic_proportion = aromatic_atoms / heavy_atoms if heavy_atoms else 0.0

    return [
        Descriptors.MolLogP(thismol),
        Descriptors.MolWt(thismol),
        Descriptors.NumRotatableBonds(thismol),
        aromatic_proportion,
    ]

In [None]:
# Write the descriptor-augmented table to disk, without the DataFrame index column.
inputfile.to_csv('mldata.csv', index=False)

# Preview the first rows; head() defaults to n=5.
inputfile.head()

In [11]:
# Regression target, and the feature matrix: every column except the target and the raw SMILES text.
# NOTE(review): this assumes the CSV carries no columns besides these two — confirm against the file.
TARGET = inputfile['logSolubility']
FEATURES = inputfile.drop(columns=['logSolubility', 'smiles'])

# Hold out 20% of the rows; the fixed random_state keeps the partition reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    FEATURES,
    TARGET,
    test_size=0.2,
    random_state=42,
)

In [12]:
# XGBoost regressor with a squared-error objective and 10 estimators.
# NOTE(review): XGBoost's scikit-learn API documents `random_state` for seeding;
# confirm that `seed` is still honored by the installed version.
model = xgb.XGBRegressor(
    n_estimators=10,
    objective='reg:squarederror',
    seed=123,
)

In [13]:
# Train on the training partition.
# NOTE(review): scikit-learn style fit() returns the estimator itself, so `history` is
# presumably an alias of `model` rather than a training log — confirm before relying on it.
history = model.fit(X=X_train, y=Y_train)

In [14]:
# Predict the target for the held-out test features.
PREDICTIONS = model.predict(X=X_test)

In [None]:
# Predicted vs. experimental values for the test set.
plt.scatter(
    x=Y_test,
    y=PREDICTIONS,
    c="#7CAE99",
    alpha=0.4,
    edgecolor="black",
)
# Red y = x reference line: points on it are perfect predictions.
plt.plot(Y_test, Y_test, "-r")
plt.ylabel('Predicted LogS')
plt.xlabel('Experimental LogS')

In [None]:
# Correlate every remaining column (descriptors and target) after dropping the SMILES text.
CORRDATA = inputfile.drop(columns=['smiles'])

# Use only the training partition; test_size and random_state match the model's split.
train, test = train_test_split(
    CORRDATA,
    test_size=0.2,
    random_state=42,
)
corrmap = train.corr()

# Annotated heatmap with the colour scale starting at -1.
sns.heatmap(data=corrmap, vmin=-1, annot=True)

In [None]:
# Compute the descriptor list for one molecule and predict its value with the trained model.
# Other SMILES to try: CC(=O)Nc1ccc(O)cc1, OCC, C(=O)([O-])[O-].[Ca+2], O=[Si]=O
mydata = getDescriptors("O=[Si]=O")
# predict() is given a one-row batch; take its single result and round it to 2 decimals.
round(float(model.predict([mydata])[0]),2)

In [18]:
# Write the trained model to 'solubility_model.json' in the working directory.
model.save_model('solubility_model.json')