## Predicting logP Values from SMILES Using RDKit on the ZINC Dataset
Given the ZINC dataset, we take the SMILES strings, predict their logP values with RDKit, and compare the predictions against the logP values stored in the dataset.

In [2]:
from rdkit import Chem 
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole 
from rdkit.Chem import Descriptors 
from rdkit.Chem import AllChem 
from rdkit import DataStructs
import numpy as np 
from rdkit.Chem import MolFromInchi
from rdkit.Chem import rdMolDescriptors
import pandas as pd 


### Experiment 1
- Take the first 100 original SMILES strings and predict their logP values.

In [None]:
# Read only the first two CSV columns for the first 100 rows; the cells below
# rely on the resulting `smiles` and `logP` columns.
data = pd.read_csv(
    "enhanced_molecules_top1000 (2).csv",
    usecols=[0, 1],
    nrows=100,
)
data.head()

Unnamed: 0,smiles,logP
0,CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1\n,5.0506
1,C[C@@H]1CC(Nc2cncc(-c3nncn3C)c2)C[C@@H](C)C1\n,3.1137
2,N#Cc1ccc(-c2ccc(O[C@@H](C(=O)N3CCCC3)c3ccccc3)...,4.96778
3,CCOC(=O)[C@@H]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c...,4.00022
4,N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C([O-])[C@H](C#...,3.60956


In [25]:
# Pull the two columns out of the frame as NumPy arrays (one entry per loaded row):
# the SMILES strings and their ground-truth logP values.
smiles, logP_gt = (np.array(data[column]) for column in ("smiles", "logP"))

# Sanity check: show the first SMILES string and its reference logP.
print(smiles[0], logP_gt[0], sep="\n")

CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1

5.0506


In [None]:
# Feed every SMILES string to RDKit and collect the predicted logP values into
# a single array (same order as `smiles`) so they can be compared with the
# ground truth afterwards.
pred_logP = []  # one prediction per SMILES string
for idx, smi in enumerate(smiles):
    mol = Chem.MolFromSmiles(smi)   # build the RDKit molecule; None if the string cannot be parsed
    if mol is None:
        # Fail loudly and identify the offending entry instead of passing None
        # on to MolLogP, which would only produce an opaque argument error.
        raise ValueError(f"RDKit could not parse SMILES at index {idx}: {smi!r}")
    pred_logP.append(Descriptors.MolLogP(mol))  # RDKit's logP estimate for this molecule

pred_logP = np.array(pred_logP)  # convert the list to a NumPy array
print(pred_logP[:5])    # show the first five predictions

[5.0506  3.1137  4.96778 4.00022 3.60956]


In [34]:
# Error metrics between the RDKit predictions and the ground-truth logP values.
# MSE: mean of the squared element-wise differences.
mse = np.square(pred_logP - logP_gt).mean(axis=0)
print("Mean Squared Error:", mse)

# RMSE: square root of the MSE, expressed in the same units as logP.
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Mean Squared Error: 4.782392200704608e-30
Root Mean Squared Error: 2.1868681260434085e-15


### Experiment 2
Same setup as Experiment 1: given the original canonical and isomeric SMILES, predict the logP values, but this time using the entire dataset.