In [1]:
import logging

# Disable RDKit logging
logger = logging.getLogger('rdkit')
logger.disabled = True
import rdkit
print(rdkit.__version__)
from rdkit import Chem
import numpy as np
import pandas as pd
from rdkit.Chem import AllChem, PandasTools

2021.03.2


In [2]:
'''
The activity data (IC50) of compounds tested on PPAR_gamma (homo sapiens) were downloaded from ChEMBL (CHEMBL235). 

'''
dfic50 = pd.read_csv('./data_sets/ppar_gamma_ic50.csv', delimiter=';')

In [3]:
len(dfic50)

2261

In [4]:

dfic50.dropna(subset = ['Smiles'],inplace=True)
dfic50["Standard Units"].unique(), dfic50['Standard Relation'].unique()

(array(['nM', nan], dtype=object),
 array(["'='", nan, "'>'", "'<'"], dtype=object))

In [5]:
dfic50_1 = dfic50[dfic50['Standard Relation']=="'='"]
dfic50_2 = dfic50.loc[(dfic50['Standard Relation']=="'>'")|(dfic50['Standard Relation']=="'>='")]
dfic50_3 = dfic50[dfic50['Standard Relation'].isna()]
len(dfic50_1), len(dfic50_2), len(dfic50_3)


(1911, 218, 117)

In [6]:
#Only the compounds with IC50 tested and pChEMBL values calculated were selected for further processing and analysis. 
dfic50_1.dropna(subset = ['pChEMBL Value'],inplace=True)
len(dfic50_1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfic50_1.dropna(subset = ['pChEMBL Value'],inplace=True)


1903

In [7]:
dfic50_1

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type,Standard Text Value,Value
0,CHEMBL111271,,,508.59,2,6.98,25,CCCc1cc(Oc2ccc(F)cc2)ccc1OCCCOc1ccc2c(c1)OC(CC...,IC50,'=',...,CHEMBL1149176,1,Scientific Literature,J Med Chem,2004.0,,,,,1.200
1,CHEMBL121106,,,357.44,0,2.49,1a,CN(CCOc1ccc(C[C@@H]2SC(=O)NC2=O)cc1)c1ccccn1,IC50,'=',...,CHEMBL1130725,1,Scientific Literature,Bioorg Med Chem Lett,1998.0,,,,,0.030
2,CHEMBL121106,,,357.44,0,2.49,1a,CN(CCOc1ccc(C[C@@H]2SC(=O)NC2=O)cc1)c1ccccn1,IC50,'=',...,CHEMBL1130725,1,Scientific Literature,Bioorg Med Chem Lett,1998.0,,,,,0.060
4,CHEMBL2037081,,,486.49,1,6.09,70,Cc1oc(-c2ccc(C(F)(F)F)cc2)nc1Cn1ccc2cc(C[C@@H]...,IC50,'=',...,CHEMBL2034870,1,Scientific Literature,Bioorg Med Chem,2012.0,,,,,6.857
5,CHEMBL3695875,,,508.67,2,8.62,BDBM147320,Cc1c(C)n(Cc2ccc(-c3ccccc3)cc2)c2ccc(C(=O)N[C@@...,IC50,'=',...,CHEMBL3638687,37,BindingDB Patent Bioactivity Data,,2015.0,,,,,256.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2256,CHEMBL82293,PALMITIC ACID,-1.0,256.43,1,5.55,4,CCCCCCCCCCCCCCCC(=O)O,IC50,'=',...,CHEMBL3046720,1,Scientific Literature,Med Chem Res,2013.0,,,,,30.000
2257,CHEMBL8659,OLEIC ACID,2.0,282.47,1,6.11,9,CCCCCCCC/C=C\CCCCCCCC(=O)O,IC50,'=',...,CHEMBL3046720,1,Scientific Literature,Med Chem Res,2013.0,,,,,4.100
2258,CHEMBL1173474,BEHENIC ACID,,340.59,1,7.89,7,CCCCCCCCCCCCCCCCCCCCCC(=O)O,IC50,'=',...,CHEMBL3046720,1,Scientific Literature,Med Chem Res,2013.0,,,,,30.000
2259,CHEMBL3695916,,,558.72,2,8.68,55,CC[C@H](NC(=O)c1ccc2c(c1)c(C)c(C)n2Cc1ccc(-c2c...,IC50,'=',...,CHEMBL3734690,1,Scientific Literature,ACS Med Chem Lett,2015.0,,,,,9.000


In [8]:
len(dfic50_1[dfic50_1['pChEMBL Value']>=6]),len(dfic50_1[dfic50_1['pChEMBL Value']<6])

(1259, 644)

In [9]:
import sys
sys.path.append('/home/sliu/cyp_related_coding/CYP19_inhibitor_screening/')
'''
The following function used for molecule preprocessing is from 'RingSySTEMS' project: https://github.com/anya-chen/RingSystems/tree/master/Preprocessing/src
'''
from src import MoleculePreprocessor
from src.MoleculePreprocessor import MoleculePreprocessor
from src.MoleculePreprocessorExtended import MoleculePreprocessorExtended

In [10]:
moleculesProcessed = MoleculePreprocessorExtended.init_with_smiles(list(dfic50_1.Smiles))

In [11]:
from rdkit import RDLogger
moleculesProcessed.csp_wash()

In [12]:
preprocessedSmilesDict = moleculesProcessed.get_rawsmiles_smiles_dict()

In [13]:
preprocessedSmiles_df = pd.DataFrame(list(preprocessedSmilesDict.items()), columns=['rawSmiles','preprocessedSmiles'])

In [14]:
preprocessed_df = pd.merge(dfic50_1, preprocessedSmiles_df, 
                           left_on='Smiles', right_on='rawSmiles')

In [15]:
preprocessed_df.dropna(subset=['preprocessedSmiles'],inplace=True) 

In [16]:
preprocessed_df = preprocessed_df[preprocessed_df['preprocessedSmiles']!='']

In [17]:
len(preprocessed_df)

1900

In [18]:
# Use the loc method to set the values in the new column based on the values in the existing column
preprocessed_df.loc[preprocessed_df['pChEMBL Value']>=6, 'y_true_label'] = '1'
preprocessed_df.loc[preprocessed_df['pChEMBL Value']<6, 'y_true_label'] = '0'

In [19]:
def remove_stereochemistry(smiles):
    # Convert the SMILES string to an RDKit molecule object
    mol = Chem.MolFromSmiles(smiles)
    
    # Use the Chem.RemoveStereochemistry function to remove stereo informationz
    Chem.RemoveStereochemistry(mol)
    
    # Convert the molecule back to a SMILES string without stereochemistry
    return Chem.MolToSmiles(mol)

preprocessed_df['preprocessedSmiles'] = preprocessed_df['preprocessedSmiles'].apply(remove_stereochemistry)
preprocessed_df

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type,Standard Text Value,Value,rawSmiles,preprocessedSmiles,y_true_label
0,CHEMBL111271,,,508.59,2,6.98,25,CCCc1cc(Oc2ccc(F)cc2)ccc1OCCCOc1ccc2c(c1)OC(CC...,IC50,'=',...,J Med Chem,2004.0,,,,,1.200,CCCc1cc(Oc2ccc(F)cc2)ccc1OCCCOc1ccc2c(c1)OC(CC...,CCCc1cc(Oc2ccc(F)cc2)ccc1OCCCOc1ccc2c(c1)OC(CC...,0
1,CHEMBL121106,,,357.44,0,2.49,1a,CN(CCOc1ccc(C[C@@H]2SC(=O)NC2=O)cc1)c1ccccn1,IC50,'=',...,Bioorg Med Chem Lett,1998.0,,,,,0.030,CN(CCOc1ccc(C[C@@H]2SC(=O)NC2=O)cc1)c1ccccn1,CN(CCOc1ccc(Cc2sc(=O)[nH]c2O)cc1)c1ccccn1,1
2,CHEMBL121106,,,357.44,0,2.49,1a,CN(CCOc1ccc(C[C@@H]2SC(=O)NC2=O)cc1)c1ccccn1,IC50,'=',...,Bioorg Med Chem Lett,1998.0,,,,,0.060,CN(CCOc1ccc(C[C@@H]2SC(=O)NC2=O)cc1)c1ccccn1,CN(CCOc1ccc(Cc2sc(=O)[nH]c2O)cc1)c1ccccn1,1
3,CHEMBL121106,,,357.44,0,2.49,86,CN(CCOc1ccc(C[C@@H]2SC(=O)NC2=O)cc1)c1ccccn1,IC50,'=',...,Eur J Med Chem,2011.0,,,,,210.000,CN(CCOc1ccc(C[C@@H]2SC(=O)NC2=O)cc1)c1ccccn1,CN(CCOc1ccc(Cc2sc(=O)[nH]c2O)cc1)c1ccccn1,1
4,CHEMBL2037081,,,486.49,1,6.09,70,Cc1oc(-c2ccc(C(F)(F)F)cc2)nc1Cn1ccc2cc(C[C@@H]...,IC50,'=',...,Bioorg Med Chem,2012.0,,,,,6.857,Cc1oc(-c2ccc(C(F)(F)F)cc2)nc1Cn1ccc2cc(C[C@@H]...,Cc1oc(-c2ccc(C(F)(F)F)cc2)nc1Cn1ccc2cc(CC(OC(C...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1898,CHEMBL150,KAEMPFEROL,,286.24,0,2.28,20,O=c1c(O)c(-c2ccc(O)cc2)oc2cc(O)cc(O)c12,IC50,'=',...,Eur J Med Chem,2021.0,,,AGONIST,,23.100,O=c1c(O)c(-c2ccc(O)cc2)oc2cc(O)cc(O)c12,O=c1c(O)c(-c2ccc(O)cc2)oc2cc(O)cc(O)c12,0
1899,CHEMBL379064,ISORHAMNETIN,,316.26,0,2.29,1; Isorhamnetin,COc1cc(-c2oc3cc(O)cc(O)c3c(=O)c2O)ccc1O,IC50,'=',...,Eur J Med Chem,2021.0,,,ANTAGONIST,,3.500,COc1cc(-c2oc3cc(O)cc(O)c3c(=O)c2O)ccc1O,COc1cc(-c2oc3cc(O)cc(O)c3c(=O)c2O)ccc1O,0
1900,CHEMBL82293,PALMITIC ACID,-1.0,256.43,1,5.55,4,CCCCCCCCCCCCCCCC(=O)O,IC50,'=',...,Med Chem Res,2013.0,,,,,30.000,CCCCCCCCCCCCCCCC(=O)O,CCCCCCCCCCCCCCCC(=O)O,0
1901,CHEMBL8659,OLEIC ACID,2.0,282.47,1,6.11,9,CCCCCCCC/C=C\CCCCCCCC(=O)O,IC50,'=',...,Med Chem Res,2013.0,,,,,4.100,CCCCCCCC/C=C\CCCCCCCC(=O)O,CCCCCCCCC=CCCCCCCCC(=O)O,0


In [20]:
duplicates = preprocessed_df.duplicated('preprocessedSmiles', keep=False)

# Filter DataFrame to list duplicates with their information
duplicates_df = preprocessed_df[duplicates]
duplicates_df

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type,Standard Text Value,Value,rawSmiles,preprocessedSmiles,y_true_label
1,CHEMBL121106,,,357.44,0,2.49,1a,CN(CCOc1ccc(C[C@@H]2SC(=O)NC2=O)cc1)c1ccccn1,IC50,'=',...,Bioorg Med Chem Lett,1998.0,,,,,0.030,CN(CCOc1ccc(C[C@@H]2SC(=O)NC2=O)cc1)c1ccccn1,CN(CCOc1ccc(Cc2sc(=O)[nH]c2O)cc1)c1ccccn1,1
2,CHEMBL121106,,,357.44,0,2.49,1a,CN(CCOc1ccc(C[C@@H]2SC(=O)NC2=O)cc1)c1ccccn1,IC50,'=',...,Bioorg Med Chem Lett,1998.0,,,,,0.060,CN(CCOc1ccc(C[C@@H]2SC(=O)NC2=O)cc1)c1ccccn1,CN(CCOc1ccc(Cc2sc(=O)[nH]c2O)cc1)c1ccccn1,1
3,CHEMBL121106,,,357.44,0,2.49,86,CN(CCOc1ccc(C[C@@H]2SC(=O)NC2=O)cc1)c1ccccn1,IC50,'=',...,Eur J Med Chem,2011.0,,,,,210.000,CN(CCOc1ccc(C[C@@H]2SC(=O)NC2=O)cc1)c1ccccn1,CN(CCOc1ccc(Cc2sc(=O)[nH]c2O)cc1)c1ccccn1,1
4,CHEMBL2037081,,,486.49,1,6.09,70,Cc1oc(-c2ccc(C(F)(F)F)cc2)nc1Cn1ccc2cc(C[C@@H]...,IC50,'=',...,Bioorg Med Chem,2012.0,,,,,6.857,Cc1oc(-c2ccc(C(F)(F)F)cc2)nc1Cn1ccc2cc(C[C@@H]...,Cc1oc(-c2ccc(C(F)(F)F)cc2)nc1Cn1ccc2cc(CC(OC(C...,1
8,CHEMBL3699355,,,638.61,2,9.20,BDBM147372,Cc1c(C)n(Cc2ccc(-c3ccccc3C(=O)O)cc2)c2ccc(C(=O...,IC50,'=',...,,2015.0,,,,,11.000,Cc1c(C)n(Cc2ccc(-c3ccccc3C(=O)O)cc2)c2ccc(C(=O...,Cc1c(C)n(Cc2ccc(-c3ccccc3C(=O)O)cc2)c2ccc(C(=O...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1868,CHEMBL424133,LY-518674,2.0,409.49,0,3.76,5,Cc1ccc(Cn2nc(CCCc3ccc(OC(C)(C)C(=O)O)cc3)nc2O)cc1,IC50,'=',...,J Med Chem,2003.0,,,,,6500.000,Cc1ccc(Cn2nc(CCCc3ccc(OC(C)(C)C(=O)O)cc3)nc2O)cc1,Cc1ccc(Cn2[nH]c(CCCc3ccc(OC(C)(C)C(=O)O)cc3)nc...,0
1869,CHEMBL424133,LY-518674,2.0,409.49,0,3.76,42,Cc1ccc(Cn2nc(CCCc3ccc(OC(C)(C)C(=O)O)cc3)nc2O)cc1,IC50,'=',...,Bioorg Med Chem,2012.0,,,,,5.187,Cc1ccc(Cn2nc(CCCc3ccc(OC(C)(C)C(=O)O)cc3)nc2O)cc1,Cc1ccc(Cn2[nH]c(CCCc3ccc(OC(C)(C)C(=O)O)cc3)nc...,0
1881,CHEMBL3695816,,,532.60,2,6.33,2c,Cc1c(C)n(Cc2ccc(-c3ccccc3C(=O)O)cc2)c2ccc(C(=O...,IC50,'=',...,Bioorg Med Chem Lett,2019.0,,,,,0.015,Cc1c(C)n(Cc2ccc(-c3ccccc3C(=O)O)cc2)c2ccc(C(=O...,Cc1c(C)n(Cc2ccc(-c3ccccc3C(=O)O)cc2)c2ccc(C(=O...,1
1882,CHEMBL3695816,,,532.60,2,6.33,BDBM147258,Cc1c(C)n(Cc2ccc(-c3ccccc3C(=O)O)cc2)c2ccc(C(=O...,IC50,'=',...,,2015.0,,,,,33.000,Cc1c(C)n(Cc2ccc(-c3ccccc3C(=O)O)cc2)c2ccc(C(=O...,Cc1c(C)n(Cc2ccc(-c3ccccc3C(=O)O)cc2)c2ccc(C(=O...,1


In [21]:
groups = duplicates_df.groupby('preprocessedSmiles')
id_toremove = []
for name, group in groups:
    # Check if all values in 'y_true_label' are the same within the group
    if not group['y_true_label'].nunique() == 1:
        # Print out 'Molecule ChEMBL ID' for all entries in the group
        print(f"Different activity classifications found for {name}:")
        for chembl_id in group['Molecule ChEMBL ID']:
            print(chembl_id)
            id_toremove.append(chembl_id)

Different activity classifications found for C=C(OC(C)=O)c1c(C(=O)OC)cc2c(O)c3c(c(O)cc4cc(C)c(Cl)c(OC)c43)c(O)c2c1O:
CHEMBL254814
CHEMBL254814
Different activity classifications found for C=C1OC(=O)c2cc3c(O)c4c(c(O)cc5cc(C)c(Cl)c(OC)c54)c(O)c3c(O)c21:
CHEMBL400132
CHEMBL400132
Different activity classifications found for C=Cc1c(C(=O)OC)cc2c(O)c3c(c(O)cc4cc(C)c(Cl)c(OC)c43)c(O)c2c1O:
CHEMBL253120
CHEMBL253120
Different activity classifications found for CC1(C(=O)O)CCc2ccc(OCCCOc3ccc(Oc4ccccc4)cc3Cl)cc2O1:
CHEMBL111480
CHEMBL109638
CHEMBL112790
Different activity classifications found for CCC1(C(=O)O)CCc2ccc(OCCCOc3ccc(Oc4ccc(F)cc4)cc3Cl)cc2O1:
CHEMBL109241
CHEMBL112725
CHEMBL112725
Different activity classifications found for CCCCCCCCOc1ccc(C(=O)NC(C(=O)O)C(C)C)cc1:
CHEMBL4743677
CHEMBL4743677
Different activity classifications found for CCCCNC(=O)NS(=O)(=O)c1ccc(NC(=O)Cn2c(=O)c(C)nc3ccccc32)cc1:
CHEMBL3959627
CHEMBL3959627
Different activity classifications found for CCCCNC(=S)NS(=O)(=

In [22]:
print(len(id_toremove))

145


In [23]:
print(len(set(id_toremove)))

63


In [24]:
# Drop the rows involved in the printed chembl_ids
for i in set(id_toremove):
    preprocessed_df = preprocessed_df[preprocessed_df['Molecule ChEMBL ID'] != i]

In [25]:
preprocessed_df

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type,Standard Text Value,Value,rawSmiles,preprocessedSmiles,y_true_label
0,CHEMBL111271,,,508.59,2,6.98,25,CCCc1cc(Oc2ccc(F)cc2)ccc1OCCCOc1ccc2c(c1)OC(CC...,IC50,'=',...,J Med Chem,2004.0,,,,,1.200,CCCc1cc(Oc2ccc(F)cc2)ccc1OCCCOc1ccc2c(c1)OC(CC...,CCCc1cc(Oc2ccc(F)cc2)ccc1OCCCOc1ccc2c(c1)OC(CC...,0
4,CHEMBL2037081,,,486.49,1,6.09,70,Cc1oc(-c2ccc(C(F)(F)F)cc2)nc1Cn1ccc2cc(C[C@@H]...,IC50,'=',...,Bioorg Med Chem,2012.0,,,,,6.857,Cc1oc(-c2ccc(C(F)(F)F)cc2)nc1Cn1ccc2cc(C[C@@H]...,Cc1oc(-c2ccc(C(F)(F)F)cc2)nc1Cn1ccc2cc(CC(OC(C...,1
5,CHEMBL3695875,,,508.67,2,8.62,BDBM147320,Cc1c(C)n(Cc2ccc(-c3ccccc3)cc2)c2ccc(C(=O)N[C@@...,IC50,'=',...,,2015.0,,,,,256.000,Cc1c(C)n(Cc2ccc(-c3ccccc3)cc2)c2ccc(C(=O)N[C@@...,Cc1c(C)n(Cc2ccc(-c3ccccc3)cc2)c2ccc(C(=O)NC(C)...,1
6,CHEMBL3695892,,,527.50,2,8.77,BDBM147337,Cc1c(C)n(Cc2ccc(-c3ccccc3)cc2)c2ccc(C(=O)N[C@H...,IC50,'=',...,,2015.0,,,,,5.000,Cc1c(C)n(Cc2ccc(-c3ccccc3)cc2)c2ccc(C(=O)N[C@H...,Cc1c(C)n(Cc2ccc(-c3ccccc3)cc2)c2ccc(C(=O)NC(C)...,1
7,CHEMBL3695898,,,506.62,2,7.61,BDBM147343,COc1c(F)cccc1[C@H](C)NC(=O)c1ccc2c(c1)c(C)c(C)...,IC50,'=',...,,2015.0,,,,,21.000,COc1c(F)cccc1[C@H](C)NC(=O)c1ccc2c(c1)c(C)c(C)...,COc1c(F)cccc1C(C)NC(=O)c1ccc2c(c1)c(C)c(C)n2Cc...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1898,CHEMBL150,KAEMPFEROL,,286.24,0,2.28,20,O=c1c(O)c(-c2ccc(O)cc2)oc2cc(O)cc(O)c12,IC50,'=',...,Eur J Med Chem,2021.0,,,AGONIST,,23.100,O=c1c(O)c(-c2ccc(O)cc2)oc2cc(O)cc(O)c12,O=c1c(O)c(-c2ccc(O)cc2)oc2cc(O)cc(O)c12,0
1899,CHEMBL379064,ISORHAMNETIN,,316.26,0,2.29,1; Isorhamnetin,COc1cc(-c2oc3cc(O)cc(O)c3c(=O)c2O)ccc1O,IC50,'=',...,Eur J Med Chem,2021.0,,,ANTAGONIST,,3.500,COc1cc(-c2oc3cc(O)cc(O)c3c(=O)c2O)ccc1O,COc1cc(-c2oc3cc(O)cc(O)c3c(=O)c2O)ccc1O,0
1900,CHEMBL82293,PALMITIC ACID,-1.0,256.43,1,5.55,4,CCCCCCCCCCCCCCCC(=O)O,IC50,'=',...,Med Chem Res,2013.0,,,,,30.000,CCCCCCCCCCCCCCCC(=O)O,CCCCCCCCCCCCCCCC(=O)O,0
1901,CHEMBL8659,OLEIC ACID,2.0,282.47,1,6.11,9,CCCCCCCC/C=C\CCCCCCCC(=O)O,IC50,'=',...,Med Chem Res,2013.0,,,,,4.100,CCCCCCCC/C=C\CCCCCCCC(=O)O,CCCCCCCCC=CCCCCCCCC(=O)O,0


In [26]:
preprocessed_df.drop_duplicates('preprocessedSmiles',inplace=True)
preprocessed_df

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type,Standard Text Value,Value,rawSmiles,preprocessedSmiles,y_true_label
0,CHEMBL111271,,,508.59,2,6.98,25,CCCc1cc(Oc2ccc(F)cc2)ccc1OCCCOc1ccc2c(c1)OC(CC...,IC50,'=',...,J Med Chem,2004.0,,,,,1.200,CCCc1cc(Oc2ccc(F)cc2)ccc1OCCCOc1ccc2c(c1)OC(CC...,CCCc1cc(Oc2ccc(F)cc2)ccc1OCCCOc1ccc2c(c1)OC(CC...,0
4,CHEMBL2037081,,,486.49,1,6.09,70,Cc1oc(-c2ccc(C(F)(F)F)cc2)nc1Cn1ccc2cc(C[C@@H]...,IC50,'=',...,Bioorg Med Chem,2012.0,,,,,6.857,Cc1oc(-c2ccc(C(F)(F)F)cc2)nc1Cn1ccc2cc(C[C@@H]...,Cc1oc(-c2ccc(C(F)(F)F)cc2)nc1Cn1ccc2cc(CC(OC(C...,1
5,CHEMBL3695875,,,508.67,2,8.62,BDBM147320,Cc1c(C)n(Cc2ccc(-c3ccccc3)cc2)c2ccc(C(=O)N[C@@...,IC50,'=',...,,2015.0,,,,,256.000,Cc1c(C)n(Cc2ccc(-c3ccccc3)cc2)c2ccc(C(=O)N[C@@...,Cc1c(C)n(Cc2ccc(-c3ccccc3)cc2)c2ccc(C(=O)NC(C)...,1
6,CHEMBL3695892,,,527.50,2,8.77,BDBM147337,Cc1c(C)n(Cc2ccc(-c3ccccc3)cc2)c2ccc(C(=O)N[C@H...,IC50,'=',...,,2015.0,,,,,5.000,Cc1c(C)n(Cc2ccc(-c3ccccc3)cc2)c2ccc(C(=O)N[C@H...,Cc1c(C)n(Cc2ccc(-c3ccccc3)cc2)c2ccc(C(=O)NC(C)...,1
7,CHEMBL3695898,,,506.62,2,7.61,BDBM147343,COc1c(F)cccc1[C@H](C)NC(=O)c1ccc2c(c1)c(C)c(C)...,IC50,'=',...,,2015.0,,,,,21.000,COc1c(F)cccc1[C@H](C)NC(=O)c1ccc2c(c1)c(C)c(C)...,COc1c(F)cccc1C(C)NC(=O)c1ccc2c(c1)c(C)c(C)n2Cc...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1898,CHEMBL150,KAEMPFEROL,,286.24,0,2.28,20,O=c1c(O)c(-c2ccc(O)cc2)oc2cc(O)cc(O)c12,IC50,'=',...,Eur J Med Chem,2021.0,,,AGONIST,,23.100,O=c1c(O)c(-c2ccc(O)cc2)oc2cc(O)cc(O)c12,O=c1c(O)c(-c2ccc(O)cc2)oc2cc(O)cc(O)c12,0
1899,CHEMBL379064,ISORHAMNETIN,,316.26,0,2.29,1; Isorhamnetin,COc1cc(-c2oc3cc(O)cc(O)c3c(=O)c2O)ccc1O,IC50,'=',...,Eur J Med Chem,2021.0,,,ANTAGONIST,,3.500,COc1cc(-c2oc3cc(O)cc(O)c3c(=O)c2O)ccc1O,COc1cc(-c2oc3cc(O)cc(O)c3c(=O)c2O)ccc1O,0
1900,CHEMBL82293,PALMITIC ACID,-1.0,256.43,1,5.55,4,CCCCCCCCCCCCCCCC(=O)O,IC50,'=',...,Med Chem Res,2013.0,,,,,30.000,CCCCCCCCCCCCCCCC(=O)O,CCCCCCCCCCCCCCCC(=O)O,0
1901,CHEMBL8659,OLEIC ACID,2.0,282.47,1,6.11,9,CCCCCCCC/C=C\CCCCCCCC(=O)O,IC50,'=',...,Med Chem Res,2013.0,,,,,4.100,CCCCCCCC/C=C\CCCCCCCC(=O)O,CCCCCCCCC=CCCCCCCCC(=O)O,0


In [27]:
# Check for duplicates in 'preprocessedSmiles'
duplicates = preprocessed_df.duplicated(subset='preprocessedSmiles', keep=False)

# Show rows where 'preprocessedSmiles' is duplicated
duplicate_rows = preprocessed_df[duplicates]
duplicate_rows

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type,Standard Text Value,Value,rawSmiles,preprocessedSmiles,y_true_label


In [28]:
len(preprocessed_df[preprocessed_df['pChEMBL Value']>=6]),len(preprocessed_df[preprocessed_df['pChEMBL Value']<6])

(870, 480)

In [29]:
preprocessed_df_useful = preprocessed_df[['preprocessedSmiles','Molecule ChEMBL ID', 'y_true_label']]
preprocessed_df_useful

Unnamed: 0,preprocessedSmiles,Molecule ChEMBL ID,y_true_label
0,CCCc1cc(Oc2ccc(F)cc2)ccc1OCCCOc1ccc2c(c1)OC(CC...,CHEMBL111271,0
4,Cc1oc(-c2ccc(C(F)(F)F)cc2)nc1Cn1ccc2cc(CC(OC(C...,CHEMBL2037081,1
5,Cc1c(C)n(Cc2ccc(-c3ccccc3)cc2)c2ccc(C(=O)NC(C)...,CHEMBL3695875,1
6,Cc1c(C)n(Cc2ccc(-c3ccccc3)cc2)c2ccc(C(=O)NC(C)...,CHEMBL3695892,1
7,COc1c(F)cccc1C(C)NC(=O)c1ccc2c(c1)c(C)c(C)n2Cc...,CHEMBL3695898,1
...,...,...,...
1898,O=c1c(O)c(-c2ccc(O)cc2)oc2cc(O)cc(O)c12,CHEMBL150,0
1899,COc1cc(-c2oc3cc(O)cc(O)c3c(=O)c2O)ccc1O,CHEMBL379064,0
1900,CCCCCCCCCCCCCCCC(=O)O,CHEMBL82293,0
1901,CCCCCCCCC=CCCCCCCCC(=O)O,CHEMBL8659,0


In [30]:
#save data to csv
preprocessed_df_useful.to_csv('./data_sets/ppar_gamma_preprocessed.csv', index=False, sep=';')