In [1]:
import logging

# Disable RDKit logging
logger = logging.getLogger('rdkit')
logger.disabled = True
import rdkit
print(rdkit.__version__)
from rdkit import Chem
import numpy as np
import pandas as pd
from rdkit.Chem import AllChem, PandasTools

2021.03.2


In [2]:
'''
The activity data (IC50) of compounds tested on PPAR_gamma (homo sapiens) were downloaded from ChEMBL (CHEMBL251). 

'''
dfic50 = pd.read_csv('./data_sets/aa2a_ic50.csv', delimiter=';')

In [3]:
len(dfic50)

1604

In [4]:

dfic50.dropna(subset = ['Smiles'],inplace=True)
dfic50["Standard Units"].unique(), dfic50['Standard Relation'].unique()

(array([nan, 'nM'], dtype=object),
 array([nan, "'='", "'>'", "'>='", "'<'"], dtype=object))

In [5]:
dfic50_1 = dfic50[dfic50['Standard Relation']=="'='"]
dfic50_2 = dfic50.loc[(dfic50['Standard Relation']=="'>'")|(dfic50['Standard Relation']=="'>='")]
dfic50_3 = dfic50[dfic50['Standard Relation'].isna()]
len(dfic50_1), len(dfic50_2), len(dfic50_3)


(582, 142, 842)

In [6]:
#Only the compounds with IC50 tested and pChEMBL values calculated were selected for further processing and analysis. 
dfic50_1.dropna(subset = ['pChEMBL Value'],inplace=True)
len(dfic50_1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfic50_1.dropna(subset = ['pChEMBL Value'],inplace=True)


574

In [7]:
dfic50_1

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type,Standard Text Value,Value
2,CHEMBL123195,,,293.33,0.0,2.41,13,CCCCc1nc2[nH]cnc2c2nc(-c3cccnc3)nn12,IC50,'=',...,CHEMBL1135548,1,Scientific Literature,J Med Chem,2002.0,CHEMBL3307715,,,,900.00
3,CHEMBL81863,,,329.41,0.0,4.25,4g,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,IC50,'=',...,CHEMBL1131846,1,Scientific Literature,Bioorg Med Chem Lett,1999.0,,,,,1711.00
10,CHEMBL21572,,,463.48,1.0,2.90,49,CSc1c2c(nc(NC(=O)Cc3ccc4c(c3)OCO4)n3nc(-c4ccco...,IC50,'=',...,CHEMBL1136743,1,Scientific Literature,J Med Chem,2003.0,,,,,21.50
11,CHEMBL1081160,,,395.51,0.0,4.06,BDBM50310925,CN(C)c1cc2nc(NCc3cccnc3)nc(C(=O)c3cccs3)c2s1,IC50,'=',...,CHEMBL3886219,37,BindingDB Patent Bioactivity Data,,2015.0,,,,,318.30
15,CHEMBL1077750,,,288.40,0.0,3.85,5,CCc1nc(C(=O)c2ccc(C)s2)c2sccc2n1,IC50,'=',...,CHEMBL1154258,1,Scientific Literature,Bioorg Med Chem Lett,2009.0,CHEMBL3307512,,,,3852.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1596,CHEMBL113142,ZM-241385,,337.34,0.0,1.72,ZM241385,Nc1nc(NCCc2ccc(O)cc2)nc2nc(-c3ccco3)nn12,IC50,'=',...,CHEMBL5126601,1,Scientific Literature,ACS Med Chem Lett,2022.0,,,ANTAGONIST,,8.82
1597,CHEMBL4849795,,,274.33,0.0,3.06,27,Nc1nc(Cc2ccccc2)cn2c1nc1ccccc12,IC50,'=',...,CHEMBL4813923,1,Scientific Literature,Eur J Med Chem,2021.0,,TIME = 0.6667 hr,ANTAGONIST,,31.00
1598,CHEMBL3694769,PBF-509,1.0,306.13,0.0,1.19,40; PBF-509,Nc1nc(-n2cccn2)nc(-n2cccn2)c1Br,IC50,'=',...,CHEMBL4699449,1,Scientific Literature,J Med Chem,2020.0,,TIME = 0.5 hr,ANTAGONIST,,25.00
1602,CHEMBL4522981,,,433.45,0.0,2.10,8; BAY-545,CCn1c(=O)c2c(C)c(C(=O)N3CCC(O)CC3)sc2n(CCC(F)(...,IC50,'=',...,CHEMBL4422615,1,Scientific Literature,Eur J Med Chem,2019.0,CHEMBL3308072,,,,820.00


In [8]:
len(dfic50_1[dfic50_1['pChEMBL Value']>=6]),len(dfic50_1[dfic50_1['pChEMBL Value']<6])

(434, 140)

In [9]:
import sys
sys.path.append('/home/sliu/cyp_related_coding/CYP19_inhibitor_screening/')
'''
The following function used for molecule preprocessing is from 'RingSySTEMS' project: https://github.com/anya-chen/RingSystems/tree/master/Preprocessing/src
'''
from src import MoleculePreprocessor
from src.MoleculePreprocessor import MoleculePreprocessor
from src.MoleculePreprocessorExtended import MoleculePreprocessorExtended

In [10]:
moleculesProcessed = MoleculePreprocessorExtended.init_with_smiles(list(dfic50_1.Smiles))

In [11]:
from rdkit import RDLogger
moleculesProcessed.csp_wash()

In [12]:
preprocessedSmilesDict = moleculesProcessed.get_rawsmiles_smiles_dict()

In [13]:
preprocessedSmiles_df = pd.DataFrame(list(preprocessedSmilesDict.items()), columns=['rawSmiles','preprocessedSmiles'])

In [14]:
preprocessed_df = pd.merge(dfic50_1, preprocessedSmiles_df, 
                           left_on='Smiles', right_on='rawSmiles')

In [15]:
preprocessed_df.dropna(subset=['preprocessedSmiles'],inplace=True) 

In [16]:
preprocessed_df = preprocessed_df[preprocessed_df['preprocessedSmiles']!='']

In [17]:
len(preprocessed_df)

565

In [18]:
# Use the loc method to set the values in the new column based on the values in the existing column
preprocessed_df.loc[preprocessed_df['pChEMBL Value']>=6, 'y_true_label'] = '1'
preprocessed_df.loc[preprocessed_df['pChEMBL Value']<6, 'y_true_label'] = '0'

In [19]:
def remove_stereochemistry(smiles):
    # Convert the SMILES string to an RDKit molecule object
    mol = Chem.MolFromSmiles(smiles)
    
    # Use the Chem.RemoveStereochemistry function to remove stereo informationz
    Chem.RemoveStereochemistry(mol)
    
    # Convert the molecule back to a SMILES string without stereochemistry
    return Chem.MolToSmiles(mol)

preprocessed_df['preprocessedSmiles'] = preprocessed_df['preprocessedSmiles'].apply(remove_stereochemistry)
preprocessed_df

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type,Standard Text Value,Value,rawSmiles,preprocessedSmiles,y_true_label
0,CHEMBL123195,,,293.33,0.0,2.41,13,CCCCc1nc2[nH]cnc2c2nc(-c3cccnc3)nn12,IC50,'=',...,J Med Chem,2002.0,CHEMBL3307715,,,,900.0,CCCCc1nc2[nH]cnc2c2nc(-c3cccnc3)nn12,CCCCc1nc2[nH]cnc2c2nc(-c3cccnc3)nn12,1
1,CHEMBL81863,,,329.41,0.0,4.25,4g,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,IC50,'=',...,Bioorg Med Chem Lett,1999.0,,,,,1711.0,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,0
2,CHEMBL81863,,,329.41,0.0,4.25,4g,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,IC50,'=',...,Bioorg Med Chem Lett,1999.0,CHEMBL3307715,,,,2010.0,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,0
3,CHEMBL21572,,,463.48,1.0,2.90,49,CSc1c2c(nc(NC(=O)Cc3ccc4c(c3)OCO4)n3nc(-c4ccco...,IC50,'=',...,J Med Chem,2003.0,,,,,21.5,CSc1c2c(nc(NC(=O)Cc3ccc4c(c3)OCO4)n3nc(-c4ccco...,CSc1c2c(nc(NC(=O)Cc3ccc4c(c3)OCO4)n3nc(-c4ccco...,1
4,CHEMBL1081160,,,395.51,0.0,4.06,BDBM50310925,CN(C)c1cc2nc(NCc3cccnc3)nc(C(=O)c3cccs3)c2s1,IC50,'=',...,,2015.0,,,,,318.3,CN(C)c1cc2nc(NCc3cccnc3)nc(C(=O)c3cccs3)c2s1,CN(C)c1cc2nc(NCc3cccnc3)nc(C(=O)c3cccs3)c2s1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569,CHEMBL5092804,,,461.53,0.0,2.17,23,COc1cccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N...,IC50,'=',...,J Med Chem,2022.0,,TIME = 1.0 hr,ANTAGONIST,,45.5,COc1cccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N...,COc1cccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N...,1
570,CHEMBL5081971,,,523.60,1.0,3.95,35,Cc1ccc(-c2nc(N)nc3c2ncc(=O)n3CCN2CCN(c3ccc(Oc4...,IC50,'=',...,J Med Chem,2022.0,,TIME = 1.0 hr,ANTAGONIST,,604.8,Cc1ccc(-c2nc(N)nc3c2ncc(=O)n3CCN2CCN(c3ccc(Oc4...,Cc1ccc(-c2nc(N)nc3c2ncc(=O)n3CCN2CCN(c3ccc(Oc4...,1
571,CHEMBL5079711,,,462.51,1.0,1.56,37,COc1ccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N)...,IC50,'=',...,J Med Chem,2022.0,,TIME = 1.0 hr,ANTAGONIST,,213.0,COc1ccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N)...,COc1ccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N)...,1
572,CHEMBL1933288,,,458.57,0.0,3.89,EUB0000679a,C[C@@H]1CCNC(=O)c2cc3ccc(C(=O)Nc4nc5ccccc5n4CC...,IC50,'=',...,,2023.0,,Selectivity number of off-targets = 0.0 None |...,,,1420.0,C[C@@H]1CCNC(=O)c2cc3ccc(C(=O)Nc4nc5ccccc5n4CC...,CC1CCNC(=O)c2cc3ccc(C(=O)Nc4nc5ccccc5n4CCCN(C)...,0


In [20]:
duplicates = preprocessed_df.duplicated('preprocessedSmiles', keep=False)

# Filter DataFrame to list duplicates with their information
duplicates_df = preprocessed_df[duplicates]
duplicates_df

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type,Standard Text Value,Value,rawSmiles,preprocessedSmiles,y_true_label
1,CHEMBL81863,,,329.41,0.0,4.25,4g,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,IC50,'=',...,Bioorg Med Chem Lett,1999.0,,,,,1711.00,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,0
2,CHEMBL81863,,,329.41,0.0,4.25,4g,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,IC50,'=',...,Bioorg Med Chem Lett,1999.0,CHEMBL3307715,,,,2010.00,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,0
4,CHEMBL1081160,,,395.51,0.0,4.06,BDBM50310925,CN(C)c1cc2nc(NCc3cccnc3)nc(C(=O)c3cccs3)c2s1,IC50,'=',...,,2015.0,,,,,318.30,CN(C)c1cc2nc(NCc3cccnc3)nc(C(=O)c3cccs3)c2s1,CN(C)c1cc2nc(NCc3cccnc3)nc(C(=O)c3cccs3)c2s1,1
5,CHEMBL1081160,,,395.51,0.0,4.06,35,CN(C)c1cc2nc(NCc3cccnc3)nc(C(=O)c3cccs3)c2s1,IC50,'=',...,Bioorg Med Chem Lett,2009.0,CHEMBL3307512,,,,1417.00,CN(C)c1cc2nc(NCc3cccnc3)nc(C(=O)c3cccs3)c2s1,CN(C)c1cc2nc(NCc3cccnc3)nc(C(=O)c3cccs3)c2s1,0
10,CHEMBL1079893,,,353.45,0.0,3.50,43,C[C@@H](Nc1nc(C(=O)N2CCCC2)c2sccc2n1)c1cccnc1,IC50,'=',...,Bioorg Med Chem Lett,2009.0,CHEMBL3307512,,,,6760.00,C[C@@H](Nc1nc(C(=O)N2CCCC2)c2sccc2n1)c1cccnc1,CC(Nc1nc(C(=O)N2CCCC2)c2sccc2n1)c1cccnc1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
497,CHEMBL5176737,,,318.34,0.0,1.79,12o,Nc1nc(C(=O)NCc2ccccn2)cn2c1nc1ccccc12,IC50,'=',...,J Med Chem,2022.0,,TIME = 0.5 hr,ANTAGONIST,,69.90,Nc1nc(C(=O)NCc2ccccn2)cn2c1nc1ccccc12,Nc1nc(C(=O)NCc2ccccn2)cn2c1nc1ccccc12,1
511,CHEMBL5094102,,,475.55,0.0,2.56,48,COc1ccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N)...,IC50,'=',...,J Med Chem,2022.0,,TIME = 1.0 hr,ANTAGONIST,,6.90,COc1ccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N)...,COc1ccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N)...,1
529,CHEMBL431770,ISTRADEFYLLINE,4.0,384.44,0.0,2.12,"2, KW 6002",CCn1c(=O)c2c(nc(/C=C/c3ccc(OC)c(OC)c3)n2C)n(CC...,IC50,'=',...,Bioorg Med Chem Lett,2013.0,CHEMBL3308072,,,,5250.00,CCn1c(=O)c2c(nc(/C=C/c3ccc(OC)c(OC)c3)n2C)n(CC...,CCn1c(=O)c2c(nc(C=Cc3ccc(OC)c(OC)c3)n2C)n(CC)c1=O,0
530,CHEMBL431770,ISTRADEFYLLINE,4.0,384.44,0.0,2.12,"2, KW 6002",CCn1c(=O)c2c(nc(/C=C/c3ccc(OC)c(OC)c3)n2C)n(CC...,IC50,'=',...,Bioorg Med Chem Lett,2013.0,CHEMBL3308072,,,,5.28,CCn1c(=O)c2c(nc(/C=C/c3ccc(OC)c(OC)c3)n2C)n(CC...,CCn1c(=O)c2c(nc(C=Cc3ccc(OC)c(OC)c3)n2C)n(CC)c1=O,0


In [21]:
groups = duplicates_df.groupby('preprocessedSmiles')
id_toremove = []
for name, group in groups:
    # Check if all values in 'y_true_label' are the same within the group
    if not group['y_true_label'].nunique() == 1:
        # Print out 'Molecule ChEMBL ID' for all entries in the group
        print(f"Different activity classifications found for {name}:")
        for chembl_id in group['Molecule ChEMBL ID']:
            print(chembl_id)
            id_toremove.append(chembl_id)

Different activity classifications found for CC(Nc1nc(C(=O)N2CCCC2)c2sccc2n1)c1cccnc1:
CHEMBL1079893
CHEMBL1082064
CHEMBL1079894
Different activity classifications found for CCNc1cc2nc(NCc3cccnc3)nc(C(=O)c3cccs3)c2s1:
CHEMBL1080421
CHEMBL1080421
Different activity classifications found for CN(C)c1cc2nc(NCc3cccnc3)nc(C(=O)c3cccs3)c2s1:
CHEMBL1081160
CHEMBL1081160
Different activity classifications found for Cc1c(C#N)cccc1-c1cc(-c2ccn(Cc3cccc(C(C)(C)O)c3)c(=O)c2)nc(N)n1:
CHEMBL5424461
CHEMBL5424461
CHEMBL5424461
Different activity classifications found for Cc1ccc(-c2nc(N)nc3c2nnn3Cc2cccc(COC3CCOC3)n2)o1:
CHEMBL4297184
CHEMBL4297184
CHEMBL4297184
CHEMBL4297184
CHEMBL4297184
CHEMBL4297184


In [22]:
print(len(id_toremove))

16


In [23]:
print(len(set(id_toremove)))

7


In [24]:
# Drop the rows involved in the printed chembl_ids
for i in set(id_toremove):
    preprocessed_df = preprocessed_df[preprocessed_df['Molecule ChEMBL ID'] != i]

In [25]:
preprocessed_df

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type,Standard Text Value,Value,rawSmiles,preprocessedSmiles,y_true_label
0,CHEMBL123195,,,293.33,0.0,2.41,13,CCCCc1nc2[nH]cnc2c2nc(-c3cccnc3)nn12,IC50,'=',...,J Med Chem,2002.0,CHEMBL3307715,,,,900.0,CCCCc1nc2[nH]cnc2c2nc(-c3cccnc3)nn12,CCCCc1nc2[nH]cnc2c2nc(-c3cccnc3)nn12,1
1,CHEMBL81863,,,329.41,0.0,4.25,4g,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,IC50,'=',...,Bioorg Med Chem Lett,1999.0,,,,,1711.0,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,0
2,CHEMBL81863,,,329.41,0.0,4.25,4g,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,IC50,'=',...,Bioorg Med Chem Lett,1999.0,CHEMBL3307715,,,,2010.0,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,0
3,CHEMBL21572,,,463.48,1.0,2.90,49,CSc1c2c(nc(NC(=O)Cc3ccc4c(c3)OCO4)n3nc(-c4ccco...,IC50,'=',...,J Med Chem,2003.0,,,,,21.5,CSc1c2c(nc(NC(=O)Cc3ccc4c(c3)OCO4)n3nc(-c4ccco...,CSc1c2c(nc(NC(=O)Cc3ccc4c(c3)OCO4)n3nc(-c4ccco...,1
6,CHEMBL1077750,,,288.40,0.0,3.85,5,CCc1nc(C(=O)c2ccc(C)s2)c2sccc2n1,IC50,'=',...,Bioorg Med Chem Lett,2009.0,CHEMBL3307512,,,,3852.0,CCc1nc(C(=O)c2ccc(C)s2)c2sccc2n1,CCc1nc(C(=O)c2ccc(C)s2)c2sccc2n1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569,CHEMBL5092804,,,461.53,0.0,2.17,23,COc1cccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N...,IC50,'=',...,J Med Chem,2022.0,,TIME = 1.0 hr,ANTAGONIST,,45.5,COc1cccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N...,COc1cccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N...,1
570,CHEMBL5081971,,,523.60,1.0,3.95,35,Cc1ccc(-c2nc(N)nc3c2ncc(=O)n3CCN2CCN(c3ccc(Oc4...,IC50,'=',...,J Med Chem,2022.0,,TIME = 1.0 hr,ANTAGONIST,,604.8,Cc1ccc(-c2nc(N)nc3c2ncc(=O)n3CCN2CCN(c3ccc(Oc4...,Cc1ccc(-c2nc(N)nc3c2ncc(=O)n3CCN2CCN(c3ccc(Oc4...,1
571,CHEMBL5079711,,,462.51,1.0,1.56,37,COc1ccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N)...,IC50,'=',...,J Med Chem,2022.0,,TIME = 1.0 hr,ANTAGONIST,,213.0,COc1ccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N)...,COc1ccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N)...,1
572,CHEMBL1933288,,,458.57,0.0,3.89,EUB0000679a,C[C@@H]1CCNC(=O)c2cc3ccc(C(=O)Nc4nc5ccccc5n4CC...,IC50,'=',...,,2023.0,,Selectivity number of off-targets = 0.0 None |...,,,1420.0,C[C@@H]1CCNC(=O)c2cc3ccc(C(=O)Nc4nc5ccccc5n4CC...,CC1CCNC(=O)c2cc3ccc(C(=O)Nc4nc5ccccc5n4CCCN(C)...,0


In [26]:
preprocessed_df.drop_duplicates('preprocessedSmiles',inplace=True)
preprocessed_df

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type,Standard Text Value,Value,rawSmiles,preprocessedSmiles,y_true_label
0,CHEMBL123195,,,293.33,0.0,2.41,13,CCCCc1nc2[nH]cnc2c2nc(-c3cccnc3)nn12,IC50,'=',...,J Med Chem,2002.0,CHEMBL3307715,,,,900.0,CCCCc1nc2[nH]cnc2c2nc(-c3cccnc3)nn12,CCCCc1nc2[nH]cnc2c2nc(-c3cccnc3)nn12,1
1,CHEMBL81863,,,329.41,0.0,4.25,4g,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,IC50,'=',...,Bioorg Med Chem Lett,1999.0,,,,,1711.0,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,0
3,CHEMBL21572,,,463.48,1.0,2.90,49,CSc1c2c(nc(NC(=O)Cc3ccc4c(c3)OCO4)n3nc(-c4ccco...,IC50,'=',...,J Med Chem,2003.0,,,,,21.5,CSc1c2c(nc(NC(=O)Cc3ccc4c(c3)OCO4)n3nc(-c4ccco...,CSc1c2c(nc(NC(=O)Cc3ccc4c(c3)OCO4)n3nc(-c4ccco...,1
6,CHEMBL1077750,,,288.40,0.0,3.85,5,CCc1nc(C(=O)c2ccc(C)s2)c2sccc2n1,IC50,'=',...,Bioorg Med Chem Lett,2009.0,CHEMBL3307512,,,,3852.0,CCc1nc(C(=O)c2ccc(C)s2)c2sccc2n1,CCc1nc(C(=O)c2ccc(C)s2)c2sccc2n1,0
7,CHEMBL1078001,,,380.45,0.0,3.94,14,Cc1ccc(C(=O)c2nc(NC(=O)c3cccnc3)nc3ccsc23)s1,IC50,'=',...,Bioorg Med Chem Lett,2009.0,CHEMBL3307512,,,,161.0,Cc1ccc(C(=O)c2nc(NC(=O)c3cccnc3)nc3ccsc23)s1,Cc1ccc(C(=O)c2nc(NC(=O)c3cccnc3)nc3ccsc23)s1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569,CHEMBL5092804,,,461.53,0.0,2.17,23,COc1cccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N...,IC50,'=',...,J Med Chem,2022.0,,TIME = 1.0 hr,ANTAGONIST,,45.5,COc1cccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N...,COc1cccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N...,1
570,CHEMBL5081971,,,523.60,1.0,3.95,35,Cc1ccc(-c2nc(N)nc3c2ncc(=O)n3CCN2CCN(c3ccc(Oc4...,IC50,'=',...,J Med Chem,2022.0,,TIME = 1.0 hr,ANTAGONIST,,604.8,Cc1ccc(-c2nc(N)nc3c2ncc(=O)n3CCN2CCN(c3ccc(Oc4...,Cc1ccc(-c2nc(N)nc3c2ncc(=O)n3CCN2CCN(c3ccc(Oc4...,1
571,CHEMBL5079711,,,462.51,1.0,1.56,37,COc1ccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N)...,IC50,'=',...,J Med Chem,2022.0,,TIME = 1.0 hr,ANTAGONIST,,213.0,COc1ccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N)...,COc1ccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N)...,1
572,CHEMBL1933288,,,458.57,0.0,3.89,EUB0000679a,C[C@@H]1CCNC(=O)c2cc3ccc(C(=O)Nc4nc5ccccc5n4CC...,IC50,'=',...,,2023.0,,Selectivity number of off-targets = 0.0 None |...,,,1420.0,C[C@@H]1CCNC(=O)c2cc3ccc(C(=O)Nc4nc5ccccc5n4CC...,CC1CCNC(=O)c2cc3ccc(C(=O)Nc4nc5ccccc5n4CCCN(C)...,0


In [27]:
# Check for duplicates in 'preprocessedSmiles'
duplicates = preprocessed_df.duplicated(subset='preprocessedSmiles', keep=False)

# Show rows where 'preprocessedSmiles' is duplicated
duplicate_rows = preprocessed_df[duplicates]
duplicate_rows

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type,Standard Text Value,Value,rawSmiles,preprocessedSmiles,y_true_label


In [28]:
len(preprocessed_df[preprocessed_df['pChEMBL Value']>=6]),len(preprocessed_df[preprocessed_df['pChEMBL Value']<6])

(334, 109)

In [29]:
preprocessed_df_useful = preprocessed_df[['preprocessedSmiles','Molecule ChEMBL ID', 'y_true_label']]
preprocessed_df_useful

Unnamed: 0,preprocessedSmiles,Molecule ChEMBL ID,y_true_label
0,CCCCc1nc2[nH]cnc2c2nc(-c3cccnc3)nn12,CHEMBL123195,1
1,Cc1[nH]c2nc(-c3ccccc3)nc(NCc3ccncc3)c2c1C,CHEMBL81863,0
3,CSc1c2c(nc(NC(=O)Cc3ccc4c(c3)OCO4)n3nc(-c4ccco...,CHEMBL21572,1
6,CCc1nc(C(=O)c2ccc(C)s2)c2sccc2n1,CHEMBL1077750,0
7,Cc1ccc(C(=O)c2nc(NC(=O)c3cccnc3)nc3ccsc23)s1,CHEMBL1078001,1
...,...,...,...
569,COc1cccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N...,CHEMBL5092804,1
570,Cc1ccc(-c2nc(N)nc3c2ncc(=O)n3CCN2CCN(c3ccc(Oc4...,CHEMBL5081971,1
571,COc1ccc(N2CCN(CCn3c(=O)cnc4c(-c5ccc(C)o5)nc(N)...,CHEMBL5079711,1
572,CC1CCNC(=O)c2cc3ccc(C(=O)Nc4nc5ccccc5n4CCCN(C)...,CHEMBL1933288,0


In [30]:
#save data to csv
preprocessed_df_useful.to_csv('./data_sets/aa2a_preprocessed.csv', index=False, sep=';')