# Filter predictions

26th May - apply structural filters to the isocyanide Ugi predictions to remove greaseballs etc.

In [1]:
import pandas as pd
from useful_rdkit_utils import add_molecule_and_errors

# Load the scored isocyanide enumeration into df_isocyanides.
# NOTE(review): a later cell writes df_isocyanides back to this exact path with
# extra columns, so a re-run of the notebook reads the previously augmented file
# rather than the original scores.
df_isocyanides = pd.read_csv('/rds-d2/user/wjm41/hpc-work/datasets/Ugis/datasets/enamine_library_generation/enumerated_isocyanides_scored.csv')
# Called for its effect on df_isocyanides (return value is not used). Judging by
# the tables displayed below, it adds a 'mol' column of RDKit Mol objects built
# from the 'ugi' SMILES column, plus an 'Error' column — TODO confirm against
# the useful_rdkit_utils documentation.
add_molecule_and_errors(df_isocyanides, 'ugi', mol_col_name='mol')

INFO:rdkit:Enabling RDKit 2021.09.4 jupyter extensions
100%|██████████| 62811/62811 [00:28<00:00, 2231.19it/s]
100%|██████████| 62811/62811 [03:53<00:00, 269.24it/s]
100%|██████████| 62811/62811 [00:28<00:00, 2222.81it/s]
100%|██████████| 62811/62811 [00:03<00:00, 17578.14it/s]
100%|██████████| 62811/62811 [00:03<00:00, 17158.01it/s]
100%|██████████| 62811/62811 [00:24<00:00, 2562.53it/s]


In [2]:
from tqdm import tqdm

from dock2hit.filters import return_pains_lib, return_alert_substructs
from dock2hit.filters import does_mol_have_more_than_one_chiral_center, does_mol_have_structural_alert, does_mol_have_too_many_donors_and_acceptors, does_mol_match_pains, does_mol_satisfy_ro5

# Reference data for the PAINS and structural-alert checks; built once and
# handed to the corresponding predicate for every row.
pains_lib = return_pains_lib()
alert_substructs = return_alert_substructs()

# Make `progress_apply` available on pandas objects so each pass shows a bar.
tqdm.pandas()

# One flag column per filter: (column name, predicate, extra keyword arguments).
# The tuple order is the order in which the columns are appended to the frame.
filter_specs = (
    ('has_structural_alert', does_mol_have_structural_alert,
     {'alert_substructs': alert_substructs}),
    ('matches_pain', does_mol_match_pains, {'pains_lib': pains_lib}),
    ('more_than_one_chiral_center', does_mol_have_more_than_one_chiral_center, {}),
    ('too_many_pcores', does_mol_have_too_many_donors_and_acceptors, {}),
    ('satisfies_ro5', does_mol_satisfy_ro5, {}),
)
for flag_column, predicate, predicate_kwargs in filter_specs:
    df_isocyanides[flag_column] = df_isocyanides['mol'].progress_apply(
        predicate, **predicate_kwargs)

# Keep only rows that satisfy the rule-of-five flag and trip none of the other
# four flags; the result is a separate frame and is displayed as the cell output.
df_filtered = df_isocyanides.query(
    'satisfies_ro5 & ~too_many_pcores & ~matches_pain & ~more_than_one_chiral_center & ~has_structural_alert'
)
df_filtered

Unnamed: 0,ID,SMILES,ugi,predicted_pIC50,mol,Error,has_structural_alert,matches_pain,more_than_one_chiral_center,too_many_pcores,satisfies_ro5
8,EN300-75265,CCNc1ccncc1,CCN(C(=O)C(c1cccnc1)N(C(=O)c1cocn1)c1ccc(OC(C)...,5.494262,<rdkit.Chem.rdchem.Mol object at 0x151acd92a5e0>,,False,False,False,False,True
11,EN300-19067,COc1cccc(N)c1,COc1cccc(NC(=O)C(c2cccnc2)N(C(=O)c2cocn2)c2ccc...,5.476997,<rdkit.Chem.rdchem.Mol object at 0x151acd92a9a0>,,False,False,False,False,True
38,EN300-20183,N#Cc1cccc(N)c1,CC(C)Oc1ccc(N(C(=O)c2cocn2)C(C(=O)Nc2cccc(C#N)...,5.420334,<rdkit.Chem.rdchem.Mol object at 0x151acd1f8400>,,False,False,False,False,True
85,EN300-180531,CCC(C)(C)CN.Cl,CCC(C)(C)CNC(=O)C(c1cccnc1)N(C(=O)c1cocn1)c1cc...,5.379709,<rdkit.Chem.rdchem.Mol object at 0x151acd1f95e0>,,False,False,False,False,True
105,EN300-54456,NCCc1ccc[nH]1,CC(C)Oc1ccc(N(C(=O)c2cocn2)C(C(=O)NCCc2ccc[nH]...,5.370107,<rdkit.Chem.rdchem.Mol object at 0x151acd1f9d60>,,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
61887,EN300-238018,c1cc2c(cn1)CCCN2,CC(C)Oc1ccc(N(C(=O)c2cocn2)C(C(=O)N2CCCc3cnccc...,4.188120,<rdkit.Chem.rdchem.Mol object at 0x151ace60d8e0>,,False,False,False,False,True
61899,EN300-27100943,O=C(O)C(F)(F)F.O=C1CC2(CNC2)N1,CC(C)Oc1ccc(N(C(=O)c2cocn2)C(C(=O)N2CC3(CC(=O)...,4.187679,<rdkit.Chem.rdchem.Mol object at 0x151ace60dd60>,,False,False,False,False,True
61913,EN300-7427792,Cl.c1cc2c(cn1)NCCC2,CC(C)Oc1ccc(N(C(=O)c2cocn2)C(C(=O)N2CCCc3ccncc...,4.187255,<rdkit.Chem.rdchem.Mol object at 0x151ace60f2e0>,,False,False,False,False,True
61914,EN300-4299857,c1cc2c(cn1)NCCC2,CC(C)Oc1ccc(N(C(=O)c2cocn2)C(C(=O)N2CCCc3ccncc...,4.187255,<rdkit.Chem.rdchem.Mol object at 0x151ace60f340>,,False,False,False,False,True


In [3]:
from rdkit.Chem import Lipinski, Descriptors

# Store RDKit's Descriptors.MolLogP value for every molecule in a new 'logP' column.
# Relies on tqdm.pandas() having been called in an earlier cell to provide
# progress_apply. The column is added to df_isocyanides only; df_filtered was
# derived before this point and is not updated here.
df_isocyanides['logP'] = df_isocyanides['mol'].progress_apply(
    Descriptors.MolLogP)


In [17]:
from rdkit.Chem import MolFromSmiles

# SMILES of a single Ugi product used as a spot check of logP and the
# rule-of-five flag (presumably the preferred candidate — confirm).
best_ugi_mol = 'CC(C)Oc1ccc(cc1)N(C(C(=O)NCCc1ccns1)c1cccnc1)C(=O)c1cocn1'

# Parse once and reuse the Mol for both checks. Everywhere else in this notebook
# does_mol_satisfy_ro5 is applied to RDKit Mol objects (the 'mol' column), so it
# is given a Mol here as well instead of the raw SMILES string.
best_ugi_rdmol = MolFromSmiles(best_ugi_mol)
if best_ugi_rdmol is None:
    # MolFromSmiles signals a parse failure by returning None; fail loudly here
    # rather than inside the descriptor code.
    raise ValueError(f'Could not parse SMILES: {best_ugi_mol}')

print(Descriptors.MolLogP(best_ugi_rdmol))
print(does_mol_satisfy_ro5(best_ugi_rdmol))

4.0603000000000025
True


In [4]:
# Export the rows that passed every filter, highest predicted pIC50 first,
# keeping only the identifier, SMILES and score columns.
df_filtered[['ID', 'SMILES', 'ugi', 'predicted_pIC50']].sort_values(by='predicted_pIC50', ascending=False).to_csv(
    '/rds-d2/user/wjm41/hpc-work/datasets/Ugis/datasets/enamine_library_generation/enumerated_isocyanides_filtered.csv', index=False)
# Persist the full table, including the computed columns, without 'mol' and 'Error'.
# NOTE(review): this is the same path the notebook reads at the top, so the
# input file is replaced in place and later runs start from the augmented
# table. Consider a separate output filename if the original scores must be kept.
df_isocyanides.drop(columns=['mol', 'Error']).to_csv(
    '/rds-d2/user/wjm41/hpc-work/datasets/Ugis/datasets/enamine_library_generation/enumerated_isocyanides_scored.csv', index=False)


In [19]:
import mols2grid

# Rows that pass the chirality, PAINS, structural-alert and rule-of-five flags,
# reduced to one row per distinct 'ugi' SMILES.
# NOTE(review): unlike df_filtered, this selection does not exclude
# too_many_pcores — confirm that is intended.
df_view = (
    df_isocyanides
    .query('~more_than_one_chiral_center & ~matches_pain & ~has_structural_alert & satisfies_ro5')
    .drop_duplicates(subset='ugi')
)

# The 100 rows with the highest predicted pIC50, shown as a molecule grid with
# both scores formatted to three decimals under each structure.
best_mols = df_view.nlargest(n=100, columns='predicted_pIC50')
mols2grid.display(
    best_mols,
    smiles_col='ugi',
    subset=['img', 'predicted_pIC50', 'logP'],
    transform={
        'predicted_pIC50': lambda value: f'Predicted pIC50: {value:.3f}',
        'logP': lambda value: f'LogP: {value:.3f}',
    },
    size=(300, 300),
    width=2000,
)


In [24]:
# Round the two score columns of best_mols to 3 decimal places, writing the
# rounded values back into the same frame.
best_mols[['predicted_pIC50', 'logP']] = best_mols[['predicted_pIC50', 'logP']].round(3)


In [25]:
# Write the identifier, SMILES and score columns of best_mols to top_100.csv.
# NOTE(review): best_mols is assigned in more than one cell of this notebook
# with different filters, so the exported rows depend on which of those cells
# ran last before this one.
best_mols[['ID', 'SMILES', 'ugi', 'predicted_pIC50', 'logP']].to_csv(
    '/rds-d2/user/wjm41/hpc-work/datasets/Ugis/datasets/enamine_library_generation/enumerated_isocyanide_library/top_100.csv', index=False)


In [14]:
# Looser selection: only the chirality flag and a logP ceiling are applied.
df_view = df_isocyanides.query(
    '~more_than_one_chiral_center & logP < 5.1'
)

# Rebinds best_mols to the top 100 of this looser selection, then shows the
# first eleven rows as the cell output.
best_mols = df_view.nlargest(n=100, columns='predicted_pIC50')
best_mols.head(11)

Unnamed: 0,ID,SMILES,ugi,predicted_pIC50,mol,Error,has_structural_alert,matches_pain,more_than_one_chiral_center,too_many_pcores,satisfies_ro5,logP
3,EN300-61594,CCNc1cccc(CO)c1,CCN(C(=O)C(c1cccnc1)N(C(=O)c1cocn1)c1ccc(OC(C)...,5.528175,<rdkit.Chem.rdchem.Mol object at 0x151acd92a8e0>,,False,False,False,False,False,4.7903
4,EN300-100942,CCCNc1cccnc1,CCCN(C(=O)C(c1cccnc1)N(C(=O)c1cocn1)c1ccc(OC(C...,5.518613,<rdkit.Chem.rdchem.Mol object at 0x151acd92a4c0>,,False,False,False,False,False,5.0831
8,EN300-75265,CCNc1ccncc1,CCN(C(=O)C(c1cccnc1)N(C(=O)c1cocn1)c1ccc(OC(C)...,5.494262,<rdkit.Chem.rdchem.Mol object at 0x151acd92a5e0>,,False,False,False,False,True,4.693
9,EN300-174170,CCNc1cccc(O)c1,CCN(C(=O)C(c1cccnc1)N(C(=O)c1cocn1)c1ccc(OC(C)...,5.49406,<rdkit.Chem.rdchem.Mol object at 0x151acd92a3a0>,,True,False,False,False,False,5.0036
11,EN300-19067,COc1cccc(N)c1,COc1cccc(NC(=O)C(c2cccnc2)N(C(=O)c2cocn2)c2ccc...,5.476997,<rdkit.Chem.rdchem.Mol object at 0x151acd92a9a0>,,False,False,False,False,True,4.8922
16,EN300-53053,Nc1ccc(CCO)cc1,CC(C)Oc1ccc(N(C(=O)c2cocn2)C(C(=O)Nc2ccc(CCO)c...,5.466761,<rdkit.Chem.rdchem.Mol object at 0x151acd92ab80>,,False,False,False,True,False,4.4184
19,EN300-248914,Nc1cccc(CCO)c1,CC(C)Oc1ccc(N(C(=O)c2cocn2)C(C(=O)Nc2cccc(CCO)...,5.461609,<rdkit.Chem.rdchem.Mol object at 0x151acd92aca0>,,False,False,False,True,False,4.4184
21,EN300-120383,COc1ccnc(N)c1,COc1ccnc(NC(=O)C(c2cccnc2)N(C(=O)c2cocn2)c2ccc...,5.454452,<rdkit.Chem.rdchem.Mol object at 0x151acd92ad60>,,False,False,False,True,True,4.2872
30,EN300-61358,Nc1cccc(C(=O)Nc2cnccn2)c1,CC(C)Oc1ccc(N(C(=O)c2cocn2)C(C(=O)Nc2cccc(C(=O...,5.430921,<rdkit.Chem.rdchem.Mol object at 0x151acd1f8100>,,False,False,False,True,False,4.9259
36,EN300-385266,Cl.Cl.NCCc1ccns1,CC(C)Oc1ccc(N(C(=O)c2cocn2)C(C(=O)NCCc2ccns2)c...,5.420857,<rdkit.Chem.rdchem.Mol object at 0x151acd1f8340>,,False,False,False,True,True,4.0603


In [11]:
# Apply the four exclusion flags plus the logP ceiling, take the 100 highest
# predicted pIC50 rows, and keep the first ten of those for display.
df_view = (
    df_isocyanides
    .query('~more_than_one_chiral_center & ~matches_pain & ~has_structural_alert & ~too_many_pcores & logP < 5.1')
    .nlargest(n=100, columns='predicted_pIC50')
    .head(10)
)
df_view


Unnamed: 0,ID,SMILES,ugi,predicted_pIC50,mol,Error,has_structural_alert,matches_pain,more_than_one_chiral_center,too_many_pcores,satisfies_ro5,logP
3,EN300-61594,CCNc1cccc(CO)c1,CCN(C(=O)C(c1cccnc1)N(C(=O)c1cocn1)c1ccc(OC(C)...,5.528175,<rdkit.Chem.rdchem.Mol object at 0x151acd92a8e0>,,False,False,False,False,False,4.7903
4,EN300-100942,CCCNc1cccnc1,CCCN(C(=O)C(c1cccnc1)N(C(=O)c1cocn1)c1ccc(OC(C...,5.518613,<rdkit.Chem.rdchem.Mol object at 0x151acd92a4c0>,,False,False,False,False,False,5.0831
8,EN300-75265,CCNc1ccncc1,CCN(C(=O)C(c1cccnc1)N(C(=O)c1cocn1)c1ccc(OC(C)...,5.494262,<rdkit.Chem.rdchem.Mol object at 0x151acd92a5e0>,,False,False,False,False,True,4.693
11,EN300-19067,COc1cccc(N)c1,COc1cccc(NC(=O)C(c2cccnc2)N(C(=O)c2cocn2)c2ccc...,5.476997,<rdkit.Chem.rdchem.Mol object at 0x151acd92a9a0>,,False,False,False,False,True,4.8922
38,EN300-20183,N#Cc1cccc(N)c1,CC(C)Oc1ccc(N(C(=O)c2cocn2)C(C(=O)Nc2cccc(C#N)...,5.420334,<rdkit.Chem.rdchem.Mol object at 0x151acd1f8400>,,False,False,False,False,True,4.75528
46,EN300-106670,OCCNc1ccc(Br)cc1,CC(C)Oc1ccc(N(C(=O)c2cocn2)C(C(=O)N(CCO)c2ccc(...,5.405484,<rdkit.Chem.rdchem.Mol object at 0x151acd1f8700>,,False,False,False,False,False,5.0329
65,EN300-107394,NCCc1ccc(Cl)nc1,CC(C)Oc1ccc(N(C(=O)c2cocn2)C(C(=O)NCCc2ccc(Cl)...,5.390312,<rdkit.Chem.rdchem.Mol object at 0x151acd1f8e20>,,False,False,False,False,False,4.6522
82,EN300-137644,NCCc1ccc2[nH]ccc2c1,CC(C)Oc1ccc(N(C(=O)c2cocn2)C(C(=O)NCCc2ccc3[nH...,5.38032,<rdkit.Chem.rdchem.Mol object at 0x151acd1f94c0>,,False,False,False,False,False,5.0851
85,EN300-180531,CCC(C)(C)CN.Cl,CCC(C)(C)CNC(=O)C(c1cccnc1)N(C(=O)c1cocn1)c1cc...,5.379709,<rdkit.Chem.rdchem.Mol object at 0x151acd1f95e0>,,False,False,False,False,True,4.7973
91,EN300-76663,COCCNc1ccncc1,COCCN(C(=O)C(c1cccnc1)N(C(=O)c1cocn1)c1ccc(OC(...,5.376807,<rdkit.Chem.rdchem.Mol object at 0x151acd1f9820>,,False,False,False,False,False,4.3195


In [24]:
# Same flag and logP selection as above, but keeping every matching row:
# one row per distinct 'ugi' SMILES, ordered by predicted pIC50, best first.
df_view = (
    df_isocyanides
    .query('~more_than_one_chiral_center & ~matches_pain & ~has_structural_alert & ~too_many_pcores & logP < 5.1')
    .drop_duplicates(subset='ugi')
    .sort_values('predicted_pIC50', ascending=False)
)
df_view

Unnamed: 0,ID,SMILES,ugi,predicted_pIC50,has_structural_alert,matches_pain,more_than_one_chiral_center,too_many_pcores,satisfies_ro5,mol,Error,logP
0,EN300-61594,CCNc1cccc(CO)c1,CCN(C(=O)C(c1cccnc1)N(C(=O)c1cocn1)c1ccc(OC(C)...,5.552310,False,False,False,False,False,<rdkit.Chem.rdchem.Mol object at 0x150296e3e220>,,4.79030
2,EN300-100942,CCCNc1cccnc1,CCCN(C(=O)C(c1cccnc1)N(C(=O)c1cocn1)c1ccc(OC(C...,5.492673,False,False,False,False,False,<rdkit.Chem.rdchem.Mol object at 0x150296e3e100>,,5.08310
10,EN300-75265,CCNc1ccncc1,CCN(C(=O)C(c1cccnc1)N(C(=O)c1cocn1)c1ccc(OC(C)...,5.435981,False,False,False,False,True,<rdkit.Chem.rdchem.Mol object at 0x150296e3e7c0>,,4.69300
24,EN300-68689,CC(C)(C)c1nc2cc(N)ccc2[nH]1.Cl.Cl,CC(C)Oc1ccc(N(C(=O)c2cocn2)C(C(=O)N(CCO)c2ccc(...,5.399508,False,False,False,False,False,<rdkit.Chem.rdchem.Mol object at 0x150296e3efa0>,,5.03290
27,EN300-19067,COc1cccc(N)c1,COc1cccc(NC(=O)C(c2cccnc2)N(C(=O)c2cocn2)c2ccc...,5.391374,False,False,False,False,True,<rdkit.Chem.rdchem.Mol object at 0x150299eb2100>,,4.89220
...,...,...,...,...,...,...,...,...,...,...,...,...
62615,EN300-55249,CCOc1cccc(C(C)N)c1,CC(C)Oc1ccc(N(C(=O)c2cocn2)C(C(=O)N2CCN(C)c3cc...,4.159684,False,False,False,False,False,<rdkit.Chem.rdchem.Mol object at 0x1502994e2f40>,,4.86700
62629,EN300-28256540,Cc1ncc(F)c(C2CCNCC2)n1.Cl.Cl,Cc1ncc(F)c(C2CCN(C(=O)C(c3cccnc3)N(C(=O)c3cocn...,4.158195,False,False,False,False,False,<rdkit.Chem.rdchem.Mol object at 0x1502994e44c0>,,4.88862
62660,EN300-33484,Nc1cccc(Cl)c1-n1cncn1,Cc1cc(C)n(CC2CCN(C(=O)C(c3cccnc3)N(C(=O)c3cocn...,4.155576,False,False,False,False,False,<rdkit.Chem.rdchem.Mol object at 0x1502994e60a0>,,4.99704
62718,EN300-359669,Cc1cc(C)n(C2CCNCC2)n1,Cc1cc(C)n(C2CCN(C(=O)C(c3cccnc3)N(C(=O)c3cocn3...,4.149045,False,False,False,False,False,<rdkit.Chem.rdchem.Mol object at 0x1502994e76a0>,,4.92194


In [26]:

# Molecule grid of the 20 rows of df_view with the highest predicted pIC50;
# both scores are shown under each structure, formatted to three decimals.
mols2grid.display(
    df_view.nlargest(n=20, columns='predicted_pIC50'),
    smiles_col='ugi',
    subset=['img', 'predicted_pIC50', 'logP'],
    transform={
        'predicted_pIC50': lambda value: f'Predicted pIC50: {value:.3f}',
        'logP': lambda value: f'LogP: {value:.3f}',
    },
    size=(300, 300),
    width=2000,
)
