In [1]:
import os
# Move the working directory one level up from wherever the kernel started.
# NOTE(review): presumably this makes the project-local `mmpa` package
# importable from the repository root — confirm.
# NOTE(review): the path is relative, so re-running this cell without
# restarting the kernel climbs one more directory each time.
os.chdir("../")

In [2]:
import pandas as pd

from rdkit import Chem
from rdkit.Chem import SaltRemover
from networkx import to_numpy_matrix
from mmpa.mmp import MMP

from tqdm import tqdm
tqdm.pandas()

### Import all activity values from Glen's HDAC paper

https://www.ebi.ac.uk/chembl/g/#browse/activities/filter/document_chembl_id%3ACHEMBL1144843

In [3]:
# Read the semicolon-separated activity table directly from the zipped download.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs on a machine where exactly this file exists.
df = pd.read_csv('/home/daniel/Downloads/DOWNLOAD-yHB7WOyNM1ncO7P50KZYChBxehUvIU9pGeCACYqyD4Y=.zip', sep=';', compression='zip')

### most common molecules

In [4]:
# Ten most frequent molecule names (value_counts sorts by descending count).
df['Molecule Name'].value_counts().head(10)

Series([], Name: Molecule Name, dtype: int64)

### most common targets

In [5]:
# Ten most frequent target names (value_counts sorts by descending count).
df['Target Name'].value_counts().head(10)

Rattus norvegicus        44
No relevant target       24
HCT-116                  13
Histone deacetylase 1    13
HERG                     13
Canis familiaris          7
Plasma                    3
Mus musculus              2
Name: Target Name, dtype: int64

### filter hdac1 measurements

In [6]:
# Keep only the rows whose target is HDAC1; the copy means later column
# assignments on df_hdac1 do not touch `df`.
df_hdac1 = df.loc[df['Target Name'].eq('Histone deacetylase 1')].copy()
# Number of distinct SMILES strings among those rows.
len(df_hdac1['Smiles'].unique())

13

### strip salts

In [7]:
def strip_salts(smiles):
    """Return the SMILES of a molecule after removing its salt fragments.

    Parameters
    ----------
    smiles : str
        SMILES string of the (possibly salted) molecule.

    Returns
    -------
    str
        SMILES written by ``Chem.MolToSmiles`` for the molecule returned by
        ``SaltRemover.StripMolWithDeleted``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending input instead of deep inside the salt remover.
        raise ValueError("could not parse SMILES: {!r}".format(smiles))

    remover = SaltRemover.SaltRemover()
    # The second element (the fragments that were removed) is not needed.
    stripped, _deleted = remover.StripMolWithDeleted(mol)
    return Chem.MolToSmiles(stripped)

In [8]:
# Store the salt-free SMILES next to the original, then count distinct structures.
df_hdac1['Stripped'] = df_hdac1['Smiles'].apply(strip_salts)
len(df_hdac1['Stripped'].unique())

13

### create cartesian product of molecules tested in the same assay

In [9]:
# Self-join on the assay id so every molecule is paired with every molecule
# measured in the same assay (this includes each molecule paired with itself
# and both orderings of a pair), then keep one row per distinct SMILES pair.
df_hdac1_pairs = (
    df_hdac1
    .merge(df_hdac1, on='Assay ChEMBL ID')
    [['Stripped_x', 'Stripped_y']]
    .drop_duplicates()
)
df_hdac1_pairs

Unnamed: 0,Stripped_x,Stripped_y
0,CCN1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c(Cl...,CCN1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c(Cl...
1,CCN1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c(Cl...,CC(C)N1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c...
2,CCN1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c(Cl...,CC(C)N1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c...
3,CCN1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c(Cl...,N#Cc1cc(CN2CCC2)cnc1-c1ccc(C(=O)Nc2ccccc2N)cc1
4,CCN1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c(Cl...,CCN1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c(F)...
...,...,...
164,CC(C)N1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c...,Cc1cc(CN2CCN(C(C)C)CC2)cnc1-c1ccc(C(=O)Nc2cccc...
165,CC(C)N1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c...,Nc1ccccc1NC(=O)c1ccc(-c2ncc(CN3CCC3)cc2F)cc1
166,CC(C)N1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c...,Nc1ccccc1NC(=O)c1ccc(-c2ncc(CN3CCC3)cc2Cl)cc1
167,CC(C)N1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c...,Cc1cc(CN2CCC2)cnc1-c1ccc(C(=O)Nc2ccccc2N)cc1


### identify pairs

In [10]:
def apply_mmpa(prospective_pair):
    """Run the MMP analysis on one candidate pair and attach its results.

    Parameters
    ----------
    prospective_pair : pandas.Series
        Row with ``Stripped_x`` and ``Stripped_y`` SMILES strings.

    Returns
    -------
    pandas.Series
        The same row object, with ``Fragment_x``, ``Fragment_y`` and
        ``SMIRKS`` set from the MMP object's getters.
    """
    # Prepare potential atom-atom mappings and build the correspondence graph.
    # The penalty value is passed straight through to the mmpa package, which
    # defines its meaning.
    matcher = MMP(prospective_pair.Stripped_x, prospective_pair.Stripped_y)
    matcher.createCorrespondence(penalty=3.0)
    matcher.findCliques()
    matcher.eliminateMCS()

    # Copy the results onto the row, in the same order as the output columns.
    result_getters = (
        ('Fragment_x', matcher.getFragment1),
        ('Fragment_y', matcher.getFragment2),
        ('SMIRKS', matcher.getSmirks),
    )
    for column, getter in result_getters:
        prospective_pair[column] = getter()

    return prospective_pair

In [11]:
# Analyse every candidate pair row by row (with a progress bar), then drop
# the rows for which no SMIRKS was produced.
df_hdac1_pairs = df_hdac1_pairs.progress_apply(apply_mmpa, axis=1)
df_hdac1_pairs = df_hdac1_pairs[df_hdac1_pairs['SMIRKS'].notna()]
df_hdac1_pairs

100%|██████████| 169/169 [00:37<00:00,  4.52it/s]


Unnamed: 0,Stripped_x,Stripped_y,Fragment_x,Fragment_y,SMIRKS
1,CCN1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c(Cl...,CC(C)N1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c...,cc(c)-c1ncc(CN2CCN(CC)CC2)cc1Cl,cc(c)-c1ncc(CN2CCN(C(C)C)CC2)cc1C#N,[#6:13](-[#6](-[#7:6]1-[#6:30](-[#6:29](-[#7:1...
2,CCN1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c(Cl...,CC(C)N1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c...,cc(c)-c1ncc(CN2CCN(CC)CC2)cc1Cl,cc(c)-c1ncc(CN2CCN(C(C)C)CC2)cc1F,[#6:13](-[#6](-[#7:6]1-[#6:29](-[#6:28](-[#7:1...
3,CCN1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c(Cl...,N#Cc1cc(CN2CCC2)cnc1-c1ccc(C(=O)Nc2ccccc2N)cc1,cc(c)-c1ncc(CN2CCN(CC)CC2)cc1Cl,cc(c)-c1ncc(CN2CCC2)cc1C#N,[#6](-[#6](-[#7]1-[#6](-[#6:21](-[#7:11](-[#6:...
4,CCN1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c(Cl...,CCN1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c(F)...,cc(c)-c1ncc(C)cc1Cl,cc(c)-c1ncc(C)cc1F,[#6:10](-[#6:8]1:[#6:21](:[#7:24]:[#6:2](-[#6:...
5,CCN1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c(Cl...,CCN1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c(C#...,cc(c)-c1ncc(C)cc1Cl,cc(c)-c1ncc(C)cc1C#N,[#6:10](-[#6:8]1:[#6:21](:[#7:22]:[#6:2](-[#6:...
...,...,...,...,...,...
163,CC(C)N1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c...,N#Cc1cccnc1-c1ccc(C(=O)Nc2ccccc2N)cc1,cc(c)-c1ncc(CN2CCN(C(C)C)CC2)cc1Cl,cc(c)-c1ncccc1C#N,[#6](-[#6](-[#6](-[H])(-[H])-[H])(-[#7]1-[#6](...
164,CC(C)N1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c...,Cc1cc(CN2CCN(C(C)C)CC2)cnc1-c1ccc(C(=O)Nc2cccc...,cc(c)-c1ncc(C)cc1Cl,cc(c)-c1ncc(C)cc1C,[#6:6](-[#6:3]1:[#6:23](:[#7:19]:[#6:12](-[#6:...
165,CC(C)N1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c...,Nc1ccccc1NC(=O)c1ccc(-c2ncc(CN3CCC3)cc2F)cc1,cc(c)-c1ncc(CN2CCN(C(C)C)CC2)cc1Cl,cc(c)-c1ncc(CN2CCC2)cc1F,[#6](-[#6](-[#6](-[H])(-[H])-[H])(-[#7]1-[#6](...
166,CC(C)N1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c...,Nc1ccccc1NC(=O)c1ccc(-c2ncc(CN3CCC3)cc2Cl)cc1,cCN1CCN(C(C)C)CC1,cCN1CCC1,[#6](-[#6](-[#6](-[H])(-[H])-[H])(-[#7]1-[#6](...


### quality control: demonstrate that applying the SMIRKS reaction to x yields y

In [12]:
def apply_reactions(reaction):
    """Apply a row's SMIRKS transformation to its first molecule.

    Parameters
    ----------
    reaction : pandas.Series
        Row with a ``SMIRKS`` reaction string and a ``Stripped_x`` SMILES.

    Returns
    -------
    pandas.Series
        The same row object, with ``Products`` set to the value returned by
        ``RunReactants`` for the hydrogen-expanded ``Stripped_x`` molecule.
    """
    # Import the reactions submodule explicitly: `Chem.rdChemReactions` only
    # resolves if some other module has already imported it, which makes the
    # cell depend on import side effects elsewhere.
    from rdkit.Chem import rdChemReactions

    # Build the reaction from the SMIRKS string.
    rxn = rdChemReactions.ReactionFromSmarts(reaction.SMIRKS)

    # Hydrogens are made explicit before matching; the SMIRKS strings shown in
    # this notebook contain explicit [H] atoms, presumably the reason.
    reactant = Chem.AddHs(Chem.MolFromSmiles(reaction.Stripped_x))

    # Enumerate products for the single reactant.
    reaction['Products'] = rxn.RunReactants((reactant,))
    return reaction

In [13]:
# Enumerate the reaction products for every pair (row-wise, with a progress bar).
df_hdac1_products = df_hdac1_pairs.progress_apply(apply_reactions, axis=1)
# Fixed seed so the displayed sample is the same on every run.
df_hdac1_products.sample(3, random_state=0)

100%|██████████| 156/156 [00:07<00:00, 19.55it/s]


Unnamed: 0,Stripped_x,Stripped_y,Fragment_x,Fragment_y,SMIRKS,Products
60,CCN1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c(F)...,Cc1cc(CN2CCN(C(C)C)CC2)cnc1-c1ccc(C(=O)Nc2cccc...,cc(c)-c1ncc(CN2CCN(CC)CC2)cc1F,cc(c)-c1ncc(CN2CCN(C(C)C)CC2)cc1C,[#6:13](-[#6](-[#7:2]1-[#6:28](-[#6:30](-[#7:3...,((<rdkit.Chem.rdchem.Mol object at 0x7fed3537f...
20,CC(C)N1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c...,N#Cc1cccnc1-c1ccc(C(=O)Nc2ccccc2N)cc1,c-c1ncc(CN2CCN(C(C)C)CC2)cc1C#N,c-c1ncccc1C#N,[#6](-[#6](-[#6](-[H])(-[H])-[H])(-[#7]1-[#6](...,((<rdkit.Chem.rdchem.Mol object at 0x7fed35621...
18,CC(C)N1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c...,CCN1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c(C#...,CC(C)N1CCNCC1,CCN1CCNCC1,[#6:13](-[#6](-[#6](-[H])(-[H])-[H])(-[#7:5]1-...,((<rdkit.Chem.rdchem.Mol object at 0x7fed35658...


In [14]:
# Number of pairs carried through the reaction step.
df_hdac1_products.shape[0]

156

In [15]:
def products_to_list(productset):
    """Convert enumerated reaction products to a list of unique SMILES.

    Parameters
    ----------
    productset : iterable of iterables of rdkit.Chem.Mol
        One inner iterable per reaction outcome, holding that outcome's
        product molecules.

    Returns
    -------
    list of str
        One SMILES per distinct outcome, sorted alphabetically. The parts of
        a multi-molecule outcome are joined with ``'.'``. Returns an empty
        list when ``productset`` is empty.
    """
    # Explicit hydrogens are removed from each part before writing its SMILES.
    outcome_smiles = {
        '.'.join(Chem.MolToSmiles(Chem.RemoveHs(part)) for part in outcome)
        for outcome in productset
    }
    # Sort so the order does not depend on per-process string hashing.
    return sorted(outcome_smiles)

# Convert each row's enumerated products into a list of unique SMILES;
# rows whose Products value is missing get an empty list instead.
df_hdac1_products['ProductList'] = df_hdac1_products.Products.apply(lambda x: products_to_list(x) if pd.notna(x) else list())

In [16]:
# Fixed seed so the displayed sample is the same on every run.
df_hdac1_products.sample(3, random_state=0)

Unnamed: 0,Stripped_x,Stripped_y,Fragment_x,Fragment_y,SMIRKS,Products,ProductList
77,CCN1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c(C#...,CC(C)N1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c...,cc(c)-c1ncc(CN2CCN(CC)CC2)cc1C#N,cc(c)-c1ncc(CN2CCN(C(C)C)CC2)cc1Cl,[#6:13](-[#6](-[#7:6]1-[#6:24](-[#6:26](-[#7:1...,((<rdkit.Chem.rdchem.Mol object at 0x7fed351b2...,[CC(C)N1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)...
101,N#Cc1cccnc1-c1ccc(C(=O)Nc2ccccc2N)cc1,Nc1ccccc1NC(=O)c1ccc(-c2ncc(CN3CCC3)cc2Cl)cc1,cc(c)-c1ncccc1C#N,cc(c)-c1ncc(CN2CCC2)cc1Cl,[#7]#[#6]-[#6:17]1:[#6:14](:[#6](:[#6:15](:[#7...,((<rdkit.Chem.rdchem.Mol object at 0x7fed3507d...,[Nc1ccccc1NC(=O)c1ccc(-c2ncc(CN3CCC3)cc2Cl)cc1]
73,CCN1CCN(Cc2cnc(-c3ccc(C(=O)Nc4ccccc4N)cc3)c(C#...,Cc1cc(CN2CCN(C(C)C)CC2)cnc1-c1ccc(C(=O)Nc2cccc...,cc(c)-c1ncc(CN2CCN(CC)CC2)cc1C#N,cc(c)-c1ncc(CN2CCN(C(C)C)CC2)cc1C,[#6:13](-[#6](-[#7:2]1-[#6:25](-[#6:26](-[#7:3...,((<rdkit.Chem.rdchem.Mol object at 0x7fed352a3...,[Cc1cc(CN2CCN(C(C)C)CC2)cnc1-c1ccc(C(=O)Nc2ccc...


In [17]:
# Count the pairs whose Stripped_y is absent from the enumerated products;
# 0 means every SMIRKS reproduced y from x.
df_hdac1_products.apply(lambda row: row.Stripped_y not in row.ProductList, axis=1).sum()

0

In [18]:
# Count the pairs whose reaction produced more than one distinct product.
df_hdac1_products['ProductList'].map(len).gt(1).sum()

0