In [1]:
import os
os.chdir("../")

In [2]:
import pandas as pd

from rdkit import Chem
from rdkit.Chem import SaltRemover
from networkx.algorithms.clique import find_cliques
from networkx import to_numpy_matrix
from func_timeout import func_timeout, FunctionTimedOut
from mmpa.mmp import MMP

from tqdm import tqdm
tqdm.pandas()

### import all activity values for compounds within 40% similarity of adenaline

In [3]:
df = pd.read_csv('/home/daniel/Downloads/DOWNLOAD-DMAOS_6IYWiRfDdsg31VJek5bJXL5Y2ndfPnJhrbTB0=.csv.gz', sep=';', compression='gzip')

### most common molecules

In [4]:
df['Molecule Name'].value_counts()[0:10]

ISOPROTERENOL                  1894
EPINEPHRINE                    1025
NOREPINEPHRINE                  851
PHENYLEPHRINE                   566
ALBUTEROL                       493
RACEPINEPHRINE                  449
LEVISOPRENALINE                 143
ISOPROTERENOL HYDROCHLORIDE      96
NOREPINEPHRINE BITARTRATE        86
EPINEPHRINE BITARTRATE           79
Name: Molecule Name, dtype: int64

### most common targets

In [5]:
df['Target Name'].value_counts()[0:10]

Rattus norvegicus             3191
Unchecked                      665
Beta-2 adrenergic receptor     363
Beta-1 adrenergic receptor     132
Hepatotoxicity                 130
Canis familiaris               111
NON-PROTEIN TARGET             109
ADMET                           79
Beta-3 adrenergic receptor      65
Cavia porcellus                 64
Name: Target Name, dtype: int64

### filter beta2 measurements

In [6]:
df_beta2 = df[df['Target Name']=='Beta-2 adrenergic receptor'].copy()
df_beta2.Smiles.unique().size

56

### strip salts

In [7]:
def strip_salts(smiles):
    mol = Chem.MolFromSmiles(smiles)
    remover = SaltRemover.SaltRemover()
    mol, salts = remover.StripMolWithDeleted(mol)
    smiles = Chem.MolToSmiles(mol)
    return smiles

In [8]:
df_beta2['Stripped'] = df_beta2.Smiles.apply(strip_salts)
df_beta2.Stripped.unique().size

47

### create cartesian product of molecules test in the same assay

In [9]:
df_beta2_pairs = pd.merge(df_beta2, df_beta2, on='Assay ChEMBL ID')
df_beta2_pairs = df_beta2_pairs[['Stripped_x', 'Stripped_y']].drop_duplicates()
df_beta2_pairs

Unnamed: 0,Stripped_x,Stripped_y
0,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1
1,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,CCC(NC(C)C)C(O)c1ccc(O)c(O)c1
2,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,CC(C)NC[C@H](O)c1ccc(O)c(O)c1
3,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,CNCC(O)c1ccc(OC(=O)C(C)(C)C)c(OC(=O)C(C)(C)C)c1
4,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,CNCC(O)c1ccc(O)c(O)c1
...,...,...
1938,CC(C)NCC(O)c1ccc(O)c(NS(C)(=O)=O)c1,COC(CNC(C)C)c1ccc(O)c(O)c1
1946,CC(C)(C)NCC(O)c1ccc(O)c(NC(N)=O)c1,COC(CNC(C)C)c1ccc(O)c(O)c1
1954,CC(C)NCC(O)c1ccc(O)cc1,COC(CNC(C)C)c1ccc(O)c(O)c1
1962,CC(C)(C)NCC(O)c1ccc(O)c(O)c1,COC(CNC(C)C)c1ccc(O)c(O)c1


### identify pairs

In [10]:
def apply_mmpa(prospective_pair):

    # prepare potential atom-atom mappings and create correspondence graph
    mmp = MMP(prospective_pair.Stripped_x, prospective_pair.Stripped_y)
    mmp.createCorrespondence(penalty=3.0)
    mmp.findCliques()
    mmp.eliminateMCS()
    
    # append frags to output
    prospective_pair['Fragment_x'] = mmp.getFragment1()
    prospective_pair['Fragment_y'] = mmp.getFragment2()
    prospective_pair['SMIRKS'] = mmp.getSmirks()
    
    # return
    return prospective_pair

In [11]:
df_beta2_pairs = df_beta2_pairs.progress_apply(apply_mmpa, axis=1)
df_beta2_pairs = df_beta2_pairs[~pd.isna(df_beta2_pairs.SMIRKS)]
df_beta2_pairs

100%|██████████| 551/551 [00:38<00:00, 14.21it/s]


Unnamed: 0,Stripped_x,Stripped_y,Fragment_x,Fragment_y,SMIRKS
1,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,CCC(NC(C)C)C(O)c1ccc(O)c(O)c1,[H][C@@](O)(CNC(C)C)c1cccc(O)c1,CCC(NC(C)C)C(O)c1cccc(O)c1,[#6:13](-[#6:5](-[#6:12](-[H])(-[H])-[H])(-[#7...
2,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,CC(C)NC[C@H](O)c1ccc(O)c(O)c1,CC(C)NCC(O)c1cccc(O)c1,CC(C)NCC(O)c1cccc(O)c1,[#6:10](-[#6:5](-[#6:11](-[H])(-[H])-[H])(-[#7...
3,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,CNCC(O)c1ccc(OC(=O)C(C)(C)C)c(OC(=O)C(C)(C)C)c1,[H][C@@](O)(CNC(C)C)c1ccc(O)c(O)c1,CNCC(O)c1ccc(OC(=O)C(C)(C)C)c(OC(=O)C(C)(C)C)c1,[#6](-[#6](-[#6](-[H])(-[H])-[H])(-[#7:5](-[#6...
4,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,CNCC(O)c1ccc(O)c(O)c1,[H][C@@](O)(CNC(C)C)c1cccc(O)c1,CNCC(O)c1cccc(O)c1,[#6](-[#6](-[#6](-[H])(-[H])-[H])(-[#7:6](-[#6...
5,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,CC(C)NCC(O)c1ccc(O)c(O)c1,[H][C@@](O)(CNC(C)C)c1cccc(O)c1,CC(C)NCC(O)c1cccc(O)c1,[#6:8](-[#6:3](-[#6:9](-[H])(-[H])-[H])(-[#7:5...
...,...,...,...,...,...
1938,CC(C)NCC(O)c1ccc(O)c(NS(C)(=O)=O)c1,COC(CNC(C)C)c1ccc(O)c(O)c1,CNCC(O)c1ccc(O)c(NS(C)(=O)=O)c1,CNCC(OC)c1ccc(O)c(O)c1,[#6:10](-[#7:9](-[#6:11](-[#6:5](-[#8]-[H])(-[...
1946,CC(C)(C)NCC(O)c1ccc(O)c(NC(N)=O)c1,COC(CNC(C)C)c1ccc(O)c(O)c1,CC(C)(C)NCC(O)c1ccc(O)c(NC(N)=O)c1,COC(CNC(C)C)c1ccc(O)c(O)c1,[#6](-[#6](-[#6:11](-[H])(-[H])-[H])(-[#6:12](...
1954,CC(C)NCC(O)c1ccc(O)cc1,COC(CNC(C)C)c1ccc(O)c(O)c1,CNCC(O)c1ccc(O)cc1,CNCC(OC)c1ccc(O)c(O)c1,[#6:5](-[#7:6](-[#6:7](-[#6:1](-[#8]-[H])(-[#6...
1962,CC(C)(C)NCC(O)c1ccc(O)c(O)c1,COC(CNC(C)C)c1ccc(O)c(O)c1,ccc(cc)C(O)CNC(C)(C)C,ccc(cc)C(CNC(C)C)OC,[#6](-[#6](-[#6:12](-[H])(-[H])-[H])(-[#6:13](...


### quality control, demonstrate x -> y using reaction

In [12]:
def apply_reactions(reaction):
    
    # create reaction
    rxn = Chem.rdChemReactions.ReactionFromSmarts(reaction.SMIRKS)

    # enumerate products
    reaction['Products'] = rxn.RunReactants((Chem.AddHs(Chem.MolFromSmiles(reaction.Stripped_x)),))
    return reaction

In [13]:
df_beta2_products = df_beta2_pairs.progress_apply(apply_reactions, axis=1)
df_beta2_products.sample(3)

100%|██████████| 490/490 [00:11<00:00, 43.75it/s]


Unnamed: 0,Stripped_x,Stripped_y,Fragment_x,Fragment_y,SMIRKS,Products
1543,OCc1cc(C(O)CNCc2ccc(CNCC(O)c3ccc(O)c(CO)c3)cc2...,CC(C)(CNCC(O)c1ccc(O)c(CO)c1)CNCC(O)c1ccc(O)c(...,CCNCc1ccc(CNCC(O)c2ccc(O)c(CO)c2)cc1,CCNCC(C)(C)CNCC(O)c1ccc(O)c(CO)c1,[#8](-[#6](-[#6]1:[#6](:[#6:5](-[#6](-[#8]-[H]...,((<rdkit.Chem.rdchem.Mol object at 0x7f19c6459...
325,CC(C)NCC(O)c1ccc(O)cc1,CC(C)(C)NCC(O)c1ccc(O)c(CO)c1,CC(C)NCC(O)c1ccc(O)cc1,CC(C)(C)NCC(O)c1ccc(O)c(CO)c1,[#6:8](-[#6](-[#6:9](-[H])(-[H])-[H])(-[#7:3](...,((<rdkit.Chem.rdchem.Mol object at 0x7f19c728c...
34,CNCC(O)c1ccc(O)c(O)c1,CNC[C@H](O)c1ccc(O)c(O)c1,CNCC(O)c1cccc(O)c1,[H][C@](O)(CNC)c1cccc(O)c1,[#6:9](-[#7:3](-[#6:5](-[#6](-[#8:12]-[H])(-[#...,((<rdkit.Chem.rdchem.Mol object at 0x7f19c756c...


In [14]:
len(df_beta2_products.index)

490

In [15]:
def products_to_list(productset):
    
    # given produces exist
    productlist = []
    for product in productset:
        productlist.append('.'.join([Chem.MolToSmiles(Chem.RemoveHs(productpart)) for productpart in product]))
    return list(set(productlist))

df_beta2_products['ProductList'] = df_beta2_products.Products.apply(lambda x: products_to_list(x) if pd.notna(x) else list())

In [16]:
df_beta2_products.sample(3)

Unnamed: 0,Stripped_x,Stripped_y,Fragment_x,Fragment_y,SMIRKS,Products,ProductList
824,CCC(NC1CCCC1)C(O)c1ccc(O)c(O)c1,NC[C@H](O)c1ccc(O)c(O)c1,CCC(NC1CCCC1)C(O)c1cccc(O)c1,[H][C@](O)(CN)c1cccc(O)c1,[#6](-[#6](-[#6](-[#7](-[#6]1(-[#6](-[#6](-[#6...,((<rdkit.Chem.rdchem.Mol object at 0x7f19c6d69...,[NC[C@H](O)c1ccc(O)c(O)c1]
122,NC[C@H](O)c1ccc(O)c(O)c1,CNCC(O)c1ccc(O)c(O)c1F,[H][C@](O)(CN)c1ccc(O)c(O)c1,CNCC(O)c1ccc(O)c(O)c1F,[#7](-[#6:7](-[#6@](-[#8:9]-[H])(-[#6:2]1:[#6:...,((<rdkit.Chem.rdchem.Mol object at 0x7f19c7526...,[CNCC(O)c1ccc(O)c(O)c1F]
78,CNCC(O)c1cc(O)c(O)cc1F,CNC[C@H](O)c1cc(O)c(O)cc1F,CNCC(O)c1cc(O)ccc1F,[H][C@](O)(CNC)c1cc(O)ccc1F,[#6:10](-[#7:3](-[#6:7](-[#6](-[#8:13]-[H])(-[...,((<rdkit.Chem.rdchem.Mol object at 0x7f19c7517...,[CNC[C@H](O)c1cc(O)c(O)cc1F]


In [17]:
df_beta2_products.apply(lambda x: x.Stripped_y not in x.ProductList, axis=1).sum()

0

In [18]:
df_beta2_products.ProductList.apply(lambda x: len(x) > 1).sum()

0