In [1]:
# Move the working directory up one level.
# NOTE(review): presumably this is so the `mmpa` package imported in the next
# cell resolves from the parent directory — confirm.
# The path is relative, so re-running this cell moves up one more level each
# time; run it only once per kernel session.
import os
os.chdir("../")

In [2]:
import pandas as pd

from rdkit import Chem
from rdkit.Chem import SaltRemover
# NOTE(review): the networkx and func_timeout names below are not referenced in
# any cell visible here. `to_numpy_matrix` was removed in NetworkX 3.0 (as far
# as I know), so this import would fail on current releases — confirm whether
# these imports are still needed.
from networkx.algorithms.clique import find_cliques
from networkx import to_numpy_matrix
from func_timeout import func_timeout, FunctionTimedOut
from mmpa.mmp import MMP

from tqdm import tqdm
# Enables the `progress_apply` method used by the row-wise applies further down.
tqdm.pandas()

### import all activity values for compounds within 40% similarity of adrenaline

In [3]:
# Load the activity export (semicolon-separated, gzip-compressed CSV).
# The location can be overridden through the BETA2_ACTIVITIES_CSV environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset the original download location is used.
activities_csv = os.environ.get(
    'BETA2_ACTIVITIES_CSV',
    '/home/daniel/Downloads/DOWNLOAD-DMAOS_6IYWiRfDdsg31VJek5bJXL5Y2ndfPnJhrbTB0=.csv.gz',
)
df = pd.read_csv(activities_csv, sep=';', compression='gzip')

### most common molecules

In [13]:
# Ten most frequent (molecule id, molecule name) combinations in the export.
df.loc[:, ['Molecule ChEMBL ID', 'Molecule Name']].value_counts().head(10)

Molecule ChEMBL ID  Molecule Name              
CHEMBL434           ISOPROTERENOL                  1894
CHEMBL679           EPINEPHRINE                    1025
CHEMBL1437          NOREPINEPHRINE                  851
CHEMBL1215          PHENYLEPHRINE                   566
CHEMBL714           ALBUTEROL                       493
CHEMBL1740          RACEPINEPHRINE                  449
CHEMBL1160723       LEVISOPRENALINE                 117
CHEMBL1711          ISOPROTERENOL HYDROCHLORIDE      96
CHEMBL1256958       EPINEPHRINE BITARTRATE           79
CHEMBL677           LEVONORDEFRIN                    68
dtype: int64

### most common assay / target combinations

In [9]:
# Ten most frequent (assay id, target name) combinations in the export.
df.loc[:, ['Assay ChEMBL ID', 'Target Name']].value_counts().head(10)

Assay ChEMBL ID  Target Name                                               
CHEMBL3885882    Rattus norvegicus                                             2658
CHEMBL3885883    Rattus norvegicus                                              447
CHEMBL1794375    Unchecked                                                       51
CHEMBL1794424    Thioredoxin reductase 1, cytoplasmic                            40
CHEMBL1738442    Histone-lysine N-methyltransferase, H3 lysine-9 specific 3      39
CHEMBL1614275    Putative fructose-1,6-bisphosphate aldolase                     30
CHEMBL1741321    Cytochrome P450 2D6                                             29
CHEMBL1741325    Cytochrome P450 2C9                                             29
CHEMBL1741324    Cytochrome P450 3A4                                             29
CHEMBL1741323    Cytochrome P450 2C19                                            29
dtype: int64

### filter beta2 measurements

In [6]:
# Keep only the rows measured against the beta-2 adrenergic receptor; copy so
# later column assignments do not touch `df`.
df_beta2 = df.loc[df['Target Name'].eq('Beta-2 adrenergic receptor')].copy()
# Number of distinct SMILES strings among those rows.
len(df_beta2['Smiles'].unique())

56

### strip salts

In [7]:
def strip_salts(smiles):
    """Return the SMILES of a molecule after removing its salt fragments.

    The input is parsed with RDKit, stripped with a default-constructed
    ``SaltRemover`` and written back out with ``Chem.MolToSmiles``.

    Parameters
    ----------
    smiles : str or missing value
        SMILES string to clean. A missing cell (``None`` / ``NaN``) is
        passed through instead of being handed to RDKit.

    Returns
    -------
    str or None
        SMILES of the stripped molecule, or ``None`` when the input is missing.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles``.
    """
    # Propagate missing cells rather than failing inside RDKit.
    if pd.isna(smiles):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending input instead of with an opaque error later on.
        raise ValueError("could not parse SMILES: {!r}".format(smiles))

    remover = SaltRemover.SaltRemover()
    # The list of deleted salt fragments is not needed, only the stripped molecule.
    stripped_mol, _deleted = remover.StripMolWithDeleted(mol)
    return Chem.MolToSmiles(stripped_mol)

In [8]:
# Add the salt-free SMILES as a new column, then count the distinct stripped structures.
df_beta2['Stripped'] = df_beta2['Smiles'].apply(strip_salts)
len(df_beta2['Stripped'].unique())

47

### create cartesian product of molecules tested in the same assay

In [9]:
# Self-join on the assay id: every row is combined with every row sharing its
# assay (including itself, and in both orders). Only the two stripped-SMILES
# columns are kept, one row per distinct (x, y) combination.
df_beta2_pairs = df_beta2.merge(df_beta2, on='Assay ChEMBL ID')
df_beta2_pairs = df_beta2_pairs.loc[:, ['Stripped_x', 'Stripped_y']].drop_duplicates()
df_beta2_pairs

Unnamed: 0,Stripped_x,Stripped_y
0,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1
1,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,CCC(NC(C)C)C(O)c1ccc(O)c(O)c1
2,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,CC(C)NC[C@H](O)c1ccc(O)c(O)c1
3,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,CNCC(O)c1ccc(OC(=O)C(C)(C)C)c(OC(=O)C(C)(C)C)c1
4,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,CNCC(O)c1ccc(O)c(O)c1
...,...,...
1938,CC(C)NCC(O)c1ccc(O)c(NS(C)(=O)=O)c1,COC(CNC(C)C)c1ccc(O)c(O)c1
1946,CC(C)(C)NCC(O)c1ccc(O)c(NC(N)=O)c1,COC(CNC(C)C)c1ccc(O)c(O)c1
1954,CC(C)NCC(O)c1ccc(O)cc1,COC(CNC(C)C)c1ccc(O)c(O)c1
1962,CC(C)(C)NCC(O)c1ccc(O)c(O)c1,COC(CNC(C)C)c1ccc(O)c(O)c1


### identify pairs

In [10]:
def apply_mmpa(prospective_pair):
    """Run the MMP analysis on one candidate pair and attach its results.

    Parameters
    ----------
    prospective_pair : pandas.Series
        Row carrying the two SMILES strings as ``Stripped_x`` and ``Stripped_y``.

    Returns
    -------
    pandas.Series
        A copy of the row extended with ``Fragment_x``, ``Fragment_y`` and
        ``SMIRKS``, filled from ``MMP.getFragment1()``, ``MMP.getFragment2()``
        and ``MMP.getSmirks()`` respectively. The input row is left unmodified.
    """
    # prepare potential atom-atom mappings and create correspondence graph
    mmp = MMP(prospective_pair.Stripped_x, prospective_pair.Stripped_y, fuzziness=5)
    mmp.createCorrespondence()
    mmp.findCliques()
    mmp.eliminateMCS()

    # Write the results onto a copy: pandas does not support apply functions
    # that mutate the row object they are handed.
    result = prospective_pair.copy()
    result['Fragment_x'] = mmp.getFragment1()
    result['Fragment_y'] = mmp.getFragment2()
    result['SMIRKS'] = mmp.getSmirks()
    return result

In [11]:
# Analyse every candidate pair row by row (with a progress bar), then keep only
# the rows for which a SMIRKS value is present.
df_beta2_pairs = df_beta2_pairs.progress_apply(apply_mmpa, axis=1)
df_beta2_pairs = df_beta2_pairs.loc[df_beta2_pairs['SMIRKS'].notna()]
df_beta2_pairs

100%|██████████| 551/551 [02:26<00:00,  3.75it/s]


Unnamed: 0,Stripped_x,Stripped_y,Fragment_x,Fragment_y,SMIRKS
1,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,CCC(NC(C)C)C(O)c1ccc(O)c(O)c1,[H][C@@]([OH])([CH2][NH][CH]([CH3])[CH3])[c]1[...,[CH3][CH2][CH]([NH][CH]([CH3])[CH3])[CH]([OH])...,[#6:10](-[#6:7](-[#6:9](-[H])(-[H])-[H])(-[#7:...
2,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,CC(C)NC[C@H](O)c1ccc(O)c(O)c1,[CH3][CH]([CH3])[NH][CH2][C]([OH])[c]1[cH][cH]...,[CH3][CH]([CH3])[NH][CH2][C]([OH])[c]1[cH][cH]...,[#6:10](-[#6:5](-[#6:11](-[H])(-[H])-[H])(-[#7...
3,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,CNCC(O)c1ccc(OC(=O)C(C)(C)C)c(OC(=O)C(C)(C)C)c1,[H][C@@]([OH])([CH2][NH][CH]([CH3])[CH3])[c]1[...,[CH3][NH][CH2][CH]([OH])[c]1[cH][cH][c]([O][C]...,[#6](-[#6:5](-[#6](-[H])(-[H])-[H])(-[#7:3](-[...
4,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,CNCC(O)c1ccc(O)c(O)c1,[H][C@@]([OH])([CH2][NH][CH]([CH3])[CH3])[c]1[...,[CH3][NH][CH2][CH]([OH])[c]1[cH][cH][cH][c]([O...,[#6](-[#6:6](-[#6](-[H])(-[H])-[H])(-[#7:8](-[...
5,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,CC(C)NCC(O)c1ccc(O)c(O)c1,[H][C@@]([OH])([CH2][NH][CH]([CH3])[CH3])[c]1[...,[CH3][CH]([CH3])[NH][CH2][CH]([OH])[c]1[cH][cH...,[#6:9](-[#6:6](-[#6:10](-[H])(-[H])-[H])(-[#7:...
...,...,...,...,...,...
1938,CC(C)NCC(O)c1ccc(O)c(NS(C)(=O)=O)c1,COC(CNC(C)C)c1ccc(O)c(O)c1,[CH3][NH][CH2][CH]([OH])[c]1[cH][cH][c]([OH])[...,[CH3][NH][CH2][CH]([O][CH3])[c]1[cH][cH][c]([O...,[#6:11](-[#7:12](-[#6:10](-[#6:8](-[#8:13]-[H]...
1946,CC(C)(C)NCC(O)c1ccc(O)c(NC(N)=O)c1,COC(CNC(C)C)c1ccc(O)c(O)c1,[CH3][C]([CH3])([CH3])[NH][CH2][CH]([OH])[c]1[...,[CH3][O][CH]([CH2][NH][CH]([CH3])[CH3])[c]1[cH...,[#6:15](-[#6:12](-[#6:14](-[H])(-[H])-[H])(-[#...
1954,CC(C)NCC(O)c1ccc(O)cc1,COC(CNC(C)C)c1ccc(O)c(O)c1,[CH3][NH][CH2][CH]([OH])[c]1[cH][cH][c]([OH])[...,[CH3][NH][CH2][CH]([O][CH3])[c]1[cH][cH][c]([O...,[#6:9](-[#7:11](-[#6:10](-[#6:6](-[#8:12]-[H])...
1962,CC(C)(C)NCC(O)c1ccc(O)c(O)c1,COC(CNC(C)C)c1ccc(O)c(O)c1,[cH2][cH][c]([cH][cH2])[CH]([OH])[CH2][NH][C](...,[cH2][cH][c]([cH][cH2])[CH]([CH2][NH][CH]([CH3...,[#6:10](-[#6:8](-[#6](-[H])(-[H])-[H])(-[#6:9]...


### quality control, demonstrate x -> y using reaction

In [12]:
def apply_reactions(reaction):
    """Apply a row's SMIRKS transformation to its first molecule.

    Parameters
    ----------
    reaction : pandas.Series
        Row carrying the transformation as ``SMIRKS`` and the reactant SMILES
        as ``Stripped_x``.

    Returns
    -------
    pandas.Series
        A copy of the row with an added ``Products`` entry holding whatever
        ``RunReactants`` returns for the reactant. The input row is left
        unmodified.
    """
    # create reaction
    # NOTE(review): `Chem.rdChemReactions` is reached as an attribute of `Chem`;
    # presumably another import in this notebook loads that submodule — confirm.
    rxn = Chem.rdChemReactions.ReactionFromSmarts(reaction.SMIRKS)

    # The reactant is given explicit hydrogens before the reaction is run.
    reactant = Chem.AddHs(Chem.MolFromSmiles(reaction.Stripped_x))

    # enumerate products, writing onto a copy: pandas does not support apply
    # functions that mutate the row object they are handed.
    result = reaction.copy()
    result['Products'] = rxn.RunReactants((reactant,))
    return result

In [13]:
# Run each pair's SMIRKS on its first molecule (with a progress bar).
df_beta2_products = df_beta2_pairs.progress_apply(apply_reactions, axis=1)
# Preview a few rows; the fixed seed keeps the displayed rows the same on every run.
df_beta2_products.sample(3, random_state=0)

100%|██████████| 490/490 [00:12<00:00, 39.86it/s]


Unnamed: 0,Stripped_x,Stripped_y,Fragment_x,Fragment_y,SMIRKS,Products
1345,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,CC(C)NCC(O)c1ccc2ccccc2c1,[H][C@@]([OH])([CH2][NH][CH]([CH3])[CH3])[c]1[...,[CH3][CH]([CH3])[NH][CH2][CH]([OH])[c]1[cH][cH...,[#6:13](-[#6:9](-[#6:12](-[H])(-[H])-[H])(-[#7...,((<rdkit.Chem.rdchem.Mol object at 0x7f249cc29...
383,CC(C)(C)NCC(O)c1ccc(O)c(C(N)=O)c1,CC(C)NCC(O)c1ccc(O)c(O)c1,[CH3][C]([CH3])([CH3])[NH][CH2][CH]([OH])[c]1[...,[CH3][CH]([CH3])[NH][CH2][CH]([OH])[c]1[cH][cH...,[#6:14](-[#6:11](-[#6:15](-[H])(-[H])-[H])(-[#...,((<rdkit.Chem.rdchem.Mol object at 0x7f249d374...
477,NC[C@H](O)c1ccc(O)c(O)c1,CC(C)NCC(O)c1ccc(O)c(O)c1,[H][C@]([OH])([CH2][NH2])[c]1[cH][cH][cH][c]([...,[CH3][CH]([CH3])[NH][CH2][CH]([OH])[c]1[cH][cH...,[#7:10](-[#6:11](-[#6@:7](-[#8:12]-[H])(-[#6:4...,((<rdkit.Chem.rdchem.Mol object at 0x7f249d1ce...


In [14]:
# Number of pairs that went through the reaction step.
df_beta2_products.shape[0]

490

In [15]:
def products_to_list(productset):
    """Convert reaction output into a de-duplicated list of product SMILES.

    Parameters
    ----------
    productset : iterable of iterables of molecules
        One inner iterable per reaction outcome, each holding the molecules
        produced by that outcome.

    Returns
    -------
    list of str
        One SMILES string per distinct outcome. The molecules of an outcome
        have their hydrogens removed and are joined with ``'.'``. The list is
        sorted so the result does not depend on set iteration order.
    """
    unique_smiles = {
        '.'.join(Chem.MolToSmiles(Chem.RemoveHs(part)) for part in product)
        for product in productset
    }
    return sorted(unique_smiles)

# Turn each row's reaction output into a list of product SMILES; rows whose
# `Products` entry is missing get an empty list.
df_beta2_products['ProductList'] = df_beta2_products['Products'].apply(
    lambda products: products_to_list(products) if pd.notna(products) else []
)

In [16]:
# Preview a few rows including the new ProductList column; the fixed seed keeps
# the displayed rows the same on every run.
df_beta2_products.sample(3, random_state=0)

Unnamed: 0,Stripped_x,Stripped_y,Fragment_x,Fragment_y,SMIRKS,Products,ProductList
1058,CC(C)NCC(O)c1ccc(O)c(N)c1,CC(C)NC[C@@H](O)c1ccc(O)c(O)c1,[CH3][CH]([CH3])[NH][CH2][CH]([OH])[c]1[cH][cH...,[H][C@@]([OH])([CH2][NH][CH]([CH3])[CH3])[c]1[...,[#6:9](-[#6:6](-[#6:10](-[H])(-[H])-[H])(-[#7:...,((<rdkit.Chem.rdchem.Mol object at 0x7f249cdbc...,[CC(C)NC[C@@H](O)c1ccc(O)c(O)c1]
37,CC(C)NCC(O)c1ccc(O)c(O)c1,CC(C)NC[C@H](O)c1ccc(O)c(O)c1,[CH3][CH]([CH3])[NH][CH2][CH]([OH])[c]1[cH][cH...,[H][C@]([OH])([CH2][NH][CH]([CH3])[CH3])[c]1[c...,[#6:9](-[#6:6](-[#6:10](-[H])(-[H])-[H])(-[#7:...,((<rdkit.Chem.rdchem.Mol object at 0x7f249d8b6...,[CC(C)NC[C@H](O)c1ccc(O)c(O)c1]
1670,CC(C)(C)NCC(O)c1ccc(O)c(NC(N)=O)c1,CC(C)(C)NCC(O)c1ccc(O)c(O)c1,[CH3][c]1[cH][cH][c]([OH])[c]([NH][C]([NH2])=[...,[CH3][c]1[cH][cH][c]([OH])[c]([OH])[cH]1,[#6:9](-[#6:3]1:[#6:8](:[#6:6](:[#6:7](-[#8:5]...,((<rdkit.Chem.rdchem.Mol object at 0x7f249c6ae...,[CC(C)(C)NCC(O)c1ccc(O)c(O)c1]


In [17]:
# QC: count the pairs whose product list does not contain molecule y
# (0 in the recorded run).
df_beta2_products.apply(
    lambda row: row['Stripped_y'] not in row['ProductList'],
    axis=1,
).sum()

0

In [18]:
# QC: count the pairs for which the reaction gives more than one distinct
# product (0 in the recorded run).
df_beta2_products['ProductList'].map(len).gt(1).sum()

0