In [13]:
import glob
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole # Enables RDKit IPython integration
from rdkit.Chem import PandasTools
import seaborn as sns

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [14]:
# root directory of the fragment library
path_to_library = '/home/paula/Masterarbeit/FragmentLibrary/'

# subpocket names; the folder of each subpocket is the library root plus its name
subpockets = ['AP', 'FP', 'SE', 'GA', 'B1', 'B2']
folders = [f'{path_to_library}{name}' for name in subpockets]

In [16]:
# read data: collect the fragments of all subpockets into one DataFrame with
# the columns smiles, fragment (RDKit molecule), subpocket, group and kinase

# dummy-atom query and hydrogen replacement do not change between fragments,
# so they are built once instead of once per fragment
du = Chem.MolFromSmiles('*')
h = Chem.MolFromSmiles('[H]', sanitize=False)

# one record per fragment; the DataFrame is built once at the end instead of
# growing it with DataFrame.append (removed in pandas 2.0, quadratic in a loop)
records = []
n_unreadable = 0

# iterate over subpockets (folders and subpockets are parallel lists)
for subpocket, folder in zip(subpockets, folders):

    file = folder+'/'+subpocket+'.sdf'
    suppl = Chem.SDMolSupplier(file, removeHs=False)

    for frag in suppl:
        # SDMolSupplier yields None for entries it cannot parse; skip them
        # instead of crashing in ReplaceSubstructs/GetProp
        if frag is None:
            n_unreadable += 1
            continue

        # remove dummy atoms from fragments: replace them by hydrogens,
        # then strip the hydrogens again
        fixed_frag = AllChem.ReplaceSubstructs(frag, du, h, replaceAll=True)[0]
        fixed_frag = Chem.RemoveHs(fixed_frag)

        records.append({
            # smiles for comparing fragments
            'smiles': Chem.MolToSmiles(fixed_frag),
            'fragment': fixed_frag,
            'subpocket': subpocket,
            # kinase group and kinase are read from the SDF properties
            'group': frag.GetProp('group'),
            'kinase': frag.GetProp('kinase'),
        })

# only report skipped entries when there are any, so a clean run prints nothing extra
if n_unreadable:
    print('Skipped unreadable SDF entries:', n_unreadable)

# explicit column list keeps the column order (and the columns themselves,
# even if no fragment was read)
data = pd.DataFrame(records, columns=['smiles', 'fragment', 'subpocket', 'group', 'kinase'])
data.shape

(7201, 5)

In [17]:
# distinct kinase groups that occur in the fragment data
groups = data['group'].unique()
groups

array(['Other', 'TK', 'TKL', 'AGC', 'CMGC', 'CAMK', 'CK1', 'STE'],
      dtype=object)

For each fragment (grouped by subpocket and SMILES), count the number of kinases and kinase groups it is bound to.

In [93]:
# A fragment is identified by its subpocket and its SMILES. Grouping by both
# at once replaces the per-subpocket loop: summing value_counts() results with
# `+=` aligns on the False/True labels and turns a total into NaN as soon as
# one subpocket lacks one of the two labels.
by_fragment = data[data['subpocket'].isin(subpockets)].groupby(['subpocket', 'smiles'])

# count number of fragments per smiles (# occurrence of each fragment)
n_frags = by_fragment['fragment'].count()
# True for fragments that occur several times, False for those occurring once
is_multi = n_frags > 1

# number of distinct kinases and kinase groups per fragment, only for the
# fragments occurring multiple times (only the two needed columns are counted)
promisc = by_fragment[['kinase', 'group']].nunique()[is_multi]

# reindex so that both labels are always present, with 0 for a missing one
labels = [False, True]
# count number of fragments that occur only once and that occur several times
n_multi_frags = is_multi.value_counts().reindex(labels, fill_value=0)
# count number of fragments with multiple kinases assigned
n_multi_kin = (promisc['kinase'] > 1).value_counts().reindex(labels, fill_value=0)
# count number of fragments with multiple kinase groups assigned
n_multi_groups = (promisc['group'] > 1).value_counts().reindex(labels, fill_value=0)

print('Fragments with multiple occurrences:')
print(n_multi_frags)
print('Fragments with multiple kinases assigned:')
print(n_multi_kin)
print('Fragments with multiple groups assigned:')
print(n_multi_groups)

Fragments with multiple occurrences:
False    2003
True      974
Name: fragment, dtype: int64
Fragments with multiple kinases assigned:
True     605
False    369
Name: kinase, dtype: int64
Fragments with multiple groups assigned:
False    519
True     455
Name: group, dtype: int64


In [95]:
# share of the fragments occurring multiple times that are bound to more than
# one kinase; computed from n_multi_kin instead of numbers copied from printed
# output, so it stays correct when the data changes
print('Fraction of multiple occurring fragments that bind to multiple kinases:')
n_multi_kin.get(True, 0) / n_multi_kin.sum()

Number of multiple occurring fragments that bind to multiple kinases:


0.6211498973305954

In [96]:
# share of the fragments occurring multiple times that are bound to more than
# one kinase group; computed from n_multi_groups instead of numbers copied from
# printed output, so it stays correct when the data changes
print('Fraction of multiple occurring fragments that bind to multiple groups:')
n_multi_groups.get(True, 0) / n_multi_groups.sum()

Number of multiple occurring fragments that bind to multiple groups:


0.4671457905544148