In [4]:
import glob
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [5]:
path_to_library = '../FragmentLibrary'

# every entry directly below the library root is treated as one subpocket folder
folders = glob.glob(path_to_library + '/*')

# the subpocket name is taken to be the last two characters of each path
subpockets = []
for folder in folders:
    subpockets.append(folder[-2:])
subpockets

['FP', 'B2', 'AP', 'SE', 'B1', 'GA']

In [6]:
# read data
#
# Builds `data`, one row per fragment found in <folder>/<subpocket>.sdf, with
# the columns listed in `columns` below. Duplicate SMILES are kept on purpose:
# later cells compare the total count against the number of unique SMILES.

columns = ['fragment', 'smiles', 'subpocket', 'group', 'family', 'kinase']

# The dummy-atom query and its hydrogen replacement do not depend on the
# fragment, so they are built once instead of once per molecule.
du = Chem.MolFromSmiles('*')
h = Chem.MolFromSmiles('[H]', sanitize=False)

# one DataFrame per subpocket, concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop)
frames = []

# iterate over subpockets
for i, folder in enumerate(folders):

    subpocket = subpockets[i]

    file = folder+'/'+subpocket+'.sdf'
    suppl = Chem.SDMolSupplier(file, removeHs=False)

    fragments = []
    smiles = []
    groups = []
    families = []
    kinases = []
    n_skipped = 0

    for frag in suppl:

        # SDMolSupplier yields None for records it cannot parse; passing None
        # on to ReplaceSubstructs would raise, so skip and report them instead
        if frag is None:
            n_skipped += 1
            continue

        # delete dummy atoms: swap each '*' for a hydrogen, then strip hydrogens
        fixed_frag = AllChem.ReplaceSubstructs(frag, du, h, replaceAll=True)[0]
        fixed_frag = Chem.RemoveHs(fixed_frag)

        s = Chem.MolToSmiles(fixed_frag)
        smiles.append(s)
        # store a molecule rebuilt from the SMILES rather than fixed_frag itself
        fragments.append(Chem.MolFromSmiles(s))

        groups.append(frag.GetProp('group'))
        families.append(frag.GetProp('family'))
        kinases.append(frag.GetProp('kinase'))

    if n_skipped:
        print('skipped', n_skipped, 'unreadable records in', file)

    df = pd.DataFrame()
    df['fragment'] = fragments
    df['smiles'] = smiles
    df['subpocket'] = subpocket
    df['group'] = groups
    df['family'] = families
    df['kinase'] = kinases

    frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame that still
# has the expected columns when no subpocket folder was found
if frames:
    data = pd.concat(frames, ignore_index=True)
else:
    data = pd.DataFrame(columns=columns)
data.shape

(7475, 6)

In [7]:
# one filtered view of `data` per subpocket, unpacked in the same order as the labels
df_AP, df_FP, df_SE, df_GA, df_B1, df_B2 = (
    data[data['subpocket'] == label]
    for label in ('AP', 'FP', 'SE', 'GA', 'B1', 'B2')
)

In [8]:
# shape of the AP table after keeping only the first row for each SMILES string
df_AP.drop_duplicates(subset=['smiles']).shape

(1234, 6)

In [9]:
# number of distinct SMILES strings among the AP fragments
df_AP['smiles'].nunique()

1234

In [10]:
# kinase groups in order of first appearance; taken from the AP fragments only
groups = df_AP['group'].unique()
groups

array(['Other', 'TK', 'TKL', 'AGC', 'CMGC', 'CAMK', 'CK1', 'STE',
       'Atypical'], dtype=object)

Calculate number of fragments and unique fragments per kinase group per subpocket.

In [11]:
# nested counts keyed as {subpocket: {group: count}}
n_frags = {}
n_unique_frags = {}

for subpocket in subpockets:

    in_pocket = data[data['subpocket'] == subpocket]

    # slice the subpocket rows once per kinase group, then derive both counts
    by_group = {
        group: in_pocket[in_pocket['group'] == group]
        for group in groups
    }

    # all fragments (rows) vs. distinct SMILES strings
    n_frags[subpocket] = {
        group: len(rows) for group, rows in by_group.items()
    }
    n_unique_frags[subpocket] = {
        group: rows['smiles'].nunique() for group, rows in by_group.items()
    }


In [12]:
# tabulate the nested counts: outer keys become columns, inner keys become rows
n_frags = pd.DataFrame(n_frags)
# add a "Total" row of column sums first, then a "Total" column of row sums,
# so the bottom-right cell ends up holding the grand total
n_frags.loc["Total"] = n_frags.sum(axis="index")
n_frags["Total"] = n_frags.sum(axis="columns")

In [13]:
# tabulate the nested unique counts: outer keys become columns, inner keys become rows
n_unique_frags = pd.DataFrame(n_unique_frags)
# add a "Total" row of column sums first, then a "Total" column of row sums
n_unique_frags.loc["Total"] = n_unique_frags.sum(axis="index")
n_unique_frags["Total"] = n_unique_frags.sum(axis="columns")

In [14]:
# put both tables side by side under the top-level column labels 'All' / 'Unique'
df_n_frags = pd.concat(
    {'All': n_frags, 'Unique': n_unique_frags},
    axis=1,
)
# move the top column level ('All' / 'Unique') into the row index for display
df_n_frags.stack(level=0)

Unnamed: 0,Unnamed: 1,AP,B1,B2,FP,GA,SE,Total
AGC,All,146,0,63,121,36,16,382
AGC,Unique,82,0,38,88,18,12,238
Atypical,All,136,0,1,61,113,39,350
Atypical,Unique,97,0,1,44,67,30,239
CAMK,All,359,2,15,209,158,148,891
CAMK,Unique,269,2,14,129,83,98,595
CK1,All,48,0,0,23,32,19,122
CK1,Unique,39,0,0,18,11,13,81
CMGC,All,776,12,89,505,308,444,2134
CMGC,Unique,413,8,40,280,153,189,1083


In [15]:
# kinase groups in order of first appearance among the AP fragments
groups = df_AP['group'].unique()

In [16]:
# unique fragments per group, summed over subpockets: a SMILES string that
# occurs in several subpockets is counted once for each of them
for group in groups:

    in_group = data[data['group'] == group]

    total_unique = sum(
        in_group[in_group['subpocket'] == subpocket]['smiles'].nunique()
        for subpocket in subpockets
    )

    print(group, total_unique)

Other 398
TK 1034
TKL 263
AGC 238
CMGC 1083
CAMK 595
CK1 81
STE 219
Atypical 239
