In [1]:
import glob
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit import DataStructs
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem.Draw import IPythonConsole # Enables RDKit IPython integration
from rdkit.Chem import PandasTools

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
path_to_library = '../FragmentLibrary'

# list of folders for each subpocket
folders = glob.glob(path_to_library+'/*')
subpockets = [folder[-2:] for folder in folders]
subpockets

['FP', 'B2', 'AP', 'SE', 'B1', 'GA']

In [84]:
# read data

data = pd.DataFrame()
#data = [] # [[(frag, fp, kinase), (frag, fp, kinase), ...],[...]]
# number of fragments per kinase

rdkit_gen = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=5)

# iterate over subpockets
for i, folder in enumerate(folders):
    
    subpocket = subpockets[i]
    #frag_smiles = []
    #data_sb = []
    #kinases = []
    
    file = folder+'/'+subpocket+'.sdf' 
    df = PandasTools.LoadSDF(file, includeFingerprints=True, embedProps=True)
    df['NumHeavyAtoms'] = df.apply(lambda x: x['ROMol'].GetNumHeavyAtoms(), axis=1)
        
    fragments = []
    fingerprints = []

    for frag in df.ROMol:
        # remove dummy atoms from fragments and create smiles
        du = Chem.MolFromSmiles('*')
        h = Chem.MolFromSmiles('[H]', sanitize=False)
        fixed_frag = AllChem.ReplaceSubstructs(frag, du, h, replaceAll=True)[0]
        fixed_frag = Chem.RemoveHs(fixed_frag)
        fragments.append(fixed_frag)
        # smiles for drawing fragment
        #frag_smiles.append(Chem.MolToSmiles(fixed_frag))
        # fingerprint of fragment
        fingerprints.append(rdkit_gen.GetFingerprint(fixed_frag))

        #data_sb.append((fixed_frag, fp, kinase))
        
    df['fragment'] = fragments
    df['fingerprint'] = fingerprints
    df['subpocket'] = subpocket
    #data.append(data_sb)
    
    data = data.append(df)
    
data = data.reset_index(drop=True)
data.shape

(7529, 11)

In [65]:
# number of fragments per subpocket
n_frags_sp = data.subpocket.value_counts()

In [62]:
# mean similarity per subpocket
cutoff = 0.85
means = pd.Series(index=n_frags_sp.index)
dups = pd.Series(index=n_frags_sp.index)
for sp, subpocket in enumerate(subpockets):

    data_sp = data[data.subpocket == subpocket]
    
    mean = 0
    count = 0
    dup = 0
    
    for i, frag1 in enumerate(data_sp.fingerprint):
        for j, frag2 in enumerate(data_sp.fingerprint):
            # do not compare the very same fragment
            if i < j:
                sim = DataStructs.FingerprintSimilarity(frag1,frag2) # tanimoto similarity
                mean += sim
                count += 1
                if sim >= cutoff:
                    dup += 1
                    
    means[subpocket] = mean/count
    dups[subpocket] = dup/count
    
# convert to pandas df for displaying
df = pd.DataFrame([n_frags_sp, means, dups]).T
df.columns = ['number of fragments', 'mean similarity', 'similar fragment pairs']
print(np.mean(df['mean similarity']))

0.14134414130200068


In [63]:
df     

Unnamed: 0,number of fragments,mean similarity,similar fragment pairs
AP,2671.0,0.175534,0.003988
FP,1811.0,0.099695,0.00517
SE,1488.0,0.129028,0.008624
GA,1202.0,0.142869,0.009124
B2,245.0,0.116487,0.018501
B1,112.0,0.184452,0.074485


In [91]:
# number of fragments per kinase
n_frags = data.kinase.value_counts()
n_frags.head()

CDK2    820
EGFR    322
CHK1    316
p38a    290
PIM1    268
Name: kinase, dtype: int64

In [85]:
# mean similarity per kinase (per subpocket)
data.index

RangeIndex(start=0, stop=7529, step=1)

In [92]:
# mean similarity per kinase (per subpocket)
cutoff = 0.85
means = {}
counts = {}
dups = {}
for sp, subpocket in enumerate(subpockets):
    
    data_sp = data[data.subpocket == subpocket].reset_index()
    
    kinases = data_sp.kinase
    
    for i, frag1 in enumerate(data_sp.fingerprint):
        for j, frag2 in enumerate(data_sp.fingerprint):
            # if fragments belong to the same kinase
            if i < j and kinases[i] == kinases[j]:
                kinase = kinases[i]
                sim = DataStructs.FingerprintSimilarity(frag1, frag2) # tanimoto similarity
                # if kinase is already in dict, update
                if kinase in means:
                    means[kinase] += sim
                    counts[kinase] += 1
                    if sim >= cutoff:
                        dups[kinase] += 1
                # else make new entry
                else:
                    means[kinase] = sim
                    counts[kinase] = 1
                    if sim >= cutoff:
                        dups[kinase] = 1
                    else:
                        dups[kinase] = 0
            

# calculate means
for kinase in sorted(means.keys()):
    means[kinase] = means[kinase]/counts[kinase] 
    dups[kinase] = dups[kinase]/counts[kinase]

# convert to pandas df for displaying
df = pd.DataFrame([dict(n_frags), means, dups]).T
df.columns = ['number of fragments', 'mean similarity', 'similar fragment pairs']
print(np.mean(df['mean similarity']))
#df      

FP
B2
AP
SE
B1
GA
0.280584631844504


In [98]:
df.sort_values(by='number of fragments', ascending=False).head()

Unnamed: 0,number of fragments,mean similarity,similar fragment pairs
CDK2,820.0,0.166337,0.023304
EGFR,322.0,0.228091,0.041522
CHK1,316.0,0.175206,0.027601
p38a,290.0,0.230423,0.038579
PIM1,268.0,0.158193,0.008666


In [110]:
# promiscuity assessment
# How similar are fragments across kinases?
