In [26]:
import glob
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit import DataStructs
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem.Draw import IPythonConsole # Enables RDKit IPython integration

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [12]:
# root directory of the fragment library; it is expected to hold one folder per subpocket
path_to_library = '/home/paula/Masterarbeit/KinaseFocusedFragmentLibrary/FragmentLibrary'

# every entry directly below the library root is treated as a subpocket folder
folders = glob.glob(f'{path_to_library}/*')
# the subpocket name is taken from the last two characters of each folder path
subpockets = [path[-2:] for path in folders]
subpockets

['FP', 'B2', 'AP', 'SE', 'B1', 'GA']

In [94]:
# Load the fragments of every subpocket, strip their dummy atoms and fingerprint them.
#
# Results of this cell:
#   data    -- one list per subpocket (same order as `folders` / `subpockets`), each
#              holding (fragment mol, fingerprint, kinase name) tuples
#   n_frags -- number of loaded fragments per kinase, summed over all subpockets

data = [] # [[(frag, fp, kinase), (frag, fp, kinase), ...],[...]]
n_frags = {}

rdkit_gen = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=5)

# The dummy-atom query and its hydrogen replacement are the same for every
# fragment, so they are built once instead of once per fragment.
du = Chem.MolFromSmiles('*')
h = Chem.MolFromSmiles('[H]', sanitize=False)

# iterate over subpockets
for i, folder in enumerate(folders):

    subpocket = subpockets[i]
    frag_smiles = []  # SMILES of the fragments of the current subpocket (for drawing)
    data_sb = []      # (frag, fp, kinase) tuples of the current subpocket

    files = glob.glob(folder+'/*.sdf')
    kinases = []
    # iterate over kinases (one SDF file per kinase)
    for file in files:

        # SDMolSupplier yields None for records it cannot parse; such entries
        # would make ReplaceSubstructs raise, so they are skipped (and not counted).
        fragments = [f for f in Chem.SDMolSupplier(file) if f is not None]
        # kinase that the fragments correspond to: file name up to the first dot
        kinase = file.split('/')[-1].split('.')[0]

        # count fragments per kinase (accumulated across subpockets)
        n_frags[kinase] = n_frags.get(kinase, 0) + len(fragments)

        # all fragments for this kinase
        for f, frag in enumerate(fragments):
            # replace dummy atoms by hydrogens, then drop the hydrogens again
            fixed_frag = AllChem.ReplaceSubstructs(frag, du, h, replaceAll=True)[0]
            fixed_frag = Chem.RemoveHs(fixed_frag)
            fragments[f] = fixed_frag
            # smiles for drawing fragment
            frag_smiles.append(Chem.MolToSmiles(fixed_frag))
            # fingerprint of fragment
            fp = rdkit_gen.GetFingerprint(fixed_frag)

            data_sb.append((fixed_frag, fp, kinase))

    data.append(data_sb)

In [98]:
# Mean pairwise Tanimoto similarity per subpocket.
#
# For every subpocket all unordered fragment pairs (i < j) are compared, which
# both excludes self-comparisons and avoids counting a pair twice.
#   means      -- mean pairwise similarity
#   dups       -- fraction of pairs with similarity >= cutoff
#   n_frags_sp -- number of fragments in the subpocket
cutoff = 0.85
means = {}
n_frags_sp = {}
dups = {}
for sp, subpocket in enumerate(subpockets):

    # fingerprints are the second element of each (frag, fp, kinase) tuple
    fps = [entry[1] for entry in data[sp]]
    n_frags_sp[subpocket] = len(fps)

    sim_sum = 0    # sum of all pairwise similarities
    n_pairs = 0    # number of compared pairs
    n_similar = 0  # number of pairs at or above the cutoff

    for i in range(len(fps) - 1):
        # compare fragment i with all later fragments in one bulk call
        # instead of one Python-level call per pair
        sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[i+1:])
        sim_sum += sum(sims)
        n_pairs += len(sims)
        n_similar += sum(sim >= cutoff for sim in sims)

    if n_pairs:
        means[subpocket] = sim_sum/n_pairs
        dups[subpocket] = n_similar/n_pairs
    else:
        # fewer than two fragments: no pair exists, so the statistics are undefined
        means[subpocket] = float('nan')
        dups[subpocket] = float('nan')

# convert to pandas df for displaying
df = pd.DataFrame([n_frags_sp, means, dups]).T
df.columns = ['number of fragments', 'mean similarity', 'similar fragment pairs']
print(np.mean(df['mean similarity']))
df

0.1413586026997841


Unnamed: 0,number of fragments,mean similarity,similar fragment pairs
AP,2670.0,0.175561,0.003991
B1,112.0,0.184452,0.074485
B2,245.0,0.116487,0.018501
FP,1811.0,0.099723,0.00517
GA,1202.0,0.142869,0.009124
SE,1487.0,0.12906,0.008635


In [108]:
# Mean pairwise Tanimoto similarity per kinase.
#
# Only fragments that share both subpocket and kinase are compared; the
# per-subpocket results of a kinase are pooled.
#   means  -- mean pairwise similarity per kinase
#   counts -- number of compared pairs per kinase
#   dups   -- fraction of pairs with similarity >= cutoff per kinase
# Kinases without any comparable pair get no entry in these dicts.
cutoff = 0.85
means = {}
counts = {}
dups = {}
for sp, subpocket in enumerate(subpockets):

    # Group the fingerprints of this subpocket by kinase first, so that only
    # same-kinase pairs are ever enumerated.
    fps_by_kinase = {}
    for entry in data[sp]:  # entry = (frag, fp, kinase)
        fps_by_kinase.setdefault(entry[2], []).append(entry[1])

    for kinase, fps in fps_by_kinase.items():
        for i in range(len(fps) - 1):
            # fragment i against all later fragments of the same kinase
            sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[i+1:])
            means[kinase] = means.get(kinase, 0) + sum(sims)
            counts[kinase] = counts.get(kinase, 0) + len(sims)
            dups[kinase] = dups.get(kinase, 0) + sum(sim >= cutoff for sim in sims)

# turn the accumulated sums into means / fractions
# (every kinase in `means` has at least one counted pair, so no division by zero)
for kinase in means:
    means[kinase] = means[kinase]/counts[kinase]
    dups[kinase] = dups[kinase]/counts[kinase]

# convert to pandas df for displaying
df = pd.DataFrame([n_frags, means, dups]).T
df.columns = ['number of fragments', 'mean similarity', 'similar fragment pairs']
print(np.mean(df['mean similarity']))

0.2806726893700995


In [109]:
# show the per-kinase statistics for a handful of selected kinases
df.loc[['BRAF', 'CDK2', 'EGFR', 'p38a'], :]

Unnamed: 0,number of fragments,mean similarity,similar fragment pairs
BRAF,126.0,0.306001,0.0912
CDK2,820.0,0.166435,0.023304
EGFR,322.0,0.228091,0.041522
p38a,290.0,0.230423,0.038579


In [110]:
# promiscuity assessment
# How similar are fragments across kinases?
