In [1]:
# Display the installed RDKit version so the environment this notebook ran in is recorded.
from rdkit import rdBase
rdBase.rdkitVersion

'2018.09.3'

In [2]:
import glob
import os

import numpy as np
import matplotlib.pyplot as plt

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem.Draw import IPythonConsole
from rdkit.ML.Cluster import Butina

In [3]:
# Root directory of the fragment library; it is expected to hold one folder per subpocket.
# NOTE: absolute, machine-specific path - adjust when running on another machine.
path_to_library = '/home/paula/Masterarbeit/KinaseFocusedFragmentLibrary/FragmentLibrary'

# list of folders for each subpocket (plain files lying next to the folders are ignored)
folders = [entry for entry in glob.glob(path_to_library+'/*') if os.path.isdir(entry)]
# Use the folder name itself as the subpocket label instead of the last two
# characters of the path, so folder names of any length are handled correctly.
subpockets = [os.path.basename(folder) for folder in folders]
subpockets

['FP', 'B2', 'AP', 'SE', 'B1', 'GA']

In [13]:
# Compute RDKit fingerprints (maxPath=5) for the fragments in every SDF file of
# every subpocket folder.

# The generator does not depend on the file being read, so it is built only once.
rdkit_gen = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=5)

# subpocket name -> fingerprints of all fragments read from that subpocket's folder
fingerprints_by_subpocket = {}

# iterate over subpockets
for folder, subpocket in zip(folders, subpockets):

    subpocket_fingerprints = fingerprints_by_subpocket.setdefault(subpocket, [])

    files = glob.glob(folder+'/*.sdf')
    # iterate over kinases
    for file in files:
        fragments = Chem.SDMolSupplier(file)
        # SDMolSupplier yields None for records it cannot parse; skip those
        # instead of letting GetFingerprint fail on them.
        # `fingerprints` keeps its previous meaning: the fingerprints of the
        # file processed most recently.
        fingerprints = [rdkit_gen.GetFingerprint(m) for m in fragments if m is not None]
        subpocket_fingerprints.extend(fingerprints)

In [10]:
# Calculate distance matrix for fingerprint list
def Tanimoto_distance_matrix(fp_list):
    """Return the pairwise Tanimoto distances of ``fp_list`` as a flat list.

    Only the lower triangle (without the diagonal) is produced, row by row:
    the entries for fingerprint ``i`` are its distances to fingerprints
    ``0 .. i-1``, in that order. This is the flat layout that is handed to
    ``Butina.ClusterData(..., isDistData=True)`` elsewhere in this notebook.

    Parameters
    ----------
    fp_list : sequence of fingerprints
        Must support ``len()`` and slicing; the elements are passed to
        ``DataStructs.BulkTanimotoSimilarity``.

    Returns
    -------
    list of float
        Distances computed as ``1 - similarity``. The list is empty when
        ``fp_list`` holds fewer than two fingerprints.
    """
    dissimilarity_matrix = []
    for i in range(1, len(fp_list)):
        # Similarities of fingerprint i to all fingerprints that precede it
        similarities = DataStructs.BulkTanimotoSimilarity(fp_list[i], fp_list[:i])
        # Since we need a distance matrix, calculate 1-x for every similarity
        dissimilarity_matrix.extend(1 - x for x in similarities)
    return dissimilarity_matrix

In [11]:
# Input: Fingerprints and a threshold for the clustering
def ClusterFps(fps,cutoff=0.2):
    """Cluster fingerprints with RDKit's Butina implementation.

    Parameters
    ----------
    fps : sequence of fingerprints
        Passed to ``Tanimoto_distance_matrix`` to obtain the flat distance list.
    cutoff : float, optional
        Threshold forwarded unchanged to ``Butina.ClusterData`` (default 0.2).

    Returns
    -------
    The value returned by ``Butina.ClusterData``; per the RDKit documentation
    this is a tuple of clusters, each a tuple of indices into ``fps``.
    """
    n_points = len(fps)
    # Flat Tanimoto distance list over all fingerprint pairs
    distances = Tanimoto_distance_matrix(fps)
    # isDistData=True tells ClusterData that the input already is a distance matrix
    return Butina.ClusterData(distances, n_points, cutoff, isDistData=True)