In [38]:
import pandas as pd
from rdkit import (Chem, DataStructs, Geometry)
from rdkit.Chem import (Descriptors,
    Lipinski, QED, Draw, AllChem, PandasTools, rdFingerprintGenerator)
import json
from kinfraglib import utils
import matplotlib.pyplot as plt
import seaborn as sns
import utils_eval
import math
from copy import deepcopy
from rdkit.ML.Cluster import Butina
import numpy as np
from chembl_webresource_client.new_client import new_client


In [39]:
# Load the crystal-screen compound table (semicolon-separated export)
crystal_compounds = pd.read_csv('crystal_struct_first_compounds.csv', sep=';')

In [40]:
# Preview the loaded table; the rendered output is truncated to the first and last rows
crystal_compounds

Unnamed: 0,Compound,Cluster-ID,SMILES,Final DMSO-stock concentration in mM,KI in uM,Concentration in setup 1 TSA in mM,shift in setup 1 TSA in C,Concentration in setup 2 TSA in mM,shift in setup 2 TSA in C,Cocrystallization Attempt,Crystal Structure,Hit
0,EN001,c1,CC1=NC=2C(=CC=NC2N1)C(=O)OCC3(CC=4C=CC=CC4)CC3,1000,not active,1.25,-1.92,0.07,0.17,No,-,-
1,EN002,c3,NC(=O)C=1C=CC(=CC1)C(=O)N2CC3CCC2C(O)C3,1000,not active,2.5,-0.04,not measured,not measured,No,-,-
2,EN003,c3,NC(=O)C=1C=CC(SC2CCN(C3CCOCC3)C2=O)=CC1,91,not measured,not measured,not measured,not measured,not measured,No,-,-
3,EN004,c1,CC1=NC=2C(=CC=NC2N1)C(=O)OCC3=CN=C(S3)C=4C=CC=CC4,125,not measured,not measured,not measured,not measured,not measured,No,-,-
4,EN005,c4,NC=1NN=C(CCCNC=2C=CC=3C(Cl)=CC=CC3N2)C1C#N,1000,~10000,1.25,2.00,0.07,0.40,No,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...
101,EN102*,c1,Cl.CC(C)(C)C=1C=CC(=CC1)C2(CCCC2)NCC=3C=CC(=CC...,not measured,not measured,not measured,not measured,not measured,not measured,No,-,-
102,EN103*,c1,CNC(=O)CC1=NC(CS(=O)CC=2C=CC(=CC2)C(=O)N)=CS1,not measured,not measured,not measured,not measured,not measured,not measured,No,-,-
103,EN104*,c1,Cl.COC(=O)C=CC1=CC=C(CNCC=2C=CC(=CC2)C(=O)N)O1,not measured,not measured,not measured,not measured,not measured,not measured,No,-,-
104,EN105*,c4,CC(=O)C=1C=CC=C(NCCCC2=NNC(N)=C2C#N)N1,not measured,not measured,not measured,not measured,not measured,not measured,No,-,-


In [41]:
# Parse every SMILES string into an RDKit molecule
crystal_compounds['ROMol'] = [
    AllChem.MolFromSmiles(smiles) for smiles in crystal_compounds['SMILES']
]

In [42]:
# Standardize each molecule and store its InChI string
crystal_compounds['inchi'] = [
    Chem.MolToInchi(utils.standardize_mol(mol)) for mol in crystal_compounds['ROMol']
]

In [43]:
def most_similar_crystal_ligand(ligand_inchi, chembl, use_morgan):
    """
    Get the reference compound (identifier and Tanimoto similarity) most similar to the query ligand.

    Parameters
    ----------
    ligand_inchi : str
        Query ligand (InChI).
    chembl : pandas.DataFrame
        Reference compounds; the columns `Compound` (identifier) and `fingerprint` are required.
        The DataFrame is only read, it is never modified by this function.
    use_morgan : bool
        If True the query is encoded with a Morgan fingerprint (radius 3), otherwise with an
        RDKit fingerprint (maxPath 5). This should be the same fingerprint type that was used
        to fill the `fingerprint` column of the reference compounds.

    Returns
    -------
    list of [str, float]
        Identifier of the most similar reference compound and the Tanimoto similarity
        (rounded to two decimals). `[None, None]` is returned, and a message is printed,
        if the search fails for any reason.
    """
    try:

        # get ROMol from query ligand InChI
        ligand = Chem.MolFromInchi(ligand_inchi)
        if ligand is None:
            raise ValueError('InChI could not be parsed')

        # generate query ligand fingerprint
        if use_morgan:
            f_gen = rdFingerprintGenerator.GetMorganGenerator(radius=3)
        else:
            f_gen = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=5)
        query_fingerprint = f_gen.GetFingerprint(ligand)

        # get reference fingerprints as list
        reference_fingerprints = chembl['fingerprint'].to_list()

        # get pairwise similarities; kept in a local list so that the caller's DataFrame
        # (which may already hold its own `similarity` column) is not overwritten
        similarities = DataStructs.BulkTanimotoSimilarity(query_fingerprint, reference_fingerprints)
        if not similarities:
            raise ValueError('no reference compounds to compare against')

        # position of the maximal similarity (first one wins on ties)
        best_position = max(range(len(similarities)), key=similarities.__getitem__)

        return [
            chembl['Compound'].iloc[best_position],
            round(similarities[best_position], 2)
        ]

    except Exception as e:
        # deliberate best-effort: report the problem and let the caller continue with the next ligand
        print(f'Most similar ChEMBL ligand search problem for {ligand_inchi}: {e}')
        return [None, None]


In [44]:

# Analysis settings: fingerprint flavour and location of the docking results
USE_MORGAN = True
COMPOUNDS_PATH = '../final_results/3amb/results.sdf'

# Fingerprint every crystal-screen compound from its standardized InChI
if USE_MORGAN:
    morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=3)
    crystal_compounds['fingerprint'] = [
        morgan_gen.GetFingerprint(Chem.MolFromInchi(inchi))
        for inchi in crystal_compounds['inchi']
    ]
else:
    rdkit_gen = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=5)
    crystal_compounds['fingerprint'] = [
        rdkit_gen.GetFingerprint(Chem.MolFromInchi(inchi))
        for inchi in crystal_compounds['inchi']
    ]



In [45]:

print('calculated fingerprints')

# Load the result compounds and take the midpoint of the two HYDE affinity bounds
data = utils_eval.read_mols(COMPOUNDS_PATH)
hyde_upper = 'BIOSOLVEIT.HYDE_ESTIMATED_AFFINITY_UPPER_BOUNDARY [nM]'
hyde_lower = 'BIOSOLVEIT.HYDE_ESTIMATED_AFFINITY_LOWER_BOUNDARY [nM]'
data['binding_affinity'] = data.apply(
    lambda row: (row[hyde_upper] + row[hyde_lower]) / 2,
    axis=1,
)

# Post filtering: keep only compounds whose estimated affinity is below 1000
passes_affinity_filter = data['binding_affinity'] < 1000
data_post_filtered = data[passes_affinity_filter].copy().reset_index(drop=True)

# Standardized InChI of every retained compound
data_post_filtered['inchi'] = data_post_filtered.apply(
    lambda row: Chem.MolToInchi(utils.standardize_mol(row.ROMol)),
    axis=1,
)

# Closest crystal-screen ligand (identifier, similarity) for every retained compound
most_similar_crystal_ligands = [
    most_similar_crystal_ligand(query_inchi, crystal_compounds, USE_MORGAN)
    for query_inchi in data_post_filtered.inchi
]
data_post_filtered['most_similar_compound'] = [
    compound_id for compound_id, _ in most_similar_crystal_ligands
]
data_post_filtered['similarity'] = [
    score for _, score in most_similar_crystal_ligands
]

calculated fingerprints


In [46]:
# Summary statistics of the best similarity found per filtered compound
data_post_filtered.similarity.describe()

count    9194.000000
mean        0.158769
std         0.022484
min         0.100000
25%         0.140000
50%         0.160000
75%         0.170000
max         0.400000
Name: similarity, dtype: float64

In [47]:
# Fingerprint the filtered compounds with the generator type selected by USE_MORGAN
if USE_MORGAN:
    morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=3)
    data_post_filtered['fingerprint'] = [
        morgan_gen.GetFingerprint(Chem.MolFromInchi(inchi))
        for inchi in data_post_filtered['inchi']
    ]
else:
    rdkit_gen = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=5)
    data_post_filtered['fingerprint'] = [
        rdkit_gen.GetFingerprint(Chem.MolFromInchi(inchi))
        for inchi in data_post_filtered['inchi']
    ]

In [48]:
# Reverse direction: give every filtered compound an identifier ("C" + row index)
# so that it can be reported as the best match of a crystal-screen ligand
data_post_filtered['Compound'] = [f"C{idx}" for idx in data_post_filtered.index]

In [49]:
# Closest filtered compound (identifier, similarity) for every crystal-screen ligand
most_similar_sbd_ligands = [
    most_similar_crystal_ligand(crystal_inchi, data_post_filtered, USE_MORGAN)
    for crystal_inchi in crystal_compounds.inchi
]
crystal_compounds['most_similar_compound'] = [
    compound_id for compound_id, _ in most_similar_sbd_ligands
]
crystal_compounds['similarity'] = [
    score for _, score in most_similar_sbd_ligands
]

In [50]:
# Summary statistics of the best similarity found per crystal-screen ligand
crystal_compounds.similarity.describe()

count    106.000000
mean       0.192642
std        0.041342
min        0.120000
25%        0.160000
50%        0.190000
75%        0.210000
max        0.400000
Name: similarity, dtype: float64

In [51]:
# Crystal-screen ligands ordered from least to most similar best match
crystal_compounds.sort_values(by='similarity')

Unnamed: 0,Compound,Cluster-ID,SMILES,Final DMSO-stock concentration in mM,KI in uM,Concentration in setup 1 TSA in mM,shift in setup 1 TSA in C,Concentration in setup 2 TSA in mM,shift in setup 2 TSA in C,Cocrystallization Attempt,Crystal Structure,Hit,ROMol,inchi,fingerprint,similarity,most_similar_compound
56,EN057,c4,CC1(CS(=O)(=O)N2CCC=3C(C#N)=C(N)SC3C2)CC1(Cl)Cl,1000,not active,2.5,0.64,not measured,not measured,No,-,-,"<img data-content=""rdkit/molecule"" src=""data:i...","InChI=1S/C13H15Cl2N3O2S2/c1-12(6-13(12,14)15)7...","[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.12,C4229
74,EN075,c4,NC=1C=CC(C=C2CCS(=O)(=O)CC2)=C(Cl)C1C#N,333,not active,0.83,-0.72,not measured,not measured,No,-,-,"<img data-content=""rdkit/molecule"" src=""data:i...",InChI=1S/C13H13ClN2O2S/c14-13-10(1-2-12(16)11(...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.13,C2158
20,EN021,c4,CC1(CS(=O)(=O)N2CCC=3C(C#N)=C(N)SC3C2)CC1(F)F,1000,~5000,1.25,-0.32,not measured,not measured,No,-,-,"<img data-content=""rdkit/molecule"" src=""data:i...","InChI=1S/C13H15F2N3O2S2/c1-12(6-13(12,14)15)7-...","[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.13,C8884
87,EN088,c4,NC=1C=CC(C=C2CCCNC2)=C(Cl)C1C#N,500,85.8,2.5,6.07,0.07,2.01,Yes,7PIG,Yes,"<img data-content=""rdkit/molecule"" src=""data:i...",InChI=1S/C13H14ClN3/c14-13-10(3-4-12(16)11(13)...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.13,C564
41,EN042,c4,CC=1N=C2CCCCN2C(=O)C1CCSC=3N=C(N)C=C(N)C3C#N,334,not active,2.5,1.21,0.07,0.48,No,-,-,"<img data-content=""rdkit/molecule"" src=""data:i...",InChI=1S/C17H20N6OS/c1-10-11(17(24)23-6-3-2-4-...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.14,C706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,EN102*,c1,Cl.CC(C)(C)C=1C=CC(=CC1)C2(CCCC2)NCC=3C=CC(=CC...,not measured,not measured,not measured,not measured,not measured,not measured,No,-,-,"<img data-content=""rdkit/molecule"" src=""data:i...","InChI=1S/C23H30N2O.ClH/c1-22(2,3)19-10-12-20(1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.27,C557
55,EN056,c3,COC=1C=CC=2NCCC(NC(=O)C=3C=CC(=CC3)C(=O)N)C2C1,250,not active,0.83,0.07,not measured,not measured,No,-,-,"<img data-content=""rdkit/molecule"" src=""data:i...",InChI=1S/C18H19N3O3/c1-24-13-6-7-15-14(10-13)1...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.27,C561
34,EN035,c3,COC1CCN(CC1)S(=O)(=O)NCC=2C=CC(=CC2)C(=O)N,1000,not active,2.5,-0.32,not measured,not measured,No,-,-,"<img data-content=""rdkit/molecule"" src=""data:i...",InChI=1S/C14H21N3O4S/c1-21-13-6-8-17(9-7-13)22...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.28,C557
97,EN098*,c1,NC(=O)C=1C=CC(CNS(=O)(=O)N2CCCOCC2)=CC1,not measured,not measured,not measured,not measured,not measured,not measured,No,-,-,"<img data-content=""rdkit/molecule"" src=""data:i...",InChI=1S/C13H19N3O4S/c14-13(17)12-4-2-11(3-5-1...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.29,C557
