In [15]:
import pandas as pd
from rdkit import (Chem, DataStructs, Geometry)
from rdkit.Chem import (Descriptors,
    Lipinski, QED, Draw, AllChem, PandasTools, rdFingerprintGenerator)
import json
from kinfraglib import utils
import matplotlib.pyplot as plt
import seaborn as sns
import utils_eval
import math
from copy import deepcopy
from rdkit.ML.Cluster import Butina
import numpy as np
from chembl_webresource_client.new_client import new_client


In [16]:
# Load the semicolon-separated ChEMBL export and keep only the rows whose
# activity value is reported in nanomolar ('nM').
chembl_pka_compounds = (
    pd.read_csv('pka_chembl.csv', sep=';')
    .loc[lambda frame: frame['Standard Units'] == 'nM']
)

In [17]:
# Keep only measurements with a standard value below 1000 (the units were restricted to nM
# beforehand, i.e. sub-micromolar records).
# .copy() detaches the result from the frame it was filtered from, so the columns added in
# later cells (ROMol, inchi, fingerprint) are written to an independent DataFrame rather than
# to a slice, which would trigger pandas' SettingWithCopyWarning.
chembl_pka_compounds = chembl_pka_compounds[chembl_pka_compounds['Standard Value'] < 1000].copy()

In [18]:
# Show the filtered ChEMBL activity table as this cell's output.
chembl_pka_compounds

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Target Organism,Target Type,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type
2,CHEMBL49354,,,550.52,2,2.15,(+/-)-syn-1,O=C(NC1CNCCCC1OC(=O)c1cc(O)c(C(=O)c2c(O)cccc2C...,IC50,'=',...,Homo sapiens,PROTEIN FAMILY,CHEMBL1129020,1,Scientific Literature,Bioorg Med Chem Lett,1996,,,
4,CHEMBL226850,,,413.40,0,3.99,37a,N[C@H](COc1cncc(-c2cc3cn[nH]c3cn2)c1)Cc1cccc(C...,IC50,'=',...,Homo sapiens,PROTEIN FAMILY,CHEMBL1145373,1,Scientific Literature,J Med Chem,2007,,,
9,CHEMBL130049,,,524.58,1,4.83,12,COC(=O)N(C)[C@H]1C[C@@H]2O[C@](C)([C@H]1OC)n1c...,IC50,'=',...,Homo sapiens,PROTEIN FAMILY,CHEMBL1127758,1,Scientific Literature,Bioorg Med Chem Lett,1994,,,
11,CHEMBL311399,,,550.52,2,2.15,(-)-1 (Balanol),O=C(N[C@@H]1CNCCCC1OC(=O)c1cc(O)c(C(=O)c2c(O)c...,IC50,'=',...,Homo sapiens,PROTEIN FAMILY,CHEMBL1129020,1,Scientific Literature,Bioorg Med Chem Lett,1996,,,
16,CHEMBL338935,,,523.59,1,3.55,17,CO[C@H]1[C@@H](N(C)C(=O)CN)C[C@@H]2O[C@@]1(C)n...,IC50,'=',...,Homo sapiens,PROTEIN FAMILY,CHEMBL1127758,1,Scientific Literature,Bioorg Med Chem Lett,1994,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
614,CHEMBL60254,BALANOL,,550.52,2,2.15,Balanol,O=C(N[C@@H]1CNCCC[C@H]1OC(=O)c1cc(O)c(C(=O)c2c...,IC50,'=',...,Homo sapiens,PROTEIN FAMILY,CHEMBL5154740,1,Scientific Literature,J Med Chem,2022,,,INHIBITOR
619,CHEMBL5181354,,,566.61,1,4.46,18,CO[C@@H]1[C@H](N(C)C(=O)CCC(=O)O)C[C@H]2O[C@]1...,IC50,'=',...,Homo sapiens,PROTEIN FAMILY,CHEMBL5096167,1,Scientific Literature,Eur J Med Chem,2021,,,INHIBITOR
620,CHEMBL5188493,,,1839.13,,,1,CO[C@@H]1[C@H](N(C)C(=O)CCC(=O)NCCCOCCOCCOCCCN...,IC50,'=',...,Homo sapiens,PROTEIN FAMILY,CHEMBL5096167,1,Scientific Literature,Eur J Med Chem,2021,,,INHIBITOR
621,CHEMBL388978,STAUROSPORINE,,466.54,0,4.35,58,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,IC50,'=',...,Homo sapiens,PROTEIN FAMILY,CHEMBL5096167,1,Scientific Literature,Eur J Med Chem,2021,,,INHIBITOR


In [19]:
# Parse every SMILES string into an RDKit molecule and keep it in the 'ROMol' column.
chembl_pka_compounds['ROMol'] = chembl_pka_compounds['Smiles'].map(AllChem.MolFromSmiles)

In [20]:
# Standardize each parsed molecule (kinfraglib helper) and store its InChI string.
chembl_pka_compounds['inchi'] = [
    Chem.MolToInchi(utils.standardize_mol(mol))
    for mol in chembl_pka_compounds['ROMol']
]

In [21]:
# Give the ChEMBL ID column a short name that can be used with attribute access (`.chembl_id`).
# The renamed frame is rebound instead of using inplace=True: rename() returns a new DataFrame,
# so the frame shown in an earlier cell is not silently altered and the result is no longer tied
# to the filtered slice it was derived from.
chembl_pka_compounds = chembl_pka_compounds.rename(columns={'Molecule ChEMBL ID': 'chembl_id'})

In [22]:
def most_similar_chembl_ligand(ligand_inchi, chembl, use_morgan=False):
    """
    Get the most similar ChEMBL ligand (ChEMBL compound ID and Tanimoto similarity) to the query ligand.

    Parameters
    ----------
    ligand_inchi : str
        Recombined ligand (InChI)
    chembl : pandas.DataFrame
        ChEMBL ligands, columns `chembl_id` and `fingerprint` necessary. The fingerprints should
        be of the same type as the one selected with `use_morgan`. The data frame is only read,
        never modified.
    use_morgan : bool
        If True, the query ligand is encoded with a Morgan fingerprint (radius 3), otherwise with
        an RDKit fingerprint (maximum path length 5). Default: False.

    Returns
    -------
    list of [str, float]
        ChEMBL compound ID and Tanimoto similarity (rounded to two decimals) of the ChEMBL ligand
        most similar to the query ligand. If the search fails for any reason, the problem is
        printed and [None, None] is returned.
    """
    try:

        # get ROMol from recombined ligand InChI (RDKit returns None if the InChI cannot be parsed)
        ligand = Chem.MolFromInchi(ligand_inchi)
        if ligand is None:
            raise ValueError('InChI could not be converted to a molecule')

        # generate query ligand fingerprint
        if use_morgan:
            f_gen = rdFingerprintGenerator.GetMorganGenerator(radius=3)
        else:
            f_gen = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=5)
        query_fingerprint = f_gen.GetFingerprint(ligand)

        # get ChEMBL fingerprints as list
        chembl_fingerprints = chembl['fingerprint'].to_list()
        if not chembl_fingerprints:
            raise ValueError('no ChEMBL fingerprints to compare against')

        # get pairwise similarities; kept in a local list so that the caller's data frame
        # is not altered by the search
        similarities = DataStructs.BulkTanimotoSimilarity(query_fingerprint, chembl_fingerprints)

        # get position of the ligand with maximal similarity (first one in case of ties)
        most_similar_pos = int(np.argmax(similarities))

        return [
            chembl['chembl_id'].iloc[most_similar_pos],
            round(similarities[most_similar_pos], 2)
        ]

    except Exception as e:
        # best effort on purpose: one problematic ligand must not abort a whole screening run

        print(f'Most similar ChEMBL ligand search problem for {ligand_inchi}: {e}')
        return [None, None]


In [23]:

# Settings for this run: RDKit fingerprints (USE_MORGAN switched off) on the recombination results.
USE_MORGAN = False
COMPOUNDS_PATH = '../results_5n1f_25_02/5n1f/results.sdf'

# Fingerprint every ChEMBL compound; the molecule is rebuilt from its InChI first.
if not USE_MORGAN:
    rdkit_gen = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=5)
    chembl_pka_compounds['fingerprint'] = chembl_pka_compounds['inchi'].map(
        lambda inchi: rdkit_gen.GetFingerprint(Chem.MolFromInchi(inchi))
    )
else:
    morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=3)
    chembl_pka_compounds['fingerprint'] = chembl_pka_compounds['inchi'].map(
        lambda inchi: morgan_gen.GetFingerprint(Chem.MolFromInchi(inchi))
    )


In [24]:

print('calculated fingerprints')

# read results data
data = utils_eval.read_mols(COMPOUNDS_PATH)
# estimated binding affinity = midpoint of the HYDE upper and lower affinity boundaries
# (column arithmetic instead of a row-wise apply; this also works for an empty table)
data['binding_affinity'] = (
    data['BIOSOLVEIT.HYDE_ESTIMATED_AFFINITY_UPPER_BOUNDARY [nM]']
    + data['BIOSOLVEIT.HYDE_ESTIMATED_AFFINITY_LOWER_BOUNDARY [nM]']
) / 2
# post filtering: strictly below 1000, the same cutoff as used for the ChEMBL compounds and for
# the Morgan fingerprint run
data_post_filtered = data[data['binding_affinity'] < 1000].copy()

# standardized InChI of every remaining compound
data_post_filtered['inchi'] = data_post_filtered['ROMol'].map(
    lambda mol: Chem.MolToInchi(utils.standardize_mol(mol))
)

# calculate most similar ChEMBL ligand (compound ID and Tanimoto similarity) per compound
most_similar_chembl_ligands = [
    most_similar_chembl_ligand(ligand_inchi, chembl_pka_compounds, USE_MORGAN)
    for ligand_inchi in data_post_filtered['inchi']
]
data_post_filtered['most_similar_chembl_ligand.compound_id'] = [res[0] for res in most_similar_chembl_ligands]
data_post_filtered['most_similar_chembl_ligand.similarity'] = [res[1] for res in most_similar_chembl_ligands]

calculated fingerprints


In [25]:
# Descriptive statistics of each compound's best Tanimoto similarity to a ChEMBL ligand.
similarity_summary = data_post_filtered['most_similar_chembl_ligand.similarity'].describe()
print(f"With rdkit fingerprints {similarity_summary}")

With rdkit fingerprints count    1196.000000
mean        0.341622
std         0.079833
min         0.180000
25%         0.290000
50%         0.330000
75%         0.380000
max         1.000000
Name: most_similar_chembl_ligand.similarity, dtype: float64


In [26]:

# Settings for this run: Morgan fingerprints (USE_MORGAN switched on) on the same results file.
USE_MORGAN = True
COMPOUNDS_PATH = '../results_5n1f_25_02/5n1f/results.sdf'

# Recompute the 'fingerprint' column of the ChEMBL compounds with the selected generator;
# each molecule is rebuilt from its InChI before it is fingerprinted.
if not USE_MORGAN:
    rdkit_gen = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=5)
    chembl_pka_compounds['fingerprint'] = chembl_pka_compounds['inchi'].map(
        lambda inchi: rdkit_gen.GetFingerprint(Chem.MolFromInchi(inchi))
    )
else:
    morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=3)
    chembl_pka_compounds['fingerprint'] = chembl_pka_compounds['inchi'].map(
        lambda inchi: morgan_gen.GetFingerprint(Chem.MolFromInchi(inchi))
    )


In [27]:

print('calculated fingerprints')

# read results data
data = utils_eval.read_mols(COMPOUNDS_PATH)
# estimated binding affinity = midpoint of the HYDE upper and lower affinity boundaries
# (column arithmetic instead of a row-wise apply; this also works for an empty table)
data['binding_affinity'] = (
    data['BIOSOLVEIT.HYDE_ESTIMATED_AFFINITY_UPPER_BOUNDARY [nM]']
    + data['BIOSOLVEIT.HYDE_ESTIMATED_AFFINITY_LOWER_BOUNDARY [nM]']
) / 2
# post filtering: keep compounds with an estimated affinity below 1000
data_post_filtered = data[data['binding_affinity'] < 1000].copy()

# standardized InChI of every remaining compound
data_post_filtered['inchi'] = data_post_filtered['ROMol'].map(
    lambda mol: Chem.MolToInchi(utils.standardize_mol(mol))
)

# calculate most similar ChEMBL ligand (compound ID and Tanimoto similarity) per compound
most_similar_chembl_ligands = [
    most_similar_chembl_ligand(ligand_inchi, chembl_pka_compounds, USE_MORGAN)
    for ligand_inchi in data_post_filtered['inchi']
]
data_post_filtered['most_similar_chembl_ligand.compound_id'] = [res[0] for res in most_similar_chembl_ligands]
data_post_filtered['most_similar_chembl_ligand.similarity'] = [res[1] for res in most_similar_chembl_ligands]

calculated fingerprints


In [28]:
# Descriptive statistics of each compound's best Tanimoto similarity to a ChEMBL ligand.
similarity_summary = data_post_filtered['most_similar_chembl_ligand.similarity'].describe()
print(f"With Morgan fingerprints {similarity_summary}")

With Morgan fingerprints count    1196.000000
mean        0.170025
std         0.069073
min         0.090000
25%         0.140000
50%         0.150000
75%         0.180000
max         1.000000
Name: most_similar_chembl_ligand.similarity, dtype: float64
