# KLIFS: Get IFPs and ChEMBL data

The `klifs_utils` library is still a work in progress, so its installation procedure and API may change.

In [1]:
import requests
import time

from bs4 import BeautifulSoup
#from chembl_webresource_client.new_client import new_client
import pandas as pd

import klifs_utils as klifs



## Aim of this notebook

- [Done] Get the interaction fingerprint (IFP) from KLIFS for a given kinase-ligand complex
- [Done] Get the associated binding affinity data
- Merge both data sets in one table

## IFP for all kinase-ligand complexes

Let's do this step by step first:

### 1. Get all kinases and their KLIFS kinase IDs

In [2]:
# Fetch the table of all kinases listed in KLIFS (all species)
kinases = klifs.remote.kinases.kinase_names()
print(f'All kinases: {kinases.shape}')

# Keep only the human kinases; from here on `kinases` refers to the filtered table
kinases = kinases[kinases.species == 'Human']
kinases

All kinases: (1127, 4)


Unnamed: 0,kinase_ID,name,full_name,species
4,277,AAK1,AP2 associated kinase 1,Human
6,443,AATK,apoptosis-associated tyrosine kinase,Human
8,392,ABL1,"ABL proto-oncogene 1, non-receptor tyrosine ki...",Human
10,393,ABL2,"ABL proto-oncogene 2, non-receptor tyrosine ki...",Human
12,516,ACVR1,activin A receptor type I,Human
...,...,...,...,...
1115,339,WNK2,WNK lysine deficient protein kinase 2,Human
1117,338,WNK3,WNK lysine deficient protein kinase 3,Human
1119,337,WNK4,WNK lysine deficient protein kinase 4,Human
1121,463,YES1,"YES proto-oncogene 1, Src family tyrosine kinase",Human


In [3]:
# KLIFS kinase IDs of the human kinases, used to query their structures in the next step
kinase_ids = kinases.kinase_ID.to_list()
len(kinase_ids)

555

### 2. Get all structures and their KLIFS structure IDs

In [4]:
# Pass the full list of kinase IDs in a single function call to get one table of structures
structures = klifs.remote.structures.structures_from_kinase_id(kinase_ids)
structures

Unnamed: 0,structure_ID,kinase,species,kinase_ID,pdb,alt,chain,rmsd1,rmsd2,pocket,...,bp_I_A,bp_I_B,bp_II_in,bp_II_A_in,bp_II_B_in,bp_II_out,bp_II_B,bp_III,bp_IV,bp_V
0,10879,AKT1,Human,1,6npz,B,B,0.776,2.092,KLLGKGTFGKVILYAMKILHTLTENRVLQNSRPFLTALKYSCFVME...,...,False,False,False,False,False,False,False,False,False,False
1,10970,AKT1,Human,1,6hhf,,A,0.961,2.358,KLLGKGTFGKVILYAMKIL___________SRPFLTALKYSCFVME...,...,False,False,False,False,False,True,False,True,True,False
2,10914,AKT1,Human,1,6hhj,,A,0.977,2.648,KLLGKGTFGKVILYAMKILHTLTENRVLQNSRPFLTALKYSCFVME...,...,False,False,False,False,False,True,False,True,True,False
3,6708,AKT1,Human,1,5kcv,,A,0.956,2.328,KLLGKGTFGKVILYAMKIL_______VLQNSRPFLTALKYSCFVME...,...,False,False,False,False,False,True,False,True,True,False
4,10916,AKT1,Human,1,6hhg,,A,0.973,2.385,KLLGKGTFGKVILYAMKIL___________SRPFLTALKYSCFVME...,...,False,False,False,False,False,True,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10279,9095,PI4K2B,Human,1091,4wtv,A,A,1.554,3.088,ERISQGSSGSYFVGVFKPKGYLSEAGAYLVDNSIVPKTKVVGSFQL...,...,False,False,False,False,False,False,False,False,False,False
10280,9098,PI4K2B,Human,1091,4wtv,B,A,1.554,3.088,ERISQGSSGSYFVGVFKPKGYLSEAGAYLVDNSIVPKTKVVGSFQL...,...,False,False,False,False,False,False,False,False,False,False
10281,9096,PI4K2B,Human,1091,4wtv,B,B,1.554,3.041,ERIS___SGSYFVGVFKPKGYLSEAGAYLVDNSIVPKTKVVGSFQL...,...,False,False,False,False,False,False,False,False,False,False
10282,9070,PI4KA,Human,1096,6bq1,,A,1.704,2.676,_PMQSAAKAPYLAAIFKVGDCRQDMLALQIIDLFVFPYRVVCGVIE...,...,False,True,False,False,False,False,False,False,False,False


In [5]:
# Each row is one KLIFS structure entry; the same PDB ID can appear in several rows
# (with different `alt`/`chain` values), hence the two different counts
print(f'{structures.shape[0]} entries covering {len(structures.pdb.unique())} PDB structures')

10284 entries covering 4760 PDB structures


In [6]:
# KLIFS structure IDs (one per row of `structures`), used to query the IFPs in the next step
structure_ids = structures.structure_ID.to_list()
len(structure_ids)

10284

### 3. Get all interaction fingerprints

In [7]:
# Request the IFPs for all structure IDs; the result maps `structure_ID` to its `IFP` bit string
interaction_fingerprints = klifs.remote.interactions.interaction_fingerprint(structure_ids)
interaction_fingerprints

Unnamed: 0,structure_ID,IFP
0,1,0000000000000010000000000000000000000000000000...
1,3,0000000000000010000000000000000000000000000000...
2,5,0000000000000010000000000000000000000000000000...
3,6,0000000000000010000001000000000000000000000000...
4,7,0000000000000010001001000000000000000000000000...
...,...,...
8713,12158,0000000000000010000001000000000000000000000000...
8714,12159,0000000000000010000001000000000000000000000000...
8715,12160,0000000000000010000000000000000000000000000000...
8716,12161,0000000000000010000000000000000000000000000000...


Merge the interaction fingerprint with the structural data in `structures`.

In [8]:
# Outer join on the shared `structure_ID` column: structures without an IFP are kept
# (with a missing IFP value) so that they can be inspected before being dropped
data = interaction_fingerprints.merge(structures, on='structure_ID', how='outer')

In [9]:
# All entries of the outer merge: structures with and without an IFP
data

Unnamed: 0,structure_ID,IFP,kinase,species,kinase_ID,pdb,alt,chain,rmsd1,rmsd2,...,bp_I_A,bp_I_B,bp_II_in,bp_II_A_in,bp_II_B_in,bp_II_out,bp_II_B,bp_III,bp_IV,bp_V
0,1,0000000000000010000000000000000000000000000000...,EphA7,Human,415,3dko,A,A,0.944,2.254,...,True,True,False,False,False,True,False,True,False,False
1,3,0000000000000010000000000000000000000000000000...,EphA7,Human,415,3dko,B,A,0.944,2.254,...,True,True,False,False,False,True,False,True,False,False
2,5,0000000000000010000000000000000000000000000000...,ITK,Human,474,3v8t,B,A,0.842,2.047,...,False,False,False,False,False,False,False,False,False,False
3,6,0000000000000010000001000000000000000000000000...,ITK,Human,474,4kio,A,D,0.842,2.151,...,False,False,False,False,False,False,False,False,False,False
4,7,0000000000000010001001000000000000000000000000...,ITK,Human,474,4kio,B,C,0.840,2.148,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10279,9332,,PIK3C3,Human,1088,4uwl,,A,1.542,2.987,...,False,False,False,False,False,False,False,False,False,False
10280,9334,,PIK3C3,Human,1088,4uwh,B,A,1.539,2.992,...,False,False,False,False,False,False,False,False,False,False
10281,9264,,PI4K2A,Human,1089,5eut,,A,1.545,3.070,...,False,False,False,False,False,False,False,False,False,False
10282,9141,,PI4KB,Human,1090,5c46,,E,1.551,2.966,...,False,False,False,False,False,False,False,False,False,False


Interesting, not all KLIFS structures have IFPs.

In [10]:
# Entries without IFP: structures that had no matching row in `interaction_fingerprints`
data[data.IFP.isna()]

Unnamed: 0,structure_ID,IFP,kinase,species,kinase_ID,pdb,alt,chain,rmsd1,rmsd2,...,bp_I_A,bp_I_B,bp_II_in,bp_II_A_in,bp_II_B_in,bp_II_out,bp_II_B,bp_III,bp_IV,bp_V
8718,10879,,AKT1,Human,1,6npz,B,B,0.776,2.092,...,False,False,False,False,False,False,False,False,False,False
8719,10439,,AKT1,Human,1,6buu,B,B,0.776,2.091,...,False,False,False,False,False,False,False,False,False,False
8720,2545,,AKT1,Human,1,3o96,,A,0.938,2.367,...,False,False,False,False,False,False,False,False,False,False
8721,10431,,AKT1,Human,1,6c0i,B,A,0.778,2.092,...,False,False,False,False,False,False,False,False,False,False
8722,10881,,AKT1,Human,1,6npz,A,B,0.776,2.092,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10279,9332,,PIK3C3,Human,1088,4uwl,,A,1.542,2.987,...,False,False,False,False,False,False,False,False,False,False
10280,9334,,PIK3C3,Human,1088,4uwh,B,A,1.539,2.992,...,False,False,False,False,False,False,False,False,False,False
10281,9264,,PI4K2A,Human,1089,5eut,,A,1.545,3.070,...,False,False,False,False,False,False,False,False,False,False
10282,9141,,PI4KB,Human,1090,5c46,,E,1.551,2.966,...,False,False,False,False,False,False,False,False,False,False


A look at a few of these structures online shows that the ligand is missing:
- https://klifs.vu-compmedchem.nl/details.php?structure_id=10879
- https://klifs.vu-compmedchem.nl/details.php?structure_id=9332

We therefore drop all entries without an IFP.

In [11]:
# Entries with IFP: keep only the rows whose IFP value is present
data_with_ifp = data[data.IFP.notna()]
data_with_ifp.shape

(8718, 37)

### Steps 1-3 in short

In [12]:
def get_all_interaction_fingerprints(species='Human'):
    """
    Get the interaction fingerprints (IFPs) for all KLIFS structures of one species.

    Parameters
    ----------
    species : str or None
        Species name as it appears in the `species` column of the KLIFS kinase table
        (default: 'Human'). If None, kinases of all species are used.

    Returns
    -------
    pandas.DataFrame
        One row per KLIFS structure that has an IFP: the `structure_ID` and `IFP` columns
        followed by the structure details returned by KLIFS.
    """

    # Get all kinases (optionally of one species only) and their KLIFS kinase IDs
    kinases = klifs.remote.kinases.kinase_names()
    if species is not None:
        kinases = kinases[kinases.species == species]
    kinase_ids = kinases.kinase_ID.to_list()

    # Get all KLIFS structures and structure IDs from kinase IDs
    structures = klifs.remote.structures.structures_from_kinase_id(kinase_ids)
    structure_ids = structures.structure_ID.to_list()

    # Get all interaction fingerprints (IFP) from structure IDs
    interaction_fingerprints = klifs.remote.interactions.interaction_fingerprint(structure_ids)

    # Merge data about structures with data on IFPs; structures without an IFP end up
    # with a missing `IFP` value and are dropped right after
    data = pd.merge(
        interaction_fingerprints,
        structures,
        on='structure_ID',
        how='outer'
    )

    # Return an independent copy so that callers can add or modify columns without
    # triggering pandas' SettingWithCopyWarning
    return data[data.IFP.notna()].copy()

In [13]:
# Run steps 1-3 again through the function (this calls the `klifs.remote` functions once more)
ifp_data = get_all_interaction_fingerprints()

In [14]:
ifp_data

Unnamed: 0,structure_ID,IFP,kinase,species,kinase_ID,pdb,alt,chain,rmsd1,rmsd2,...,bp_I_A,bp_I_B,bp_II_in,bp_II_A_in,bp_II_B_in,bp_II_out,bp_II_B,bp_III,bp_IV,bp_V
0,1,0000000000000010000000000000000000000000000000...,EphA7,Human,415,3dko,A,A,0.944,2.254,...,True,True,False,False,False,True,False,True,False,False
1,3,0000000000000010000000000000000000000000000000...,EphA7,Human,415,3dko,B,A,0.944,2.254,...,True,True,False,False,False,True,False,True,False,False
2,5,0000000000000010000000000000000000000000000000...,ITK,Human,474,3v8t,B,A,0.842,2.047,...,False,False,False,False,False,False,False,False,False,False
3,6,0000000000000010000001000000000000000000000000...,ITK,Human,474,4kio,A,D,0.842,2.151,...,False,False,False,False,False,False,False,False,False,False
4,7,0000000000000010001001000000000000000000000000...,ITK,Human,474,4kio,B,C,0.840,2.148,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8713,12158,0000000000000010000001000000000000000000000000...,EGFR,Human,406,6jrj,,A,0.789,2.113,...,True,True,False,False,False,False,False,False,False,False
8714,12159,0000000000000010000001000000000000000000000000...,EGFR,Human,406,6jrk,,A,0.779,2.111,...,True,True,False,False,False,False,False,False,False,False
8715,12160,0000000000000010000000000000000000000000000000...,ALK2,Human,516,6jux,B,A,0.783,1.997,...,False,True,False,False,False,False,False,False,False,False
8716,12161,0000000000000010000000000000000000000000000000...,ALK2,Human,516,6jux,A,A,0.783,1.997,...,False,True,False,False,False,False,False,False,False,False


In [15]:
# Save to a CSV file in the working directory; the positional row index is not written
ifp_data.to_csv('klifs_all_human_ifps.csv', index=False)

## ChEMBL binding affinities for ligand (this could be added to `klifs_utils`?)

Two options:

- (a) Web scraping from KLIFS (extracting the ligand affinity data from the HTML code)
- (b) Using the `chembl_webresource_client` to get data by ligand PDB ID

Since (b) will involve filtering and affinity conversion steps that may or may not result in the data shown in KLIFS, I decided to go for (a).
If you want to try out (b), take a look at this TeachOpenCADD notebook: https://github.com/dominiquesydow/TeachOpenCADD/blob/T1-refactoring/talktorials/1_ChEMBL/T1_ChEMBL.ipynb

In [16]:
def ligand_affinity_from_klifs(structure_ids, timeout=60):
    """
    Scrape the ligand affinity data shown on the KLIFS website for a set of structures.

    For each structure ID, the KLIFS details page is requested and parsed for
    (a) the text of the 'Ligand affinity' section, which is stored as ChEMBL compound ID, and
    (b) the HTML table with the ID 'chemblDetails', which holds the affinity values.

    Parameters
    ----------
    structure_ids : iterable of int
        KLIFS structure IDs.
    timeout : float or tuple of float
        Timeout in seconds passed on to `requests.get` for every page request (default: 60),
        so that a stalled connection cannot block the whole run.

    Returns
    -------
    tuple of (pandas.DataFrame, list, list)
        - Ligand affinities of all structures with an affinity table, indexed by structure ID
          (first level) and row number within that structure's table (second level).
          An empty DataFrame is returned if no affinity table was found at all.
        - Structure IDs for which no ChEMBL compound ID was found.
        - Structure IDs for which no affinity table was found.

    Raises
    ------
    requests.HTTPError
        If a details page responds with an HTTP error status.
    """

    # Materialize once: allows generators as input and gives a length for the progress output
    structure_ids = list(structure_ids)
    n_structures = len(structure_ids)

    # Store here ligand affinities (table on website) for each structure ID
    ligand_affinity_by_structure_id = {}

    # Save here structure_ids where no ChEMBL compound ID or ligand affinity table was found
    no_chembl_ids = []
    no_chembl_bioactivities = []

    for i, structure_id in enumerate(structure_ids):

        if i % 1000 == 0:
            print(f'Progress: {i}/{n_structures}')
            # After x requests, wait some time (not before the very first request)
            if i > 0:
                time.sleep(30)

        # Request content from URL
        r = requests.get(
            f'https://klifs.vu-compmedchem.nl/details.php?structure_id={structure_id}',
            timeout=timeout
        )
        r.raise_for_status()

        # Transform into html
        html = BeautifulSoup(r.text, features='html.parser')

        # Get ChEMBL compound ID: search all tables on the website (actual tables but also
        # list-like elements) for the one whose header cell reads 'Ligand affinity'.
        # If several tables match, the last one wins.
        chembl_compound_id = None

        for table in html.find_all('table'):
            header_cell = table.find('td', {'class': 'detailHeader'})
            if header_cell is None or header_cell.get_text().strip() != 'Ligand affinity':
                continue
            info_cell = table.find('td', {'class': 'detailInfo'})
            if info_cell is not None:
                chembl_compound_id = info_cell.get_text()

        if chembl_compound_id is None:
            no_chembl_ids.append(structure_id)

        # Get ligand affinities
        chembl_details = html.find('table', id='chemblDetails')

        if chembl_details is None:
            no_chembl_bioactivities.append(structure_id)
            continue

        # Get header and row data from table and create DataFrame
        table_rows = chembl_details.find_all('tr')
        header = [cell.get_text() for cell in table_rows[0].find_all('th')]

        rows = table_rows[1:]
        # Rows carrying the 'highlightChEMBL' marker; presumably these are the measurements
        # for the query kinase - TODO confirm
        highlighted = ['highlightChEMBL' in str(row) for row in rows]
        rows = [[cell.get_text() for cell in row.find_all('td')] for row in rows]

        ligand_affinities = pd.DataFrame(rows, columns=header)
        # Cell texts are strings; convert the numeric columns
        ligand_affinities = ligand_affinities.astype(
            {
                'Median': 'float64',
                'Min': 'float64',
                'Max': 'float64',
                'Records': 'int32'
            },
        )
        ligand_affinities['ChEMBL compound ID'] = chembl_compound_id
        ligand_affinities['Highlighted'] = highlighted

        # Add DataFrame to dict
        ligand_affinity_by_structure_id[structure_id] = ligand_affinities

    # pd.concat raises a ValueError when given nothing to concatenate
    if not ligand_affinity_by_structure_id:
        return pd.DataFrame(), no_chembl_ids, no_chembl_bioactivities

    return pd.concat(ligand_affinity_by_structure_id), no_chembl_ids, no_chembl_bioactivities

In [17]:
# Takes up to an hour: one web request per structure ID (see the progress output below)
ligand_affinity_by_structure_id, no_chembl_ids, no_chembl_bioactivities = ligand_affinity_from_klifs(
    ifp_data.structure_ID.to_list()
)

Progress: 0/8718
Progress: 1000/8718
Progress: 2000/8718
Progress: 3000/8718
Progress: 4000/8718
Progress: 5000/8718
Progress: 6000/8718
Progress: 7000/8718
Progress: 8000/8718


In [18]:
ligand_affinity_by_structure_id

Unnamed: 0,Unnamed: 1,Species,Kinase (ChEMBL naming),Median,Min,Max,Type,Records,ChEMBL compound ID,Highlighted
1,0,Homo sapiens,Ephrin type-B receptor 2,7.0,7.0,7.4,pEC50,2,CHEMBL552425,False
3,0,Homo sapiens,Ephrin type-B receptor 2,7.0,7.0,7.4,pEC50,2,CHEMBL552425,False
5,0,Homo sapiens,Tyrosine-protein kinase ITK/TSK,9.5,9.5,9.5,pIC50,1,CHEMBL2017556,True
5,1,Homo sapiens,Tyrosine-protein kinase LCK,7.4,7.4,7.4,pIC50,1,CHEMBL2017556,False
5,2,Homo sapiens,Tyrosine-protein kinase SYK,6.5,6.5,6.5,pIC50,1,CHEMBL2017556,False
...,...,...,...,...,...,...,...,...,...,...
12105,279,Homo sapiens,Vascular endothelial growth factor receptor 2,8.8,7.5,9.7,pKd,6,CHEMBL535,False
12105,280,Homo sapiens,Vascular endothelial growth factor receptor 2,8.1,8.1,8.5,pKi,4,CHEMBL535,False
12105,281,Homo sapiens,Vascular endothelial growth factor receptor 3,8.1,8.1,8.1,pIC50,1,CHEMBL535,False
12105,282,Homo sapiens,Vascular endothelial growth factor receptor 3,7.3,7.3,7.5,pKd,5,CHEMBL535,False


In [19]:
# Keep the index when saving: its first level is the KLIFS structure ID and its second level
# the row number within that structure's affinity table
ligand_affinity_by_structure_id.to_csv('klifs_all_ligand_affinities_human_kinases.csv')

In [20]:
# KLIFS entries without ChEMBL compound ID
# (no 'Ligand affinity' section was found on the structure's details page)
len(no_chembl_ids)

3297

In [21]:
# KLIFS entries without bioactivity values
# (no 'chemblDetails' table was found on the structure's details page)
len(no_chembl_bioactivities)

4747

In [25]:
# KLIFS entries that do have a ChEMBL compound ID but no bioactivity values
# (set difference: IDs lacking the affinity table minus IDs lacking a compound ID).
# In that case the KLIFS website says: "No (p)Ki/(p)IC50/(p)EC50 values for kinases found (confidence ≥ 8)."
len(set(no_chembl_bioactivities) - set(no_chembl_ids))

1450