In [1]:
import math
from pathlib import Path
from zipfile import ZipFile
from tempfile import TemporaryDirectory

import numpy as np
import pandas as pd
from rdkit.Chem import PandasTools
from chembl_webresource_client.new_client import new_client
from tqdm.auto import tqdm

In [2]:
# Instantiate handles for interacting with ChEMBL
targets_api = new_client.target
compounds_api = new_client.molecule
bioactivities_api = new_client.activity

In [3]:
# After looking at RCSB for imatinib crystallized with a tyrosine kinase inhibitor,
# I chose a target from PDB ID 2HYY, which has a UniProt ID P00519
uniprot_id = 'P00519'

In [4]:
# Get target information from ChEMBL but restrict it to specified values only
targets = targets_api.get(target_components__accession=uniprot_id).only(
    "target_chembl_id", "organism", "pref_name", "target_type"
)
print(f'The type of the targets is "{type(targets)}"')

The type of the targets is "<class 'chembl_webresource_client.query_set.QuerySet'>"


In [6]:
# Get targets from ChEMBL
targets = pd.DataFrame.from_records(targets)
targets

Unnamed: 0,organism,pref_name,target_chembl_id,target_type
0,Homo sapiens,Tyrosine-protein kinase ABL,CHEMBL1862,SINGLE PROTEIN
1,Homo sapiens,Tyrosine-protein kinase ABL,CHEMBL1862,SINGLE PROTEIN
2,Homo sapiens,Bcr/Abl fusion protein,CHEMBL2096618,CHIMERIC PROTEIN
3,Homo sapiens,Tyrosine-protein kinase ABL,CHEMBL2111414,PROTEIN FAMILY
4,Homo sapiens,Ras and Rab interactor 1/Tyrosine-protein kina...,CHEMBL3885630,PROTEIN-PROTEIN INTERACTION
5,Homo sapiens,Transcription factor ETV6/Tyrosine-protein kin...,CHEMBL3885645,CHIMERIC PROTEIN
6,Homo sapiens,Baculoviral IAP repeat-containing protein 2/BC...,CHEMBL4296119,PROTEIN-PROTEIN INTERACTION
7,Homo sapiens,E3 ubiquitin-protein ligase XIAP/BCR/ABL,CHEMBL4296120,PROTEIN-PROTEIN INTERACTION
8,Homo sapiens,Cereblon/BCR/ABL,CHEMBL4296137,PROTEIN-PROTEIN INTERACTION
9,Homo sapiens,VHL/ABL1,CHEMBL4523725,PROTEIN-PROTEIN INTERACTION


In [8]:
# I'm choosing CHEMBL1862, it's single protein and representative of the TKI ABL
target = targets.iloc[0]
target

organism                           Homo sapiens
pref_name           Tyrosine-protein kinase ABL
target_chembl_id                     CHEMBL1862
target_type                      SINGLE PROTEIN
Name: 0, dtype: object

In [9]:
chembl_id = target.target_chembl_id
print(f"The target ChEMBL ID is {chembl_id}")

The target ChEMBL ID is CHEMBL1862


In [10]:
# Get binding assay data
bioactivities = bioactivities_api.filter(
    target_chembl_id=chembl_id, type="IC50", relation="=", assay_type="B"
).only(
    "activity_id",
    "assay_chembl_id",
    "assay_description",
    "assay_type",
    "molecule_chembl_id",
    "type",
    "standard_units",
    "relation",
    "standard_value",
    "target_chembl_id",
    "target_organism",
)

print(f"Length and type of bioactivities object: {len(bioactivities)}, {type(bioactivities)}")

Length and type of bioactivities object: 2197, <class 'chembl_webresource_client.query_set.QuerySet'>


In [12]:
# Look at an entry in bioactivities
print(f"Length and type of first element: {len(bioactivities[0])}, {type(bioactivities[0])}")
bioactivities[0]

Length and type of first element: 13, <class 'dict'>


{'activity_id': 146141,
 'assay_chembl_id': 'CHEMBL806527',
 'assay_description': 'Radioligand displacement assay for the binding of [125I]Glu-Pro-Gln-pTyr-Glu-Glu-Ile-Pro-Ile-Tyr-Leu to ABL SH2 domain',
 'assay_type': 'B',
 'molecule_chembl_id': 'CHEMBL13462',
 'relation': '=',
 'standard_units': 'nM',
 'standard_value': '4000.0',
 'target_chembl_id': 'CHEMBL1862',
 'target_organism': 'Homo sapiens',
 'type': 'IC50',
 'units': 'uM',
 'value': '4.0'}

In [13]:
# Get the query set from ChEMBL
bioactivities_df = pd.DataFrame.from_records(bioactivities)
print(f"DataFrame shape: {bioactivities_df.shape}")
bioactivities_df.head()

DataFrame shape: (2198, 13)


Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,standard_units,standard_value,target_chembl_id,target_organism,type,units,value
0,146141,CHEMBL806527,Radioligand displacement assay for the binding...,B,CHEMBL13462,=,nM,4000.0,CHEMBL1862,Homo sapiens,IC50,uM,4.0
1,146141,CHEMBL806527,Radioligand displacement assay for the binding...,B,CHEMBL13462,=,nM,4000.0,CHEMBL1862,Homo sapiens,IC50,uM,4.0
2,146143,CHEMBL762575,Inhibition of [35S]-labeled SH2-GST Abl bindin...,B,CHEMBL13462,=,nM,16000.0,CHEMBL1862,Homo sapiens,IC50,uM,16.0
3,148569,CHEMBL806527,Radioligand displacement assay for the binding...,B,CHEMBL414123,=,nM,15000.0,CHEMBL1862,Homo sapiens,IC50,uM,15.0
4,148571,CHEMBL762575,Inhibition of [35S]-labeled SH2-GST Abl bindin...,B,CHEMBL414123,=,nM,2900.0,CHEMBL1862,Homo sapiens,IC50,uM,2.9


In [16]:
# We need to clean-up the bioactivites data and do some pandas calisthenics
bioactivities_df["units"].unique()  # Illustrated here, see output

bioactivities_df.drop(["units", "value"], axis=1, inplace=True)
bioactivities_df.head()

KeyError: 'units'