In [1]:
# Collect binding-affinity data from ChEMBL
# https://projects.volkamerlab.org/teachopencadd/talktorials/T001_query_chembl.html

In [2]:
import numpy as np
import pandas as pd
from rdkit.Chem import PandasTools
from chembl_webresource_client.new_client import new_client
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Resource handles taken from the ChEMBL web-resource client imported above:
# one each for targets, molecules (compounds) and activities.
targets_api = new_client.target
compounds_api = new_client.molecule
bioactivities_api = new_client.activity


Get target data (EGFR kinase)

In [4]:
# Query ChEMBL for every target whose components include this UniProt accession,
# restricting the returned records to the fields listed below.
uniprot_id = "P00533"
target_fields = ("target_chembl_id", "organism", "pref_name", "target_type")
targets = targets_api.get(target_components__accession=uniprot_id).only(*target_fields)

# Download the matching records into a DataFrame and display it.
targets = pd.DataFrame.from_records(targets)
targets

Unnamed: 0,organism,pref_name,target_chembl_id,target_type
0,Homo sapiens,Epidermal growth factor receptor erbB1,CHEMBL203,SINGLE PROTEIN
1,Homo sapiens,Epidermal growth factor receptor erbB1,CHEMBL203,SINGLE PROTEIN
2,Homo sapiens,Epidermal growth factor receptor and ErbB2 (HE...,CHEMBL2111431,PROTEIN FAMILY
3,Homo sapiens,Epidermal growth factor receptor,CHEMBL2363049,PROTEIN FAMILY
4,Homo sapiens,MER intracellular domain/EGFR extracellular do...,CHEMBL3137284,CHIMERIC PROTEIN
5,Homo sapiens,Protein cereblon/Epidermal growth factor receptor,CHEMBL4523680,PROTEIN-PROTEIN INTERACTION
6,Homo sapiens,EGFR/PPP1CA,CHEMBL4523747,PROTEIN-PROTEIN INTERACTION
7,Homo sapiens,VHL/EGFR,CHEMBL4523998,PROTEIN-PROTEIN INTERACTION
8,Homo sapiens,Baculoviral IAP repeat-containing protein 2/Ep...,CHEMBL4802031,PROTEIN-PROTEIN INTERACTION


In [5]:
# Select the target of interest by position: the first row of the targets table.
# In the result displayed above, row 0 is the SINGLE PROTEIN entry CHEMBL203.
# NOTE(review): this relies on the order of rows returned by the query - confirm
# it is stable, or select by target_chembl_id / target_type instead.
target = targets.iloc[0]
target

organism                                      Homo sapiens
pref_name           Epidermal growth factor receptor erbB1
target_chembl_id                                 CHEMBL203
target_type                                 SINGLE PROTEIN
Name: 0, dtype: object

In [6]:
# Keep the ChEMBL identifier of the selected target for the activity query.
chembl_id = target["target_chembl_id"]
print(f"The target ChEMBL ID is {chembl_id}")

The target ChEMBL ID is CHEMBL203


get bioactivity data

In [7]:
# Measurement types to query; append "IC50" to this list to pull IC50 records too.
data_type = ["Ki", "Kd"]

# Fields requested for every activity record.
activity_fields = (
    "activity_id",
    "assay_chembl_id",
    "assay_description",
    "assay_type",
    "molecule_chembl_id",
    "type",
    "standard_units",
    "relation",
    "standard_value",
    "target_chembl_id",
    "target_organism",
)

# One query per measurement type: exact ("=") values from assays of type "B"
# recorded against the selected target.
bioactivities_list = []
for atype in data_type:
    bioactivities = (
        bioactivities_api
        .filter(target_chembl_id=chembl_id, type=atype, relation="=", assay_type="B")
        .only(*activity_fields)
    )
    bioactivities_df = pd.DataFrame.from_dict(bioactivities)
    bioactivities_list.append(bioactivities_df)
    print(f"{atype} - Length of bioactivities object: {len(bioactivities)}")

# Stack the per-type tables into one frame with a fresh 0..n-1 index.
all_bioactivities_df = pd.concat(bioactivities_list, ignore_index=True)


Ki - Length of bioactivities object: 259
Kd - Length of bioactivities object: 714


download bioactivity data from chembl

In [8]:
# Preview the first two rows of the combined bioactivity table.
all_bioactivities_df.head(2)

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,standard_units,standard_value,target_chembl_id,target_organism,type,units,value
0,32773,CHEMBL675636,Inhibitory concentration of EGF dependent auto...,B,CHEMBL66879,=,nM,1000000.0,CHEMBL203,Homo sapiens,Ki,uM,1000.0
1,32781,CHEMBL675636,Inhibitory concentration of EGF dependent auto...,B,CHEMBL77085,=,nM,24000.0,CHEMBL203,Homo sapiens,Ki,uM,24.0


In [9]:
# Clean the bioactivity table in one non-mutating chain so the cell can be
# re-run safely and never calls an in-place method on a filtered slice:
#   1. drop the redundant "units"/"value" columns (the standard_* columns are kept);
#      errors="ignore" makes a second run a no-op instead of a KeyError,
#   2. make sure standard_value is numeric,
#   3. delete entries with any missing value,
#   4. keep only measurements whose standard_units is "nM",
#   5. keep the first record per molecule_chembl_id,
#   6. renumber the rows 0..n-1.
all_bioactivities_df = (
    all_bioactivities_df
    .drop(columns=["units", "value"], errors="ignore")
    .astype({"standard_value": "float64"})
    .dropna(axis=0, how="any")
    .loc[lambda df: df["standard_units"] == "nM"]
    .drop_duplicates("molecule_chembl_id", keep="first")
    .reset_index(drop=True)
)


get compound data

In [10]:
# Ask the molecule endpoint for the structure records of every molecule that
# is still present in the cleaned bioactivity table.
molecule_ids = all_bioactivities_df["molecule_chembl_id"].tolist()
compounds_provider = compounds_api.filter(molecule_chembl_id__in=molecule_ids).only(
    "molecule_chembl_id", "molecule_structures"
)

# Iterate the provider under a progress bar, then tabulate the records.
compounds = list(tqdm(compounds_provider))
compounds_df = pd.DataFrame.from_records(compounds)

100%|██████████| 324/324 [00:00<00:00, 2663.97it/s]


In [11]:
# Clean the compound table without mutating it in place and without reusing the
# name `compounds` (the list of raw records) as a loop variable:
#   1. remove entries with any missing value,
#   2. keep the first record per molecule_chembl_id,
#   3. extract the canonical SMILES from the nested molecule_structures mapping;
#      .get() yields None when the "canonical_smiles" key is absent,
#   4. drop the nested column, then drop rows left without a SMILES.
compounds_df = (
    compounds_df
    .dropna(axis=0, how="any")
    .drop_duplicates("molecule_chembl_id", keep="first")
    .assign(
        smiles=lambda df: [
            structures.get("canonical_smiles")
            for structures in df["molecule_structures"]
        ]
    )
    .drop(columns="molecule_structures")
    .dropna(axis=0, how="any")
)


In [12]:
# Sanity check before merging: row count and column names of each table.
print(f"Bioactivities info: {len(all_bioactivities_df)}")
print(all_bioactivities_df.columns)
print(f"Compounds info: {len(compounds_df)}")
print(compounds_df.columns)

Bioactivities info: 324
Index(['activity_id', 'assay_chembl_id', 'assay_description', 'assay_type',
       'molecule_chembl_id', 'relation', 'standard_units', 'standard_value',
       'target_chembl_id', 'target_organism', 'type'],
      dtype='object')
Compounds info: 324
Index(['molecule_chembl_id', 'smiles'], dtype='object')


Merge both datasets

In [14]:
# Attach the SMILES string to each measurement: inner join on molecule_chembl_id,
# keeping only the activity columns needed in the exported table.
activity_columns = ["molecule_chembl_id", "type", "standard_units", "standard_value"]
output_df = (
    all_bioactivities_df[activity_columns]
    .merge(compounds_df, on="molecule_chembl_id")
    .reset_index(drop=True)
)
output_df.head(2)

Unnamed: 0,molecule_chembl_id,type,standard_units,standard_value,smiles
0,CHEMBL66879,Ki,nM,1000000.0,O=C(O)/C=C/c1ccc(O)cc1
1,CHEMBL77085,Ki,nM,24000.0,N#CC(C#N)=Cc1cc(O)ccc1[N+](=O)[O-]


In [15]:
# Write the merged table to a CSV file in the working directory, without the index column.
# NOTE(review): the file name is spelled "compouds" - check any downstream readers
# before correcting the spelling.
output_df.to_csv("EGFR_compouds.csv",index=False)
