# ChEMBL molecules details

Get more details on molecules from the recombined library that were found in ChEMBL:
- Get targets that these molecules are active on
- Get classification for these targets (e.g. are these targets kinases?)

In [1]:
from chembl_webresource_client.new_client import new_client
import pandas as pd
import requests
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole

  curious_george.patch_all(thread=False, select=False)


## Load ChEMBL molecules

These are the molecules from the recombined library that were found in ChEMBL.

In [2]:
with open('novel_chembl_mols.txt', 'r') as f:
    lines = f.readlines()
    
molecules = [i[:-1] for i in lines]
molecules = [Chem.InchiToInchiKey(inchi) for inchi in molecules]

In [3]:
len(molecules)

121

## Get molecules details from ChEMBL

In [4]:
similarity = new_client.similarity
compound = new_client.molecule
activity = new_client.activity
target = new_client.target

### Get molecule ChEMBL IDs

In [5]:
details = []

for molecule in molecules:
    
    result = compound.get(molecule) # similarity.filter(smiles=molecule, similarity=100)
    
    details.append({
            'smiles': result['molecule_structures']['canonical_smiles'], 
            'inchi': result['molecule_structures']['standard_inchi'],
            'atc_classification': result['atc_classifications'], 
            'first_approval': result['first_approval'],
            'chembl_id': result['molecule_chembl_id']
    })
        
details = pd.DataFrame(details)

In [6]:
print(f'Number of queries: {len(molecules)}')
print(f'Number of query results: {details.shape[0]}')

Number of queries: 121
Number of query results: 121


In [7]:
details.head()

Unnamed: 0,atc_classification,chembl_id,first_approval,inchi,smiles
0,[],CHEMBL520734,,InChI=1S/C15H18N4O2S2/c16-15(21)19-14-11(6-12(...,NC(=O)Nc1sc(cc1C(=O)N[C@H]2CCCNC2)c3ccsc3
1,[],CHEMBL2064599,,InChI=1S/C14H13NO2/c1-17-13-9-7-11(8-10-13)14(...,COc1ccc(cc1)C(=O)Nc2ccccc2
2,[],CHEMBL1562513,,InChI=1S/C13H17NO/c15-13(11-7-3-1-4-8-11)14-12...,O=C(Nc1ccccc1)C2CCCCC2
3,[],CHEMBL487776,,InChI=1S/C18H21N5/c1-13-16-12-19-18(21-15-10-6...,Cc1nc(c2ccccc2)n3nc(NC4CCCCC4)ncc13
4,[],CHEMBL1446166,,InChI=1S/C16H11ClN2OS/c17-12-6-8-13(9-7-12)18-...,Clc1ccc(NC(=O)c2csc(n2)c3ccccc3)cc1


### Get measured activities for molecules

In [8]:
activities = activity.filter(molecule_chembl_id__in = list(details.chembl_id),
                             type = 'IC50', 
                             relation = '=', 
                             assay_type = 'B') \
                    .only('activity_id','assay_chembl_id', 'assay_description', 'assay_type', \
                        'molecule_chembl_id', 'type', 'units', 'relation', 'value', \
                        'target_chembl_id', 'target_organism', 'target_pref_name')

In [9]:
len(activities)

69

In [10]:
# TO DO: run for all activities
bioact_df = pd.DataFrame.from_records(activities)
bioact_df.shape

(70, 12)

In [11]:
print(f'Number of unique molecules: {len(bioact_df.molecule_chembl_id.unique())}')

Number of unique molecules: 33


In [12]:
bioact_df.units.unique()

array(['mM', 'uM', 'nM', 'mmol/L'], dtype=object)

In [13]:
def convert_to_nM(unit, bioactivity):
    
    conversion_factors = {
        "pM": 1e-3,
        "10'-11M": 1e-2,
        "10'-10M": 1e-1,
        "nM": 1e+0,
        "10'-8M": 1e+1,
        "10'-1microM": 1e+2,
        "10'-7M": 1e+2,
        "uM": 1e+3,
        "/uM": 1e+3,
        "10'-6M": 1e+3,
        "10'1 uM": 1e+4,
        "10'2 uM": 1e+5,
        "mM": 1e+6,
        "mmol/L": 1e+6,
        "M": 1e+9
    }
    
    try:
        return float(bioactivity) * conversion_factors[unit]
        
    except KeyError:
        print(f'Unit not recognized: {unit}')
        return

In [14]:
bioact_df.rename(columns={"units": "units_original", "value": "value_original"}, inplace=True)

In [15]:
# convert all to nM
bioactivity_nM = []
for i, row in bioact_df.iterrows():
    bioact_nM = convert_to_nM(row['units_original'], row['value_original'])
    bioactivity_nM.append(bioact_nM)
bioact_df['value'] = bioactivity_nM
bioact_df['units'] = 'nM'
bioact_df.shape

(70, 14)

In [16]:
bioact_df.head()

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,target_chembl_id,target_organism,target_pref_name,type,units_original,value_original,value,units
0,314877,CHEMBL751094,In vitro inhibitory activity against H1N9 stra...,B,CHEMBL414,=,CHEMBL3046,Homo sapiens,Sialidase 3,IC50,mM,2.5,2500000.0,nM
1,314877,CHEMBL751094,In vitro inhibitory activity against H1N9 stra...,B,CHEMBL414,=,CHEMBL3046,Homo sapiens,Sialidase 3,IC50,mM,2.5,2500000.0,nM
2,388701,CHEMBL615559,In vitro inhibition against of 4-Hydroxyphenyl...,B,CHEMBL297343,=,CHEMBL3203,Sus scrofa,4-hydroxyphenylpyruvate dioxygenase,IC50,uM,6.0,6000.0,nM
3,477909,CHEMBL661123,Evaluated for inhibition of human cyclin depen...,B,CHEMBL269827,=,CHEMBL301,Homo sapiens,Cyclin-dependent kinase 2,IC50,nM,2.3,2.3,nM
4,479851,CHEMBL658939,In vitro inhibition of Cyclin-dependent kinase 2,B,CHEMBL311992,=,CHEMBL301,Homo sapiens,Cyclin-dependent kinase 2,IC50,nM,12.0,12.0,nM


In [17]:
bioact_df = bioact_df.astype({'value': 'float'})

### Get all active molecules

In [18]:
activity_threshold = 500

In [19]:
bioact_df_active = bioact_df[bioact_df.value <= activity_threshold].copy()  # try out with 500, 50, 5

In [20]:
bioact_df_active.shape

(28, 14)

In [21]:
bioact_df_active.head()

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,target_chembl_id,target_organism,target_pref_name,type,units_original,value_original,value,units
3,477909,CHEMBL661123,Evaluated for inhibition of human cyclin depen...,B,CHEMBL269827,=,CHEMBL301,Homo sapiens,Cyclin-dependent kinase 2,IC50,nM,2.3,2.3,nM
4,479851,CHEMBL658939,In vitro inhibition of Cyclin-dependent kinase 2,B,CHEMBL311992,=,CHEMBL301,Homo sapiens,Cyclin-dependent kinase 2,IC50,nM,12.0,12.0,nM
5,1021445,CHEMBL732386,Inhibitory concentration against rat brain mit...,B,CHEMBL300999,=,CHEMBL3358,Rattus norvegicus,Monoamine oxidase A,IC50,uM,0.002,2.0,nM
6,1168727,CHEMBL666140,Inhibitory activity against human cyclin-depen...,B,CHEMBL269827,=,CHEMBL308,Homo sapiens,Cyclin-dependent kinase 1,IC50,nM,10.0,10.0,nM
7,1168728,CHEMBL661125,Inhibitory activity against human cyclin-depen...,B,CHEMBL269827,=,CHEMBL301,Homo sapiens,Cyclin-dependent kinase 2,IC50,nM,2.3,2.3,nM


#### Check number of unique target names

In [22]:
bioact_df_active.target_pref_name.value_counts()

Cyclin-dependent kinase 2                                4
Cyclooxygenase-2                                         2
Serine/threonine-protein kinase Chk1                     2
Serine/threonine-protein kinase PIM1                     2
Cyclin-dependent kinase 1                                2
Cell division cycle 7-related protein kinase             2
Epoxide hydrolase 1                                      2
Casein kinase II alpha'/ beta                            1
Casein kinase II alpha/beta                              1
CDK1/Cyclin A                                            1
Leucine-rich repeat serine/threonine-protein kinase 2    1
CDK2/Cyclin A                                            1
Monoamine oxidase A                                      1
Serine/threonine-protein kinase PIM2                     1
7,8-dihydro-8-oxoguanine triphosphatase                  1
ATP-binding cassette sub-family G member 2               1
Serine/threonine-protein kinase PIM3                    

In [23]:
len(bioact_df_active.target_pref_name)

28

In [24]:
len(bioact_df_active.target_pref_name.value_counts())

19

#### Check number of unique ChEMBL target IDs

In [25]:
bioact_df_active.target_chembl_id.value_counts()

CHEMBL301        4
CHEMBL5443       2
CHEMBL4630       2
CHEMBL308        2
CHEMBL2147       2
CHEMBL4102       2
CHEMBL5407       1
CHEMBL5393       1
CHEMBL3038477    1
CHEMBL1075104    1
CHEMBL4523       1
CHEMBL3038467    1
CHEMBL3708265    1
CHEMBL1075293    1
CHEMBL3038469    1
CHEMBL1968       1
CHEMBL2842       1
CHEMBL612545     1
CHEMBL3358       1
CHEMBL3883328    1
Name: target_chembl_id, dtype: int64

In [26]:
len(bioact_df_active.target_chembl_id)

28

In [27]:
len(bioact_df_active.target_chembl_id.value_counts())

20

### Get all ATC classified molecules

In [28]:
# check if first_approval field is not empty
details[details.first_approval == details.first_approval]

Unnamed: 0,atc_classification,chembl_id,first_approval,inchi,smiles


In [29]:
# check if atc classified field is not empty
details[details.atc_classification.str.len() != 0]

Unnamed: 0,atc_classification,chembl_id,first_approval,inchi,smiles


### Get all approved drugs

In [30]:
approved_drugs = compound.filter(molecule_chembl_id__in = list(details.chembl_id), max_phase=4)

In [31]:
len(approved_drugs)

1

In [32]:
approved_mol = approved_drugs[0]

In [33]:
chembl_id = approved_mol['molecule_chembl_id']
chembl_id

'CHEMBL1909282'

In [34]:
activities = activity.filter(molecule_chembl_id = chembl_id) \
              .filter(type = 'IC50') \
              .filter(assay_type = 'B') \
              .only('activity_id','assay_chembl_id', 'assay_description', 'assay_type', \
                    'molecule_chembl_id', 'type', 'units', 'relation', 'value', \
                    'target_chembl_id', 'target_organism')

In [35]:
act = pd.DataFrame(list(activities))
act

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,target_chembl_id,target_organism,type,units,value
0,15460367,CHEMBL3540574,Inhibition of Sprague-Dawley rat Bsep expresse...,B,CHEMBL1909282,>,CHEMBL2073674,Rattus norvegicus,IC50,uM,1000
1,15460435,CHEMBL3540573,Inhibition of human BSEP expressed in plasma m...,B,CHEMBL1909282,>,CHEMBL6020,Homo sapiens,IC50,uM,1000


### Get ChEMBL target details: Protein target classification

In order to access the protein target classification, we follow the instructions given here:

https://github.com/chembl/chembl_webservices_2/issues/95

In [36]:
target_chembl_ids = list(bioact_df_active.target_chembl_id.unique())
target_chembl_ids

['CHEMBL301',
 'CHEMBL3358',
 'CHEMBL308',
 'CHEMBL4102',
 'CHEMBL5443',
 'CHEMBL4630',
 'CHEMBL3038477',
 'CHEMBL2147',
 'CHEMBL4523',
 'CHEMBL5407',
 'CHEMBL3883328',
 'CHEMBL1075104',
 'CHEMBL1075293',
 'CHEMBL1968',
 'CHEMBL3038467',
 'CHEMBL3038469',
 'CHEMBL3708265',
 'CHEMBL5393',
 'CHEMBL2842',
 'CHEMBL612545']

#### Define request functions for `target`, `target_components` and `protein_class` ChEMBL endpoints

In [37]:
def extract_component_id_from_target(target_chembl_id):
    """
    Go to `target` endpoint and extract `component_id`
    """
    
    target_url = f'https://www.ebi.ac.uk/chembl/api/data/target/{target_chembl_id}.json'
    #print(target_url)
    
    response = requests.get(target_url)
    response.raise_for_status()
    result = response.json()

    component_ids = [i['component_id'] for i in result['target_components']]
    return component_ids


In [38]:
def extract_protein_classification_id_from_target_components(component_id):
    """
    Go to `target_components` endpoint and extract `protein_classification_id`
    """
    
    target_components_url = f'https://www.ebi.ac.uk/chembl/api/data/target_component/{component_id}.json'
    #print(target_components_url)

    response = requests.get(target_components_url)
    response.raise_for_status()  # this line checks for potential errors
    result = response.json()

    protein_classification_ids = [i['protein_classification_id'] for i in result['protein_classifications']]

    return protein_classification_ids

In [39]:
def get_protein_target_classification_from_protein_class(protein_classification_id):
    """
    Go to `protein_class` endpoint and extract protein target classification.
    """

    protein_class_url = f'https://www.ebi.ac.uk/chembl/api/data/protein_class/{protein_classification_id}.json'
    #print(protein_class_url)

    response = requests.get(protein_class_url)
    response.raise_for_status()  # this line checks for potential errors
    result = response.json()

    return pd.Series(result)

In [40]:
def get_protein_target_classifications(target_chembl_ids):
    """
    Get protein target classifications for a list of target ChEMBL IDs (in the form of a DataFrame).
    """

    results = []

    for target_chembl_id in target_chembl_ids:
        #print(target_chembl_id)

        # Go to `target` endpoint and extract `component_id`
        component_ids = extract_component_id_from_target(target_chembl_id)

        for component_id in component_ids:
            #print(component_id)

            # Go to `target_components` endpoint and extract `protein_classification_id`
            protein_classification_ids = extract_protein_classification_id_from_target_components(component_id)

            for protein_classification_id in protein_classification_ids:
                #print(protein_classification_id)

                # Go to `protein_class` endpoint and extract protein target classification.
                protein_target_classification = get_protein_target_classification_from_protein_class(protein_classification_id)

                # Add ID details
                protein_target_classification['target_chembl_id'] = target_chembl_id
                protein_target_classification['component_id'] = component_id
                protein_target_classification['protein_classification_id'] = protein_classification_id

                results.append(protein_target_classification)
                
    return pd.DataFrame(results)

In [41]:
protein_target_classifications = get_protein_target_classifications(target_chembl_ids)

In [42]:
protein_target_classifications

Unnamed: 0,l1,l2,l3,l4,l5,l6,l7,l8,protein_class_id,target_chembl_id,component_id,protein_classification_id
0,Enzyme,Kinase,Protein Kinase,CMGC protein kinase group,CMGC protein kinase CDK family,CMGC protein kinase CDC2 subfamily,,,323,CHEMBL301,36,323
1,Enzyme,,,,,,,,1,CHEMBL3358,1680,1
2,Enzyme,Kinase,Protein Kinase,CMGC protein kinase group,CMGC protein kinase CDK family,CMGC protein kinase CDC2 subfamily,,,323,CHEMBL308,7,323
3,Enzyme,Oxidoreductase,,,,,,,10,CHEMBL4102,2420,10
4,Enzyme,Kinase,Protein Kinase,Other protein kinase group,Other protein kinase CDC7 family,,,,271,CHEMBL5443,5015,271
5,Enzyme,Kinase,Protein Kinase,CAMK protein kinase group,CAMK protein kinase CAMK1 family,CAMK protein kinase CHK1 subfamily,,,411,CHEMBL4630,2947,411
6,Enzyme,Kinase,Protein kinase regulatory subunit,,,,,,129,CHEMBL3038477,699,129
7,Enzyme,Kinase,Protein Kinase,Other protein kinase group,Other protein kinase CK2 family,,,,252,CHEMBL3038477,1946,252
8,Enzyme,Kinase,Protein Kinase,CAMK protein kinase group,CAMK protein kinase PIM family,,,,235,CHEMBL2147,489,235
9,Enzyme,Kinase,Protein Kinase,CAMK protein kinase group,CAMK protein kinase PIM family,,,,235,CHEMBL4523,2840,235
