# ChEMBL molecules details

Get more details on molecules from the recombined library that were found in ChEMBL:
- Get targets that these molecules are active on
- Get classification for these targets (e.g. are these targets kinases?)

In [1]:
from chembl_webresource_client.new_client import new_client
import pandas as pd
import requests
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole

## Load ChEMBL molecules

These are the molecules from the recombined library that were found in ChEMBL.

In [2]:
with open('novel_chembl_mols.txt', 'r') as f:
    lines = f.readlines()
    
molecules = [i[:-1] for i in lines]
molecules = [Chem.InchiToInchiKey(inchi) for inchi in molecules]

In [3]:
len(molecules)

121

## Get molecules details from ChEMBL

In [4]:
similarity = new_client.similarity
compound = new_client.molecule
activity = new_client.activity
target = new_client.target

### Get molecule ChEMBL IDs

In [5]:
details = []

for molecule in molecules:
    
    result = compound.get(molecule) # similarity.filter(smiles=molecule, similarity=100)
    
    details.append({
            'smiles': result['molecule_structures']['canonical_smiles'], 
            'inchi': result['molecule_structures']['standard_inchi'],
            'atc_classification': result['atc_classifications'], 
            'first_approval': result['first_approval'],
            'chembl_id': result['molecule_chembl_id']
    })
        
details = pd.DataFrame(details)

In [6]:
print(f'Number of queries: {len(molecules)}')
print(f'Number of query results: {details.shape[0]}')

Number of queries: 121
Number of query results: 121


In [7]:
details.head()

Unnamed: 0,smiles,inchi,atc_classification,first_approval,chembl_id
0,NC(=O)Nc1sc(cc1C(=O)N[C@H]2CCCNC2)c3ccsc3,InChI=1S/C15H18N4O2S2/c16-15(21)19-14-11(6-12(...,[],,CHEMBL520734
1,COc1ccc(cc1)C(=O)Nc2ccccc2,InChI=1S/C14H13NO2/c1-17-13-9-7-11(8-10-13)14(...,[],,CHEMBL2064599
2,O=C(Nc1ccccc1)C2CCCCC2,InChI=1S/C13H17NO/c15-13(11-7-3-1-4-8-11)14-12...,[],,CHEMBL1562513
3,Cc1nc(c2ccccc2)n3nc(NC4CCCCC4)ncc13,InChI=1S/C18H21N5/c1-13-16-12-19-18(21-15-10-6...,[],,CHEMBL487776
4,Clc1ccc(NC(=O)c2csc(n2)c3ccccc3)cc1,InChI=1S/C16H11ClN2OS/c17-12-6-8-13(9-7-12)18-...,[],,CHEMBL1446166


### Get measured activities for molecules

In [8]:
activities = activity.filter(molecule_chembl_id__in = list(details.chembl_id),
                             type = 'IC50', 
                             relation = '=', 
                             assay_type = 'B') \
                    .only('activity_id','assay_chembl_id', 'assay_description', 'assay_type', \
                        'molecule_chembl_id', 'type', 'units', 'relation', 'value', \
                        'target_chembl_id', 'target_organism', 'target_pref_name')

In [9]:
len(activities)

69

In [10]:
bioact_df = pd.DataFrame.from_records(activities)
bioact_df.shape

(70, 12)

In [11]:
print(f'Number of unique molecules: {len(bioact_df.molecule_chembl_id.unique())}')

Number of unique molecules: 33


In [12]:
bioact_df.units.unique()

array(['mM', 'uM', 'nM', 'mmol/L'], dtype=object)

In [13]:
def convert_to_nM(unit, bioactivity):
    
    conversion_factors = {
        "pM": 1e-3,
        "10'-11M": 1e-2,
        "10'-10M": 1e-1,
        "nM": 1e+0,
        "10'-8M": 1e+1,
        "10'-1microM": 1e+2,
        "10'-7M": 1e+2,
        "uM": 1e+3,
        "/uM": 1e+3,
        "10'-6M": 1e+3,
        "10'1 uM": 1e+4,
        "10'2 uM": 1e+5,
        "mM": 1e+6,
        "mmol/L": 1e+6,
        "M": 1e+9
    }
    
    try:
        return float(bioactivity) * conversion_factors[unit]
        
    except KeyError:
        print(f'Unit not recognized: {unit}')
        return

In [14]:
bioact_df.rename(columns={"units": "units_original", "value": "value_original"}, inplace=True)

In [15]:
# Convert all to nM
bioactivity_nM = []
for i, row in bioact_df.iterrows():
    bioact_nM = convert_to_nM(row['units_original'], row['value_original'])
    bioactivity_nM.append(bioact_nM)
bioact_df['value'] = bioactivity_nM
bioact_df['units'] = 'nM'
bioact_df.shape

(70, 14)

In [16]:
bioact_df.head()

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,target_chembl_id,target_organism,target_pref_name,type,units_original,value_original,value,units
0,314877,CHEMBL751094,In vitro inhibitory activity against H1N9 stra...,B,CHEMBL414,=,CHEMBL3046,Homo sapiens,Sialidase 3,IC50,mM,2.5,2500000.0,nM
1,314877,CHEMBL751094,In vitro inhibitory activity against H1N9 stra...,B,CHEMBL414,=,CHEMBL3046,Homo sapiens,Sialidase 3,IC50,mM,2.5,2500000.0,nM
2,388701,CHEMBL615559,In vitro inhibition against of 4-Hydroxyphenyl...,B,CHEMBL297343,=,CHEMBL3203,Sus scrofa,4-hydroxyphenylpyruvate dioxygenase,IC50,uM,6.0,6000.0,nM
3,477909,CHEMBL661123,Evaluated for inhibition of human cyclin depen...,B,CHEMBL269827,=,CHEMBL301,Homo sapiens,Cyclin-dependent kinase 2,IC50,nM,2.3,2.3,nM
4,479851,CHEMBL658939,In vitro inhibition of Cyclin-dependent kinase 2,B,CHEMBL311992,=,CHEMBL301,Homo sapiens,Cyclin-dependent kinase 2,IC50,nM,12.0,12.0,nM


In [17]:
bioact_df = bioact_df.astype({'value': 'float'})

### Get all active molecules

In [18]:
activity_threshold = 500

In [19]:
bioact_df_active = bioact_df[bioact_df.value <= activity_threshold].copy()  # try out with 500, 50, 5

In [20]:
bioact_df_active.shape

(28, 14)

In [21]:
len(bioact_df_active.molecule_chembl_id.unique())

14

In [22]:
bioact_df_active.head()

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,target_chembl_id,target_organism,target_pref_name,type,units_original,value_original,value,units
3,477909,CHEMBL661123,Evaluated for inhibition of human cyclin depen...,B,CHEMBL269827,=,CHEMBL301,Homo sapiens,Cyclin-dependent kinase 2,IC50,nM,2.3,2.3,nM
4,479851,CHEMBL658939,In vitro inhibition of Cyclin-dependent kinase 2,B,CHEMBL311992,=,CHEMBL301,Homo sapiens,Cyclin-dependent kinase 2,IC50,nM,12.0,12.0,nM
5,1021445,CHEMBL732386,Inhibitory concentration against rat brain mit...,B,CHEMBL300999,=,CHEMBL3358,Rattus norvegicus,Monoamine oxidase A,IC50,uM,0.002,2.0,nM
6,1168727,CHEMBL666140,Inhibitory activity against human cyclin-depen...,B,CHEMBL269827,=,CHEMBL308,Homo sapiens,Cyclin-dependent kinase 1,IC50,nM,10.0,10.0,nM
7,1168728,CHEMBL661125,Inhibitory activity against human cyclin-depen...,B,CHEMBL269827,=,CHEMBL301,Homo sapiens,Cyclin-dependent kinase 2,IC50,nM,2.3,2.3,nM


#### Check number of unique target names

In [23]:
bioact_df_active.target_pref_name.value_counts()

Cyclin-dependent kinase 2                                4
Cyclooxygenase-2                                         2
Cyclin-dependent kinase 1                                2
Serine/threonine-protein kinase PIM1                     2
Epoxide hydrolase 1                                      2
Cell division cycle 7-related protein kinase             2
Serine/threonine-protein kinase Chk1                     2
7,8-dihydro-8-oxoguanine triphosphatase                  1
Casein kinase II alpha'/ beta                            1
Leucine-rich repeat serine/threonine-protein kinase 2    1
CDK2/Cyclin A                                            1
Serine/threonine-protein kinase PIM2                     1
ATP-binding cassette sub-family G member 2               1
Monoamine oxidase A                                      1
CDK1/Cyclin A                                            1
Serine/threonine-protein kinase PIM3                     1
Serine/threonine-protein kinase mTOR                    

In [24]:
len(bioact_df_active.target_pref_name)

28

In [25]:
len(bioact_df_active.target_pref_name.value_counts())

19

#### Check number of unique ChEMBL target IDs

In [26]:
bioact_df_active.target_chembl_id.value_counts()

CHEMBL301        4
CHEMBL308        2
CHEMBL4630       2
CHEMBL4102       2
CHEMBL2147       2
CHEMBL5443       2
CHEMBL1075293    1
CHEMBL3038469    1
CHEMBL5393       1
CHEMBL3883328    1
CHEMBL3358       1
CHEMBL2842       1
CHEMBL1075104    1
CHEMBL3038477    1
CHEMBL3708265    1
CHEMBL5407       1
CHEMBL3038467    1
CHEMBL1968       1
CHEMBL612545     1
CHEMBL4523       1
Name: target_chembl_id, dtype: int64

In [27]:
len(bioact_df_active.target_chembl_id)

28

In [28]:
len(bioact_df_active.target_chembl_id.value_counts())

20

### Multiple assay tests per molecule

In [29]:
bioact_df_active.groupby(by='molecule_chembl_id')['value'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
molecule_chembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CHEMBL1410991,1.0,211.0,,211.0,211.0,211.0,211.0,211.0
CHEMBL1709874,1.0,100.0,,100.0,100.0,100.0,100.0,100.0
CHEMBL2030386,7.0,39.928571,42.131173,4.0,5.25,13.0,78.5,95.0
CHEMBL253195,1.0,270.0,,270.0,270.0,270.0,270.0,270.0
CHEMBL254465,1.0,110.0,,110.0,110.0,110.0,110.0,110.0
CHEMBL269827,7.0,5.6,4.115823,2.3,2.3,2.3,10.0,10.0
CHEMBL300999,1.0,2.0,,2.0,2.0,2.0,2.0,2.0
CHEMBL311992,1.0,12.0,,12.0,12.0,12.0,12.0,12.0
CHEMBL3326132,1.0,64.0,,64.0,64.0,64.0,64.0,64.0
CHEMBL3661365,2.0,75.0,35.355339,50.0,62.5,75.0,87.5,100.0


In [30]:
len(bioact_df_active.molecule_chembl_id.unique())

14

### Get all ATC classified molecules

In [31]:
# check if first_approval field is not empty
details[details.first_approval == details.first_approval]

Unnamed: 0,smiles,inchi,atc_classification,first_approval,chembl_id


In [32]:
# check if atc classified field is not empty
details[details.atc_classification.str.len() != 0]

Unnamed: 0,smiles,inchi,atc_classification,first_approval,chembl_id


### Get all approved drugs

In [33]:
approved_drugs = compound.filter(molecule_chembl_id__in = list(details.chembl_id), max_phase=4)

In [34]:
len(approved_drugs)

1

In [35]:
approved_mol = approved_drugs[0]

In [36]:
chembl_id = approved_mol['molecule_chembl_id']
chembl_id

'CHEMBL1909282'

In [37]:
activities = activity.filter(molecule_chembl_id = chembl_id) \
              .filter(type = 'IC50') \
              .filter(assay_type = 'B') \
              .only('activity_id','assay_chembl_id', 'assay_description', 'assay_type', \
                    'molecule_chembl_id', 'type', 'units', 'relation', 'value', \
                    'target_chembl_id', 'target_organism')

In [38]:
act = pd.DataFrame(list(activities))
act

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,target_chembl_id,target_organism,type,units,value
0,15460367,CHEMBL3540574,Inhibition of Sprague-Dawley rat Bsep expresse...,B,CHEMBL1909282,>,CHEMBL2073674,Rattus norvegicus,IC50,uM,1000
1,15460435,CHEMBL3540573,Inhibition of human BSEP expressed in plasma m...,B,CHEMBL1909282,>,CHEMBL6020,Homo sapiens,IC50,uM,1000


### Get ChEMBL target details: Protein target classification

In order to access the protein target classification, we follow the instructions given here:

https://github.com/chembl/chembl_webservices_2/issues/95

In [39]:
target_chembl_ids = list(bioact_df_active.target_chembl_id.unique())
print(len(target_chembl_ids))
target_chembl_ids

20


['CHEMBL301',
 'CHEMBL3358',
 'CHEMBL308',
 'CHEMBL4102',
 'CHEMBL5443',
 'CHEMBL4630',
 'CHEMBL3038477',
 'CHEMBL2147',
 'CHEMBL4523',
 'CHEMBL5407',
 'CHEMBL3883328',
 'CHEMBL1075104',
 'CHEMBL1075293',
 'CHEMBL1968',
 'CHEMBL3038467',
 'CHEMBL3038469',
 'CHEMBL3708265',
 'CHEMBL5393',
 'CHEMBL2842',
 'CHEMBL612545']

#### Define request functions for `target`, `target_components` and `protein_class` ChEMBL endpoints

In [40]:
def extract_component_id_from_target(target_chembl_id):
    """
    Go to `target` endpoint and extract `component_id`
    """
    
    target_url = f'https://www.ebi.ac.uk/chembl/api/data/target/{target_chembl_id}.json'
    #print(target_url)
    
    response = requests.get(target_url)
    response.raise_for_status()
    result = response.json()

    component_ids = [i['component_id'] for i in result['target_components']]
    return component_ids


In [41]:
def extract_protein_classification_id_from_target_components(component_id):
    """
    Go to `target_components` endpoint and extract `protein_classification_id`
    """
    
    target_components_url = f'https://www.ebi.ac.uk/chembl/api/data/target_component/{component_id}.json'
    #print(target_components_url)

    response = requests.get(target_components_url)
    response.raise_for_status()  # this line checks for potential errors
    result = response.json()

    protein_classification_ids = [i['protein_classification_id'] for i in result['protein_classifications']]

    return protein_classification_ids

In [42]:
def get_protein_target_classification_from_protein_class(protein_classification_id):
    """
    Go to `protein_class` endpoint and extract protein target classification.
    """

    protein_class_url = f'https://www.ebi.ac.uk/chembl/api/data/protein_class/{protein_classification_id}.json'
    #print(protein_class_url)

    response = requests.get(protein_class_url)
    response.raise_for_status()  # this line checks for potential errors
    result = response.json()

    return pd.Series(result)

In [43]:
def get_protein_target_classifications(target_chembl_ids):
    """
    Get protein target classifications for a list of target ChEMBL IDs (in the form of a DataFrame).
    """

    results = []

    for target_chembl_id in target_chembl_ids:
        #print(target_chembl_id)

        # Go to `target` endpoint and extract `component_id`
        component_ids = extract_component_id_from_target(target_chembl_id)
        
        if len(component_ids) != 1:
            print(f'{target_chembl_id}: {len(component_ids)} component IDs for target ChEMBL ID.')

        for component_id in component_ids:
            #print(component_id)

            # Go to `target_components` endpoint and extract `protein_classification_id`
            protein_classification_ids = extract_protein_classification_id_from_target_components(component_id)

            if len(protein_classification_ids) != 1:
                print(f'{target_chembl_id}: {len(protein_classification_ids)} protein classification IDs for target ChEMBL ID.\n')    
            
            for protein_classification_id in protein_classification_ids:
                #print(protein_classification_id)

                # Go to `protein_class` endpoint and extract protein target classification.
                protein_target_classification = get_protein_target_classification_from_protein_class(protein_classification_id)

                # Add ID details
                protein_target_classification['target_chembl_id'] = target_chembl_id
                protein_target_classification['component_id'] = component_id
                protein_target_classification['protein_classification_id'] = protein_classification_id

                results.append(protein_target_classification)
                
    return pd.DataFrame(results)

#### Get protein target classification for target ChEMBL IDs

Check for each query, how many protein target classification entries are available!

In [44]:
len(target_chembl_ids)

20

In [45]:
protein_target_classifications = get_protein_target_classifications(target_chembl_ids)

CHEMBL3038477: 2 component IDs for target ChEMBL ID.
CHEMBL3883328: 2 component IDs for target ChEMBL ID.
CHEMBL3038467: 2 component IDs for target ChEMBL ID.
CHEMBL3038469: 2 component IDs for target ChEMBL ID.
CHEMBL612545: 0 component IDs for target ChEMBL ID.


In [46]:
protein_target_classifications.shape

(23, 12)

In [47]:
len(protein_target_classifications.target_chembl_id.unique())

19

#### Look at protein target classifications

In [48]:
protein_target_classifications

Unnamed: 0,l1,l2,l3,l4,l5,l6,l7,l8,protein_class_id,target_chembl_id,component_id,protein_classification_id
0,Enzyme,Kinase,Protein Kinase,CMGC protein kinase group,CMGC protein kinase CDK family,CMGC protein kinase CDC2 subfamily,,,323,CHEMBL301,36,323
1,Enzyme,,,,,,,,1,CHEMBL3358,1680,1
2,Enzyme,Kinase,Protein Kinase,CMGC protein kinase group,CMGC protein kinase CDK family,CMGC protein kinase CDC2 subfamily,,,323,CHEMBL308,7,323
3,Enzyme,Oxidoreductase,,,,,,,10,CHEMBL4102,2420,10
4,Enzyme,Kinase,Protein Kinase,Other protein kinase group,Other protein kinase CDC7 family,,,,271,CHEMBL5443,5015,271
5,Enzyme,Kinase,Protein Kinase,CAMK protein kinase group,CAMK protein kinase CAMK1 family,CAMK protein kinase CHK1 subfamily,,,411,CHEMBL4630,2947,411
6,Enzyme,Kinase,Protein kinase regulatory subunit,,,,,,129,CHEMBL3038477,699,129
7,Enzyme,Kinase,Protein Kinase,Other protein kinase group,Other protein kinase CK2 family,,,,252,CHEMBL3038477,1946,252
8,Enzyme,Kinase,Protein Kinase,CAMK protein kinase group,CAMK protein kinase PIM family,,,,235,CHEMBL2147,489,235
9,Enzyme,Kinase,Protein Kinase,CAMK protein kinase group,CAMK protein kinase PIM family,,,,235,CHEMBL4523,2840,235


#### Per target ChEMBL ID, drop duplicated content (manually!)

Get indices that we want to drop - this is a manual step because we need to decide which entry per target ChEMBL ID we would like to keep/toss.

In [49]:
protein_target_classifications.groupby(by='target_chembl_id').apply(lambda x: x if (x.shape[0] > 1) else None).dropna(how='all')

Unnamed: 0,l1,l2,l3,l4,l5,l6,l7,l8,protein_class_id,target_chembl_id,component_id,protein_classification_id
6,Enzyme,Kinase,Protein kinase regulatory subunit,,,,,,129.0,CHEMBL3038477,699.0,129.0
7,Enzyme,Kinase,Protein Kinase,Other protein kinase group,Other protein kinase CK2 family,,,,252.0,CHEMBL3038477,1946.0,252.0
11,Enzyme,Kinase,Protein kinase regulatory subunit,,,,,,129.0,CHEMBL3883328,699.0,129.0
12,Enzyme,Kinase,Protein Kinase,Other protein kinase group,Other protein kinase CK2 family,,,,252.0,CHEMBL3883328,2389.0,252.0
16,Enzyme,Kinase,Protein Kinase,CMGC protein kinase group,CMGC protein kinase CDK family,CMGC protein kinase CDC2 subfamily,,,323.0,CHEMBL3038467,7.0,323.0
17,Other cytosolic protein,,,,,,,,8.0,CHEMBL3038467,918.0,8.0
18,Enzyme,Kinase,Protein Kinase,CMGC protein kinase group,CMGC protein kinase CDK family,CMGC protein kinase CDC2 subfamily,,,323.0,CHEMBL3038469,36.0,323.0
19,Other cytosolic protein,,,,,,,,8.0,CHEMBL3038469,918.0,8.0


In [50]:
# Manual step!!
drop_class_ix = [6, 11, 17, 19]

In [51]:
protein_target_classifications.shape

(23, 12)

In [52]:
protein_target_classifications.drop(drop_class_ix, inplace=True)

In [53]:
protein_target_classifications.shape

(19, 12)

#### Show enzyme group distribution

In [54]:
protein_target_classifications.groupby(by='l2').size()

l2
Kinase                        13
Oxidoreductase                 1
Primary active transporter     1
Protease                       2
dtype: int64

### Now combine molecule with target information!

In [55]:
protein_target_classifications.shape

(19, 12)

In [56]:
bioact_df_active.shape

(28, 14)

In [57]:
molecules_targets_details = pd.merge(bioact_df_active, protein_target_classifications, on='target_chembl_id', how='left')

In [58]:
molecules_targets_details.sort_values(by='molecule_chembl_id')

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,target_chembl_id,target_organism,target_pref_name,type,...,l2,l3,l4,l5,l6,l7,l8,protein_class_id,component_id,protein_classification_id
25,18072484,CHEMBL4015695,Inhibition of recombinant human GFP-fused ABCG...,B,CHEMBL1410991,=,CHEMBL5393,Homo sapiens,ATP-binding cassette sub-family G member 2,IC50,...,Primary active transporter,ATP-binding cassette,ABCG subfamily,,,,,750.0,3688.0,750.0
24,16844205,CHEMBL3877226,Inhibition of DsRed-fused MTH1 (unknown origin...,B,CHEMBL1709874,=,CHEMBL3708265,Homo sapiens,"7,8-dihydro-8-oxoguanine triphosphatase",IC50,...,,,,,,,,1.0,10418.0,1.0
13,10893468,CHEMBL2034661,Inhibition of human tertrameric CK2alphabeta h...,B,CHEMBL2030386,=,CHEMBL3038477,Homo sapiens,Casein kinase II alpha/beta,IC50,...,Kinase,Protein Kinase,Other protein kinase group,Other protein kinase CK2 family,,,,252.0,1946.0,252.0
18,10900705,CHEMBL2038011,Inhibition of N-terminal MBP-tagged human alph...,B,CHEMBL2030386,=,CHEMBL3883328,Homo sapiens,Casein kinase II alpha'/ beta,IC50,...,Kinase,Protein Kinase,Other protein kinase group,Other protein kinase CK2 family,,,,252.0,2389.0,252.0
17,10900658,CHEMBL2038010,Inhibition of N-terminal His-tagged human PIM3...,B,CHEMBL2030386,=,CHEMBL5407,Homo sapiens,Serine/threonine-protein kinase PIM3,IC50,...,Kinase,Protein Kinase,CAMK protein kinase group,CAMK protein kinase PIM family,,,,235.0,3702.0,235.0
16,10900611,CHEMBL2038009,Inhibition of N-terminal His-tagged human PIM2...,B,CHEMBL2030386,=,CHEMBL4523,Homo sapiens,Serine/threonine-protein kinase PIM2,IC50,...,Kinase,Protein Kinase,CAMK protein kinase group,CAMK protein kinase PIM family,,,,235.0,2840.0,235.0
15,10900564,CHEMBL2038008,Inhibition of N-terminal His-tagged human PIM1...,B,CHEMBL2030386,=,CHEMBL2147,Homo sapiens,Serine/threonine-protein kinase PIM1,IC50,...,Kinase,Protein Kinase,CAMK protein kinase group,CAMK protein kinase PIM family,,,,235.0,489.0,235.0
14,10893469,CHEMBL2034662,Inhibition of N-terminus His-tagged human PIM1...,B,CHEMBL2030386,=,CHEMBL2147,Homo sapiens,Serine/threonine-protein kinase PIM1,IC50,...,Kinase,Protein Kinase,CAMK protein kinase group,CAMK protein kinase PIM family,,,,235.0,489.0,235.0
12,10893467,CHEMBL2034660,Inhibition of N-terminus Myc-tagged human CDC7...,B,CHEMBL2030386,=,CHEMBL5443,Homo sapiens,Cell division cycle 7-related protein kinase,IC50,...,Kinase,Protein Kinase,Other protein kinase group,Other protein kinase CDC7 family,,,,271.0,5015.0,271.0
6,2078467,CHEMBL946051,Inhibition of ovine COX2 by chemiluminescent a...,B,CHEMBL253195,=,CHEMBL4102,Ovis aries,Cyclooxygenase-2,IC50,...,Oxidoreductase,,,,,,,10.0,2420.0,10.0


In [59]:
columns_of_interest = 'molecule_chembl_id target_chembl_id assay_chembl_id target_organism target_pref_name l2 l3 l4 l4 l5 l6 value units assay_description'.split()

In [60]:
molecules_of_interest = molecules_targets_details[columns_of_interest]
molecules_of_interest.shape

(28, 14)

In [61]:
molecules_of_interest

Unnamed: 0,molecule_chembl_id,target_chembl_id,assay_chembl_id,target_organism,target_pref_name,l2,l3,l4,l4.1,l5,l6,value,units,assay_description
0,CHEMBL269827,CHEMBL301,CHEMBL661123,Homo sapiens,Cyclin-dependent kinase 2,Kinase,Protein Kinase,CMGC protein kinase group,CMGC protein kinase group,CMGC protein kinase CDK family,CMGC protein kinase CDC2 subfamily,2.3,nM,Evaluated for inhibition of human cyclin depen...
1,CHEMBL311992,CHEMBL301,CHEMBL658939,Homo sapiens,Cyclin-dependent kinase 2,Kinase,Protein Kinase,CMGC protein kinase group,CMGC protein kinase group,CMGC protein kinase CDK family,CMGC protein kinase CDC2 subfamily,12.0,nM,In vitro inhibition of Cyclin-dependent kinase 2
2,CHEMBL300999,CHEMBL3358,CHEMBL732386,Rattus norvegicus,Monoamine oxidase A,,,,,,,2.0,nM,Inhibitory concentration against rat brain mit...
3,CHEMBL269827,CHEMBL308,CHEMBL666140,Homo sapiens,Cyclin-dependent kinase 1,Kinase,Protein Kinase,CMGC protein kinase group,CMGC protein kinase group,CMGC protein kinase CDK family,CMGC protein kinase CDC2 subfamily,10.0,nM,Inhibitory activity against human cyclin-depen...
4,CHEMBL269827,CHEMBL301,CHEMBL661125,Homo sapiens,Cyclin-dependent kinase 2,Kinase,Protein Kinase,CMGC protein kinase group,CMGC protein kinase group,CMGC protein kinase CDK family,CMGC protein kinase CDC2 subfamily,2.3,nM,Inhibitory activity against human cyclin-depen...
5,CHEMBL254465,CHEMBL4102,CHEMBL946051,Ovis aries,Cyclooxygenase-2,Oxidoreductase,,,,,,110.0,nM,Inhibition of ovine COX2 by chemiluminescent a...
6,CHEMBL253195,CHEMBL4102,CHEMBL946051,Ovis aries,Cyclooxygenase-2,Oxidoreductase,,,,,,270.0,nM,Inhibition of ovine COX2 by chemiluminescent a...
7,CHEMBL269827,CHEMBL301,CHEMBL952109,Homo sapiens,Cyclin-dependent kinase 2,Kinase,Protein Kinase,CMGC protein kinase group,CMGC protein kinase group,CMGC protein kinase CDK family,CMGC protein kinase CDC2 subfamily,2.3,nM,Inhibition of CDK2
8,CHEMBL269827,CHEMBL308,CHEMBL952119,Homo sapiens,Cyclin-dependent kinase 1,Kinase,Protein Kinase,CMGC protein kinase group,CMGC protein kinase group,CMGC protein kinase CDK family,CMGC protein kinase CDC2 subfamily,10.0,nM,Inhibition of CDK1
9,CHEMBL526133,CHEMBL5443,CHEMBL998766,Homo sapiens,Cell division cycle 7-related protein kinase,Kinase,Protein Kinase,Other protein kinase group,Other protein kinase group,Other protein kinase CDC7 family,,10.0,nM,Inhibition of CDC7-DBF4


#### Filter molecules of interest

- Human targets
- Kinases

In [62]:
molecules_selected = molecules_of_interest[molecules_of_interest.target_organism == 'Homo sapiens'].copy()
molecules_selected.drop(['target_organism'], axis=1, inplace=True)
molecules_selected.shape

(23, 13)

In [63]:
molecules_selected

Unnamed: 0,molecule_chembl_id,target_chembl_id,assay_chembl_id,target_pref_name,l2,l3,l4,l4.1,l5,l6,value,units,assay_description
0,CHEMBL269827,CHEMBL301,CHEMBL661123,Cyclin-dependent kinase 2,Kinase,Protein Kinase,CMGC protein kinase group,CMGC protein kinase group,CMGC protein kinase CDK family,CMGC protein kinase CDC2 subfamily,2.3,nM,Evaluated for inhibition of human cyclin depen...
1,CHEMBL311992,CHEMBL301,CHEMBL658939,Cyclin-dependent kinase 2,Kinase,Protein Kinase,CMGC protein kinase group,CMGC protein kinase group,CMGC protein kinase CDK family,CMGC protein kinase CDC2 subfamily,12.0,nM,In vitro inhibition of Cyclin-dependent kinase 2
3,CHEMBL269827,CHEMBL308,CHEMBL666140,Cyclin-dependent kinase 1,Kinase,Protein Kinase,CMGC protein kinase group,CMGC protein kinase group,CMGC protein kinase CDK family,CMGC protein kinase CDC2 subfamily,10.0,nM,Inhibitory activity against human cyclin-depen...
4,CHEMBL269827,CHEMBL301,CHEMBL661125,Cyclin-dependent kinase 2,Kinase,Protein Kinase,CMGC protein kinase group,CMGC protein kinase group,CMGC protein kinase CDK family,CMGC protein kinase CDC2 subfamily,2.3,nM,Inhibitory activity against human cyclin-depen...
7,CHEMBL269827,CHEMBL301,CHEMBL952109,Cyclin-dependent kinase 2,Kinase,Protein Kinase,CMGC protein kinase group,CMGC protein kinase group,CMGC protein kinase CDK family,CMGC protein kinase CDC2 subfamily,2.3,nM,Inhibition of CDK2
8,CHEMBL269827,CHEMBL308,CHEMBL952119,Cyclin-dependent kinase 1,Kinase,Protein Kinase,CMGC protein kinase group,CMGC protein kinase group,CMGC protein kinase CDK family,CMGC protein kinase CDC2 subfamily,10.0,nM,Inhibition of CDK1
9,CHEMBL526133,CHEMBL5443,CHEMBL998766,Cell division cycle 7-related protein kinase,Kinase,Protein Kinase,Other protein kinase group,Other protein kinase group,Other protein kinase CDC7 family,,10.0,nM,Inhibition of CDC7-DBF4
10,CHEMBL520734,CHEMBL4630,CHEMBL944207,Serine/threonine-protein kinase Chk1,Kinase,Protein Kinase,CAMK protein kinase group,CAMK protein kinase group,CAMK protein kinase CAMK1 family,CAMK protein kinase CHK1 subfamily,6.0,nM,Inhibition of Chk1
11,CHEMBL470288,CHEMBL4630,CHEMBL944207,Serine/threonine-protein kinase Chk1,Kinase,Protein Kinase,CAMK protein kinase group,CAMK protein kinase group,CAMK protein kinase CAMK1 family,CAMK protein kinase CHK1 subfamily,10.0,nM,Inhibition of Chk1
12,CHEMBL2030386,CHEMBL5443,CHEMBL2034660,Cell division cycle 7-related protein kinase,Kinase,Protein Kinase,Other protein kinase group,Other protein kinase group,Other protein kinase CDC7 family,,5.5,nM,Inhibition of N-terminus Myc-tagged human CDC7...


In [64]:
molecules_selected = molecules_selected[molecules_selected.l2 == 'Kinase'].copy()
molecules_selected.drop(['l2'], axis=1, inplace=True)
molecules_selected.shape

(20, 12)

In [65]:
molecules_selected.sort_values(by=['molecule_chembl_id', 'value'])

Unnamed: 0,molecule_chembl_id,target_chembl_id,assay_chembl_id,target_pref_name,l3,l4,l4.1,l5,l6,value,units,assay_description
17,CHEMBL2030386,CHEMBL5407,CHEMBL2038010,Serine/threonine-protein kinase PIM3,Protein Kinase,CAMK protein kinase group,CAMK protein kinase group,CAMK protein kinase PIM family,,4.0,nM,Inhibition of N-terminal His-tagged human PIM3...
15,CHEMBL2030386,CHEMBL2147,CHEMBL2038008,Serine/threonine-protein kinase PIM1,Protein Kinase,CAMK protein kinase group,CAMK protein kinase group,CAMK protein kinase PIM family,,5.0,nM,Inhibition of N-terminal His-tagged human PIM1...
12,CHEMBL2030386,CHEMBL5443,CHEMBL2034660,Cell division cycle 7-related protein kinase,Protein Kinase,Other protein kinase group,Other protein kinase group,Other protein kinase CDC7 family,,5.5,nM,Inhibition of N-terminus Myc-tagged human CDC7...
14,CHEMBL2030386,CHEMBL2147,CHEMBL2034662,Serine/threonine-protein kinase PIM1,Protein Kinase,CAMK protein kinase group,CAMK protein kinase group,CAMK protein kinase PIM family,,13.0,nM,Inhibition of N-terminus His-tagged human PIM1...
16,CHEMBL2030386,CHEMBL4523,CHEMBL2038009,Serine/threonine-protein kinase PIM2,Protein Kinase,CAMK protein kinase group,CAMK protein kinase group,CAMK protein kinase PIM family,,68.0,nM,Inhibition of N-terminal His-tagged human PIM2...
13,CHEMBL2030386,CHEMBL3038477,CHEMBL2034661,Casein kinase II alpha/beta,Protein Kinase,Other protein kinase group,Other protein kinase group,Other protein kinase CK2 family,,89.0,nM,Inhibition of human tertrameric CK2alphabeta h...
18,CHEMBL2030386,CHEMBL3883328,CHEMBL2038011,Casein kinase II alpha'/ beta,Protein Kinase,Other protein kinase group,Other protein kinase group,Other protein kinase CK2 family,,95.0,nM,Inhibition of N-terminal MBP-tagged human alph...
0,CHEMBL269827,CHEMBL301,CHEMBL661123,Cyclin-dependent kinase 2,Protein Kinase,CMGC protein kinase group,CMGC protein kinase group,CMGC protein kinase CDK family,CMGC protein kinase CDC2 subfamily,2.3,nM,Evaluated for inhibition of human cyclin depen...
4,CHEMBL269827,CHEMBL301,CHEMBL661125,Cyclin-dependent kinase 2,Protein Kinase,CMGC protein kinase group,CMGC protein kinase group,CMGC protein kinase CDK family,CMGC protein kinase CDC2 subfamily,2.3,nM,Inhibitory activity against human cyclin-depen...
7,CHEMBL269827,CHEMBL301,CHEMBL952109,Cyclin-dependent kinase 2,Protein Kinase,CMGC protein kinase group,CMGC protein kinase group,CMGC protein kinase CDK family,CMGC protein kinase CDC2 subfamily,2.3,nM,Inhibition of CDK2


In [66]:
len(molecules_selected.molecule_chembl_id.unique())

8

In [67]:
molecules_selected.groupby(by='molecule_chembl_id').mean().sort_values('value')

Unnamed: 0_level_0,value
molecule_chembl_id,Unnamed: 1_level_1
CHEMBL269827,5.6
CHEMBL520734,6.0
CHEMBL470288,10.0
CHEMBL526133,10.0
CHEMBL311992,12.0
CHEMBL2030386,39.928571
CHEMBL3326132,64.0
CHEMBL4068426,415.0
