# ChEMBL molecules details

In [1]:
from chembl_webresource_client.new_client import new_client
import pandas as pd
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole

  curious_george.patch_all(thread=False, select=False)


## Load ChEMBL molecules

In [2]:
with open('/home/paula/Masterarbeit/CombinatorialLibrary_manuscript/statistics/novel_chembl_mols.txt', 'r') as f:
    lines = f.readlines()
    
molecules = [i[:-1] for i in lines]
molecules = [Chem.InchiToInchiKey(inchi) for inchi in molecules]

## Get molecules details

In [3]:
similarity = new_client.similarity
compound = new_client.molecule
activity = new_client.activity
target = new_client.target

In [4]:
details = []

for molecule in molecules:
    
    result = compound.get(molecule) # similarity.filter(smiles=molecule, similarity=100)
    
    details.append({
            'smiles': result['molecule_structures']['canonical_smiles'], 
            'inchi': result['molecule_structures']['standard_inchi'],
            'atc_classification': result['atc_classifications'], 
            'first_approval': result['first_approval'],
            'chembl_id': result['molecule_chembl_id']
    })
        
details = pd.DataFrame(details)

In [5]:
print(f'Number of queries: {len(molecules)}')
print(f'Number of query results: {details.shape[0]}')

Number of queries: 121
Number of query results: 121


In [6]:
details

Unnamed: 0,atc_classification,chembl_id,first_approval,inchi,smiles
0,[],CHEMBL520734,,InChI=1S/C15H18N4O2S2/c16-15(21)19-14-11(6-12(...,NC(=O)Nc1sc(cc1C(=O)N[C@H]2CCCNC2)c3ccsc3
1,[],CHEMBL2064599,,InChI=1S/C14H13NO2/c1-17-13-9-7-11(8-10-13)14(...,COc1ccc(cc1)C(=O)Nc2ccccc2
2,[],CHEMBL1562513,,InChI=1S/C13H17NO/c15-13(11-7-3-1-4-8-11)14-12...,O=C(Nc1ccccc1)C2CCCCC2
3,[],CHEMBL487776,,InChI=1S/C18H21N5/c1-13-16-12-19-18(21-15-10-6...,Cc1nc(c2ccccc2)n3nc(NC4CCCCC4)ncc13
4,[],CHEMBL1446166,,InChI=1S/C16H11ClN2OS/c17-12-6-8-13(9-7-12)18-...,Clc1ccc(NC(=O)c2csc(n2)c3ccccc3)cc1
5,[],CHEMBL3183624,,"InChI=1S/C6H6O3/c1-4-2-3-5(9-4)6(7)8/h2-3H,1H3...",Cc1oc(cc1)C(=O)O
6,[],CHEMBL3979296,,InChI=1S/C17H18ClN3O/c18-14-3-1-13(2-4-14)17(2...,Clc1ccc(cc1)C(=O)Nc2ccc(cc2)N3CCNCC3
7,[],CHEMBL1077476,,InChI=1S/C14H11N3O/c18-14(16-11-6-2-1-3-7-11)1...,O=C(Nc1ccccc1)c2cnn3ccccc23
8,[],CHEMBL2234687,,InChI=1S/C14H10ClN3OS/c15-9-5-7-10(8-6-9)16-13...,Clc1ccc(NC(=O)Nc2nc3ccccc3s2)cc1
9,[],CHEMBL1529450,,InChI=1S/C15H12N2O2S/c1-19-11-8-6-10(7-9-11)14...,COc1ccc(cc1)C(=O)Nc2nc3ccccc3s2


### All targets:

In [9]:
activities = activity.filter(molecule_chembl_id_in = list(details.chembl_id),
                             type = 'IC50', 
                             relation = '=', 
                             assay_type = 'B') \
                    .only('activity_id','assay_chembl_id', 'assay_description', 'assay_type', \
                        'molecule_chembl_id', 'type', 'units', 'relation', 'value', \
                        'target_chembl_id', 'target_organism', 'target_pref_name')

In [10]:
len(activities)

756852

In [11]:
# TO DO: run for all activities
bioact_df = pd.DataFrame.from_records(activities[:50])
bioact_df.head()

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,target_chembl_id,target_organism,target_pref_name,type,units,value
0,31864,CHEMBL872937,In vivo inhibitory activity against human Hepa...,B,CHEMBL324340,=,CHEMBL3921,Homo sapiens,Heparanase,IC50,uM,2.5
1,31864,CHEMBL872937,In vivo inhibitory activity against human Hepa...,B,CHEMBL324340,=,CHEMBL3921,Homo sapiens,Heparanase,IC50,uM,2.5
2,31866,CHEMBL872937,In vivo inhibitory activity against human Hepa...,B,CHEMBL109600,=,CHEMBL3921,Homo sapiens,Heparanase,IC50,uM,9.0
3,31868,CHEMBL760688,Inhibitory activity against Palmitoyl-CoA oxid...,B,CHEMBL357278,=,CHEMBL4632,Rattus norvegicus,Palmitoyl-CoA oxidase,IC50,uM,4.0
4,31870,CHEMBL760688,Inhibitory activity against Palmitoyl-CoA oxid...,B,CHEMBL357119,=,CHEMBL4632,Rattus norvegicus,Palmitoyl-CoA oxidase,IC50,uM,17.0


In [12]:
# keep only molar units
bioact_df = bioact_df.drop(bioact_df.index[~bioact_df.units.str.contains('M')])
bioact_df.shape

(48, 12)

In [13]:
def convert_to_NM(unit, bioactivity):
#     c=0
# for i, unit in enumerate(bioact_df.units):
    if unit != "nM":        
        if unit == "pM":
            value = float(bioactivity)/1000
        elif unit == "10'-11M":
            value = float(bioactivity)/100
        elif unit == "10'-10M":
            value = float(bioactivity)/10
        elif unit == "10'-8M":
            value = float(bioactivity)*10
        elif unit == "10'-1microM" or unit == "10'-7M":
            value = float(bioactivity)*100
        elif unit == "uM" or unit == "/uM" or unit == "10'-6M":
            value = float(bioactivity)*1000
        elif unit == "10'1 uM":
            value = float(bioactivity)*10000
        elif unit == "10'2 uM":
            value = float(bioactivity)*100000
        elif unit == "mM":
            value = float(bioactivity)*1000000
        elif unit == "M":
            value = float(bioactivity)*1000000000
        else:
            print ('unit not recognized...', unit)
        return value
    else: return bioactivity

In [14]:
# convert all to nM
bioactivity_nM = []
for i, row in bioact_df.iterrows():
    bioact_nM = convert_to_NM(row['units'], row['value'])
    bioactivity_nM.append(bioact_nM)
bioact_df['value'] = bioactivity_nM
bioact_df['units'] = 'nM'
bioact_df.head()

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,target_chembl_id,target_organism,target_pref_name,type,units,value
0,31864,CHEMBL872937,In vivo inhibitory activity against human Hepa...,B,CHEMBL324340,=,CHEMBL3921,Homo sapiens,Heparanase,IC50,nM,2500
1,31864,CHEMBL872937,In vivo inhibitory activity against human Hepa...,B,CHEMBL324340,=,CHEMBL3921,Homo sapiens,Heparanase,IC50,nM,2500
2,31866,CHEMBL872937,In vivo inhibitory activity against human Hepa...,B,CHEMBL109600,=,CHEMBL3921,Homo sapiens,Heparanase,IC50,nM,9000
3,31868,CHEMBL760688,Inhibitory activity against Palmitoyl-CoA oxid...,B,CHEMBL357278,=,CHEMBL4632,Rattus norvegicus,Palmitoyl-CoA oxidase,IC50,nM,4000
4,31870,CHEMBL760688,Inhibitory activity against Palmitoyl-CoA oxid...,B,CHEMBL357119,=,CHEMBL4632,Rattus norvegicus,Palmitoyl-CoA oxidase,IC50,nM,17000


In [21]:
bioact_df = bioact_df.astype({'value': 'float'})

In [23]:
bioact_df = bioact_df[bioact_df.value >= 500]

In [31]:
#target_names = []
#for t in bioact_df.target_chembl_id:
#    target_names.append(target.get(t)['pref_name'])
#target_names = pd.Series(target_names)

In [32]:
target_names = pd.Series(bioact_df.target_pref_name)

In [33]:
target_names.value_counts()[:50]

Heparanase                                       3
Palmitoyl-CoA oxidase                            2
Protein farnesyltransferase                      2
Endothelin receptor ET-A                         1
Oxytocin receptor                                1
Acyl coenzyme A:cholesterol acyltransferase 1    1
Glutathionylspermidine synthase                  1
Vascular endothelial growth factor receptor 2    1
Vascular cell adhesion protein 1                 1
Thrombin                                         1
Dopamine D3 receptor                             1
Proteinase-activated receptor 1                  1
Tubulin                                          1
Plasminogen                                      1
Dopamine D2 receptor                             1
Thromboxane-A synthase                           1
GABA-A receptor; anion channel                   1
Name: target_pref_name, dtype: int64

### ATC classified molecules:

In [34]:
# check if first_approval field is not empty
details[details.first_approval == details.first_approval]

Unnamed: 0,atc_classification,chembl_id,first_approval,inchi,smiles


In [37]:
# check if atc classified field is not empty
details[details.atc_classification.str.len() != 0]

Unnamed: 0,atc_classification,chembl_id,first_approval,inchi,smiles


### Approved drugs:

In [38]:
approved_drugs = compound.filter(molecule_chembl_id__in = list(details.chembl_id), max_phase=4)

In [39]:
len(approved_drugs)

1

In [40]:
approved_mol = approved_drugs[0]

In [41]:
chembl_id = approved_mol['molecule_chembl_id']
chembl_id

'CHEMBL1909282'

In [42]:
activities = activity.filter(molecule_chembl_id = chembl_id) \
              .filter(type = 'IC50') \
              .filter(assay_type = 'B') \
              .only('activity_id','assay_chembl_id', 'assay_description', 'assay_type', \
                    'molecule_chembl_id', 'type', 'units', 'relation', 'value', \
                    'target_chembl_id', 'target_organism')

In [43]:
act = pd.DataFrame(list(activities))
act

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,target_chembl_id,target_organism,type,units,value
0,15460367,CHEMBL3540574,Inhibition of Sprague-Dawley rat Bsep expresse...,B,CHEMBL1909282,>,CHEMBL2073674,Rattus norvegicus,IC50,uM,1000
1,15460435,CHEMBL3540573,Inhibition of human BSEP expressed in plasma m...,B,CHEMBL1909282,>,CHEMBL6020,Homo sapiens,IC50,uM,1000
