## Beta-2 adrenergic receptor - part 1 (dataset preparation)

### Import libraries

In [1]:
import requests
import json
import pandas as pd

In [2]:
# NOTE(review): these imports live outside the main import cell; consider
# consolidating them there so all dependencies are visible in one place.
from pandarallel import pandarallel
# Initialise pandarallel with a progress bar. The `parallel_apply` call later
# in this notebook relies on this having been run first.
pandarallel.initialize(progress_bar=True)

import sys
# NOTE(review): hardcoded, machine-specific absolute path. The notebook will
# not run elsewhere unless the wizepair2 checkout lives at this location.
# The append must stay ahead of the `classes.chem` import below, which is
# resolved through this path entry.
sys.path.append('/home/daniel/wizepair2')
from classes.chem import strip_salts, strip_stereo

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### Define POST request and download (40% similarity to EPINEPHRINE)

In [3]:
# ChEMBL activity search endpoint; the query itself (40% similarity to
# EPINEPHRINE, per the section header) is stored in a file next to the notebook.
url = "https://www.ebi.ac.uk/chembl/elk/es/chembl_activity/_search"
headers = {'Content-Type': 'application/json'}
with open('beta2_agonists.chembl') as f:
    payload = f.read()
# Without a timeout, requests waits forever on a stalled connection and the
# cell never returns. 120 s is a generous upper bound for a single query.
response = requests.post(url, headers=headers, data=payload, timeout=120)
# Fail here, with the HTTP status in the message, rather than later when the
# next cell tries to parse an error page as search results.
response.raise_for_status()
response

<Response [200]>

### Load results into a pandas data frame

In [4]:
# Each search hit wraps the actual activity record under its '_source' key.
hits = json.loads(response.text)['hits']['hits']
if not hits:
    # An empty result would otherwise surface as an unrelated lookup error
    # further down; stop with a message that names the real problem.
    raise ValueError('ChEMBL query returned no hits; check beta2_agonists.chembl')
# Flatten the nested records into one column per leaf field, using
# dot-separated column names such as '_metadata.assay_data.assay_organism'.
df = pd.json_normalize([hit['_source'] for hit in hits])
# Show a few random rows as a sanity check (unseeded, so rows vary per run).
df.sample(3)

Unnamed: 0,activity_properties,standard_units,standard_type,standard_relation,data_validity_comment,activity_comment,target_pref_name,bao_label,pchembl_value,assay_chembl_id,...,_metadata.assay_data.cell_chembl_id,_metadata.assay_data.assay_organism,_metadata.assay_data.tissue_chembl_id,_metadata.assay_data.assay_parameters,_metadata.assay_data.assay_tissue,_metadata.source.src_description,ligand_efficiency.lle,ligand_efficiency.sei,ligand_efficiency.bei,ligand_efficiency.le
418,[],nM,IC50,=,,,Plasmodium falciparum,organism-based format,4.9,CHEMBL1267247,...,,Plasmodium falciparum D10,,[],,Scientific Literature,,,,
1163,[],nM,Ki,=,,,Adrenergic receptor alpha-1,tissue-based format,4.45,CHEMBL647128,...,,,CHEMBL3638188,[],Brain,Scientific Literature,3.32,6.12,21.06,0.41
815,[],nM,Ki,=,,,Beta-2 adrenergic receptor,cell-based format,7.34,CHEMBL4397556,...,CHEMBL3308072,Homo sapiens,,[],,Scientific Literature,6.21,10.09,34.73,0.67


### Summarise the most common molecules.

In [5]:
# Count rows per (molecule id, compound key) pair; value_counts sorts by
# frequency, so the first ten entries are the ten most common pairs.
molecule_columns = ['molecule_chembl_id', '_metadata.parent_molecule_data.compound_key']
df[molecule_columns].value_counts().head(10)

molecule_chembl_id  _metadata.parent_molecule_data.compound_key
CHEMBL434           Isoproterenol                                  53
                    Isoprenaline                                   26
CHEMBL1256484       SID50106160                                    22
CHEMBL2062275       SID11112034                                    19
CHEMBL679           EPINEPHRINE                                    18
CHEMBL1160723       SID11114195                                    17
CHEMBL1215          Phenylephrine                                  17
CHEMBL2062273       SID50105780                                    17
CHEMBL1437          NE                                             17
CHEMBL434           ISO                                            17
dtype: int64

### Summarise the most common assay/target combinations.

In [6]:
# Count rows per assay, together with its target, assay format, measurement
# type and units, and show the ten most frequent combinations.
assay_columns = [
    'assay_chembl_id',
    'target_pref_name',
    'bao_label',
    'standard_type',
    'standard_units',
]
df[assay_columns].value_counts().head(10)

assay_chembl_id  target_pref_name                                                       bao_label              standard_type  standard_units
CHEMBL1614275    Putative fructose-1,6-bisphosphate aldolase                            assay format           Potency        nM                30
CHEMBL1613914    Lysine-specific demethylase 4D-like                                    assay format           Potency        nM                28
CHEMBL1613803    Hypoxia-inducible factor 1 alpha                                       assay format           Potency        nM                27
CHEMBL1614456    Hypoxia-inducible factor 1 alpha                                       assay format           Potency        nM                27
CHEMBL1614364    Tyrosyl-DNA phosphodiesterase 1                                        single protein format  Potency        nM                24
CHEMBL1614361    Thyroid stimulating hormone receptor                                   assay format           Potency      

### Treat LogD measurements as pChEMBL values (copy the LogD `standard_value` into `pchembl_value`)

In [7]:
# For rows whose measurement type is LogD, use the reported standard_value as
# the pchembl_value; every other row keeps its existing pchembl_value.
# This is a vectorised replacement for a row-wise apply, which looped over
# rows in Python and could not produce a column for an empty frame.
is_logd = df['standard_type'] == 'LogD'
df['pchembl_value'] = df['standard_value'].where(is_logd, df['pchembl_value'])

### Strip salts from the molecular structures.

In [8]:
# Apply strip_salts to every canonical SMILES across the pandarallel workers
# and store the result alongside the original column.
stripped = df['canonical_smiles'].parallel_apply(strip_salts)
df['stripped_smiles'] = stripped
# Number of distinct stripped structures (missing values count as one entry).
len(df['stripped_smiles'].unique())

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=214), Label(value='0 / 214'))), HB…

87

### Split training from test set and save

In [9]:
# Apply strip_stereo to the salt-stripped SMILES so the hold-out comparison
# below is made on the strip_stereo output rather than the raw structure.
achiral = df['stripped_smiles'].apply(strip_stereo)
df['achiral_smiles'] = achiral
# Rows matching the hold-out structure form the test set; all other rows are
# training data. `training` is a boolean mask aligned with df's index.
# NOTE(review): this SMILES looks like salbutamol - confirm.
holdout_smiles = ['CC(C)(C)NCC(O)c1ccc(O)c(CO)c1']
training = ~df['achiral_smiles'].isin(holdout_smiles)

In [10]:
# Write the two partitions to separate CSV files, omitting the row index.
training_rows = df.loc[training]
holdout_rows = df.loc[~training]
training_rows.to_csv('beta2_agonists_stripped.csv', index=False)
holdout_rows.to_csv('beta2_agonists_stripped_holdout.csv', index=False)

In [11]:
# NOTE(review): disabled export of the full, unsplit frame. Re-enabling it
# would overwrite the training CSV written by the split cell with data that
# still contains the hold-out rows. Consider deleting this cell.
#df.to_csv('beta2_agonists_stripped.csv', index=False)