## Beta-2 adrenergic receptor - part 1 (dataset preparation)

### Import libraries

In [1]:
import requests
import json
import pandas as pd

from wizepair2.chem import strip_salts, strip_stereo

In [2]:
from pandarallel import pandarallel
# Initialise pandarallel once, before the `parallel_apply` call used later in
# this notebook; progress_bar=True asks for a progress display while it runs.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


### Define POST request and download (40% similarity to EPINEPHRINE)

In [3]:
# POST the saved search definition (read verbatim from `beta2_agonists.chembl`)
# to the ChEMBL activity search endpoint, declaring the body as JSON.
url = "https://www.ebi.ac.uk/chembl/elk/es/chembl_activity/_search"
headers = {'Content-Type': 'application/json'}
with open('beta2_agonists.chembl') as f:
    payload = f.read()
# requests has no default timeout: without one a stalled connection would
# block this cell (and a Restart & Run All) indefinitely.
response = requests.post(url, headers=headers, data=payload, timeout=60)
# Stop here on a 4xx/5xx answer instead of failing later with an opaque
# KeyError while parsing an error body.
response.raise_for_status()
response

<Response [200]>

### Load results into a pandas data frame

In [4]:
# The search answer nests the result list under hits -> hits; each hit carries
# its activity record under the `_source` key.
hits = response.json()['hits']['hits']
# Flatten the nested records into one row per hit. Nested keys become
# dot-separated column names (e.g. `_metadata.assay_data.assay_organism`).
df = pd.json_normalize([hit['_source'] for hit in hits])
# Show a few random rows as a sanity check (unseeded, so it varies per run).
df.sample(3)

Unnamed: 0,activity_properties,standard_units,standard_type,standard_relation,data_validity_comment,activity_comment,target_pref_name,bao_label,pchembl_value,assay_chembl_id,...,_metadata.assay_data.cell_chembl_id,_metadata.assay_data.assay_organism,_metadata.assay_data.tissue_chembl_id,_metadata.assay_data.assay_parameters,_metadata.assay_data.assay_tissue,_metadata.source.src_description,ligand_efficiency.lle,ligand_efficiency.sei,ligand_efficiency.bei,ligand_efficiency.le
920,[],nM,AC50,=,,Summarised AC50 (mean value for measurements w...,Dopamine D1 receptor,single protein format,4.74,CHEMBL5291779,...,,,,[],,Scientific Literature,4.09,9.04,28.37,0.54
39,[],nM,IC50,=,,,Beta-2 adrenergic receptor,single protein format,5.1,CHEMBL1943767,...,,,,[],,Scientific Literature,3.51,3.51,11.37,0.22
487,[],nM,Potency,=,,Active,Lysine-specific demethylase 4D-like,assay format,4.8,CHEMBL1613914,...,,Homo sapiens,,[],,PubChem BioAssays,,,,


### Summarise the most common molecules.

In [5]:
# Ten most frequent (molecule id, compound key) pairs, most common first.
molecule_columns = ['molecule_chembl_id', '_metadata.parent_molecule_data.compound_key']
df[molecule_columns].value_counts().head(10)

molecule_chembl_id  _metadata.parent_molecule_data.compound_key
CHEMBL434           Isoproterenol                                  58
                    Isoprenaline                                   25
                    ISO                                            22
CHEMBL1256484       SID50106160                                    22
CHEMBL1215          phenylephrine                                  19
CHEMBL1160723       SID11114195                                    19
CHEMBL1215          Phenylephrine                                  18
CHEMBL1472703       SID11112037                                    17
CHEMBL2062273       SID50105780                                    17
CHEMBL1437          NE                                             17
dtype: int64

### Summarise the most common assay / target combinations.

In [6]:
# Ten most frequent combinations of assay, target, assay format, measurement
# type and units, most common first.
assay_columns = [
    'assay_chembl_id',
    'target_pref_name',
    'bao_label',
    'standard_type',
    'standard_units',
]
df[assay_columns].value_counts().head(10)

assay_chembl_id  target_pref_name                                                       bao_label              standard_type  standard_units
CHEMBL1614275    Putative fructose-1,6-bisphosphate aldolase                            assay format           Potency        nM                30
CHEMBL1613914    Lysine-specific demethylase 4D-like                                    assay format           Potency        nM                28
CHEMBL1613803    Hypoxia-inducible factor 1 alpha                                       assay format           Potency        nM                27
CHEMBL1614456    Hypoxia-inducible factor 1 alpha                                       assay format           Potency        nM                27
CHEMBL1614364    Tyrosyl-DNA phosphodiesterase 1                                        single protein format  Potency        nM                24
CHEMBL1614361    Thyroid stimulating hormone receptor                                   assay format           Potency      

### Treat LogD measurements as pChEMBL values

In [7]:
# Rows whose measurement type is LogD take their `standard_value` as the
# `pchembl_value`; every other row (including rows with a missing
# `standard_type`) keeps the `pchembl_value` it already has.
# Done column-wise rather than with a per-row Python lambda.
is_logd = df['standard_type'].eq('LogD')
df['pchembl_value'] = df['pchembl_value'].mask(is_logd, df['standard_value'])

### Strip salts from the molecular structures.

In [8]:
# Run `strip_salts` over every canonical SMILES (in parallel via pandarallel)
# and keep the result as a new column.
stripped = df['canonical_smiles'].parallel_apply(strip_salts)
df['stripped_smiles'] = stripped
# Number of distinct stripped structures.
len(df['stripped_smiles'].unique())

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=137), Label(value='0 / 137'))), HB…

88

### Split the training set from the held-out test set and save both

In [9]:
# Stereo-stripped SMILES of the structure(s) kept out of the training set.
# NOTE(review): this looks like salbutamol (albuterol) - confirm.
HOLDOUT_ACHIRAL_SMILES = ['CC(C)(C)NCC(O)c1ccc(O)c(CO)c1']

# Compare on stereo-stripped SMILES so the match does not depend on
# stereochemistry annotations.
df['achiral_smiles'] = df['stripped_smiles'].apply(strip_stereo)
is_holdout = df['achiral_smiles'].isin(HOLDOUT_ACHIRAL_SMILES)
# Boolean mask: True for rows that stay in the training set.
training = ~is_holdout

In [10]:
# Write the two partitions to separate CSV files, leaving out the index column.
training_rows = df.loc[training]
holdout_rows = df.loc[~training]
training_rows.to_csv('beta2_agonists_stripped.csv', index=False)
holdout_rows.to_csv('beta2_agonists_stripped_holdout.csv', index=False)

In [11]:
# Disabled alternative: export the whole, unsplit frame. It targets the same
# file name as the training export, so enabling it would overwrite that file.
#df.to_csv('beta2_agonists_stripped.csv', index=False)