## Beta-2 adrenergic receptor - part 1 (dataset preparation)

### Import libraries

In [1]:
import requests
import json
import pandas as pd

In [2]:
import os
import sys

from tqdm import tqdm

# Register `progress_apply` on pandas objects; it is used further down when
# stripping salts so that the long-running apply shows a progress bar.
tqdm.pandas()

# Directory added to the import path before importing `mmpa` (presumably the
# local wizepair2 checkout that provides it -- confirm). The original author's
# location stays the default; set WIZEPAIR2_HOME to run the notebook elsewhere.
wizepair2_home = os.environ.get('WIZEPAIR2_HOME', '/home/daniel/wizepair2')
if wizepair2_home not in sys.path:  # do not pile up duplicates when the cell is re-run
    sys.path.append(wizepair2_home)

from mmpa.chem import strip_salts

### Define the POST request and download the activity data

In [3]:
url = "https://www.ebi.ac.uk/chembl/elk/es/chembl_activity/_search"
headers = {'Content-Type': 'application/json'}

# The request body is read verbatim from a file kept alongside the notebook
# (sent as JSON, per the Content-Type header above).
with open('beta2_agonists.chembl') as f:
    payload = f.read()

# requests waits indefinitely by default; a generous timeout keeps the cell
# from hanging forever if the server stops responding.
response = requests.post(url, headers=headers, data=payload, timeout=300)

# Fail here, with the HTTP status in the message, instead of later while
# parsing an error page as if it were search results.
response.raise_for_status()
response

<Response [200]>

### Load results into a pandas data frame

In [4]:
# Every search hit carries its activity record under the `_source` key.
hits = response.json()['hits']['hits']
records = [hit['_source'] for hit in hits]

# Flatten nested objects into dotted column names
# (e.g. `_metadata.assay_data.assay_organism`). `df` is bound exactly once,
# directly to the final DataFrame, so the name never refers to anything else.
df = pd.json_normalize(records)

# Preview a few random rows. The seed makes the preview reproducible, and the
# size is capped so a result with fewer than three rows does not raise.
df.sample(min(3, len(df)), random_state=0)

Unnamed: 0,activity_properties,standard_units,standard_type,standard_relation,data_validity_comment,activity_comment,target_pref_name,pchembl_value,bao_label,molecule_pref_name,...,_metadata.assay_data.assay_cell_type,_metadata.assay_data.assay_organism,_metadata.assay_data.tissue_chembl_id,_metadata.assay_data.assay_parameters,_metadata.assay_data.assay_tissue,_metadata.source.src_description,ligand_efficiency.lle,ligand_efficiency.sei,ligand_efficiency.bei,ligand_efficiency.le
1362,[],nM,Potency,,,inconclusive,"Histone-lysine N-methyltransferase, H3 lysine-...",,assay format,OXEDRINE,...,,Homo sapiens,,[],,PubChem BioAssays,,,,
5211,"[{'text_value': 'LEFT VENTRICLE, MYOCYTE, DEGE...",,Tissue Severity Score,,,See Activity_Supp For Individual Animal Data,Rattus norvegicus,,organism-based format,EPINEPHRINE,...,,Rattus norvegicus,,"[{'comments': None, 'standard_units': None, 's...",,DrugMatrix,,,,
1097,[],nM,Potency,=,,Inconclusive,Lysine-specific demethylase 4D-like,4.9,assay format,PHENYLEPHRINE HYDROCHLORIDE,...,,Homo sapiens,,[],,PubChem BioAssays,,,,


### Summarise the most common molecules.

In [5]:
# Ten most frequent (molecule id, compound key) pairs, most common first.
df.value_counts(subset=['molecule_chembl_id', '_metadata.parent_molecule_data.compound_key']).head(10)

molecule_chembl_id  _metadata.parent_molecule_data.compound_key
CHEMBL434           ISOPRENALINE                                   1448
CHEMBL679           EPINEPHRINE                                     781
CHEMBL1215          L-PHENYLEPHRINE                                 485
CHEMBL1437          NOREPINEPHRINE                                  481
CHEMBL1740          RACEPINEPHRINE                                  414
CHEMBL714           SALBUTAMOL                                      256
CHEMBL434           Isoproterenol                                   135
                    Isoprenaline                                     74
CHEMBL714           Salbutamol                                       61
                    salbutamol                                       59
dtype: int64

### Summarise the most common assay, target and endpoint combinations.

In [6]:
# Ten most frequent combinations of assay, target, assay format, endpoint type and units.
df.value_counts(subset=['assay_chembl_id', 'target_pref_name', 'bao_label', 'standard_type', 'standard_units']).head(10)

assay_chembl_id  target_pref_name   bao_label              standard_type  standard_units
CHEMBL1794375    Unchecked          assay format           Potency        nM                51
CHEMBL3885882    Rattus norvegicus  organism-based format  MONOLE         %                 41
                                                           EOS            cells.uL-1        41
                                                           GLUC           ug.mL-1           41
                                                           HCT            %                 41
                                                           HGB            ug.mL-1           41
                                                           LIPASE         U.L-1             41
                                                           LYM            cells.uL-1        41
                                                           MCH            pg                41
                                                        

### Strip salts from the molecular structures.

In [7]:
# Run `strip_salts` over every canonical SMILES (with a progress bar) and keep
# the result in a new `stripped_smiles` column.
df['stripped_smiles'] = df['canonical_smiles'].progress_apply(strip_salts)

# Number of distinct stripped structures; a missing value counts as one entry.
df['stripped_smiles'].nunique(dropna=False)

100%|██████████| 8115/8115 [00:15<00:00, 538.31it/s]


233

### Split the training set from the hold-out set (salbutamol, CHEMBL714) and save

In [8]:
# Boolean mask: True for every row whose molecule is not CHEMBL714, which is held out.
training = df['molecule_chembl_id'].ne('CHEMBL714')

In [9]:
# Rows selected by the `training` mask go to the main file ...
df.loc[training].to_csv('beta2_agonists_stripped.csv', index=False)
# ... and the remaining rows go to the hold-out file.
df.loc[~training].to_csv('beta2_agonists_stripped_holdout.csv', index=False)