## Beta-2 adrenergic receptor - part 1 (dataset preparation)

### Import libraries

In [1]:
import requests
import json
import pandas as pd

In [2]:
import os
import sys

from tqdm import tqdm

# Enable the tqdm-backed `progress_apply` used on pandas objects further down.
tqdm.pandas()

# Directory that provides the local `mmpa` package.
# Defaults to the original location; set WIZEPAIR2_PATH to run the notebook on another machine.
wizepair2_path = os.environ.get('WIZEPAIR2_PATH', '/home/daniel/wizepair2')
if wizepair2_path not in sys.path:  # re-running this cell must not add duplicate entries
    sys.path.append(wizepair2_path)
from mmpa.chem import strip_salts

### Define the POST request and download the ChEMBL activity data

In [3]:
# Send the query stored on disk to the ChEMBL activity search endpoint.
url = "https://www.ebi.ac.uk/chembl/elk/es/chembl_activity/_search"
headers = {'Content-Type': 'application/json'}
# The request body is the query file's content, sent verbatim.
with open('beta2_agonists.chembl') as f:
    payload = f.read()
# Without a timeout the call can block forever if the server stops responding.
response = requests.post(url, headers=headers, data=payload, timeout=300)
# Surface an HTTP error status here instead of failing later while parsing the body.
response.raise_for_status()
response

<Response [200]>

### Load results into a pandas data frame

In [4]:
# Each search hit carries its record under the '_source' key.
hits = json.loads(response.text)['hits']['hits']
sources = pd.DataFrame(hits)['_source']
# Flatten nested fields into dotted column names (e.g. `_metadata.assay_data.assay_organism`).
df = pd.json_normalize(sources)
# Show three randomly chosen rows as a quick sanity check.
df.sample(3)

Unnamed: 0,activity_properties,standard_units,standard_type,standard_relation,data_validity_comment,target_pref_name,activity_comment,bao_label,pchembl_value,assay_chembl_id,...,_metadata.assay_data.cell_chembl_id,_metadata.assay_data.assay_organism,_metadata.assay_data.tissue_chembl_id,_metadata.assay_data.assay_parameters,_metadata.assay_data.assay_tissue,_metadata.source.src_description,ligand_efficiency.lle,ligand_efficiency.sei,ligand_efficiency.bei,ligand_efficiency.le
2522,[],nM,GI50,,,PC-3,inactive,cell-based format,,CHEMBL1963885,...,CHEMBL3307570,,,[],,PubChem BioAssays,,,,
7463,"[{'text_value': 'RIGHT VENTRICLE, MYOCYTE, DEG...",,Tissue Severity Score,,,Rattus norvegicus,See Activity_Supp For Individual Animal Data,organism-based format,,CHEMBL3885882,...,,Rattus norvegicus,,"[{'comments': None, 'standard_units': None, 's...",,DrugMatrix,,,,
7708,[],,EC50,,,Alpha-1a adrenergic receptor,Not Determined,cell-based format,,CHEMBL3603442,...,CHEMBL3307715,Homo sapiens,,[],,Scientific Literature,,,,


### Summarise the most common molecules.

In [5]:
# Count rows per (molecule id, compound key) pair and show the ten most frequent.
molecule_columns = ['molecule_chembl_id', '_metadata.parent_molecule_data.compound_key']
df[molecule_columns].value_counts().head(10)

molecule_chembl_id  _metadata.parent_molecule_data.compound_key
CHEMBL434           ISOPRENALINE                                   1478
CHEMBL679           EPINEPHRINE                                     788
CHEMBL1215          L-PHENYLEPHRINE                                 482
CHEMBL1437          NOREPINEPHRINE                                  457
CHEMBL1740          RACEPINEPHRINE                                  414
CHEMBL714           SALBUTAMOL                                      230
CHEMBL434           Isoproterenol                                   132
CHEMBL714           salbutamol                                       83
CHEMBL1437          SID29215339                                      65
CHEMBL1973413       SID545524                                        58
dtype: int64

### Summarise the most common assay, target and measurement-type combinations.

In [6]:
# Count rows per assay / target / assay format / measurement type / unit combination
# and show the ten most frequent.
assay_columns = ['assay_chembl_id', 'target_pref_name', 'bao_label', 'standard_type', 'standard_units']
df[assay_columns].value_counts().head(10)

assay_chembl_id  target_pref_name   bao_label              standard_type  standard_units
CHEMBL1794375    Unchecked          assay format           Potency        nM                51
CHEMBL3885882    Rattus norvegicus  organism-based format  POTASSIUM      mEq.L-1           41
                                                           BUN            ug.mL-1           41
                                                           ALP            U.L-1             41
                                                           ALT            U.L-1             41
                                                           AST            U.L-1             41
                                                           MCH            pg                41
                                                           BASO           cells.uL-1        41
                                                           LYMLE          %                 41
                                                        

### Consider logD measurements to be pchembl values

In [7]:
# For rows whose standard_type is 'LogD', use standard_value as the pchembl_value;
# every other row keeps its existing pchembl_value.
# Done column-wise with `mask` rather than a row-by-row `apply` with a Python lambda.
is_logd = df['standard_type'] == 'LogD'
df['pchembl_value'] = df['pchembl_value'].mask(is_logd, df['standard_value'])

### Strip salts from the molecular structures.

In [8]:
# Apply strip_salts to every canonical SMILES, with a tqdm progress bar.
stripped = df['canonical_smiles'].progress_apply(strip_salts)
df['stripped_smiles'] = stripped
# Number of distinct values in the stripped column.
len(df['stripped_smiles'].unique())

100%|██████████| 8113/8113 [00:06<00:00, 1301.26it/s]


233

### Split the training set from the hold-out set and save both

In [9]:
# Rows whose stripped SMILES matches a held-out structure are excluded from training.
# NOTE(review): the single held-out SMILES looks like salbutamol — confirm.
holdout_smiles = ['CC(C)(C)NCC(O)c1ccc(O)c(CO)c1']
is_holdout = df['stripped_smiles'].isin(holdout_smiles)
training = ~is_holdout

In [10]:
# Write the training rows and the held-out rows to separate CSV files, omitting the index column.
training_rows = df.loc[training]
holdout_rows = df.loc[~training]
training_rows.to_csv('beta2_agonists_stripped.csv', index=False)
holdout_rows.to_csv('beta2_agonists_stripped_holdout.csv', index=False)