## Glucocorticoid receptor - part 1 (dataset preparation)

### Import libraries

In [1]:
import requests
import json
import pandas as pd

In [2]:
# Set up pandarallel before any pandas work; `parallel_apply` is used further
# down to strip salts. `progress_bar=True` requests a progress display.
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

# Extend the module search path so that the `mmpa` package can be imported.
# NOTE(review): this is a hardcoded absolute path on one user's machine and
# will not resolve elsewhere; presumably it is a local checkout of wizepair2
# that provides `mmpa` -- confirm, and consider installing the package or
# making the path configurable.
import sys
sys.path.append('/home/daniel/wizepair2')
from mmpa.chem import strip_salts, strip_stereo

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### Define POST request and download (40% similarity to PREDNISOLONE)

In [3]:
# POST the stored search query to the ChEMBL activity `_search` endpoint.
url = "https://www.ebi.ac.uk/chembl/elk/es/chembl_activity/_search"
headers = {'Content-Type': 'application/json'}

# The request body is read verbatim from a file kept next to the notebook.
with open('nr3c1_agonists.chembl') as f:
    payload = f.read()

# Without a timeout `requests` can block forever on a stalled connection,
# which would hang a Restart & Run All.
response = requests.post(url, headers=headers, data=payload, timeout=60)

# Fail here, with the HTTP status in the error, rather than later when the
# body of an error response is parsed as if it contained search hits.
response.raise_for_status()
response

<Response [200]>

### Load results into a pandas data frame

In [4]:
# Parse the response body and pull out the list of search hits.
hits = json.loads(response.text)['hits']['hits']

# Each hit carries its record under the `_source` key; select that column
# explicitly rather than through attribute access.
source_records = pd.DataFrame(hits)['_source']

# Flatten the nested records into dotted column names.
df = pd.json_normalize(source_records)

# Show a few random rows as a sanity check (unseeded, so rows vary per run).
df.sample(3)

Unnamed: 0,activity_properties,standard_units,standard_type,standard_relation,data_validity_comment,target_pref_name,activity_comment,bao_label,pchembl_value,assay_chembl_id,...,_metadata.assay_data.cell_chembl_id,_metadata.assay_data.assay_organism,_metadata.assay_data.tissue_chembl_id,_metadata.assay_data.assay_parameters,_metadata.assay_data.assay_tissue,_metadata.source.src_description,ligand_efficiency.lle,ligand_efficiency.sei,ligand_efficiency.bei,ligand_efficiency.le
834,[],nM,Ki,=,,Glucocorticoid receptor,,assay format,8.28,CHEMBL1648865,...,,,,[],,Scientific Literature,6.72,8.73,22.96,0.43
262,[],nM,IC50,=,,Plasmodium falciparum,,organism-based format,5.4,CHEMBL1267251,...,,Plasmodium falciparum,,[],,Scientific Literature,,,,
137,[],nM,Ki,=,,Cytochrome P450 19A1,,assay format,6.16,CHEMBL663782,...,,Homo sapiens,,[],,Scientific Literature,1.48,36.09,22.78,0.42


### Summarise the most common molecules.

In [5]:
# Count rows per (molecule id, compound key) pair. `value_counts` sorts in
# descending order by default, so the first ten rows are the most frequent.
molecule_columns = ['molecule_chembl_id', '_metadata.parent_molecule_data.compound_key']
df[molecule_columns].value_counts().head(10)

molecule_chembl_id  _metadata.parent_molecule_data.compound_key
CHEMBL384467        Dexamethasone                                  79
CHEMBL131           Prednisolone                                   46
CHEMBL384467        dexamethasone                                  36
CHEMBL131           prednisolone                                   27
CHEMBL384467        Dex                                            21
CHEMBL131           1                                              19
CHEMBL1370          Budesonide                                     18
CHEMBL384467        1                                              18
CHEMBL389621        Hydrocortisone                                 18
CHEMBL110739        Corticosterone                                 17
dtype: int64

### Summarise the most common assay / target combinations.

In [6]:
# Count rows per assay / target / format / measurement type / unit combination
# and keep the ten most frequent (`value_counts` sorts descending by default).
assay_columns = ['assay_chembl_id', 'target_pref_name', 'bao_label', 'standard_type', 'standard_units']
df[assay_columns].value_counts().head(10)

assay_chembl_id  target_pref_name                                                       bao_label              standard_type  standard_units
CHEMBL1614456    Hypoxia-inducible factor 1 alpha                                       assay format           Potency        nM                35
CHEMBL1613803    Hypoxia-inducible factor 1 alpha                                       assay format           Potency        nM                35
CHEMBL1614108    Cytochrome P450 3A4                                                    single protein format  Potency        nM                26
CHEMBL1942833    Glucocorticoid receptor                                                cell-based format      IC50           nM                26
CHEMBL1613886    Cytochrome P450 3A4                                                    single protein format  Potency        nM                26
CHEMBL2025282    Glucocorticoid receptor                                                cell-based format      IC50         

### Treat LogD measurements as pchembl values (copy `standard_value` into `pchembl_value`)

In [7]:
# For rows whose measurement type is LogD, use the reported `standard_value`
# as the `pchembl_value`; all other rows keep their existing `pchembl_value`.
# Done as a vectorised mask instead of a row-wise `apply` with a Python lambda,
# which builds a Series per row and is much slower for the same result.
is_logd = df['standard_type'] == 'LogD'
df['pchembl_value'] = df['pchembl_value'].mask(is_logd, df['standard_value'])

### Strip salts from the molecular structures.

In [8]:
# Run `strip_salts` over every canonical SMILES using pandarallel workers.
df['stripped_smiles'] = df.canonical_smiles.parallel_apply(strip_salts)

# Drop rows for which `strip_salts` produced a missing value. The explicit
# `.copy()` makes the result an independent frame, so that adding columns to
# it in later cells does not raise pandas' SettingWithCopyWarning.
df = df[df.stripped_smiles.notna()].copy()

# Number of distinct stripped structures remaining (no missing values are
# left at this point, so `nunique` matches the count of unique values).
df.stripped_smiles.nunique()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=97), Label(value='0 / 97'))), HBox…

178

### Split training from test set and save

In [9]:
# Structure(s) reserved for the hold-out set, given as stereo-stripped SMILES.
# NOTE(review): this looks like a single steroid scaffold chosen by hand --
# confirm which compound it is and record that in the notebook text.
holdout_achiral_smiles = ['CCCC1OC2CC3C4CCC5=CC(=O)C=CC5(C)C4C(O)CC3(C)C2(C(=O)CO)O1']

# Apply `strip_stereo` so that the hold-out match is made on its output
# rather than on the salt-stripped SMILES directly.
df['achiral_smiles'] = df['stripped_smiles'].apply(strip_stereo)

# Boolean mask: True for training rows, False for rows matching the hold-out.
training = ~df['achiral_smiles'].isin(holdout_achiral_smiles)

In [10]:
# Write the training rows and the held-out rows to separate CSV files,
# omitting the DataFrame index from both.
split_outputs = (
    (training, 'nr3c1_agonists_stripped.csv'),
    (~training, 'nr3c1_agonists_stripped_holdout.csv'),
)
for row_mask, csv_path in split_outputs:
    df[row_mask].to_csv(csv_path, index=False)