## Glucocorticoid receptor - part 1 (dataset preparation)

### Import libraries

In [1]:
import requests
import json
import pandas as pd

In [2]:
import os
import sys

from pandarallel import pandarallel

# Initialise pandarallel before any `.parallel_apply` call further down.
pandarallel.initialize()

# Location of the local wizepair2 checkout that provides the `mmpa` package.
# Set the WIZEPAIR2_PATH environment variable to point somewhere else; the
# default is the original hardcoded location so existing setups keep working.
WIZEPAIR2_PATH = os.environ.get('WIZEPAIR2_PATH', '/home/daniel/wizepair2')
if WIZEPAIR2_PATH not in sys.path:  # do not stack duplicates when the cell is re-run
    sys.path.append(WIZEPAIR2_PATH)

from mmpa.chem import strip_salts, strip_stereo

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### Define POST request and download (~~40% similarity to CHEMBL1201389~~ manually curated dataset)

In [3]:
# ChEMBL Elasticsearch search endpoint for activity records.
url = "https://www.ebi.ac.uk/chembl/elk/es/chembl_activity/_search"
headers = {'Content-Type': 'application/json'}

# The query body lives in a side file and is posted verbatim.
with open('nr3c1_agonists.chembl', encoding='utf-8') as f:
    payload = f.read()

# A timeout stops the cell hanging forever if the server stalls;
# raise_for_status() makes a 4xx/5xx reply fail here instead of surfacing
# later as a confusing parse/KeyError in the loading cell.
response = requests.post(url, headers=headers, data=payload, timeout=120)
response.raise_for_status()
response

<Response [200]>

### Load results into a pandas data frame

In [4]:
# Each Elasticsearch hit wraps the activity record in its `_source` field.
hits = response.json()['hits']['hits']
if not hits:
    # Fail with a clear message rather than an opaque error on a missing column.
    raise ValueError('ChEMBL query returned no hits; check nr3c1_agonists.chembl')

# Flatten the nested records; nested keys become dot-separated column names
# (e.g. `_metadata.parent_molecule_data.alogp`).
df = pd.json_normalize([hit['_source'] for hit in hits])

### Summarise the most common molecules.

In [5]:
# Count each (ChEMBL ID, compound key) pair and show the ten most frequent;
# value_counts() already returns the counts in descending order.
molecule_columns = ['molecule_chembl_id', '_metadata.parent_molecule_data.compound_key']
df[molecule_columns].value_counts().head(10)

molecule_chembl_id  _metadata.parent_molecule_data.compound_key
CHEMBL389621        HYDROCORTISONE                                 695
CHEMBL131           PREDNISOLONE                                   681
                    Prednisolone                                   338
CHEMBL1370          BUDESONIDE                                     291
CHEMBL389621        Hydrocortisone                                 251
CHEMBL1370          Budesonide                                     138
CHEMBL131           prednisolone                                   103
CHEMBL389621        hydrocortisone                                  81
CHEMBL131           2, pred                                         68
CHEMBL389621        SID75748                                        62
dtype: int64

### Summarise the most common assay, target and endpoint combinations.

In [6]:
# Ten most frequent combinations of assay, target, assay format, endpoint
# type and unit.
assay_columns = [
    'assay_chembl_id',
    'target_pref_name',
    'bao_label',
    'standard_type',
    'standard_units',
]
df[assay_columns].value_counts().head(10)

assay_chembl_id  target_pref_name   bao_label              standard_type  standard_units
CHEMBL3885882    Rattus norvegicus  organism-based format  BUN            ug.mL-1           12
                                                           CHLORIDE       mEq.L-1           12
                                                           CK             U.L-1             12
                                                           CO2            nM                12
                                                           CREAT          ug.mL-1           12
                                                           EOS            cells.uL-1        12
                                                           EOSLE          %                 12
                                                           GLUC           ug.mL-1           12
                                                           HCT            %                 12
                                                        

### Treat LogD measurements as pChEMBL values

In [7]:
# For LogD records, use the measured standard_value as the pchembl_value;
# every other row keeps its existing pchembl_value. Done column-wise instead
# of with a row-by-row apply, which is slow and misbehaves on an empty frame.
is_logd = df['standard_type'] == 'LogD'
df['pchembl_value'] = df['pchembl_value'].mask(is_logd, df['standard_value'])

### Treat alogp as an 'assay' to mitigate the sparse activity matrix

In [8]:
# Build one pseudo-activity row per distinct (alogp, structure) pair: the
# alogp value takes the place of pchembl_value and constant columns label the
# rows as belonging to an 'alogp' target.
df_alogp = (
    df[['_metadata.parent_molecule_data.alogp', 'canonical_smiles']]
    .drop_duplicates()
    .rename(columns={'_metadata.parent_molecule_data.alogp': 'pchembl_value'})
    .assign(
        target_pref_name='alogp',
        standard_type='_metadata.parent_molecule_data',
        standard_relation='=',
    )
)

# ignore_index gives the combined frame a clean 0..n-1 index; without it the
# appended rows repeat index labels already used by the activity rows.
# NOTE: re-running this cell appends the alogp rows again - run it once per
# freshly loaded `df`.
df = pd.concat([df, df_alogp], ignore_index=True)

### Strip salts from the molecular structures.

In [9]:
# Run strip_salts over every structure in parallel (pandarallel workers).
df['stripped_smiles'] = df['canonical_smiles'].parallel_apply(strip_salts)

# Keep only rows for which strip_salts returned a value. The explicit copy
# makes `df` an independent frame, so adding columns in later cells does not
# trigger SettingWithCopyWarning.
df = df[df['stripped_smiles'].notna()].copy()

# Number of distinct stripped structures remaining.
df['stripped_smiles'].nunique()

5

### Split training from test set and save

In [10]:
# Achiral SMILES of the structure(s) reserved for the hold-out set.
holdout_achiral_smiles = ['CCCC1OC2CC3C4CCC5=CC(=O)C=CC5(C)C4C(O)CC3(C)C2(C(=O)CO)O1']

# Compare structures with stereochemistry removed, so that all stereo
# variants of a hold-out structure land on the same side of the split.
df['achiral_smiles'] = df['stripped_smiles'].apply(strip_stereo)
is_holdout = df['achiral_smiles'].isin(holdout_achiral_smiles)

# Boolean mask: True for rows that go into the training file.
training = ~is_holdout

In [11]:
# Write the two halves of the split to separate CSV files (no index column).
training_rows = df.loc[training]
holdout_rows = df.loc[~training]
training_rows.to_csv('nr3c1_agonists_stripped.csv', index=False)
holdout_rows.to_csv('nr3c1_agonists_stripped_holdout.csv', index=False)