## Histone deacetylase 1 - part 1 (dataset preparation)

### Import libraries

In [1]:
import requests
import json
import pandas as pd

In [2]:
# Register tqdm with pandas so Series/DataFrame gain `progress_apply`
# (used further down when stripping salts).
from tqdm import tqdm
tqdm.pandas()

# Make the local `mmpa` package importable; the path append must run before the import below.
# NOTE(review): this is a hard-coded absolute path to one user's checkout — the notebook
# will not run elsewhere unless the package is installed or the path is adjusted.
import sys
sys.path.append('/home/daniel/wizepair2')
from mmpa.chem import strip_salts

### Define the POST request and download the results

In [3]:
# ChEMBL Elasticsearch search endpoint for activity records.
url = "https://www.ebi.ac.uk/chembl/elk/es/chembl_activity/_search"
headers = {'Content-Type': 'application/json'}

# The query body is read verbatim from a file in the current working directory.
with open('hdac1_inhibitors.chembl') as f:
    payload = f.read()

# Without a timeout requests waits indefinitely on a stalled connection;
# 60 s is a generous upper bound for a single search request.
response = requests.post(url, headers=headers, data=payload, timeout=60)

# Fail here on a 4xx/5xx reply rather than later with an opaque KeyError
# when the error body is parsed as if it were search results.
response.raise_for_status()
response

<Response [200]>

### Load results into a pandas data frame

In [4]:
# Pull the list of search hits out of the JSON response body.
hits = json.loads(response.text)['hits']['hits']

# Each hit carries its record under `_source`; keep only that column.
sources = pd.DataFrame(hits)['_source']

# Flatten nested fields into dotted column names (e.g. `_metadata.assay_data.…`).
df = pd.json_normalize(sources)
df.sample(3)

Unnamed: 0,activity_properties,standard_units,standard_type,standard_relation,data_validity_comment,activity_comment,target_pref_name,pchembl_value,bao_label,molecule_pref_name,...,_metadata.assay_data.assay_cell_type,_metadata.assay_data.assay_organism,_metadata.assay_data.tissue_chembl_id,_metadata.assay_data.assay_parameters,_metadata.assay_data.assay_tissue,_metadata.source.src_description,ligand_efficiency.lle,ligand_efficiency.sei,ligand_efficiency.bei,ligand_efficiency.le
11,[],nM,Solubility,>,,,No relevant target,,small-molecule physicochemical format,,...,,,,[],,Scientific Literature,,,,
53,[],hr,T1/2,=,,,Canis familiaris,,organism-based format,,...,,Canis lupus familiaris,,[],,Scientific Literature,,,,
32,[],hr.Kg/L,AUC,=,Non standard unit for type,,Rattus norvegicus,,organism-based format,,...,,Rattus norvegicus,,[],,Scientific Literature,,,,


### Summarise the most common molecules.

In [5]:
# Ten most frequent (molecule id, compound key) pairs; value_counts sorts by descending count.
molecule_columns = ['molecule_chembl_id', '_metadata.parent_molecule_data.compound_key']
df[molecule_columns].value_counts().head(10)

molecule_chembl_id  _metadata.parent_molecule_data.compound_key
CHEMBL403813        13b                                            23
CHEMBL403812        13d                                            11
CHEMBL402208        13a                                            11
CHEMBL258177        15b                                            11
CHEMBL256985        14b                                            11
CHEMBL257176        3                                               9
CHEMBL402542        14c                                             7
CHEMBL258176        15a                                             7
CHEMBL258175        15c                                             6
CHEMBL256998        15d                                             6
dtype: int64

### Summarise the most common targets.

In [6]:
# Ten most frequent assay / target / format / measurement-type / unit combinations.
assay_columns = ['assay_chembl_id', 'target_pref_name', 'bao_label', 'standard_type', 'standard_units']
df[assay_columns].value_counts().head(10)

assay_chembl_id  target_pref_name       bao_label                              standard_type  standard_units
CHEMBL927950     HERG                   single protein format                  IC50           nM                13
CHEMBL927949     HCT-116                cell-based format                      IC50           nM                13
CHEMBL927948     Histone deacetylase 1  cell-based format                      IC50           nM                13
CHEMBL927951     No relevant target     small-molecule physicochemical format  Solubility     nM                12
CHEMBL927956     Rattus norvegicus      organism-based format                  AUC            hr.Kg/L            4
CHEMBL927955     Rattus norvegicus      organism-based format                  Vdss           L.kg-1             4
CHEMBL927953     Rattus norvegicus      organism-based format                  CL             mL.min-1.kg-1      4
CHEMBL927954     Rattus norvegicus      organism-based format                  T1/2   

### Treat LogD measurements as pChEMBL values

For rows whose standard type is `LogD`, the standard value is copied into the `pchembl_value` column.

In [7]:
# For LogD rows, use standard_value as the pchembl_value; all other rows keep
# their existing pchembl_value. Done as a vectorised column operation instead of
# a row-wise DataFrame.apply with a Python lambda.
is_logd = df['standard_type'] == 'LogD'
df['pchembl_value'] = df['pchembl_value'].mask(is_logd, df['standard_value'])

### Strip salts from the molecular structures.

In [8]:
# Apply strip_salts to every canonical SMILES string, with a tqdm progress bar.
df['stripped_smiles'] = df['canonical_smiles'].progress_apply(strip_salts)

# Number of distinct stripped structures (missing values, if any, count as one entry).
len(df['stripped_smiles'].unique())

100%|██████████| 119/119 [00:00<00:00, 434.17it/s]


13

### Split training from test set and save

In [9]:
# Compounds with these keys are held out; every other row is a training row.
holdout_keys = ['13c', '14d']
compound_keys = df['_metadata.parent_molecule_data.compound_key']
training = ~compound_keys.isin(holdout_keys)

In [10]:
# Split the frame with the boolean mask and write each part to its own CSV
# (the row index is not written).
training_rows = df.loc[training]
holdout_rows = df.loc[~training]
training_rows.to_csv('hdac1_inhibitors_stripped.csv', index=False)
holdout_rows.to_csv('hdac1_inhibitors_stripped_holdout.csv', index=False)