## A. ChEMBL DATABASE - Bioactivity Data
https://www.ebi.ac.uk/chembl/


In [40]:
# ! pip install chembl-webresource-client dropbox

In [41]:
#############
# LIBRARIES #
#############

import os
# Directory names used to build file paths. All of them are relative;
# '..' (os.pardir) is the location the data directory is joined onto later.
curr_dir = os.getcwd()  # absolute path of the current working directory
project_dir = os.pardir
data_dir = 'data'
notebook_dir = 'notebooks'

import pandas as pd
import numpy as np
from chembl_webresource_client.new_client import new_client

### A.1. Target protein search

In [42]:
# Search the ChEMBL target table by gene symbol (CASP2)
gene_name = 'CASP2'
target = new_client.target
target_query = target.search(gene_name)

# Tabulate the search hits and preview the first rows
targets = pd.DataFrame.from_dict(target_query)
targets.head()

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,"[{'xref_id': 'P42575', 'xref_name': None, 'xre...",Homo sapiens,Caspase-2,15.0,False,CHEMBL4884,"[{'accession': 'P42575', 'component_descriptio...",SINGLE PROTEIN,9606
1,[],Homo sapiens,Caspase,4.0,False,CHEMBL3831289,"[{'accession': 'P49662', 'component_descriptio...",PROTEIN FAMILY,9606


In [43]:
# Select the first search hit as the target of interest.
# .iloc is positional, so this does not depend on the index labels of `targets`.
if targets.empty:
    raise ValueError(f'No ChEMBL target found for gene name {gene_name!r}')
selected_target = targets.target_chembl_id.iloc[0]

# Retrieve the IC50 bioactivity records for the selected target
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

# Converting to dataframe
res_df = pd.DataFrame.from_dict(res)

# Create the data directory if needed. makedirs(exist_ok=True) also creates
# missing parent directories and does nothing when the directory already exists,
# which avoids the check-then-create race of os.path.exists + os.mkdir.
output_dir = os.path.join(project_dir, data_dir)
os.makedirs(output_dir, exist_ok=True)

# Saving the raw (unfiltered) records to csv
res_df.to_csv(os.path.join(output_dir, f'{gene_name}_bioactivity_data_raw.csv'), index=False)
res_df.head()

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,1421057,[],CHEMBL831581,Inhibitory concentration against caspase-2 in ...,B,,,BAO_0000190,BAO_0000019,...,Homo sapiens,Caspase-2,9606,,,IC50,nM,UO_0000065,,10.0
1,,1421073,[],CHEMBL831973,Inhibitory concentration against casp-2 in neu...,B,,,BAO_0000190,BAO_0000019,...,Homo sapiens,Caspase-2,9606,,,IC50,nM,UO_0000065,,10.0
2,,1691804,[],CHEMBL863728,Inhibition of caspase2,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Caspase-2,9606,,,IC50,uM,UO_0000065,,1.53
3,,1691808,[],CHEMBL863728,Inhibition of caspase2,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Caspase-2,9606,,,IC50,uM,UO_0000065,,0.537
4,,1691809,[],CHEMBL863728,Inhibition of caspase2,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Caspase-2,9606,,,IC50,uM,UO_0000065,,0.231


In [44]:
# Show all column names of the raw table as one comma-separated string
', '.join(res_df.columns.tolist())

'activity_comment, activity_id, activity_properties, assay_chembl_id, assay_description, assay_type, assay_variant_accession, assay_variant_mutation, bao_endpoint, bao_format, bao_label, canonical_smiles, data_validity_comment, data_validity_description, document_chembl_id, document_journal, document_year, ligand_efficiency, molecule_chembl_id, molecule_pref_name, parent_molecule_chembl_id, pchembl_value, potential_duplicate, qudt_units, record_id, relation, src_id, standard_flag, standard_relation, standard_text_value, standard_type, standard_units, standard_upper_value, standard_value, target_chembl_id, target_organism, target_pref_name, target_tax_id, text_value, toid, type, units, uo_units, upper_value, value'

In [45]:
# Handling missing data -- removing any record that has a missing value for "standard_value".
# The before/after counts are taken on "standard_value" itself, i.e. on the same
# column the rows are filtered on, so the printed numbers describe the filter.
missing_values_before = res_df.standard_value.isna().sum()
print(f'Missing values before removing NA: {missing_values_before}')

# Explicit copy: columns can be added to res_df_noNA later without
# pandas raising a SettingWithCopyWarning.
res_df_noNA = res_df.dropna(subset=['standard_value']).copy()

missing_values_after = res_df_noNA.standard_value.isna().sum()
print(f'Missing values after removing NA: {missing_values_after}')

Missing values before removing NA: 0
Missing values after removing NA: 0


### A.2. Data preprocessing

##### Labeling each compound as one of
- active (IC50 ≤ 1000 nM)
- intermediate (1000 nM < IC50 ≤ 10000 nM)
- inactive (IC50 > 10000 nM)

In [46]:
# Class boundaries, as implemented by the comparisons in labeling():
#- active       (IC50 <= 1000 nM)
#- intermediate (1000 nM < IC50 <= 10000 nM)
#- inactive     (IC50 > 10000 nM)

def labeling(x):
    """Return the bioactivity class for a single IC50 value.

    Parameters
    ----------
    x : str or number
        IC50 value; anything accepted by ``float()``.
        Assumed to be expressed in nM -- confirm against ``standard_units``.

    Returns
    -------
    str
        'active' if the value is <= 1000, 'intermediate' if it is <= 10000,
        otherwise 'inactive'.

    Raises
    ------
    ValueError, TypeError
        If ``x`` cannot be converted with ``float()``.

    Notes
    -----
    NaN fails both comparisons and is therefore labelled 'inactive'.
    """
    ic50 = float(x)  # convert once instead of once per comparison
    if ic50 <= 1000:
        return 'active'
    if ic50 <= 10000:
        return 'intermediate'
    return 'inactive'
# Add the class label as a new column. .assign builds a new frame instead of
# writing into the result of dropna in place, which avoids pandas'
# SettingWithCopyWarning. The column name's spelling ('bioavtivity_class')
# is kept as-is because the column is selected by this exact name further down.
res_df_noNA = res_df_noNA.assign(bioavtivity_class=res_df_noNA.standard_value.apply(labeling))


In [47]:
# Preview the first five rows, now including the class label column
res_df_noNA.head(n=5)

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value,bioavtivity_class
0,,1421057,[],CHEMBL831581,Inhibitory concentration against caspase-2 in ...,B,,,BAO_0000190,BAO_0000019,...,Caspase-2,9606,,,IC50,nM,UO_0000065,,10.0,active
1,,1421073,[],CHEMBL831973,Inhibitory concentration against casp-2 in neu...,B,,,BAO_0000190,BAO_0000019,...,Caspase-2,9606,,,IC50,nM,UO_0000065,,10.0,active
2,,1691804,[],CHEMBL863728,Inhibition of caspase2,B,,,BAO_0000190,BAO_0000357,...,Caspase-2,9606,,,IC50,uM,UO_0000065,,1.53,intermediate
3,,1691808,[],CHEMBL863728,Inhibition of caspase2,B,,,BAO_0000190,BAO_0000357,...,Caspase-2,9606,,,IC50,uM,UO_0000065,,0.537,active
4,,1691809,[],CHEMBL863728,Inhibition of caspase2,B,,,BAO_0000190,BAO_0000357,...,Caspase-2,9606,,,IC50,uM,UO_0000065,,0.231,active


In [48]:
# Reduce the table to the compound id, its SMILES string, the IC50 value and the class label
cols_to_keep = [
    'molecule_chembl_id',
    'canonical_smiles',
    'standard_value',
    'bioavtivity_class',
]
df_final = res_df_noNA.loc[:, cols_to_keep]

# Report how many rows are exact repeats of an earlier row
print(f'Number of duplicates: {df_final.duplicated().sum()}')

df_final

Number of duplicates: 0


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioavtivity_class
0,CHEMBL366927,CCCCCCN(C)CC(=O)C(CC(=O)O)NC(=O)C(CC)n1cc(C(C)...,10.0,active
1,CHEMBL179503,CCCCCN(C)CC(=O)C(CC(=O)O)NC(=O)C(CC)n1cc(C(C)(...,10.0,active
2,CHEMBL206435,O=C1NC(=O)c2ccccc2C1=O,1530.0,intermediate
3,CHEMBL203709,COc1ccccc1NC(=O)CCC(=O)Nc1ccc2c(c1)C(=O)C(=O)N...,537.0,active
4,CHEMBL438969,O=C(CCC(=O)N1CCCCC1)Nc1ccc2c(c1)C(=O)C(=O)NC2=O,231.0,active
...,...,...,...,...
65,CHEMBL3678073,CC(C)[C@H](NC(=O)[C@H](Cc1cccc2ccccc12)NC(=O)[...,3330.0,intermediate
66,CHEMBL2402203,C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)OCc1ccccc1)...,100000.0,inactive
67,CHEMBL4590201,O=C(CF)CNC(=O)[C@H](Cc1ccccc1)NC(=O)c1ccccc1,100000.0,inactive
68,CHEMBL4520267,O=C(CF)CNC(=O)[C@H](Cc1ccccc1)NC(=O)c1cccc2ccc...,100000.0,inactive


In [49]:
# Write the preprocessed table to the data directory (row index omitted)
preprocessed_csv = os.path.join(project_dir, data_dir, f'{gene_name}_bioactivity_data_preprocessed.csv')
df_final.to_csv(preprocessed_csv, index=False)