### Import relevant libraries

In [1]:
# Import relevant libraries

# For data manipulation
import numpy as np
import pandas as pd

# To access ChEMBL database
from chembl_webresource_client.new_client import new_client

# Import custom functions
import sys
sys.path.append("../utils")
from lipinski import add_lipinski_descriptors

### Search for target protein

In [2]:
# Search ChEMBL for every target matching the keyword "coronavirus"
target = new_client.target
target_query = target.search("coronavirus")
# Hand the result set straight to the DataFrame constructor (one row per returned record)
targets = pd.DataFrame(target_query)
targets.head()

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Coronavirus,Coronavirus,17.0,False,CHEMBL613732,[],ORGANISM,11119
1,[],Feline coronavirus,Feline coronavirus,14.0,False,CHEMBL612744,[],ORGANISM,12663
2,[],Murine coronavirus,Murine coronavirus,14.0,False,CHEMBL5209664,[],ORGANISM,694005
3,[],Canine coronavirus,Canine coronavirus,14.0,False,CHEMBL5291668,[],ORGANISM,11153
4,[],Human coronavirus 229E,Human coronavirus 229E,13.0,False,CHEMBL613837,[],ORGANISM,11137


In [3]:
# Show the search hit(s) whose preferred name is "SARS coronavirus 3C-like proteinase"
targets.loc[targets["pref_name"].eq("SARS coronavirus 3C-like proteinase")]

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
6,[],Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,10.0,False,CHEMBL3927,"[{'accession': 'P0C6U8', 'component_descriptio...",SINGLE PROTEIN,694009


In [4]:
# Resolve the ChEMBL target ID of the protein of interest.
# A single .loc lookup (row mask + column label) replaces the chained
# targets[mask][column] indexing, and a missing name now fails with a clear
# message instead of a bare IndexError from .iloc[0].
target_name = "SARS coronavirus 3C-like proteinase"
matching_ids = targets.loc[targets["pref_name"] == target_name, "target_chembl_id"]
if matching_ids.empty:
    raise ValueError(f"No target named {target_name!r} found in the ChEMBL search results")
# If several rows match, the first one is used (same as the original .iloc[0])
target_id = matching_ids.iloc[0]
target_id

'CHEMBL3927'

In [5]:
# Retrieve the IC50 bioactivity records that ChEMBL holds for the selected target
activity = new_client.activity
activity_target = (
    activity
    .filter(target_chembl_id=target_id)
    .filter(standard_type="IC50")
)
# NOTE(review): no unit filter is applied here, while the pIC50 step further down
# treats standard_value as nM — confirm that every returned row is reported in nM.
df = pd.DataFrame(activity_target)

In [6]:
# Number of (rows, columns) in the activity table fetched from ChEMBL
df.shape

(247, 46)

In [7]:
# Preview the first rows of the raw activity table
df.head()

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,1480935,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,7.2
1,,,1480936,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,9.4
2,,,1481061,[],CHEMBL830868,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,13.5
3,,,1481065,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,13.11
4,,,1481066,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,2.0


In [8]:
# Persist the unmodified query result as CSV (row index omitted) before any cleaning
df.to_csv("../data/raw/bioactivity_data.csv", index=False)

### Handling missing data

In [9]:
# Discard every record that has no standard_value
df = df.dropna(subset=["standard_value"])
df.head()

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,1480935,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,7.2
1,,,1480936,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,9.4
2,,,1481061,[],CHEMBL830868,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,13.5
3,,,1481065,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,13.11
4,,,1481066,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,2.0


In [10]:
# (rows, columns) after removing records without a standard_value
df.shape

(245, 46)

In [11]:
# Discard every record that has no canonical_smiles string
df = df.dropna(subset=["canonical_smiles"])
df.head()

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,1480935,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,7.2
1,,,1480936,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,9.4
2,,,1481061,[],CHEMBL830868,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,13.5
3,,,1481065,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,13.11
4,,,1481066,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,2.0


In [12]:
# (rows, columns) after removing records without a canonical_smiles string
df.shape

(245, 46)

In [13]:
# Keep a single record per distinct SMILES string (the first occurrence wins)
df = df.drop_duplicates(subset="canonical_smiles", keep="first")

# (rows, columns) after de-duplication
df.shape

(184, 46)

### Data pre-processing of bioactivity data 

##### Keep relevant columns 

In [14]:
# Keep only the columns used downstream: compound ID, structure and activity value
columns = ["molecule_chembl_id", "canonical_smiles", "standard_value"]
# Take an explicit copy: later cells assign new columns into df, and doing that on a
# slice of the full activity table triggers pandas' SettingWithCopyWarning.
df = df[columns].copy()

# Inspect the new dataframe
df.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL187579,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,7200.0
1,CHEMBL188487,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,9400.0
2,CHEMBL185698,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,13500.0
3,CHEMBL426082,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,13110.0
4,CHEMBL187717,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],2000.0


##### Labeling compounds as active, inactive or intermediate

In [15]:
# Cast standard_value to float so it can be compared and transformed numerically
df = df.assign(standard_value=df["standard_value"].astype(float))

In [16]:
# Produce a label for each compound
def compound_class(x, inactive_threshold=10000, active_threshold=1000):
    """Classify a compound by its activity value.

    Parameters
    ----------
    x : float or str
        Activity value; anything accepted by ``float()``. The notebook passes the
        ``standard_value`` column, which it treats as an IC50 in nM.
    inactive_threshold : float, default 10000
        Values greater than or equal to this are labelled ``"inactive"``.
    active_threshold : float, default 1000
        Values less than or equal to this are labelled ``"active"``.

    Returns
    -------
    str
        ``"inactive"``, ``"active"`` or ``"intermediate"``.

    Raises
    ------
    ValueError, TypeError
        Propagated from ``float(x)`` when ``x`` cannot be converted.

    Notes
    -----
    NaN compares False against both thresholds and therefore falls through to
    ``"intermediate"``; filter missing values before calling.
    """
    value = float(x)  # convert once instead of once per comparison
    # The inactive check runs first, exactly as in the original if/elif chain
    if value >= inactive_threshold:
        return "inactive"
    if value <= active_threshold:
        return "active"
    return "intermediate"
    
# Attach the activity class of every compound as a new column
df["class_label"] = df["standard_value"].map(compound_class)
df.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class_label
0,CHEMBL187579,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,7200.0,intermediate
1,CHEMBL188487,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,9400.0,intermediate
2,CHEMBL185698,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,13500.0,inactive
3,CHEMBL426082,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,13110.0,inactive
4,CHEMBL187717,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],2000.0,intermediate


In [17]:
# Write the reduced, labelled table (ID, SMILES, standard_value, class_label) to CSV
# without the row index
df.to_csv("../data/processed/bioactivity_data_reduced.csv", index=False)

### Clean the canonical SMILES column

In [18]:
# A SMILES string separates its components with "."; keep only the longest component
# of each entry (the first one when several share the maximum length)
df["canonical_smiles_new"] = [
    max(str(smiles).split("."), key=len) for smiles in df["canonical_smiles"]
]

In [19]:
# Show the rows whose SMILES string was changed by the clean-up
df.loc[df["canonical_smiles"].ne(df["canonical_smiles_new"])]

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class_label,canonical_smiles_new
95,CHEMBL215733,O=S(=O)(Cc1[nH]c(-c2ccc(Cl)s2)c[s+]1)c1cccs1.[...,18000.0,inactive,O=S(=O)(Cc1[nH]c(-c2ccc(Cl)s2)c[s+]1)c1cccs1
243,CHEMBL5436771,S=C([S-])NCc1cccnc1.[K+],165.0,active,S=C([S-])NCc1cccnc1


In [20]:
# Replace the original SMILES column with the cleaned one, keeping the original name
df = (
    df
    .drop(columns="canonical_smiles")
    .rename(columns={"canonical_smiles_new": "canonical_smiles"})
)

### Calculate Lipinski descriptors

In [21]:
# Project helper from the `lipinski` module (made importable via ../utils). It is called
# with the frame and the name of its SMILES column; its return value replaces df.
# NOTE(review): presumably appends the MW, LogP, NumHDonors and NumHAcceptors columns
# that appear in the next output — confirm against the helper's source.
df = add_lipinski_descriptors(df=df, smiles_column="canonical_smiles")

In [22]:
# Preview the table returned by add_lipinski_descriptors
df.head()

Unnamed: 0,molecule_chembl_id,standard_value,class_label,canonical_smiles,MW,LogP,NumHDonors,NumHAcceptors
0,CHEMBL187579,7200.0,intermediate,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,281.271,1.89262,0.0,5.0
1,CHEMBL188487,9400.0,intermediate,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,415.589,3.8132,0.0,2.0
2,CHEMBL185698,13500.0,inactive,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,421.19,2.6605,0.0,4.0
3,CHEMBL426082,13110.0,inactive,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,293.347,3.6308,0.0,3.0
4,CHEMBL187717,2000.0,intermediate,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],338.344,3.539,0.0,5.0


### Convert IC50 to pIC50

In [23]:
# Summary statistics of standard_value before it is converted to pIC50
df["standard_value"].describe()

count    1.840000e+02
mean     6.100187e+04
std      1.903620e+05
min      5.000000e+01
25%      4.955000e+03
50%      1.506781e+04
75%      4.073803e+04
max      2.000000e+06
Name: standard_value, dtype: float64

In [24]:
# Cap standard_value at 100,000,000.
# The capped Series must be assigned back to the column: the previous version computed
# the bounded values with .apply(...) but discarded the result, so the cap was never
# applied. (On this data set the cap changes nothing; the maximum is 2e6.)
df["standard_value"] = df["standard_value"].clip(upper=100_000_000)
df["standard_value"].describe()

count    1.840000e+02
mean     6.100187e+04
std      1.903620e+05
min      5.000000e+01
25%      4.955000e+03
50%      1.506781e+04
75%      4.073803e+04
max      2.000000e+06
Name: standard_value, dtype: float64

In [25]:
# pIC50 = -log10(IC50 in mol/L): scale the nM values to M (factor 10**-9) and take the
# negative base-10 logarithm, so a lower IC50 (more potent compound) gives a higher pIC50
df["pIC50"] = -np.log10(df["standard_value"] * (10**-9))

In [26]:
# Summary statistics of the derived pIC50 column
df["pIC50"].describe()

count    184.000000
mean       4.938454
std        0.889779
min        2.698970
25%        4.390000
50%        4.821954
75%        5.305011
max        7.301030
Name: pIC50, dtype: float64

In [27]:
# pIC50 has been derived from standard_value, so the raw column is dropped
df = df.drop("standard_value", axis="columns")

### Only keep relevant classes (convert to binary classification)

In [28]:
# Count how many compounds carry each class label
df["class_label"].value_counts()

class_label
inactive        122
intermediate     36
active           26
Name: count, dtype: int64

In [29]:
# Drop the "intermediate" compounds so only the remaining class labels are left
df = df.loc[df["class_label"].ne("intermediate")]

In [30]:
# Write the final table (rows labelled "intermediate" removed, pIC50 in place of
# standard_value) to CSV without the row index
df.to_csv("../data/processed/bioactivity_data_final.csv", index=False)