In [1]:
#download biological activity data directly form chembl data base
! pip install chembl_webresource_client



In [1]:
# Import necessary libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client

In [2]:
# Target search for coronavirus
target = new_client.target
target_query = target.search('coronavirus')
targets = pd.DataFrame.from_dict(target_query)
print(targets.shape)
targets

(8, 9)


Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Coronavirus,Coronavirus,17.0,False,CHEMBL613732,[],ORGANISM,11119
1,[],SARS coronavirus,SARS coronavirus,15.0,False,CHEMBL612575,[],ORGANISM,227859
2,[],Feline coronavirus,Feline coronavirus,15.0,False,CHEMBL612744,[],ORGANISM,12663
3,[],Human coronavirus 229E,Human coronavirus 229E,13.0,False,CHEMBL613837,[],ORGANISM,11137
4,"[{'xref_id': 'P0C6U8', 'xref_name': None, 'xre...",SARS coronavirus,SARS coronavirus 3C-like proteinase,10.0,False,CHEMBL3927,"[{'accession': 'P0C6U8', 'component_descriptio...",SINGLE PROTEIN,227859
5,[],Middle East respiratory syndrome-related coron...,Middle East respiratory syndrome-related coron...,9.0,False,CHEMBL4296578,[],ORGANISM,1335626
6,"[{'xref_id': 'P0C6X7', 'xref_name': None, 'xre...",SARS coronavirus,Replicase polyprotein 1ab,4.0,False,CHEMBL5118,"[{'accession': 'P0C6X7', 'component_descriptio...",SINGLE PROTEIN,227859
7,[],Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,4.0,False,CHEMBL4523582,"[{'accession': 'P0DTD1', 'component_descriptio...",SINGLE PROTEIN,2697049


In [12]:
# Select and retrieve bioactivity data for SARS coronavirus 3C-like proteinase (fifth entry)

selected_target = targets.target_chembl_id[4]  #bcancer id=2; coronavirus id=4 ;aceytlcolinestrase id=0
print("Selected Target ID:",selected_target)

activity=new_client.activity
# print(activity)
#filtering out data with traget ID  and type IC50
res=activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50") 

df = pd.DataFrame.from_dict(res)
df.head()


#selecting unique values
#df.standard_type.unique()


# Finally we will save the resulting bioactivity data to a CSV file bioactivity_data.csv.

df.to_csv("C:/Users/laksh/Documents/DM_PROJECT/Acetyl/ace_bioactivity_data.csv", index=False)

# Handling missing data

df2=df[df.standard_value.notna()]
df2.shape

# Data pre-processing of the bioactivity data

#Labeling compounds as either being active, inactive or intermediate
#The bioactivity data is in the IC50 unit.
#Compounds having values of less than 1000 nM(nano Molar) will be considered to be active while those greater than 10,000 nM will be considered to be inactive.
#As for those values in between 1,000 and 10,000 nM will be referred to as intermediate.

bioactivity_class=[]

for i in df2.standard_value:
    if float(i)<=10000:
        bioactivity_class.append("active")
    elif float(i)>=10000:
        bioactivity_class.append("inactive")
    else:
        bioactivity_class.append("intermediate")
        
print(bioactivity_class)




# Iterating molecule chembl id to a list 

molecule_cid=[]
for i in df2.molecule_chembl_id:
    molecule_cid.append(i)

#print(molecule_cid)


# Appending Canonical Smiles to a list

#SMILES=>Simplified Molecular-input Line-entry System
# SMILES: which is used to translate a chemical’s three-dimensional structure into a string of symbols that is easily understood by computer software. 
#Canonical Smiles :special version of SMILES where each SMILES string uniquely identifies a single molecule structure.
canonical_smiles=[]
for i in df2.canonical_smiles:
    canonical_smiles.append(i)

#print(canonical_smiles)

# Appending standard values to a list

standard_value=[]
for i in df2.standard_value:
    standard_value.append(i)

#print(standard_values)

# Combine the 4 lists into a dataframe

data_tuples = list(zip(molecule_cid, canonical_smiles, bioactivity_class, standard_value))
df3 = pd.DataFrame( data_tuples,  columns=['molecule_chembl_id', 'canonical_smiles', 'bioactivity_class', 'standard_value'])
print("df3",df3)

pd.concat([df3,pd.Series(bioactivity_class)], axis=1)

print("Combined df3 and bioactivity_Class",pd.concat([df3,pd.Series(bioactivity_class)], axis=1))

#saving the preprocessed data to the csv file
df3.to_csv("C:/Users/laksh/Documents/DM_PROJECT/Acetyl/ace_bioactivity_preprocessed_data.csv", index=False)



import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

df = pd.read_csv('C:/Users/laksh/Documents/DM_PROJECT/Acetyl/ace_bioactivity_preprocessed_data.csv')
print("ac bioactivity preprocessed data",df.head())

#Calculate discriptors
def lipinski(smiles, verbose=False):

    moldata= []
    for element in smiles:
        mol=Chem.MolFromSmiles(element) 
        moldata.append(mol)
    
    print(moldata)
    baseData= np.arange(1,1)
    i=0  
    for mol in moldata:        
       
        desc_MolWt = Descriptors.MolWt(mol)
        desc_MolLogP = Descriptors.MolLogP(mol)
        desc_NumHDonors = Lipinski.NumHDonors(mol)
        desc_NumHAcceptors = Lipinski.NumHAcceptors(mol)
           
        row = np.array([desc_MolWt,
                        desc_MolLogP,
                        desc_NumHDonors,
                        desc_NumHAcceptors])   
    
        if(i==0):
            baseData=row
        else:
            baseData=np.vstack([baseData, row])
        i=i+1      
    
    columnNames=["MW","LogP","NumHDonors","NumHAcceptors"]   
    descriptors = pd.DataFrame(data=baseData,columns=columnNames)
    
    return descriptors

df_lipinski = lipinski(df.canonical_smiles)
print(df_lipinski)

Selected Target ID: CHEMBL220
['active', 'active', 'inactive', 'active', 'active', 'active', 'active', 'inactive', 'active', 'inactive', 'inactive', 'active', 'inactive', 'active', 'active', 'active', 'active', 'inactive', 'active', 'inactive', 'active', 'inactive', 'active', 'inactive', 'inactive', 'active', 'active', 'active', 'active', 'inactive', 'active', 'inactive', 'inactive', 'inactive', 'active', 'inactive', 'active', 'active', 'inactive', 'active', 'active', 'inactive', 'active', 'active', 'inactive', 'inactive', 'active', 'active', 'active', 'inactive', 'inactive', 'inactive', 'active', 'active', 'active', 'inactive', 'active', 'active', 'inactive', 'inactive', 'active', 'active', 'inactive', 'active', 'active', 'inactive', 'inactive', 'inactive', 'active', 'inactive', 'active', 'active', 'active', 'inactive', 'inactive', 'inactive', 'active', 'active', 'active', 'inactive', 'active', 'active', 'active', 'active', 'active', 'inactive', 'active', 'active', 'active', 'inactive

TypeError: No registered converter was able to produce a C++ rvalue of type class std::basic_string<wchar_t,struct std::char_traits<wchar_t>,class std::allocator<wchar_t> > from this Python object of type float