#### Import Libraries

In [100]:
import numpy as np
import math
import pandas as pd
from chembl_webresource_client.new_client import new_client
from tqdm.auto import tqdm

In [118]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

#### Connect to the ChEMBL API

In [80]:
# Bind the target, molecule and activity resources exposed by the ChEMBL
# web-resource client so later cells can query them by name.
targets_api = new_client.target
compounds_api = new_client.molecule
bioactivities_api = new_client.activity

##### In the previous notebook, we retrieved the inhibitor lists for all the targets. PIK3CA (UniProt: P42336) has the most inhibitor records, i.e. 5423 activity entries (4790 unique molecules after de-duplication).

In [81]:
# Workbook written by the previous notebook; it holds one sheet per target,
# named by UniProt accession. Adjust the path when running on another machine.
INHIBITORS_WORKBOOK = '/home/sumit/Documents/Documents/databasejmi/drug_discovery/brc_targets_inhibitors.xlsx'
PIK3CA_SHEET = 'P42336'

PIK3CA_df = pd.read_excel(INHIBITORS_WORKBOOK, sheet_name=PIK3CA_SHEET)

In [82]:
# Summarise the columns: names, non-null counts and dtypes.
PIK3CA_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5423 entries, 0 to 5422
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   activity_id         5423 non-null   int64  
 1   assay_chembl_id     5423 non-null   object 
 2   assay_description   5423 non-null   object 
 3   assay_type          5423 non-null   object 
 4   molecule_chembl_id  5423 non-null   object 
 5   relation            5423 non-null   object 
 6   standard_units      5423 non-null   object 
 7   standard_value      5423 non-null   float64
 8   target_chembl_id    5423 non-null   object 
 9   target_organism     5423 non-null   object 
 10  type                5423 non-null   object 
 11  units               5423 non-null   object 
 12  value               5423 non-null   float64
dtypes: float64(2), int64(1), object(10)
memory usage: 550.9+ KB


In [83]:
# Preview the first rows of the raw activity table.
PIK3CA_df.head()

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,standard_units,standard_value,target_chembl_id,target_organism,type,units,value
0,1410288,CHEMBL829496,Inhibition of Phosphatidylinositol 3-kinase,B,CHEMBL435507,=,nM,2400.0,CHEMBL4005,Homo sapiens,IC50,uM,2.4
1,1410288,CHEMBL829496,Inhibition of Phosphatidylinositol 3-kinase,B,CHEMBL435507,=,nM,2400.0,CHEMBL4005,Homo sapiens,IC50,uM,2.4
2,1412178,CHEMBL830973,Inhibition of human recombinant p110 alpha Pho...,B,CHEMBL98350,=,nM,2300.0,CHEMBL4005,Homo sapiens,IC50,uM,2.3
3,1412285,CHEMBL829496,Inhibition of Phosphatidylinositol 3-kinase,B,CHEMBL104468,=,nM,13000.0,CHEMBL4005,Homo sapiens,IC50,uM,13.0
4,1459741,CHEMBL831920,Inhibition of Phosphatidylinositol 3-kinase p1...,B,CHEMBL98350,=,nM,2300.0,CHEMBL4005,Homo sapiens,IC50,uM,2.3


In [84]:
# Drop, in place, every row that contains at least one missing value, then
# report the remaining size so it can be compared with the row count at load.
PIK3CA_df.dropna(axis=0, how="any", inplace=True)
print(f"DataFrame shape: {PIK3CA_df.shape}")

# No missing values detected: the shape still shows all 5423 loaded rows.

DataFrame shape: (5423, 13)


In [85]:
# List the distinct unit strings; the later pIC50 conversion assumes a single unit.
PIK3CA_df['standard_units'].unique()

array(['nM'], dtype=object)

##### Drop entries with duplicate molecule IDs.

In [86]:
# Keep only the first row seen for each molecule_chembl_id; any later rows for
# the same molecule (repeated records or measurements from other assays) are
# discarded rather than aggregated.
PIK3CA_df.drop_duplicates("molecule_chembl_id", keep="first", inplace=True)
len(PIK3CA_df) # 5423 - 4790 = 633 duplicate rows were removed.

4790

In [87]:
# Preview after de-duplication; the index now has gaps where rows were dropped.
PIK3CA_df.head()

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,standard_units,standard_value,target_chembl_id,target_organism,type,units,value
0,1410288,CHEMBL829496,Inhibition of Phosphatidylinositol 3-kinase,B,CHEMBL435507,=,nM,2400.0,CHEMBL4005,Homo sapiens,IC50,uM,2.4
2,1412178,CHEMBL830973,Inhibition of human recombinant p110 alpha Pho...,B,CHEMBL98350,=,nM,2300.0,CHEMBL4005,Homo sapiens,IC50,uM,2.3
3,1412285,CHEMBL829496,Inhibition of Phosphatidylinositol 3-kinase,B,CHEMBL104468,=,nM,13000.0,CHEMBL4005,Homo sapiens,IC50,uM,13.0
6,1472257,CHEMBL831920,Inhibition of Phosphatidylinositol 3-kinase p1...,B,CHEMBL188678,=,nM,5000.0,CHEMBL4005,Homo sapiens,IC50,uM,5.0
8,1734324,CHEMBL871583,Inhibition of human PI3Kalpha,B,CHEMBL379156,=,nM,940.0,CHEMBL4005,Homo sapiens,IC50,nM,940.0


In [88]:
# Renumber the rows consecutively from 0; drop=True discards the old index
# instead of keeping it as a column.
PIK3CA_df.reset_index(drop=True, inplace=True)
PIK3CA_df.head()  # preview with the fresh index

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,standard_units,standard_value,target_chembl_id,target_organism,type,units,value
0,1410288,CHEMBL829496,Inhibition of Phosphatidylinositol 3-kinase,B,CHEMBL435507,=,nM,2400.0,CHEMBL4005,Homo sapiens,IC50,uM,2.4
1,1412178,CHEMBL830973,Inhibition of human recombinant p110 alpha Pho...,B,CHEMBL98350,=,nM,2300.0,CHEMBL4005,Homo sapiens,IC50,uM,2.3
2,1412285,CHEMBL829496,Inhibition of Phosphatidylinositol 3-kinase,B,CHEMBL104468,=,nM,13000.0,CHEMBL4005,Homo sapiens,IC50,uM,13.0
3,1472257,CHEMBL831920,Inhibition of Phosphatidylinositol 3-kinase p1...,B,CHEMBL188678,=,nM,5000.0,CHEMBL4005,Homo sapiens,IC50,uM,5.0
4,1734324,CHEMBL871583,Inhibition of human PI3Kalpha,B,CHEMBL379156,=,nM,940.0,CHEMBL4005,Homo sapiens,IC50,nM,940.0


In [89]:
# Give the measurement column a descriptive name; the `type` column in the
# previews above shows these values are IC50 readings.
PIK3CA_df.rename(columns={'standard_value': 'IC50'}, inplace=True)

In [90]:
# Confirm the column now appears as `IC50`.
PIK3CA_df.head()

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,standard_units,IC50,target_chembl_id,target_organism,type,units,value
0,1410288,CHEMBL829496,Inhibition of Phosphatidylinositol 3-kinase,B,CHEMBL435507,=,nM,2400.0,CHEMBL4005,Homo sapiens,IC50,uM,2.4
1,1412178,CHEMBL830973,Inhibition of human recombinant p110 alpha Pho...,B,CHEMBL98350,=,nM,2300.0,CHEMBL4005,Homo sapiens,IC50,uM,2.3
2,1412285,CHEMBL829496,Inhibition of Phosphatidylinositol 3-kinase,B,CHEMBL104468,=,nM,13000.0,CHEMBL4005,Homo sapiens,IC50,uM,13.0
3,1472257,CHEMBL831920,Inhibition of Phosphatidylinositol 3-kinase p1...,B,CHEMBL188678,=,nM,5000.0,CHEMBL4005,Homo sapiens,IC50,uM,5.0
4,1734324,CHEMBL871583,Inhibition of human PI3Kalpha,B,CHEMBL379156,=,nM,940.0,CHEMBL4005,Homo sapiens,IC50,nM,940.0


##### Get SMILES from ChEMBL IDs

In [91]:
# Molecule resource of the ChEMBL client; get_smiles reads this module-level name.
mol_api = new_client.molecule

In [92]:
def get_smiles(molecule_chembl_id):
    """Look up the canonical SMILES string for one ChEMBL molecule ID.

    The lookup is best-effort: any failure is reported on stdout and turned
    into ``None`` so that a bulk ``apply`` over many IDs keeps running.

    Parameters
    ----------
    molecule_chembl_id : str
        ChEMBL molecule identifier, e.g. ``"CHEMBL435507"``.

    Returns
    -------
    str or None
        The ``canonical_smiles`` entry of the record's ``molecule_structures``
        block, or ``None`` when the record has no structure block or the
        lookup fails.
    """
    try:
        molecule = mol_api.get(molecule_chembl_id)
        structures = molecule['molecule_structures']
        # The structure block can be None/empty (presumably for entries with
        # no defined small-molecule structure - TODO confirm against the
        # ChEMBL schema). Handle that explicitly instead of relying on the
        # TypeError raised by subscripting None.
        if not structures:
            print(f"No structure available for ChEMBL ID {molecule_chembl_id}")
            return None
        return structures['canonical_smiles']
    except Exception as e:
        # Deliberately broad: network, lookup and schema errors are all
        # reported and mapped to a missing value.
        print(f"Error fetching SMILES for ChEMBL ID {molecule_chembl_id}: {str(e)}")
        return None

In [93]:
# Register tqdm with pandas so that `progress_apply` becomes available.
tqdm.pandas()

In [94]:
# Look up a SMILES string for every molecule ID, showing a progress bar.
# get_smiles returns None when a lookup fails, so failed IDs end up as missing
# values in the new column. One lookup is issued per row, so this cell is slow.
PIK3CA_df['canonical_smiles'] = PIK3CA_df['molecule_chembl_id'].progress_apply(get_smiles)

  0%|          | 0/4790 [00:00<?, ?it/s]

In [95]:
# Row count should be unchanged, with one extra column (canonical_smiles).
PIK3CA_df.shape

(4790, 14)

In [101]:
def convert_ic50_to_pic50(IC50_value):
    """Convert an IC50 expressed in nanomolar to pIC50.

    pIC50 is -log10 of the molar IC50; for a value given in nM this is
    ``9 - log10(IC50_value)``.

    Parameters
    ----------
    IC50_value : float
        IC50 in nM. Must be strictly positive; ``math.log10`` raises
        ``ValueError`` for zero or negative input.

    Returns
    -------
    float
        The corresponding pIC50.
    """
    return 9 - math.log10(IC50_value)

In [102]:
# Derive pIC50 from each IC50 value, one row at a time.
PIK3CA_df['pIC50'] = PIK3CA_df['IC50'].apply(convert_ic50_to_pic50)

In [108]:
# Same rows as before, plus the new pIC50 column.
PIK3CA_df.shape

(4790, 15)

In [104]:
# Spot-check the first converted values.
PIK3CA_df['pIC50'].head()

0    5.619789
1    5.638272
2    4.886057
3    5.301030
4    6.026872
Name: pIC50, dtype: float64

In [110]:
# Keep only the identifier, structure and potency columns needed downstream.
modelling_columns = [
    'molecule_chembl_id',
    'canonical_smiles',
    'pIC50',
    'IC50',
    'standard_units',
]
PIK3CA_df2 = PIK3CA_df[modelling_columns]

In [111]:
# Preview the reduced table.
PIK3CA_df2.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,pIC50,IC50,standard_units
0,CHEMBL435507,CC1CN(c2cc(=O)c3ccc4ccccc4c3o2)CCO1,5.619789,2400.0,nM
1,CHEMBL98350,O=c1cc(N2CCOCC2)oc2c(-c3ccccc3)cccc12,5.638272,2300.0,nM
2,CHEMBL104468,O=c1cc(N2CCOCC2)oc2c1ccc1ccccc12,4.886057,13000.0,nM
3,CHEMBL188678,O=c1cc(N2CCOCC2)oc2c(-c3cccc4c3sc3ccccc34)cccc12,5.30103,5000.0,nM
4,CHEMBL379156,O=C1NC(=O)/C(=C/c2ccc(-c3ccc(F)cc3O)o2)S1,6.026872,940.0,nM


In [116]:
# Save the reduced table to the current working directory. The row index is
# written as the first column (pandas' default for to_excel).
PIK3CA_df2.to_excel('PIK3CA.xlsx')

##### Get molecular descriptors from RDKit

In [120]:
def RDkit_descriptors(smiles):
    """Compute every descriptor in ``Descriptors._descList`` for each SMILES.

    Parameters
    ----------
    smiles : iterable
        One SMILES string per molecule. Entries that are not strings (for
        example ``None`` left by a failed lookup) or that RDKit cannot parse
        are tolerated.

    Returns
    -------
    Mol_descriptors : list of tuple
        One tuple of descriptor values per input entry, in input order.
        Entries without a usable molecule get a tuple of NaN, so the result
        stays row-aligned with the input.
    desc_names : sequence of str
        Descriptor names, in the same order as the values in each tuple.
    """
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()
    # Placeholder for entries that yield no molecule. It keeps row i of the
    # output matched to entry i of the input.
    missing_row = tuple(float("nan") for _ in desc_names)

    Mol_descriptors = []
    n_missing = 0
    for smi in smiles:
        # MolFromSmiles returns None for unparsable SMILES; non-string
        # entries are not passed to RDKit at all.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is None:
            n_missing += 1
            Mol_descriptors.append(missing_row)
            continue
        # add hydrogens to the molecule before computing descriptors
        mol = Chem.AddHs(mol)
        # Calculate all the descriptors for this molecule
        Mol_descriptors.append(calc.CalcDescriptors(mol))

    if n_missing:
        print(f"RDkit_descriptors: {n_missing} of {len(Mol_descriptors)} entries had no valid SMILES; filled with NaN")
    return Mol_descriptors, desc_names

In [121]:
# Compute descriptors for every SMILES in the reduced table; the results come
# back in the same order as the rows of PIK3CA_df2.
Mol_descriptors, desc_names = RDkit_descriptors(PIK3CA_df2['canonical_smiles'])

In [122]:
# Assemble the descriptor tuples into a table with one row per molecule and
# one column per descriptor name, then display it.
PIK3CA_descriptors = pd.DataFrame(Mol_descriptors,columns=desc_names)
PIK3CA_descriptors

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,13.300824,13.300824,0.373221,-3.873970,0.646725,295.338,278.202,295.120843,112,0,...,0,0,0,0,0,0,0,0,0,0
1,13.362025,13.362025,0.305310,-3.665314,0.728379,307.349,290.213,307.120843,116,0,...,0,0,0,0,0,0,0,0,0,0
2,13.168156,13.168156,0.247596,-3.587512,0.643252,281.311,266.191,281.105193,106,0,...,0,0,0,0,0,0,0,0,0,0
3,13.774156,13.774156,0.116155,-3.706189,0.374251,413.498,394.346,413.108564,148,0,...,0,0,0,0,0,0,0,1,0,0
4,13.864007,13.864007,0.010190,-1.434530,0.833356,305.286,297.222,305.015807,106,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4785,15.670877,15.670877,0.585865,-5.865303,0.021182,1093.295,1050.959,1092.151570,358,0,...,0,1,0,0,0,0,0,0,0,0
4786,9.121117,9.121117,0.020793,-3.868042,0.579179,364.409,344.249,364.164774,138,0,...,0,0,0,0,0,0,0,0,0,0
4787,9.096410,9.096410,0.080504,-3.809673,0.488260,384.827,367.691,384.110151,138,0,...,0,0,0,0,0,0,0,0,0,0
4788,12.893909,12.893909,0.077597,-4.211548,0.452677,425.880,405.720,425.136701,154,0,...,0,0,0,0,0,0,0,0,0,0


In [123]:
# Save the descriptor table to the current working directory. The row index is
# written as the first column (pandas' default for to_excel).
PIK3CA_descriptors.to_excel('PIK3CA_descriptors.xlsx')