# Data Preparation Tutorial

Data preparation consists of three steps: Descriptor Generation, Feature Engineering, and Pair Generation.    
If you have already curated your own data, check the output file format at the end of each step and make sure your file matches what the next step expects.    
Please keep the DataFrame column names as ('smiles','value','assay').

In [1]:
import numpy as np
import pandas as pd
# Load the raw data file. At this stage the columns are still named
# 'Smiles', 'pChEMBL Value' and 'Assay ChEMBL ID' (see the preview below).
df = pd.read_csv('Data/chembl_fax.csv')
df.head()

Unnamed: 0,Smiles,pChEMBL Value,Assay ChEMBL ID
0,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,7.33,CHEMBL3885768
1,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,7.39,CHEMBL3885772
2,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,7.19,CHEMBL3885774
3,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,7.22,CHEMBL3885775
4,O=C1[C@@H](NS(=O)(=O)c2cc3ncccc3s2)CCN1Cc1cc2c...,7.56,CHEMBL3885768


## Descriptor Generation

Generate RDKIT_1D descriptors.

In [2]:
from DPDI.utilities.descriptor_generator import DescriptorGenerator

In [3]:
# Wrap the raw frame and compute the RDKit 1D descriptor columns.
DG = DescriptorGenerator(df)
df = DG.rdkit_1d()
# Persist the result; the Feature Engineering sections read this file back in.
df.to_csv('Data/chembl_fax_rdkit1d.csv',index=False)
# In the preview the first columns appear as 'smiles', 'value', 'assay',
# followed by the descriptor columns.
df.head()

Unnamed: 0,smiles,value,assay,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,7.33,CHEMBL3885768,12.841695,-3.817283,12.841695,0.03952,0.371398,455.565,434.397,...,0,1,0,0,0,0,0,1,0,0
1,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,7.39,CHEMBL3885772,12.841695,-3.817283,12.841695,0.03952,0.371398,455.565,434.397,...,0,1,0,0,0,0,0,1,0,0
2,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,7.19,CHEMBL3885774,12.841695,-3.817283,12.841695,0.03952,0.371398,455.565,434.397,...,0,1,0,0,0,0,0,1,0,0
3,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,7.22,CHEMBL3885775,12.841695,-3.817283,12.841695,0.03952,0.371398,455.565,434.397,...,0,1,0,0,0,0,0,1,0,0
4,O=C1[C@@H](NS(=O)(=O)c2cc3ncccc3s2)CCN1Cc1cc2c...,7.56,CHEMBL3885768,12.773431,-3.79032,12.773431,0.167192,0.50789,427.511,410.375,...,0,1,0,0,0,0,0,1,0,0


## Feature Engineering (with scaler) 

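Exclude columns with low variance and columns that are highly correlated with another column. This variant also standardizes the data with a scikit-learn `StandardScaler`; as the output below shows, the `value` column is scaled as well.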

In [4]:
from DPDI.utilities.feature_engineering import FeatureEngineer
# Reload the descriptor table written in the Descriptor Generation step.
df = pd.read_csv('Data/chembl_fax_rdkit1d.csv')
df.head()

Unnamed: 0,smiles,value,assay,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,7.33,CHEMBL3885768,12.841695,-3.817283,12.841695,0.03952,0.371398,455.565,434.397,...,0,1,0,0,0,0,0,1,0,0
1,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,7.39,CHEMBL3885772,12.841695,-3.817283,12.841695,0.03952,0.371398,455.565,434.397,...,0,1,0,0,0,0,0,1,0,0
2,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,7.19,CHEMBL3885774,12.841695,-3.817283,12.841695,0.03952,0.371398,455.565,434.397,...,0,1,0,0,0,0,0,1,0,0
3,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,7.22,CHEMBL3885775,12.841695,-3.817283,12.841695,0.03952,0.371398,455.565,434.397,...,0,1,0,0,0,0,0,1,0,0
4,O=C1[C@@H](NS(=O)(=O)c2cc3ncccc3s2)CCN1Cc1cc2c...,7.56,CHEMBL3885768,12.773431,-3.79032,12.773431,0.167192,0.50789,427.511,410.375,...,0,1,0,0,0,0,0,1,0,0


In [5]:
FE = FeatureEngineer(df)
# std_thd / corr_thd are the thresholds for the low-variance and
# high-correlation filters (presumably standard deviation and pairwise
# correlation -- confirm in FeatureEngineer.filter).
df = FE.filter(std_thd=0.1,corr_thd=0.9,returnfeatures=True)
# The preview shows standardized values, including the 'value' column.
df.head()
# NOTE(review): the scaled frame is not written to disk in this cell; the
# commented-out line below points at a different dataset (GLP1R) and looks
# like a leftover.
#df.to_csv('Data/GLP1R/glp1r_chembl_rdkit1d_filtered.csv',index=False)

Unnamed: 0,smiles,value,assay,MaxEStateIndex,MinEStateIndex,MinAbsEStateIndex,qed,MolWt,MaxPartialCharge,MinPartialCharge,...,fr_piperzine,fr_priamide,fr_pyridine,fr_sulfide,fr_sulfone,fr_term_acetylene,fr_thiazole,fr_thiophene,fr_unbrch_alkane,fr_urea
0,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,-0.44836,CHEMBL3885768,-0.621792,-0.820514,-0.492365,-0.450592,-0.829236,-0.91673,0.740195,...,-0.426563,-0.103807,0.657483,-0.103807,-0.080236,-0.092748,-0.386665,1.32011,-0.139876,-0.187936
1,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,-0.392466,CHEMBL3885772,-0.621792,-0.820514,-0.492365,-0.450592,-0.829236,-0.91673,0.740195,...,-0.426563,-0.103807,0.657483,-0.103807,-0.080236,-0.092748,-0.386665,1.32011,-0.139876,-0.187936
2,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,-0.578778,CHEMBL3885774,-0.621792,-0.820514,-0.492365,-0.450592,-0.829236,-0.91673,0.740195,...,-0.426563,-0.103807,0.657483,-0.103807,-0.080236,-0.092748,-0.386665,1.32011,-0.139876,-0.187936
3,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,-0.550831,CHEMBL3885775,-0.621792,-0.820514,-0.492365,-0.450592,-0.829236,-0.91673,0.740195,...,-0.426563,-0.103807,0.657483,-0.103807,-0.080236,-0.092748,-0.386665,1.32011,-0.139876,-0.187936
4,O=C1[C@@H](NS(=O)(=O)c2cc3ncccc3s2)CCN1Cc1cc2c...,-0.234101,CHEMBL3885768,-0.711091,-0.805257,1.44499,0.277246,-1.288409,-0.915685,1.229818,...,-0.426563,-0.103807,2.16905,-0.103807,-0.080236,-0.092748,-0.386665,1.32011,-0.139876,-0.187936


In [6]:
# The scaler and the list of retained features are kept on FE so they can be
# reused when predicting on new data.
print(FE.scaler) # call FE.scaler.transform() on new data before prediction
print(len(FE.featurelist)) # number of feature names remaining after filtering

StandardScaler()
129


## Feature Engineering (without scaler) 

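In this variant the features are only filtered, not scaled: the saved file keeps the original values, and scaling is applied in a separate step before model training.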

In [7]:
from DPDI.utilities.feature_engineering import FeatureEngineer

df = pd.read_csv('Data/chembl_fax_rdkit1d.csv')
# Keep an untouched copy: the next cell selects the retained columns from df_
# so that the saved values stay unscaled.
df_ = df.copy()
df.head()

Unnamed: 0,smiles,value,assay,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,7.33,CHEMBL3885768,12.841695,-3.817283,12.841695,0.03952,0.371398,455.565,434.397,...,0,1,0,0,0,0,0,1,0,0
1,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,7.39,CHEMBL3885772,12.841695,-3.817283,12.841695,0.03952,0.371398,455.565,434.397,...,0,1,0,0,0,0,0,1,0,0
2,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,7.19,CHEMBL3885774,12.841695,-3.817283,12.841695,0.03952,0.371398,455.565,434.397,...,0,1,0,0,0,0,0,1,0,0
3,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,7.22,CHEMBL3885775,12.841695,-3.817283,12.841695,0.03952,0.371398,455.565,434.397,...,0,1,0,0,0,0,0,1,0,0
4,O=C1[C@@H](NS(=O)(=O)c2cc3ncccc3s2)CCN1Cc1cc2c...,7.56,CHEMBL3885768,12.773431,-3.79032,12.773431,0.167192,0.50789,427.511,410.375,...,0,1,0,0,0,0,0,1,0,0


In [8]:
# Run the filter only to populate FE.featurelist; the scaled frame it returns
# is not needed in this variant.
FE = FeatureEngineer(df)
FE.filter(std_thd=0.1,corr_thd=0.9,returnfeatures=True)
# Identifier/target columns plus the retained features, taken from the
# unscaled copy (df_) so the saved values are the original ones.
columns = ['smiles','value','assay', *FE.featurelist]
df = df_[columns]
df.to_csv('Data/chembl_fax_rdkit1d_filtered.csv',index=False)
df.head()

Unnamed: 0,smiles,value,assay,MaxEStateIndex,MinEStateIndex,MinAbsEStateIndex,qed,MolWt,MaxPartialCharge,MinPartialCharge,...,fr_piperzine,fr_priamide,fr_pyridine,fr_sulfide,fr_sulfone,fr_term_acetylene,fr_thiazole,fr_thiophene,fr_unbrch_alkane,fr_urea
0,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,7.33,CHEMBL3885768,12.841695,-3.817283,0.03952,0.371398,455.565,0.250295,-0.383682,...,0,0,1,0,0,0,0,1,0,0
1,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,7.39,CHEMBL3885772,12.841695,-3.817283,0.03952,0.371398,455.565,0.250295,-0.383682,...,0,0,1,0,0,0,0,1,0,0
2,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,7.19,CHEMBL3885774,12.841695,-3.817283,0.03952,0.371398,455.565,0.250295,-0.383682,...,0,0,1,0,0,0,0,1,0,0
3,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,7.22,CHEMBL3885775,12.841695,-3.817283,0.03952,0.371398,455.565,0.250295,-0.383682,...,0,0,1,0,0,0,0,1,0,0
4,O=C1[C@@H](NS(=O)(=O)c2cc3ncccc3s2)CCN1Cc1cc2c...,7.56,CHEMBL3885768,12.773431,-3.79032,0.167192,0.50789,427.511,0.250377,-0.355654,...,0,0,2,0,0,0,0,1,0,0


## Normalize Data with Minmax Scaler

In [9]:
# Reload the filtered, unscaled table saved at the end of the previous section.
df = pd.read_csv('Data/chembl_fax_rdkit1d_filtered.csv')

In [10]:
# Normalize all rows together, regardless of group
# (presumably the group is the assay -- confirm in FeatureEngineer.normalize).
FE = FeatureEngineer(df)
# The path passed to normalize() is where the per-column min/max values are
# saved, so the transform can be reversed when making predictions later.
df = FE.normalize('Data/minmax.csv')
# Display the min/max table kept on the FeatureEngineer instance.
FE.minmax

Unnamed: 0,value,MaxEStateIndex,MinEStateIndex,MinAbsEStateIndex,qed,MolWt,MaxPartialCharge,MinPartialCharge,FpDensityMorgan1,BCUT2D_MWHI,...,fr_piperzine,fr_priamide,fr_pyridine,fr_sulfide,fr_sulfone,fr_term_acetylene,fr_thiazole,fr_thiophene,fr_unbrch_alkane,fr_urea
min,5.15,12.011251,-5.127858,0.0,0.112995,379.869,0.240814,-0.618693,0.722222,16.149901,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,9.72,15.468666,-0.034857,0.342591,0.865972,706.294,0.572606,-0.319324,1.4,79.918732,...,2.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
# Save the normalized table and preview it; in the preview the 'value'
# column is rescaled along with the features.
df.to_csv('Data/chembl_fax_rdkit1d_normalized.csv',index=False)
df.head()

Unnamed: 0,smiles,value,assay,MaxEStateIndex,MinEStateIndex,MinAbsEStateIndex,qed,MolWt,MaxPartialCharge,MinPartialCharge,...,fr_piperzine,fr_priamide,fr_pyridine,fr_sulfide,fr_sulfone,fr_term_acetylene,fr_thiazole,fr_thiophene,fr_unbrch_alkane,fr_urea
0,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,0.477024,CHEMBL3885768,0.240192,0.257329,0.115358,0.343176,0.231894,0.028575,0.785021,...,0.0,0.0,0.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,0.490153,CHEMBL3885772,0.240192,0.257329,0.115358,0.343176,0.231894,0.028575,0.785021,...,0.0,0.0,0.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,0.446389,CHEMBL3885774,0.240192,0.257329,0.115358,0.343176,0.231894,0.028575,0.785021,...,0.0,0.0,0.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,N=C(N)c1cccc(CN2CC[C@H](NS(=O)(=O)c3ccc(-c4ccn...,0.452954,CHEMBL3885775,0.240192,0.257329,0.115358,0.343176,0.231894,0.028575,0.785021,...,0.0,0.0,0.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,O=C1[C@@H](NS(=O)(=O)c2cc3ncccc3s2)CCN1Cc1cc2c...,0.527352,CHEMBL3885768,0.220448,0.262623,0.488022,0.524445,0.145951,0.028822,0.878645,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
