In [30]:
import pandas as pd
import numpy as np
from rdkit import Chem 
from rdkit.Chem.Crippen import MolMR, MolLogP
from rdkit.Chem.Descriptors import MolWt
from rdkit.Chem.GraphDescriptors import BalabanJ
from rdkit.Chem.GraphDescriptors import BertzCT
from rdkit.Chem.rdMolDescriptors import CalcTPSA
from rdkit.Chem.rdMolDescriptors import CalcLabuteASA
from rdkit.Chem.rdMolDescriptors import CalcNumHBA
from rdkit.Chem.rdMolDescriptors import CalcNumHBD

In [2]:
# Load the training set from the working directory and preview the first rows.
# The preview shows the columns molecule_chembl_id, Smiles, pIC50 and
# bioactivity_class (plus an 'Unnamed: 0' column carried over from the file).
train_csv_path = 'data_train.csv'
data_train = pd.read_csv(train_csv_path)
data_train.head(n=5)

Unnamed: 0,molecule_chembl_id,Smiles,pIC50,bioactivity_class
0,CHEMBL3404003,COCCCOc1cc(C(=O)N(C[C@@H]2CNC[C@H]2OC(=O)N(C)C...,7.523,1
1,CHEMBL3403999,COCCCOc1cc(C(=O)N(C[C@@H]2CNC[C@H]2OC(=O)NCc2c...,7.046,1
2,CHEMBL4109808,COCCCOc1ccc([C@@H](C)N(C(=O)[C@H]2CNCCO2)C2CC2...,7.469,1
3,CHEMBL557193,CCCCNC(=O)[C@H](C)C[C@H](O)[C@@H](N)C[C@H](Cc1...,9.0,1
4,CHEMBL4114302,COC(=O)NCCCn1nc([C@@H](C)N(C(=O)[C@H]2CNCCO2)C...,8.444,1


In [6]:
# Parse every SMILES string into an RDKit molecule object; all descriptor
# cells below read from this 'mol' column.
data_train['mol'] = [Chem.MolFromSmiles(smiles) for smiles in data_train['Smiles']]

# Chem.MolFromSmiles returns None (it does not raise) when a string cannot be
# parsed. A None here would only surface later as an opaque argument-type error
# inside a descriptor call, so fail fast and name the offending inputs instead.
unparsed_mask = data_train['mol'].isna()
if unparsed_mask.any():
    raise ValueError(
        f"RDKit could not parse {int(unparsed_mask.sum())} SMILES string(s); "
        f"first offenders: {data_train.loc[unparsed_mask, 'Smiles'].head().tolist()}"
    )

# Физико-химические дескрипторы 

In [9]:
# Physico-chemical descriptors: one new column per descriptor function.
# MolMR and MolLogP come from rdkit.Chem.Crippen, MolWt from rdkit.Chem.Descriptors.
physchem_descriptors = {
    'mr': MolMR,
    'logp': MolLogP,
    'mw': MolWt,
}
for column_name, descriptor_fn in physchem_descriptors.items():
    # Series.map calls the descriptor once per molecule, in row order.
    data_train[column_name] = data_train['mol'].map(descriptor_fn)

# Топологические дескрипторы

In [12]:
# Topological descriptor: BalabanJ (rdkit.Chem.GraphDescriptors), one value per molecule.
# NOTE(review): the head() preview later in this notebook shows 1.2e-05 for one row
# while the other rows are around 1.4-1.7 -- presumably an outlier worth checking
# (e.g. a multi-fragment SMILES); confirm against the full Smiles string.
data_train['balabanJ'] = data_train['mol'].map(BalabanJ)

In [22]:
# Topological descriptor: BertzCT (rdkit.Chem.GraphDescriptors), one value per molecule.
data_train['bertzCT'] = data_train['mol'].map(BertzCT)

# Поверхностные дескрипторы

In [24]:
# Surface descriptor: CalcTPSA (rdkit.Chem.rdMolDescriptors), one value per molecule.
data_train['tpsa'] = data_train['mol'].map(CalcTPSA)

In [27]:
# Surface descriptor: CalcLabuteASA (rdkit.Chem.rdMolDescriptors), one value per molecule.
data_train['labuteASA'] = data_train['mol'].map(CalcLabuteASA)

# Структурные дескрипторы

In [29]:
# Structural descriptor: CalcNumHBA (rdkit.Chem.rdMolDescriptors), one count per molecule.
data_train['numHBA'] = data_train['mol'].map(CalcNumHBA)

In [31]:
# Structural descriptor: CalcNumHBD (rdkit.Chem.rdMolDescriptors), one count per molecule.
data_train['numHBD'] = data_train['mol'].map(CalcNumHBD)

In [32]:
# Preview the first rows with all descriptor columns appended.
data_train.head(n=5)

Unnamed: 0,molecule_chembl_id,Smiles,pIC50,bioactivity_class,mol,mr,logp,mw,balabanJ,bertzCT,tpsa,labuteASA,numHBA,numHBD
0,CHEMBL3404003,COCCCOc1cc(C(=O)N(C[C@@H]2CNC[C@H]2OC(=O)N(C)C...,7.523,1,<rdkit.Chem.rdchem.Mol object at 0x000002E58BA...,158.8702,4.4274,584.739,1.387979,1282.128385,102.46,245.075113,9,1
1,CHEMBL3403999,COCCCOc1cc(C(=O)N(C[C@@H]2CNC[C@H]2OC(=O)NCc2c...,7.046,1,<rdkit.Chem.rdchem.Mol object at 0x000002E58BF...,133.3309,3.0685,503.596,1.63215,964.452342,111.5,211.752852,8,2
2,CHEMBL4109808,COCCCOc1ccc([C@@H](C)N(C(=O)[C@H]2CNCCO2)C2CC2...,7.469,1,<rdkit.Chem.rdchem.Mol object at 0x000002E58BF...,121.3107,2.5576,450.576,1.732491,705.729893,78.49,191.027643,7,1
3,CHEMBL557193,CCCCNC(=O)[C@H](C)C[C@H](O)[C@@H](N)C[C@H](Cc1...,9.0,1,<rdkit.Chem.rdchem.Mol object at 0x000002E58BF...,144.5599,4.3678,517.151,1.2e-05,704.149228,103.04,218.434425,6,3
4,CHEMBL4114302,COC(=O)NCCCn1nc([C@@H](C)N(C(=O)[C@H]2CNCCO2)C...,8.444,1,<rdkit.Chem.rdchem.Mol object at 0x000002E58BF...,113.4454,1.2178,430.509,1.54063,921.730758,110.61,180.812111,8,2


In [33]:
# Persist the descriptor table for downstream use.
# The 'mol' column holds live RDKit Mol objects; to_csv would write their repr
# ("<rdkit.Chem.rdchem.Mol object at 0x...>"), i.e. a memory address that changes
# on every run and cannot be parsed back. Leave it out of the file: the 'Smiles'
# column is still written, so the molecules can be rebuilt from it when needed.
# The in-memory data_train frame is not modified (drop returns a new frame).
data_train.drop(columns=['mol']).to_csv("data_descriptors.csv")