In [None]:
# %pip install rdkit  # uncomment to install RDKit into the kernel's environment

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem, Draw, Descriptors
from rdkit.Chem.Draw import IPythonConsole
from sklearn.preprocessing import FunctionTransformer

In [2]:
# Load the source spreadsheet. Its first column is read as the index and then
# discarded by reset_index(drop=True), leaving a fresh 0..n-1 row index.
df = (
    pd.read_excel("data/19_35000.xlsx", index_col=0)
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Title,IC50,SMILES
0,CHEMBL2206459,1.5e-05,[H]\N=C(N)\N[C@@H](C1)[C@@H](NC(=O)C)[C@@H](C=...
1,CHEMBL3818159,1.6e-05,O=C(O)C1=C[C@H](N)[C@@H](NC(=O)C)[C@@H](C1)COC...
2,CHEMBL1956716,3.3e-05,C=CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O...
3,CHEMBL1956715,3.2e-05,CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O)O...
4,CHEMBL4444029,4.1e-05,O=C(O)c1c(O)c(=O)cc([nH]1)-c(c2C)ccc(c2)-c3noc...


In [None]:
def mol_dsc_calc(mols):
    """Compute every descriptor in the module-level ``descriptors`` dict.

    Parameters
    ----------
    mols : iterable of str
        SMILES strings, e.g. the ``df['SMILES']`` column.

    Returns
    -------
    pandas.DataFrame
        One row per input SMILES and one column per key of ``descriptors``
        (in the dict's order). When ``mols`` is a pandas Series its index is
        reused for the result, so a later ``df.join(...)`` lines up row for
        row; any other iterable gets the default RangeIndex.

    Raises
    ------
    ValueError
        If ``Chem.MolFromSmiles`` returns ``None`` for one of the inputs.
    """
    rows = []
    for smiles in tqdm(mols):
        # Parse once per molecule and reuse the Mol for every descriptor,
        # instead of re-parsing the same SMILES for each dict entry.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
        rows.append({name: func(mol) for name, func in descriptors.items()})

    # list/tuple/str expose an unrelated ``.index`` method, so check the type
    # explicitly rather than using getattr.
    index = mols.index if isinstance(mols, pd.Series) else None
    return pd.DataFrame(rows, index=index, columns=list(descriptors))

# Constitutional and physico-chemical descriptors taken from the RDKit library.
# Each keyword is the output column name; each value is the RDKit callable
# that mol_dsc_calc applies to a parsed molecule.
descriptors = dict(
    HeavyAtomCount=Descriptors.HeavyAtomCount,
    NHOHCount=Descriptors.NHOHCount,
    NOCount=Descriptors.NOCount,
    NumHAcceptors=Descriptors.NumHAcceptors,
    NumHDonors=Descriptors.NumHDonors,
    NumHeteroatoms=Descriptors.NumHeteroatoms,
    NumRotatableBonds=Descriptors.NumRotatableBonds,
    NumValenceElectrons=Descriptors.NumValenceElectrons,
    NumAromaticRings=Descriptors.NumAromaticRings,
    NumAliphaticHeterocycles=Descriptors.NumAliphaticHeterocycles,
    RingCount=Descriptors.RingCount,
    # The remaining columns use a shorter label than the RDKit function name.
    MW=Descriptors.MolWt,
    LogP=Descriptors.MolLogP,
    MR=Descriptors.MolMR,
    TPSA=Descriptors.TPSA,
)

# sklearn transformer wrapping mol_dsc_calc so it can be used as a pipeline step
descriptors_transformer = FunctionTransformer(func=mol_dsc_calc)

# Descriptor table for every SMILES in the dataset
X = descriptors_transformer.transform(df["SMILES"])
X.head()

In [4]:
# Attach the descriptor columns to the source table; rows are matched on the index.
data_dsc = df.join(X, how="left")

In [5]:
# Render the joined table to check the descriptor columns next to the source data.
data_dsc

Unnamed: 0,Title,IC50,SMILES,HeavyAtomCount,NHOHCount,NOCount,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumValenceElectrons,NumAromaticRings,NumAliphaticHeterocycles,RingCount,MW,LogP,MR,TPSA
0,CHEMBL2206459,0.000015,[H]\N=C(N)\N[C@@H](C1)[C@@H](NC(=O)C)[C@@H](C=...,24,7,9,4,6,10,7,138,0,0,1,362.367,0.38187,90.4296,157.76
1,CHEMBL3818159,0.000016,O=C(O)C1=C[C@H](N)[C@@H](NC(=O)C)[C@@H](C1)COC...,21,4,6,4,3,6,7,120,0,0,1,298.383,1.05450,79.7279,101.65
2,CHEMBL1956716,0.000033,C=CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O...,21,3,6,5,2,6,7,118,0,0,1,295.359,-0.36170,77.0531,104.48
3,CHEMBL1956715,0.000032,CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O)O...,20,3,6,5,2,6,6,114,0,0,1,283.348,-0.52780,72.5301,104.48
4,CHEMBL4444029,0.000041,O=C(O)c1c(O)c(=O)cc([nH]1)-c(c2C)ccc(c2)-c3noc...,24,4,9,6,4,9,3,122,3,0,3,329.268,1.09742,81.9775,149.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36372,CHEMBL3699085,2366.790000,CC(=O)N[C@H]1CC[C@@H]([C@@H]12)[C@@H](O[Si](C)...,44,3,10,6,3,11,8,238,2,0,4,622.839,5.88880,171.2541,127.35
36373,CHEMBL109004,2372.100000,CC(=O)Nc(cc1)c(OC(=O)C)cc1C(=O)O,17,2,6,4,2,6,3,90,1,0,1,237.211,1.26850,59.0140,92.70
36374,CHEMBL2259758,2600.970000,c1cccc(c12)cccc2CNC(=O)CCCCCCO[C@]3(C(=O)O)C[C...,61,7,17,13,7,18,21,330,4,1,5,866.987,2.32120,224.7361,250.36
36375,CHEMBL109781,2793.000000,NCCCC(=O)Nc(c(cc1)NC(=O)C)cc1C(=O)O,20,5,7,4,4,7,6,108,1,0,1,279.296,1.02060,74.6231,121.52


In [29]:
# Save the descriptor table; the row index is written as well (to_csv's default).
data_dsc.to_csv('data_dsc.csv', index=True)

In [20]:
# Register tqdm's progress-bar methods (e.g. Series.progress_apply) on pandas objects.
tqdm.pandas()

In [23]:
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors, AllChem


def rdkit_fp(smiles_column: pd.Series, radius=3, nBits=2048, useChirality=False):
    """Compute Morgan fingerprint bit vectors for a column of SMILES strings.

    Parameters
    ----------
    smiles_column : pd.Series
        SMILES strings, one molecule per element.
    radius : int, default 3
        Forwarded to ``AllChem.GetMorganFingerprintAsBitVect`` as ``radius``.
    nBits : int, default 2048
        Forwarded as ``nBits``; also the number of output columns.
    useChirality : bool, default False
        Forwarded as ``useChirality``.

    Returns
    -------
    pd.DataFrame
        One row per molecule with int16 columns ``bit_id_0`` ..
        ``bit_id_{nBits-1}``. When ``smiles_column`` is a Series its index is
        reused, so ``df.join(result)`` aligns row for row.

    Raises
    ------
    ValueError
        If ``Chem.MolFromSmiles`` returns ``None`` for one of the inputs.
    """
    # morganFP_rdkit
    def desc_gen(smiles):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(
            mol, radius=radius, nBits=nBits, useChirality=useChirality)
        # One-element placeholder: the code relies on ConvertToNumpyArray
        # resizing it to the fingerprint length.
        bit_vec = np.zeros((1,), np.int16)
        DataStructs.ConvertToNumpyArray(fingerprint, bit_vec)
        return bit_vec

    # Plain tqdm iteration (as in rdkit_2d) instead of Series.progress_apply,
    # which only exists after tqdm.pandas() has been run in another cell.
    rows = [desc_gen(smiles) for smiles in tqdm(smiles_column)]

    # reshape keeps the (0, nBits) shape for empty input.
    bits = np.array(rows, dtype=np.int16).reshape(len(rows), nBits)
    index = smiles_column.index if isinstance(smiles_column, pd.Series) else None
    return pd.DataFrame(bits, index=index, columns=[f'bit_id_{i}' for i in range(nBits)])


def rdkit_2d(smiles_column: pd.Series):
    """Compute every descriptor registered in ``Descriptors._descList``.

    Parameters
    ----------
    smiles_column : pd.Series
        SMILES strings, one molecule per element.

    Returns
    -------
    pd.DataFrame
        One row per molecule and one column per ``_descList`` entry, named
        after the entry's first element. When ``smiles_column`` is a Series
        its index is reused, so ``df.join(result)`` aligns row for row.

    Raises
    ------
    ValueError
        If ``Chem.MolFromSmiles`` returns ``None`` for one of the inputs.
    """
    # 2d_rdkit
    # Named differently from the module-level ``descriptors`` dict to avoid
    # shadowing it inside this function.
    descriptor_funcs = {entry[0]: entry[1] for entry in Descriptors._descList}

    rows = []
    for smiles in tqdm(smiles_column):
        # Parse once per molecule; previously the SMILES was re-parsed for
        # every single descriptor in the list.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
        rows.append({name: func(mol) for name, func in descriptor_funcs.items()})

    index = smiles_column.index if isinstance(smiles_column, pd.Series) else None
    return pd.DataFrame(rows, index=index, columns=list(descriptor_funcs))

In [24]:
# Morgan fingerprints for every SMILES, using rdkit_fp's default arguments.
Y = rdkit_fp(smiles_column=df['SMILES'])
Y.head()

  0%|          | 0/36377 [00:00<?, ?it/s][21:41:45] Conflicting single bond directions around double bond at index 55.
[21:41:45]   BondStereo set to STEREONONE and single bond directions set to NONE.
 98%|█████████▊| 35702/36377 [00:12<00:00, 2486.98it/s][21:41:57] Conflicting single bond directions around double bond at index 7.
[21:41:57]   BondStereo set to STEREONONE and single bond directions set to NONE.
100%|██████████| 36377/36377 [00:12<00:00, 2964.01it/s]


Unnamed: 0,bit_id_0,bit_id_1,bit_id_2,bit_id_3,bit_id_4,bit_id_5,bit_id_6,bit_id_7,bit_id_8,bit_id_9,...,bit_id_2038,bit_id_2039,bit_id_2040,bit_id_2041,bit_id_2042,bit_id_2043,bit_id_2044,bit_id_2045,bit_id_2046,bit_id_2047
0,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Attach the fingerprint columns to the source table; rows are matched on the index.
data_fp = df.join(Y, how="left")
data_fp.head()

Unnamed: 0,Title,IC50,SMILES,bit_id_0,bit_id_1,bit_id_2,bit_id_3,bit_id_4,bit_id_5,bit_id_6,...,bit_id_2038,bit_id_2039,bit_id_2040,bit_id_2041,bit_id_2042,bit_id_2043,bit_id_2044,bit_id_2045,bit_id_2046,bit_id_2047
0,CHEMBL2206459,1.5e-05,[H]\N=C(N)\N[C@@H](C1)[C@@H](NC(=O)C)[C@@H](C=...,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,CHEMBL3818159,1.6e-05,O=C(O)C1=C[C@H](N)[C@@H](NC(=O)C)[C@@H](C1)COC...,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL1956716,3.3e-05,C=CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O...,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL1956715,3.2e-05,CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O)O...,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL4444029,4.1e-05,O=C(O)c1c(O)c(=O)cc([nH]1)-c(c2C)ccc(c2)-c3noc...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Render the joined table to check the fingerprint columns next to the source data.
data_fp

Unnamed: 0,Title,IC50,SMILES,bit_id_0,bit_id_1,bit_id_2,bit_id_3,bit_id_4,bit_id_5,bit_id_6,...,bit_id_2038,bit_id_2039,bit_id_2040,bit_id_2041,bit_id_2042,bit_id_2043,bit_id_2044,bit_id_2045,bit_id_2046,bit_id_2047
0,CHEMBL2206459,0.000015,[H]\N=C(N)\N[C@@H](C1)[C@@H](NC(=O)C)[C@@H](C=...,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,CHEMBL3818159,0.000016,O=C(O)C1=C[C@H](N)[C@@H](NC(=O)C)[C@@H](C1)COC...,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL1956716,0.000033,C=CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O...,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL1956715,0.000032,CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O)O...,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL4444029,0.000041,O=C(O)c1c(O)c(=O)cc([nH]1)-c(c2C)ccc(c2)-c3noc...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36372,CHEMBL3699085,2366.790000,CC(=O)N[C@H]1CC[C@@H]([C@@H]12)[C@@H](O[Si](C)...,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
36373,CHEMBL109004,2372.100000,CC(=O)Nc(cc1)c(OC(=O)C)cc1C(=O)O,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36374,CHEMBL2259758,2600.970000,c1cccc(c12)cccc2CNC(=O)CCCCCCO[C@]3(C(=O)O)C[C...,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
36375,CHEMBL109781,2793.000000,NCCCC(=O)Nc(c(cc1)NC(=O)C)cc1C(=O)O,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Save the fingerprint table; the row index is written as well (to_csv's default).
data_fp.to_csv('data_fp.csv', index=True)

In [None]:
# Full set of RDKit 2D descriptors for every SMILES in the dataset.
Z = rdkit_2d(smiles_column=df['SMILES'])
Z.head()

In [34]:
# Attach the 2D-descriptor columns to the source table; rows are matched on the index.
data_2d = df.join(Z, how="left")
data_2d.head()

Unnamed: 0,Title,IC50,SMILES,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CHEMBL2206459,1.5e-05,[H]\N=C(N)\N[C@@H](C1)[C@@H](NC(=O)C)[C@@H](C=...,11.743425,11.743425,0.061289,-4.487931,0.216971,25.916667,362.367,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL3818159,1.6e-05,O=C(O)C1=C[C@H](N)[C@@H](NC(=O)C)[C@@H](C1)COC...,11.324462,11.324462,0.124444,-0.95902,0.651957,25.571429,298.383,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL1956716,3.3e-05,C=CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O...,11.521766,11.521766,0.02529,-1.252691,0.624022,25.333333,295.359,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL1956715,3.2e-05,CC(=O)N[C@H]([C@H](C1)N)[C@@H](C=C1C([O-])=O)O...,11.271766,11.271766,0.000752,-1.23623,0.681758,26.25,283.348,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL4444029,4.1e-05,O=C(O)c1c(O)c(=O)cc([nH]1)-c(c2C)ccc(c2)-c3noc...,11.745432,11.745432,0.243422,-1.443615,0.561612,10.708333,329.268,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# Save the 2D-descriptor table; the row index is written as well (to_csv's default).
data_2d.to_csv('data_2d.csv', index=True)