# Мини‑таск 2: Расчёт и отбор дескрипторов

### Импорт необходимых бибилиотек

In [80]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from sklearn.feature_selection import VarianceThreshold

### Загрузка датасета

In [81]:
df = pd.read_csv("data.csv")

Посмотрим на первые строки

In [82]:
df.head()

Unnamed: 0,Smiles,Molecular Weight,#RO5 Violations,AlogP,Standard Value,Activity
0,CC1(C)OC(=O)C(OC2CCCCC2)=C1c1ccc(S(C)(=O)=O)cc1,364.46,0.0,3.49,40.0,40.0
1,CCc1ccc(-c2ncc(Cl)cc2-c2ccc(S(C)(=O)=O)cc2)cn1,372.88,0.0,4.43,1700.0,1700.0
2,CCCCOC(=O)Cc1c(C)n(C(=O)c2ccc(Cl)cc2)c2ccc(OC)...,413.9,1.0,5.19,50.0,50.0
3,COc1ccc2c(c1)c(CC(=O)O)c(C)n2C(=O)c1ccc(Cl)cc1,357.79,0.0,3.93,200.0,200.0
4,CS(=O)(=O)c1ccc(-c2csc(CC(=O)O)c2-c2ccc(F)cc2)cc1,390.46,0.0,4.25,10000.0,10000.0


### Рассчёт дескрипторов RDKit

Соберём список всех доступных дескрипторов из RDKit

In [83]:
desc_list = [name for name, func in Descriptors._descList]

In [84]:
def compute_rdkit_descriptors(smi):
    mol = Chem.MolFromSmiles(smi)
    vals = []
    for name, func in Descriptors._descList:
        try:
            vals.append(func(mol))
        except:
            vals.append(np.nan)
    return vals

Применяем и собираем в DataFrame

In [85]:
desc_vals = df['Smiles'].apply(compute_rdkit_descriptors)
df_desc = pd.DataFrame(desc_vals.tolist(), columns=desc_list)

Посмотрим первые строки

In [86]:
df_desc.head()

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,12.372552,12.372552,0.037493,-3.264341,0.765113,21.32,364.463,340.271,364.134445,136,...,0,0,1,0,0,0,0,0,0,0
1,11.641862,11.641862,0.279401,-3.231616,0.679285,11.48,372.877,355.741,372.069926,128,...,0,0,1,0,0,0,0,0,0,0
2,13.229344,13.229344,0.097334,-0.303369,0.39497,10.896552,413.901,389.709,413.139386,152,...,0,0,0,0,0,0,0,0,1,0
3,12.987203,12.987203,0.170239,-0.954711,0.767807,10.84,357.793,341.665,357.076786,128,...,0,0,0,0,0,0,0,0,0,0
4,13.27026,13.27026,0.14008,-3.296291,0.706877,11.461538,390.457,375.337,390.039579,134,...,0,0,1,0,0,0,0,1,0,0


Размер датасета

In [87]:
df_desc.shape

(6392, 217)

### Рассчёт Morgan-фингерпринтов

In [88]:
def compute_morgan_fp(smi, radius=2, n_bits=1024):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return [0] * n_bits
    arr = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    return list(arr)

In [89]:
fps = df['Smiles'].map(lambda s: compute_morgan_fp(s, radius=2, n_bits=1024))
df_fp = pd.DataFrame(fps.tolist(), columns=[f'FP_{i}' for i in range(1024)])



### Объединение оригинальных колонки и дескрипторов

In [90]:
df_features = pd.concat([
    df[['Smiles', 'Standard Value', 'Activity']].reset_index(drop=True),
    df_desc.reset_index(drop=True),
    df_fp.reset_index(drop=True)
], axis=1)

### Фильтрация и отбор признаков

Выбираем только числовые признаки

In [91]:
numeric_cols = df_features.select_dtypes(include=[np.number]).columns
X_num = df_features[numeric_cols].copy()

Удаляем признаки с пропусками

In [92]:
X_num = X_num.dropna(axis=1)

Удаляем признаки с нулевой дисперсией

In [93]:
selector = VarianceThreshold(threshold=0.0)
X_var = pd.DataFrame(
    selector.fit_transform(X_num),
    columns = X_num.columns[selector.get_support()],
    index   = X_num.index
)

 Удаляем высоко скоррелированные признаки (|r| > 0.7)

In [94]:
corr = X_var.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if any(upper[col] > 0.7)]
X_sel = X_var.drop(columns=to_drop)

И обратно обьединяем данные

In [95]:
df_clean = pd.concat([
    df_features[['Smiles', 'Standard Value', 'Activity']].reset_index(drop=True),
    X_sel.reset_index(drop=True)
], axis=1)

### Сохранение итогового датасета

Первые строки датасета

In [96]:
df_clean.head()

Unnamed: 0,Smiles,Standard Value,Activity,Standard Value.1,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,FP_1014,FP_1015,FP_1016,FP_1017,FP_1018,FP_1019,FP_1020,FP_1021,FP_1022,FP_1023
0,CC1(C)OC(=O)C(OC2CCCCC2)=C1c1ccc(S(C)(=O)=O)cc1,40.0,40.0,40.0,12.372552,0.037493,-3.264341,0.765113,21.32,364.463,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,CCc1ccc(-c2ncc(Cl)cc2-c2ccc(S(C)(=O)=O)cc2)cn1,1700.0,1700.0,1700.0,11.641862,0.279401,-3.231616,0.679285,11.48,372.877,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CCCCOC(=O)Cc1c(C)n(C(=O)c2ccc(Cl)cc2)c2ccc(OC)...,50.0,50.0,50.0,13.229344,0.097334,-0.303369,0.39497,10.896552,413.901,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,COc1ccc2c(c1)c(CC(=O)O)c(C)n2C(=O)c1ccc(Cl)cc1,200.0,200.0,200.0,12.987203,0.170239,-0.954711,0.767807,10.84,357.793,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CS(=O)(=O)c1ccc(-c2csc(CC(=O)O)c2-c2ccc(F)cc2)cc1,10000.0,10000.0,10000.0,13.27026,0.14008,-3.296291,0.706877,11.461538,390.457,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Размер датасета

In [97]:
df_clean.shape

(6392, 1059)

Сохраняем в csv

In [98]:
df_clean.to_csv("new_data.csv", index=False)