In [1]:
import pandas as pd 
import os
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem




In [2]:
# Project root; the relative file names used in the cells below are resolved against it.
folder = r'D:\Xuelian\project-PFAS_new'
# Make the project root the working directory so relative reads and writes land there.
os.chdir(folder)

### 1. descriptors

#### calculate rdkit descriptors 

In [15]:
def cal_rdkit(molecules,name_col='PUBCHEM_CID'):
    """Compute every descriptor listed in ``Descriptors.descList`` for each molecule.

    Parameters
    ----------
    molecules : iterable of rdkit.Chem.Mol
        Molecules to describe. Any iterable is accepted, including a one-shot
        generator; it is materialised into a list once.
    name_col : str, default 'PUBCHEM_CID'
        Molecule property used as the row label. A molecule without that
        property is labelled with the empty string ``''``.

    Returns
    -------
    pandas.DataFrame
        One row per molecule and one column per descriptor name. NaN cells are
        replaced by the mean of their column.

    Raises
    ------
    AssertionError
        If the resulting matrix has no columns.
    """
    # The input is walked twice (descriptor rows, then row labels). Freeze it first so that a
    # generator is not exhausted by the first pass, which would leave the index empty and make
    # the DataFrame constructor fail on a length mismatch.
    molecules = list(molecules)

    descriptor_names = [desc[0] for desc in Descriptors.descList]
    calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)
    X = pd.DataFrame([list(calculator.CalcDescriptors(mol)) for mol in molecules],
                     index=[mol.GetProp(name_col) if mol.HasProp(name_col) else '' for mol in molecules],
                     columns=list(calculator.GetDescriptorNames()))
    # Impute missing values with the column mean. A column that is NaN for every
    # molecule has no mean and therefore stays NaN.
    desc_matrix = X.fillna(X.mean())

    # Sanity check on the output. fillna() never drops a column, so in practice this can only
    # fire when the descriptor list itself is empty; the message text is kept unchanged.
    assert len(desc_matrix.columns) != 0, 'All features contained at least one null value. No descriptor matrix ' \
                                          'could be generated.'

    return desc_matrix

In [16]:
# Absolute path of the SDF file holding the input structures.
sdf_file = r'D:\Xuelian\project-PFAS_new\PFAS_16new_smi.sdf'

In [17]:
# Read the SDF and drop the None entries the supplier yields (presumably records it could not parse).
mols = list(filter(lambda entry: entry is not None, Chem.SDMolSupplier(sdf_file)))

In [18]:
# Descriptor table for the loaded molecules; rows are labelled by the default 'PUBCHEM_CID' property.
rd = cal_rdkit(mols)

In [19]:
# Save the RDKit descriptor table (row labels included) in the working directory.
rd.to_csv('PFAS_16new_smi_rdkit.csv')

#### use the Dragon software to calculate molecular descriptors 

In [28]:
# Load the tab-separated Dragon export; its first column becomes the row index.
drag = pd.read_csv('PFAS_16new_Dragon.txt',sep='\t',index_col=0)

In [29]:
# Attach the row labels of the RDKit table (the PUBCHEM_CID property values) as a regular column.
# NOTE(review): an Index is assigned by position, so this presumes the Dragon rows are in the
# same order, and of the same number, as `rd` — confirm.
drag['PUBCHEM_CID'] = rd.index
# Save the augmented table as CSV.
drag.to_csv('PFAS_16new_Dragon.csv')

In [5]:
# The Dragon descriptor list: convert the tab-separated text file to CSV.
# Raw strings keep the Windows path separator literal; in a normal string '\d' is an
# invalid escape sequence that newer Python versions warn about.
descs = pd.read_csv(r'backup\dragon_desc_list.txt', sep='\t',index_col=0)
descs.to_csv(r'backup\dragon_desc_list.csv')

In [58]:
# Load the tab-separated "small" Dragon export; its first column becomes the row index.
drag_s = pd.read_csv('PFAS_16new_Dragon_small.txt',sep='\t',index_col=0)

In [60]:
# Attach the PubChem CIDs as a regular column, the same way it is done for the full Dragon table.
# `rd.index` holds the PUBCHEM_CID values read from the SDF. The previous `dragon.index` referred
# to a frame that is not defined at this point in the notebook, so the cell failed on a fresh run.
# NOTE(review): the assignment is positional — presumes the Dragon export keeps the SDF order; confirm.
drag_s['PUBCHEM_CID'] = rd.index
drag_s.to_csv('PFAS_16new_Dragon_small.csv')

In [5]:
# Load the tab-separated "xsmall" Dragon export; its first column becomes the row index.
drag_xs = pd.read_csv('PFAS_16new_Dragon_xsmall.txt',sep='\t',index_col=0)

In [6]:
# Save the "xsmall" Dragon table as CSV.
# NOTE(review): unlike the other two Dragon tables, no PUBCHEM_CID column is added here — confirm this is intended.
drag_xs.to_csv('PFAS_16new_Dragon_xsmall.csv')

#### use the MOE software to calculate molecular descriptors

In [34]:
# Load the comma-separated MOE export; every column, including the first, stays a data column here.
moe = pd.read_csv('PFAS_16new_MOE.txt',sep=',')

In [36]:
# Save the MOE table as CSV without the automatically generated integer index.
moe.to_csv('PFAS_16new_MOE.csv',index=False)

In [20]:
# Same conversion for the reduced MOE export: read the comma-separated text file ...
moe_s = pd.read_csv('PFAS_16new_MOE_small_54.txt',sep=',')
# ... and write it back as CSV without the automatically generated integer index.
moe_s.to_csv('PFAS_16new_MOE_s_54.csv',index=False)

### 2. Feature engineering  
a. remove descriptors that have 0 variance among the 16 compounds  
b. Standardization (Z-score normalization), removing the mean and scaling to unit variance.

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

In [8]:
# Reload the saved descriptor tables; the first CSV column of each file becomes the row index.
# NOTE(review): section 1 writes 'PFAS_16new_smi_rdkit.csv', not 'PFAS_16new_rdkit.csv' — confirm this file name.
rdkit = pd.read_csv('PFAS_16new_rdkit.csv',index_col=0)
# The full Dragon table has to be loaded as well: section 2.2 calls dragon.dropna(...) and
# fits a selector on it, which raised NameError while this line was commented out.
dragon = pd.read_csv('PFAS_16new_Dragon.csv',index_col=0)
dragon_s = pd.read_csv('PFAS_16new_Dragon_small.csv',index_col=0)
dragon_xs = pd.read_csv('PFAS_16new_Dragon_xsmall.csv',index_col=0)
moe = pd.read_csv('PFAS_16new_MOE.csv',index_col=0)

#### 2.1 rdkit descriptors

In [9]:
# Fit a zero-variance filter on the RDKit table, then display the fitted selector.
selector = VarianceThreshold(threshold=0).fit(rdkit)
selector

VarianceThreshold(threshold=0)

In [6]:
# Boolean mask over the input columns: True where the fitted selector keeps the column
# (variance above the threshold, which was set to 0).
indes = selector.get_support()

In [7]:
# Keep only the columns flagged by the mask; copy so later edits do not touch `rdkit`.
rdkit_vari0 = rdkit.loc[:,indes].copy()

In [10]:
# Z-score every retained RDKit descriptor (remove the column mean, scale to unit variance).
scaler = StandardScaler()
temp = scaler.fit(rdkit_vari0).transform(rdkit_vari0)
# Wrap the scaled array in a frame that carries the original row and column labels.
rdkit_vari0_st = pd.DataFrame(data=temp, columns=rdkit_vari0.columns, index=rdkit_vari0.index)

In [11]:
# Display (rows, columns) of the standardized RDKit table.
rdkit_vari0_st.shape

(16, 78)

In [12]:
# Earlier export under a different file name, kept for reference:
#rdkit_vari0_st.to_csv('PFAS_16new_rdkit_85_st.csv')
# Save the standardized RDKit table (row labels included).
rdkit_vari0_st.to_csv('PFAS_16new_rdkit_78_st.csv')

#### 2.2 Dragon descriptors

The large descriptor set, which contains more than 3000 descriptors

In [39]:
# Discard every Dragon column that has at least one missing value.
dragon = dragon.dropna(axis=1)

In [40]:
# Fit a variance filter with its default threshold on the full Dragon table, then display it.
selector = VarianceThreshold().fit(dragon)
selector

VarianceThreshold(threshold=0.0)

In [41]:
# Boolean mask over the input columns: True where the fitted selector keeps the column
# (variance above the threshold, left at its default here).
indes = selector.get_support()

In [42]:
# Keep only the columns flagged by the mask; copy so later edits do not touch `dragon`.
dragon_vari0 = dragon.loc[:,indes].copy()

In [70]:
# Display (rows, columns) of the filtered full Dragon table.
dragon_vari0.shape # too many descriptors still remain after filtering

(16, 1647)

The small descriptor set, which contains 570 descriptors

In [62]:
# Total number of missing cells in the small Dragon table.
dragon_s.isna().sum().sum()

0

In [10]:
# Fit a zero-variance filter on the "xsmall" Dragon table.
selector = VarianceThreshold(threshold=0).fit(dragon_xs)
# Boolean mask over the input columns: True where the selector keeps the column
# (variance above the threshold of 0).
indes = selector.get_support()

In [11]:
# Keep only the columns flagged by the mask; copy so later edits do not touch `dragon_xs`.
dragon_xs_vari0 = dragon_xs.loc[:,indes].copy()

In [12]:
# Display (rows, columns) of the filtered "xsmall" Dragon table.
dragon_xs_vari0.shape

(16, 50)

In [13]:
# Z-score every retained Dragon descriptor (remove the column mean, scale to unit variance).
scaler = StandardScaler()
temp = scaler.fit(dragon_xs_vari0).transform(dragon_xs_vari0)
# Wrap the scaled array in a frame that carries the original row and column labels.
dragon_xs_vari0_st = pd.DataFrame(data=temp, columns=dragon_xs_vari0.columns, index=dragon_xs_vari0.index)

In [14]:
# Earlier exports of the "small" set, kept for reference (dragon_s_vari0_st is not built in the cells shown here):
#dragon_s_vari0_st.to_csv('PFAS_16new_Dragon_small_103_st.csv')
#dragon_s_vari0_st.to_csv('PFAS_16new_Dragon_small_88_st.csv')
# Save the standardized "xsmall" Dragon table (row labels included).
dragon_xs_vari0_st.to_csv('PFAS_16new_Dragon_xsmall_50_st.csv')

#### 2.3 MOE descriptors

In [50]:
# Total number of missing cells in the MOE table.
moe.isna().sum().sum()

0

In [18]:
# Fit a zero-variance filter on the MOE table, then display the fitted selector.
selector = VarianceThreshold(threshold=0).fit(moe)
selector

VarianceThreshold(threshold=0.05)

In [19]:
# Boolean mask over the input columns: True where the fitted selector keeps the column
# (variance above the selector's threshold).
indes = selector.get_support()

In [20]:
# Keep only the columns flagged by the mask; copy so later edits do not touch `moe`.
moe_vari0 = moe.loc[:,indes].copy()

In [21]:
# Z-score every retained MOE descriptor (remove the column mean, scale to unit variance).
scaler = StandardScaler()
temp = scaler.fit(moe_vari0).transform(moe_vari0)
# Wrap the scaled array in a frame that carries the original row and column labels.
moe_vari0_st = pd.DataFrame(data=temp, columns=moe_vari0.columns, index=moe_vari0.index)

In [23]:
# Reload the reduced MOE table; this time the first CSV column becomes the row index.
moe_s = pd.read_csv('PFAS_16new_MOE_s_54.csv',index_col=0)

In [24]:
# Fit a zero-variance filter on the reduced MOE table, then display the fitted selector.
selector = VarianceThreshold(threshold=0).fit(moe_s)
selector

VarianceThreshold(threshold=0)

In [25]:
# Boolean mask of the columns the fitted selector keeps (variance above 0).
indes = selector.get_support()
# Restrict the reduced MOE table to those columns, on an independent copy.
moe_s_vari0 = moe_s.loc[:, indes].copy()

# Z-score the retained descriptors (remove the column mean, scale to unit variance).
scaler = StandardScaler()
temp = scaler.fit(moe_s_vari0).transform(moe_s_vari0)
moe_s_vari0_st = pd.DataFrame(data=temp, columns=moe_s_vari0.columns, index=moe_s_vari0.index)

# Save the standardized table (row labels included).
moe_s_vari0_st.to_csv('PFAS_16new_moe_s_54_st+ptarget-.csv')

In [24]:
# Earlier export under a different file name, kept for reference:
#moe_vari0_st.to_csv('PFAS_16new_moe_155_st.csv')
# Save the standardized full MOE table (row labels included).
moe_vari0_st.to_csv('PFAS_16new_moe_115_st.csv')

#### endpoints transformation

In [29]:
import math

In [30]:
# Load the endpoint table from the Excel workbook; its first column becomes the row index.
endpoints = pd.read_excel(r'D:\Xuelian\project-PFAS_new\original data\Targets_t.xlsx',index_col=0)

In [32]:
# Transform every endpoint value x to -log10(x).
# Series.map is applied column by column instead of DataFrame.applymap, which newer pandas
# releases deprecate; the element-wise behaviour is unchanged, so math.log10 still raises
# ValueError on zero or negative values.
endpoints_t = -endpoints.apply(lambda column: column.map(math.log10))
# Save the transformed endpoints (row labels included).
endpoints_t.to_csv('Targets_log.csv')

In [12]:
# Z-score the log-transformed endpoints (remove the column mean, scale to unit variance).
scaler = StandardScaler()
temp = scaler.fit(endpoints_t).transform(endpoints_t)
# Wrap the scaled array in a frame that carries the original row and column labels.
ep_tr_st = pd.DataFrame(data=temp, columns=endpoints_t.columns, index=endpoints_t.index)

In [13]:
# Save the standardized endpoints (row labels included).
ep_tr_st.to_csv('Targets_log_st.csv')