In [3]:
import os
import pandas as pd
import numpy as np
import json
from rdkit import Chem
from rdkit.Chem import AllChem
from ast import literal_eval
from rdkit.Chem import AllChem, Descriptors
from mordred import Calculator, descriptors
from rdkit.Chem.Descriptors import rdMolDescriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit import DataStructs
from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions, GetStereoisomerCount

# 正则化smiles
def smi2smi(smi):
    mol = Chem.MolFromSmiles(smi)
    new_smi = Chem.MolToSmiles(mol)
    return new_smi

def get_features_df(path= 'sp_data/', data_name = 'stdSMILES_CAS_SP', smiles_name='SMILES', target_name='total', b1=0.05, b2=0.05):
    # clean 
    ini_df = pd.read_csv(path + data_name + '.csv')
    indexs = []
    for i in range(len(ini_df)):
        smiles = ini_df[smiles_name][i]
        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            print(smiles)
        else:
            indexs.append(i)
    new_df = ini_df.loc[indexs]
    new_df.to_csv(path + data_name + '_clean.csv', index=False)
    # generate features
    df = pd.read_csv(path + data_name + '_clean.csv')
    calc = Calculator(descriptors, ignore_3D=True, version="1.0.0")
    mols = [Chem.MolFromSmiles(df[smiles_name][i]) for i in range(len(df))]
    des_df = calc.pandas(mols, nproc=8, nmols=None, quiet=True, ipynb=False, id=-1)
    # descriptors from rdkit
    # for desc_name, desc_func in Descriptors.descList:
    # print(f"{desc_name}: {desc_func.__doc__}")
    descriptor_funcs = [getattr(Descriptors, x[0]) for x in Descriptors.descList]
    des = np.array([list(map(lambda f: f(mol), descriptor_funcs)) for mol in mols])
    names = [x[0] for x in Descriptors.descList]
    des_df[names] = des
    des_df.to_csv(path + data_name + '_clean_des.csv', index=False)
    # eliminate features failed to calculate(ratio>0.05) or with a low var
    x_name =[]
    for column in des_df.columns: 
        des_df[column] = pd.to_numeric(des_df[column], errors='coerce')
        if np.sum(pd.isnull(des_df[column]))/len(des_df) < b1:
            x_name.append(column)
    names = []
    for column in x_name: 
        if des_df[column].std() / (des_df[column].mean()+1e-9) <= b2:
            pass
        else:
            names.append(column)
    result_df = des_df[names]
    if target_name is not None:
        result_df[target_name] = df[target_name]
    data_name = data_name + '_clean_des_%s'%len(names)
    result_df.to_csv(path + data_name + '.csv', index=False)
    return result_df, data_name

def get_reduce_df(path= 'sp_data/', data_name = 'stdSMILES_CAS_SP', x_name=None, y_name=None, select_num=128):
    df = pd.read_csv(path+data_name+'.csv')
    from sklearn.feature_selection import SelectKBest,f_classif, f_regression
    if x_name is None:
        x_name = df.columns[:-1]
    if y_name is None:
        y_name = df.columns[-1]
        
    X = np.array(df[x_name], dtype=float)
    Y = np.array(df[y_name], dtype=float)
    x_mean = np.nanmean(X, axis=0)
    x_std = np.nanstd(X, axis=0)
    y_mean = np.nanmean(Y, axis=0)
    y_std = np.nanstd(Y, axis=0)

    X_std = (X-x_mean)/(1e-9+x_std)
    Y_std = (Y-y_mean)/(1e-9+y_std)
    X_std[np.isnan(X_std)] = 0
    Y_std[np.isnan(Y_std)] = 0
    
    sel=SelectKBest(score_func=f_regression,k=select_num)
    sel.fit(X_std,Y_std)
    select_feature = sel.get_support(True)
    select_df = df.iloc[:,select_feature]
    cat_df = pd.concat([select_df, df[y_name]], axis=1)
    data_name = data_name+'_%s'%select_num
    cat_df.to_csv(path+data_name + '.csv', index=False)
    return cat_df, data_name

In [4]:
# to load model
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
# to calculate descriptors
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from mordred import Calculator, descriptors

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
def save(data, name):
    import pickle
    with open('%s.pkl'%name, 'wb') as f:
        pickle.dump(models, f)
def load(name):
    import pickle
    with open('%s.pkl'%name, 'rb') as f:
        return pickle.load(f)

In [40]:
data_name = 'SP_P_clean_des_977'
reduce_df, data_name = get_reduce_df(path= './sp_p/', data_name = data_name, select_num=256)

In [None]:
# 生成描述符，并消除具备较多缺省值和较低方差的描述符

# sp_d
df, data_name = get_features_df(path= './sp/', data_name = 'SP_D', smiles_name='smiles', target_name='sp_d')
reduce_df, data_name = get_reduce_df(path= './sp/', data_name = data_name, select_num=256)

In [None]:
# sp_h
df, data_name = get_features_df(path= '', data_name = 'SP_H', smiles_name='smiles', target_name='sp_h')
reduce_df, data_name = get_reduce_df(path= '', data_name = data_name, select_num=256)

In [None]:
# sp_p
df, data_name = get_features_df(path= '', data_name = 'SP_P', smiles_name='smiles', target_name='sp_p')
reduce_df, data_name = get_reduce_df(path= '', data_name = data_name, select_num=256)

In [13]:
# vis
df, data_name = get_features_df(path= './vis/', data_name = 'result', smiles_name='smiles', target_name='log_110')
reduce_df, data_name = get_reduce_df(path= './vis/', data_name = data_name, select_num=256)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# td
path = './td/'
data_name = 'td'

df1, data_name1 = get_features_df(path= path, data_name = data_name, smiles_name='SMILES_PSA', target_name='Td5')
df2, data_name2 = get_features_df(path= path, data_name = data_name, smiles_name='SMILES_PI', target_name='Td5')

name1 = set(df1.columns[:-1])
name2 = set(df2.columns[:-1])
names = list(name1 & name2)

ratio1 = np.array(pd.read_csv(path +data_name+'.csv')['Ratio_PSA']).reshape(-1,1)
ratio2 = np.array(pd.read_csv(path +data_name+'.csv')['Ratio_PI']).reshape(-1,1)
des = np.array(df1[names]) * ratio1 + np.array(df2[names]) * ratio2
df = pd.DataFrame()
df[names] = des
df['Td5'] = df1['Td5']
data_name = data_name + '_clean_des_%s'%len(names)
df.to_csv(path + data_name + '.csv', index=False)

reduce_df, data_name = get_reduce_df(path=path, data_name = data_name, select_num=256)

In [7]:
# fs
# td
path = './fs/'
data_name = 'fs'

df1, data_name1 = get_features_df(path= path, data_name = data_name, smiles_name='SMILES_PSA', target_name=data_name, b2=-1)
df2, data_name2 = get_features_df(path= path, data_name = data_name, smiles_name='SMILES_PI', target_name=data_name)

name1 = set(df1.columns[:-1])
name2 = set(df2.columns[:-1])
names = list(name1 & name2)

ratio1 = np.array(pd.read_csv(path +data_name+'.csv')['Ratio_PSA']).reshape(-1,1)
ratio2 = np.array(pd.read_csv(path +data_name+'.csv')['Ratio_PI']).reshape(-1,1)
des = np.array(df1[names]) * ratio1 + np.array(df2[names]) * ratio2
df = pd.DataFrame()
df[names] = des
df[data_name] = df1[data_name]
data_name = data_name + '_clean_des_%s'%len(names)
df.to_csv(path + data_name + '.csv', index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [00:02<00:00,  9.20it/s]
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df[target_name] = df[target_name]
 11%|████████▋                                                                          | 2/19 [00:04<01:22,  4.88s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 21%|█████████████████▍                                                                 | 4/19 [00:05<00:16,  1.12s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 79%|████████████████████████████████████████████████████████████████▋                 | 15/19 [00:09<00:01,  2.04it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 89%|█████████████████████████████████████████████████████████████████████████▎        | 17/19 [00:09<00:00,  3.43it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [00:09<00:00,  1.96it/s]
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des
  des_df[names] = des


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df[target_name] = df[target_name]
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names] = des
  df[names

In [None]:
# cause the high computation cost, done in server

# raw RFE:
# done with sklearn, see FS_RFE.py

# one step RFE
# done with code by ourselves, see feature_select.py
# the  result data are given in .csv files

In [25]:

with open('./fs/log_c.pth', 'rb') as f:
    log = pickle.load(f)
    
log_dict = {}
for i in range(19):
    for ind in range(len(log.keys())):
        if i in log_dict:
            log_dict[i].append(log[ind]['scores'][i])
        else:
            log_dict[i] = [log[ind]['scores'][i]]

log_df = pd.DataFrame(log_dict)
log_df['max'] = log_df.max(axis=1)

maxs = list(log_df['max'])
max_iter = maxs.index(max(maxs))
print(max_iter)
selected_index = log[max_iter]['feature_index']
selected_index.sort()

df = pd.read_csv('./fs/fs_clean_des_717_256_c_20.csv')
new_df = df[list(df.iloc[:,selected_index].columns) + ['fs']]
new_df.to_csv('./fs/fs_clean_des_717_256_c_20_%s.csv'%len(selected_index), index=False)

12


In [12]:
# features for predict

path = './candidates/'
data_name = 'candidates'

df1, data_name1 = get_features_df(path= path, data_name = data_name, smiles_name='SMILES_PSA', target_name=None,b2=-1)
# df2, data_name2 = get_features_df(path= path, data_name = data_name, smiles_name='SMILES_PI', target_name=None, b2=-1)

df1.to_csv(path + 'candidates_clean_des_PSA.csv', index=False)
df2.to_csv(path + 'candidates_clean_des_PI.csv', index=False)

name1 = set(df1.columns[:-1])
name2 = set(df2.columns[:-1])
names = list(name1 & name2)

ratio1 = np.array(pd.read_csv(path +data_name+'.csv')['Ratio_PSA']).reshape(-1,1)
ratio2 = np.array(pd.read_csv(path +data_name+'.csv')['Ratio_PI']).reshape(-1,1)
des = np.array(df1[names]) * ratio1 + np.array(df2[names]) * ratio2
df = pd.DataFrame()
df[names] = des
data_name = data_name + '_clean_des_%s'%len(names)
df.to_csv(path + data_name + '.csv', index=False)

  self[col] = igetitem(value, i)
  self[col] = igetitem(value, i)
