In [81]:
import numpy as np
import pandas as pd
from rdkit import rdBase, Chem
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from mordred import Calculator, descriptors
import mordred

In [98]:
# Load the raw reaction table; the first CSV column is used as the row index.
df = pd.read_csv('./data/origin.csv',index_col=0)
# Bare expression so the notebook renders the loaded table as the cell output.
df

Unnamed: 0,entry,R1-,organocatalyst,organocatalyst(mol%),temp(℃),time(h),yield,R2_C,R2_H,under_O2,under_air,solvent_CH3CN,solvent_MeOH,solvent_toluene
0,1,[*]C1=CC=CC=C1,ClC1=CC=C(C(O)=O)C(O)=C1,5.0,90,24.0,81.0,0,1,1,0,0,0,1
1,2,[*]C1=CC=CC=C1,ClC1=CC=C(C(O)=O)C(O)=C1,5.0,90,12.0,14.0,0,1,1,0,0,0,1
2,3,[*]C1=CC=CC=C1,OC1=CC(C)=CC=C1C(O)=O,5.0,90,24.0,74.0,0,1,1,0,0,0,1
3,4,[*]C1=CC=CC=C1,OC1=CC(C)=CC=C1C(O)=O,5.0,90,12.0,15.0,0,1,1,0,0,0,1
4,5,[*]C1=CC=CC=C1,OC1=CC(OC)=CC=C1C(O)=O,5.0,90,24.0,90.0,0,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,2i,[*]C1=CC(Br)=CC=C1,O=C1C(C=C(OC)C(C(C)(C)C)=C1)=O,10.0,23,72.0,95.0,1,0,1,0,1,0,0
116,2j,[*]C1=CC=CC(OC)=C1,O=C1C(C=C(OC)C(C(C)(C)C)=C1)=O,10.0,23,48.0,89.0,1,0,1,0,1,0,0
117,2k,[*]C1=CC(C)=C(C)C=C1,O=C1C(C=C(OC)C(C(C)(C)C)=C1)=O,10.0,23,72.0,99.0,1,0,1,0,1,0,0
118,2m,[*]C1=CSC=C1,O=C1C(C=C(OC)C(C(C)(C)C)=C1)=O,10.0,60,48.0,83.0,1,0,1,0,1,0,0


In [99]:
# Parse each SMILES string into an RDKit molecule, one list per structure column.
mols_r1 = [Chem.MolFromSmiles(smiles) for smiles in df['R1-'].values]
mols_organocatalyst = [Chem.MolFromSmiles(smiles) for smiles in df['organocatalyst'].values]

# RDKit記述子に変換

In [100]:
def toFingerRdkit(mols, prefix):
    """Compute the RDKit descriptor table for a sequence of molecules.

    Parameters
    ----------
    mols : iterable of rdkit.Chem.Mol
        Molecules to describe, one output row per molecule.
    prefix : str
        Tag appended to every column name as ``<descriptor>_<prefix>`` so that
        tables for different molecule roles can sit side by side.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``Descriptors.descList``, minus any column
        containing a null value. The index is the default RangeIndex.
    """
    # The calculator resolves descriptor functions by name, so it must receive
    # the unmodified RDKit names. Passing the suffixed names matches no real
    # descriptor and yields a constant placeholder instead of computed values.
    base_names = [name for name, _ in Descriptors.descList]
    descriptor_calculator = MoleculeDescriptors.MolecularDescriptorCalculator(base_names)
    descriptor_rows = [descriptor_calculator.CalcDescriptors(mol) for mol in mols]

    # The suffix is applied only to the output column labels.
    column_names = [f'{name}_{prefix}' for name in base_names]
    df_rdkit = pd.DataFrame(descriptor_rows, columns=column_names)

    # Keep only the descriptors that are defined for every molecule.
    return df_rdkit.loc[:, ~df_rdkit.isnull().any()]

In [101]:
# Build one RDKit descriptor table per molecule role; the second argument is
# the tag that toFingerRdkit appends to each column name.
df_rdkit_r1 = toFingerRdkit(mols_r1, 'r1')
df_rdkit_organocatalyst = toFingerRdkit(mols_organocatalyst, 'organocatalyst')

In [86]:
# Report (rows, columns) of each RDKit descriptor table, one per line.
print(df_rdkit_r1.shape, df_rdkit_organocatalyst.shape, sep='\n')

(120, 208)
(120, 208)


# Mordred記述子への変換を行う

In [87]:
def toFingerMordred(mols, prefix=None):
    """Compute the 2D mordred descriptor table for a sequence of molecules.

    Parameters
    ----------
    mols : iterable of rdkit.Chem.Mol
        Molecules to describe, one output row per molecule.
    prefix : str, optional
        When given, every column is renamed to ``<descriptor>_<prefix>``
        (the same convention as ``toFingerRdkit``) so that tables for
        different molecule roles do not share column names when concatenated.
        The default ``None`` leaves the mordred names untouched.

    Returns
    -------
    pandas.DataFrame
        Descriptor columns that contain no null value after numeric casting.
    """
    mordred_calculator = Calculator(descriptors, ignore_3D=True)
    df_mordred = mordred_calculator.pandas(pd.Series(mols))

    # Object-dtype columns are cast to float32 so that non-numeric entries
    # become NaN and can be filtered below.
    # NOTE(review): presumably these hold mordred's missing/error markers - confirm.
    for column in df_mordred.columns:
        if df_mordred[column].dtypes == object:
            df_mordred[column] = df_mordred[column].values.astype(np.float32)

    # Keep only the descriptors that are defined for every molecule.
    df_mordred = df_mordred.loc[:, ~df_mordred.isnull().any()]

    if prefix is not None:
        df_mordred = df_mordred.add_suffix(f'_{prefix}')
    return df_mordred

In [64]:
# Build one mordred descriptor table per molecule role.
# NOTE(review): both tables are produced with the same descriptor names, so
# their columns collide when the two are concatenated side by side.
df_mordred_r1 = toFingerMordred(mols_r1)
df_mordred_organocatalyst = toFingerMordred(mols_organocatalyst)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 120/120 [00:10<00:00, 11.94it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 120/120 [00:23<00:00,  5.10it/s]


In [102]:
# Persist each descriptor table as CSV (index included) under ./data/method4/.
# NOTE(review): assumes the ./data/method4/ directory already exists.
df_rdkit_r1.to_csv('./data/method4/rdkit_r1.csv')
df_rdkit_organocatalyst.to_csv('./data/method4/rdkit_organocatalyst.csv')
df_mordred_r1.to_csv('./data/method4/mordred_r1.csv')
df_mordred_organocatalyst.to_csv('./data/method4/mordred_organocatalyst.csv')

In [103]:
# Drop the identifier and raw SMILES columns; the structures are represented by
# descriptor columns instead.
df_dropped = df.drop(columns=['entry', 'R1-', 'organocatalyst'])
# Labels used to build the output file names, paired by position with the
# descriptor tables in the zip() calls below.
r1_names = ['rdkit_r1','mordred_r1']
organo_names = ['rdkit_organo', 'mordred_organo']

# Write one dataset per (R1 descriptor set, organocatalyst descriptor set)
# combination: 2 x 2 = 4 CSV files.
for r1_name, df_r1 in zip(r1_names, [df_rdkit_r1,df_mordred_r1]):
    for organo_name, df_organocatalyst in zip(organo_names, [df_rdkit_organocatalyst, df_mordred_organocatalyst]):
        # Column-wise concat aligns rows on the index.
        # NOTE(review): assumes df_dropped's index matches the 0..n-1 index of
        # the descriptor tables - confirm, otherwise rows are padded with NaN.
        # NOTE(review): the two mordred tables share column names, so the
        # mordred/mordred dataset is written with duplicate headers.
        _df = pd.concat([df_dropped,df_r1, df_organocatalyst], axis=1)
        _df.to_csv(f'./data/method4/dataset_{r1_name}_{organo_name}.csv')


# 特徴量選択
1. 分散0の説明変数を除去
2. 9割以上が同じ値になる記述子を削除
3. 相関係数が0.95以上の説明変数の組がある場合、どちらか一方を除去
4. オートスケーリング

In [115]:
from sklearn.feature_selection import VarianceThreshold

def selectFeature(df, st_threshold=0.9):
    """Remove uninformative columns from a numeric feature table.

    Two filters are applied in order:

    1. Columns with zero variance are removed (``VarianceThreshold`` with its
       default threshold).
    2. Columns whose most frequent value accounts for ``st_threshold`` or more
       of the rows are removed.

    The correlation filter and autoscaling listed in the notebook's plan are
    not performed here.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; all columns must be numeric for ``VarianceThreshold``.
    st_threshold : float, default 0.9
        Share of rows at or above which a single dominant value causes the
        column to be dropped.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to the surviving columns, in their original order.
    """
    # Step 1: drop constant columns. Only the support mask is needed, so the
    # transformed array is not kept.
    select = VarianceThreshold()
    select.fit(df.values)
    # Positional boolean selection stays correct even if labels repeat.
    df_selected = df.loc[:, select.get_support()]

    # Step 2: drop columns dominated by one value.
    n_rows = len(df_selected)
    keep = []
    for position in range(df_selected.shape[1]):
        value_counts = df_selected.iloc[:, position].value_counts(sort=True)
        # sort=True puts the most frequent value first; NaN is not counted.
        score = value_counts.values[0] / n_rows if len(value_counts) else 1.0
        keep.append(score < st_threshold)

    return df_selected.loc[:, keep]

In [116]:
# Reload the mordred(R1) + mordred(organocatalyst) dataset from disk.
# NOTE(review): this rebinds `df`, which held the raw reaction table until here.
df = pd.read_csv('./data/method4/dataset_mordred_r1_mordred_organo.csv',index_col=0)

In [117]:
# Run feature selection on the reloaded dataset.
# NOTE(review): the AttributeError recorded under this cell mentions `.count`,
# which the current selectFeature body never calls - the output looks stale;
# re-run after restarting the kernel.
selectFeature(df)

AttributeError: 'numpy.ndarray' object has no attribute 'count'