Skip to content

Commit

Permalink
rdkit_desc_update (#228)
Browse files Browse the repository at this point in the history
* rdkit_desc_update

1. Add a `desc_list` argument to allow selecting which descriptor features are computed by rdkit
2. Update the tests, as the length of the rdkit DescriptorFeature output becomes 208 (originally 200)

* Update fingerprint.py

* Update linux_win_env.yml

* Update macos_env.yml

Co-authored-by: Chang.Liu <TsumiNa@users.noreply.github.com>
  • Loading branch information
stewu5 and TsumiNa committed Feb 18, 2021
1 parent c2d07bc commit fb97cd7
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .github/config/linux_win_env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ dependencies:
- numpy
- scipy
- requests
- rdkit=2020.03.3.0
- rdkit=2020.09
- scikit-learn
- scipy
- pytorch >=1.7.0,<2.0.0
Expand Down
2 changes: 1 addition & 1 deletion .github/config/macos_env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ dependencies:
- numpy
- scipy
- requests
- rdkit=2020.03.3.0
- rdkit=2020.09
- scikit-learn
- scipy
- pytorch >=1.7.0,<2.0.0
Expand Down
4 changes: 2 additions & 2 deletions tests/descriptor/test_fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ def test_fps_4(data):
fps = Fingerprints(n_jobs=1, input_type='any', on_errors='nan')
ret = fps.transform(data['err_smis'])
assert isinstance(ret, pd.DataFrame)
assert ret.shape == (4, 16751)
assert ret.shape == (4, 16759)
assert np.isnan(ret.values[1][10])
assert np.isnan(ret.values[2][20])

Expand All @@ -246,7 +246,7 @@ def test_fps_6(data):
fps = Fingerprints(n_jobs=1, input_type='any', on_errors='nan', counting=True)
ret = fps.transform(data['smis'])
assert isinstance(ret, pd.DataFrame)
assert ret.shape == (4, 16751)
assert ret.shape == (4, 16759)


if __name__ == "__main__":
Expand Down
51 changes: 45 additions & 6 deletions xenonpy/descriptor/fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import Descriptors as ChemDesc
from rdkit.Chem import MACCSkeys as MAC
from rdkit.Chem import rdMolDescriptors as rdMol
from rdkit.Chem import rdmolops as rdm
Expand Down Expand Up @@ -643,8 +643,39 @@ def feature_labels(self):

class DescriptorFeature(BaseFeaturizer):

classic = ['MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'qed', 'MolWt',
'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge',
'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2',
'FpDensityMorgan3', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n',
'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3',
'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14',
'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9',
'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7',
'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2',
'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA9', 'TPSA',
'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2', 'EState_VSA3', 'EState_VSA4',
'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9', 'VSA_EState1', 'VSA_EState10',
'VSA_EState2', 'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'VSA_EState8',
'VSA_EState9', 'FractionCSP3', 'HeavyAtomCount', 'NHOHCount', 'NOCount', 'NumAliphaticCarbocycles',
'NumAliphaticHeterocycles', 'NumAliphaticRings', 'NumAromaticCarbocycles', 'NumAromaticHeterocycles',
'NumAromaticRings', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRotatableBonds',
'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles', 'NumSaturatedRings', 'RingCount', 'MolLogP',
'MolMR', 'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN', 'fr_Ar_COO', 'fr_Ar_N', 'fr_Ar_NH',
'fr_Ar_OH', 'fr_COO', 'fr_COO2', 'fr_C_O', 'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN', 'fr_Imine', 'fr_NH0',
'fr_NH1', 'fr_NH2', 'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2', 'fr_Nhpyrrole', 'fr_SH',
'fr_aldehyde', 'fr_alkyl_carbamate', 'fr_alkyl_halide', 'fr_allylic_oxid', 'fr_amide', 'fr_amidine',
'fr_aniline', 'fr_aryl_methyl', 'fr_azide', 'fr_azo', 'fr_barbitur', 'fr_benzene', 'fr_benzodiazepine',
'fr_bicyclic', 'fr_diazo', 'fr_dihydropyridine', 'fr_epoxide', 'fr_ester', 'fr_ether', 'fr_furan',
'fr_guanido', 'fr_halogen', 'fr_hdrzine', 'fr_hdrzone', 'fr_imidazole', 'fr_imide', 'fr_isocyan',
'fr_isothiocyan', 'fr_ketone', 'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone', 'fr_methoxy',
'fr_morpholine', 'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 'fr_nitro_arom_nonortho', 'fr_nitroso',
'fr_oxazole', 'fr_oxime', 'fr_para_hydroxylation', 'fr_phenol', 'fr_phenol_noOrthoHbond', 'fr_phos_acid',
'fr_phos_ester', 'fr_piperdine', 'fr_piperzine', 'fr_priamide', 'fr_prisulfonamd', 'fr_pyridine',
'fr_quatN', 'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole',
'fr_thiazole', 'fr_thiocyan', 'fr_thiophene', 'fr_unbrch_alkane', 'fr_urea']

def __init__(self, n_jobs=-1,
*, input_type='mol', on_errors='raise', return_type='any', target_col=None):
*, input_type='mol', on_errors='raise', return_type='any', target_col=None, desc_list='all'):
"""
Descriptors in RDKit (length = 200 with the ``classic`` list, 208 with the full list of rdkit v.2020.09.xx) [may include NaN]
see https://www.rdkit.org/docs/GettingStartedInPython.html#list-of-available-descriptors for the full list
Expand Down Expand Up @@ -672,12 +703,21 @@ def __init__(self, n_jobs=-1,
Specify a single column to be used for transformation.
If ``None``, all columns of the pd.DataFrame are used.
Default is None.
desc_list: string or list
List of descriptor names passed to rdkit to calculate molecule descriptors.
If ``all``, the latest list available in the installed rdkit is used. (length = 208 in rdkit v.2020.09.xx)
If ``classic``, the full list of rdkit v.2020.03.xx is used. (length = 200)
Default is ``all``.
"""
# self.arg = arg # arg[0] = radius, arg[1] = bit length
super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
self.input_type = input_type
nms = [x[0] for x in Descriptors._descList]
self.calc = MoleculeDescriptors.MolecularDescriptorCalculator(nms)
if desc_list == 'all':
self.nms = [x[0] for x in ChemDesc._descList]
elif desc_list == 'classic':
self.nms = self.classic
else:
self.nms = desc_list
self.calc = MoleculeDescriptors.MolecularDescriptorCalculator(self.nms)
self.__authors__ = ['Stephen Wu', 'TsumiNa']

def featurize(self, x):
Expand All @@ -696,8 +736,7 @@ def featurize(self, x):

@property
def feature_labels(self):
return [x[0] for x in Descriptors._descList]
# return ['desc200:' + str(i) for i in range(200)]
return self.nms


class Fingerprints(BaseDescriptor):
Expand Down

0 comments on commit fb97cd7

Please sign in to comment.