Skip to content

Commit

Permalink
Update DescriptorFeature input in fingerprint.py
Browse files Browse the repository at this point in the history
1. add "add_Hs" capability after mol conversion in DescriptorFeature
2. add compatibility of "add_Hs" and "desc_list" to Fingerprints
  • Loading branch information
stewu5 committed Feb 11, 2022
1 parent 9956135 commit 251aa26
Showing 1 changed file with 24 additions and 3 deletions.
27 changes: 24 additions & 3 deletions xenonpy/descriptor/fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -675,7 +675,7 @@ class DescriptorFeature(BaseFeaturizer):
'fr_thiazole', 'fr_thiocyan', 'fr_thiophene', 'fr_unbrch_alkane', 'fr_urea']

def __init__(self, n_jobs=-1,
*, input_type='mol', on_errors='raise', return_type='any', target_col=None, desc_list='all'):
*, input_type='mol', on_errors='raise', return_type='any', target_col=None, desc_list='all', add_Hs=False):
"""
All descriptors in RDKit (length = 200) [may include NaN]
see https://www.rdkit.org/docs/GettingStartedInPython.html#list-of-available-descriptors for the full list
Expand Down Expand Up @@ -707,10 +707,14 @@ def __init__(self, n_jobs=-1,
List of descriptor names to be called in rdkit to calculate molecule descriptors.
If ``classic``, the full list of rdkit v.2020.03.xx is used. (length = 200)
Default is to use the latest list available in the rdkit. (length = 208 in rdkit v.2020.09.xx)
add_Hs: boolean
Add hydrogen atoms to the mol format in RDKit or not.
This may affect a few physical descriptors (e.g., charge related ones).
"""
# self.arg = arg # arg[0] = radius, arg[1] = bit length
super().__init__(n_jobs=n_jobs, on_errors=on_errors, return_type=return_type, target_col=target_col)
self.input_type = input_type
self.add_Hs = add_Hs
if desc_list == 'all':
self.nms = [x[0] for x in ChemDesc._descList]
elif desc_list == 'classic':
Expand All @@ -726,12 +730,20 @@ def featurize(self, x):
x = Chem.MolFromSmiles(x)
if x is None:
raise ValueError('cannot convert Mol from SMILES %s' % x_)
if self.add_Hs:
x = Chem.AddHs(x)
if x is None:
raise ValueError('cannot add Hs to Mol for %s' % x_)
if self.input_type == 'any':
if not isinstance(x, Chem.rdchem.Mol):
x_ = x
x = Chem.MolFromSmiles(x)
if x is None:
raise ValueError('cannot convert Mol from SMILES %s' % x_)
if self.add_Hs:
x = Chem.AddHs(x)
if x is None:
raise ValueError('cannot add Hs to Mol')
return self.calc.CalcDescriptors(x)

@property
Expand All @@ -755,7 +767,9 @@ def __init__(self,
input_type='mol',
featurizers='all',
on_errors='raise',
target_col=None):
target_col=None,
desc_list='all',
add_Hs=False):
"""
Parameters
Expand Down Expand Up @@ -796,6 +810,13 @@ def __init__(self,
Specify a single column to be used for transformation.
If ``None``, all columns of the pd.DataFrame are used.
Default is None.
desc_list: string or list
List of descriptor names to be called in rdkit to calculate molecule descriptors.
If ``classic``, the full list of rdkit v.2020.03.xx is used. (length = 200)
Default is to use the latest list available in the rdkit. (length = 208 in rdkit v.2020.09.xx)
add_Hs: boolean
Add hydrogen atoms to the mol format in RDKit or not.
This may affect a few physical descriptors (e.g., charge related ones) and currently has no effect on fingerprints.
"""

super().__init__(featurizers=featurizers)
Expand All @@ -817,4 +838,4 @@ def __init__(self,
self.mol = MHFP(1, radius=radius, n_bits=n_bits,
input_type=input_type, on_errors=on_errors, target_col=target_col)
self.mol = DescriptorFeature(n_jobs, input_type=input_type,
on_errors=on_errors, target_col=target_col)
on_errors=on_errors, target_col=target_col, desc_list=desc_list, add_Hs=add_Hs)

0 comments on commit 251aa26

Please sign in to comment.