# 2019-01-17 joergkurtwegner: generalizing from SMARTS to more verbose RDKit BuildFeatureFactory from argparse import Namespace import numpy as np from typing import List, Union from rdkit import Chem from rdkit import RDConfig from rdkit.Chem import ChemicalFeatures import os def get_num_functional_groups(args: Namespace): count=None if 'smarts' in args.additional_atom_features: ### naive file reader without comments #with open(args.functional_group_smarts, 'r') as f: # count = len(f.readlines()) ### file reader with comments smarts=[] with open(args.functional_group_smarts, 'r') as txtfile: for line in txtfile: line=line.strip() if not line.startswith('#'): smarts.append(line) count = len(smarts) elif 'family_and_type' in args.additional_atom_features: if '{RDDataDir}/' in args.atom_features_family_and_type: fdef=os.path.join(RDConfig.RDDataDir,args.atom_features_family_and_type.replace('{RDDataDir}/','')) featFactory = ChemicalFeatures.BuildFeatureFactory(fdef) else: featFactory = ChemicalFeatures.BuildFeatureFactory(args.atom_features_family_and_type) count=len(featFactory.GetFeatureFamilies())+len(featFactory.GetFeatureDefs().items()) return count class FunctionalGroupFeaturizer: """ Class for extracting feature vector of indicators for atoms being parts of functional groups. """ def __init__(self, args: Namespace): self.smarts = None self.featFactory = None if 'smarts' in args.additional_atom_features: ### naive file reader without comments #with open(args.functional_group_smarts, 'r') as f: # for line in f: # self.smarts.append(Chem.MolFromSmarts(line.strip())) ### file reader with comments self.smarts = [] with open(args.functional_group_smarts, 'r') as txtfile: for line in txtfile: line=line.strip() if not line.startswith('#'): self.smarts.append(Chem.MolFromSmarts(line)) elif 'family_and_type' in args.additional_atom_features: self.featFactory = [] if '{RDDataDir}/' in args.atom_features_family_and_type: fdef=os.path.join(RDConfig.RDDataDir,args.atom_features_family_and_type.replace('{RDDataDir}/','')) self.featFactory = ChemicalFeatures.BuildFeatureFactory(fdef) else: self.featFactory = ChemicalFeatures.BuildFeatureFactory(args.atom_features_family_and_type) self.featFactoryMap=dict() for i, x in enumerate(featFactory.GetFeatureFamilies()): featFactoryMap[x]=i for i, (x, y) in enumerate(featFactory.GetFeatureDefs().items()): featFactoryMap[x]=i+len(featFactory.GetFeatureFamilies()) def featurize(self, smiles: Union[Chem.Mol, str]) -> List[List[int]]: """ Given a molecule in SMILES form, return a feature vector of indicators for each atom, indicating whether the atom is part of each functional group. Can also directly accept a Chem molecule. Searches through the functional groups given in smarts.txt. :param smiles: A smiles string representing a molecule. :return: Numpy array of shape num_atoms x num_features (num functional groups) """ if type(smiles) == str: mol = Chem.MolFromSmiles(smiles) # turns out rdkit knows to match even without adding Hs else: mol = smiles if self.smarts != None: features = np.zeros((mol.GetNumAtoms(), len(self.smarts))) # num atoms (without Hs) x num features for i, smarts in enumerate(self.smarts): for group in mol.GetSubstructMatches(smarts): for idx in group: features[idx][i] = 1 elif self.featFactory != None: features = np.zeros((mol.GetNumAtoms(), len(featFactory.GetFeatureFamilies())+len(featFactory.GetFeatureDefs().items()))) # num atoms (without Hs) x num features feats = self.featFactory.GetFeaturesForMol(mol) for i in range(len(feats)): #print(feats[i].GetAtomIds(),feats[i].GetFamily(),feats[i].GetType()) for idx in range(mol.GetNumAtoms()): if(idx in feats[i].GetAtomIds()): featureId=self.featFactoryMap[feats[i].GetFamily()] features[idx][featureId] = 1 featureId=self.featFactoryMap[feats[i].GetType()] features[idx][featureId] = 1 return features.tolist() if __name__ == '__main__': featurizer = FunctionalGroupFeaturizer() features = featurizer.featurize('C(#N)C(=O)C#N')