In [2]:

from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
import pathlib, pickle, os, tqdm, torch
import multiprocessing, collections

from collections import defaultdict  

# Radius passed to the Morgan generator (number of growth iterations) and
# upper bound of the radius loop used by the fragment enumeration below.
RADIUS_UPPER_LIMIT = 10
# Presumably the dataset root folder, dataset sub-folders and index sources.
# NOTE(review): none of these three constants is referenced by the cells visible here.
DATASET_root_path = pathlib.Path("/workspace/")
DATASETS = ["OneD_Only_Dataset", "SMILES_dataset"]
DATASET_INDEX_SOURCE = ["oneD_NMR" , "HSQC"]

from rdkit.Chem import rdFingerprintGenerator
# Module-level Morgan generator shared by the functions defined below.
gen = rdFingerprintGenerator.GetMorganGenerator(radius=RADIUS_UPPER_LIMIT)
# AdditionalOutput with its bit-info map allocated; after gen.GetFingerprint(...,
# additionalOutput=ao) the map is read back with ao.GetBitInfoMap() and iterated
# as bit id -> sequence of (atom index, radius) pairs.
ao = rdFingerprintGenerator.AdditionalOutput()
ao.AllocateBitInfoMap()

from rdkit import RDLogger

RDLogger.DisableLog('rdApp.*')  # Disable all RDKit warnings


def isomeric_to_canonical_smiles(isomeric_smiles):
    """Return the canonical SMILES of ``isomeric_smiles`` with stereochemistry removed.

    Args:
        isomeric_smiles: SMILES string to parse.

    Returns:
        The canonical, stereo-free SMILES string, or ``None`` when the input
        cannot be parsed or stereochemistry removal fails.
    """
    try:
        mol = Chem.MolFromSmiles(isomeric_smiles)
        # MolFromSmiles signals a parse failure by returning None; test for it
        # explicitly instead of relying on RemoveStereochemistry(None) raising.
        if mol is None:
            return None
        Chem.RemoveStereochemistry(mol)
    except Exception:
        # Keep the best-effort contract (any RDKit error -> None) without
        # swallowing KeyboardInterrupt / SystemExit like a bare ``except`` does.
        return None

    return Chem.MolToSmiles(mol, canonical=True)

# step 1: find all fragments of the entire training set
def circular_substructures_method_compare(smiles):
    """Print how two SMILES renderings of each Morgan environment survive re-canonicalisation.

    For every (atom index, radius) pair recorded in the Morgan bit-info map of
    ``smiles``, the environment is written as SMILES in two ways:

    * ``Chem.PathToSubmol`` followed by ``Chem.MolToSmiles``
    * ``Chem.MolFragmentToSmiles`` on the atoms touched by the environment's bonds

    and, for each rendering, the function prints whether the string is equal to
    its own ``isomeric_to_canonical_smiles`` round-trip.

    Args:
        smiles: SMILES string of the molecule to inspect.

    Returns:
        A pair of empty accumulators ``(defaultdict(int), {})`` when ``smiles``
        cannot be parsed; otherwise ``None`` (the function only prints).
        NOTE(review): the accumulators are never filled in; confirm whether the
        success path is meant to return them as well.
    """
    circular_substructures_counts = defaultdict(int) # radius to smiles
    substrucure_radius = {}
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print(f"Failed to parse {smiles}")
        return circular_substructures_counts, substrucure_radius
    mol = Chem.AddHs(mol)
    Chem.Kekulize(mol, clearAromaticFlags=True)

    # Only the bit-info side effect on `ao` is needed; the fingerprint itself is discarded.
    gen.GetFingerprint(mol, additionalOutput=ao)
    info = ao.GetBitInfoMap()

    for _bit_id, atom_envs in info.items():
        for atom_idx, curr_radius in atom_envs:
            # Bond indices of the circular environment around atom_idx.
            env = Chem.FindAtomEnvironmentOfRadiusN(mol, curr_radius, atom_idx)
            submol = Chem.PathToSubmol(mol, env)
            smiles_from_submol = Chem.MolToSmiles(submol, canonical=True)
            if isomeric_to_canonical_smiles(smiles_from_submol) is None:
                print("failed to convert smiles_from_submol back to canonical smiles", smiles_from_submol)
                continue

            # Collect the atoms touched by the environment's bonds.
            atom_indices = set()
            for bond_idx in env:
                bond = mol.GetBondWithIdx(bond_idx)
                atom_indices.add(bond.GetBeginAtomIdx())
                atom_indices.add(bond.GetEndAtomIdx())
            if not atom_indices:
                # Empty environment: nothing to compare.
                continue

            smiles_from_fragments = Chem.MolFragmentToSmiles(mol, atom_indices, canonical=True)
            # Validate the fragment rendering here; the original re-checked
            # smiles_from_submol, which was already validated above.
            if isomeric_to_canonical_smiles(smiles_from_fragments) is None:
                continue
            print(f"{smiles_from_fragments == isomeric_to_canonical_smiles(smiles_from_fragments)=}")
            print(f"{smiles_from_submol == isomeric_to_canonical_smiles(smiles_from_submol)=}")
            
           

In [3]:
# NOTE(review): this repeats the radius / generator / bit-info setup already done
# in the In [2] cell with the same values; it rebinds the same module-level names.
RADIUS_UPPER_LIMIT = 10
from rdkit.Chem import rdFingerprintGenerator
# Morgan generator and the AdditionalOutput whose bit-info map get_fragments_v1 reads.
gen = rdFingerprintGenerator.GetMorganGenerator(radius=RADIUS_UPPER_LIMIT)
ao = rdFingerprintGenerator.AdditionalOutput()
ao.AllocateBitInfoMap()


def get_fragments_v1(SMILES):
    """Collect the circular fragments of a molecule from the Morgan bit-info map.

    The molecule is kekulized (aromatic flags cleared) and hydrogens are added.
    Each (atom index, radius) pair reported by the module-level generator
    ``gen`` through ``ao`` is turned into a sub-molecule, written as SMILES and
    re-parsed as an independent molecule so that the stored string does not
    depend on the atom/bond ordering of the parent molecule.

    NOTE(review): ``gen`` is built with the default
    ``includeRedundantEnvironments=False``; this may be why the notebook shows
    this function returning fewer fragments than ``get_fragments_v2`` for one
    molecule -- confirm.

    Args:
        SMILES: SMILES string of the molecule.

    Returns:
        A set of fragment SMILES strings (never containing ``""``), or ``None``
        when ``SMILES`` cannot be parsed.
    """
    mol = Chem.MolFromSmiles(SMILES)
    if mol is None:
        print(f"Failed to parse {SMILES}")
        return None
    Chem.Kekulize(mol, clearAromaticFlags=True)
    mol = Chem.AddHs(mol)

    # Only the bit-info side effect on `ao` is needed; the fingerprint itself is discarded.
    gen.GetFingerprint(mol, additionalOutput=ao)
    info = ao.GetBitInfoMap()

    frags = set()
    for _bit_id, atom_envs in info.items():
        for atom_idx, curr_radius in atom_envs:
            # Bond indices of the circular environment around atom_idx.
            env = Chem.FindAtomEnvironmentOfRadiusN(mol, curr_radius, atom_idx)
            if len(env) == 0:
                # An empty environment only yields the "" SMILES, which is discarded anyway.
                continue
            submol = Chem.PathToSubmol(mol, env)
            # Canonical only relative to the fragment's atom/bond index mapping ...
            fragment_smiles = Chem.MolToSmiles(submol, canonical=True)
            # ... so re-parse it and canonicalise it as an independent molecule.
            reparsed = Chem.MolFromSmiles(fragment_smiles)
            if reparsed is None:
                # MolFromSmiles returns None on failure; MolToSmiles(None) would raise.
                print(f"Failed to re-parse fragment {fragment_smiles}")
                continue
            frags.add(Chem.MolToSmiles(reparsed, canonical=True))

    frags.discard("")
    return frags

In [5]:
# Show the GetMorganGenerator signature and defaults (the output lists
# includeRedundantEnvironments=False among them).
help(rdFingerprintGenerator.GetMorganGenerator)

Help on built-in function GetMorganGenerator in module rdkit.Chem.rdFingerprintGenerator:

GetMorganGenerator(...)
    GetMorganGenerator([  (int)radius=3 [, (bool)countSimulation=False [, (bool)includeChirality=False [, (bool)useBondTypes=True [, (bool)onlyNonzeroInvariants=False [, (bool)includeRingMembership=True [, (AtomPairsParameters)countBounds=None [, (int)fpSize=2048 [, (AtomPairsParameters)atomInvariantsGenerator=None [, (AtomPairsParameters)bondInvariantsGenerator=None [, (bool)includeRedundantEnvironments=False]]]]]]]]]]]) -> FingerprintGenerator64 :
        Get a morgan fingerprint generator
        
          ARGUMENTS:
            - radius:  the number of iterations to grow the fingerprint
            - countSimulation: if set, use count simulation while generating the fingerprint
            - includeChirality: if set, chirality information will be added to the generated fingerprint
            - useBondTypes: if set, bond types will be included as a part of the default

In [58]:


def get_fragments_v2(SMILES):
    """Collect circular fragments by enumerating every atom and every radius directly.

    The molecule is kekulized (aromatic flags cleared) and hydrogens are added.
    For each atom and each radius in ``1..RADIUS_UPPER_LIMIT`` the environment
    is extracted with ``Chem.FindAtomEnvironmentOfRadiusN``, written as SMILES
    and re-parsed as an independent molecule before being stored. No Morgan
    fingerprint generator is involved.

    Args:
        SMILES: SMILES string of the molecule.

    Returns:
        A set of fragment SMILES strings (never containing ``""``), or ``None``
        when ``SMILES`` cannot be parsed.
    """
    mol = Chem.MolFromSmiles(SMILES)
    if mol is None:
        print(f"Failed to parse {SMILES}")
        return None
    Chem.Kekulize(mol, clearAromaticFlags=True)
    mol = Chem.AddHs(mol)

    frags = set()
    for atom_idx in range(mol.GetNumAtoms()):
        for radius in range(1, RADIUS_UPPER_LIMIT + 1):
            # Bond indices of the circular environment around atom_idx.
            env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, atom_idx)
            if len(env) == 0:
                # An empty environment only yields the "" SMILES, which is discarded anyway.
                continue
            submol = Chem.PathToSubmol(mol, env)
            fragment_smiles = Chem.MolToSmiles(submol, canonical=True)
            # Re-parse so the stored SMILES is canonical for the fragment as a
            # stand-alone molecule, independent of the parent's indexing.
            reparsed = Chem.MolFromSmiles(fragment_smiles)
            if reparsed is None:
                # MolFromSmiles returns None on failure; MolToSmiles(None) would raise.
                print(f"Failed to re-parse fragment {fragment_smiles}")
                continue
            frags.add(Chem.MolToSmiles(reparsed, canonical=True))

    frags.discard("")
    return frags

In [59]:
import pickle
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load index files that come from a trusted source.
# NOTE(review): hard-coded absolute path; DATASET_root_path from the In [2] cell
# points at the same "/workspace/" root.
path = "/workspace/SMILES_dataset/val/SMILES/index.pkl"
with open(path, "rb") as f:
    # The unpickled object must provide .values(); presumably a dict whose
    # values are SMILES strings (they are fed to the fragment functions) -- TODO confirm.
    sone_smiles = pickle.load(f).values()

In [60]:
# Run both fragment extractors on the first 10 loaded entries and print whether
# their results are equal. x and y stay bound to the last iteration's results
# after the loop and are inspected by the following cells.
for s in list(sone_smiles)[:10]:
    x= get_fragments_v1(s)
    y = get_fragments_v2(s)
    print(x==y)

True
True
True
True
True
True
True
True
True
False


In [49]:
# Fragments found only by get_fragments_v1 (x) and not by get_fragments_v2 (y).
# NOTE(review): relies on the globals x and y left behind by the comparison loop;
# this cell's execution count is lower than the loop cell's, so re-run in order to confirm.
x-y

set()

In [50]:
# Fragments found only by get_fragments_v2 (y) and not by get_fragments_v1 (x).
# NOTE(review): relies on the globals x and y left behind by the comparison loop.
y-x

{'C#CC(Br)C1CC2OC3(CCC(C(Br)CC)O3)C(Br)C2O1',
 'C#CC(Br)C1CC2OC3(CCC(C(C)Br)O3)C(Br)C2O1',
 'CCC(Br)C1CCC2(O1)OC1CC(C(C)Br)OC1C2Br'}

In [51]:
# Number of fragments returned by each method for the last compared molecule.
# NOTE(review): relies on the globals x and y left behind by the comparison loop.
len(x), len(y)

(50, 53)