In [2]:
import typing
from rdkit import Chem as rdkit_Chem
from rdkit.rdBase import BlockLogs as rdkit_BlockLogs

In [3]:

def remove_mapping_info_and_canonicalise_smiles(smiles: str) -> typing.Optional[str]:
    """Strip atom-map numbers from a SMILES string and canonicalise it.

    The SMILES is parsed into an RDKit molecule, every atom's map number is
    reset to 0 (i.e. "unmapped"), and the molecule is written back out.
    Round-tripping through a molecule is what canonicalises the string.

    Args:
        smiles: A single-molecule SMILES string, with or without atom maps.

    Returns:
        The canonical, unmapped SMILES, or ``None`` if RDKit could not parse
        the input.
    """
    # The blocker must stay referenced for the whole call: RDKit logging is
    # silenced only while this object is alive, so bind it to a real name
    # rather than a throwaway.
    _block_logs = rdkit_BlockLogs()

    mol = rdkit_Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None rather than
    # raising. Check for it explicitly instead of relying on the
    # AttributeError from `None.GetAtoms()`, which could also mask an
    # unrelated AttributeError.
    if mol is None:
        return None

    for atom in mol.GetAtoms():
        atom.SetAtomMapNum(0)
    return rdkit_Chem.MolToSmiles(mol)


def canonicalise_smile(smiles: str) -> typing.Optional[str]:
    """Canonicalise a SMILES string via ``Chem.CanonSmiles``.

    Unlike ``remove_mapping_info_and_canonicalise_smiles`` this does NOT
    strip atom-map numbers: a mapped input yields a canonical but still
    mapped output.

    Args:
        smiles: A single-molecule SMILES string.

    Returns:
        The canonical SMILES, or ``None`` if canonicalisation failed for any
        reason (e.g. the input could not be parsed).
    """
    # The blocker must stay referenced for the whole call so that RDKit
    # logging remains silenced while parsing.
    _block_logs = rdkit_BlockLogs()
    try:
        return rdkit_Chem.CanonSmiles(smiles)
    except Exception:
        # Deliberately broad: any canonicalisation failure maps to None.
        # `Exception` (rather than a bare `except`) lets KeyboardInterrupt
        # and SystemExit propagate, so long batch runs can still be stopped.
        return None

In [11]:
%%timeit
# Baseline timing: parse, clear atom maps, re-serialise (input has no atom maps).
remove_mapping_info_and_canonicalise_smiles('C1=CC=C(C=C1)C2=CC=CC=C2')

786 µs ± 18 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [12]:
%%timeit
# Same input through the CanonSmiles-based helper, for comparison with the timing above.
canonicalise_smile('C1=CC=C(C=C1)C2=CC=CC=C2')

193 µs ± 4.62 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [6]:
# Atom-mapped reaction SMILES ("reactant.reactant>>product") used as the worked example below.
rxn = '[OH:1][c:2]1[cH:3][cH:4][c:5]2[cH:6][cH:7][c:8](Br)[cH:33][c:34]2[cH:35]1.CC1(C)OB([c:9]2[cH:10][cH:11][c:12](-[c:13]3[n:14][c:15](-[c:16]4[cH:17][cH:18][cH:19][cH:20][cH:21]4)[n:22][c:23](-[c:24]4[cH:25][cH:26][cH:27][cH:28][cH:29]4)[n:30]3)[cH:31][cH:32]2)OC1(C)C>>[OH:1][c:2]1[cH:3][cH:4][c:5]2[cH:6][cH:7][c:8](-[c:9]3[cH:10][cH:11][c:12](-[c:13]4[n:14][c:15](-[c:16]5[cH:17][cH:18][cH:19][cH:20][cH:21]5)[n:22][c:23](-[c:24]5[cH:25][cH:26][cH:27][cH:28][cH:29]5)[n:30]4)[cH:31][cH:32]3)[cH:33][c:34]2[cH:35]1'

In [7]:
# NOTE: splitting on '.' alone does not separate reactants from products; the
# last element still contains the '>>' and the product SMILES. Split on '>>'
# first if each side of the reaction is needed on its own.
rxn.split('.')

['[OH:1][c:2]1[cH:3][cH:4][c:5]2[cH:6][cH:7][c:8](Br)[cH:33][c:34]2[cH:35]1',
 'CC1(C)OB([c:9]2[cH:10][cH:11][c:12](-[c:13]3[n:14][c:15](-[c:16]4[cH:17][cH:18][cH:19][cH:20][cH:21]4)[n:22][c:23](-[c:24]4[cH:25][cH:26][cH:27][cH:28][cH:29]4)[n:30]3)[cH:31][cH:32]2)OC1(C)C>>[OH:1][c:2]1[cH:3][cH:4][c:5]2[cH:6][cH:7][c:8](-[c:9]3[cH:10][cH:11][c:12](-[c:13]4[n:14][c:15](-[c:16]5[cH:17][cH:18][cH:19][cH:20][cH:21]5)[n:22][c:23](-[c:24]5[cH:25][cH:26][cH:27][cH:28][cH:29]5)[n:30]4)[cH:31][cH:32]3)[cH:33][c:34]2[cH:35]1']

In [8]:
# First reactant only: atom-map numbers are stripped and the SMILES is canonicalised.
remove_mapping_info_and_canonicalise_smiles(rxn.split('.')[0])

'Oc1ccc2ccc(Br)cc2c1'

In [9]:
# Same reactant via CanonSmiles: the result still carries its atom-map numbers,
# so the faster helper is not a drop-in replacement when unmapped SMILES are needed.
canonicalise_smile(rxn.split('.')[0])

'Br[c:8]1[cH:7][cH:6][c:5]2[cH:4][cH:3][c:2]([OH:1])[cH:35][c:34]2[cH:33]1'