In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: tanminkang
"""

'\n@author: tanminkang\n'

In [1]:
import re
from rdkit import Chem

In [2]:
# Example input: an atom-mapped reaction SMILES of the form "left-hand side>>product".
# The trailing ' |f:0.1.2.3|' annotation is cut off by separate_reactant_reagent.
# NOTE(review): the suffix looks like a CXSMILES fragment-grouping block - confirm.
smi = '[Cl:1][c:2]1[cH:3][c:4]([CH3:8])[n:5][n:6]1[CH3:7].[OH:14][N+:15]([O-:16])=[O:17].[S:9](=[O:10])(=[O:11])([OH:12])[OH:13]>>[Cl:1][c:2]1[c:3]([N+:15](=[O:14])[O-:16])[c:4]([CH3:8])[n:5][n:6]1[CH3:7] |f:0.1.2.3|'

In [3]:
# Reactants and reagent separation(atom mappings)
def separate_reactant_reagent(smi):
    """
    Split a reaction SMILES into its left-hand side and its product side.

    Anything from the first ' |f' marker onwards is discarded before splitting.
    Text between the first and the last '>' is not returned.

    :param smi: reaction SMILES string, optionally followed by a ' |f...' suffix
    :return: tuple ``(text before the first '>', text after the last '>')``;
        both elements are the whole string when it contains no '>'
    """
    # Keep only the part in front of the ' |f' annotation, if there is one.
    core, _, _ = smi.partition(' |f')
    sides = core.split('>')
    return sides[0], sides[-1]

In [4]:
# Atom-mapping removal (regex based; no RDKit canonicalization happens here)

# Atoms that SMILES allows to be written without square brackets.
_ORGANIC_SUBSET = frozenset(
    ['B', 'C', 'N', 'O', 'P', 'S', 'F', 'Cl', 'Br', 'I',
     'b', 'c', 'n', 'o', 'p', 's'])
# Aromatic heteroatoms whose explicit hydrogen (e.g. [nH]) cannot be implied.
_AROMATIC_HETEROATOMS = frozenset(['b', 'n', 'o', 'p', 's'])
# One bracket atom; group 1 is the text between the brackets.
_BRACKET_ATOM_RE = re.compile(r'\[([^\[\]]+)\]')
# The atom-map class sits at the very end of a bracket atom, e.g. ':15'.
_ATOM_MAP_RE = re.compile(r':[0-9]+$')
# A bracket body that is just an element symbol plus an optional H count.
_SYMBOL_HCOUNT_RE = re.compile(r'([A-Z][a-z]?|[a-z]{1,2})(H[0-9]*)?')


def _strip_bracket_atom(match):
    """Return one bracket atom without its map number, unbracketed if safe."""
    atom = _ATOM_MAP_RE.sub('', match.group(1))
    plain = _SYMBOL_HCOUNT_RE.fullmatch(atom)
    if plain is not None:
        symbol, hcount = plain.groups()
        keeps_explicit_h = bool(hcount) and symbol in _AROMATIC_HETEROATOMS
        if symbol in _ORGANIC_SUBSET and not keeps_explicit_h:
            # The H count is dropped together with the brackets.
            # NOTE(review): this assumes the explicit H count equals the
            # atom's default valence - confirm for the data set in use.
            return symbol
    # Charged, isotopic, chiral or non-organic atoms must stay bracketed, and
    # inside brackets the H count is significant, so it is left untouched.
    return '[' + atom + ']'


def remove_atom_mapping(smi):
    """
    Remove atom-map numbers from a SMILES / reaction SMILES string.

    Only text inside square brackets is modified. A bracket atom that consists
    of nothing but an organic-subset symbol (plus an optional hydrogen count)
    is written without brackets, e.g. ``[CH3:8]`` -> ``C`` and ``[c:2]`` ->
    ``c``. Every other bracket atom keeps its brackets, charge, isotope,
    chirality and hydrogen count, e.g. ``[N+:15]`` -> ``[N+]``,
    ``[NH4+:1]`` -> ``[NH4+]``, ``[nH:1]`` -> ``[nH]``, ``[Pd:3]`` -> ``[Pd]``.

    :param smi: atom-mapped SMILES or reaction SMILES string
    :return: the same string with the ``:N`` map classes removed
    """
    return _BRACKET_ATOM_RE.sub(_strip_bracket_atom, smi)

In [5]:
def smi_tokenizer(smi):
    """
    Tokenize a SMILES molecule or reaction.

    Each bracket atom (``[...]``), ``Br``/``Cl``, two-digit ring closure
    (``%NN``) and every other symbol listed in the pattern becomes one token.

    :param smi: SMILES or reaction SMILES string
    :return: the tokens joined by single spaces ('' for an empty input)
    :raises AssertionError: if the tokens do not reassemble into ``smi``, i.e.
        the input contains a character the pattern does not cover
    """
    # Raw string: the regex itself is unchanged, but its backslashes are no
    # longer invalid Python escape sequences (a SyntaxWarning on Python 3.12+).
    pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
    tokens = re.findall(pattern, smi)
    # Raised explicitly so the round-trip check is not stripped by `python -O`;
    # the exception type stays AssertionError for existing callers.
    if smi != ''.join(tokens):
        raise AssertionError('SMILES contains characters the tokenizer does not recognise: %r' % (smi,))
    return ' '.join(tokens)

In [6]:
def canonicalize(smiles):
    """
    Canonicalize a SMILES string with RDKit's MolToSmiles.

    :param smiles: SMILES string; may be empty
    :return: the canonical SMILES produced by RDKit, or '' when ``smiles`` is
        empty/None or RDKit cannot parse it
    """
    if not smiles:
        return ''
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None; handing that to
    # MolToSmiles would raise an opaque argument-type error, so use the same
    # '' sentinel that empty input already produces.
    if mol is None:
        return ''
    return Chem.MolToSmiles(mol)

In [9]:
# Split the example reaction SMILES into its left-hand side and its product side.
reaction, product = separate_reactant_reagent(smi)
# Canonicalize each side with RDKit. Note that `prediction` holds the
# canonicalized left-hand side of the reaction, not a model prediction, and
# `production` holds the canonicalized product side.
prediction = canonicalize(reaction)
production = canonicalize(product)