In [3]:
from orderly.data.solvents import get_solvents, get_solvents_set, get_solvents_dict

In [4]:
import orderly
import pathlib

In [10]:
# Choose which test ORD dataset to load; the commented-out id is an
# alternative kept for quick switching.
# file_name = "ord_dataset-00005539a1e04c809a9a78647bea649c"
file_name = "ord_dataset-0b70410902ae4139bd5d334881938f69"

# Look the dataset up by its ".pb.gz" file name inside the directory
# returned by orderly.data.get_path_of_test_ords().
matching_files = orderly.extract.main.get_file_names(
    directory=orderly.data.get_path_of_test_ords(),
    file_ending=f"{file_name}.pb.gz",
)
assert len(matching_files) == 1  # exactly one file must match the id
file = pathlib.Path(matching_files[0])

# Load the dataset; later cells iterate over `dataset.reactions`.
dataset = orderly.extract.extractor.OrdExtractor.load_data(file)

In [20]:
from orderly.extract.canonicalise import remove_mapping_info_and_canonicalise_smiles

In [52]:
# Check how a small charged fragment is canonicalised; the same fragment
# appears among the '.'-separated parts of the reaction SMILES split further down.
# NOTE(review): this cell's execution count (52) is later than the cell that
# redefines the function locally (In [46]), so the output shown presumably
# came from the notebook-local definition, not the imported one — confirm
# after a Restart & Run All.
remove_mapping_info_and_canonicalise_smiles('P([O-])(O)O')

'[O-]P(O)O'

In [13]:
# Scan the loaded dataset and print the index of the first reaction whose
# identifier string contains a given substring, then stop.
for idx, rxn in enumerate(dataset.reactions):
    rxn_str = rxn.identifiers.value
    # NOTE(review): the needle is the empty string. If rxn_str is a str,
    # '' is a substring of every str, so this always matches the very first
    # reaction (index 0). Presumably a placeholder — put the SMILES fragment
    # being searched for here.
    if '' in rxn_str:
        print(idx)
        break

In [15]:
# Hard-coded atom-mapped reaction SMILES, split on '>' into its three parts
# (presumably reactants, agents and products). The literal is written as three
# adjacent string pieces that Python concatenates into one string.
r, a, p = (
    '[CH2:1]([S:8][C:9]1[CH:14]=[C:13]([O:15][C:16]2[CH:21]=[CH:20][C:19]([C:22]([F:25])([F:24])[F:23])=[CH:18][C:17]=2[Cl:26])[CH:12]=[CH:11][C:10]=1[N+:27]([O-])=O)[C:2]1[CH:7]=[CH:6][CH:5]=[CH:4][CH:3]=1.P([O-])(O)O.[Na+]'
    '>C(O)C.[Pd]>'
    '[CH2:1]([S:8][C:9]1[CH:14]=[C:13]([O:15][C:16]2[CH:21]=[CH:20][C:19]([C:22]([F:25])([F:23])[F:24])=[CH:18][C:17]=2[Cl:26])[CH:12]=[CH:11][C:10]=1[NH2:27])[C:2]1[CH:3]=[CH:4][CH:5]=[CH:6][CH:7]=1'
).split('>')

In [50]:
# Break `r` (the part of the reaction SMILES before the first '>') into its
# individual '.'-separated fragments.
r.split('.')

['[CH2:1]([S:8][C:9]1[CH:14]=[C:13]([O:15][C:16]2[CH:21]=[CH:20][C:19]([C:22]([F:25])([F:24])[F:23])=[CH:18][C:17]=2[Cl:26])[CH:12]=[CH:11][C:10]=1[N+:27]([O-])=O)[C:2]1[CH:7]=[CH:6][CH:5]=[CH:4][CH:3]=1',
 'P([O-])(O)O',
 '[Na+]']

In [46]:
import rdkit
from rdkit.rdBase import BlockLogs as rdkit_BlockLogs
def remove_mapping_info_and_canonicalise_smiles(molecule_identifier):
    """
    Strip atom-map numbers from a SMILES string and return its canonical form.

    The string is parsed with RDKit, every atom's map number is reset to 0,
    and the molecule is written back out with MolToSmiles (the round trip is
    what canonicalises it). If parsing fails and the string is wrapped in an
    outer pair of square brackets, those brackets are removed and parsing is
    retried; this repeats for as long as an outer pair remains.

    Args:
        molecule_identifier: SMILES string, optionally carrying atom-map
            numbers (e.g. "[CH2:1]") and optionally wrapped in redundant
            outer square brackets (e.g. "[CC(C)(C)[P](...)C]").

    Returns:
        The SMILES string produced by MolToSmiles with all atom-map numbers
        cleared, or None if the input cannot be parsed even after removing
        any outer square brackets.

    Note:
        This definition shadows the function of the same name imported from
        orderly.extract.canonicalise earlier in the notebook.
    """
    # Imported here because a bare `import rdkit` is not guaranteed to have
    # loaded the `Chem` submodule; previously a missing `rdkit.Chem` raised an
    # AttributeError that was swallowed and reported as "unparseable" (None).
    from rdkit import Chem

    # Per the RDKit docs, log output stays blocked while this object is alive,
    # so failed parse attempts below do not emit RDKit error messages.
    _ = rdkit_BlockLogs()

    candidate = molecule_identifier
    while True:
        # MolFromSmiles signals a parse failure by returning None; check for
        # that explicitly instead of catching the resulting AttributeError.
        mol = Chem.MolFromSmiles(candidate)
        if mol is not None:
            # Remove mapping info; converting to a mol and back canonicalises
            # the SMILES string at the same time.
            for atom in mol.GetAtoms():
                atom.SetAtomMapNum(0)
            return Chem.MolToSmiles(mol)

        # Only retry when the whole string is enclosed in "[...]".
        if not (candidate.startswith('[') and candidate.endswith(']')):
            return None
        candidate = candidate[1:-1]

In [47]:
# A SMILES-like identifier that is wrapped in an extra outer pair of square brackets.
molecule_identifier = "[CC(C)(C)[P]([Pd][P](C(C)(C)C)(C(C)(C)C)C(C)(C)C)(C(C)(C)C)C(C)(C)C]"
# Dropping the first and last character removes that outer pair; this is the
# same slice the function above applies in its fallback branch.
molecule_identifier[1:-1]

'CC(C)(C)[P]([Pd][P](C(C)(C)C)(C(C)(C)C)C(C)(C)C)(C(C)(C)C)C(C)(C)C'

In [48]:
# A single bracketed atom comes back unchanged (see output), i.e. the brackets
# are kept — presumably because "[Pd]" parses directly and the
# bracket-stripping fallback is never reached.
remove_mapping_info_and_canonicalise_smiles("[Pd]")

'[Pd]'

In [49]:
%%timeit
# Times the whole function on the bracket-wrapped identifier. Presumably the
# first parse fails and the bracket-stripping retry runs, so this measures a
# failed parse + a second parse + map-number reset + MolToSmiles.
# NOTE(review): not directly comparable with the MolFromSmiles-only timing in
# the next cell, which parses the already-unwrapped string once.
remove_mapping_info_and_canonicalise_smiles("[CC(C)(C)[P]([Pd][P](C(C)(C)C)(C(C)(C)C)C(C)(C)C)(C(C)(C)C)C(C)(C)C]")

1.47 ms ± 48.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [29]:
%%timeit
# Baseline: times a single MolFromSmiles call on the same identifier without
# the outer square brackets (no map-number reset, no MolToSmiles).
# NOTE(review): execution count 29 predates the function-timing cell (In [49]);
# re-run both in order before comparing the numbers.
rdkit.Chem.MolFromSmiles("CC(C)(C)[P]([Pd][P](C(C)(C)C)(C(C)(C)C)C(C)(C)C)(C(C)(C)C)C(C)(C)C")

100 µs ± 435 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
