# Demo of RDKitMol + TS-GCN + FF ALIGNMENT

Compared to the other demo, in this demo, we tries to use force field to prepare the input for TS-GCN.
In the current implementation, this method completely ignores the product

Some codes are compiled from https://github.com/ReactionMechanismGenerator/TS-GCN

In [31]:
import os
import sys
import subprocess

# To add this RDMC into PYTHONPATH in case you haven't do it
sys.path.append(os.path.dirname(os.path.abspath('')))

from rdkit import Chem
from rdmc.mol import RDKitMol
from rdmc.view import grid_viewer, mol_viewer
from rdmc.forcefield import RDKitFF, OpenBabelFF
try:
    # import RMG dependencies
    from rdmc.external.rmg import (from_rdkit_mol,
                                   find_reaction_family,
                                   generate_reaction_complex,
                                   load_rmg_database,
                                   )
    # Load RMG database
    database = load_rmg_database()
except (ImportError, ModuleNotFoundError):
    print('You need to install RMG-Py first and run this IPYNB in rmg_env!')

import openbabel as ob

def parse_xyz_or_smiles(identifier, **kwargs):
    try:
        return RDKitMol.FromXYZ(identifier, **kwargs)
    except:
        mol = RDKitMol.FromSmiles(identifier,)
        mol.EmbedConformer()
        return mol

BOND_CONSTRAINT = {'1,3_Insertion_ROR': 3.0,
                   'Retroene': 3.0,
                   '1,2_Insertion': 2.5,
                   '2+2_cycloaddition_Cd': 3.5,
                   'Diels_alder_addition': 4.0,
                   'H_Abstraction': 3.0,}

%load_ext autoreload
%autoreload 2



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### 1. Input molecule information
Perceive xyz and generate RMG molecule.
- **Please always define the single species end of the reaction as the reactant.**
- **Prefered to put the heavier product in the first place of the list.**

Here, some examples are provided

#### 1.1: Intra H migration (A = B)

In [12]:
reactants = ["""C -1.528265  0.117903  -0.48245
C -0.214051  0.632333  0.11045
C 0.185971  2.010727  -0.392941
O 0.428964  2.005838  -1.836634
O 1.53499  1.354342  -2.136876
H -1.470265  0.057863  -1.571456
H -1.761158  -0.879955  -0.103809
H -2.364396  0.775879  -0.226557
H -0.285989  0.690961  1.202293
H 0.605557  -0.056315  -0.113934
H -0.613001  2.746243  -0.275209
H 1.100271  2.372681  0.080302"""
]

products = ["""C 1.765475  -0.57351  -0.068971
H 1.474015  -1.391926  -0.715328
H 2.791718  -0.529486  0.272883
C 0.741534  0.368416  0.460793
C -0.510358  0.471107  -0.412585
O -1.168692  -0.776861  -0.612765
O -1.768685  -1.15259  0.660846
H 1.164505  1.37408  0.583524
H 0.417329  0.069625  1.470788
H -1.221189  1.194071  0.001131
H -0.254525  0.771835  -1.433299
H -1.297409  -1.977953  0.837367"""]

#### 1.2: Intra_R_Add_Endocyclic (A = B)

In [3]:
reactants = ["""C -1.280629  1.685312  0.071717
C -0.442676  0.4472  -0.138756
C 0.649852  0.459775  -0.911627
C 1.664686  -0.612881  -1.217378
O 1.590475  -1.810904  -0.470776
C -0.908344  -0.766035  0.616935
O -0.479496  -0.70883  2.04303
O 0.804383  -0.936239  2.193929
H -1.330008  1.940487  1.13602
H -0.87426  2.544611  -0.46389
H -2.311393  1.527834  -0.265852
H 0.884957  1.398914  -1.412655
H 2.661334  -0.151824  -1.125202
H 1.56564  -0.901818  -2.270488
H 1.630132  -1.574551  0.469563
H -0.531309  -1.699031  0.2105
H -1.994785  -0.790993  0.711395""",
]

products = ["""C -1.515438  1.173583  -0.148858
C -0.776842  -0.102045  0.027824
C 0.680366  -0.300896  -0.240616
O 1.080339  -1.344575  0.660508
O -0.122211  -2.188293  0.768145
C -1.192654  -1.233281  0.917593
C -1.377606  -0.848982  2.395301
O -0.302953  -0.072705  2.896143
H -2.596401  1.013314  -0.200053
H -1.327563  1.859316  0.692798
H -1.211486  1.693094  -1.062486
H 0.888934  -0.598866  -1.280033
H 1.294351  0.57113  0.013413
H -2.08787  -1.759118  0.559676
H -1.514675  -1.774461  2.97179
H -2.282313  -0.243469  2.505554
H 0.511127  -0.541653  2.673033"""]

#### 1.3: ketoenol (A = B)

In [4]:
reactants = ["""O 0.898799  1.722422  0.70012
C 0.293754  -0.475947  -0.083092
C -1.182804  -0.101736  -0.000207
C 1.238805  0.627529  0.330521
H 0.527921  -1.348663  0.542462
H 0.58037  -0.777872  -1.100185
H -1.45745  0.17725  1.018899
H -1.813437  -0.937615  -0.310796
H -1.404454  0.753989  -0.640868
H 2.318497  0.360641  0.272256""",
    ]

products = ["""O 2.136128  0.058786  -0.999372
C -1.347448  0.039725  0.510465
C 0.116046  -0.220125  0.294405
C 0.810093  0.253091  -0.73937
H -1.530204  0.552623  1.461378
H -1.761309  0.662825  -0.286624
H -1.923334  -0.892154  0.536088
H 0.627132  -0.833978  1.035748
H 0.359144  0.869454  -1.510183
H 2.513751  -0.490247  -0.302535"""]

#### 1.4: Retroene (A = B + C)

In [20]:
reactants = [
"""C -6.006673  2.090429  -0.326601
C -4.967524  1.669781  0.388617
C -3.589427  2.26746  0.357355
C -2.508902  1.272686  -0.104697
H -3.327271  2.622795  1.363524
H -3.58379  3.147152  -0.296003
C -1.100521  1.87264  -0.092522
H -2.756221  0.924031  -1.113232
H -2.5361  0.386526  0.540381
H -1.035149  2.742418  -0.753598
H -0.355581  1.145552  -0.426718
H -0.818137  2.200037  0.913007
H -6.976055  1.60886  -0.26225
H -5.925943  2.93703  -1.002368
H -5.097445  0.815022  1.052006""",
]

products = [
"""C -1.134399  -0.013643  -0.104812
C 0.269995  0.453024  0.142565
C 1.359378  -0.302236  0.042286
H -1.605932  0.564934  -0.907015
H -1.757078  0.122064  0.786524
H -1.163175  -1.069646  -0.383531
H 0.381197  1.49848  0.425883
H 1.301395  -1.350314  -0.236967
H 2.348619  0.097335  0.235065""",

"""C 0.659713  0.003927  0.070539
C -0.659713  -0.003926  -0.070539
H 1.253364  0.882833  -0.158319
H 1.20186  -0.86822  0.420842
H -1.20186  0.86822  -0.420842
H -1.253364  -0.882833  0.158319""",
]

#### 1.5 HO2 Addition (A = B + C)

In [6]:
reactants = [
"""C -1.890664  -0.709255  -0.271996
C -0.601182  0.078056  -0.018811
C 0.586457  -0.545096  -0.777924
C -0.292203  0.188974  1.451901
H -0.683164  -0.56844  2.124827
C 0.477032  1.332664  2.012529
O -0.367239  2.493656  2.288335
O -0.679966  1.393013  -0.618968
O -1.811606  2.119506  -0.074789
H -1.819659  -1.711353  0.159844
H -2.063907  -0.801665  -1.346104
H -2.739557  -0.190076  0.171835
H 0.374452  -0.548385  -1.849706
H 1.501209  0.026135  -0.608139
H 0.747239  -1.572318  -0.444379
H 1.209047  1.707778  1.296557
H 0.998836  1.047896  2.931789
H -0.994076  2.235514  2.974109
H -1.392774  2.537261  0.704151"""
]

products = [
"""C -1.395681  1.528483  -0.00216
C -0.402668  0.411601  -0.210813
C -0.997629  -0.972081  -0.127641
C 0.890607  0.678979  -0.433435
C 2.015631  -0.28316  -0.676721
O 2.741986  0.043989  -1.867415
H -0.923699  2.509933  -0.072949
H -2.200649  1.479183  -0.744922
H -1.873843  1.44886  0.981238
H -1.839799  -1.068706  -0.822233
H -0.283424  -1.765173  -0.346167
H -1.400492  -1.154354  0.875459
H 1.201336  1.7219  -0.466637
H 2.754241  -0.212398  0.127575
H 1.667906  -1.32225  -0.7073
H 2.101868  0.079395  -2.5857""",

"""O -0.168488  0.443026  0.0
O 1.006323  -0.176508  0.0
H -0.837834  -0.266518  0.0""",
]

#### 1.6 cycloaddition (A = B + C)

In [28]:
reactants = [
"""O -0.854577  1.055663  -0.58206
O 0.549424  1.357531  -0.196886
C -0.727718  -0.273028  -0.011573
C 0.76774  -0.043476  0.113736
H -1.066903  -1.044054  -0.706048
H -1.263435  -0.349651  0.939354
H 1.374762  -0.530738  -0.655177
H 1.220707  -0.172248  1.098653"""
           ]

products = [
"""O 0.0  0.0  0.682161
C 0.0  0.0  -0.517771
H 0.0  0.938619  -1.110195
H 0.0  -0.938619  -1.110195""",

"""O 0.0  0.0  0.682161
C 0.0  0.0  -0.517771
H 0.0  0.938619  -1.110195
H 0.0  -0.938619  -1.110195""",
]

#### [TEST]

In [7]:
reactants = [
"""[CH3]""",
"""CCCCO""",]            

products = [
"""C""",
"""[CH2]CCCO""",]

### 2. Check if this reaction matches RMG templates and generate product complex
If the reaction matches at least one RMG family, the result will be shown. Otherwise,
this script will not be helpful

In [13]:
# xyz perception algorithm
backends = ['openbabel', 'jensen']

# header, if your xyz string has the first two lines (atom number + title/comments)
header = False

In [14]:
%pdb

Automatic pdb calling has been turned OFF


In [30]:
# For A = B + C reactions, Better to make A as the reactant
if len(reactants) == 2 and len(products) == 1:
    reactants, products = products, reactants

# Generate reactant and product complex
for backend in backends:
    print(f'Using {backend} as XYZ perception backend.')
    try:
        # Convert XYZ to rdkit mol
        reactants_rdkit = [parse_xyz_or_smiles(reactant, backend=backend, header=False) for reactant in reactants]
        products_rdkit = [parse_xyz_or_smiles(product, backend=backend, header=False) for product in products]

        # Convert rdkit mol to RMG mol
        reactant_molecules = [from_rdkit_mol(r.ToRWMol()) for r in reactants_rdkit]
        product_molecules = [from_rdkit_mol(p.ToRWMol()) for p in products_rdkit]

    except Exception as e:
        print(e)
        print(f'Cannot generate molecule instances using {backend}...')
        continue
    else:
        # A product complex with the same atom indexing as the reactant is generated
        family_label, _ = find_reaction_family(database, reactant_molecules,
                                               product_molecules, verbose=False)
        r_complex, p_complex = generate_reaction_complex(database,
                                                 reactant_molecules,
                                                 product_molecules,
                                                 verbose=True)
    if not r_complex:
        continue

    try:
        # p_rmg is the complex in RDKitMol form and product_match is its RMG molecule form
        r_mol = RDKitMol.FromRMGMol(r_complex)
        p_mol = RDKitMol.FromRMGMol(p_complex)
        break
    except Exception as e:
        # There can be some problem converting RMG mol back to RDKit
        print(e)
        continue
else:
    print('No matched RMG reaction is found for the given reactants and products.')

Using openbabel as XYZ perception backend.
C=O + C=O <=> C1COO1
RMG family: 2+2_cycloaddition_CO
Is forward reaction: False


### If Single reactants

In [40]:
if len(reactants) == 1:
    r_mol = reactants_rdkit[0]
    
    try:
        p_mol.SetPositions(r_mol.GetPositions())
    except ValueError as e:
        # Very ocassionally, RDKit Cannot embed molecules 
        # Though the molecule itself looks okay
        # Generate a openbabel first and convert it to RDKitMol
        print(e)

    if p_mol:
        print('\nThe generated product complex:')
        viewer = mol_viewer(p_mol.ToMolBlock(), 'sdf')
        viewer.show()


The generated product complex:


### 3. Apply force field optimization to get reasonable alignment
Instead of using RDKit force field module, we are using Openbabel force field. It is due to RDKit force field doesn't treat molecule-molecule interaction correctly

In [43]:
# bond length for bonds will be fixed at the following value
bond_constraint_factor = 2.0 #BOND_CONSTRAINT.get(family_label, 3.0) # Unit: Angstrom

# Force Field
force_field_type = "MMFF94s"
# Convergence criteria
tol = 1e-8
# Step size
step = 5
# Max step
max_step = 10000

In [44]:
# Find broken bonds
r_bonds = [{bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()} for bond in r_mol.GetBonds()]
p_bonds = [{bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()} for bond in p_mol.GetBonds()]
formed_bonds = [bond for bond in p_bonds if bond not in r_bonds]
broken_bonds = [bond for bond in r_bonds if bond not in p_bonds]
print(f'Following bonds are formed in the reaction: {formed_bonds}')
print(f'Following bonds are broken in the reaction: {broken_bonds}')

# Make a backup molecule
p_combine = p_mol.Copy()
try:
    # First try if we can use RDKit Forcefield to optimize
    # It is faster, relatively more robust, but have limited atom-type support
    ff = RDKitFF(force_field_type)
    # In RDKit FF, setup first then add constraints
    ff.setup(p_combine, ignore_interfrag_interactions=False)
    for broken_bond in broken_bonds:
        ff.add_distance_constraint(broken_bond, bond_constraint_factor)

except NotImplementedError as e:
    print(e)
    # It means we cannot make the molecule optimizable by RDKit force field
    # Then use OpenBabel force field
    ff = OpenBabelFF(force_field_type)
    # provides mol can help adjust the atom index difference between RDKit Mol and OpenBabel Mol
    ff.mol = p_combine
    # In Openbabel Forcefield, add constraints first then setup the force field
    for broken_bond in broken_bonds:
        ff.add_distance_constraint(broken_bond, bond_constraint_factor)
    ff.setup()

finally:
    ff.optimize(max_step=max_step, tol=tol, step_per_iter=step)
    p_combine = ff.get_optimized_mol()

print('\nVisualize optimized molecule:')
mol_viewer(p_combine.ToMolBlock(), 'sdf')

Following bonds are formed in the reaction: []
Following bonds are broken in the reaction: [{0, 1}, {2, 3}]

Visualize optimized molecule:


<py3Dmol.view at 0x7f85a175d290>

### 4. Find the best atom mapping by RMSD. <br>
Note, this can perform relatively poorly if the reactant and the product are in different stereotype (cis/trans). or most rotors are significantly different oriented. However, previous step (match according to RMG reaction) makes sure that all heavy atoms and reacting H atoms are consistent, so only H atoms that are more trivial are influenced by this.

NOTE: AlignMol can yields wrong numbers, we switch to `GetBestRMS` and `CalcRMS`.

In [45]:
# Whether to find better matches by reflecting the molecule (resulting in mirror image)
reflect = False

In [46]:
# Generate substructure matches,
# There is no difference using `p_combine` or `p_mol` as the argument
# Since both of them have the same connectivity information
matches = p_mol.GetSubstructMatches(p_combine, uniquify=False)

# Make a copy of p_combine to preserve its original information
p_align = p_combine.Copy()

rmsds = []

# Align the combined complex to the rmg generated complex
# According to different mapping and find the best one.
for i, match in enumerate(matches):
    atom_map = [list(enumerate(match))]
    rmsd1 = Chem.rdMolAlign.GetBestRMS(prbMol=p_align.ToRWMol(),
                                       refMol=p_rmg.ToRWMol(),
                                       map=atom_map)
    if reflect:
        p_align.Reflect()
        rmsd2 = Chem.rdMolAlign.GetBestRMS(prbMol=p_align.ToRWMol(),
                                           refMol=p_rmg.ToRWMol(),
                                           map=atom_map)
        p_align.Reflect()
    else:
        rmsd2=1e10
    if rmsd1 > rmsd2:
        rmsds.append((i, True, rmsd2,))
    else:
        rmsds.append((i, False, rmsd1,))
best = sorted(rmsds, key=lambda x: x[2])[0]
print('Match index: {0}, Reflect Conformation: {1}, RMSD: {2}'.format(*best))

# Realign and reorder atom indexes according to the best match
best_match = matches[best[0]]
Chem.rdMolAlign.GetBestRMS(prbMol=p_align.ToRWMol(),
                           refMol=p_rmg.ToRWMol(),
                           map=[list(enumerate(best_match))])
if best[1]:
    p_align.Reflect()
new_atom_indexes = [best_match.index(i) for i in range(len(best_match))]
p_align = p_align.RenumberAtoms(new_atom_indexes)

NameError: name 'p_rmg' is not defined

### 4. View Molecules

In [46]:
mols_to_view = [r_mol, p_align,] + p_mols
entries = len(mols_to_view)

viewer = grid_viewer(viewer_grid=(1, entries), viewer_size=(240 * entries, 300),)
for i in range(entries):
    mol_viewer(mols_to_view[i].ToMolBlock(), 'sdf', viewer=viewer, viewer_loc=(0, i))

print('reactant    matched product      original products')
viewer.show()

reactant    matched product      original products


### 5. Export to SDF file and run ts_gen

In [47]:
r_mol.ToSDFFile('reactant.sdf')
p_align.ToSDFFile('product.sdf')

#### 5.1 TS Gen V2

In [48]:
TS_GEN_PYTHON = '~/Apps/anaconda3/envs/ts_gen_v2/bin/python3.7'
TS_GEN_DIR = '~/Apps/ts_gen_v2'

In [49]:
try:
    subprocess.run(f'export PYTHONPATH=$PYTHONPATH:{TS_GEN_DIR};'
                   f'{TS_GEN_PYTHON} {TS_GEN_DIR}/inference.py '
                   f'--r_sdf_path reactant.sdf '
                   f'--p_sdf_path product.sdf '
                   f'--ts_xyz_path TS.xyz',
                   check=True,
                   shell=True)
except subprocess.CalledProcessError as e:
    print(e)
else:
    with open('TS.xyz', 'r') as f:
        ts_xyz=f.read()
    ts = RDKitMol.FromXYZ(ts_xyz)

### 6. Visualize TS

In [50]:
# Align the TS to make visualization more convenient
atom_map = [(i, i) for i in range(r_mol.GetNumAtoms())]
ts.GetBestAlign(refMol=r_mol,
                atomMap=atom_map,
                keepBestConformer=True)

# View results in 3D geometries
mols_to_view = [r_mol, ts, p_align]
entries = len(mols_to_view)
viewer = grid_viewer(viewer_grid=(1, entries), viewer_size=(240 * entries, 300),)
for i in range(entries):
    mol_viewer(mols_to_view[i].ToMolBlock(), 'sdf', viewer=viewer, viewer_loc=(0, i))

print('reactant    TS      product')
viewer.show()

reactant    TS      product


Get TS xyz

In [51]:
print(ts.ToXYZ())

22

C      1.735066   -0.439314    1.051607
C      1.853989    0.893110    0.678944
C      1.641262    0.905010   -0.663430
C     -0.831680    0.486149   -0.966058
C     -1.392511    1.319296    0.072020
C     -2.254315    0.609091    0.741270
C     -2.179691   -0.731317    0.341177
C     -1.188826   -0.741141   -0.644998
C      1.816470   -0.412329   -1.153476
C      2.120668   -1.180878   -0.123956
H      1.986889   -0.741698    1.941073
H      1.886390    1.696963    1.309975
H      1.654443    1.646568   -1.142845
H     -0.828014    0.750773   -1.883750
H     -1.183272    2.348933    0.253047
H     -2.925812    1.053808    1.407552
H     -3.087100   -1.121142    0.122647
H     -1.964623   -1.393974    1.000605
H      0.164755   -0.460121    0.619571
H     -1.097631   -1.528969   -1.299336
H      1.635321   -0.785689   -2.107590
H      2.438221   -2.173130   -0.095473

