In [1]:
import h5py
import numpy as np

In [2]:
from torch_geometric.data import Data
import torch

In [3]:
hartree_2_eV = 27.211386245988

In [1]:
import h5py
import numpy as np

class RGD1Loader:
    """
    Loader for the RGD1 reaction dataset.

    Parameters
    ----------
    chno_h5_path : str
        Path to 'RGD1_CHNO.h5' containing reaction geometries and energies.
    rps_h5_path : str
        Path to 'RGD1_RPs.h5' containing individual reactant/product optimized geometries.
    randp_smiles_txt : str
        Path to 'RandP_smiles.txt' mapping reaction SMILES to molecule indices in RPs file.
    """
    def __init__(self, chno_h5_path, rps_h5_path=None, randp_smiles_txt=None):
        # Open main reaction file
        self.rg1 = h5py.File(chno_h5_path, 'r')
        # Optional: open individual molecule file
        self.rps = h5py.File(rps_h5_path, 'r') if rps_h5_path else None
        # Load reactant/product smiles mapping
        self.rp_dict = {}
        if randp_smiles_txt:
            with open(randp_smiles_txt, 'r', encoding='utf-8') as f:
                for idx, line in enumerate(f):
                    if idx == 0:
                        continue  # skip header
                    key, val = line.strip().split()[:2]
                    self.rp_dict[key] = val

        # Cache reaction IDs
        self.rxn_ids = list(self.rg1.keys())

    def __len__(self):
        """Return the number of reactions in the dataset."""
        return len(self.rxn_ids)

    def __iter__(self):
        """Iterate over all reactions as dictionaries."""
        for rid in self.rxn_ids:
            yield self.get_reaction(rid)

    def __getitem__(self, idx):
        """Get reaction by index or ID."""
        if isinstance(idx, int):
            rid = self.rxn_ids[idx]
        else:
            rid = idx
        return self.get_reaction(rid)

    def get_reaction(self, rid):
        """Parse a single reaction into a structured dict."""
        rxn = self.rg1[rid]
        # Atom numbers map for CHNOF
        num2elm = {1: 'H', 6: 'C', 7: 'N', 8: 'O', 9: 'F'}

        # Decode SMILES
        react_smiles = rxn['Rsmiles'][()].decode('utf-8')
        prod_smiles = rxn['Psmiles'][()].decode('utf-8')

        # Atomic numbers and geometries
        elems = np.array(rxn['elements'], dtype=int)
        atomic_numbers = elems.tolist()
        R_pos = np.array(rxn['RG'])        # reactant positions
        P_pos = np.array(rxn['PG'])        # product positions
        TS_pos = np.array(rxn['TSG'])     # transition state positions

        # Energies (single-point, enthalpy, free) in Hartree
        R_E, P_E, TS_E = (float(rxn['R_E'][()]),
                          float(rxn['P_E'][()]),
                          float(rxn['TS_E'][()]))
        R_H, P_H, TS_H = (float(rxn['R_H'][()]),
                          float(rxn['P_H'][()]),
                          float(rxn['TS_H'][()]))
        R_F, P_F, TS_F = (float(rxn['R_F'][()]),
                          float(rxn['P_F'][()]),
                          float(rxn['TS_F'][()]))

        reaction = {
            'rxn_id': rid,
            'reactant': {
                'smiles': react_smiles,
                'atomic_numbers': atomic_numbers,
                'positions': R_pos,
                'energy': R_E,
                'enthalpy': R_H,
                'free_energy': R_F,
            },
            'product': {
                'smiles': prod_smiles,
                'atomic_numbers': atomic_numbers,
                'positions': P_pos,
                'energy': P_E,
                'enthalpy': P_H,
                'free_energy': P_F,
            },
            'transition_state': {
                'atomic_numbers': atomic_numbers,
                'positions': TS_pos,
                'energy': TS_E,
                'enthalpy': TS_H,
                'free_energy': TS_F,
            }
        }

        # Optional: attach optimized individual molecule geometries & energies
        if self.rps and self.rp_dict:
            # map reactant/products mapping
            R_keys = react_smiles.split('.')
            P_keys = prod_smiles.split('.')
            reaction['reactant']['molecules'] = []
            reaction['product']['molecules'] = []
            for key in R_keys:
                idx = self.rp_dict.get(key)
                if idx and idx in self.rps:
                    mol = self.rps[idx]
                    reaction['reactant']['molecules'].append({
                        'atomic_numbers': list(mol['elements']),
                        'positions': np.array(mol['DFTG']),
                        'energy': float(mol['DFT_SPE'][()]),
                    })
            for key in P_keys:
                idx = self.rp_dict.get(key)
                if idx and idx in self.rps:
                    mol = self.rps[idx]
                    reaction['product']['molecules'].append({
                        'atomic_numbers': list(mol['elements']),
                        'positions': np.array(mol['DFTG']),
                        'energy': float(mol['DFT_SPE'][()]),
                    })

        return reaction

# Example usage
if __name__ == '__main__':
    loader = RGD1Loader(
        chno_h5_path='RGD1_CHNO.h5',
    )
    print(f"Total reactions: {len(loader)}")
    # Print first reaction summary
    rxn = loader[0]
    print(rxn['rxn_id'], rxn['reactant']['smiles'], "→", rxn['product']['smiles'])


Total reactions: 176992
MR_100001_2 CC.C[C@H]1N[C@@H](N)CO1 → CC[C@H](N[C@H](CO)N)C


In [8]:
import numpy as np
from scipy.spatial.distance import pdist, squareform
# Constants
C = 6

def get_cc_bonds(atomic_numbers, positions, cutoff=(1.2, 1.7)):
    """Returns a set of tuples (i, j) for bonded C–C pairs"""
    dist = squareform(pdist(positions))
    bonds = set()
    for i in range(len(atomic_numbers)):
        if atomic_numbers[i] != C:
            continue
        for j in range(i + 1, len(atomic_numbers)):
            if atomic_numbers[j] != C:
                continue
            d = dist[i, j]
            if cutoff[0] <= d <= cutoff[1]:
                bonds.add((i, j))
    return bonds

from ase.io import write
def cc_bond_formed(reactant, product):
    """
    Compares reactant and product to detect if any new C–C bond formed.
    Returns True if any new C–C bond is present in the product.
    """
    r_cc = get_cc_bonds(reactant["atomic_numbers"], reactant["positions"])
    p_cc = get_cc_bonds(product["atomic_numbers"], product["positions"])
    new_bonds = p_cc - r_cc
    return len(new_bonds) > 0

# Atomic number constants
C, H, O = 6, 1, 8

def is_alpha_deprotonation(reactant, product):
    Z = reactant["atomic_numbers"]
    pos_r = np.array(reactant["positions"])
    pos_p = np.array(product["positions"])

    dist_r = squareform(pdist(pos_r))
    dist_p = squareform(pdist(pos_p))

    num_atoms = len(Z)

    for i in range(num_atoms):
        if Z[i] != C:
            continue

        for j in range(num_atoms):
            if Z[j] == O and 1.15 < dist_r[i][j] < 1.30:
                # C=O found; check for adjacent C with H in reactant
                for k in range(num_atoms):
                    if Z[k] == C and 1.3 < dist_r[i][k] < 1.7:
                        h_neighbors_r = [l for l in range(num_atoms)
                                         if Z[l] == H and dist_r[k][l] < 1.2]
                        h_neighbors_p = [l for l in range(num_atoms)
                                         if Z[l] == H and dist_p[k][l] < 1.2]
                        # H present in reactant but gone in product
                        if len(h_neighbors_r) > 0 and len(h_neighbors_p) == 0:
                            return True
    return False
from ase import Atoms
def to_atoms(numbers, positions):
    return Atoms(numbers=numbers, positions=positions)

def save_linked_reactions_to_xyz(r1, r2, filename="linked_reactions"):

    def save_reaction(reaction, label):
        for state in ["reactant", "transition_state", "product"]:
            atoms = to_atoms(
                reaction[state]["atomic_numbers"],
                reaction[state]["positions"]
            )
            write(filename, atoms, append=True, format="extxyz",)

    if r1:
        save_reaction(r1, "step1_deprotonation")
    if r2:
        save_reaction(r2, "step2_enolate_attack")

    print(f"\nSaved r1 and r2 reactions to folder: {filename}/")

from scipy.optimize import linear_sum_assignment
from scipy.spatial.distance import cdist
import numpy as np

def same_geometry(atomic_numbers1, pos1, atomic_numbers2, pos2, tol=0.3):
    """
    Compares two molecular geometries using atom-wise optimal matching.
    Returns True if they match within `tol` RMSD per atom.
    """
    if len(atomic_numbers1) != len(atomic_numbers2):
        return False
    if sorted(atomic_numbers1) != sorted(atomic_numbers2):
        return False

    pos1 = np.array(pos1)
    pos2 = np.array(pos2)
    Z1 = np.array(atomic_numbers1)
    Z2 = np.array(atomic_numbers2)

    total_rmsd = 0.0
    count = 0

    # Match atoms by type (H, C, O, N)
    for z in sorted(set(Z1)):
        idx1 = np.where(Z1 == z)[0]
        idx2 = np.where(Z2 == z)[0]

        if len(idx1) != len(idx2):
            return False

        sub_pos1 = pos1[idx1]
        sub_pos2 = pos2[idx2]

        dists = cdist(sub_pos1, sub_pos2)
        row_ind, col_ind = linear_sum_assignment(dists)
        rmsd = np.sqrt(np.mean((sub_pos1[row_ind] - sub_pos2[col_ind]) ** 2))
        total_rmsd += rmsd * len(idx1)
        count += len(idx1)

    avg_rmsd = total_rmsd / count
    return avg_rmsd < tol


def find_alpha_deprotonation_then_enolate_attack(dataloader):
    reactions = list(dataloader)
    idx_seq = 0
    for i, r1 in enumerate(reactions):
        if is_alpha_deprotonation(r1["reactant"], r1["product"]):
            prod1_Z = r1["product"]["atomic_numbers"]
            prod1_pos = r1["product"]["positions"]

            for r2 in reactions:
                if same_geometry(prod1_Z, prod1_pos,
                                 r2["reactant"]["atomic_numbers"],
                                 r2["reactant"]["positions"]):
                    if cc_bond_formed(r2["reactant"], r2["product"]):
                        print(f"\nFound linked pair:")
                        # return r1, r2
                        save_linked_reactions_to_xyz(r1, r2, filename=f"linked_reactions_{idx_seq}.xyz")
                        idx_seq += 1

    print("No matching sequence found.")
    # return None, None



# Run inspection
find_alpha_deprotonation_then_enolate_attack(loader)




Found linked pair:

Saved r1 and r2 reactions to folder: linked_reactions_0.xyz/

Found linked pair:

Saved r1 and r2 reactions to folder: linked_reactions_1.xyz/

Found linked pair:

Saved r1 and r2 reactions to folder: linked_reactions_2.xyz/

Found linked pair:

Saved r1 and r2 reactions to folder: linked_reactions_3.xyz/

Found linked pair:

Saved r1 and r2 reactions to folder: linked_reactions_4.xyz/

Found linked pair:

Saved r1 and r2 reactions to folder: linked_reactions_5.xyz/
No matching sequence found.


In [14]:
# find_beckmann_rgd1.py

import os
from rdkit import Chem

def find_beckmann_rearrangements(chno_h5, rps_h5=None, randp_smiles_txt=None,
                                 out_txt="beckmann_hits.txt"):
    """
    Scan the RGD1 dataset for reactions where:
      Reactant contains an oxime (C=N–O)
      Product contains an amide (C(=O)–N)
    Writes matching reaction IDs and SMILES to out_txt.
    """
    # SMARTS patterns
    oxime_pat  = Chem.MolFromSmarts("[CX3]=[NX2]-[OX1]")
    amide_pat  = Chem.MolFromSmarts("[CX3](=O)[NX3]")

    loader = RGD1Loader(chno_h5_path=chno_h5,
                        rps_h5_path=rps_h5,
                        randp_smiles_txt=randp_smiles_txt)

    with open(out_txt, "w") as fout:
        fout.write("rxn_id\treactant_smiles\tproduct_smiles\n")
        for rxn in loader:
            r_smiles = rxn["reactant"]["smiles"]
            p_smiles = rxn["product"]["smiles"]

            # Parse with RDKit
            r_mol = Chem.MolFromSmiles(r_smiles)
            p_mol = Chem.MolFromSmiles(p_smiles)
            if r_mol is None or p_mol is None:
                continue

            # Pattern match
            if r_mol.HasSubstructMatch(oxime_pat) and p_mol.HasSubstructMatch(amide_pat):
                rid = rxn["rxn_id"]
                print(f"Beckmann hit: {rid}  {r_smiles} → {p_smiles}")
                fout.write(f"{rid}\t{r_smiles}\t{p_smiles}\n")

    print(f"\nDone — results in {out_txt}")

if __name__ == "__main__":
    # adjust paths as needed
    chno = "RGD1_CHNO.h5"

    find_beckmann_rearrangements(chno,)



Done — results in beckmann_hits.txt


In [17]:
# find_perkin_rgd1_with_xyz.py

import os
from rdkit import Chem
from ase import Atoms
from ase.io import write

# SMARTS patterns
aldehyde_pat         = Chem.MolFromSmarts("[CX3H1](=O)[#6]")         # R–CHO
acid_anhydride_pat   = Chem.MolFromSmarts("[CX3](=O)O[CX3](=O)")     # R–C(=O)–O–C(=O)–R
alpha_beta_unsat_pat = Chem.MolFromSmarts("[CX3H1]=[CX2]C(=O)[OX2H]") # R–CH=CH–C(=O)–OH

def to_atoms(numbers, positions):
    return Atoms(numbers=numbers, positions=positions)

def find_perkin_reactions(chno_h5, rps_h5=None, randp_txt=None,
                          out_txt="perkin_hits.txt", xyz_dir="perkin_xyz"):
    loader = RGD1Loader(chno_h5_path=chno_h5,
                        rps_h5_path=rps_h5,
                        randp_smiles_txt=randp_txt)

    os.makedirs(xyz_dir, exist_ok=True)

    with open(out_txt, "w") as fout:
        fout.write("rxn_id\treactant_smiles\tproduct_smiles\n")
        for rxn in loader:
            rid       = rxn["rxn_id"]
            r_smiles  = rxn["reactant"]["smiles"]
            p_smiles  = rxn["product"]["smiles"]
            r_mol     = Chem.MolFromSmiles(r_smiles)
            p_mol     = Chem.MolFromSmiles(p_smiles)
            if r_mol is None or p_mol is None:
                continue

            if (r_mol.HasSubstructMatch(aldehyde_pat)
             and r_mol.HasSubstructMatch(acid_anhydride_pat)
             and p_mol.HasSubstructMatch(alpha_beta_unsat_pat)):

                # record in text file
                fout.write(f"{rid}\t{r_smiles}\t{p_smiles}\n")
                print(f"Perkin hit: {rid}  {r_smiles} → {p_smiles}")

                # save xyz for this reaction
                for state in ("reactant", "transition_state", "product"):
                    geom = rxn[state]
                    atoms = to_atoms(geom["atomic_numbers"], geom["positions"])
                    fn = os.path.join(xyz_dir, f"{rid}.xyz")
                    write(fn, atoms, append=True, format="extxyz")
                print(f"  → saved geometries to {xyz_dir}/{rid}_*.xyz")

    print(f"\nDone — hits in {out_txt}, xyz in {xyz_dir}/")


if __name__ == "__main__":
    CHNO    = "RGD1_CHNO.h5"

    find_perkin_reactions(CHNO, )



Done — hits in perkin_hits.txt, xyz in perkin_xyz/


In [19]:
# find_diels_alder_rgd1_with_xyz.py

import os
from rdkit import Chem
from rdkit.Chem.rdchem import BondType
from ase import Atoms
from ase.io import write

# SMARTS for conjugated diene and simple alkene
DIENE_PAT   = Chem.MolFromSmarts("[#6]=[#6]-[#6]=[#6]")
ALKENE_PAT  = Chem.MolFromSmarts("[#6]=[#6]")

def is_diels_alder(r_smiles: str, p_smiles: str) -> bool:
    """
    Returns True if reactant SMILES contains one conjugated diene fragment
    plus one alkene fragment, and product SMILES contains a cyclohexene ring
    (6-membered ring with exactly one C=C).
    """
    frags = r_smiles.split('.')
    if len(frags) != 2:
        return False

    diene_found = False
    alkene_found = False

    for frag in frags:
        mol = Chem.MolFromSmiles(frag)
        if mol is None:
            return False
        # Count double bonds
        n_db = sum(1 for b in mol.GetBonds() if b.GetBondType() == BondType.DOUBLE)

        if mol.HasSubstructMatch(DIENE_PAT) and n_db >= 2:
            diene_found = True
        elif mol.HasSubstructMatch(ALKENE_PAT) and n_db == 1:
            alkene_found = True

    if not (diene_found and alkene_found):
        return False

    # Check product for cyclohexene: 6-membered ring with exactly one double bond
    p_mol = Chem.MolFromSmiles(p_smiles)
    if p_mol is None:
        return False

    ring_info = p_mol.GetRingInfo()
    for ring in ring_info.AtomRings():
        if len(ring) == 6:
            # Count double bonds within this ring
            count_db = 0
            for i in range(len(ring)):
                a1 = ring[i]
                a2 = ring[(i + 1) % len(ring)]
                bond = p_mol.GetBondBetweenAtoms(a1, a2)
                if bond and bond.GetBondType() == BondType.DOUBLE:
                    count_db += 1
            if count_db == 1:
                return True

    return False

def save_diels_alder_xyz(chno_h5: str,
                         rps_h5: str = None,
                         randp_txt: str = None,
                         out_txt: str = "diels_alder_hits.txt",
                         xyz_dir: str = "diels_alder_xyz"):
    """
    Scan RGD1 for Diels–Alder candidates and:
      - Write hits to out_txt (rxn_id, reactant_smiles, product_smiles)
      - Save reactant/TS/product geometries as separate .xyz files
    """
    loader = RGD1Loader(chno_h5_path=chno_h5,
                        rps_h5_path=rps_h5,
                        randp_smiles_txt=randp_txt)
    os.makedirs(xyz_dir, exist_ok=True)

    with open(out_txt, "w") as fout:
        fout.write("rxn_id\treactant_smiles\tproduct_smiles\n")
        idx_seq = 0
        for rxn in loader:
            rid      = rxn["rxn_id"]
            r_smiles = rxn["reactant"]["smiles"]
            p_smiles = rxn["product"]["smiles"]

            if is_diels_alder(r_smiles, p_smiles):
                fout.write(f"{rid}\t{r_smiles}\t{p_smiles}\n")
                print(f"Diels–Alder hit: {rid}  {r_smiles} → {p_smiles}")

                # Save geometries
                for state in ("reactant", "transition_state", "product"):
                    geom  = rxn[state]
                    atoms = Atoms(numbers=geom["atomic_numbers"],
                                  positions=geom["positions"])
                    fn = os.path.join(xyz_dir, f"r{idx_seq}.xyz")
                    write(fn, atoms, append=True, format="extxyz")
                idx_seq += 1
                print(f" → saved XYZs to {xyz_dir}/{rid}_*.xyz")

    print(f"\nDone — see {out_txt} and {xyz_dir}/")

if __name__ == "__main__":
    CHNO    = "RGD1_CHNO.h5"
    RPS     = None            # optional
    MAPPING = None            # optional

    save_diels_alder_xyz(CHNO, RPS, MAPPING)


Diels–Alder hit: MR_117032_2  C=CC=C=O.C=C → O=C1CCCC=C1
 → saved XYZs to diels_alder_xyz/MR_117032_2_*.xyz
Diels–Alder hit: MR_591173_0  O=C=C(C=C(C)C)O.C=C → OC1=CC(C)(C)CCC1=O
 → saved XYZs to diels_alder_xyz/MR_591173_0_*.xyz

Done — see diels_alder_hits.txt and diels_alder_xyz/


In [20]:
# find_robinson_annulation_rgd1.py

import os
from rdkit import Chem
from rdkit.Chem.rdchem import BondType
from ase import Atoms
from ase.io import write

# SMARTS for a Michael donor (1,3-diketone) and Michael acceptor (α,β-unsaturated ketone)
DKETONE_PAT  = Chem.MolFromSmarts("[#6]C(=O)CC(=O)[#6]")          # R–C(=O)–CH2–C(=O)–R
ENONE_PAT    = Chem.MolFromSmarts("[CX3H1]=[CX2]C(=O)[#6]")       # R–CH=CH–C(=O)–R

def is_michael_addition(r_smiles: str, p_smiles: str) -> bool:
    """Step 1: Michael addition of a 1,3-diketone onto an enone → 1,5-diketone."""
    frags = r_smiles.split('.')
    if len(frags) != 2:
        return False
    d1, d2 = Chem.MolFromSmiles(frags[0]), Chem.MolFromSmiles(frags[1])
    prod = Chem.MolFromSmiles(p_smiles)
    if not all((d1, d2, prod)):
        return False

    # one fragment is 1,3-diketone, the other is enone
    if (d1.HasSubstructMatch(DKETONE_PAT) and d2.HasSubstructMatch(ENONE_PAT)) \
    or (d2.HasSubstructMatch(DKETONE_PAT) and d1.HasSubstructMatch(ENONE_PAT)):
        # product should still have two carbonyls (now 1,5-diketone) and no C=C
        if prod.HasSubstructMatch(DKETONE_PAT) and not prod.HasSubstructMatch(ENONE_PAT):
            return True
    return False

def is_aldol_cyclization(r_smiles: str, p_smiles: str) -> bool:
    """Step 2: Intramolecular aldol + dehydration → cyclohexenone ring."""
    react = Chem.MolFromSmiles(r_smiles)
    prod  = Chem.MolFromSmiles(p_smiles)
    if not (react and prod):
        return False

    # product must contain a 6-membered ring with exactly one C=C and one C=O
    ri = prod.GetRingInfo()
    for ring in ri.AtomRings():
        if len(ring) == 6:
            dbonds = 0
            carbonyls = 0
            for i in range(len(ring)):
                a1 = ring[i]
                a2 = ring[(i+1)%len(ring)]
                b = prod.GetBondBetweenAtoms(a1, a2)
                if b.GetBondType() == BondType.DOUBLE:
                    dbonds += 1
            # count C=O attachments into the ring
            for a in ring:
                atom = prod.GetAtomWithIdx(a)
                for nbr in atom.GetNeighbors():
                    b2 = prod.GetBondBetweenAtoms(a, nbr.GetIdx())
                    if nbr.GetAtomicNum() == 8 and b2.GetBondType() == BondType.DOUBLE:
                        carbonyls += 1
            if dbonds == 1 and carbonyls == 1:
                return True
    return False

def to_atoms(numbers, positions):
    return Atoms(numbers=numbers, positions=positions)

def find_robinson_annulation(chno_h5, rps_h5=None, randp_txt=None,
                             out_txt="robinson_hits.txt", xyz_dir="robinson_xyz"):
    loader = RGD1Loader(chno_h5_path=chno_h5,
                        rps_h5_path=rps_h5,
                        randp_smiles_txt=randp_txt)

    # index by product SMILES
    prod_index = {}
    for rxn in loader:
        p_sm = rxn["product"]["smiles"]
        prod_index.setdefault(p_sm, []).append(rxn)

    os.makedirs(xyz_dir, exist_ok=True)
    with open(out_txt, "w") as fout:
        fout.write("step1_rxn\tstep2_rxn\treactant1\tintermediate2\tproduct2\n")
        idx_seq = 0
        for r1 in loader:
            r1_sm = r1["reactant"]["smiles"]
            p1_sm = r1["product"]["smiles"]

            if not is_michael_addition(r1_sm, p1_sm):
                continue

            # look for intramolecular aldol on that intermediate
            for r2 in prod_index.get(p1_sm, []):
                if is_aldol_cyclization(r2["reactant"]["smiles"], r2["product"]["smiles"]):
                    # record
                    fout.write(f"{r1['rxn_id']}\t{r2['rxn_id']}\t"
                               f"{r1_sm}\t{p1_sm}\t{r2['product']['smiles']}\n")
                    print(f"Robinson annulation: {r1['rxn_id']} → {r2['rxn_id']}")

                    # save geometries
                    for label, rxn in [("step1", r1), ("step2", r2)]:
                        for state in ("reactant", "transition_state", "product"):
                            geom = rxn[state]
                            atoms = to_atoms(geom["atomic_numbers"], geom["positions"])
                            fn = os.path.join(xyz_dir, f"r{idx_seq}.xyz")
                            write(fn, atoms, append=True, format="extxyz")
                    break
            idx_seq += 1    

    print(f"\nDone — see hits in {out_txt} and geometries in {xyz_dir}/")

if __name__ == "__main__":
    CHNO    = "RGD1_CHNO.h5"
    RPS     = None
    MAPPING = None

    find_robinson_annulation(CHNO, RPS, MAPPING)



Done — see hits in robinson_hits.txt and geometries in robinson_xyz/


In [21]:
import os
from rdkit import Chem
from ase import Atoms
from ase.io import write

# SMARTS patterns for Michael donors and acceptors
DKETONE_PAT = Chem.MolFromSmarts("[#6]C(=O)CC(=O)[#6]")      # 1,3-diketone
ENONE_PAT   = Chem.MolFromSmarts("[CX3H1]=[CX2]C(=O)[#6]")    # α,β-unsaturated ketone

def is_michael_addition(r_smiles: str, p_smiles: str) -> bool:
    """
    Detects Michael addition: 1,3-diketone + enone -> 1,5-diketone
    """
    frags = r_smiles.split('.')
    if len(frags) != 2:
        return False
    mols = [Chem.MolFromSmiles(f) for f in frags]
    prod = Chem.MolFromSmiles(p_smiles)
    if None in mols or prod is None:
        return False
    d1, d2 = mols
    # one fragment is diketone, the other is enone
    if ((d1.HasSubstructMatch(DKETONE_PAT) and d2.HasSubstructMatch(ENONE_PAT)) or
        (d2.HasSubstructMatch(DKETONE_PAT) and d1.HasSubstructMatch(ENONE_PAT))):
        # product should have both carbonyls (diketone) and no C=C
        if prod.HasSubstructMatch(DKETONE_PAT) and not prod.HasSubstructMatch(ENONE_PAT):
            return True
    return False

def to_atoms(numbers, positions):
    return Atoms(numbers=numbers, positions=positions)

def find_michael_and_save_extxyz(chno_h5, rps_h5=None, randp_txt=None,
                                 xyz_dir="michael_extxyz"):
    """
    Finds Michael addition reactions in RGD1 and saves each reaction
    (reactant, TS, product) into individual extended XYZ files.
    """
    loader = RGD1Loader(chno_h5_path=chno_h5,
                        rps_h5_path=rps_h5,
                        randp_smiles_txt=randp_txt)
    os.makedirs(xyz_dir, exist_ok=True)

    idx_seq = 0
    for rxn in loader:
        r_sm = rxn["reactant"]["smiles"]
        p_sm = rxn["product"]["smiles"]
        if not is_michael_addition(r_sm, p_sm):
            continue

        filename = os.path.join(xyz_dir, f"r{idx_seq}.xyz")
        # remove existing file if any
        if os.path.exists(filename):
            os.remove(filename)

        # append all states to one extended XYZ
        for state in ("reactant", "transition_state", "product"):
            geom = rxn[state]
            atoms = to_atoms(geom["atomic_numbers"], geom["positions"])
            atoms.info["reaction_id"] = rxn.get("rxn_id", f"r{idx_seq}")
            atoms.info["state"] = state
            write(filename, atoms, append=True, format="extxyz")
        print(f"Saved Michael reaction {rxn.get('rxn_id', idx_seq)} to {filename}")
        idx_seq += 1

    print(f"\nTotal Michael additions saved: {idx_seq}")

if __name__ == "__main__":
    CHNO    = "RGD1_CHNO.h5"
    RPS     = None            # optional
    MAPPING = None            # optional

    find_michael_and_save_extxyz(CHNO, RPS, MAPPING)




Total Michael additions saved: 0


In [38]:
# Corrected script for detecting named reactions in RGD1 with RDKit and ASE
# Fix: Use Chem.MolToSmiles instead of non-existent Chem.CanonicalSmiles

import os
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from ase import Atoms
from ase.io import write

# SMARTS patterns (approximate) for named reactions
PATTERNS = {
    'claisen': {
        'reactant': Chem.MolFromSmarts('[#6]=[#6]-[#6]-O-[#6]'),      # vinyl ether
        'product':  Chem.MolFromSmarts('[#6]-[#6]=[#6]-C(=O)[#6]')   # γ,δ-unsat carbonyl
    },
    'cope': {
        'reactant': Chem.MolFromSmarts('[#6]=[#6]-[#6]-[#6]-[#6]=[#6]'),  # 1,5-diene
        'product':  Chem.MolFromSmarts('[#6]=[#6]-[#6]-[#6]-[#6]=[#6]')   # isomerization
    },
    'pinacol': {
        'reactant': Chem.MolFromSmarts('C([OX2H])[C;!H0]([OX2H])'),        # vicinal diol
        'product':  Chem.MolFromSmarts('C(=O)C([OX2H])')                  # α-hydroxy ketone
    },
    'wagner_meerwein': {
        # general carbon rearrangement
        'reactant': None,
        'product':  None
    }
}

def is_isomerization(r_mol, p_mol):
    """Check same formula and different connectivity."""
    return (rdMolDescriptors.CalcMolFormula(r_mol) == rdMolDescriptors.CalcMolFormula(p_mol)
            and Chem.MolToSmiles(r_mol, canonical=True) != Chem.MolToSmiles(p_mol, canonical=True))

def detect_claisen(r_mol, p_mol):
    return (r_mol.HasSubstructMatch(PATTERNS['claisen']['reactant'])
            and p_mol.HasSubstructMatch(PATTERNS['claisen']['product']))

def detect_cope(r_mol, p_mol):
    return (r_mol.HasSubstructMatch(PATTERNS['cope']['reactant'])
            and p_mol.HasSubstructMatch(PATTERNS['cope']['product'])
            and is_isomerization(r_mol, p_mol))

def detect_pinacol(r_mol, p_mol):
    return (r_mol.HasSubstructMatch(PATTERNS['pinacol']['reactant'])
            and p_mol.HasSubstructMatch(PATTERNS['pinacol']['product']))

def detect_wagner_meerwein(r_mol, p_mol):
    # pure carbon isomerization
    return (len(r_mol.GetAtoms()) == len(p_mol.GetAtoms())
            and all(atom.GetAtomicNum() == 6 for atom in r_mol.GetAtoms())
            and all(atom.GetAtomicNum() == 6 for atom in p_mol.GetAtoms())
            and is_isomerization(r_mol, p_mol))

def to_atoms(geom):
    return Atoms(numbers=geom['atomic_numbers'], positions=geom['positions'])

def save_extxyz(rxn, out_dir, idx):
    os.makedirs(out_dir, exist_ok=True)
    filename = os.path.join(out_dir, f"r{idx}.xyz")
    if os.path.exists(filename):
        os.remove(filename)
    for state in ('reactant', 'transition_state', 'product'):
        geom = rxn[state]
        atoms = to_atoms(geom)
        atoms.info['rxn_id'] = rxn['rxn_id']
        atoms.info['state'] = state
        atoms.info['energy'] = geom.get('energy')
        write(filename, atoms, append=True, format='extxyz')

def find_named_reactions(chno_h5, rps_h5=None, randp_txt=None):
    loader = RGD1Loader(chno_h5_path=chno_h5,
                        rps_h5_path=rps_h5,
                        randp_smiles_txt=randp_txt)
    detectors = {
        #'claisen':           detect_claisen,
        'cope':              detect_cope,
        #'pinacol':           detect_pinacol,
        #'wagner_meerwein':   detect_wagner_meerwein
    }
    counts = {name: 0 for name in detectors}
    for name, detector in detectors.items():
        out_dir = f"{name}_xyz"
        idx = 0
        for rxn in loader:
            r_mol = Chem.MolFromSmiles(rxn['reactant']['smiles'])
            p_mol = Chem.MolFromSmiles(rxn['product']['smiles'])
            if not r_mol or not p_mol:
                continue
            if detector(r_mol, p_mol):
                save_extxyz(rxn, out_dir, idx)
                print(f"{name.title()} hit: {rxn['rxn_id']} → saved to {out_dir}/r{idx}.xyz")
                idx += 1
        counts[name] = idx
    print("Summary of hits:")
    for name, count in counts.items():
        print(f"  {name.title()}: {count}")

if __name__ == '__main__':
    CHNO = 'RGD1_CHNO.h5'
    RPS = None
    MAP = None
    find_named_reactions(CHNO, RPS, MAP)


Cope hit: MR_103198_0 → saved to cope_xyz/r0.xyz
Cope hit: MR_103198_2 → saved to cope_xyz/r1.xyz
Cope hit: MR_104090_1 → saved to cope_xyz/r2.xyz
Cope hit: MR_110865_0 → saved to cope_xyz/r3.xyz
Cope hit: MR_110865_1 → saved to cope_xyz/r4.xyz
Cope hit: MR_110865_2 → saved to cope_xyz/r5.xyz
Cope hit: MR_125664_2 → saved to cope_xyz/r6.xyz
Cope hit: MR_125915_1 → saved to cope_xyz/r7.xyz
Cope hit: MR_130501_0 → saved to cope_xyz/r8.xyz
Cope hit: MR_130501_1 → saved to cope_xyz/r9.xyz
Cope hit: MR_130755_0 → saved to cope_xyz/r10.xyz
Cope hit: MR_130755_2 → saved to cope_xyz/r11.xyz
Cope hit: MR_134331_0 → saved to cope_xyz/r12.xyz
Cope hit: MR_134331_2 → saved to cope_xyz/r13.xyz
Cope hit: MR_142477_0 → saved to cope_xyz/r14.xyz
Cope hit: MR_143120_0 → saved to cope_xyz/r15.xyz
Cope hit: MR_143120_1 → saved to cope_xyz/r16.xyz
Cope hit: MR_163294_0 → saved to cope_xyz/r17.xyz
Cope hit: MR_163294_1 → saved to cope_xyz/r18.xyz
Cope hit: MR_171476_0 → saved to cope_xyz/r19.xyz
Cope hit: 

In [3]:
import os
from rdkit import Chem
from ase import Atoms
from ase.io import write

# SMARTS patterns for Beckmann rearrangement
oxime_pat  = Chem.MolFromSmarts("[CX3]=[NX2]-[OX2H]")  # R1R2C=NOH
amide_pat  = Chem.MolFromSmarts("[CX3](=O)[NX3]")      # R1C(=O)NR2

def to_atoms(numbers, positions):
    return Atoms(numbers=numbers, positions=positions)

def find_beckmann_and_save_xyz(chno_h5, rps_h5=None, randp_txt=None,
                               xyz_dir="beckmann_xyz"):
    """
    Finds Beckmann rearrangement reactions in RGD1 (oxime -> amide)
    and saves reactant, TS, and product geometries to individual .xyz files.
    """
    loader = RGD1Loader(chno_h5_path=chno_h5,
                        rps_h5_path=rps_h5,
                        randp_smiles_txt=randp_txt)
    os.makedirs(xyz_dir, exist_ok=True)

    hits = 0
    for rxn in loader:
        rid = rxn["rxn_id"]
        r_sm = rxn["reactant"]["smiles"]
        p_sm = rxn["product"]["smiles"]
        r_mol = Chem.MolFromSmiles(r_sm)
        p_mol = Chem.MolFromSmiles(p_sm)
        if not r_mol or not p_mol:
            continue

        # detect Beckmann: oxime -> amide
        if r_mol.HasSubstructMatch(oxime_pat) and p_mol.HasSubstructMatch(amide_pat):
            hits += 1
            print(f"Beckmann hit: {rid}  {r_sm} → {p_sm}")

            # save XYZ files
            for state in ("reactant", "transition_state", "product"):
                geom = rxn[state]
                atoms = to_atoms(geom["atomic_numbers"], geom["positions"])
                filename = os.path.join(xyz_dir, f"r{hits-1}.xyz")
                write(filename, atoms, append=True, format="extxyz")
            print(f"  → saved geometries to {xyz_dir}/{rid}_*.xyz")

    print(f"\nTotal Beckmann reactions found: {hits}")

# Example usage:
if __name__ == "__main__":
    CHNO    = "RGD1_CHNO.h5"
    RPS     = None            # optional
    MAPPING = None            # optional

    find_beckmann_and_save_xyz(CHNO, RPS, MAPPING)



Beckmann hit: MR_100791_1  O/N=C/[C@H](C(=O)O)C → C/C=C\N(C(=O)O)O
  → saved geometries to beckmann_xyz/MR_100791_1_*.xyz
Beckmann hit: MR_1010_0  C/C(=N\O)/OCC → CCN(C(=O)C)O
  → saved geometries to beckmann_xyz/MR_1010_0_*.xyz
Beckmann hit: MR_102368_0  C/C(=N\O)/OC(=O)N → ON(C(=O)N)C(=O)C
  → saved geometries to beckmann_xyz/MR_102368_0_*.xyz
Beckmann hit: MR_102381_1  CC(=O)O/C(=N/O)/N → ON(C(=O)N)C(=O)C
  → saved geometries to beckmann_xyz/MR_102381_1_*.xyz
Beckmann hit: MR_110072_0  C/C(=N\O)/OC=C → CC(=O)N(C=C)O
  → saved geometries to beckmann_xyz/MR_110072_0_*.xyz
Beckmann hit: MR_114005_0  CC(O)(O)N.ON=C → NC(=O)O.CN(O)C
  → saved geometries to beckmann_xyz/MR_114005_0_*.xyz
Beckmann hit: MR_114201_2  O/N=C(/CC(=O)O)\C → CC(=C)N(C(=O)O)O
  → saved geometries to beckmann_xyz/MR_114201_2_*.xyz
Beckmann hit: MR_116607_0  CC(=O)C.C/C=N\O → CC(N(C(=O)C)O)C
  → saved geometries to beckmann_xyz/MR_116607_0_*.xyz
Beckmann hit: MR_120529_1  CC(=O)N.ON=C → CC(=O)N(CN)O
  → saved geomet

In [2]:
import os
from rdkit import Chem
from ase import Atoms
from ase.io import write

# SMARTS patterns for Curtius
acyl_azide_pat = Chem.MolFromSmarts('[CX3](=O)N=[NX2]=[NX1]')
isocyanate_pat = Chem.MolFromSmarts('[NX2]=[CX2]=[OX1]')

def to_atoms(geom):
    return Atoms(numbers=geom['atomic_numbers'], positions=geom['positions'])

def find_curtius_and_save_extxyz(checkpoint_path, xyz_dir="curtius_extxyz"):
    loader = RGD1Loader(checkpoint_path)
    os.makedirs(xyz_dir, exist_ok=True)

    idx = 0
    for rxn in loader:
        r_mol = Chem.MolFromSmiles(rxn['reactant']['smiles'])
        p_mol = Chem.MolFromSmiles(rxn['product']['smiles'])
        if not r_mol or not p_mol:
            continue

        if r_mol.HasSubstructMatch(acyl_azide_pat) and p_mol.HasSubstructMatch(isocyanate_pat):
            filename = os.path.join(xyz_dir, f"curtius_{idx}.xyz")
            if os.path.exists(filename):
                os.remove(filename)

            for state in ("reactant", "transition_state", "product"):
                geom = rxn[state]
                atoms = to_atoms(geom)
                atoms.info["rxn_id"] = rxn["rxn_id"]
                atoms.info["state"] = state
                write(filename, atoms, append=True, format="extxyz")

            print(f"Saved Curtius reaction {rxn['rxn_id']} to {filename}")
            idx += 1

    print(f"\nTotal Curtius rearrangements saved: {idx}")

if __name__ == "__main__":
    find_curtius_and_save_extxyz("RGD1_CHNO.h5")



Total Curtius rearrangements saved: 0


In [5]:
import os
from rdkit import Chem
from ase import Atoms
from ase.io import write

# SMARTS for Lossen rearrangement (hydroxamic acid → isocyanate)
hydroxamic_pat = Chem.MolFromSmarts('[CX3](=O)N[OX2H]')
isocyanate_pat = Chem.MolFromSmarts('[NX2]=[CX2]=[OX1]')

# SMARTS for Cannizzaro reaction (disproportionation of two aldehydes)
aldehyde_pat = Chem.MolFromSmarts('[CX3H1](=O)[#6]')
alcohol_pat  = Chem.MolFromSmarts('[CX4H2][OX2H]')
acid_pat     = Chem.MolFromSmarts('[CX3](=O)[OX2H]')

def to_atoms(geom):
    """Convert geometry dict to an ASE Atoms object."""
    return Atoms(numbers=geom['atomic_numbers'], positions=geom['positions'])

def save_extxyz(rxn, idx, rxn_type, out_dir):
    """
    Write reactant, TS, and product of a single reaction
    into one extended-XYZ file named {rxn_type}_{idx}.xyz.
    """
    os.makedirs(out_dir, exist_ok=True)
    fname = os.path.join(out_dir, f"r{idx}.xyz")
    # Remove existing to start fresh
    if os.path.exists(fname):
        os.remove(fname)
    for state in ('reactant', 'transition_state', 'product'):
        geom = rxn[state]
        atoms = to_atoms(geom)
        atoms.info['rxn_id'] = rxn['rxn_id']
        atoms.info['state']  = state
        write(fname, atoms, append=True, format='extxyz')

def find_lossen(chno_h5, out_dir="lossen_extxyz"):
    """
    Scan RGD1 for Lossen rearrangements (hydroxamic acid → isocyanate)
    and save each hit to an extended XYZ.
    """
    loader = RGD1Loader(chno_h5)
    idx = 0
    for rxn in loader:
        r_sm = rxn['reactant']['smiles']
        p_sm = rxn['product']['smiles']
        r_mol = Chem.MolFromSmiles(r_sm)
        p_mol = Chem.MolFromSmiles(p_sm)
        if r_mol and p_mol:
            if (r_mol.HasSubstructMatch(hydroxamic_pat)
             and p_mol.HasSubstructMatch(isocyanate_pat)):
                save_extxyz(rxn, idx, "lossen", out_dir)
                print(f"Lossen hit: {rxn['rxn_id']} → {out_dir}/lossen_{idx}.xyz")
                idx += 1
    print(f"Total Lossen hits: {idx}")

def find_cannizzaro(chno_h5, out_dir="cannizzaro_extxyz"):
    """
    Scan RGD1 for Cannizzaro reactions (2 R–CHO → R–CH2OH + R–COOH)
    and save each hit to an extended XYZ.
    """
    loader = RGD1Loader(chno_h5)
    idx = 0
    for rxn in loader:
        r_sm = rxn['reactant']['smiles']
        p_sm = rxn['product']['smiles']
        # Reactant must be two aldehydes
        frags_r = r_sm.split('.')
        if len(frags_r) != 2:
            continue
        mols_r = [Chem.MolFromSmiles(f) for f in frags_r]
        if any(m is None for m in mols_r):
            continue
        if not all(m.HasSubstructMatch(aldehyde_pat) for m in mols_r):
            continue
        # Product must be one alcohol + one acid
        frags_p = p_sm.split('.')
        if len(frags_p) != 2:
            continue
        mols_p = [Chem.MolFromSmiles(f) for f in frags_p]
        if any(m is None for m in mols_p):
            continue
        if ( any(m.HasSubstructMatch(alcohol_pat) for m in mols_p)
          and any(m.HasSubstructMatch(acid_pat)    for m in mols_p) ):
            save_extxyz(rxn, idx, "cannizzaro", out_dir)
            print(f"Cannizzaro hit: {rxn['rxn_id']} → {out_dir}/cannizzaro_{idx}.xyz")
            idx += 1
    print(f"Total Cannizzaro hits: {idx}")

if __name__ == "__main__":
    # Path to your RGD1_CHNO.h5
    CHNO_PATH = "RGD1_CHNO.h5"

    # Find and save Lossen rearrangements
    find_lossen(CHNO_PATH, out_dir="lossen_extxyz")

    # Find and save Cannizzaro reactions
    find_cannizzaro(CHNO_PATH, out_dir="cannizzaro_extxyz")


Lossen hit: MR_110070_1 → lossen_extxyz/lossen_0.xyz
Lossen hit: MR_118404_1 → lossen_extxyz/lossen_1.xyz
Lossen hit: MR_119727_0 → lossen_extxyz/lossen_2.xyz
Lossen hit: MR_124830_0 → lossen_extxyz/lossen_3.xyz
Lossen hit: MR_125002_2 → lossen_extxyz/lossen_4.xyz
Lossen hit: MR_143778_0 → lossen_extxyz/lossen_5.xyz
Lossen hit: MR_145939_1 → lossen_extxyz/lossen_6.xyz
Lossen hit: MR_148737_0 → lossen_extxyz/lossen_7.xyz
Lossen hit: MR_148737_1 → lossen_extxyz/lossen_8.xyz
Lossen hit: MR_176761_2 → lossen_extxyz/lossen_9.xyz
Lossen hit: MR_199418_0 → lossen_extxyz/lossen_10.xyz
Lossen hit: MR_199418_2 → lossen_extxyz/lossen_11.xyz
Lossen hit: MR_202499_0 → lossen_extxyz/lossen_12.xyz
Lossen hit: MR_202818_1 → lossen_extxyz/lossen_13.xyz
Lossen hit: MR_205252_1 → lossen_extxyz/lossen_14.xyz
Lossen hit: MR_215323_1 → lossen_extxyz/lossen_15.xyz
Lossen hit: MR_216566_2 → lossen_extxyz/lossen_16.xyz
Lossen hit: MR_220656_1 → lossen_extxyz/lossen_17.xyz
Lossen hit: MR_224101_1 → lossen_extxy

In [6]:
import os
from rdkit import Chem
from ase import Atoms
from ase.io import write

# SMARTS for Alder–ene reaction
# ene donor: allylic H next to C=C -> pattern: [CH2;!$(*=*)]-[CX2]=[CX2]
# enophile: C=O
ene_donor_pat = Chem.MolFromSmarts('[CH2;!$(*=*)]-[CX2]=[CX2]')
enophile_pat  = Chem.MolFromSmarts('[CX3]=O')
alkene_pat    = Chem.MolFromSmarts('[CX3]=[CX3]')  # product alkene or alcohol

# SMARTS for Dieckmann condensation (intramolecular Claisen)
ester_pat = Chem.MolFromSmarts('[CX3](=O)O[#6]')

def to_atoms(geom):
    return Atoms(numbers=geom['atomic_numbers'], positions=geom['positions'])

def save_extxyz(rxn, idx, rxn_type, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    fname = os.path.join(out_dir, f"r{idx}.xyz")
    if os.path.exists(fname):
        os.remove(fname)
    for state in ('reactant', 'transition_state', 'product'):
        geom = rxn[state]
        atoms = to_atoms(geom)
        atoms.info['rxn_id'] = rxn['rxn_id']
        atoms.info['state']  = state
        write(fname, atoms, append=True, format='extxyz')

def find_ene(chno_h5, out_dir="ene_extxyz"):
    """
    Finds Alder–ene type reactions: an ene donor with allylic H and an enophile (C=O).
    """
    loader = RGD1Loader(chno_h5)
    idx = 0
    for rxn in loader:
        r_sm = rxn['reactant']['smiles']
        p_sm = rxn['product']['smiles']
        # require two fragments
        if '.' not in r_sm:
            continue
        frags = r_sm.split('.')
        if len(frags) != 2:
            continue
        donor_mol = Chem.MolFromSmiles(frags[0])
        enophile_mol = Chem.MolFromSmiles(frags[1])
        p_mol = Chem.MolFromSmiles(p_sm)
        if not donor_mol or not enophile_mol or not p_mol:
            continue
        # detect patterns
        if donor_mol.HasSubstructMatch(ene_donor_pat) and enophile_mol.HasSubstructMatch(enophile_pat):
            # product should have new C-O bond: detect alcohol pat
            if p_mol.HasSubstructMatch(Chem.MolFromSmarts('[OX2H]')) and p_mol.HasSubstructMatch(alkene_pat):
                save_extxyz(rxn, idx, "ene", out_dir)
                print(f"Ene hit: {rxn['rxn_id']} → {out_dir}/ene_{idx}.xyz")
                idx += 1
    print(f"Total Ene hits: {idx}")

def find_dieckmann(chno_h5, out_dir="dieckmann_extxyz"):
    """
    Finds Dieckmann condensations: intramolecular di-ester -> cyclic β-keto ester.
    """
    loader = RGD1Loader(chno_h5)
    idx = 0
    for rxn in loader:
        r_sm = rxn['reactant']['smiles']
        # single fragment containing at least two ester groups
        if '.' in r_sm:
            continue
        r_mol = Chem.MolFromSmiles(r_sm)
        p_mol = Chem.MolFromSmiles(rxn['product']['smiles'])
        if not r_mol or not p_mol:
            continue
        # count ester substructures
        ester_matches = r_mol.GetSubstructMatches(ester_pat)
        if len(ester_matches) < 2:
            continue
        # product should be cyclic (ring count > 0)
        if p_mol.GetRingInfo().NumRings() > 0:
            save_extxyz(rxn, idx, "dieckmann", out_dir)
            print(f"Dieckmann hit: {rxn['rxn_id']} → {out_dir}/dieckmann_{idx}.xyz")
            idx += 1
    print(f"Total Dieckmann hits: {idx}")

# Example usage:
find_ene("RGD1_CHNO.h5")
find_dieckmann("RGD1_CHNO.h5")


Total Ene hits: 0
Dieckmann hit: MR_104396_0 → dieckmann_extxyz/dieckmann_0.xyz
Dieckmann hit: MR_148002_1 → dieckmann_extxyz/dieckmann_1.xyz
Dieckmann hit: MR_148012_0 → dieckmann_extxyz/dieckmann_2.xyz
Dieckmann hit: MR_16903_1 → dieckmann_extxyz/dieckmann_3.xyz
Dieckmann hit: MR_179761_1 → dieckmann_extxyz/dieckmann_4.xyz
Dieckmann hit: MR_249031_1 → dieckmann_extxyz/dieckmann_5.xyz
Dieckmann hit: MR_249057_0 → dieckmann_extxyz/dieckmann_6.xyz
Dieckmann hit: MR_274870_1 → dieckmann_extxyz/dieckmann_7.xyz
Dieckmann hit: MR_291167_0 → dieckmann_extxyz/dieckmann_8.xyz
Dieckmann hit: MR_295877_1 → dieckmann_extxyz/dieckmann_9.xyz
Dieckmann hit: MR_307782_1 → dieckmann_extxyz/dieckmann_10.xyz
Dieckmann hit: MR_362268_1 → dieckmann_extxyz/dieckmann_11.xyz
Dieckmann hit: MR_362396_0 → dieckmann_extxyz/dieckmann_12.xyz
Dieckmann hit: MR_362929_0 → dieckmann_extxyz/dieckmann_13.xyz
Dieckmann hit: MR_369955_1 → dieckmann_extxyz/dieckmann_14.xyz
Dieckmann hit: MR_373937_0 → dieckmann_extxyz/di

In [7]:
import os
from rdkit import Chem
from ase import Atoms
from ase.io import write

# Common SMARTS
aldehyde_pat      = Chem.MolFromSmarts('[CX3H1](=O)[#6]')
aldol_product_pat = Chem.MolFromSmarts('C([OX2H])C(=O)')       # β‑hydroxy carbonyl

ester_pat         = Chem.MolFromSmarts('[CX3](=O)O[#6]')
ketone_pat        = Chem.MolFromSmarts('[CX3](=O)[#6]')

hexatriene_pat    = Chem.MolFromSmarts('C=C-C=C-C=C')
cyclohexadiene_pat= Chem.MolFromSmarts('C1=CC=CC=C1')

def to_atoms(geom):
    return Atoms(numbers=geom['atomic_numbers'], positions=geom['positions'])

def save_extxyz(rxn, idx, name, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    fn = os.path.join(out_dir, f"r{idx}.xyz")
    if os.path.exists(fn):
        os.remove(fn)
    for state in ('reactant','transition_state','product'):
        geom = rxn[state]
        atoms = to_atoms(geom)
        atoms.info['rxn_id'] = rxn['rxn_id']
        atoms.info['state']  = state
        write(fn, atoms, append=True, format='extxyz')

def find_aldol(RGD1_h5, out_dir="aldol_extxyz"):
    loader = RGD1Loader(RGD1_h5)
    idx = 0
    for rxn in loader:
        r_sm = rxn['reactant']['smiles']
        p_sm = rxn['product']['smiles']
        if '.' not in r_sm: 
            continue
        frags = r_sm.split('.')
        if len(frags)!=2: 
            continue
        mols = [Chem.MolFromSmiles(f) for f in frags]
        if None in mols: 
            continue
        # both fragments must be aldehydes
        if not all(m.HasSubstructMatch(aldehyde_pat) for m in mols):
            continue
        p_mol = Chem.MolFromSmiles(p_sm)
        if p_mol and p_mol.HasSubstructMatch(aldol_product_pat):
            save_extxyz(rxn, idx, "aldol", out_dir)
            print(f"Aldol hit: {rxn['rxn_id']} → {out_dir}/aldol_{idx}.xyz")
            idx += 1
    print(f"Total Aldol hits: {idx}")

def find_claisen_condensation(RGD1_h5, out_dir="claisen_cond_extxyz"):
    loader = RGD1Loader(RGD1_h5)
    idx = 0
    for rxn in loader:
        r_sm = rxn['reactant']['smiles']
        p_sm = rxn['product']['smiles']
        if '.' not in r_sm:
            continue
        frags = r_sm.split('.')
        if len(frags)!=2:
            continue
        mols = [Chem.MolFromSmiles(f) for f in frags]
        if None in mols:
            continue
        # both fragments must be esters
        if not all(m.HasSubstructMatch(ester_pat) for m in mols):
            continue
        p_mol = Chem.MolFromSmiles(p_sm)
        # product must have at least two ketones and still an ester
        if (p_mol and
            len(p_mol.GetSubstructMatches(ketone_pat)) >= 2 and
            p_mol.HasSubstructMatch(ester_pat)):
            save_extxyz(rxn, idx, "claisen_cond", out_dir)
            print(f"Claisen condensation hit: {rxn['rxn_id']} → {out_dir}/claisen_cond_{idx}.xyz")
            idx += 1
    print(f"Total Claisen condensation hits: {idx}")

def find_electrocyclic(RGD1_h5, out_dir="electrocyclic_extxyz"):
    loader = RGD1Loader(RGD1_h5)
    idx = 0
    for rxn in loader:
        r_sm = rxn['reactant']['smiles']
        p_sm = rxn['product']['smiles']
        # linear hexatriene reactant
        if Chem.MolFromSmiles(r_sm).HasSubstructMatch(hexatriene_pat):
            p_mol = Chem.MolFromSmiles(p_sm)
            # cyclohexadiene product
            if p_mol and p_mol.HasSubstructMatch(cyclohexadiene_pat):
                save_extxyz(rxn, idx, "electrocyclic", out_dir)
                print(f"Electrocyclic hit: {rxn['rxn_id']} → {out_dir}/electrocyclic_{idx}.xyz")
                idx += 1
    print(f"Total electrocyclic hits: {idx}")

if __name__ == "__main__":
    H5 = "RGD1_CHNO.h5"
    find_aldol(H5)
    find_claisen_condensation(H5)
    find_electrocyclic(H5)


Aldol hit: MR_153989_0 → aldol_extxyz/aldol_0.xyz
Aldol hit: MR_187971_1 → aldol_extxyz/aldol_1.xyz
Aldol hit: MR_190968_2 → aldol_extxyz/aldol_2.xyz
Aldol hit: MR_208332_1 → aldol_extxyz/aldol_3.xyz
Aldol hit: MR_267947_0 → aldol_extxyz/aldol_4.xyz
Aldol hit: MR_27718_2 → aldol_extxyz/aldol_5.xyz
Aldol hit: MR_293218_1 → aldol_extxyz/aldol_6.xyz
Aldol hit: MR_295659_2 → aldol_extxyz/aldol_7.xyz
Aldol hit: MR_311415_0 → aldol_extxyz/aldol_8.xyz
Aldol hit: MR_353564_2 → aldol_extxyz/aldol_9.xyz
Aldol hit: MR_360083_1 → aldol_extxyz/aldol_10.xyz
Aldol hit: MR_395380_1 → aldol_extxyz/aldol_11.xyz
Aldol hit: MR_412211_0 → aldol_extxyz/aldol_12.xyz
Aldol hit: MR_425354_2 → aldol_extxyz/aldol_13.xyz
Aldol hit: MR_426234_1 → aldol_extxyz/aldol_14.xyz
Aldol hit: MR_434267_0 → aldol_extxyz/aldol_15.xyz
Aldol hit: MR_472789_0 → aldol_extxyz/aldol_16.xyz
Aldol hit: MR_492610_2 → aldol_extxyz/aldol_17.xyz
Aldol hit: MR_499007_0 → aldol_extxyz/aldol_18.xyz
Aldol hit: MR_499007_2 → aldol_extxyz/aldo

In [8]:
import os
from rdkit import Chem
from ase import Atoms
from ase.io import write

# SMARTS patterns
ketone_pat   = Chem.MolFromSmarts('[#6][CX3](=O)[#6]')     # R-CO-R'
enol_pat     = Chem.MolFromSmarts('[#6][CX3]=[CX3]O[H]')   # R-C=C-OH

alcohol_pat  = Chem.MolFromSmarts('[CX4H2][OX2H]')         # R-CH2-OH
betaH_pat    = Chem.MolFromSmarts('[#6][CX4H2][#6]')      # CH2 with at least one neighbor C
alkene_pat   = Chem.MolFromSmarts('[#6]=[#6]')            # C=C

def to_atoms(geom):
    return Atoms(numbers=geom['atomic_numbers'], positions=geom['positions'])

def save_extxyz(rxn, idx, name, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    fn = os.path.join(out_dir, f"r{idx}.xyz")
    if os.path.exists(fn):
        os.remove(fn)
    for state in ('reactant', 'transition_state', 'product'):
        geom = rxn[state]
        atoms = to_atoms(geom)
        atoms.info['rxn_id'] = rxn['rxn_id']
        atoms.info['state']  = state
        write(fn, atoms, append=True, format='extxyz')

def find_tautomerism(h5path, out_dir="tautomerism_extxyz"):
    loader = RGD1Loader(h5path)
    idx = 0
    for rxn in loader:
        r_sm = rxn['reactant']['smiles']
        p_sm = rxn['product']['smiles']
        r_mol = Chem.MolFromSmiles(r_sm)
        p_mol = Chem.MolFromSmiles(p_sm)
        if not r_mol or not p_mol:
            continue
        if (r_mol.HasSubstructMatch(ketone_pat)
         and p_mol.HasSubstructMatch(enol_pat)):
            save_extxyz(rxn, idx, "tautomer", out_dir)
            print(f"Tautomerism hit: {rxn['rxn_id']} → {out_dir}/tautomer_{idx}.xyz")
            idx += 1
    print(f"Total keto–enol tautomers: {idx}")

def find_dehydration(h5path, out_dir="dehydration_extxyz"):
    loader = RGD1Loader(h5path)
    idx = 0
    for rxn in loader:
        r_sm = rxn['reactant']['smiles']
        p_sm = rxn['product']['smiles']
        r_mol = Chem.MolFromSmiles(r_sm)
        p_mol = Chem.MolFromSmiles(p_sm)
        if not r_mol or not p_mol:
            continue
        # require an alcohol with a beta‑CH2 in reactant
        if r_mol.HasSubstructMatch(alcohol_pat) and r_mol.HasSubstructMatch(betaH_pat):
            # product must form a C=C
            if p_mol.HasSubstructMatch(alkene_pat):
                save_extxyz(rxn, idx, "dehydration", out_dir)
                print(f"Dehydration hit: {rxn['rxn_id']} → {out_dir}/dehydration_{idx}.xyz")
                idx += 1
    print(f"Total alcohol dehydrations: {idx}")

if __name__ == "__main__":
    H5 = "RGD1_CHNO.h5"
    find_tautomerism(H5)
    find_dehydration(H5)


Total keto–enol tautomers: 0
Dehydration hit: MR_101772_0 → dehydration_extxyz/dehydration_0.xyz
Dehydration hit: MR_101773_1 → dehydration_extxyz/dehydration_1.xyz
Dehydration hit: MR_10322_1 → dehydration_extxyz/dehydration_2.xyz
Dehydration hit: MR_104325_2 → dehydration_extxyz/dehydration_3.xyz
Dehydration hit: MR_104852_1 → dehydration_extxyz/dehydration_4.xyz
Dehydration hit: MR_105613_0 → dehydration_extxyz/dehydration_5.xyz
Dehydration hit: MR_106761_0 → dehydration_extxyz/dehydration_6.xyz
Dehydration hit: MR_10709_0 → dehydration_extxyz/dehydration_7.xyz
Dehydration hit: MR_10709_2 → dehydration_extxyz/dehydration_8.xyz
Dehydration hit: MR_10730_1 → dehydration_extxyz/dehydration_9.xyz
Dehydration hit: MR_107709_2 → dehydration_extxyz/dehydration_10.xyz
Dehydration hit: MR_107794_0 → dehydration_extxyz/dehydration_11.xyz
Dehydration hit: MR_107794_1 → dehydration_extxyz/dehydration_12.xyz
Dehydration hit: MR_107794_2 → dehydration_extxyz/dehydration_13.xyz
Dehydration hit: MR

In [9]:
import os
from rdkit import Chem
from ase import Atoms
from ase.io import write

# SMARTS for Tishchenko reaction: two aldehydes -> ester
aldehyde_pat       = Chem.MolFromSmarts('[CX3H1](=O)[#6]')
ester_pat          = Chem.MolFromSmarts('[#6]C(=O)O[#6]')

# SMARTS for Benzilic acid rearrangement: α-diketone -> α-hydroxy acid
adjacent_diketone_pat = Chem.MolFromSmarts('[#6](=O)[#6](=O)')
alpha_hydroxy_acid_pat = Chem.MolFromSmarts('[#6][CX4]([OX2H])[#6]')  # tertiary alcohol fragment
carboxylic_acid_pat    = Chem.MolFromSmarts('[CX3](=O)[OX2H]')        # COOH

def to_atoms(geom):
    return Atoms(numbers=geom['atomic_numbers'], positions=geom['positions'])

def save_extxyz(rxn, idx, rxn_type, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    fname = os.path.join(out_dir, f"r{idx}.xyz")
    if os.path.exists(fname):
        os.remove(fname)
    for state in ('reactant', 'transition_state', 'product'):
        geom = rxn[state]
        atoms = to_atoms(geom)
        atoms.info['rxn_id'] = rxn['rxn_id']
        atoms.info['state']  = state
        write(fname, atoms, append=True, format='extxyz')

def find_tishchenko(h5path, out_dir="tishchenko_extxyz"):
    loader = RGD1Loader(h5path)
    idx = 0
    for rxn in loader:
        r_sm = rxn['reactant']['smiles']
        p_sm = rxn['product']['smiles']
        # reactant must be two aldehyde fragments
        if '.' not in r_sm:
            continue
        frags_r = r_sm.split('.')
        if len(frags_r) != 2:
            continue
        mols_r = [Chem.MolFromSmiles(f) for f in frags_r]
        if any(m is None for m in mols_r):
            continue
        if all(m.HasSubstructMatch(aldehyde_pat) for m in mols_r):
            p_mol = Chem.MolFromSmiles(p_sm)
            if p_mol and p_mol.HasSubstructMatch(ester_pat):
                save_extxyz(rxn, idx, "tishchenko", out_dir)
                print(f"Tishchenko hit: {rxn['rxn_id']} → {out_dir}/tishchenko_{idx}.xyz")
                idx += 1
    print(f"Total Tishchenko hits: {idx}")

def find_benzilic(h5path, out_dir="benzilic_extxyz"):
    loader = RGD1Loader(h5path)
    idx = 0
    for rxn in loader:
        r_sm = rxn['reactant']['smiles']
        p_sm = rxn['product']['smiles']
        r_mol = Chem.MolFromSmiles(r_sm)
        p_mol = Chem.MolFromSmiles(p_sm)
        if not r_mol or not p_mol:
            continue
        if r_mol.HasSubstructMatch(adjacent_diketone_pat):
            # product must have a tertiary alcohol and a carboxylic acid
            if (p_mol.HasSubstructMatch(alpha_hydroxy_acid_pat) 
                and p_mol.HasSubstructMatch(carboxylic_acid_pat)):
                save_extxyz(rxn, idx, "benzilic", out_dir)
                print(f"Benzilic hit: {rxn['rxn_id']} → {out_dir}/benzilic_{idx}.xyz")
                idx += 1
    print(f"Total Benzilic hits: {idx}")

# Example usage:
find_tishchenko("RGD1_CHNO.h5")
find_benzilic("RGD1_CHNO.h5")


Tishchenko hit: MR_42891_0 → tishchenko_extxyz/tishchenko_0.xyz
Tishchenko hit: MR_42891_2 → tishchenko_extxyz/tishchenko_1.xyz
Tishchenko hit: MR_668097_1 → tishchenko_extxyz/tishchenko_2.xyz
Tishchenko hit: MR_6911_1 → tishchenko_extxyz/tishchenko_3.xyz
Total Tishchenko hits: 4
Benzilic hit: MR_139006_0 → benzilic_extxyz/benzilic_0.xyz
Benzilic hit: MR_168150_1 → benzilic_extxyz/benzilic_1.xyz
Benzilic hit: MR_171388_1 → benzilic_extxyz/benzilic_2.xyz
Benzilic hit: MR_186696_1 → benzilic_extxyz/benzilic_3.xyz
Benzilic hit: MR_191768_1 → benzilic_extxyz/benzilic_4.xyz
Benzilic hit: MR_234263_0 → benzilic_extxyz/benzilic_5.xyz
Benzilic hit: MR_234263_1 → benzilic_extxyz/benzilic_6.xyz
Benzilic hit: MR_234263_2 → benzilic_extxyz/benzilic_7.xyz
Benzilic hit: MR_243066_1 → benzilic_extxyz/benzilic_8.xyz
Benzilic hit: MR_254784_1 → benzilic_extxyz/benzilic_9.xyz
Benzilic hit: MR_257555_0 → benzilic_extxyz/benzilic_10.xyz
Benzilic hit: MR_277143_0 → benzilic_extxyz/benzilic_11.xyz
Benzilic 

In [10]:
import os
from rdkit import Chem
from ase import Atoms
from ase.io import write

# SMARTS for Henry (nitroaldol) reaction
nitro_pat           = Chem.MolFromSmarts('[CX3;!$(C=*)][CH2][NX3+](=O)[O-]')  # nitroalkane fragment
aldehyde_pat        = Chem.MolFromSmarts('[CX3H1](=O)[#6]')                     # R-CHO
nitroalcohol_pat    = Chem.MolFromSmarts('[CX4]([OX2H])[NX3+](=O)[O-]')         # nitroalcohol product

# SMARTS for Knoevenagel condensation
diketone_pat        = Chem.MolFromSmarts('[#6]C(=O)CC(=O)[#6]')                 # 1,3-diketone
enone_prod_pat      = Chem.MolFromSmarts('[CX3H1]=[CX2]C(=O)[#6]')              # α,β-unsaturated carbonyl

def to_atoms(geom):
    return Atoms(numbers=geom['atomic_numbers'], positions=geom['positions'])

def save_extxyz(rxn, idx, rxn_type, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    fname = os.path.join(out_dir, f"r{idx}.xyz")
    if os.path.exists(fname):
        os.remove(fname)
    for state in ('reactant', 'transition_state', 'product'):
        geom = rxn[state]
        atoms = to_atoms(geom)
        atoms.info['rxn_id'] = rxn['rxn_id']
        atoms.info['state']  = state
        write(fname, atoms, append=True, format='extxyz')

def find_henry(h5path, out_dir="henry_extxyz"):
    """
    Finds Henry (nitroaldol) reactions: nitroalkane + aldehyde -> nitroalcohol
    """
    loader = RGD1Loader(h5path)
    idx = 0
    for rxn in loader:
        r_sm = rxn['reactant']['smiles']
        p_sm = rxn['product']['smiles']
        # two-fragment reactant
        if '.' not in r_sm:
            continue
        frags = r_sm.split('.')
        if len(frags) != 2:
            continue
        mols_r = [Chem.MolFromSmiles(f) for f in frags]
        if any(m is None for m in mols_r):
            continue
        # one fragment must be nitroalkane, the other aldehyde
        if (mols_r[0].HasSubstructMatch(nitro_pat) and mols_r[1].HasSubstructMatch(aldehyde_pat)) \
        or (mols_r[1].HasSubstructMatch(nitro_pat) and mols_r[0].HasSubstructMatch(aldehyde_pat)):
            p_mol = Chem.MolFromSmiles(p_sm)
            if p_mol and p_mol.HasSubstructMatch(nitroalcohol_pat):
                save_extxyz(rxn, idx, "henry", out_dir)
                print(f"Henry hit: {rxn['rxn_id']} → {out_dir}/henry_{idx}.xyz")
                idx += 1
    print(f"Total Henry hits: {idx}")

def find_knoevenagel(h5path, out_dir="knoevenagel_extxyz"):
    """
    Finds Knoevenagel condensations: 1,3-diketone + aldehyde -> α,β-unsaturated carbonyl
    """
    loader = RGD1Loader(h5path)
    idx = 0
    for rxn in loader:
        r_sm = rxn['reactant']['smiles']
        p_sm = rxn['product']['smiles']
        # two-fragment reactant
        if '.' not in r_sm:
            continue
        frags = r_sm.split('.')
        if len(frags) != 2:
            continue
        mols_r = [Chem.MolFromSmiles(f) for f in frags]
        if any(m is None for m in mols_r):
            continue
        # one fragment is 1,3-diketone, other is aldehyde
        if (mols_r[0].HasSubstructMatch(diketone_pat) and mols_r[1].HasSubstructMatch(aldehyde_pat)) \
        or (mols_r[1].HasSubstructMatch(diketone_pat) and mols_r[0].HasSubstructMatch(aldehyde_pat)):
            p_mol = Chem.MolFromSmiles(p_sm)
            if p_mol and p_mol.HasSubstructMatch(enone_prod_pat):
                save_extxyz(rxn, idx, "knoevenagel", out_dir)
                print(f"Knoevenagel hit: {rxn['rxn_id']} → {out_dir}/knoevenagel_{idx}.xyz")
                idx += 1
    print(f"Total Knoevenagel hits: {idx}")

# Example usage:
find_henry("RGD1_CHNO.h5")
find_knoevenagel("RGD1_CHNO.h5")


Total Henry hits: 0
Total Knoevenagel hits: 0


In [11]:
import os
from rdkit import Chem
from ase import Atoms
from ase.io import write

# SMARTS patterns
aldehyde_pat           = Chem.MolFromSmarts('[CX3H1](=O)[#6]')
benzoin_prod_pat      = Chem.MolFromSmarts('[#6]C(=O)C([OX2H])[#6]')

acid_pat              = Chem.MolFromSmarts('[CX3](=O)[OX2H]')
alcohol_pat           = Chem.MolFromSmarts('[CX4H2][OX2H]')
ester_pat             = Chem.MolFromSmarts('[#6]C(=O)O[#6]')

ketone_pat            = Chem.MolFromSmarts('[#6]C(=O)[#6]')
# Propargylic alcohol: RC#C-CH2OH
propargylic_alc_pat   = Chem.MolFromSmarts('[CX2]#C[CH2][OX2H]')
enone_prod_pat        = Chem.MolFromSmarts('[CX3H1]=[CX2]C(=O)[#6]')

def to_atoms(geom):
    return Atoms(numbers=geom['atomic_numbers'], positions=geom['positions'])

def save_extxyz(rxn, idx, rxn_type, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    fname = os.path.join(out_dir, f"r{idx}.xyz")
    if os.path.exists(fname):
        os.remove(fname)
    for state in ('reactant', 'transition_state', 'product'):
        geom = rxn[state]
        atoms = to_atoms(geom)
        atoms.info['rxn_id'] = rxn['rxn_id']
        atoms.info['state']  = state
        write(fname, atoms, append=True, format='extxyz')

def find_benzoin(h5path, out_dir="benzoin_extxyz"):
    loader = RGD1Loader(h5path)
    idx = 0
    for rxn in loader:
        r_sm = rxn['reactant']['smiles']
        p_sm = rxn['product']['smiles']
        # two aldehyde fragments
        if '.' not in r_sm:
            continue
        frags = r_sm.split('.')
        mols = [Chem.MolFromSmiles(f) for f in frags]
        if any(m is None for m in mols):
            continue
        if all(m.HasSubstructMatch(aldehyde_pat) for m in mols):
            p_mol = Chem.MolFromSmiles(p_sm)
            if p_mol and p_mol.HasSubstructMatch(benzoin_prod_pat):
                save_extxyz(rxn, idx, "benzoin", out_dir)
                print(f"Benzoin hit: {rxn['rxn_id']} → {out_dir}/benzoin_{idx}.xyz")
                idx += 1
    print(f"Total Benzoin hits: {idx}")

def find_fischer_esterification(h5path, out_dir="fischer_extxyz"):
    loader = RGD1Loader(h5path)
    idx = 0
    for rxn in loader:
        r_sm = rxn['reactant']['smiles']
        p_sm = rxn['product']['smiles']
        # acid + alcohol
        if '.' not in r_sm:
            continue
        frags = r_sm.split('.')
        mols = [Chem.MolFromSmiles(f) for f in frags]
        if any(m is None for m in mols):
            continue
        if (mols[0].HasSubstructMatch(acid_pat) and mols[1].HasSubstructMatch(alcohol_pat)) \
        or (mols[1].HasSubstructMatch(acid_pat) and mols[0].HasSubstructMatch(alcohol_pat)):
            p_mol = Chem.MolFromSmiles(p_sm)
            if p_mol and p_mol.HasSubstructMatch(ester_pat):
                save_extxyz(rxn, idx, "fischer", out_dir)
                print(f"Fischer hit: {rxn['rxn_id']} → {out_dir}/fischer_{idx}.xyz")
                idx += 1
    print(f"Total Fischer esterification hits: {idx}")

def find_baeyer_villiger(h5path, out_dir="bv_extxyz"):
    loader = RGD1Loader(h5path)
    idx = 0
    for rxn in loader:
        r_sm = rxn['reactant']['smiles']
        p_sm = rxn['product']['smiles']
        # ketone → ester
        if '.' in r_sm or '.' in p_sm:
            continue
        r_mol = Chem.MolFromSmiles(r_sm)
        p_mol = Chem.MolFromSmiles(p_sm)
        if not r_mol or not p_mol:
            continue
        if r_mol.HasSubstructMatch(ketone_pat) and p_mol.HasSubstructMatch(ester_pat):
            save_extxyz(rxn, idx, "baeyer_villiger", out_dir)
            print(f"Baeyer–Villiger hit: {rxn['rxn_id']} → {out_dir}/baeyer_villiger_{idx}.xyz")
            idx += 1
    print(f"Total Baeyer–Villiger hits: {idx}")

def find_meyer_schuster(h5path, out_dir="ms_extxyz"):
    loader = RGD1Loader(h5path)
    idx = 0
    for rxn in loader:
        r_sm = rxn['reactant']['smiles']
        p_sm = rxn['product']['smiles']
        r_mol = Chem.MolFromSmiles(r_sm)
        p_mol = Chem.MolFromSmiles(p_sm)
        if not r_mol or not p_mol:
            continue
        if r_mol.HasSubstructMatch(propargylic_alc_pat) and p_mol.HasSubstructMatch(enone_prod_pat):
            save_extxyz(rxn, idx, "meyer_schuster", out_dir)
            print(f"Meyer–Schuster hit: {rxn['rxn_id']} → {out_dir}/meyer_schuster_{idx}.xyz")
            idx += 1
    print(f"Total Meyer–Schuster hits: {idx}")

def find_lactonization(h5path, out_dir="lactonization_extxyz"):
    loader = RGD1Loader(h5path)
    idx = 0
    for rxn in loader:
        r_sm = rxn['reactant']['smiles']
        p_sm = rxn['product']['smiles']
        if '.' in r_sm:
            continue
        r_mol = Chem.MolFromSmiles(r_sm)
        p_mol = Chem.MolFromSmiles(p_sm)
        if not r_mol or not p_mol:
            continue
        # intramolecular: has both acid and alcohol
        if r_mol.HasSubstructMatch(acid_pat) and r_mol.HasSubstructMatch(alcohol_pat):
            # product is cyclic ester
            if p_mol.GetRingInfo().NumRings() > 0 and p_mol.HasSubstructMatch(ester_pat):
                save_extxyz(rxn, idx, "lactonization", out_dir)
                print(f"Lactonization hit: {rxn['rxn_id']} → {out_dir}/lactonization_{idx}.xyz")
                idx += 1
    print(f"Total Lactonization hits: {idx}")

if __name__ == "__main__":
    H5 = "RGD1_CHNO.h5"
    find_benzoin(H5)
    find_fischer_esterification(H5)
    find_baeyer_villiger(H5)
    find_meyer_schuster(H5)
    find_lactonization(H5)



Benzoin hit: MR_395380_1 → benzoin_extxyz/benzoin_0.xyz
Total Benzoin hits: 1
Fischer hit: MR_109234_0 → fischer_extxyz/fischer_0.xyz
Fischer hit: MR_273957_2 → fischer_extxyz/fischer_1.xyz
Fischer hit: MR_81011_2 → fischer_extxyz/fischer_2.xyz
Total Fischer esterification hits: 3
Baeyer–Villiger hit: MR_103184_1 → bv_extxyz/baeyer_villiger_0.xyz
Baeyer–Villiger hit: MR_105624_0 → bv_extxyz/baeyer_villiger_1.xyz
Baeyer–Villiger hit: MR_106600_1 → bv_extxyz/baeyer_villiger_2.xyz
Baeyer–Villiger hit: MR_107105_2 → bv_extxyz/baeyer_villiger_3.xyz
Baeyer–Villiger hit: MR_119137_0 → bv_extxyz/baeyer_villiger_4.xyz
Baeyer–Villiger hit: MR_123903_0 → bv_extxyz/baeyer_villiger_5.xyz
Baeyer–Villiger hit: MR_12423_0 → bv_extxyz/baeyer_villiger_6.xyz
Baeyer–Villiger hit: MR_124483_1 → bv_extxyz/baeyer_villiger_7.xyz
Baeyer–Villiger hit: MR_126778_0 → bv_extxyz/baeyer_villiger_8.xyz
Baeyer–Villiger hit: MR_126785_0 → bv_extxyz/baeyer_villiger_9.xyz
Baeyer–Villiger hit: MR_126997_0 → bv_extxyz/baey

In [12]:
import os
from rdkit import Chem
from ase import Atoms
from ase.io import write

# SMARTS for the Nef reaction: nitroalkane → carbonyl
nitro_pat     = Chem.MolFromSmarts('[#6][CH2][NX3+](=O)[O-]')
carbonyl_pat  = Chem.MolFromSmarts('[#6][CX3](=O)[#6]')

# SMARTS for the Mannich reaction: amine + aldehyde + ketone → β‑amino ketone
amine_pat         = Chem.MolFromSmarts('[NX3;H2]')          # primary amine
ald_pat           = Chem.MolFromSmarts('[CX3H1](=O)[#6]')    # aldehyde
enolizable_pat    = Chem.MolFromSmarts('[#6][CX3](=O)[CH2]') # ketone with CH2 α‑carbon
beta_amino_pat    = Chem.MolFromSmarts('C[NX3]C(=O)')        # β‑amino carbonyl

def to_atoms(geom):
    return Atoms(numbers=geom['atomic_numbers'], positions=geom['positions'])

def save_extxyz(rxn, idx, rxn_type, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    fn = os.path.join(out_dir, f"r{idx}.xyz")
    if os.path.exists(fn):
        os.remove(fn)
    for state in ('reactant', 'transition_state', 'product'):
        atoms = to_atoms(rxn[state])
        atoms.info['rxn_id'] = rxn['rxn_id']
        atoms.info['state']  = state
        write(fn, atoms, append=True, format='extxyz')

def find_nef(h5path, out_dir="nef_extxyz"):
    """
    Nitroalkane  →  Carbonyl (Nef reaction)
    """
    loader = RGD1Loader(h5path)
    idx = 0
    for rxn in loader:
        r_sm = rxn['reactant']['smiles']
        p_sm = rxn['product']['smiles']
        r_mol = Chem.MolFromSmiles(r_sm)
        p_mol = Chem.MolFromSmiles(p_sm)
        if r_mol and p_mol:
            if r_mol.HasSubstructMatch(nitro_pat) and p_mol.HasSubstructMatch(carbonyl_pat):
                save_extxyz(rxn, idx, "nef", out_dir)
                print(f"Nef hit: {rxn['rxn_id']} → {out_dir}/nef_{idx}.xyz")
                idx += 1
    print(f"Total Nef hits: {idx}")

def find_mannich(h5path, out_dir="mannich_extxyz"):
    """
    Amine + Aldehyde + Ketone  →  β‑Amino ketone (Mannich reaction)
    """
    loader = RGD1Loader(h5path)
    idx = 0
    for rxn in loader:
        r_sm = rxn['reactant']['smiles']
        # need three fragments: amine, aldehyde, ketone
        if '.' not in r_sm:
            continue
        frags = r_sm.split('.')
        if len(frags) != 3:
            continue
        mols = [Chem.MolFromSmiles(f) for f in frags]
        if any(m is None for m in mols):
            continue
        # match one of each
        types = [m.HasSubstructMatch(amine_pat) for m in mols] + \
                [m.HasSubstructMatch(ald_pat) for m in mols] + \
                [m.HasSubstructMatch(enolizable_pat) for m in mols]
        # each SMARTS must be found exactly once
        if types.count(True) == 3:
            p_mol = Chem.MolFromSmiles(rxn['product']['smiles'])
            if p_mol and p_mol.HasSubstructMatch(beta_amino_pat):
                save_extxyz(rxn, idx, "mannich", out_dir)
                print(f"Mannich hit: {rxn['rxn_id']} → {out_dir}/mannich_{idx}.xyz")
                idx += 1
    print(f"Total Mannich hits: {idx}")

if __name__ == "__main__":
    H5 = "RGD1_CHNO.h5"
    find_nef(H5)
    find_mannich(H5)


Total Nef hits: 0
Mannich hit: MR_316088_1 → mannich_extxyz/mannich_0.xyz
Mannich hit: MR_350463_1 → mannich_extxyz/mannich_1.xyz
Total Mannich hits: 2


In [13]:
import os
from rdkit import Chem
from rdkit.Chem.rdchem import BondType
from ase import Atoms
from ase.io import write

# SMARTS patterns
enone_pat     = Chem.MolFromSmarts('[CX3]=[CX3]')  # C=C

# SMARTS for [2+2] cycloaddition: two alkenes -> cyclobutane ring
alkene_pat    = Chem.MolFromSmarts('[CX3]=[CX3]')

# SMARTS for intramolecular cyclobutane formation: single fragment with two C=C

def to_atoms(geom):
    return Atoms(numbers=geom['atomic_numbers'], positions=geom['positions'])

def save_extxyz(rxn, idx, rxn_type, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    fn = os.path.join(out_dir, f"r{idx}.xyz")
    if os.path.exists(fn):
        os.remove(fn)
    for state in ('reactant','transition_state','product'):
        geom = rxn[state]
        atoms = to_atoms(geom)
        atoms.info['rxn_id'] = rxn['rxn_id']
        atoms.info['state']  = state
        write(fn, atoms, append=True, format='extxyz')


def is_2plus2_cycloaddition(r_sm, p_sm):
    # two distinct alkene fragments
    if '.' not in r_sm:
        return False
    frags = r_sm.split('.')
    if len(frags) != 2:
        return False
    mols = [Chem.MolFromSmiles(f) for f in frags]
    if any(m is None for m in mols):
        return False
    if not all(m.HasSubstructMatch(alkene_pat) for m in mols):
        return False
    p_mol = Chem.MolFromSmiles(p_sm)
    if p_mol is None:
        return False
    # look for cyclobutane ring: single 4-membered ring
    for ring in p_mol.GetRingInfo().AtomRings():
        if len(ring) == 4:
            # ensure all ring bonds are single
            ok = True
            for i in range(4):
                a1 = ring[i]
                a2 = ring[(i+1)%4]
                b = p_mol.GetBondBetweenAtoms(a1, a2)
                if b.GetBondType() != BondType.SINGLE:
                    ok = False
                    break
            if ok:
                return True
    return False


def find_2plus2_cycloadditions(h5path, out_dir="2plus2_cycloadd_extxyz"):
    loader = RGD1Loader(h5path)
    idx = 0
    for rxn in loader:
        if is_2plus2_cycloaddition(rxn['reactant']['smiles'], rxn['product']['smiles']):
            save_extxyz(rxn, idx, '2plus2_cycloaddition', out_dir)
            print(f"[2+2] cycloaddition hit: {rxn['rxn_id']} -> {out_dir}/2plus2_cycloaddition_{idx}.xyz")
            idx += 1
    print(f"Total [2+2] cycloadditions: {idx}")


def find_cyclobutane_cyclizations(h5path, out_dir="cyclobutane_cyclization_extxyz"):
    loader = RGD1Loader(h5path)
    idx = 0
    for rxn in loader:
        r_sm = rxn['reactant']['smiles']
        p_sm = rxn['product']['smiles']
        # single fragment with two alkenes
        if '.' in r_sm:
            continue
        r_mol = Chem.MolFromSmiles(r_sm)
        p_mol = Chem.MolFromSmiles(p_sm)
        if r_mol is None or p_mol is None:
            continue
        # must have two distinct C=C in reactant
        if len(r_mol.GetSubstructMatches(alkene_pat)) < 2:
            continue
        # product must be cyclobutane as before
        for ring in p_mol.GetRingInfo().AtomRings():
            if len(ring) == 4:
                ok = True
                for i in range(4):
                    a1 = ring[i]
                    a2 = ring[(i+1)%4]
                    b = p_mol.GetBondBetweenAtoms(a1, a2)
                    if b.GetBondType() != BondType.SINGLE:
                        ok = False
                        break
                if ok:
                    save_extxyz(rxn, idx, 'cyclobutane_cyclization', out_dir)
                    print(f"Cyclobutane cyclization hit: {rxn['rxn_id']} -> {out_dir}/cyclobutane_cyclization_{idx}.xyz")
                    idx += 1
                    break
    print(f"Total cyclobutane cyclizations: {idx}")


if __name__ == '__main__':
    H5 = 'RGD1_CHNO.h5'
    find_2plus2_cycloadditions(H5)
    find_cyclobutane_cyclizations(H5)


[2+2] cycloaddition hit: MR_110337_0 -> 2plus2_cycloadd_extxyz/2plus2_cycloaddition_0.xyz
[2+2] cycloaddition hit: MR_142005_1 -> 2plus2_cycloadd_extxyz/2plus2_cycloaddition_1.xyz
[2+2] cycloaddition hit: MR_155026_2 -> 2plus2_cycloadd_extxyz/2plus2_cycloaddition_2.xyz
[2+2] cycloaddition hit: MR_156257_0 -> 2plus2_cycloadd_extxyz/2plus2_cycloaddition_3.xyz
[2+2] cycloaddition hit: MR_156257_1 -> 2plus2_cycloadd_extxyz/2plus2_cycloaddition_4.xyz
[2+2] cycloaddition hit: MR_17098_1 -> 2plus2_cycloadd_extxyz/2plus2_cycloaddition_5.xyz
[2+2] cycloaddition hit: MR_184185_1 -> 2plus2_cycloadd_extxyz/2plus2_cycloaddition_6.xyz
[2+2] cycloaddition hit: MR_203780_1 -> 2plus2_cycloadd_extxyz/2plus2_cycloaddition_7.xyz
[2+2] cycloaddition hit: MR_21490_2 -> 2plus2_cycloadd_extxyz/2plus2_cycloaddition_8.xyz
[2+2] cycloaddition hit: MR_216326_1 -> 2plus2_cycloadd_extxyz/2plus2_cycloaddition_9.xyz
[2+2] cycloaddition hit: MR_23675_0 -> 2plus2_cycloadd_extxyz/2plus2_cycloaddition_10.xyz
[2+2] cycloa

In [None]:
'''
import os
import torch
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from ase import Atoms
from ase.io import write

class RGD1TorchLoaderDict:
    """
    Loads an RGD1 dataset saved as a torch checkpoint (a list of Data objects),
    and yields reactions as dicts in the same format as the original HDF5-based loader.
    """
    def __init__(self, checkpoint_path, device='cpu'):
        # Load your list of Data objects
        self.data_list = torch.load(checkpoint_path, map_location=device, weights_only=False)
        # Reconstruct the atom encoder to invert one-hot back to Z
        self.atom_types = np.array([1, 6, 7, 8])  # H, C, N, O
        self.encoder = OneHotEncoder(sparse_output=False)
        self.encoder.fit(self.atom_types.reshape(-1, 1))
    
    def __len__(self):
        return len(self.data_list)
    
    def __iter__(self):
        for data in self.data_list:
            yield self._to_reaction_dict(data)
    
    def __getitem__(self, idx):
        return self._to_reaction_dict(self.data_list[idx])
    
    def _decode_atomic_numbers(self, z_tensor):
        # z_tensor: (N, 5) padded one-hot
        z = z_tensor.cpu().numpy()
        # only first len(atom_types) columns contain the one-hot
        one_hot = z[:, :len(self.atom_types)]
        idxs = one_hot.argmax(axis=1)
        return self.atom_types[idxs].tolist()
    
    def _to_reaction_dict(self, data):
        react = {
            'smiles':            data.formula_reactant,
            'atomic_numbers':    self._decode_atomic_numbers(data.z_reactant),
            'positions':         data.pos_reactant.cpu().numpy(),
            'energy':            data.E_reactant.item(),
        }
        ts = {
            'smiles':            data.formula_transition_state,
            'atomic_numbers':    self._decode_atomic_numbers(data.z_transition_state),
            'positions':         data.pos_transition_state.cpu().numpy(),
            'energy':            data.E_transition_state.item(),
        }
        prod = {
            'smiles':            data.formula_product,
            'atomic_numbers':    self._decode_atomic_numbers(data.z_product),
            'positions':         data.pos_product.cpu().numpy(),
            'energy':            data.E_product.item(),
        }
        return {
            'rxn_id':            data.rxn,
            'reactant':          react,
            'transition_state':  ts,
            'product':           prod
        }

# SMARTS patterns and detectors
PATTERNS = {
    # 'claisen': {
    #     'reactant': Chem.MolFromSmarts('[#6]=[#6]-[#6]-O-[#6]'),
    #     'product':  Chem.MolFromSmarts('[#6]-[#6]=[#6]-C(=O)[#6]')
    # },
    'cope': {
        'reactant': Chem.MolFromSmarts('[#6]=[#6]-[#6]-[#6]-[#6]=[#6]'),
        'product':  Chem.MolFromSmarts('[#6]=[#6]-[#6]-[#6]-[#6]=[#6]')
    },
    # 'pinacol': {
    #     'reactant': Chem.MolFromSmarts('C([OX2H])[C;!H0]([OX2H])'),
    #     'product':  Chem.MolFromSmarts('C(=O)C([OX2H])')
    # },
    # 'wagner_meerwein': {
    #     'reactant': None,
    #     'product':  None
    # }
}

'''