In [48]:
import ast
import re

import numpy as np
from rdkit import Chem

In [115]:
def get_superatom(mol_file):
    """Return the atom-alias (superatom) records declared in a MOL file.

    An alias record is a line beginning with ``A`` followed by a 1-based atom
    number, with the alias text as the next whitespace-delimited token
    (normally on the following line).

    Args:
        mol_file: Path of the MOL file to scan.

    Returns:
        A list of ``(atom_index, alias)`` tuples in file order, where
        ``atom_index`` is 0-based (file atom number minus one) and ``alias``
        is the first whitespace-delimited token of the alias text. The list
        is empty when the file declares no aliases.
    """
    with open(mol_file) as f:
        mol_data = f.read()
    # Anchor on the start of a line: an unanchored "A" also matches the query
    # atom symbol "A" inside the atom block and the tail of "M  SPA" property
    # lines, which produced bogus (index, alias) pairs. No trailing whitespace
    # is required so an alias on the very last line of the file is still found.
    alias_records = re.findall(
        r'^A\s+(\d+)\s+(\S+)', mol_data, flags=re.MULTILINE
    )
    return [(int(atom_number) - 1, alias) for atom_number, alias in alias_records]

def _parse_v2000_graph(lines):
    """Read 2D coordinates and bond records from the first V2000 table in ``lines``.

    Fields are read by column position (3-character integer fields on the
    counts and bond lines, 10-character coordinate fields on atom lines)
    rather than by whitespace splitting, because adjacent fields may touch
    when a value fills its whole column.

    Returns:
        ``(coords, edges)``: ``coords`` is a list of ``[x, y]`` pairs in file
        order; ``edges`` maps 0-based ``(start, end)`` atom indices to
        ``[bond_type, stereo]``. Both are empty if no counts line ending in
        ``V2000`` is present.
    """
    coords = []
    edges = {}
    for i, line in enumerate(lines):
        # rstrip() so trailing whitespace or a missing final newline still match.
        if not line.rstrip().endswith("V2000"):
            continue
        num_atoms = int(line[0:3])
        num_bonds = int(line[3:6])
        atom_start = i + 1
        bond_start = atom_start + num_atoms
        for atom_line in lines[atom_start:bond_start]:
            coords.append([float(atom_line[0:10]), float(atom_line[10:20])])
        for bond_line in lines[bond_start:bond_start + num_bonds]:
            start, end, bond_type = (int(bond_line[col:col + 3]) for col in (0, 3, 6))
            # The stereo column may be absent on short bond lines; treat as 0.
            stereo = int(bond_line[9:12].strip() or 0)
            edges[(start - 1, end - 1)] = [bond_type, stereo]
        break  # only the first connection table is used
    return coords, edges


def convert_mol_to_smiles(mol_file, debug=False):
    """Convert a MOL file to SMILES plus coordinates/bonds in SMILES atom order.

    The molecule is loaded with RDKit (unsanitized) and written as SMILES.
    The atom coordinates and bond records are then read straight from the
    file's V2000 table and re-indexed so that row ``k`` of ``coords`` and
    index ``k`` in ``edges`` refer to the ``k``-th atom RDKit emitted in
    ``smiles``. Atoms that carry an alias record (see ``get_superatom``) are
    additionally rendered as ``[alias]`` in ``pseudo_smiles``.

    Args:
        mol_file: Path of the V2000 MOL file.
        debug: When True, print intermediate diagnostics (mol block, atom
            order, reverse map) and re-raise any exception instead of
            swallowing it.

    Returns:
        ``(smiles, pseudo_smiles, coords, edges)`` where ``coords`` is an
        ``(n_atoms, 2)`` array and ``edges`` maps ``(start, end)`` positions
        to ``[bond_type, stereo]``. Returns ``(None, None, None, None)`` when
        the molecule has fewer than 3 or more than 100 atoms, or when any
        step fails and ``debug`` is False.

    NOTE(review): ``coords``/``edges`` follow the atom order of ``smiles``.
    ``pseudo_smiles`` comes from a separate ``MolToSmiles`` call on the
    alias-substituted molecule, so its atom order is presumably not
    guaranteed to be the same -- confirm before indexing ``coords`` by
    ``pseudo_smiles`` positions.
    """
    try:
        mol = Chem.MolFromMolFile(mol_file, sanitize=False)
        if mol is None:
            raise ValueError(f"RDKit could not parse {mol_file!r}")

        # Reject out-of-range molecules before doing any further work, and
        # return the same 4-tuple shape as every other exit path so callers
        # can always unpack four values.
        if mol.GetNumAtoms() < 3 or mol.GetNumAtoms() > 100:
            return None, None, None, None

        if debug:
            print(Chem.MolToMolBlock(mol))

        smiles = Chem.MolToSmiles(mol)
        # The property is a string such as "[3,1,0,2,]"; literal_eval parses
        # it without executing arbitrary code (unlike eval).
        atom_order = list(ast.literal_eval(mol.GetProp('_smilesAtomOutputOrder')))

        # reverse_map[file_index] -> position of that atom in the SMILES order.
        reverse_map = np.argsort(atom_order)

        if debug:
            print(f"Atom order in SMILES:\n {atom_order}")
            for i in atom_order:
                print(mol.GetAtomWithIdx(i).GetSymbol(), end=" ")
            print(f"\nReverse map:\n{reverse_map}\n")

        with open(mol_file, "r") as f:
            lines = f.readlines()

        coords, edges = _parse_v2000_graph(lines)
        coords = np.array(coords)
        coords = coords[atom_order]

        edges = {(reverse_map[start], reverse_map[end]): v
                 for (start, end), v in edges.items()}

        superatoms = get_superatom(mol_file)
        pseudo_smiles = smiles
        if len(superatoms) > 0:
            mappings = []
            cnt = 1
            mw = Chem.RWMol(mol)
            for atom_idx, symb in superatoms:
                atom = Chem.Atom("*")
                # Pick an isotope label that does not already occur in the
                # plain SMILES so the placeholder text is unambiguous.
                while f"[{cnt}*]" in pseudo_smiles:
                    cnt += 1
                atom.SetIsotope(cnt)
                mw.ReplaceAtom(atom_idx, atom)
                mappings.append((f"[{cnt}*]", f"[{symb}]"))
                cnt += 1
            pseudo_smiles = Chem.MolToSmiles(mw)
            for placeholder, symb in mappings:
                pseudo_smiles = pseudo_smiles.replace(placeholder, symb)
        return smiles, pseudo_smiles, coords, edges
    except Exception:
        if debug:
            raise  # bare raise keeps the original traceback
        return None, None, None, None

In [116]:
# Run the converter on one sample MOL file. debug=True makes the function
# re-raise any exception instead of returning a tuple of Nones.
mol_file = "../20020101/US06334995-20020101/US06334995-20020101-C00001.MOL"
smiles, pseudo_smiles, coords, edges = convert_mol_to_smiles(mol_file, debug=True)


     RDKit          2D

 22 21  0  0  0  0  0  0  0  0999 V2000
   -1.8779   -5.2066    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
   -0.3756   -5.2066    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.0282   -6.5070    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.0282   -3.9108    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.3756   -3.9061    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -0.3756   -2.6056    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
    0.3756   -1.3052    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -0.3756    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.3756    1.3005    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
    1.8779    1.3005    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -0.3756    2.6009    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.8779   -2.6056    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    2.6291    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    1.8779   -1.3052    0

In [117]:
# SMILES written by Chem.MolToSmiles for the molecule as loaded from the file
# (before any alias substitution).
smiles

'*OC(=O)CN(CCN(CCN(C)C)CC(=O)O)CC(=O)O'

In [118]:
# SMILES in which atoms that carry an alias record are written as the
# bracketed alias text, e.g. "[C1]".
pseudo_smiles

'*OC(=O)CN(CCN(CCN([C2])[C1])CC(=O)O)CC(=O)O'

In [119]:
# One [x, y] row per atom, reordered from file order to the SMILES atom
# output order.
coords

array([[ 0.3756,  6.5023],
       [-0.3756,  5.2019],
       [ 0.3756,  3.9014],
       [ 1.8779,  3.9014],
       [-0.3756,  2.6009],
       [ 0.3756,  1.3005],
       [-0.3756,  0.    ],
       [ 0.3756, -1.3052],
       [-0.3756, -2.6056],
       [ 0.3756, -3.9061],
       [-0.3756, -5.2066],
       [-1.8779, -5.2066],
       [-3.0282, -6.507 ],
       [-3.0282, -3.9108],
       [-1.8779, -2.6056],
       [-2.6291, -1.3052],
       [-4.1315, -1.3052],
       [-1.8779,  0.    ],
       [ 1.8779,  1.3005],
       [ 2.6291,  0.    ],
       [ 1.8779, -1.3052],
       [ 4.1315,  0.    ]])

In [120]:
# Bond table keyed by (start, end) atom positions in SMILES order; each value
# is [bond_type, stereo] as read from the MOL bond block.
edges

{(11, 10): [1, 0],
 (11, 12): [1, 0],
 (11, 13): [1, 0],
 (10, 9): [1, 0],
 (9, 8): [1, 0],
 (8, 7): [1, 0],
 (7, 6): [1, 0],
 (6, 5): [1, 0],
 (5, 18): [1, 0],
 (5, 4): [1, 0],
 (8, 14): [1, 0],
 (18, 19): [1, 0],
 (19, 20): [2, 0],
 (19, 21): [1, 0],
 (4, 2): [1, 0],
 (2, 3): [2, 0],
 (2, 1): [1, 0],
 (1, 0): [1, 0],
 (14, 15): [1, 0],
 (15, 16): [2, 0],
 (15, 17): [1, 0]}