# For assigning chemical info and partial charges to PDB structures (saves to well-defined SDF files)

## Imports

In [1]:
# Generic Imports
from collections import defaultdict

# File I/O
from pathlib import Path
from tqdm.notebook import tqdm

# Cheminformatics
from openff.toolkit.topology import Topology, Molecule
from openff.toolkit.topology.topology import UnmatchedAtomsError

# Static Paths
# Both roots are relative to the notebook's working directory and are expected to
# contain one sub-directory per chemistry (they are joined with the chemistry name below).
MONO_DIR = Path('monomer_files')  # searched for per-molecule '*.json' monomer files
PDB_DIR  = Path('pdb_files')      # searched for per-molecule '*.pdb' structure files

## Registering charging ToolkitWrapper

In [2]:
from openff.toolkit import GLOBAL_TOOLKIT_REGISTRY as GTR
from espaloma_charge.openff_wrapper import EspalomaChargeToolkitWrapper

# Make the Espaloma charge wrapper available through the global toolkit registry
GTR.register_toolkit(EspalomaChargeToolkitWrapper)

# Index every registered toolkit wrapper by its `toolkit_name` so that a specific
# wrapper can be picked out by name when assigning charges later on
TOOLKITS = dict(
    (wrapper.toolkit_name, wrapper)
    for wrapper in GTR.registered_toolkits
)

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


## Chemistries

In [3]:
# Pair of reacting functional groups for each polymer chemistry
reaction_pairs = dict(
    NIPU=('cyclocarbonate', 'amine'),
    urethane=('isocyanate', 'hydroxyl'),
)
# Chemistry names, in the same order as they are declared in `reaction_pairs`
chemistries = list(reaction_pairs)

## Obtaining complete PDB-monomer pairs for iteration

In [4]:
# Collect, per chemistry, every molecule that has BOTH a PDB structure and a monomer
# JSON file; molecules for which either file is missing are left out.
# Resulting layout: chem_paths[chemistry][mol_name] = (pdb_path, mono_path)
chem_paths = defaultdict(dict)
for chemistry in chemistries:
    chem_pdb  = PDB_DIR / chemistry
    chem_mono = MONO_DIR / chemistry

    # File stems (names without extension) present in each directory
    has_pdb  = {path.stem for path in chem_pdb.glob('*.pdb')}
    has_mono = {path.stem for path in chem_mono.glob('*.json')}

    # Set iteration order can differ between interpreter sessions (string hash
    # randomization), so sort the shared names to keep processing order reproducible
    for valid_mol_name in sorted(has_pdb & has_mono):
        pdb_path  = chem_pdb  / f'{valid_mol_name}.pdb'
        mono_path = chem_mono / f'{valid_mol_name}.json'

        chem_paths[chemistry][valid_mol_name] = (pdb_path, mono_path)

## Parameterizing and saving to SDF, recording mols where match fails

In [6]:
# Output root for the charged molecules; one sub-directory per chemistry is created below
topo_dir = Path('Topologies')
topo_dir.mkdir(exist_ok=True)

# unmatched_pdb_mols[chemistry][mol_name] -> molecule loaded non-strictly after the
# strict load raised UnmatchedAtomsError (kept for inspection further down)
unmatched_pdb_mols = defaultdict(dict)
for chemistry, path_dict in chem_paths.items():
    chem_dir = topo_dir / chemistry
    chem_dir.mkdir(exist_ok=True)

    progress = tqdm(path_dict.items())
    for mol_name, (pdb_path, mono_path) in progress:
        progress.set_postfix_str(f'{chemistry} : {mol_name}')
        try:
            # Strict load first; only fully matched structures get charged and saved
            offtop, *_ = Topology.from_pdb_and_monomer_info(
                str(pdb_path),
                monomer_info_json=mono_path,
                strict=True,
                verbose=False
            )
            offmol = next(offtop.molecules)
            offmol.assign_partial_charges(
                partial_charge_method='espaloma-am1bcc',
                toolkit_registry=TOOLKITS['Espaloma Charge Toolkit']
            )

            sdf_path = chem_dir / f'{mol_name}.sdf'
            offmol.to_file(str(sdf_path), file_format = sdf_path.suffix[1:])

        except UnmatchedAtomsError:
            # Reload without strict matching so the partially matched molecule can be
            # examined later. This reload gets its own try-block: an exception raised
            # inside an except-clause is NOT caught by the sibling `except Exception`
            # below and would otherwise abort the whole batch.
            try:
                offtop_unmatched, *_ = Topology.from_pdb_and_monomer_info(
                    str(pdb_path),
                    monomer_info_json=mono_path,
                    strict=False,
                    verbose=False
                )
                offmol_unmatched = next(offtop_unmatched.molecules)
            except Exception as e:
                print(f'{mol_name} : {e!r}')
            else:
                unmatched_pdb_mols[chemistry][mol_name] = offmol_unmatched

        except Exception as e:
            # Best-effort batch: report and move on to the next molecule. repr() is used
            # because str() of some exceptions (e.g. KeyError) omits the exception type.
            print(f'{mol_name} : {e!r}')

  0%|          | 0/6 [00:00<?, ?it/s]



NIPU_12 : rdkit.Chem.rdchem.BondType.UNSPECIFIED




  0%|          | 0/63 [00:00<?, ?it/s]



## Probing PDB mismatches

In [7]:
# Display one specific unmatched urethane molecule as the cell's output. The key is
# hard-coded, so this lookup raises KeyError unless 'urethane_13' was recorded as
# unmatched by the parameterization loop.
unmatched_pdb_mols['urethane']['urethane_13']



NGLWidget()

In [13]:
# Label every unmatched urethane molecule with the key it is stored under
urethane_unmatched = unmatched_pdb_mols['urethane']
for mol_name, mol in urethane_unmatched.items():
    mol.name = mol_name

# Arrange the same molecules from fewest to most atoms
mols_by_len = list(urethane_unmatched.values())
mols_by_len.sort(key=lambda mol : mol.n_atoms)

In [21]:
# Take the last entry of the size-sorted list, i.e. the unmatched molecule with the most atoms
mol = mols_by_len[-1]

# NOTE(review): the output recorded for this cell is
# `KeyError: rdkit.Chem.rdchem.BondType.UNSPECIFIED`; presumably raised by the RDKit
# conversion below -- confirm against the full traceback
rdmol = mol.to_rdkit()

# Print the metadata of every atom not flagged as matched and tag the atom at the
# same index in the RDKit molecule with an 'unmatched' note
for atom_idx, atom in enumerate(mol.atoms):
    if atom.metadata['already_matched']:
        continue
    print(atom.metadata)
    rdmol.GetAtomWithIdx(atom_idx).SetProp('atomNote', 'unmatched')

display(rdmol)

KeyError: rdkit.Chem.rdchem.BondType.UNSPECIFIED