# For assigning chemical info and partial charges to PDB structures (saves to well-defined SDF files)

## Imports

In [1]:
# Generic Imports
from collections import defaultdict

# File I/O
from pathlib import Path
from tqdm.notebook import tqdm

# Cheminformatics
from openff.toolkit.topology import Topology, Molecule

# Static Paths
# Both directories are joined with a chemistry name further down (e.g. MONO_DIR / chemistry)
MONO_DIR = Path('monomer_files')  # per-chemistry subdirectories are globbed for '*.json' files
PDB_DIR  = Path('pdb_files')      # per-chemistry subdirectories are globbed for '*.pdb' files

## Chemistries

In [2]:
# Reacting functional groups for each chemistry, keyed by chemistry name
reaction_pairs = {
    'NIPU': ('cyclocarbonate', 'amine'),
    'urethane': ('isocyanate', 'hydroxyl'),
}
# Chemistry names as a list, in the insertion order of reaction_pairs
chemistries = list(reaction_pairs)

## Obtaining complete PDB-monomer pairs for iteration

In [3]:
# Maps chemistry name -> {molecule name: (PDB path, monomer JSON path)}.
# Only molecules that have BOTH a '.pdb' and a '.json' file (matched by file stem) are included.
chem_paths = defaultdict(dict)
for chemistry in chemistries:
    chem_pdb  = PDB_DIR / chemistry
    chem_mono = MONO_DIR / chemistry

    has_pdb  = {path.stem for path in chem_pdb.glob('*.pdb')}
    has_mono = {path.stem for path in chem_mono.glob('*.json')}

    # Sorted so that insertion order (and hence later iteration order) is the same on every run;
    # the iteration order of a set of strings is not guaranteed to be stable between sessions
    for valid_mol_name in sorted(has_pdb & has_mono):
        pdb_path  = chem_pdb  / f'{valid_mol_name}.pdb'
        mono_path = chem_mono / f'{valid_mol_name}.json'

        chem_paths[chemistry][valid_mol_name] = (pdb_path, mono_path)

## Parameterizing and saving to SDF, recording mols where match fails

In [6]:
from polymerist.openfftools import topology
from polymerist.openfftools.pcharge import MolCharger

from polymerist.monomers import MonomerGroup
from polymerist.residues.partition import partition

# Suppress IncorrectNumConformersWarning for the rest of the session
import warnings
from openff.toolkit.utils.exceptions import IncorrectNumConformersWarning

# NOTE: a bare `warnings.catch_warnings(record=True)` call used to sit here. Without a `with`
# statement the context manager is never entered, so it had no effect and was removed.
warnings.filterwarnings('ignore', category=IncorrectNumConformersWarning)


# Root output directory; one subdirectory per chemistry is created beneath it
topo_dir = Path('Topologies')
topo_dir.mkdir(exist_ok=True)

# Look up the charger class by method name in MolCharger's registry and instantiate it
charge_method = 'Espaloma-AM1-BCC'
charger = MolCharger.subclass_registry[charge_method]()


# NOTE: nothing in the active code below writes to unmatched_pdb_mols; only the commented-out
# UnmatchedAtomsError handler at the end of this cell ever populated it
unmatched_pdb_mols = defaultdict(dict)
# Exceptions raised while processing a molecule, keyed by chemistry and then by molecule name,
# so that failures remain inspectable after the loop instead of only being printed
failed_mols = defaultdict(dict)
for chemistry, path_dict in chem_paths.items():
    chem_dir = topo_dir / chemistry
    chem_dir.mkdir(exist_ok=True)

    progress = tqdm(path_dict.items())
    for mol_name, (pdb_path, mono_path) in progress:
        progress.set_description_str(f'{chemistry} : {mol_name}')
        try:
            progress.set_postfix_str('Loading files')
            monogrp = MonomerGroup.from_file(mono_path)
            offtop = Topology.from_pdb(pdb_path, _custom_substructures=monogrp.monomers)

            progress.set_postfix_str('Partitioning')
            was_partitioned = partition(offtop)
            # Explicit raise rather than `assert`: asserts are stripped under `python -O`, and a bare
            # AssertionError carries no message, which left the failure printout below empty
            if not was_partitioned:
                raise RuntimeError(f'partitioning failed for topology loaded from {pdb_path}')

            progress.set_postfix_str('Assigning Charges')
            offmol = topology.get_largest_offmol(offtop)
            offmol.name = mol_name
            cmol = charger.charge_molecule(offmol)

            progress.set_postfix_str('Saving SDF')
            offtop = cmol.to_topology()  # rebuild the topology from the charged molecule before writing
            sdf_path = chem_dir / f'{mol_name}.sdf'
            topology.topology_to_sdf(sdf_path, offtop)
        except Exception as e:
            # Best-effort batch run: report and record the failure, then continue with the next molecule
            print(f'{mol_name} : {e}')
            failed_mols[chemistry][mol_name] = e

        # except UnmatchedAtomsError:
        #     offtop_unmatched, *_ = Topology.from_pdb_and_monomer_info(
        #         str(pdb_path),
        #         monomer_info_json=mono_path,
        #         strict=False,
        #         verbose=False
        #     )
        #     offmol_unmatched = next(offtop_unmatched.molecules)
        #     unmatched_pdb_mols[chemistry][mol_name] = offmol_unmatched


  0%|          | 0/6 [00:00<?, ?it/s]



  0%|          | 0/63 [00:00<?, ?it/s]

## Probing PDB mismatches

In [5]:
# Look up one recorded molecule by chemistry and then by name.
# Raises KeyError when no entry with this name has been recorded under 'urethane'.
unmatched_pdb_mols['urethane']['urethane_13']

KeyError: 'urethane_13'

In [None]:
# Label every recorded urethane molecule with the key it is stored under
urethane_unmatched = unmatched_pdb_mols['urethane']
for mol_name, mol in urethane_unmatched.items():
    mol.name = mol_name

# Order the same molecules from fewest to most atoms (stable sort, ties keep dict order)
mols_by_len = list(urethane_unmatched.values())
mols_by_len.sort(key=lambda mol : mol.n_atoms)

In [None]:
# Take the last entry of the size-ordered list, i.e. the molecule with the most atoms
mol = mols_by_len[-1]

rdmol = mol.to_rdkit()
for i, atom in enumerate(mol.atoms):
    # Atoms flagged as already matched need no annotation
    if atom.metadata['already_matched']:
        continue

    print(atom.metadata)
    # NOTE(review): assumes RDKit atom indices follow the order of mol.atoms - confirm
    rdmol.GetAtomWithIdx(i).SetProp('atomNote', 'unmatched')

display(rdmol)