# Imports

In [None]:
## Logging and Shell
import logging
logging.basicConfig(
    level=logging.ERROR,
    force=True
)

## Generic imports
from collections import defaultdict
import pandas as pd

## File I/O
from pathlib import Path
import json

# Cheminformatics
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole

from openff.toolkit import Topology
from openff.toolkit.utils.exceptions import IncorrectNumConformersWarning

# Custom Imports
from polymerist.maths.greek import GREEK_PREFIXES
from polymerist.genutils.containers import RecursiveDict

from polymerist.rdutils.reactions import reactions, reactors
from polymerist.rdutils import rdkdraw

DIM    = 300
ASPECT = 3/2
rdkdraw.set_rdkdraw_size(DIM, ASPECT)

from polymerist.monomers import specification, MonomerGroup
from polymerist.residues.partition import partition
from polymerist.polymers import building

from polymerist.openfftools import topology
from polymerist.openfftools.pcharge import MolCharger

# Silence the OpenFF toolkit's IncorrectNumConformersWarning for the rest of the session.
# NOTE: a bare `warnings.catch_warnings(record=True)` call used to sit here; the context manager
# was constructed but never entered (no `with`), so it had no effect and has been removed.
import warnings
warnings.filterwarnings('ignore', category=IncorrectNumConformersWarning)

In [None]:
# Static Paths
# Input locations (all relative to the notebook's working directory)
RAW_DATA_DIR  = Path('monomer_data_raw')
FMT_DATA_DIR  = Path('monomer_data_formatted')
PROC_DATA_DIR = Path('monomer_data_processed') # the monomer table (CSV) is read from here
RXN_FILES_DIR = Path('poly_rxns')              # holds rxn_groups.json and one <rxn_name>.rxn file per reaction

# Output locations, created (and optionally cleared) by the build cell further down
PDB_OUT_DIR   = Path('pdb_files')         # PDB files of the built polymers
MONO_OUT_DIR  = Path('monomer_fragments') # monomer fragment JSON files and the dataset backmap
TOPO_OUT_DIR  = Path('Topologies')        # SDF topologies (individual molecules and tiled lattices)

# Load monomer and rxn data 

In [None]:
# Alternative input tables, kept for reference; to use one, uncomment it and comment out the active path below
# input_data_path = PROC_DATA_DIR / '20231114_polyid_data_density_DP2-6 - 1,2 monomers_FILTERED.csv'
# input_data_path = PROC_DATA_DIR / 'nipu_urethanes_FILTERED.csv'
input_data_path = PROC_DATA_DIR / 'monomer_data_MASTER.csv'
df = pd.read_csv(input_data_path, index_col=0) # the first CSV column becomes the row index

## Load pre-defined reactions with functional group and name backmap

In [None]:
# Column(s) used to split the monomer table into one sub-table per reaction
keys = ['rxn_name']

# Drop every row whose mechanism is blacklisted (rows with a missing mechanism are kept)
blacklisted_rxns = ['imide']#, 'vinyl']
df = df[~df['mechanism'].isin(blacklisted_rxns)]

# Group on the bare column label when there is only a single key: grouping by a length-1 *list*
# makes newer pandas versions report group names as 1-tuples (and deprecates scalar get_group()
# lookups), which would break the later cells that use the reaction name as a plain string
# for dict lookups and directory names
df_grouper = df.groupby(keys[0] if len(keys) == 1 else keys)

# Map each group name (here, the reaction name) to the rows belonging to it
frames = {
    mech : mech_df
        for mech, mech_df in df_grouper
}

In [None]:
# Table of functional groups for each reaction, keyed by reaction name
rxn_groups_path = RXN_FILES_DIR / 'rxn_groups.json'
with rxn_groups_path.open('r') as file:
    rxn_groups = json.loads(file.read())

# Load the annotated reaction template stored alongside the table for every listed reaction
rxns = {}
for rxnname in rxn_groups:
    rxn_file_path = RXN_FILES_DIR / f'{rxnname}.rxn'
    rxns[rxnname] = reactions.AnnotatedReaction.from_rxnfile(rxn_file_path)

# Auto-generating monomer fragments and Topologies

## Set up and format progress bars to track build status

In [None]:
from time import sleep
from rich.progress import Progress
from rich.progress import (
    BarColumn,
    Progress,
    SpinnerColumn,
    TaskProgressColumn,
    TextColumn,
    TimeElapsedColumn,
    TimeRemainingColumn,
)
from rich.console import Group
from rich.live import Live

# One-line readout of the action currently being performed on the active compound;
# the text is driven by the custom "action" field of the task registered below
status_action_column = TextColumn('[purple]{task.fields[action]}')
status_readout = Progress('STATUS:', status_action_column, '...')
status_id = status_readout.add_task('[green]Current compound:', action='')

# Textual display of the running compound count and the name of the current polymer;
# the name is driven by the custom "polymer_name" field of the task registered below
compound_count_column = TextColumn('Current compound ({task.completed} / {task.total}):')
compound_name_column  = TextColumn('[blue]{task.fields[polymer_name]}', justify='right')
compound_readout = Progress(compound_count_column, compound_name_column)
curr_compound_id = compound_readout.add_task('[green]Current compound:', polymer_name='')

# Progress bar over the distinct reaction groups: one step per entry of `frames`
overall_columns = (
    SpinnerColumn(),
    "[progress.description]{task.description}",
    BarColumn(),
    TaskProgressColumn(),
    TextColumn('({task.completed} / {task.total})'),
    'At:',
    TimeElapsedColumn(),
)
overall_progress = Progress(*overall_columns)
num_mechanisms = len(frames)
curr_mechanism_id = overall_progress.add_task('[blue]Reaction mechanism(s)', start=True, total=num_mechanisms)

# One progress bar per reaction group, each sized by the number of compounds in that group
mechanism_columns = (
    "[progress.description]{task.description}",
    BarColumn(),
    TaskProgressColumn(),
    TextColumn('({task.completed} / {task.total})'),
    'At:',
    TimeElapsedColumn(),
)
mechanism_progress = Progress(*mechanism_columns)

# Register a task for every reaction group (in the iteration order of `frames`) and remember its ID
mech_task_ids = {
    rxn_name : mechanism_progress.add_task(f'[cyan]{rxn_name}', start=True, total=len(rxn_df))
        for rxn_name, rxn_df in frames.items()
}

# The compound readout counts across all groups, so its total is the sum of the group sizes
total_compounds = sum(len(rxn_df) for rxn_df in frames.values())
compound_readout.update(curr_compound_id, total=total_compounds)

# Stack the individual readouts, top to bottom, into a single renderable for the live console
group = Group(status_readout, compound_readout, overall_progress, mechanism_progress)

## Polymerize all SMILES into fragments, generate PDB topologies, then parameterize chemical and nonbonded info 

In [11]:
import numpy as np
from openff.toolkit import Molecule, Topology
from openff.toolkit.utils.exceptions import UnassignedChemistryInPDBError

from polymerist.genutils.fileutils import filetree
from polymerist.maths.lattices import generate_int_lattice

from polymerist.rdutils.rdcoords import tiling
from polymerist.rdutils.rdprops import copy_rd_props


# Tunable settings for the build cell
DOP: int = 3                            # passed as DOP to the polymer builder; also names the n-mer output folders
clear_existing: bool = True             # when True, existing output directories are cleared before building
charge_method = 'Espaloma-AM1-BCC'      # key into the MolCharger subclass registry
lattice_sizes: list[np.ndarray] = [np.array([5, 5, 5])] # one tiled topology is written per entry

# Create the top-level output directories, clearing pre-existing ones first when requested
TOPLEVEL_DIRS = (
    MONO_OUT_DIR,
    PDB_OUT_DIR,
    TOPO_OUT_DIR,
)

# NOTE: the loop variable is deliberately not called `dir`; at notebook scope that name would
# shadow the builtin dir() for the remainder of the kernel session
for toplevel_dir in TOPLEVEL_DIRS:
    if clear_existing and toplevel_dir.exists():
        filetree.clear_dir(toplevel_dir)
    toplevel_dir.mkdir(exist_ok=True, parents=True) # parents=True keeps this working if the paths are ever nested

# Outputs for PDBs and topologies are nested under a folder named after the chain length
# (the Greek prefix looked up for DOP, followed by "mers")
nmer_name = f'{GREEK_PREFIXES[DOP]}mers'
N_MER_DIR_PDB  = PDB_OUT_DIR  / nmer_name
N_MER_DIR_TOPO = TOPO_OUT_DIR / nmer_name

N_MER_DIR_PDB.mkdir(exist_ok=True, parents=True)
N_MER_DIR_TOPO.mkdir(exist_ok=True, parents=True)

# Instantiate the charger class registered under the requested charge method name
charger_type = MolCharger.subclass_registry[charge_method]
charger = charger_type()

# Key each generated lattice by a label built from its dimensions joined with "x"
lattices = {}
for lattice_size in lattice_sizes:
    lattice_label = 'x'.join(map(str, lattice_size))
    lattices[lattice_label] = generate_int_lattice(*lattice_size)

# Registries filled in by the build loop:
frag_registry = RecursiveDict() # reaction name -> table row index -> polymer name
failed_param = RecursiveDict()  # DOP -> reaction name -> polymer name -> name of the exception class raised
# Registry of compounds whose PDB could not be matched to their fragments:
# DOP -> reaction name -> polymer name -> monomer group.
# Each nesting level needs its own default factory; `defaultdict(defaultdict)` creates inner
# defaultdicts *without* a factory, so a three-level assignment on a fresh registry raised KeyError.
unmatched_pdb_mols = defaultdict(lambda: defaultdict(dict))

# Execute build loop: for every reaction group and every compound in it, fragment the monomers
# via the reaction template, build a PDB, partition + charge the topology, and write SDF files
# (one individual file plus one tiled file per lattice). Failures are recorded per compound
# and do not stop the loop.
num_successful : int = 0 # number of compounds for which every step (including all lattices) completed
with Live(group, refresh_per_second=10) as live:
    # ensure bars start at 0
    for pbar in group.renderables:
        for task_id in pbar.task_ids:
            pbar.reset(task_id)

    for rxn_name, rxn_df in frames.items():
        # look up reactive groups and pathway by rxn_name
        mech_task_id = mech_task_ids[rxn_name]
        rxn_pathway = rxns[rxn_name]
        reactor = reactors.PolymerizationReactor(rxn_pathway)

        # initialize output directories
        mono_dir : Path = MONO_OUT_DIR / rxn_name
        mono_dir.mkdir(exist_ok=True)

        pdb_dir : Path = N_MER_DIR_PDB / rxn_name
        pdb_dir.mkdir(exist_ok=True)

        topo_dir_indiv : Path = N_MER_DIR_TOPO / 'individual' / rxn_name
        topo_dir_indiv.mkdir(exist_ok=True, parents=True)

        topo_dirs_latt : dict[str, Path] = {} # prepopulate unified directories for each lattice size, as provided
        for lattice_str in lattices:
            topo_dir_latt = N_MER_DIR_TOPO / lattice_str / rxn_name
            topo_dir_latt.mkdir(exist_ok=True, parents=True)
            topo_dirs_latt[lattice_str] = topo_dir_latt

        for (i, row) in rxn_df.iterrows():
            # 0) load reactants with IUPAC names from chemical table
            status_readout.update(status_id, action='Gathering reactants')
            named_reactants = {}
            for j in range(2): # columns for monomer 0 and monomer 1 are read from each row
                reactant = Chem.MolFromSmiles(row[f'smiles_monomer_{j}'], sanitize=False)
                Chem.SanitizeMol(reactant, sanitizeOps=specification.SANITIZE_AS_KEKULE)
                named_reactants[ row[f'IUPAC_name_monomer_{j}'] ] = reactant
            initial_reactants = list(named_reactants.values()) # must convert to list to pass to ChemicalReaction

            polymer_name = f'poly({"-co-".join(named_reactants.keys())})' # TODO : make sure this conforms to IUPAC standards for naming
            compound_readout.update(curr_compound_id, polymer_name=polymer_name)
            frag_registry[rxn_name][i] = polymer_name

            # 1) use rxn template to polymerize monomers into all possible fragments
            status_readout.update(status_id, action='Fragmenting via reaction mechanism')
            monogrp = MonomerGroup()
            for dimer, frags in reactor.propagate(initial_reactants):
                for assoc_group_name, rdfragment in zip(named_reactants.keys(), frags):
                    # generate spec-compliant SMARTS
                    raw_smiles = Chem.MolToSmiles(rdfragment)
                    exp_smiles = specification.expanded_SMILES(raw_smiles)
                    spec_smarts = specification.compliant_mol_SMARTS(exp_smiles)

                    # record to monomer group
                    affix = 'TERM' if MonomerGroup.is_terminal(rdfragment) else 'MID'
                    monogrp.monomers[f'{assoc_group_name}_{affix}'] = [spec_smarts]

            status_readout.update(status_id, action='Saving monomer fragments...')
            monogrp.to_file(mono_dir / f'{polymer_name}.json')

            try:
                # 2) generate PDB file from fragments
                status_readout.update(status_id, action='Generating PDB file')
                polymer = building.build_linear_polymer(monomers=monogrp, DOP=DOP, sequence='AB')
                mol_pdb_path = pdb_dir / f'{polymer_name}.pdb'
                building.mbmol_to_openmm_pdb(mol_pdb_path, polymer)

                # 3) Parameterize topology and generate SDF
                status_readout.update(status_id, action='Partitioning topology by fragments')
                offtop = Topology.from_pdb(mol_pdb_path, _custom_substructures=monogrp.monomers)
                was_partitioned = partition(offtop)
                assert was_partitioned, 'topology could not be partitioned by monomer fragments'

                status_readout.update(status_id, action='Assigning partial charges')
                offmol = topology.get_largest_offmol(offtop)
                offmol.name = polymer_name
                cmol = charger.charge_molecule(offmol)

                status_readout.update(status_id, action='Saving individual parameterized topology to SDF')
                offtop = cmol.to_topology()
                sdf_path = topo_dir_indiv / f'{polymer_name}.sdf'
                topology.topology_to_sdf(sdf_path, offtop)

                # also generate tiled lattices if specified
                for lattice_str, lattice in lattices.items():
                    status_readout.update(status_id, action=f'Generating tiled {lattice_str} topology')
                    tiled_rdmol = tiling.tile_lattice_with_rdmol(cmol.to_rdkit(), lattice)

                    tiled_offmols = []
                    for tiled_mol_copy in Chem.GetMolFrags(tiled_rdmol, asMols=True, sanitizeFrags=False):
                        copy_rd_props(tiled_rdmol, tiled_mol_copy) # ensure each individual fragment preserves the information of the parent molecule
                        tiled_offmols.append(
                            Molecule.from_rdkit(
                                rdmol=tiled_mol_copy,
                                allow_undefined_stereo=True,
                                hydrogens_are_explicit=True
                            )
                        )
                    tiled_offtop = Topology.from_molecules(tiled_offmols)

                    status_readout.update(status_id, action=f'Saving {lattice_str} parameterized topology to SDF')
                    # look the directory up by lattice label; the leftover `topo_dir_latt` loop variable
                    # from the directory setup above always points at the LAST lattice's directory
                    sdf_path_latt = topo_dirs_latt[lattice_str] / f'{polymer_name}.sdf'
                    topology.topology_to_sdf(sdf_path_latt, tiled_offtop)

                # count each compound once, only after all of its outputs have been written
                # (counting inside the lattice loop would tally once per lattice, or never with no lattices)
                num_successful += 1

            except UnassignedChemistryInPDBError:
                # record the offending fragments and move on to the next compound;
                # setdefault() is used so that the nested insert succeeds regardless of
                # which default factories (if any) the registry's inner levels have
                unmatched_pdb_mols.setdefault(DOP, {}).setdefault(rxn_name, {})[polymer_name] = monogrp
            except Exception as other_exception:
                # deliberately best-effort: log the failure by exception class name and keep going
                failed_param[DOP][rxn_name][polymer_name] = type(other_exception).__name__
                print(f'{polymer_name} : {other_exception}')
            finally:
                # progress advances whether or not the compound succeeded
                mechanism_progress.advance(mech_task_id)
                compound_readout.advance(curr_compound_id)

        overall_progress.advance(curr_mechanism_id, advance=1)

    # Ensure readout are current at end of process
    compound_readout.update(curr_compound_id, polymer_name=f'Completed! ({num_successful}/{total_compounds} successful)')
    sleep(0.1) # needed to give final bar enough time to catch up

# Persist the reaction name -> row index -> polymer name backmap next to the monomer fragments
backmap_path = MONO_OUT_DIR / 'dataset_backmap.json'
with backmap_path.open('w') as file:
    file.write(json.dumps(frag_registry, indent=4))

# Report every compound whose build raised an unexpected exception
print(failed_param)

Output()























































































RecursiveDict(<class 'polymerist.genutils.containers.RecursiveDict'>,
              {3: RecursiveDict(<class 'polymerist.genutils.containers.RecursiveDict'>,
                                {'polyurethane_isocyanate': RecursiveDict(<class 'polymerist.genutils.containers.RecursiveDict'>,
                                                                          {'poly(1-isocyanato-4-[(4-isocyanatophenyl)methyl]benzene-co-2-[2-hydroxyethyl-[4-(4-nitrophenyl)diazenylphenyl]amino]ethanol)': 'AtomValenceException'})})})
