In [None]:
# Custom Imports
from polymer_utils import general, filetree, extratypes
from polymer_utils import charging as polychg
from polymer_utils import simulation as polysim
from polymer_utils.representation import PolymerDir, PolymerDirManager
from polymer_utils.solvents import WATER_TIP3P

# Typing and Subclassing
from typing import Any, Callable, ClassVar, Optional, Union
from dataclasses import dataclass, field
from abc import ABC, abstractmethod

# File I/O
from pathlib import Path
import csv, json, pickle
from shutil import copyfile, rmtree

# Logging, Timing and Shell
import subprocess
import logging
from datetime import datetime
from IPython.display import clear_output

# Cheminformatics
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdmolfiles

# Molecular Dynamics
from openff.units import unit
from openff.interchange import Interchange

from openff.toolkit.topology import Topology
from openff.toolkit.topology.molecule import Molecule, Atom
from openff.toolkit.typing.engines.smirnoff import ForceField

from openff.toolkit.utils.exceptions import ConformerGenerationError
from openff.toolkit.utils.toolkits import RDKitToolkitWrapper, OpenEyeToolkitWrapper, AmberToolsToolkitWrapper

from openmm import LangevinMiddleIntegrator, Context
from openmm.vec3 import Vec3
from openmm.app import Simulation, PDBReporter, StateDataReporter

from openmm.unit import picosecond, femtosecond, nanosecond # time
from openmm.unit import nanometer, angstrom # length
from openmm.unit import Unit, kelvin # misc

# Static Paths
# Top-level directories (relative paths, resolved against the current working directory)
CORE_PATH       = Path('Core')
POLY_PATH       = Path('Polymers')
TEST_PATH       = Path('Polymers_test')
COMPAT_PDB_PATH = Path('compatible_pdbs')

# Locations derived from the top-level directories
POLY_PDB_PATH         = COMPAT_PDB_PATH.joinpath('simple_polymers')
SOLVENTS_PATH         = CORE_PATH.joinpath('solvents')
POLYMER_SOLV_TEMPLATE = CORE_PATH.joinpath('inp_templates', 'solv_polymer_template_box.inp')

## Loading and configuring available polymers

In [None]:
# Logging config
logging.basicConfig(level=logging.INFO) # root logger threshold : INFO and above
LOG_FORMATTER = logging.Formatter(
    fmt='%(asctime)s.%(msecs)03d [%(levelname)s:%(processName)s:line %(lineno)d] - %(message)s', # timestamp with milliseconds, then origin of the record, then the message
    datefmt='%Y-%m-%d %H:%M:%S'
)
      
def setup_logger(log_name : str, outpath : Path, writemode='w', formatter : Optional[logging.Formatter]=None):
    '''Create (or retrieve) the Logger named log_name, writing to "<outpath>/<log_name>.log"

    Args:
        log_name  : name of the Logger; also used as the stem of the log file
        outpath   : directory which holds the log file (created, along with any missing parents, if absent)
        writemode : file mode handed to the FileHandler (e.g. 'w' to overwrite, 'a' to append)
        formatter : optional Formatter to apply to the FileHandler

    Returns:
        The Logger registered under log_name. A FileHandler is only attached when the Logger has no handlers
        of its own, so repeated calls with the same name do not duplicate output
        (nor do they redirect an already-configured Logger to a new outpath or writemode)
    '''
    outpath.mkdir(parents=True, exist_ok=True) # touch() below would otherwise raise FileNotFoundError for a missing directory
    log_path = outpath/f'{log_name}.log'
    log_path.touch()

    logger = logging.getLogger(log_name) # returns the same Logger instance every time for a given name
    if not logger.handlers: # prevent duplicate logging output when recreating logger
        file_handler = logging.FileHandler(log_path, mode=writemode)
        if formatter is not None:
            file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger

In [None]:
# Behaviour switches for this cell
reset     = False #True  # wipe the collection and repopulate it from the source PDBs
resolvate = False #True  # solvate every polymer which does not yet have a solvent
recover   = True         # re-register leftover pickle / force field files in each directory's info

solvent = WATER_TIP3P
solvent.structure_file = SOLVENTS_PATH/solvent.name/f'{solvent.name}.pdb' # SOLVENTS_PATH is CORE_PATH/'solvents', so the location is unchanged
mgr = PolymerDirManager(collection_dir=POLY_PATH)

if reset:
    mgr.purge(really=True) 
    mgr.populate_mol_dirs(source_dir=POLY_PDB_PATH)

if resolvate:        
    for mol_name, mol_dir in mgr.mol_dirs.items():
        if mol_dir.info.solvent is None: # only try to solvate systems which don't already have a solvent
            print(mol_name)
            solv_dir = mol_dir.solvate(template_path=POLYMER_SOLV_TEMPLATE, solvent=solvent, exclusion=1*nanometer)

    mgr.update_mol_dirs() # ensure solvated dirs are added to collection

if recover:
    # After re-solvation, ensure leftover charge files from previous solvation sims are reassigned
    recovery_attrs = { # maps PolymerDir subdirectory attribute -> info attribute which should point to the file found there
        'pkl' : 'pickle_file',
        'FF'  : 'ff_file'
    }

    for mol_dir in mgr.mol_dirs_list:
        for subdir_name, attr_name in recovery_attrs.items():
            subdir = getattr(mol_dir, subdir_name)
            existing_file = next(subdir.iterdir(), None) # first entry found, or None if folder is empty
            if existing_file is None:
                continue # nothing to recover for this subdirectory

            # kept outside of any try/except so that genuine errors raised while updating are not silently swallowed
            setattr(mol_dir.info, attr_name, existing_file)
            mol_dir.to_file() # ensure info is updated on disc copy
            print(mol_dir.info)

# NOTE(review): all_completed_sims is accessed as a plain attribute here, but is called with an argument in the "failed_sims" cell - confirm which usage is current
print(list(mgr.mol_dirs.keys()), '\n', mgr.all_completed_sims)

## Charge/Sim loop V1 proper

In [None]:
# DEFINE TARGET MOLECULES AND FORCEFIELD
# sample_mols = mols_to_use[:3]
sample_mols = ['polyvinylchloride']
main_ff_xml = CORE_PATH/'force_fields'/'openff_unconstrained-2.0.0.offxml'

# CHARGING PARAMETERS
toolkit = 'OpenEye Toolkit'
partial_charge_method = 'am1bccelf10'

# CHARGING / SIM LOOP BEHAVIOR
overwrite_ff_xml   = True
overwrite_chg_json = True
distrib_mono_charges = True
run_sims = True
strict = True
verbose = False

# SIMULATION PARAMETERS 
temperature = 300 * kelvin
friction_coeff = 1/picosecond

sim_time = 0.001 * nanosecond #5 * nanosecond 
timestep = 1 * femtosecond
num_samples = 100 #2_000

# AUXILIARY PRE-FLIGHT CALCULATIONS
# Fail here, with a clear message, rather than later in the loop with an AttributeError on a None directory
missing_mols = [mol_name for mol_name in sample_mols if mol_name not in mgr.mol_dirs]
if missing_mols:
    raise KeyError(f'No polymer directories found for: {missing_mols}')

sample_dirs = {
    mol_name : mgr.mol_dirs[mol_name]
        for mol_name in sample_mols
}
action_str = f'Charging{" & simulation" if run_sims else ""}'

num_steps   = round(sim_time / timestep)
record_freq = max(1, round(num_steps / num_samples)) # never let the reporting interval round down to 0 when num_samples exceeds num_steps
num_mols = len(sample_dirs)
print(num_steps, record_freq)

In [None]:
# Logger shared by the charging helpers below, named after the charging module
# NOTE(review): the bare name "charging" is never bound in this notebook (the module is imported as "polychg") - confirm this is the intended module
chg_logger = logging.getLogger(polychg.__name__)

def create_pickled_cmol(mol_dir : PolymerDir, toolkit : str, partial_charge_method : str, strict : bool=True, verbose : bool=False) -> None:
    '''Charge the molecule of a polymer directory and store the charged molecule as a pickle

    Charging is always performed (any pickle already at the target path is overwritten);
    callers which want to reuse an existing pickle must check for one before calling

    Args:
        mol_dir               : polymer directory whose molecule is charged; its info.pickle_file is updated and saved to disc
        toolkit               : toolkit identifier, forwarded to generate_molecule_charges
        partial_charge_method : charge method identifier, forwarded to generate_molecule_charges
        strict, verbose       : forwarded to mol_dir.largest_offmol_matched when loading the molecule
    '''
    pickle_path = mol_dir.pkl/f'{mol_dir.mol_name}.pkl'

    chg_logger.info(f'Loading topology and molecule via graph match...')
    mol = mol_dir.largest_offmol_matched(strict=strict, verbose=verbose, topo_only=True)
    chg_logger.info(f'Charging {mol_dir.mol_name} via {toolkit}-{partial_charge_method}...')
    # NOTE(review): was "charging.charging.<...>", but "charging" is never bound in this notebook; assumes polychg (polymer_utils.charging) exposes a "charging" submodule - confirm
    cmol = polychg.charging.generate_molecule_charges(mol, toolkit=toolkit, partial_charge_method=partial_charge_method) 

    with pickle_path.open('wb') as pickle_file: # write charged molecule to pickle to avoid constantly redoing AM1
        pickle.dump(cmol, pickle_file)

    mol_dir.info.pickle_file = pickle_path # ensure change is reflected in directory info
    mol_dir.to_file() # record all changes to disc

def create_chg_avg_mono(mol_dir : PolymerDir, distrib_mono_charges : bool=True) -> tuple[list['ChargedResidue'], 'AtomIDMap']:
    '''Create a charge-averaged monomer file from an existing monomer spec file and a charged OFF Molecule

    Args:
        mol_dir              : polymer directory; its info.pickle_file must already point to a pickled charged molecule
        distrib_mono_charges : forwarded to get_averaged_charges

    Returns:
        The averaged residues and the atom ID mapping, exactly as returned by get_averaged_charges
    '''
    # ChargedResidue and AtomIDMap are not imported in this notebook, so they are written as string (forward-reference)
    # annotations above; unquoted, they raise a NameError as soon as this function is defined
    chg_logger.info('Unpickling charged Molecule for charge averaging...')
    with mol_dir.info.pickle_file.open('rb') as pickle_file: 
        cmol = pickle.load(pickle_file) # load AM1-charged molecule from file (must exist by this point in loop)

    chg_logger.info(f'Averaging charges over {mol_dir.mol_name} residues...')
    # NOTE(review): was "charging.averaging.<...>", but "charging" is never bound in this notebook; assumes polychg (polymer_utils.charging) exposes an "averaging" submodule - confirm
    avgs, atom_id_mapping = polychg.averaging.get_averaged_charges(cmol, monomer_data=mol_dir.monomer_data, distrib_mono_charges=distrib_mono_charges) # average charges over unique residues
    mono_chgs = {avgd_res.residue_name : avgd_res.charges for avgd_res in avgs}
    
    chg_logger.info(f'Writing new charged JSON monomer file...')
    mol_dir.create_charged_monomer_file(mono_chgs)

    return avgs, atom_id_mapping

def create_off_xml(mol_dir : PolymerDir, xml_src : Path) -> tuple[ForceField, list['LibraryChargeHandler']]:
    '''Generate an OFF force field with molecule-specific (and solvent specific, if applicable) Library Charges appended

    Args:
        mol_dir : polymer directory providing the charged monomer data; its info.ff_file is updated and saved to disc
        xml_src : path to the base force field XML which the Library Charges are added to

    Returns:
        The force field (merged with the solvent force field when the directory has a solvent)
        and the library charges, as returned by write_lib_chgs_from_mono_data
    '''
    # LibraryChargeHandler is not imported in this notebook, so it is written as a string (forward-reference)
    # annotation above; unquoted, it raises a NameError as soon as this function is defined
    ff_path = mol_dir.FF/f'{mol_dir.mol_name}.offxml' # path to output library charges to
    chg_logger.info('Writing new force field OFFXML file')
    # NOTE(review): was "charging.averaging.<...>", but "charging" is never bound in this notebook; assumes polychg (polymer_utils.charging) exposes an "averaging" submodule - confirm
    forcefield, lib_chgs = polychg.averaging.write_lib_chgs_from_mono_data(mol_dir.monomer_data_charged, xml_src, output_path=ff_path)

    if mol_dir.info.solvent is not None:
        chg_logger.info('Associated solvent found, merging Library-Charged force field with solvent force field...')
        forcefield = ForceField(ff_path, mol_dir.info.solvent.forcefield_file, allow_cosmetic_attributes=True) # use both the polymer-specific xml and the solvent FF xml to make hybrid forcefield
        forcefield.to_file(ff_path) # overwrite the polymer-only file with the merged force field

    mol_dir.info.ff_file = ff_path # ensure change is reflected in directory info
    mol_dir.to_file() # record all changes to disc

    return forcefield, lib_chgs

In [None]:
# BEGIN CHARGING / SIM LOOP - Perform charge averaging on all target molecules which don't already have averaged LCs; Load forcefield for those which already do 
# Every log handler attached below is detached inside a "finally" clause, so that neither a skipped molecule ("continue")
# nor an unexpected error can leave a stale handler attached (which would bleed output into the wrong log file)
main_logger = logging.getLogger(__name__)
loggers = [main_logger, chg_logger]
# NOTE(review): config_mlf_handler is neither defined nor imported in the visible cells - confirm where it is expected to come from
main_log_handler = config_mlf_handler(mgr.log_dir/f'Polymer_battery_{general.timestamp_now()}.log', loggers, writemode='a')

try:
    main_logger.info(f'Beginning {action_str} loop...\n')
    for i, (mol_name, mol_dir) in enumerate(sample_dirs.items()):
        # 0) LOAD MOLECULE AND TOPOLOGY, ATTEMPT TO APPLY LIBRARY CHARGES
        start_time = datetime.now()
        main_logger.info(f'Current molecule: "{mol_name}" ({i + 1}/{num_mols})') # +1 converts to more human-readable 1-index for step count
        polymer_log_handler = config_mlf_handler(mol_dir.logs/f'{general.timestamp_now()}.log', loggers, writemode='w') # NOTE : order matters, initial main logger call above should not record to local polymer log
        try:
            if not mol_dir.has_monomer_data:
                raise FileNotFoundError(f'No monomer JSONs found for {mol_name}') # the file is missing, not already present
            
            # 1) ENSURING AN AM1-BCC-ELF10-CHARGED MOLECULE EXISTS (IN PICKLE FORM). WILL RECHARGE IF NONE EXISTS
            if (mol_dir.info.pickle_file is None):
                main_logger.warning('(1-precheck) Generating new pickled charged OpenFF Molecule...')
                try:
                    create_pickled_cmol(mol_dir, toolkit, partial_charge_method, strict, verbose)
                except ConformerGenerationError:
                    main_logger.error('Could not successfully generate conformers\n')
                    continue # skip to next molecule; the polymer log handler is still detached by the "finally" below
            main_logger.info('(1) Found pickled charged molecule...')
            
            # 2) CREATE JSON WITH AVERAGED CHARGES IF ONE DOES NOT ALREADY EXIST
            if (mol_dir.info.monomer_file_chgd is None) or overwrite_chg_json: # can only reach this branch if a json is present but isn't identified as charged within the PolymerDir
                main_logger.warning('(2-precheck) Generating new charged monomer JSON...')
                create_chg_avg_mono(mol_dir, distrib_mono_charges=distrib_mono_charges)
            main_logger.info('(2) Found charged monomer JSON...')

            # 3) CREATE FORCE FIELD XML WITH MONOMER-BASED LIBRARY CHARGE ENTRIES
            if (mol_dir.info.ff_file is None) or overwrite_ff_xml: # can only reach if a charged monomer json already exists
                main_logger.warning('(3-precheck) Generating new Force Field XML with Library Charges...')
                create_off_xml(mol_dir, xml_src=main_ff_xml)
            main_logger.info('(3) Found Force Field file with Library Charges...')

            # 4) RUN OpenMM SIMULATION FOR TARGET MOLECULE
            if run_sims:
                main_logger.info('(4) Preparing simulation...')
                output_folder = mol_dir.make_res_dir()
                sim_log_handler = config_mlf_handler(output_folder/f'{mol_dir.mol_name} simulation.log', loggers)
                try:
                    main_logger.info('Loading Topology...')
                    openff_topology = mol_dir.openff_topology_matched(strict=strict, verbose=verbose, topo_only=True)
                    box_vectors = mol_dir.box_vectors
                    if box_vectors is not None: # set box vector to allow for periodic simulation; left untouched (intended to be non-periodic) if mol_dir box vectors are unset i.e. NoneType
                        openff_topology.box_vectors = box_vectors.in_units_of(nanometer)

                    main_logger.info('Loading charged Molecule...')
                    with mol_dir.info.pickle_file.open('rb') as pickle_file: 
                        cmol = pickle.load(pickle_file) # load AM1-charged molecule from file (must exist by this point in loop)

                    main_logger.info('Loading Force Field...')
                    forcefield = ForceField(mol_dir.info.ff_file, allow_cosmetic_attributes=True)

                    main_logger.info('Creating Simulation from Interchange...')
                    interchange = Interchange.from_smirnoff(force_field=forcefield, topology=openff_topology, charge_from_molecules=[cmol]) # generate Interchange with new library charges prior to writing to file
                    integrator  = LangevinMiddleIntegrator(temperature, friction_coeff, timestep)
                    sim = polysim.create_simulation(interchange, integrator)
                    
                    main_logger.info(f'Running {sim_time} OpenMM sim at {temperature} for {num_steps} steps...')
                    polysim.run_simulation(sim, output_folder=output_folder, output_name=mol_name, num_steps=num_steps, record_freq=record_freq)

                    mol_dir.to_file() # ensure directory data reflects changes to files
                    # filetree.startfile(output_folder)
                finally:
                    sim_log_handler.remove_from_loggers(*loggers)
            
            proc_time = str(datetime.now() - start_time)
            main_logger.info(f'Successfully completed actions on {mol_name} in {proc_time}\n')
            clear_output() # for Jupyter notebooks only, can freely comment this out
        finally:
            polymer_log_handler.remove_from_loggers(*loggers)

    main_logger.info(f'{action_str} loop completed')
finally:
    main_log_handler.remove_from_loggers(*loggers)

## Running Sims v0

In [None]:
mol_dirs = mgr.mol_dirs

desired_solvents = (WATER_TIP3P,) #,None)

# pathological or otherwise difficult-to-run polymers that I've encountered
hard_polymers = ['vulcanizedrubber', 'polyphenylenesulfone', 'polyethylene']
hard_polymers_solv = [
    f'{unsolv_mol}_solv_{solvent.name}'
        for solvent in desired_solvents
            for unsolv_mol in hard_polymers
]
hard_polymers += hard_polymers_solv # ensure solvated names are also included

def _is_usable(mol_dir : PolymerDir) -> bool:
    '''Decide whether a polymer directory qualifies for the charging / simulation loop (checks run in the order listed)'''
    if mol_dir.mol_name in hard_polymers:      # 1) must not be manually excluded
        return False
    if not (0 < mol_dir.n_atoms <= 300):       # 2) must be loadable (i.e. non-zero size) but small enough for AM1BCC (150 is speed limit, 300 is error limit)
        return False
    if not mol_dir.has_monomer_data:           # 3) must have monomer information files
        return False
    return mol_dir.info.solvent in desired_solvents # 4) must be solvated in one of the specified solvents (could be None)

mols_to_use = [
    mol_dir.mol_name
        for mol_dir in mol_dirs.values()
            if _is_usable(mol_dir)
]

print(mols_to_use)

In [None]:
# TARGET MOLECULES
sample_mols = mols_to_use
# sample_mols = ['polyvinylchloride_solv_water']

# FORCE FIELD FILES (general-purpose force field for the polymer, separate force field for the solvent)
main_ff_xml = CORE_PATH.joinpath('force_fields', 'openff_unconstrained-2.0.0.offxml')
solv_ff_xml = CORE_PATH.joinpath('force_fields', 'tip3p.offxml')

# CHARGING LOOP BEHAVIOR
run_sims = True
verbose = False
distrib_mono_charges = True
prevent_overwrites = False # to deprecate

# SIMULATION PARAMETERS : integrator settings
timestep = 1 * femtosecond
temperature = 300 * kelvin
friction_coeff = 1/picosecond

# SIMULATION PARAMETERS : duration and sampling
sim_time = 5 * nanosecond 
num_samples = 2_000

In [None]:
# PRE-FLIGHT CHECKS
# Fail here, with a clear message, rather than later in the loop with an AttributeError on a None directory
missing_mols = [mol_name for mol_name in sample_mols if mol_name not in mol_dirs]
if missing_mols:
    raise KeyError(f'No polymer directories found for: {missing_mols}')

sample_dirs = {
    mol_name : mol_dirs[mol_name]
        for mol_name in sample_mols
}

num_steps   = round(sim_time / timestep)
record_freq = max(1, round(num_steps / num_samples)) # never let the reporting interval round down to 0 when num_samples exceeds num_steps
num_mols = len(sample_dirs)
print(num_steps, record_freq)

# Session-wide log file, which each per-polymer logger also writes to during the loop
main_log_dir = POLY_PATH/'Logs'
main_log_dir.mkdir(exist_ok=True)

master_logger = setup_logger(f'Polymer_battery_{general.timestamp_now()}', outpath=main_log_dir, formatter=LOG_FORMATTER, writemode='w')
master_handler = master_logger.handlers[0] # the FileHandler attached by setup_logger (the logger name is timestamped, hence freshly created)

In [None]:
# BEGIN CHARGING / SIM LOOP - Perform charge averaging on all target molecules which don't already have averaged LCs; Load forcefield for those which already do 
for i, (mol_name, mol_dir) in enumerate(sample_dirs.items()):
    log_name = mol_name #f'{mol_name}_chg_sim_log' #{general.timestamp_now()}'
    logger = setup_logger(log_name, outpath=mol_dir.logs, writemode='a', formatter=LOG_FORMATTER)
    logger.addHandler(master_handler) # ensure output is also logged to the master

    # DEFINING PATHS, CREATING FOLDERS, AND FETCHING FILES
    pdb_path      = mol_dir.info.structure_file
    lc_path       = mol_dir.FF/f'new {mol_name} charges.offxml' # path to output library charges to
    pickle_path   = mol_dir.pkl/f'{mol_name}.pkl'
    output_folder = mol_dir.make_res_dir()

    # LOAD MOLECULE AND TOPOLOGY, ATTEMPT TO APPLY LIBRARY CHARGES
    logger.info(f'Current molecule: {mol_name} ({i + 1}/{num_mols})') # +1 converts to more human-readable 1-index for step count
    json_path = mol_dir.monomer_file_ranked
    if json_path is None:
        raise FileExistsError(f'No monomer JSONs found for {mol_name}')
    
    logger.info(f'Using monomer file "{json_path}"...')
    with json_path.open('r') as json_file:
        mono_data = json.load(json_file)

    logger.info(f'Loading and matching molecule "{mol_name}"...')
    openff_topology, _, _error = Topology.from_pdb_and_monomer_info(str(pdb_path), json_path, strict=True, verbose=verbose)
    openff_topology.box_vectors = mol_dir.box_vectors.in_units_of(nanometer) # set box vector to allow for periodic simulation (will be non-periodic if mol_dir box vectors are unset i.e. NoneType)
    mol = next(openff_topology.molecules) # get the first molecule (assumed to be the polymer of interest)

    if prevent_overwrites and lc_path.exists(): # check if library charges have already been generated for this molecule
        logger.info('Obtaining partial charges from Library Charge xml...')
        forcefield = ForceField(lc_path, solv_ff_xml, allow_cosmetic_attributes=True) # use both the polymer-specific xml and the solvent FF xml when creating the Forcefield
        
        logger.info('Unpickling charged Molecule...')
        with pickle_path.open('rb') as pickle_file: # read cmol from file if already extant
            cmol = pickle.load(pickle_file)
    else:
        # PERFORMING INITIAL AM1-BCC CHARGING, OR UNPICKLING MOLECULE IF THIS HAS ALREADY BEEN DONE
        if not pickle_path.exists():
            logger.warning('No extant pickled charged Molecule found, performing charging...')
            try:
                cmol = polychg.generate_molecule_charges(mol, partial_charge_method='am1bccelf10') # perform AM1BCC
            except ConformerGenerationError:
                logger.warning('Could not successfully generate conformers')
                continue 

            with pickle_path.open('wb') as pickle_file: # write charged molecule to pickle to avoid constantly redoing AM1
                pickle.dump(cmol, pickle_file)
            mol_dir.info.pickle_file = pickle_path
        
        logger.info('Unpickling charged Molecule...')
        with pickle_path.open('rb') as pickle_file: # read cmol from file if already extant
            cmol = pickle.load(pickle_file)

        # CHARGE AVERAGING
        logger.info(f'Averaging charges over {mol_name} residues...')
        avgs, atom_id_mapping = polychg.get_averaged_charges(cmol, monomer_data=mono_data, distrib_mono_charges=distrib_mono_charges) # average charges over unique residues

        logger.warning('Library Charge file not found OR overwrite allowed, writing new Library Charge xml...')
        forcefield, lib_chgs = polychg.write_new_library_charges(avgs, main_ff_xml, output_path=lc_path)
        mol_dir.info.ff_file = lc_path
        
        # CREATE JSON WITH AVERAGED CHARGES IF ONE DOES NOT ALREADY EXIST
        if mol_dir.info.monomer_file_chgd is None:
            logger.info('Writing new monomer JSON with charge data...')

            mono_chgs = {avgd_res.residue_name : avgd_res.charges for avgd_res in avgs}
            if mol_dir.info.solvent is not None:
                mono_data['charges'] = {**mono_chgs, **mol_dir.info.solvent.monomer_json_data['charges']} # ensure solvent "monomer" charges are also recorded

            chgd_json_path = json_path.with_name(f'{json_path.stem}_charged.json')
            chgd_json_path.touch()
            with chgd_json_path.open('w') as new_json:
                json.dump(mono_data, new_json, indent=4)
            mol_dir.info.monomer_file_chgd = chgd_json_path

    # RUN OpenMM SIMULATION FOR TARGET MOLECULE
    if run_sims:
        logger.info(f'Running {sim_time} OpenMM sim at {temperature} for {num_steps} steps...')

        forcefield = ForceField(lc_path, solv_ff_xml, allow_cosmetic_attributes=True)
        interchange = Interchange.from_smirnoff(force_field=forcefield, topology=openff_topology, charge_from_molecules=[cmol]) # generate Interchange with new library charges prior to writing to file
        integrator  = LangevinMiddleIntegrator(temperature, friction_coeff, timestep)
        
        sim = polysim.create_simulation(interchange, integrator)
        polysim.run_simulation(sim, output_folder=output_folder, output_name=mol_name, num_steps=num_steps, record_freq=record_freq)
    
    mol_dir.to_file() # ensure directory data reflects changes to files
    # filetree.startfile(output_folder)
    clear_output() # for Jupyter notebooks only, can freely comment this out
    logger.info(f'Successfully completed actions on {mol_name}\n')
    logger.removeHandler(master_handler) # free up master log handler - prevents bleed-over between multiple sim sessions

master_logger.info(f'Charging{" & simulation" if run_sims else ""} loop completed')

In [None]:
# Debugging aid : display the handlers currently attached to "logger" (the per-polymer logger most recently assigned by the v0 loop above)
logger.handlers

In [None]:
# Display the logging module's registry of named loggers
logging.root.manager.loggerDict # use this to link SMIRNOFF and others to logging

In [None]:
def failed_sims() -> set:
    '''Names of the sampled molecules which are absent from the manager's record of completed simulations'''
    completed_names = mgr.all_completed_sims(mol_dirs).keys()
    return set(sample_mols) - set(completed_names)

failed_sims()

## Checking molecule size discrepancies between RDKit, OpenFF Molecule.from_file, and OpenFF Molecule.from_pdb_and_monomer_info

In [None]:
# Atom counts for each polymer PDB, keyed first by the method used to obtain them
mol_sizes = {}

# RDKIT sizes
sizes = {}
for path in POLY_PDB_PATH.glob('**/*.pdb'):
    try:
        rdmol = Chem.MolFromPDBFile(str(path), removeHs=False)
        sizes[path.stem] = rdmol.GetNumAtoms()
    except Exception as e: # report the failure and move on to the next file
        print(path.name, e)
sizes = general.sort_dict_by_values(sizes)
mol_sizes['rdkit'] = sizes

# OpenFF sizes
sizes2 = {}
for path in POLY_PDB_PATH.glob('**/*.pdb'):
    try:
        offmol = Molecule.from_file(path, toolkit_registry=polychg.TOOLKITS['openeye']())
        sizes2[path.stem] = len(offmol.atoms)
    except Exception as e: # report the failure and move on to the next file
        print(path.stem, ' failed', e)
mol_sizes['openff_file'] = sizes2

# Pre-computed sizes from old workflow
q = Path('Available Polymers.json')
with q.open('r') as file:
    precomputed_sizes = json.load(file)
mol_sizes['openff_pdb_mono'] = precomputed_sizes

In [None]:
# Build a table with one row per molecule name and one column per size source
names = set().union(*mol_sizes.values()) # every molecule name reported by at least one source
size_sources = list(mol_sizes.keys())
header = {'Species' : size_sources}

compare = {
    **header, # header row comes first
    **{
        mol_name : [mol_sizes[source].get(mol_name) for source in size_sources] # None where a source lacks the molecule
            for mol_name in sorted(names)
    }
}

# Print only the rows whose entries are not all identical
for species, sizes in compare.items():
    if len(set(sizes)) <= 1:
        continue
    print(species, sizes)

compare        

## Checking which force field XMLs are non-unique

In [None]:
ff_dir = CORE_PATH/'force_fields'

# Collect the full text of every entry in the force field directory, printing each file name as it is read
dat = []
for ff_xml in ff_dir.iterdir():
    info = ff_xml.read_text()
    print(ff_xml.name)
    dat.append(info)

len(set(dat)) # number of distinct file contents

In [None]:
# Pairwise comparison of the collected texts : entry [r][c] is True when entries r and c of "dat" are identical
np.array([
    [row_text == col_text
        for col_text in dat]
            for row_text in dat
])

## Some other section