# Creating OpenMM and LAMMPS systems

## Harvest and tabulate paths + info for all Interchange and Topology files

In [2]:
import pandas as pd
import json
from pathlib import Path

# Root directory of the polymer dataset processed by this notebook.
# Other dataset roots that have been swapped in here before:
#   'polymer_structures', 'polymer_validation', 'polymer_revision', 'polymer_update',
#   'polymer_benchmark', 'polymer_improved', 'polymers_atom_limited'
MOL_MASTER_DIR = Path('polymers_streamlined')

# CSV of build records stored directly inside the dataset root
build_records_path = MOL_MASTER_DIR.joinpath('build_records.csv')

# Disabled: load the molecule table straight from the CSV of build records
# mol_file_frame = pd.read_csv(build_records_path, index_col=[0, 1])

In [3]:
# Gather the contents of every "*_RECORD.json" file found (recursively) under MOL_MASTER_DIR
records = []
for record_path in MOL_MASTER_DIR.glob('**/*_RECORD.json'):
    # Path.exists is a method: referencing it without calling it is always truthy, so the
    # check has to be an actual call. is_file() also skips any directory matching the pattern
    if not record_path.is_file():
        continue

    with record_path.open('r') as record_file:
        mol_info = json.load(record_file)
    mol_info['record_path'] = record_path # remember where the record came from so it can be appended to later

    records.append(mol_info)

# Tabulate the records, indexed by (mechanism, polymer_name)
mol_file_frame = pd.DataFrame.from_records(records)
mol_file_frame.set_index(['mechanism', 'polymer_name'], inplace=True)
for str_path_col in ('topology_path', 'interchange_path', 'directory'):
    mol_file_frame[str_path_col] = mol_file_frame[str_path_col].map(Path) # de-stringify file Paths

In [4]:
from typing import Optional # needed by the annotation below, which is evaluated at module level

# Debug option: when set to an integer N, only the first N rows of the molecule table are kept
# (DataFrame.head acts on the table as a whole, not per mechanism); None keeps every row
take_first_n : Optional[int] = None
if take_first_n is not None:
    mol_file_frame = mol_file_frame.head(take_first_n)

# Create MD files from Interchange, evaluate starting energies

## Execute MD loop proper

In [None]:
import pickle
import cProfile
from rich.live import Live

from time import sleep
from gc import collect

from openmm import XmlSerializer, Context
from openmm.unit import kilojoule_per_mole

from polymerist.genutils.containers import RecursiveDict
from polymerist.genutils.textual.interpolation import insert_into_text_periodic
from polymerist.genutils.fileutils.pathutils import assemble_path
from polymerist.genutils.fileutils.jsonio.update import append_to_json

from polymerist.duration import Timer
from polymerist.lammpstools import lammpseval
from polymerist.mdtools.openfftools import topology

from polymerist.mdtools.openmmtools.thermo import EnsembleFactory
from polymerist.mdtools.openmmtools.parameters import SimulationParameters
from polymerist.mdtools.openmmtools import serialization
from polymerist.mdtools.openmmtools.evaluation import eval_openmm_energies_separated

from polybuild_utils import initialize_polymer_progress, interchange_to_lammps, interchange_to_openmm

In [None]:
# parameters
# Simulation settings are read from 'sim_params.json'; the ensemble factory built from its
# thermodynamic parameters supplies the integrator serialized during the OpenMM export below
sim_params = SimulationParameters.from_file('sim_params.json')
ensfac = EnsembleFactory.from_thermo_params(sim_params.thermo_params)

build_lammps : bool = True  # write (if missing) and evaluate LAMMPS files
build_openmm : bool = False # write (if missing) and evaluate OpenMM files
lmp_args = ["-screen", "none"]#, "-log", "none"] # blocks stdout and log.lammps writes to avoid clutter

# progress display: one renderable each for the current action, the current compound and overall progress
group, (status_id, curr_compound_id, comp_progress_id) = initialize_polymer_progress(num_compounds=len(mol_file_frame))
status_readout, compound_readout, compound_progress = group.renderables

# energies[platform][(mechanism, polymer_name)] -> energy contributions for that compound
energies = RecursiveDict()
with Live(group, refresh_per_second=10) as live:
    # ensure bars start at 0
    for pbar in group.renderables:
        for task_id in pbar.task_ids:
            pbar.reset(task_id)

    # iterate over all distinct chemistries by reaction mechanism
    for (mechanism, polymer_name), row in mol_file_frame.iterrows():
        comp_desc = f'{polymer_name} ({row.n_atoms_cap} {row.oligomer_type} on {row.lattice_size} lattice)'
        # comp_desc = insert_into_text_periodic(comp_desc, period=32)
        compound_readout.update(curr_compound_id, polymer_name=comp_desc, mechanism=mechanism)

        # load recorded topology and interchange files
        if (build_lammps or build_openmm): # if neither is being loaded, don't bother trying to load an Interchange TODO: modify this to check if the directory tree exists
            status_readout.update(status_id, action='Loading Interchange from file')
            offtop = topology.topology_from_sdf(row.topology_path, allow_undefined_stereo=True)
            # NOTE(review): pickle.load executes arbitrary code from the file - only load Interchange pickles from a trusted source
            with row.interchange_path.open('rb') as inc_file:
                interchange = pickle.load(inc_file)

        # LAMMPS
        if build_lammps:
            lmp_dir : Path = row.directory / 'LAMMPS'
            lmp_dir.mkdir(exist_ok=True)

            lmp_input_path = assemble_path(lmp_dir, polymer_name, extension='in')
            lmp_data_path  = assemble_path(lmp_dir, polymer_name, extension='lammps')
            lmp_prof_path  = assemble_path(lmp_dir, polymer_name, extension='txt', postfix='profile')
            lmp_md_paths = [lmp_input_path, lmp_data_path]

            # NOTE(review): touching the input file makes it exist before the check below, so in practice
            # only a missing data file triggers regeneration - confirm this is intended
            lmp_input_path.touch()

            ## writing LAMMPS files
            if not all(path.exists() for path in lmp_md_paths): # avoid double-writing if files are already generated
                status_readout.update(status_id, action='Writing LAMMPS files')
                with Timer() as lammps_timer:
                    lmp_profile = cProfile.Profile()
                    lmp_ret = lmp_profile.runcall(
                        interchange_to_lammps,
                        interchange=interchange,
                        lmp_data_path=lmp_data_path,
                        lmp_input_path=lmp_input_path
                    )

                # keep the profiler statistics next to the generated files and log the conversion time in the compound's record
                lmp_profile.dump_stats(lmp_prof_path)
                if row['record_path'].exists():
                    append_to_json(row['record_path'], lammps_time=lammps_timer.time_taken)

            ## evaluating LAMMPS energies
            # box_params = lammpseval.get_lammps_unit_cell(lmp_input_path, cmdargs=list(lmp_args)) # need to make copy of args, list seems to be modified upon pass
            status_readout.update(status_id, action='Evaluating LAMMPS structure energies')
            # pass a fresh copy of the arguments on every call: the list seems to be modified upon pass,
            # and sharing a single list would carry those modifications over into later iterations
            energies['LAMMPS'][(mechanism, polymer_name)] = lammpseval.get_lammps_energies(lmp_input_path, preferred_unit=kilojoule_per_mole, cmdargs=list(lmp_args))

        # OpenMM
        if build_openmm:
            omm_dir : Path = row.directory / 'OpenMM'
            omm_dir.mkdir(exist_ok=True)

            sim_paths = serialization.SimulationPaths()
            omm_prof_path  = assemble_path(omm_dir, polymer_name, postfix='profile'   , extension='txt')
            omm_integ_path = assemble_path(omm_dir, polymer_name, postfix='integrator', extension='xml')
            # chained assignment: keeps the returned collection of paths and also unpacks its three entries
            omm_md_paths = [omm_top_path, omm_sys_path, omm_state_path] = sim_paths.init_top_and_sys_paths(omm_dir, polymer_name)
            omm_md_paths += (omm_integ_path,)

            if not all(path.exists() for path in omm_md_paths): # avoid double-writing if files are already generated
                status_readout.update(status_id, action='Writing OpenMM files')
                with Timer() as openmm_timer:
                    omm_profile = cProfile.Profile()
                    integrator = ensfac.integrator(time_step=sim_params.integ_params.time_step)
                    # the integrator is serialized separately so it can be read back for energy evaluation below
                    with omm_integ_path.open('w') as integ_file:
                        integ_file.write(XmlSerializer.serialize(integrator))

                    omm_ret = omm_profile.runcall(
                        interchange_to_openmm,
                        interchange=interchange,
                        integrator=integrator,
                        omm_top_path=omm_top_path,
                        omm_sys_path=omm_sys_path,
                        omm_state_path=omm_state_path
                    )

                # keep the profiler statistics next to the generated files and log the conversion time in the compound's record
                omm_profile.dump_stats(omm_prof_path)
                if row['record_path'].exists():
                    append_to_json(row['record_path'], openmm_time=openmm_timer.time_taken)

            ## evaluating OpenMM energies
            # energies are always evaluated from the files on disk, whether or not they were (re)written above
            status_readout.update(status_id, action='Loading OpenMM context')
            with omm_sys_path.open('r') as sys_file:
                omm_sys_read = XmlSerializer.deserialize(sys_file.read())

            with omm_state_path.open('r') as state_file:
                omm_state_read = XmlSerializer.deserialize(state_file.read())

            with omm_integ_path.open('r') as integ_file:
                omm_integ_read = XmlSerializer.deserialize(integ_file.read())

            context = Context(omm_sys_read, omm_integ_read)
            serialization.apply_state_to_context(context, omm_state_read)

            status_readout.update(status_id, action='Evaluating OpenMM energies')
            openmm_pot, openmm_kin = eval_openmm_energies_separated(context, preferred_unit=kilojoule_per_mole)
            energies['OpenMM'][(mechanism, polymer_name)] = {**openmm_pot, **openmm_kin} # potential and kinetic terms merged into one record

        compound_progress.advance(comp_progress_id)
        sleep(0.1) # needed to give final bar enough time to catch up
        collect() # manual garbage collector call to try to alleviate memory issues
    status_readout.update(status_id, action='MD file output complete!')

In [None]:
# Write one CSV of energy contributions per MD platform into 'energy_tables',
# keeping the corresponding DataFrames in `edfs` (keyed by platform name)
energy_dir = Path('energy_tables')
energy_dir.mkdir(exist_ok=True)

edfs = {}
for platform, energies_dict in energies.items():
    # file name is built from the dataset root's stem and the platform name
    energy_path = assemble_path(
        energy_dir,
        MOL_MASTER_DIR.stem,
        postfix=f'{platform}_energies',
        extension='csv',
    )

    # one row per key of energies_dict
    edf = pd.DataFrame.from_dict(energies_dict, orient='index')
    edf.to_csv(energy_path)

    edfs[platform] = edf

# Comparing energies

## Loading energy tables and comparing contributions

In [None]:
from functools import reduce
from operator import add
from dataclasses import dataclass

# Render floats in fixed-point form with 4 decimals (disables scientific notation)
pd.options.display.float_format = '{:.4f}'.format

# Easier-to-understand names for OpenMM energies: force label -> short contribution name
force_name_remap = {
    'vdW force': 'vdW',
    'Electrostatics force': 'Electrostatic',
    'vdW 1-4 force': 'vdW 1-4',
    'Electrostatics 1-4 force': 'Electrostatic 1-4',
    'PeriodicTorsionForce': 'Dihedral',
    'HarmonicAngleForce': 'Angle',
    'HarmonicBondForce': 'Bond',
}

@dataclass
class TableFormats:
    '''Describes how the columns of one energy table are merged and pruned'''
    # name of a combined column -> names of the columns whose values are added to form it
    sum_terms: dict[str, list[str]]
    # names of columns to remove from the table
    del_terms: list[str]

# Per-platform column handling: which contributions are summed into a single
# column, and which columns are removed from the table altogether
formats = {
    'OpenMM': TableFormats(
        sum_terms={
            'vdW': ['vdW', 'vdW 1-4'],
            'Coulomb': ['Electrostatic', 'Electrostatic 1-4'],
        },
        del_terms=['Kinetic'],
    ),
    'LAMMPS': TableFormats(
        sum_terms={
            'vdW': ['vdW', 'Dispersion'],
            'Dihedral': ['Proper Torsion', 'Improper Torsion'],
            'Coulomb': ['Coulomb Short', 'Coulomb Long'],
        },
        del_terms=['Nonbonded'],
    ),
}

# apply reformatting to respective tables
# Produces, for every platform in `edfs`, a copy of its energy table in which the column
# groups listed in that platform's TableFormats are summed and the redundant columns removed
edfs_fmt = {}
for platform, energy_df in edfs.items():
    fmt = formats[platform]

    new_energy_df = energy_df.copy(deep=True) # leave original unmodified
    if platform == 'OpenMM':
        # The OpenMM sum_terms refer to the short names defined in force_name_remap, so the
        # remap has to be applied before combining. Labels absent from the mapping are left
        # untouched, making this a no-op when the table already uses the short names.
        # NOTE(review): confirm which labels the OpenMM energy evaluation actually emits
        new_energy_df = new_energy_df.rename(columns=force_name_remap)

    # combine selected terms
    for combined_contrib, contribs in fmt.sum_terms.items():
        new_term = reduce(add, (new_energy_df[contrib] for contrib in contribs)) # merge contributions into a single new named term
        new_energy_df = new_energy_df.drop(columns=contribs) # clear contributions
        new_energy_df[combined_contrib] = new_term # done after drop to ensure name clashes don't result in extra deletion

    # delete redundant terms (a missing column raises KeyError, as it would when dropping them one by one)
    new_energy_df = new_energy_df.drop(columns=fmt.del_terms)
    edfs_fmt[platform] = new_energy_df

In [None]:
import matplotlib.pyplot as plt
from polymerist.graphics.plotutils import presize_subplots


from typing import Optional # needed by the annotation below, which is evaluated at module level

col_order = ['Bond', 'Angle', 'Dihedral', 'vdW', 'Coulomb', 'Potential'] # contributions to plot, one subplot each
max_err_perc : Optional[float] = None # None disables filtering; otherwise only rows with every error below this percentage are kept
# max_err_perc : float = 100.0

# absolute relative deviation of the OpenMM energies from the LAMMPS energies, in percent
# NOTE(review): a LAMMPS contribution of exactly zero yields inf/NaN here - confirm this cannot occur
energy_perc_rel_err = ((edfs_fmt['OpenMM'] - edfs_fmt['LAMMPS']) / edfs_fmt['LAMMPS']).abs() * 100
if max_err_perc is not None: # explicit None check so that a threshold of 0.0 is not mistaken for "no filtering"
    err_in_tol = (energy_perc_rel_err < max_err_perc).all(axis=1) # values are already absolute
    energy_perc_rel_err = energy_perc_rel_err[err_in_tol]

# one histogram of the relative errors per contribution, on a 2x3 grid
fig, ax = presize_subplots(nrows=2, ncols=3)
for col, axis in zip(col_order, ax.flatten()):
    heights, bins, patches = axis.hist(energy_perc_rel_err[col], bins=50)
    axis.set_ylabel(f'{col} energy (rel. % error)')
    axis.tick_params(axis='x', rotation=-20)

plt.show()
display(energy_perc_rel_err[col_order])


In [None]:
# Save the table of relative errors as CSV and the histogram figure as PNG in energy_dir;
# both file names carry the dataset name and lattice size as a postfix.
# NOTE(review): `lattice_size` is not assigned at module level in the cells visible here -
# confirm it is defined before this cell runs
rel_err_postfix = f'{MOL_MASTER_DIR.stem}_{lattice_size}'

diff_path = assemble_path(
    energy_dir,
    'Energy_rel_err_table',
    postfix=rel_err_postfix,
    extension='csv',
)
energy_perc_rel_err.to_csv(diff_path)

energy_fig_path = assemble_path(
    energy_dir,
    'Energy_rel_err_graphs',
    postfix=rel_err_postfix,
    extension='png',
)
fig.savefig(energy_fig_path)

# Getting just the systems which have density data available

In [None]:
def get_polymer_name(row : pd.Series) -> str:
    '''Build the lowercase name "poly(<monomer 0>-co-<monomer 1>)" from the two IUPAC monomer name entries of a row'''
    first_monomer = row["IUPAC_name_monomer_0"]
    second_monomer = row["IUPAC_name_monomer_1"]

    return 'poly({}-co-{})'.format(first_monomer, second_monomer).lower()

# Load the master monomer table (first CSV column used as the index)
p = Path('monomer_data_processed') / 'monomer_data_MASTER.csv'
polyid_df = pd.read_csv(p, index_col=0)

# Derive a polymer name for each row from its monomer names, then index the table
# by (rxn_name, polymer_name) and keep only the rows which have a density value
polyid_df['polymer_name'] = polyid_df.apply(get_polymer_name, axis=1)
polyid_df = polyid_df.set_index(['rxn_name', 'polymer_name'])
polyid_df = polyid_df[polyid_df['Density'].notna()]

# Entries present both in the density table and in the relative-error table
common_index = polyid_df.index.intersection(energy_perc_rel_err.index)
polyid_df.loc[common_index, 'Density']

In [None]:
# Relative errors restricted to the systems with density data, with the density attached as an extra column.
# The explicit copy makes has_density an independent frame, so adding a column neither
# touches energy_perc_rel_err nor triggers pandas' SettingWithCopyWarning
has_density = energy_perc_rel_err.loc[common_index].copy()
has_density['Density'] = polyid_df.loc[common_index, 'Density'] # single .loc call instead of chained indexing
has_density