# Development of miscellaneous new features for polymerist

## Fixing degree of polymerization calculation weirdness in polymer builder

In [2]:
import logging
logging.basicConfig(level=logging.INFO)

from polymerist.polymers.monomers import MonomerGroup
from polymerist.polymers.monomers.specification import expanded_SMILES, compliant_mol_SMARTS
from polymerist.polymers.building import build_linear_polymer, mbmol_to_openmm_pdb

from polymerist.smileslib.primitives import Smarts, Smiles

INFO:numexpr.utils:Note: NumExpr detected 20 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16.
INFO:numexpr.utils:NumExpr defaulting to 16 threads.


In [3]:
from rdkit import Chem


# Notebook toggles: `save` gates writing the monomer group to file and
# `show` gates the per-monomer display loop at the bottom of this cell
show : bool = False
save : bool = False

# SMILES templates for each monomer fragment, keyed by fragment name
# Entries named "-1A"/"-1B" contain a single "*" atom, while entries named "-2" contain two
# (presumably "*" marks where a neighboring repeat unit attaches - TODO confirm)
smiles_frags : dict[str, Smiles] = {
    # PEG
    'PEG-1A' : '[OH]CC*',
    'PEG-1B' : '*OCC[OH]',
    'PEG-2'  : '*OCC*',
    # PLA
    'PLA-1A' : '[OH]C(C)C(=O)*',
    'PLA-1B' : '*OC(C)C(=O)[OH]',
    'PLA-2'  : '*OC(C)C(=O)*',
    # 'PLA-2'  : '*[NH]*',
    # PGA
    'PGA-1A' : '[OH]CC(=O)*',
    'PGA-1B' : '*OCC(=O)[OH]',
    'PGA-2'  : '*OCC(=O)*',
}

# Pass each template through expanded_SMILES() and then compliant_mol_SMARTS(),
# storing the final result under the same fragment name
# NOTE(review): annotation promises list[Smarts] values, but a single `smarts` object is stored per key - confirm which is intended
smarts_frags : dict[str, list[Smarts]] = {}
for name, smiles in smiles_frags.items():
    smiles_expl = expanded_SMILES(smiles)
    smarts = compliant_mol_SMARTS(smiles_expl)
    # NOTE(review): `mol` is only consumed by the commented-out display code below, and is
    # parsed from `smiles_expl` rather than from `smarts` - confirm this is deliberate
    mol = Chem.MolFromSmarts(smiles_expl)
    smarts_frags[name] = smarts

    # print(name)
    # display(mol)
    # print('='*10)

# smarts_frags['PEG-1A'] = [smarts_frags['PEG-1A'], '[NH2]*']
# Bundle all fragments into one MonomerGroup; no terminal orientations are passed explicitly here
monogrp = MonomerGroup(
    smarts_frags,
    # term_orient={'head' : 'PEG-1A','tail' : 'PEG-1A'}
)
if save:
    monogrp.to_file('peg-pla-pga.json')
if show:
    # `display` is assumed to be the builtin provided by the IPython/Jupyter session
    for name, mol in monogrp.iter_rdmols(term_only=None):
        print(name)
        display(mol)
        print('='*10)



In [25]:
# Request a 3-monomer linear chain from `monogrp` using the one-symbol sequence 'A', with energy minimization
# switched off; the captured log below reports 1 middle + 2 terminal monomers (24 atoms) for this call
chain = build_linear_polymer(monogrp, n_monomers=3, sequence='A', allow_partial_sequences=True, energy_minimize=False)
# Hand the built chain to mbmol_to_openmm_pdb together with the target file name
# (presumably writes a PDB file at that path - TODO confirm against polymerist docs)
mbmol_to_openmm_pdb('peg-plga.pdb', chain)

INFO:polymerist.polymers.building.sequencing:Target chain length achievable with 1 whole 1-sequence repeats;
 Namely, polymer will be sequenced as [END-GROUP] + 1*[A] + [END-GROUP], yielding (1*1 + 0) middle monomers + 2 terminal monomers = 3 total monomers)
INFO:polymerist.polymers.building.linear:Registering middle monomer PEG-2 (block identifier "A")
INFO:polymerist.polymers.building.linear:Registering terminal monomer PEG-1A (orientation "head")
INFO:polymerist.polymers.building.linear:Registering terminal monomer PEG-1B (orientation "tail")
INFO:polymerist.polymers.building.linear:Assembling linear 3-mer chain (estimated 24 atoms)
INFO:polymerist.polymers.building.linear:Successfully assembled linear 3-mer chain (exactly 24 atoms)


In [None]:
from dataclasses import dataclass, field
from polymerist.polymers.exceptions import EndGroupDominatedChain, InsufficientChainLength, EmptyBlockSequence, PartialBlockSequence

@dataclass
class LinearCopolymerSequenceInfo:
    '''
    For encapsulating information about the sequence of repeat units in a periodic, linear copolymer
    Also covers, as trivial special cases, homopolymers and alternating copolymers

    All derived quantities (number of middle units, number of whole kernel repeats,
    leftover partial repeat, etc.) are exposed as read-only properties computed on access

    Raises
    ------
    EmptyBlockSequence
        If the sequence kernel is empty
    EndGroupDominatedChain
        If more terminal repeat units are requested than the total number of repeat units
    '''
    # string of block identifiers (one symbol per repeat unit) which is tiled to form the middle of the chain
    sequence_kernel : str
    # total number of repeat units in the chain, INCLUDING any terminal units
    n_repeat_units_total : int
    # number of repeat units which are terminal (i.e. not drawn from the sequence kernel)
    n_repeat_units_terminal : int = 0

    def __post_init__(self) -> None:
        '''Validate the provided fields immediately after dataclass initialization'''
        # checked first, since an empty kernel would cause division by zero in the period arithmetic below
        if not self.sequence_kernel:
            raise EmptyBlockSequence('Must provide non-empty sequence kernel to yield a valid (co)polymer sequence')

        if self.n_repeat_units_middle < 0:
            raise EndGroupDominatedChain(
                f'Number of terminal monomers exceeds requested chain length; ({self.n_repeat_units}-mer '
                f'chain can\'t possibly contain {self.n_repeat_units_terminal} terminal monomers)'
            )

    @property
    def n_repeat_units(self) -> int:
        '''Briefer alias of self.n_repeat_units_total'''
        return self.n_repeat_units_total
    nru = nrus = n_repeat_units

    @property
    def n_repeat_units_middle(self) -> int:
        '''Number of middle (i.e. non-terminal) repeat units'''
        return self.n_repeat_units - self.n_repeat_units_terminal
    nru_mid = nrus_mid = n_repeat_units_middle

    # Whole sequence periods
    @property
    def block_size(self) -> int:
        '''Number of repeat units in one whole iteration of the kernel block'''
        return len(self.sequence_kernel)
    period = block_size

    @property
    def n_full_periods(self) -> int:
        '''
        Largest number of complete repetitions of the sequence kernel which, when taken
        together, contain no more repeat units than the specified number of middle units
        '''
        return self.n_repeat_units_middle // self.block_size

    # Partial sequence residues
    @property
    def n_residual_repeat_units(self) -> int:
        '''
        Difference between number of middle repeat units and units which
        would occur in maximal full periods of the kernel

        By construction, is strictly less than the block size and is zero exactly
        when the middle units consist of a whole number of kernel repeats
        '''
        return self.n_repeat_units_middle % self.block_size
    n_residual_symbols = n_res = n_residual_repeat_units

    @property
    def has_residual(self) -> bool:
        '''
        Whether a partial repeat of the kernel is needed, i.e. True exactly when the target number
        of middle repeat units can NOT be attained by a whole number of kernel repeats
        '''
        return bool(self.n_residual_repeat_units)

    @property
    def sequence_residual(self) -> str:
        '''
        Partial repeat of the kernel sequence needed to attain the specified number of middle units
        Is the empty string when no partial repeat is needed
        '''
        return self.sequence_kernel[:self.n_residual_repeat_units]
    residual = sequence_residual

In [11]:
# Example instance: 4-symbol kernel, 17 repeat units in total, 2 of which are terminal
lcps = LinearCopolymerSequenceInfo(
    sequence_kernel='ABCB',
    n_repeat_units_total=17,
    n_repeat_units_terminal=2,
)

In [12]:
lcps.nrus # check that the short `nrus` alias resolves to the total repeat unit count (cell output below shows 17)

17

## Testing polymerist importability within environment

In [None]:
import numpy as np
from openff.toolkit import Molecule, Topology, ForceField

In [None]:
import polymerist as ps
from polymerist.genutils import pyimports, importutils

import pandas as pd
# Print whatever importutils.module_hierarchy() returns for the top-level polymerist package;
# per the section heading, this cell serves as a check that polymerist is importable in this environment
print(importutils.module_hierarchy(ps))

In [None]:
import nglview

# Report the installed nglview version, then call its demo() as the last expression of
# the cell so that the returned object is rendered by the notebook
print(nglview.__version__)
nglview.demo()

In [None]:
from polymerist.polymers.monomers import specification
from rdkit import Chem

# Test SMILES string; also reused by the OpenFF cell further down
smi = 'CCO-c1ccccc1-N=C=C'
mol1 = Chem.MolFromSmiles(smi)
display(mol1) # `display` is assumed to be the builtin provided by the IPython/Jupyter session

# Expand the SMILES (with atom map numbers assigned), then convert the expanded form with compliant_mol_SMARTS()
sma = specification.expanded_SMILES(smi, assign_map_nums=True)
# NOTE(review): `exp_sma` is never used below; `mol2` is parsed from `sma` instead - confirm which was intended
exp_sma = specification.compliant_mol_SMARTS(sma)
mol2 = Chem.MolFromSmarts(sma)
display(mol2)


In [None]:
from openff.toolkit import Molecule

# Build an OpenFF Molecule from the `smi` string defined in the preceding cell, request a
# single conformer, and visualize with the nglview backend (last expression, so the notebook renders it)
offmol = Molecule.from_smiles(smi)
offmol.generate_conformers(n_conformers=1)
offmol.visualize(backend='nglview')

# Parsing lines from PDB file

In [None]:
# Sample fixed-width PDB records used to exercise the tokenizer defined below:
# two ATOM records (3- and 4-character atom names) which stop after the temperature factor...
l1 = 'ATOM    189  C99 OCT     5      39.590  30.100  38.320  1.00  0.00'
l2 = 'ATOM    190 C100 OCT     5      38.850  31.110  37.700  1.00  0.00'
# ...and one HETATM record which also carries a trailing element symbol
l3 = 'HETATM   47  H21 UNL     1       0.000   0.000   0.000  1.00  0.00           H '

In [None]:
from typing import Any

# Column range of each field in a fixed-width PDB ATOM/HETATM record, keyed by field name
# Columns are 1-indexed and INCLUSIVE on both ends, exactly as tabulated in the PDB format description
# NOTE: the 'Is Heteratom' field (columns 1-6) holds the record name text (e.g. 'ATOM  ' or 'HETATM'),
# not a boolean; the key is kept as-is so that existing lookups by name continue to work
PDB_ATOM_TOKEN_COLUMNS : dict[str, tuple[int, int]] = {
    'Is Heteratom' : (1, 6),
    'Atom serial number' : (7, 11),
    'Atom name' : (13, 16),
    'Alternate location indicator' : (17, 17),
    'Residue name' : (18, 20),
    'Chain identifier' : (22, 22),
    'Residue sequence number' : (23, 26),
    'Code for insertions of residues' : (27, 27),
    'X (angstrom)' : (31, 38),
    'Y (angstrom)' : (39, 46),
    'Z (angstrom)' : (47, 54),
    'Occupancy' : (55, 60),
    'Temperature factor' : (61, 66),
    'Segment identifier' : (73, 76),
    'Element symbol' : (77, 78),
    'Charge' : (79, 80),
} # taken from PDB spec doc (https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html)
def tokenize_pdb_atom_line(line : str, strip : bool=False) -> dict[str, str]:
    '''
    Split a single fixed-width PDB ATOM/HETATM record into its named fields

    Parameters
    ----------
    line : str
        One record line; may be shorter than 80 characters and may end in a line terminator
    strip : bool, default False
        If True, surrounding whitespace is removed from every token
        If False (the default), tokens are returned at their full column width

    Returns
    -------
    tokens : dict[str, str]
        Raw text of each field, keyed by the field names in PDB_ATOM_TOKEN_COLUMNS
        No type conversion is performed; fields absent from a short line come back blank
    '''
    # drop any line terminator BEFORE padding; otherwise the newline of a line read from
    # file gets padded into the record and leaks into one of the trailing tokens
    line = line.rstrip('\r\n').ljust(80, ' ') # pad line to 80 characters with spaces
    tokens = {
        prop_name : line[i_start-1:i_end] # shift 1-indexed inclusive start to 0-indexed; inclusive end doubles as exclusive stop
            for prop_name, (i_start, i_end) in PDB_ATOM_TOKEN_COLUMNS.items()
    }
    if strip:
        tokens = {prop_name : token.strip() for prop_name, token in tokens.items()}

    return tokens


# Run the tokenizer over each of the sample records defined above and print the resulting field dicts
for line in (l1, l2, l3):
    print(tokenize_pdb_atom_line(line))

# Another thing