# For verifying that PDB files present are loadable via all platforms

# Imports

In [None]:
# Suppressing annoying warnings (!must be done first!)
import warnings

# General
import re
import pandas as pd
from pathlib import Path
from shutil import copyfile

# Logging
import logging
from rich.progress import Progress, track

# Cheminformatics
from rdkit import Chem
from openmm.app import PDBFile
from openff.toolkit import Topology

# Custom
from polysaccharide2.genutils.fileutils.filetree import clear_dir
from polysaccharide2.monomers.repr import MonomerGroup
from polysaccharide2.residues.partition import partition

# Correcting PDB residue name spacing

In [None]:
def adjust_PDB_resname_pos(pdb_block : str, delimiter : str='\n', res_idx : int=17-1) -> str:
    '''For correcting spacing issues with PDB residue names

    Every line of "pdb_block" (split on "delimiter") which begins with ATOM or HETATM
    and holds a non-space character at position "res_idx" has a single space inserted
    at that position; all other lines are passed through unchanged

    Parameters
    ----------
    pdb_block : str
        Text contents of a PDB file
    delimiter : str
        Separator used both to split the block into lines and to rejoin the result
    res_idx : int
        0-based character index to inspect (the default corresponds to column 17)

    Returns
    -------
    str
        The block with the affected atom records shifted right by one character from "res_idx" onwards
    '''
    lines = []
    for line in pdb_block.split(delimiter):
        is_atom_record = line.startswith(('HETATM', 'ATOM'))
        # atom records too short to reach "res_idx" (e.g. truncated lines) have nothing to shift;
        # checking the length first avoids an IndexError when probing the character
        if is_atom_record and (len(line) > res_idx) and (line[res_idx] != ' '):
            lines.append(line[:res_idx] + ' ' + line[res_idx:])
        else:
            lines.append(line)

    return delimiter.join(lines)

In [None]:
# Input locations : example PDBs (one subdirectory per polymer type) and monomer JSON files
pdb_dir  = Path('polymer_examples/compatible_pdbs')
mono_dir = Path('polymer_examples/monomer_generation/json_files/')

# Output location for the (possibly corrected) PDB copies
clean_pdb_dir = Path('cleaned_pdbs')
clean_pdb_dir.mkdir(exist_ok=True)
clear_dir(clean_pdb_dir) # project helper; presumably empties the directory so each run starts fresh

SQUARE_RE = re.compile(r's\d+$') # matches file stems ending in "s<digits>"
pdb_subdirs_valid = [
    pdb_subdir
        for pdb_subdir in pdb_dir.iterdir()
            if pdb_subdir.is_dir()
]

records = {} # maps (polymer type, molecule name) -> True if RDKit produced a non-empty Mol from either attempt
with Progress() as progress:
    warnings.filterwarnings('ignore', category=DeprecationWarning) # doesn't actually seem to do anything about mbuild warnings
    task1  = progress.add_task('Polymer Type', total=len(pdb_subdirs_valid))
    task2  = progress.add_task('Molecule') # totals will vary by type
    STATUS = progress.add_task('Status') # only used to display a status message for the current molecule

    for pdb_subdir in pdb_subdirs_valid:
        poly_type = pdb_subdir.name
        progress.update(task1, description=f'Polymer Type : {poly_type}')

        # generate copy directories for polymer type
        pdb_out_subdir = clean_pdb_dir / poly_type
        pdb_out_subdir.mkdir(exist_ok=True)

        # locate all square-free PDBs present; only regular files are kept,
        # since a nested directory could neither be read nor copied below
        pdb_paths_valid = [
            pdb_path
                for pdb_path in pdb_subdir.iterdir()
                    if pdb_path.is_file() and not SQUARE_RE.search(pdb_path.stem) # ignore squared patterns
        ]

        progress.reset(task2, total=len(pdb_paths_valid)) # clear progress on secondary bar, set length to new pdb count
        for pdb_path in pdb_paths_valid:
            mol_name = pdb_path.stem
            progress.update(task2, description=f'Molecule : {mol_name}')
            progress.update(STATUS, description='Waiting...')

            pdb_block_orig = pdb_path.read_text()
            pdb_block_shifted = adjust_PDB_resname_pos(pdb_block_orig)
            pdb_out_path = pdb_out_subdir / pdb_path.name

            # candidate PDB blocks in the order they are attempted : (success message, failure message, block)
            steps = [
                ('Worked the first time! Continuing...', 'INITIALLY FAILED, attempting PDB Residue Name shift', pdb_block_orig),
                ('Residue name shift fixed it! Continuing...', 'FAILED AGAIN, issue is something deeper', pdb_block_shifted),
            ]

            for succ_msg, fail_msg, pdb_block in steps:
                # an attempt succeeds only if RDKit returns a Mol AND that Mol contains atoms
                rdmol = Chem.MolFromPDBBlock(pdb_block)
                if (rdmol is not None) and (rdmol.GetNumAtoms() != 0):
                    progress.update(STATUS, description=succ_msg)
                    pdb_out_path.write_text(pdb_block) # keep whichever variant loaded
                    worked = True
                    break # skip external "else" clause if successful

                # reached for both a None result and an empty Mol, so the status always reflects the failure
                progress.update(STATUS, description=fail_msg)
            else:
                # neither variant loaded : fall back to an unmodified copy of the source file
                progress.update(STATUS, description='No remedy by shifting, defaulting to original file')
                copyfile(pdb_path, pdb_out_path)
                worked = False

            records[(poly_type, mol_name)] = worked
            progress.advance(task2, advance=1)
            progress.refresh()

        progress.advance(task1, advance=1)
        progress.refresh()

# Curating available PDBs and monomer files

## Copying and organizing files, taking inventory of all PDBs and accompanying monomer files (if any)

In [None]:
# Choose which PDB collection to inventory : the "cleaned_pdbs" copies (True) or the example PDBs (False)
use_cleaned = True

mono_dir = Path('polymer_examples/monomer_generation/json_files/')
pdb_dir, out_dir = (
    (Path('cleaned_pdbs'), Path('pdb_test_cleaned'))
        if use_cleaned else
    (Path('polymer_examples/compatible_pdbs'), Path('pdb_test'))
)

# create the output root, then clear it to ensure the directory begins empty
out_dir.mkdir(exist_ok=True)
clear_dir(out_dir)

# subdirectories receiving the PDB copies and the monomer-file copies
pdb_out  = out_dir / 'pdbs'
mono_out = out_dir / 'monos'
pdb_out.mkdir(exist_ok=True)
mono_out.mkdir(exist_ok=True)

In [None]:
SQUARE_RE = re.compile(r's\d+$') # matches file stems ending in "s<digits>"

records = [] # one dict per PDB found; each becomes a row of the inventory table
pdb_subdirs_valid = [
    pdb_subdir
        for pdb_subdir in pdb_dir.iterdir()
            if pdb_subdir.is_dir()
]

with Progress() as progress:
    warnings.filterwarnings('ignore', category=DeprecationWarning) # doesn't actually seem to do anything about mbuild warnings
    task1 = progress.add_task('Polymer Type', total=len(pdb_subdirs_valid))
    task2 = progress.add_task('Molecule') # totals will vary by type

    for pdb_subdir in pdb_subdirs_valid:
        poly_type = pdb_subdir.name
        progress.update(task1, description=f'Polymer Type : {poly_type}')

        # generate copy directories for polymer type
        pdb_out_subdir = pdb_out / poly_type
        pdb_out_subdir.mkdir(exist_ok=True)

        mono_out_subdir = mono_out / poly_type
        mono_out_subdir.mkdir(exist_ok=True)

        # locate all square-free PDBs present; only regular files are kept,
        # since copyfile cannot copy a nested directory
        pdb_paths_valid = [
            pdb_path
                for pdb_path in pdb_subdir.iterdir()
                    if pdb_path.is_file() and not SQUARE_RE.search(pdb_path.stem) # ignore squared patterns
        ]

        progress.reset(task2, total=len(pdb_paths_valid)) # clear progress on secondary bar, set length to new pdb count
        for pdb_path in pdb_paths_valid:
            mol_name = pdb_path.stem
            progress.update(task2, description=f'Molecule : {mol_name}')
            copyfile(pdb_path, pdb_out_subdir / pdb_path.name)

            # a monomer file is matched purely by name : "<molecule name>.json" directly inside mono_dir
            mono_path = mono_dir / f'{mol_name}.json'
            if mono_path.exists():
                copyfile(mono_path, mono_out_subdir / mono_path.name)
            else:
                mono_path = None # recorded as "no monomer file available"

            # NOTE : the paths recorded are the source locations, not the copies made above
            records.append({
                'Polymer Type' : poly_type,
                'Molecule' : mol_name,
                'PDB Path' : pdb_path,
                'Monomer Path' : mono_path
            })
            progress.advance(task2, advance=1)
            progress.refresh()

        progress.advance(task1, advance=1)
        progress.refresh()

# format dataframe into MultiIndex
idx_cols = ['Polymer Type', 'Molecule']
pdb_df = pd.DataFrame.from_records(records, index=idx_cols) # MultiIndex by type and mol
pdb_df.sort_values(idx_cols, inplace=True) # sort by name within each polymer type

# extract levels for reference, save inventory to csv
poly_types, mols = pdb_df.index.levels
pdb_df.to_csv(out_dir / 'pdb_inventory.csv')

In [None]:
from openmm.unit import nanosecond

# scratch mapping pairing a bare number with an OpenMM unit object
a = dict(value=5, unit=nanosecond)

# Testing Platform-based loading

## Defining PDB Loading tests for each platform

### Abstract base interface

In [None]:
from typing import Any, Optional
from abc import ABC, abstractmethod, abstractclassmethod, abstractproperty
from polysaccharide2.genutils.decorators.classmod import register_subclasses

@register_subclasses(key_attr='name')
class PDBLoadTester(ABC):
    '''For defining framework-specific PDB reading tests

    Concrete testers must provide a "name" (the attribute passed as key_attr to
    register_subclasses) and implement "load_pdb_obj"
    '''
    # declared as an abstract property; the deprecated abc.abstractproperty stacked over classmethod
    # produced a property whose getter was a classmethod object and could never be called
    @property
    @abstractmethod
    def name(self) -> str:
        '''Label identifying the framework under test; may be overridden with a plain class attribute'''
        pass

    @abstractmethod
    def load_pdb_obj(self, pdb_path : Path, mono_path : Path) -> Optional[Any]:
        '''Implement the loading check for an individual PDB and monomer file here

        Parameters
        ----------
        pdb_path : Path
            Location of the PDB file to load
        mono_path : Path
            Location of the accompanying monomer file (callers may pass None when there is none)

        Returns
        -------
        Optional[Any]
            The framework-specific object loaded from the PDB, or None if it could not be loaded
        '''
        pass

### Concrete implementations by framework

In [None]:
from polysaccharide2.monomers.repr import MonomerGroup
from openff.toolkit.utils.exceptions import UnassignedChemistryInPDBError


class RDKitPDBLoadTester(PDBLoadTester):
    '''PDB loading test for RDKit'''
    name = 'RDKit'

    def load_pdb_obj(self, pdb_path: Path, mono_path: Path) -> Optional[Any]:
        '''Load Mol from PDB file

        "mono_path" is accepted to satisfy the shared interface but is not used here.
        Returns the Mol when it holds at least one atom, and None otherwise
        (including when RDKit hands back None instead of a Mol)
        '''
        rdmol = Chem.MolFromPDBFile(str(pdb_path))
        # explicit None test rather than catching AttributeError, so that
        # unrelated attribute errors are not silently reported as "not loadable"
        if (rdmol is not None) and (rdmol.GetNumAtoms() > 0):
            return rdmol
        return None
    
class OpenMMPDBLoadTester(PDBLoadTester):
    name = 'OpenMM'

    def load_pdb_obj(self, pdb_path: Path, mono_path: Path) -> Optional[Any]:
        '''Load an OpenMM PDBFile from the given path ("mono_path" is unused)

        Returns the PDBFile object, or None if constructing it raises ValueError
        '''
        try:
            omm_pdb = PDBFile(str(pdb_path))
        except ValueError:
            omm_pdb = None # treated as "not loadable"

        return omm_pdb
        
class OpenFFPDBLoadTester(PDBLoadTester):
    name = 'OpenFF'

    def load_pdb_obj(self, pdb_path: Path, mono_path: Path) -> Optional[Any]:
        '''Load an OpenFF Topology from a PDB, using the monomers read from "mono_path" as custom substructures

        Returns the Topology, or None when no monomer file is given or when
        the load raises UnassignedChemistryInPDBError
        '''
        if mono_path is not None:
            # the monomer file is read outside the try, so errors while reading it still propagate
            monomer_group = MonomerGroup.from_file(mono_path)
            try:
                return Topology.from_pdb(str(pdb_path), _custom_substructures=monomer_group.monomers)
            except UnassignedChemistryInPDBError:
                pass # fall through to the shared "not loadable" return

        return None

## Looping over all available PDBs and testing by framework

In [None]:
# Results per molecule, both keyed by (polymer type, molecule name)
loadable_dict = {} # values : {'<framework> Loadable?' : bool}
pdb_obj_dict  = {} # values : {'<framework> PDB Object' : loaded object or None}

with Progress() as progress:
    warnings.filterwarnings('ignore', category=DeprecationWarning) # doesn't actually seem to do anything about mbuild warnings
    task1 = progress.add_task('Polymer Type', total=len(poly_types))
    task2 = progress.add_task('Molecule')
    task3 = progress.add_task('Framework', total=len(PDBLoadTester.subclass_registry))

    # outer level of the inventory index is the polymer type
    for poly_type, ptype_df in pdb_df.groupby(level=0):
        progress.update(task1, description=f'Polymer Type : {poly_type}')
        progress.reset(task2, total=len(ptype_df))

        # with the polymer type level dropped, each row label is just the molecule name
        for mol_name, mol_paths in ptype_df.droplevel(0).iterrows():
            progress.update(task2, description=f'Molecule : {mol_name}')
            pdb_path, mono_path = mol_paths.loc['PDB Path'], mol_paths.loc['Monomer Path']

            progress.reset(task3)
            fw_pdb_objs, fw_loadable = {}, {}
            # run every registered tester against this molecule
            for framework, PDBLoadClass in PDBLoadTester.subclass_registry.items():
                progress.update(task3, description=f'Framework : {framework}')
                pdb_loader = PDBLoadClass() # instantiate generic class
                pdb_obj = pdb_loader.load_pdb_obj(pdb_path, mono_path)

                # a tester signals failure by returning None
                fw_loadable[f'{framework} Loadable?' ] = (pdb_obj is not None)
                fw_pdb_objs[f'{framework} PDB Object'] = pdb_obj
                progress.advance(task3)
                progress.refresh()

            loadable_dict[(poly_type, mol_name)] = fw_loadable
            pdb_obj_dict[ (poly_type, mol_name)] = fw_pdb_objs
            progress.advance(task2)
            progress.refresh()

        progress.advance(task1)
        progress.refresh()

## Creating DataFrames for tabular reference

In [None]:
# Table of booleans : one row per (polymer type, molecule), one "Loadable?" column per framework,
# placed alongside the inventory columns and written out as csv
loadable_df = pd.DataFrame(loadable_dict).T.rename_axis(index=idx_cols)
loadable_df = pd.concat([pdb_df, loadable_df], axis=1)
loadable_df.to_csv(out_dir / 'pdbs_loadable.csv')

# Same layout, but holding the actual PDB-objects by platform (including NoneType); kept in memory only
pdb_obj_df = pd.DataFrame(pdb_obj_dict).T.rename_axis(index=idx_cols)
pdb_obj_df = pd.concat([pdb_df, pdb_obj_df], axis=1)