Rachael needs the aminopyridine series mergers only.

https://fragalysis.diamond.ac.uk/api/targets/?title=Mpro


So in `Mpro/metadata.csv` the hits of interest are those whose site name is `A1 - Aminopyridine-like` (site 5).

I am told that none of them are covalent —and that I should not make them covalent—, so the models already there are fine.

In [None]:
# Download the Mpro target archive from Fragalysis and save it locally as Mpro_Nov20.zip.
! curl https://fragalysis.diamond.ac.uk/media/targets/Mpro.zip --output Mpro_Nov20.zip

In [None]:
# Unpack the downloaded archive into the Mpro_Nov20/ directory.
! unzip Mpro_Nov20.zip -d Mpro_Nov20

In [None]:
from rdkit import Chem
def loadmol(file):
    """
    Read a single molecule from a mol/sdf file and make sure it has a name.

    :param file: path to the ``.mol`` or ``.sdf`` file.
    :return: the RDKit molecule; when the ``_Name`` property read from the
        file is empty, it is set to the file name without its extension.
    :raises ValueError: if RDKit cannot parse the file (``MolFromMolFile``
        returned ``None``).
    """
    mol = Chem.MolFromMolFile(file)
    if mol is None:
        # fail here, naming the offending path, rather than with an opaque
        # AttributeError on ``None.GetProp`` below.
        raise ValueError(f'RDKit could not parse a molecule from {file}')
    if mol.GetProp('_Name') == '':
        stem = file.split('/')[-1]
        # strip only a trailing extension, not every occurrence within the name
        for extension in ('.sdf', '.mol'):
            if stem.endswith(extension):
                stem = stem[:-len(extension)]
                break
        mol.SetProp('_Name', stem)
    return mol

In [None]:
import pandas as pd
from IPython.display import display

# Table describing the crystal structures shipped in the unpacked archive.
metadata = pd.read_csv('Mpro_Nov20/metadata.csv')
# Select the crystals annotated with the aminopyridine-like site, turn each
# crystal name into the path of its aligned sdf file and load the molecule.
in_series = metadata.site_name == 'A1 - Aminopyridine-like'
hit_paths = metadata.loc[in_series].crystal_name.apply(
    lambda name: f'Mpro_Nov20/aligned/{name}/{name}.sdf'
)
hits = hit_paths.apply(loadmol).values
len(hits)

In [None]:
##############################################
# User settings
cores = 25  # number of worker processes given to the multiprocessing pool
out_path = 'aminopyridine'  # project name handed to process(): work folder, log file and database prefix
# results database; process() derives its own database name as f'{project}.sqlite',
# so this must stay equal to f'{out_path}.sqlite'
db_name = 'aminopyridine.sqlite'
##############################################

from sqlitedict import SqliteDict
import json, os
# open (or create) the results store; values are stored as JSON
results = SqliteDict(db_name, encode=json.dumps, decode=json.loads, autocommit=True)
# to start afresh delete the file by its path, i.e. os.remove(db_name) (not the SqliteDict object)
# guard against appending to the results of a previous run
assert len(results) == 0

In [None]:
from typing import Dict, List, Union

def process(data: Dict[str, Union[str, dict]]):
    """
    Merge a set of hits with Fragmenstein and store the outcome in the project database.

    Everything the function needs is imported inside it and the hits arrive as
    mol blocks (plain strings), so the payload can be handed to a worker process.

    :param data: dictionary with two keys:

        * ``project``: name used for the work folder, the log file
          (``{project}.log``) and the results database (``{project}.sqlite``).
        * ``hit_blocks``: dictionary mapping hit name to mol block.
    :return: the minimised merger (``v.minimised_mol``), or ``None`` if an
        exception was raised while merging. In the latter case
        ``{'error': ...}`` is stored in the database under the hyphen-joined
        hit names and the error is logged.
    """
    # read data
    project = data['project']
    db_name = f'{project}.sqlite'
    hit_blocks = data['hit_blocks']
    name = '-'.join(hit_blocks.keys())
    print('**********', name)
    # imports
    import pyrosetta, logging
    pyrosetta.distributed.maybe_init(extra_options='-no_optH false -mute all -ignore_unrecognized_res true -load_PDB_components false')
    from fragmenstein.mpro import MProVictor
    from sqlitedict import SqliteDict
    import json
    # fix hits: rebuild the RDKit molecules from their mol blocks and restore their names
    from rdkit import Chem
    hits = []
    for hit_name, hit_block in hit_blocks.items():
        hit = Chem.MolFromMolBlock(hit_block)
        hit.SetProp('_Name', hit_name)
        hits.append(hit)
    # settings for Fragmenstein
    MProVictor.work_path = f'{project}'  # db_name
    MProVictor.fragmenstein_throw_on_discard= True
    MProVictor.fragmenstein_joining_cutoff = 5 # 10
    MProVictor.quick_renanimation = False
    MProVictor.error_to_catch = Exception
    MProVictor.enable_stdout(logging.ERROR)
    MProVictor.enable_logfile(f'{project}.log', logging.INFO)
    MProVictor.log_errors()
    # analyse
    try:
        v = MProVictor.combine(hits=hits)
        # the context manager closes the connection once the entry is written
        with SqliteDict(db_name, encode=json.dumps, decode=json.loads, autocommit=True) as results:
            results[v.long_name] = v.summarise()
        if not v.error:
            v.make_pse()
        print('DONE', [hit.GetProp('_Name') for hit in hits])
        return v.minimised_mol
    except Exception as error:
        # ConnectionError is a subclass of Exception, so it is handled here too:
        # a separate ``except ConnectionError`` placed after this clause can never run.
        error_msg = f'{error.__class__.__name__} {error}'
        with SqliteDict(db_name, encode=json.dumps, decode=json.loads, autocommit=True) as results:
            results[name] = {'error': error_msg}
        # report the hit names; the previous message referenced an undefined variable
        MProVictor.journal.critical(f'*** {error_msg}, files: {name}')
    return None

In [None]:
# get stuff started
from multiprocessing import Pool
import itertools, random, re
# `cores` worker processes; with maxtasksperchild=1 each worker exits after
# completing one task and is replaced by a fresh process.
pool = Pool(cores, maxtasksperchild=1)

In [None]:
# new
# Keys already present in the database (kept for inspection; not used to filter the jobs below).
results = SqliteDict(db_name, encode=json.dumps, decode=json.loads, autocommit=True)
done = list(results.keys())

# Every ordered pair of distinct hits: (a, b) and (b, a) are separate jobs.
to_do = list(itertools.permutations(hits, 2))
random.shuffle(to_do)
print(len(to_do))
# Keep a handle on every submitted job, not only the last one, so that the
# outcome of each job can be inspected afterwards.
async_results = []
for pair in to_do:
    # the hits are sent as mol blocks (strings) keyed by hit name
    async_result = pool.apply_async(process, ({'project': out_path,
                                              'hit_blocks': {hit.GetProp('_Name'): Chem.MolToMolBlock(hit) for hit in pair}
                                              },))
    async_results.append(async_result)

In [None]:
# peek at the pool's private job cache (presumably the jobs not yet collected; private API — TODO confirm)
pool._cache

In [None]:
# status of the most recently submitted job only; successful() raises if that job has not finished yet
async_result.successful()

In [None]:
# Run one merger in this process (no pool) as a sanity check.
# process() expects a dictionary with the project name and the hits as
# mol blocks keyed by name, not a list of molecules.
# It returns None if the merger failed.
mol = process({'project': out_path,
               'hit_blocks': {hit.GetProp('_Name'): Chem.MolToMolBlock(hit) for hit in hits[:2]}})
import nglview

nglview.show_rdkit(mol)

In [None]:
# open the results database so the notebook displays it
SqliteDict(db_name, encode=json.dumps, decode=json.loads, autocommit=True)

In [None]:
from sqlitedict import SqliteDict
from rdkit.Chem import PandasTools
import json
import pandas as pd
from fragmenstein import Victor

# same folder name that process() uses as work path; get_mol3D reads the minimised mol files from it
Victor.work_path = out_path

import numpy as np
import os
from rdkit import Chem
from rdkit.Chem import AllChem
from scipy.stats import skewnorm, gennorm


def old_ranker(row):
    """
    Earlier scoring function (presumably lower is better, as for ∆∆G).

    score = ∆∆G / 5 + comRMSD + N_unconstrained_atoms / 5 - N_constrained_atoms / 10

    :param row: a row of the results table: anything supporting ``row['∆∆G']``
        and attribute access to the other three fields.
    :return: the score, or ``nan`` if a field is missing or not numeric.
    """
    try:
        return (float(row['∆∆G']) / 5
                + float(row.comRMSD)
                + row.N_unconstrained_atoms / 5
                - row.N_constrained_atoms / 10)
        # alternative tried:
        #return float(row['∆∆G'])/(row.N_unconstrained_atoms + row.N_constrained_atoms * 0.5)*10 + float(row.comRMSD)
    except (AttributeError, KeyError, TypeError, ValueError):
        # only data problems are swallowed; a bare ``except`` would also
        # swallow KeyboardInterrupt and SystemExit.
        return float('nan')
    

# weights of the four terms summed by ranker()
rank_weights = {'LE': 1., 'comRMSD': 2., 'atom_bonus': 2. , 'novelty_penalty': 5.}
def ranker(row):
    """
    Weighted score of a merger; lower is better (the bonus is subtracted, the penalty added).

    score = w_LE * LE + w_comRMSD * comRMSD - w_atom_bonus * atom_bonus
            + w_novelty_penalty * novelty_penalty

    * ``atom_bonus`` is a generalised normal density (shape 5) evaluated at
      ``(N_constrained_atoms**2 - 25**2) / 500`` and divided by
      0.5445622105291682 (the value of that density at 0), so it equals 1 at
      25 constrained atoms.
    * ``novelty_penalty`` is ``N_unconstrained_atoms / N_constrained_atoms``.

    :param row: a row of the results table with ``LE``, ``comRMSD``,
        ``N_constrained_atoms`` and ``N_unconstrained_atoms`` attributes.
    :return: the score, or ``nan`` if a field is missing, not numeric, or the
        division by the number of constrained atoms raises.
    """
    try:
        # alternatives tried for the atom bonus:
        #atom_bonus = row.N_constrained_atoms / (20 + row.N_constrained_atoms)
        #atom_bonus = skewnorm.pdf((row.N_constrained_atoms - 20)/8, 3)
        zeta = (row.N_constrained_atoms**2 - 25**2)/500
        atom_bonus = gennorm.pdf(zeta, 5) / 0.5445622105291682
        novelty_penalty = row.N_unconstrained_atoms / row.N_constrained_atoms
        return (rank_weights['LE'] * float(row.LE)
                + rank_weights['comRMSD'] * float(row.comRMSD)
                - rank_weights['atom_bonus'] * atom_bonus
                + rank_weights['novelty_penalty'] * novelty_penalty)
    except (AttributeError, KeyError, TypeError, ValueError, ZeroDivisionError):
        # only data problems are swallowed; a bare ``except`` would also
        # swallow KeyboardInterrupt and SystemExit.
        return float('nan')
    
def LE(row):
    """
    Ligand efficiency: ∆∆G divided by the sum of unconstrained and constrained atom counts.

    :param row: a row of the results table supporting ``row['∆∆G']`` and the
        ``N_unconstrained_atoms`` / ``N_constrained_atoms`` attributes.
    :return: the ligand efficiency, or ``nan`` if a field is missing, not
        numeric, or the division raises.
    """
    try:
        ddG = float(row['∆∆G'])
        n_atoms = row.N_unconstrained_atoms + row.N_constrained_atoms
        return ddG / n_atoms
    except (AttributeError, KeyError, TypeError, ValueError, ZeroDivisionError):
        # only data problems are swallowed; a bare ``except`` would also
        # swallow KeyboardInterrupt and SystemExit.
        return float('nan')

def get_mol3D(name):
    """
    Load the minimised 3D structure of a merger from Victor's work folder.

    :param name: merger name; the file read is
        ``{Victor.work_path}/{name}/{name}.minimised.mol``.
    :return: the sanitised RDKit molecule, or ``None`` when the file does not
        exist, cannot be parsed, or reading/sanitising raises (the error is
        printed in that case).
    """
    mol_file = os.path.join(Victor.work_path, name, name+'.minimised.mol')
    # guard clause: nothing on disk, nothing to load
    if not os.path.exists(mol_file):
        return None
    try:
        mol = Chem.MolFromMolFile(mol_file, sanitize=True)
        if mol is not None:
            Chem.SanitizeMol(mol, sanitizeOps=Chem.SanitizeFlags.SANITIZE_ALL)
        return mol
    except Exception as error:
        print(f'{type(error)}: {error}')
        return None

    

def get_table(db_name, mols=True, mol_only=True):
    """
    Read the results database into a ranked pandas table.

    Columns added to the stored entries:

    * ``LE``: see :func:`LE`.
    * ``%Rank``: rank of the :func:`ranker` score, scaled so that the largest rank is 100.
    * ``N_hits``: length of the ``regarded`` entry (``nan`` where it is missing).
    * ``mol3D`` / ``mol2D`` (only if ``mols``): molecule loaded by
      :func:`get_mol3D` and molecule built from the ``smiles`` column.

    Prints the number of entries and the number of entries with a ∆∆G value.

    :param db_name: path of the sqlite file written by the merging jobs.
    :param mols: whether to add the ``mol3D`` and ``mol2D`` columns.
    :param mol_only: if ``mols`` is true, drop the rows without a ``mol3D``.
    :return: the table, without the rows lacking a ``smiles`` value, sorted by ``%Rank``.
    """
    # read everything into memory and close the connection straight away
    # instead of leaving it open for the lifetime of the kernel
    with SqliteDict(db_name, encode=json.dumps, decode=json.loads, autocommit=True) as results:
        result_table = pd.DataFrame(list(results.values()))
    print(len(result_table), sum(~result_table['∆∆G'].isna()))
    result_table['LE'] = result_table.apply(LE,1)
    rank = result_table.apply(ranker, axis=1).rank()
    # nan scores get a nan rank, hence nanmax
    m = np.nanmax(rank.values)
    result_table['%Rank'] = rank / m * 100
    result_table['N_hits'] = result_table.regarded.apply(lambda x: len(x) if str(x) != 'nan' else float('nan'))
    result_table = result_table.loc[~result_table.smiles.isna()].sort_values(['%Rank'], axis=0)
    if mols:
        result_table['mol3D'] = result_table['name'].apply(get_mol3D)
        #result_table['mol2D'] = result_table['name'].apply(get_mol2D)
        PandasTools.AddMoleculeColumnToFrame(result_table,'smiles','mol2D')
        if mol_only:
            result_table = result_table.loc[~result_table.mol3D.isna()]
    return result_table


In [None]:
# quick look at the first four loaded hits
hits[:4]

In [None]:
# atom count of each hit, keyed by hit name.
# NOTE(review): GetNumAtoms counts the atoms present in the molecule graph, so this is
# the heavy-atom count only if the hits carry no explicit hydrogens — TODO confirm
atom_Ns = {hit.GetProp('_Name'): hit.GetNumAtoms() for hit in hits}

In [None]:
# Ranked table of the mergers, with molecule columns added.
result_table = get_table(db_name, mols=True)


def hit_counter(hits):
    """Sum the atom counts stored in ``atom_Ns`` for the given hit names."""
    return sum(atom_Ns[hit] for hit in hits)


def merge_counter(row):
    """Atoms in the hits minus the atoms (unconstrained and constrained) of the merger."""
    return row.N_hit_atoms - row.N_unconstrained_atoms - row.N_constrained_atoms

In [None]:
# N_hit_atoms: summed atom count of the hits named in each row's `regarded` entry
result_table = result_table.assign(N_hit_atoms=result_table.regarded.apply(hit_counter))
# N_diff_atoms: hit atoms minus merger atoms; needs the N_hit_atoms column created just above
result_table = result_table.assign(N_diff_atoms=result_table.apply(merge_counter, axis='columns'))

In [None]:
from typing import List
# lookup of the loaded hits by name
hits_dex = {hit.GetProp('_Name'): hit for hit in hits}
def is_same_as_parent(query: Chem.Mol, parent_names: List[str]):
    """
    Tell whether a merger is found, as a substructure, within one of its parent hits.

    Hydrogens are removed from both the merger and the parents before matching.

    :param query: the merger, or ``None``.
    :param parent_names: names of the parent hits; each must be a key of ``hits_dex``.
    :return: ``False`` if ``query`` is ``None``; otherwise ``True`` if at least
        one parent contains ``query`` as a substructure.
    """
    if query is None:
        return False
    stripped_query = Chem.RemoveHs(query,implicitOnly=False)
    # all parents are prepared up front, before any matching is attempted
    stripped_parents = []
    for parent_name in parent_names:
        stripped_parents.append(Chem.RemoveHs(hits_dex[parent_name], implicitOnly=False))
    for parent in stripped_parents:
        if parent.HasSubstructMatch(stripped_query):
            return True
    return False

In [None]:
# flag the mergers that are merely a substructure of one of the hits listed in their `regarded` entry
result_table = result_table.assign(is_same_as_parent=result_table.apply(lambda row: is_same_as_parent(row.mol3D, row.regarded), axis=1))

In [None]:
# how many mergers are / are not just a substructure of a parent hit
result_table.is_same_as_parent.value_counts()

In [None]:
# Select what gets sent out: mergers that differ from their parents,
# have a 3D structure and a favourable (negative) ∆∆G.
outgoing = result_table.loc[~result_table.is_same_as_parent]
# `column != None` is True for every element in pandas, so it filters nothing;
# notna() is the test for a missing molecule. The mask is also built from the
# frame being filtered rather than from result_table.
outgoing = outgoing.loc[outgoing.mol3D.notna()]
outgoing = outgoing.loc[outgoing['∆∆G'] < 0]
# comma-separated names of the reference hits, written to the SDF as a property
outgoing = outgoing.assign(ref_mols=outgoing.regarded.apply(','.join))
len(outgoing)

* **smiles**: molecule smiles used
* **%Rank**: percentile rank (lower is better) of a simple weighted sum of a few terms: 1x LE + 2x comRMSD - 2x atom_bonus (scaled generalised normal distribution) + 5x novelty penalty (ratio of N unconstrained over constrained)
* **ref_mols**: all reference molecules
* **∆∆G**: Difference in Gibbs Free energy relative to unbound molecule in kcal/mol (ref2015 scorefxn; negative=Good)
* **∆G_bound**: Gibbs Free energy of ligand bound
* **∆G_unbound**: Gibbs Free energy of ligand unbound
* **LE**: Ligand efficiency (kcal/mol/N_heavy)
* **comRMSD**: Combined RMSD from the atoms of the fragments that contributed to the position of the followup
* **N_constrained_atoms**: Number of atoms in the submission that were constrained
* **N_unconstrained_atoms**: Number of heavy atoms in the submission that were NOT constrained
* **N_diff_atoms**: Number of heavy atoms in the hits minus the number of heavy atoms in the merger (negative: atoms added, positive: atoms merged)

In [None]:
from rdkit.Chem import PandasTools
# Write the selected mergers to '<out_path>.sdf', with their scores as SD properties.
sdfile = f'{out_path}.sdf'
sdf_properties = [
    'smiles', '%Rank', 'ref_mols',
    '∆∆G', '∆G_bound', '∆G_unbound', 'LE',
    'comRMSD', 'N_constrained_atoms', 'N_unconstrained_atoms', 'N_diff_atoms',
]
PandasTools.WriteSDF(outgoing, sdfile,
                     molColName='mol3D',
                     idName='name',
                     properties=sdf_properties,
                     allNumeric=False)