# KALRNA: Generating initial follow-up candidates

In [None]:
import rdkit
from rdkit import Chem
import glob
from joblib import Parallel, delayed
import multiprocessing
import argparse, os, gzip
import operator
import numpy as np
from rdkit.Chem import AllChem, rdShapeHelpers
from rdkit.Chem.FeatMaps import FeatMaps
from rdkit import RDConfig

## 1. Finding enamine molecules
In order to find potential follow-up molecules that are available to purchase from Enamine, the REALSpaceNavigator tool (https://enamine.net/library-synthesis/real-compounds/real-space-navigator) was used.

For each hit (downloaded from Fragalysis), two sets of candidate follow-up molecules were obtained using the default 'more similar' and 'more surprise' settings in REALSpaceNavigator. Each set of 1000 candidates was downloaded as an sdf file of 0D molecules.

## 2. Generating 3D conformations for candidates
For each of the 2 sets of follow-up candidates, 'surprise' and 'similar', for each fragment hit, 100 conformations of each candidate molecule were generated. These conformations were generated with the ETKDG method (implemented in RDKit), which uses torsion angle preferences from the Cambridge Structural Database (CSD) to correct the conformers after distance geometry has been used to generate them.

Following the generation of 100 conformers for each candidate, the conformers for each candidate were aligned to each other. 

Finally, the conformations were written out to sdf files for each candidate.

### Code:

In [None]:
# generate n conformations of rdkit mol object m
def generateconformations(m, n):
    """Embed conformers of ``m`` with ETKDG and superimpose them on each other.

    m: RDKit mol object to generate conformers for.
    n: number of conformers requested from the embedding.

    Returns a tuple ``(mol, ids)``: the hydrogen-added molecule carrying the
    embedded conformers, and the list of conformer ids that were embedded.
    """
    # work on the molecule returned by AddHs (explicit hydrogens)
    with_hs = Chem.AddHs(m)
    # request n conformers using the ETKDG parameter set
    conf_ids = AllChem.EmbedMultipleConfs(with_hs, numConfs=n, params=AllChem.ETKDG())
    # superimpose all conformers of the molecule on each other (return value unused)
    Chem.rdMolAlign.AlignMolConformers(with_hs)
    return with_hs, list(conf_ids)

# write multiple conformations of mol out to file name
def writeout_confs(name, mol, ids):
    """Write the selected conformers of ``mol`` to an sdf file.

    name: path of the sdf file to create.
    mol: RDKit mol object holding the conformers.
    ids: iterable of conformer ids; one record is written per id, in order.
    """
    writer = Chem.SDWriter(name)
    try:
        for conf_id in ids:
            writer.write(mol, confId=conf_id)
    finally:
        # close explicitly so every record is flushed to disk before returning,
        # rather than whenever the writer object happens to be garbage-collected
        writer.close()

# take an input sdf of multiple candidates (0D) and generate multiple conformations written out to sdf files
def confs_and_write(candidates_file, no_confs):
    """Generate conformers for every candidate in an sdf file.

    candidates_file: path of the sdf file holding the candidate molecules.
    no_confs: number of conformers to generate per candidate.

    For the candidate at index i the conformers are written to
    ``<candidates_file without .sdf>_<i>_confs.sdf``. Candidates whose output
    file already exists are skipped, so an interrupted run can be resumed.
    Records that RDKit could not parse are reported and skipped.
    """
    # read in all candidate mols (unparsable records come back as None)
    mols = list(Chem.SDMolSupplier(candidates_file))
    total = len(mols)
    # strip only the trailing extension; str.replace would also rewrite any
    # '.sdf' that appears earlier in the path (e.g. in a directory name)
    if candidates_file.endswith('.sdf'):
        stem = candidates_file[:-len('.sdf')]
    else:
        stem = candidates_file
    # for every candidate, keeping the index of the mol in the initial file
    for i, candidate in enumerate(mols):
        # name out file by index of mol from initial file
        out_file = stem + '_' + str(i) + '_confs.sdf'
        # if done, move onto next molecule
        if os.path.isfile(out_file):
            continue
        # a None entry cannot be embedded; skip it instead of crashing the run
        if candidate is None:
            print('skipping ' + candidates_file + ': candidate ' + str(i) + ' could not be parsed')
            continue
        print('pocessing ' + candidates_file + ': candidate ' + str(i) + '(total=' + str(total) +')')
        # generate n=no_confs conformers and write to out_file
        mol, ids = generateconformations(candidate, no_confs)
        writeout_confs(name=out_file, mol=mol, ids=ids)
            
# confs and write when -similar-rsn and -surprise-rsn exist (from REALSpaceNavigator search)
def process_confs(f, no_confs):
    """Generate conformers for the follow-up candidate files of one hit.

    f: path of the hit directory; the candidate files are looked up as
       ``<f>/<last path component of f>-similar-rsn.sdf`` and
       ``<f>/<last path component of f>-surprise-rsn.sdf``.
    no_confs: number of conformers to generate per candidate.
    """
    hit_name = f.split('/')[-1]
    # resolve both candidate files before any processing starts
    found = [
        glob.glob(f + '/' + hit_name + suffix)
        for suffix in ('-similar-rsn.sdf', '-surprise-rsn.sdf')
    ]
    # similar set first, then surprise set; a missing file is silently ignored
    for matches in found:
        if matches:
            confs_and_write(matches[0], no_confs)

        
# number of conformers to generate for each candidate
no_confs = 100
# every XX02* entry in the current working directory is treated as a hit directory
hit_files = glob.glob(os.getcwd() + '/XX02*')

# one job per core; each hit directory is handled by its own process_confs call
num_cores = multiprocessing.cpu_count()
Parallel(n_jobs=num_cores)(delayed(process_confs)(f=hit_dir, no_confs=no_confs) for hit_dir in hit_files)

## 3. Aligning candidate conformations to initial hit
For each candidate molecule, the conformations generated (see above) were aligned to the initial fragment hit using the O3A alignment method implemented in RDKit. O3A is an unsupervised alignment algorithm: two structures are aligned by matching the most similar pairs of atoms between them, where similarity is defined by the closeness of their MMFF94 atom types and charges.

### Code:

In [None]:
# 3D align all conformations in sdf file (confs path) to reference mol (hit path) and save aligned confs to out_path
def align_and_write(confs_path, hit_path, out_path):
    """Align every conformer in ``confs_path`` onto the hit and save the result.

    confs_path: sdf file with the conformers to align (one record each).
    hit_path: sdf file whose first record is used as the alignment reference.
    out_path: sdf file the aligned conformers are written to.

    Raises ValueError if the first record of ``hit_path`` cannot be parsed.
    Conformer records that cannot be parsed are reported and left out.
    """
    hit_suppl = Chem.SDMolSupplier(hit_path)
    hit_mol = hit_suppl[0]
    # the supplier hands back None for a record it cannot parse
    if hit_mol is None:
        raise ValueError('could not read reference hit from ' + hit_path)

    records = list(Chem.SDMolSupplier(confs_path))
    mols = [mol for mol in records if mol is not None]
    if len(mols) != len(records):
        print('skipping ' + str(len(records) - len(mols)) + ' unreadable record(s) in ' + confs_path)

    # align everything before opening the output file, so a failed alignment
    # does not leave a partial out_path behind that would later look finished
    for mol in mols:
        o3a = Chem.rdMolAlign.GetO3A(prbMol=mol, refMol=hit_mol)
        o3a.Align()

    writer = Chem.SDWriter(out_path)
    try:
        for mol in mols:
            writer.write(mol)
    finally:
        # close explicitly so the records are flushed to disk before returning
        writer.close()
        
# handle whether to run align_and_write
def process_align(confs, hit_path=None):
    """Align one conformer file to the hit unless that was already done.

    confs: path of the sdf file holding the conformers of one candidate.
    hit_path: sdf file of the reference hit. When omitted, the module-level
        ``hit_file`` is used, which is how existing callers supply it.

    The aligned conformers go to ``<confs without .sdf>_aligned.sdf``; nothing
    is done when that file already exists.
    """
    # strip only the trailing extension; str.replace would also rewrite any
    # '.sdf' that appears earlier in the path (e.g. in a directory name)
    stem = confs[:-len('.sdf')] if confs.endswith('.sdf') else confs
    out_path = stem + '_aligned.sdf'
    # already aligned on a previous run
    if os.path.isfile(out_path):
        return
    if hit_path is None:
        # fall back to the global set by the driver loop
        hit_path = hit_file
    align_and_write(confs_path=confs, hit_path=hit_path, out_path=out_path)
        
hit_dirs = glob.glob('/Users/res3/michellab/XChem-examples/KALRNA/XX02*/')
# the core count does not change between directories, so look it up once
num_cores = multiprocessing.cpu_count()
for f in hit_dirs:
    # the hit sdf is named after its directory; f ends with '/', so the
    # directory name is the second-to-last element of the split
    hit_matches = glob.glob(str(f + f.split('/')[-2] + '.sdf'))
    # a directory without its hit sdf is reported and skipped, so it does not
    # abort the remaining directories with an IndexError
    if not hit_matches:
        print('skipping ' + f + ': no hit sdf found')
        continue
    # process_align reads this module-level name as the alignment reference
    hit_file = hit_matches[0]
    similar_confs_paths = glob.glob(hit_file.replace('.sdf', '-similar-rsn_*_confs.sdf'))
    surprise_confs_paths = glob.glob(hit_file.replace('.sdf', '-surprise-rsn_*_confs.sdf'))
    print(similar_confs_paths)
    Parallel(n_jobs=num_cores)(delayed(process_align)(confs=confs) for confs in similar_confs_paths)
    Parallel(n_jobs=num_cores)(delayed(process_align)(confs=confs) for confs in surprise_confs_paths)