In [1]:
from collections import namedtuple
from openpharmacophore.algorithms.alignment import apply_radii_to_bounds, transform_embeddings
import openpharmacophore as oph

from rdkit import Chem, RDConfig
from rdkit.Chem import ChemicalFeatures, rdDistGeom
from rdkit.Chem.Pharm3D import EmbedLib

import os
from multiprocessing import Pool, Queue
from operator import itemgetter





# Benchmark of Virtual Screening of a ZINC set

We will compute the average time it takes to screen the shards subset of the ZINC database. This set consists of approximately 1,200,000 molecules. For this, we will use a structure-based pharmacophore consisting of 4 pharmacophoric points.

The purpose of this notebook is to measure the time it takes to screen a small set of molecules, so that we can optimize the screening algorithms.

In [3]:
# Fetch the ZINC "shards" subset as SMILES files into a local directory.

download = False  # Set to True to download the subset
download_path = "./data/zinc/shards"

if download:
    zinc_client = oph.ZincClient()
    zinc_client.download_predifined_subset(
        download_path=download_path,
        subset="shards",
        fileformat="smi",
    )

## Obtain a pharmacophore

We derive a pharmacophore for the PDB entry 1M7O, and remove some of its pharmacophoric points so that the screening algorithm finds more molecules that fit the pharmacophore. If the pharmacophore has a lot of pharmacophoric points, the screening algorithm won't find any matches and it will run much faster, so we would not be able to compare its performance.

In [4]:
# Build a structure-based pharmacophore from PDB entry 1M7O, using the ligand with id 3PG:A:5401.
sb_pharmacophore = oph.StructuredBasedPharmacophore().from_pdb("1M7O", ligand_id="3PG:A:5401")
# Remove the elements at indices 0, 1, 3 and 4 so that fewer points have to be matched
# (the repr below reports 4 remaining elements).
sb_pharmacophore.remove_elements([0, 1, 3, 4])
sb_pharmacophore

StructuredBasedPharmacophore(n_elements: 4)

In [5]:
sb_pharmacophore.show()

NGLWidget()

In [9]:
# Create the screener for the pharmacophore derived above.
# NOTE(review): the recorded execution count of this cell (In [9]) is higher than that of the
# screening cells that follow (In [7], In [8]); on a fresh kernel it must run before them.
zinc_screener = oph.VirtualScreening(sb_pharmacophore)

In [7]:
# Screen the database stored in the download_path directory against the pharmacophore.
# The progress bar counts 136 items, presumably one per file -- TODO confirm.
zinc_screener.screen_db_from_dir(download_path)

  0%|          | 0/136 [00:00<?, ?it/s]

In [8]:
# Print the screener's report (scan/match counts, SSD statistics and the top 5 molecules).
zinc_screener.print_report()

Virtual Screening Results
-------------------------

Molecules scanned:                            1,254,835
Molecules matched to pharmacophore:              30,047
Molecules that didn't match the pharmacophore: 1,224,307
Lowest  SSD value:     0.1348
Highest SSD value:    21.6753
Average SSD value:     3.8425

Top 5 molecules:

   ID            SSD
-------       ------
42393881      0.1348
990021182     0.1518
601435566     0.1542
70451404      0.1556
82475853      0.1629



In [10]:
# Benchmark the single-process screen. Every run screens the whole directory again; the
# recorded output reports 7 runs of 1 loop each at ~27 min per run, so this cell takes hours.
%timeit zinc_screener.screen_db_from_dir(download_path)

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

27min 44s ± 17.5 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
# Print the report again after the timing runs; the recorded numbers are identical to the
# report printed before the benchmark.
zinc_screener.print_report()

Virtual Screening Results
-------------------------

Molecules scanned:                            1,254,835
Molecules matched to pharmacophore:              30,047
Molecules that didn't match the pharmacophore: 1,224,307
Lowest  SSD value:     0.1348
Highest SSD value:    21.6753
Average SSD value:     3.8425

Top 5 molecules:

   ID            SSD
-------       ------
42393881      0.1348
990021182     0.1518
601435566     0.1542
70451404      0.1556
82475853      0.1629



The dataset we are screening is stored in 136 different files, for a total of 1,254,835 molecules. On my computer it takes a while to run even though there are not many files. This suggests that the process may not be IO bound.

The average time on my computer was 27 min. In the next section we'll see whether the screening time improves by taking advantage of the multiple cores on my computer.

## Using multiprocessing

We will make use of the multiprocessing module to perform the same virtual screening that we did above. On my computer, with an Intel i7 processor (8 cores), a performance gain should be expected.

In [7]:
# Result record for one matched molecule: its fit score, its identifier and the molecule itself.
Match = namedtuple("Match", "score id mol")

def get_rdkit_pharmacophore(pharmacophore):
    """ Convert a pharmacophore to its rdkit representation.

    Parameters
    ----------
    pharmacophore
        Object exposing ``to_rdkit()``, which must return a pair
        ``(rdkit_pharmacophore, radii)``.

    Returns
    -------
    The rdkit pharmacophore returned by ``to_rdkit()``, after it has been
    passed to ``apply_radii_to_bounds`` together with the radii (presumably
    adjusting its distance bounds in place -- TODO confirm).
    """
    rdkit_ph, point_radii = pharmacophore.to_rdkit()
    apply_radii_to_bounds(point_radii, rdkit_ph)
    return rdkit_ph
        
# Module-level state read by align_molecule: the pharmacophore in rdkit form and the
# feature factory built from rdkit's BaseFeatures.fdef definitions.
rdkit_pharmacophore = get_rdkit_pharmacophore(sb_pharmacophore)
featFactory = ChemicalFeatures.BuildFeatureFactory(
    os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
)

def align_molecule(mol):
    """ Align a molecule to the module-level pharmacophore.

    Uses the module-level ``rdkit_pharmacophore`` and ``featFactory``.

    Parameters
    ----------
    mol : rdkit.Chem.Mol
        Molecule to align.

    Returns
    -------
    Match or None
        A ``Match(score, id, mol)`` for the embedding with the lowest SSD,
        where ``id`` is the molecule's ``_Name`` property (None if unset).
        None is returned when the molecule's features cannot match the
        pharmacophore, when the embedding fails, or when no SSD value is
        produced.
    """
    bounds_matrix = rdDistGeom.GetMoleculeBoundsMatrix(mol)
    # Check if the molecule features can match with the pharmacophore.
    # all_matches is a list of tuples where each tuple contains the chemical features
    can_match, all_matches = EmbedLib.MatchPharmacophoreToMol(mol, featFactory, rdkit_pharmacophore)
    if not can_match:
        return None

    # Match the molecule to the pharmacophore without aligning it
    failed, _, matched_mols, _ = EmbedLib.MatchPharmacophore(all_matches,
                                                             bounds_matrix,
                                                             rdkit_pharmacophore,
                                                             useDownsampling=True)
    if failed:
        return None

    atom_match = [list(x.GetAtomIds()) for x in matched_mols]
    try:
        mol_H = Chem.AddHs(mol)
        # Embed molecule onto the pharmacophore
        # embeddings is a list of molecules with a single conformer
        _, embeddings, _ = EmbedLib.EmbedPharmacophore(mol_H, atom_match, rdkit_pharmacophore, count=10)
    except Exception:
        # Deliberately best-effort: a molecule that cannot be embedded is
        # treated as a non-match instead of aborting the whole screen.
        return None

    # Align embeddings to the pharmacophore
    SSDs = transform_embeddings(rdkit_pharmacophore, embeddings, atom_match)
    if len(SSDs) == 0:
        return None
    best_fit_index = min(enumerate(SSDs), key=itemgetter(1))[0]

    try:
        mol_id = mol.GetProp("_Name")
    except KeyError:
        # The molecule carries no name property.
        mol_id = None

    # Hand the best match back to the caller; previously it was built and discarded.
    return Match(SSDs[best_fit_index], mol_id, embeddings[best_fit_index])

In [8]:
def get_files(path):
    """ Collect the path of every file below ``path`` into a multiprocessing queue.

    Directories whose path contains '.ipynb_checkpoints' are skipped.

    Parameters
    ----------
    path : str
        Root directory to walk.

    Returns
    -------
    multiprocessing.Queue
        Queue holding one full file path per file found.
    """
    pending = Queue()
    for dirpath, _subdirs, filenames in os.walk(path):
        if '.ipynb_checkpoints' not in dirpath:
            for filename in filenames:
                pending.put(os.path.join(dirpath, filename))

    return pending

# Sanity check: count the files that will be screened.
# NOTE: multiprocessing.Queue.qsize() is approximate and raises NotImplementedError on
# platforms such as macOS where sem_getvalue() is not implemented.
file_queue = get_files(download_path)
print(f"Number of files: {file_queue.qsize()}")

Number of files: 136


In [9]:
def screen_files(file_queue, timeout=0.1):
    """ Worker loop: take file paths from a queue and screen every molecule in them.

    Each file is read as whitespace-separated text whose first line is
    skipped and whose remaining lines hold a SMILES string followed by a
    molecule id. Every parsable molecule is passed to ``align_molecule``.

    Parameters
    ----------
    file_queue : queue-like
        Queue of file paths (e.g. a ``multiprocessing.Queue``) shared between
        the workers.
    timeout : float, optional
        Seconds to wait for a new path before deciding that the queue is
        drained and returning.
    """
    # Local import keeps this cell self-contained; both queue.Queue and
    # multiprocessing.Queue raise queue.Empty when get() times out.
    from queue import Empty

    while True:
        # Checking empty() and then calling a blocking get() is a race: another
        # worker can take the last path in between and this one would block
        # forever. A timed get() is safe with several consumers.
        try:
            file = file_queue.get(timeout=timeout)
        except Empty:
            return

        with open(file, "r") as fp:
            fp.readline()  # Skip the first line of the file
            for line in fp:
                fields = line.split()
                if len(fields) < 2:
                    # Blank or truncated line: nothing to screen.
                    continue
                smiles, zinc_id = fields[0], fields[1]
                molecule = Chem.MolFromSmiles(smiles)
                if molecule is None:
                    # rdkit returns None for SMILES it cannot parse.
                    continue
                molecule.SetProp("_Name", zinc_id)
                align_molecule(molecule)

In [10]:
def virtual_screening(path=None):
    """ Screen all files below ``path`` using one worker process per CPU.

    Parameters
    ----------
    path : str, optional
        Directory with the files to screen. Defaults to the notebook-level
        ``download_path`` so that this benchmark covers the same files as the
        single-process one.
    """
    if path is None:
        path = download_path

    file_queue = get_files(path)

    # screen_files is passed as the pool *initializer*: every worker runs it
    # once at start-up and consumes paths from the shared queue. No tasks are
    # ever submitted, so close() + join() simply wait for the workers to finish.
    pool = Pool(None, screen_files, (file_queue,))
    pool.close()
    pool.join()

In [13]:
# Benchmark the multiprocessing screen; the recorded output reports 7 runs of 1 loop each.
%timeit virtual_screening()

9min 7s ± 2.49 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


The average time is reduced significantly from 27min to 9min. This is more evidence that the process is CPU bound and we should expect better performance by taking advantage of multiple cores. 