## Examples on how to screen for similar molecules in Enamine REAL space

In [None]:
%load_ext nb_black

In [None]:
from pathlib import Path
import os
import sys
from distutils.spawn import find_executable
import subprocess

import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from IPython.core.display import HTML

### Requirements:
* [FTrees](https://www.biosolveit.de/FTrees/) tool from BioSolveIT
* [REAL space](https://www.biosolveit.de/CoLibri/spaces.html#realspace) to find synthesizable compounds from Enamine

FTrees command-line help:

```
 Available options:

 Program options:
  -i [ --input ] arg                   Input query molecule file or single input molecule as smiles.
  -s [ --searchFiles ] arg             Paths to library input molecule files for similarity scoring or to
                                       Fragment Space FSF files or Fragment Spaces.
                                       Note: The .flf and fragment files specified in the FSF have to be in
                                       the appropriate relative paths.
  -o [ --outputFiles ] arg             Output base files (suffixes are required). Only '.sdf' and '.csv' are
                                       supported.
  -m [ --matchImageBaseFile ] arg      Output base file name for matching images (suffix required).
                                       The file formats '.pdf', '.png' and '.svg' are supported.
                                       Note: For each match a separate file is created.
  --gen2dOutput arg (=0)               Generates 2d coordinates in case of SDF output files.
                                       Note: Can't be used together with '--gen3dOutput'.
  --gen3dOutput arg (=0)               Generates 3d coordinates in case of SDF output files.
                                       Note: Can't be used together with '--gen2dOutput'.
  --thread-count arg                   Maximum number of threads used for calculations.

 Configuration:
  --comparisonAlgorithm arg (=1)       Feature Tree comparison algorithm
                                           0: Split-Search
                                           1: Match-Search
                                           2: Multi-Match-Search
  --expandAlternativeResults arg (=0)  Write alternative results based on alternative reaction paths or
                                       duplicate matchings.
  --maxNofResults arg (=100)           Maximum number of top-ranking result molecules [1 to 1000000].
  --minSimilarityThreshold arg (=0.8)  Similarity threshold below which molecules are discarded [0.0 to 1.0].
  --targetSimilarity arg (=1)          Desired target similarity to the query molecule [0.5 to 1.0].
                                       Note: Must be >= '--minSimilarityThreshold'
  --totalDiversity arg (=1)            Required diversity between any two compounds in a solution set [0.9 to
                                       1.0].
                                       Note: Only available if --maxNofResults' is <= 500.
                                       WARNING: any value below 1.0 drastically extends the run time.

 Deprecated options:
  -l [ --library ] arg                 Library input molecule files to calculate similarity score with.
                                       Note: Can't be used together with '--searchFiles'.
  -f [ --fragSpace ] arg               Paths to the Fragment Space FSF files or Fragment Spaces.
                                       Note: The .flf and fragment files specified in the FSF have to be in
                                       the appropriate relative paths.
                                       Note: Can't be used together with '--searchFiles'.

 General options:
  -h [ --help ]                        Prints this help message
  --version                            Prints version info
  --license-info                       Prints license info
  -v [ --verbosity ] arg (=2)          Set verbosity level
                                            0 [silent]
                                            1 [error]
                                            2 [warning]
                                            3 [workflow]
                                            4 [steps]
```

### Define input path for binary and library

Define a constant that points to the `data/` folder using an absolute path on the machine running the notebook (independent of the current working directory).

In [None]:
# `_dh` is provided by the IPython session (its directory history); the first
# entry is presumably the directory the notebook was started in -- confirm.
# The data folder sits two levels above that directory.
DATADIR = Path(_dh[0]).parent.parent.joinpath("data")
print(DATADIR)

In [None]:
# Locate the FTrees executable.
# Replace `path_to_binary` with your absolute path if the location is not standard.
import shutil

# NOTE: sys.platform is "win32" on Windows (never "windows"), so the previous
# comparison against "windows" could not succeed and this branch was dead code.
if sys.platform == "win32":
    # Default per-user install location on Windows. A missing LOCALAPPDATA
    # yields a path that fails the isfile() check below, so the user gets the
    # same ValueError as for any other missing binary instead of a KeyError.
    path_to_binary = (
        rf"{os.environ.get('LOCALAPPDATA', '')}\Programs\BioSolveIT\FTrees-6.0\FTrees.exe"
    )
else:
    # Search PATH; returns None when no matching executable is found.
    # shutil.which replaces the deprecated distutils.spawn.find_executable.
    path_to_binary = shutil.which("Ftrees")

# Fail early with a clear message rather than at subprocess call time.
if not path_to_binary or not os.path.isfile(path_to_binary):
    raise ValueError("FTrees program cannot be found")

# Fragment space file to search, expected inside the data folder.
fspace = "REALspace_2019-12.space"
library = DATADIR / fspace

### Set command line options

In [None]:
# Values for the FTrees command-line flags used in the examples below.

# --maxNofResults: how many top-ranking hits to keep (allowed: 1 to 1000000).
max_nof_results: int = 50

# --minSimilarityThreshold: hits scoring below this are dropped (0.0 to 1.0).
min_sim_thresh: float = 0.8

# --targetSimilarity: similarity aimed for (0.5 to 1.0); it must not be
# smaller than the minimum similarity threshold above.
target_sim: int = 1

# --totalDiversity: required diversity between any two returned compounds
# (0.9 to 1.0). Per the help text, values below 1.0 drastically extend run time.
total_div: float = 0.95

# --gen3dOutput: 1 writes 3D coordinates to SDF output, 0 switches it off.
gen_3d_output: int = 1

### Example 1: Run for an individual SMILES

In [None]:
# Query molecule, passed to FTrees directly as a SMILES string.
smi: str = "CC(=O)NCCC1=CNC2=C1C=C(C=C2)OC"

# Base name of the output file; the '.sdf' suffix is appended where the
# command line is assembled (FTrees supports only '.sdf' and '.csv' output).
outfile_name: str = "ex1_out"

### Prepare system call

In [None]:
# Assemble the FTrees command line as a flat argument list:
# the executable first, then one (flag, value) pair per option.
# Values are left as-is here (Path, int, float); they are converted to
# strings when the command is executed.
sys_call = [path_to_binary]
sys_call += ["--input", smi]
sys_call += ["--searchFiles", library]
sys_call += ["--maxNofResults", max_nof_results]
sys_call += ["--outputFiles", DATADIR / f"{outfile_name}.sdf"]
sys_call += ["--minSimilarityThreshold", min_sim_thresh]
sys_call += ["--targetSimilarity", target_sim]
sys_call += ["--totalDiversity", total_div]
sys_call += ["--gen3dOutput", gen_3d_output]

# Show the full command for inspection before running it.
print(*sys_call)

In [None]:
# subprocess needs string arguments, so stringify every entry (paths and
# numbers included). check_output raises CalledProcessError on a non-zero
# exit status and otherwise returns the captured stdout.
out = subprocess.check_output(list(map(str, sys_call)))
out

### Collect results 

In [None]:
# The results are read from "<outfile_name>_1.sdf" -- presumably FTrees adds
# a "_<n>" index per query molecule to the requested output name (confirm).
load_file = str(DATADIR.joinpath(f"{outfile_name}_1.sdf"))
df = PandasTools.LoadSDF(load_file)
# Preview the first rows; for the full table use display(HTML(df.to_html())).
df.head()

### Example 2: Run for protease set

In [None]:
# get protease sdf file
protease_sdf = str(DATADIR / "proteaseFDAdrugs_3D.sdf")
# Output base files (suffixes are required). Only '.sdf' and '.csv' are supported.
outfile_name = "ex2_out"
# generate more results
max_nof_results = 100

In [None]:
# Read the query molecules into a DataFrame and render the complete table
# (not just the head) as HTML in the notebook output.
df_tmp = PandasTools.LoadSDF(protease_sdf)
display(HTML(data=df_tmp.to_html()))

### Prepare system call

In [None]:
# Assemble the FTrees command line as a flat argument list:
# the executable first, then one (flag, value) pair per option.
# This time the input is an SDF file path instead of a single SMILES string.
sys_call = [path_to_binary]
sys_call += ["--input", protease_sdf]
sys_call += ["--searchFiles", library]
sys_call += ["--maxNofResults", max_nof_results]
sys_call += ["--outputFiles", DATADIR / f"{outfile_name}.sdf"]
sys_call += ["--minSimilarityThreshold", min_sim_thresh]
sys_call += ["--targetSimilarity", target_sim]
sys_call += ["--totalDiversity", total_div]
sys_call += ["--gen3dOutput", gen_3d_output]

# Show the full command for inspection before running it.
print(*sys_call)

In [None]:
# run
out = subprocess.check_output([str(x) for x in sys_call])
out

In [None]:
# Load the hits written to "<outfile_name>_1.sdf" and render the complete
# table as HTML in the notebook output.
df2 = PandasTools.LoadSDF(str(DATADIR.joinpath(f"{outfile_name}_1.sdf")))
display(HTML(data=df2.to_html()))

In [None]:
# Draw the molecules from the ROMol column as an image grid, five per row.
Draw.MolsToGridImage(df2["ROMol"], molsPerRow=5)