## Examples on how to screen for similar molecules in Enamine REAL space

In [25]:
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [26]:
from pathlib import Path
import os
import sys
from distutils.spawn import find_executable
import subprocess

import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from IPython.core.display import HTML

<IPython.core.display.Javascript object>

### Requirements:
* [Ftree](https://www.biosolveit.de/FTrees/) tool from BioSolveIt
* [REAL space](https://www.biosolveit.de/CoLibri/spaces.html#realspace) to  find synthesizable compounds from Enamine

Ftrees commandline help:

```
 Available options:

 Program options:
  -i [ --input ] arg                   Input query molecule file or single input molecule as smiles.
  -s [ --searchFiles ] arg             Paths to library input molecule files for similarity scoring or to
                                       Fragment Space FSF files or Fragment Spaces.
                                       Note: The .flf and fragment files specified in the FSF have to be in
                                       the appropriate relative paths.
  -o [ --outputFiles ] arg             Output base files (suffixes are required). Only '.sdf' and '.csv' are
                                       supported.
  -m [ --matchImageBaseFile ] arg      Output base file name for matching images (suffix required).
                                       The file formats '.pdf', '.png' and '.svg' are supported.
                                       Note: For each match a separate file is created.
  --gen2dOutput arg (=0)               Generates 2d coordinates in case of SDF output files.
                                       Note: Can't be used together with '--gen3dOutput'.
  --gen3dOutput arg (=0)               Generates 3d coordinates in case of SDF output files.
                                       Note: Can't be used together with '--gen2dOutput'.
  --thread-count arg                   Maximum number of threads used for calculations.

 Configuration:
  --comparisonAlgorithm arg (=1)       Feature Tree comparison algorithm
                                           0: Split-Search
                                           1: Match-Search
                                           2: Multi-Match-Search
  --expandAlternativeResults arg (=0)  Write alternative results based on alternative reaction paths or
                                       duplicate matchings.
  --maxNofResults arg (=100)           Maximum number of top-ranking result molecules [1 to 1000000].
  --minSimilarityThreshold arg (=0.8)  Similarity threshold below which molecules are discarded [0.0 to 1.0].
  --targetSimilarity arg (=1)          Desired target similarity to the query molecule [0.5 to 1.0].
                                       Note: Must be >= '--minSimilarityThreshold'
  --totalDiversity arg (=1)            Required diversity between any two compounds in a solution set [0.9 to
                                       1.0].
                                       Note: Only available if --maxNofResults' is <= 500.
                                       WARNING: any value below 1.0 drastically extends the run time.

 Deprecated options:
  -l [ --library ] arg                 Library input molecule files to calculate similarity score with.
                                       Note: Can't be used together with '--searchFiles'.
  -f [ --fragSpace ] arg               Paths to the Fragment Space FSF files or Fragment Spaces.
                                       Note: The .flf and fragment files specified in the FSF have to be in
                                       the appropriate relative paths.
                                       Note: Can't be used together with '--searchFiles'.

 General options:
  -h [ --help ]                        Prints this help message
  --version                            Prints version info
  --license-info                       Prints license info
  -v [ --verbosity ] arg (=2)          Set verbosity level
                                            0 [silent]
                                            1 [error]
                                            2 [warning]
                                            3 [workflow]
                                            4 [steps]
```

### Define input path for binary and library

Define constant that points to the data/ folder using absolute path in running computer (working dir indepedent).

In [27]:
DATADIR = Path(_dh[0]).parent.parent / "data"
print(DATADIR)

C:\Users\andre\Desktop\Arbeit_Charite\GitHub\covid19-SBapproach\data


<IPython.core.display.Javascript object>

In [33]:
# FTrees executable
# Replace with your absolute path if location is not standard
if sys.platform == "windows" or sys.platform == "win32":
    path_to_binary = (
        rf"{os.environ['LOCALAPPDATA']}\Programs\BioSolveIT\FTrees-6.0\FTrees.exe"
    )
else:
    path_to_binary = find_executable("Ftrees")
# fragment space
if not path_to_binary or not os.path.isfile(path_to_binary):
    raise ValueError("FTrees program cannot be found")
fspace = "REALspace_2019-12.space"
library = DATADIR / f"{fspace}"

<IPython.core.display.Javascript object>

### Set command line options

In [34]:
# Maximum number of top-ranking result molecules [1 to 1000000].
max_nof_results = 50
# Similarity threshold below which molecules are discarded [0.0 to 1.0].
min_sim_thresh = 0.8
# Desired target similarity to the query molecule [0.5 to 1.0]. Note: Must be >= '--minSimilarityThreshold'
target_sim = 1
# Required diversity between any two compounds in a solution set [0.9 to 1.0].
total_div = 0.95
# Generates 3d coordinates in case of SDF output files. 0 = off
gen_3d_output = 1

<IPython.core.display.Javascript object>

### Example 1: Run for individual Smiles

In [35]:
smi = "CC(=O)NCCC1=CNC2=C1C=C(C=C2)OC"
# Output base files (suffixes are required). Only '.sdf' and '.csv' are supported.
outfile_name = "ex1_out"

<IPython.core.display.Javascript object>

### Prepare system call

In [36]:
sys_call = [
    path_to_binary,
    "--input",
    smi,
    "--searchFiles",
    library,
    "--maxNofResults",
    max_nof_results,
    "--outputFiles",
    DATADIR / f"{outfile_name}.sdf",
    "--minSimilarityThreshold",
    min_sim_thresh,
    "--targetSimilarity",
    target_sim,
    "--totalDiversity",
    total_div,
    "--gen3dOutput",
    gen_3d_output,
]
print(*sys_call)

C:\Users\andre\AppData\Local\Programs\BioSolveIT\FTrees-6.0\FTrees.exe --input CC(=O)NCCC1=CNC2=C1C=C(C=C2)OC --searchFiles C:\Users\andre\Desktop\Arbeit_Charite\GitHub\covid19-SBapproach\data\REALspace_2019-12.space --maxNofResults 50 --outputFiles C:\Users\andre\Desktop\Arbeit_Charite\GitHub\covid19-SBapproach\data\ex1_out.sdf --minSimilarityThreshold 0.8 --targetSimilarity 1 --totalDiversity 0.95 --gen3dOutput 1


<IPython.core.display.Javascript object>

In [37]:
out = subprocess.check_output([str(x) for x in sys_call])
out

b'\r\nQueryMolecule:  \tnofMatches: 50\r\n   matching molecule: s_22____13025516____58844 \t   source: REALspace_2019-12 \t   similarity: 1\r\n   matching molecule: s_2708____15506640____8988634 \t   source: REALspace_2019-12 \t   similarity: 0.949915\r\n   matching molecule: s_22____13913040____9296322 \t   source: REALspace_2019-12 \t   similarity: 0.949527\r\n   matching molecule: s_22____13771018____13206548 \t   source: REALspace_2019-12 \t   similarity: 0.949454\r\n   matching molecule: s_240690b____17036658____3025692 \t   source: REALspace_2019-12 \t   similarity: 0.949445\r\n   matching molecule: s_270004____8288600____16621896 \t   source: REALspace_2019-12 \t   similarity: 0.949418\r\n   matching molecule: s_22____9276248____58844 \t   source: REALspace_2019-12 \t   similarity: 0.949319\r\n   matching molecule: s_22____9631354____337030 \t   source: REALspace_2019-12 \t   similarity: 0.949218\r\n   matching molecule: s_22____17638894____58844 \t   source: REALspace_2019-12 \

<IPython.core.display.Javascript object>

### Collect results 

In [40]:
load_file = str(DATADIR / f"{outfile_name}_1.sdf")
df = PandasTools.LoadSDF(load_file)
df.head()
# display(HTML(df.to_html()))

Unnamed: 0,result-rank,similarity,query-name,query-smiles,source,similarity-descriptor,similarity-descriptor-smiles,reaction-name,reagent1-name,reagent1-smiles,reagent2-name,reagent2-smiles,ID,ROMol
0,1,1.0,,O=C(NCCC=1c2c(NC1)ccc(OC)c2)C,REALspace_2019-12,1.000;1.000;1.000;1.000;1.000;1.000;1.000;1.00...,C1=CNcc1;c1ccccc1;O=C;N;C;C;O;C;C;,s_22,13025516,[R1*]NCCC=1c2c(OC)cccc2NC1,58844,[R1*]C(=O)C,s_22____13025516____58844,"<img src=""data:image/png;base64,iVBORw0KGgoAAA..."
1,2,0.95,,O=C(NCCC=1c2c(NC1)ccc(OC)c2)C,REALspace_2019-12,1.000;1.000;0.840;1.000;1.000;1.000;1.000;1.00...,C1=CNcc1;c1ccccc1;O=C;N;C;C;O;C;O=C;N.O.C,s_2708,15506640,[R1*]C(=O)NCCC=1c2c(NC1)c(OC)ccc2,8988634,[R1*]NOC,s_2708____15506640____8988634,"<img src=""data:image/png;base64,iVBORw0KGgoAAA..."
2,3,0.95,,O=C(NCCC=1c2c(NC1)ccc(OC)c2)C,REALspace_2019-12,1.000;0.980;0.936;1.000;0.969;0.969;0.0;0.0;0....,c1C=CNc1;Fc1ccccc1;C(=O)C;N;CC;CC;;;CC;C,s_22,13913040,[R1*]NC(CC=1c2c(NC1)ccc(F)c2)C,9296322,[R1*]C(=O)C#CC,s_22____13913040____9296322,"<img src=""data:image/png;base64,iVBORw0KGgoAAA..."
3,4,0.949,,O=C(NCCC=1c2c(NC1)ccc(OC)c2)C,REALspace_2019-12,1.000;1.000;0.914;1.000;1.000;1.000;1.000;0.95...,C1=CNcc1;c1ccccc1;O=CC;N;C;C;O;C;C;C.C.C.C,s_22,13771018,[R1*]NCCC=1c2c(NC1)ccc(OCCC)c2,13206548,[R1*]C(=O)C(=C)CC,s_22____13771018____13206548,"<img src=""data:image/png;base64,iVBORw0KGgoAAA..."
4,5,0.949,,O=C(NCCC=1c2c(NC1)ccc(OC)c2)C,REALspace_2019-12,0.998;0.958;0.887;1.000;1.000;1.000;0.958;0.0;...,C1=CNcc1;c1cccnc1;O=CC;N;C;C;c1cccnc1;;C;[NH3+],s_240690b,17036658,[R1*]NCCC=1c2c(nccc2)NC1,3025692,[R1*]C(=O)C(N)C,s_240690b____17036658____3025692,"<img src=""data:image/png;base64,iVBORw0KGgoAAA..."


<IPython.core.display.Javascript object>

### Example 2: Run for protease set

In [None]:
# get protease sdf file
protease_sdf = str(DATADIR / "proteaseFDAdrugs_3D.sdf")
# Output base files (suffixes are required). Only '.sdf' and '.csv' are supported.
outfile_name = "ex2_out"
# generate more results
max_nof_results = 100

In [None]:
df_tmp = PandasTools.LoadSDF(protease_sdf)
display(HTML(df_tmp.to_html()))

### Prepare system call

In [None]:
sys_call = [
    path_to_binary,
    "--input",
    protease_sdf,
    "--searchFiles",
    library,
    "--maxNofResults",
    max_nof_results,
    "--outputFiles",
    DATADIR / f"{outfile_name}.sdf",
    "--minSimilarityThreshold",
    min_sim_thresh,
    "--targetSimilarity",
    target_sim,
    "--totalDiversity",
    total_div,
    "--gen3dOutput",
    gen_3d_output,
]
print(*sys_call)

In [None]:
# run
out = subprocess.check_output([str(x) for x in sys_call])
out

In [None]:
df2 = PandasTools.LoadSDF(str(DATADIR / f"{outfile_name}_1.sdf"))
display(HTML(df2.to_html()))

In [None]:
Draw.MolsToGridImage(df2.ROMol, molsPerRow=5)