# This pipeline is a fork of DockM8

In [1]:
#Import required libraries and scripts
from DockM8.docking_functions import *
from DockM8.rescoring_functions import *
from DockM8.consensus_methods import *
# from DockM8.scripts.dogsitescorer import *
# from DockM8.scripts.get_pocket import *
from tqdm.autonotebook import tqdm

In [2]:
# Names of the input files; a later cell joins each of them onto the data directory.
protein_name = 'protein_protoss_noligand.pdb'
reference_ligand = 'ref_ligand.pdb'
ligand_library = 'ecft_scores_new_cleaned.sdf'
# snapshot_IDs = ['p9', 'p11']

In [17]:
# Project layout, anchored on the most recent entry of IPython's directory history (`_dh`).
# Input data (protein pdb, docking library and reference ligand) must be moved to the data directory first.
HERE = Path(_dh[-1])
software = HERE / "software"
DATA = HERE / "data"
OUTPUT = DATA / "results"

# Full paths of the three input files.
protein_file = DATA / protein_name
ref_file = DATA / reference_ligand
ligand_library = DATA / ligand_library  # rebinds the file name to its full path

print(ligand_library)

/home/ibrahim/Github/ECFT-VS-pipeline/data/ecft_scores_new_cleaned.sdf


### Move snapshots to separate folders

### Load Ground truth data with 2D compounds ['ID', '2D structure', 'Activity score']

# Data-preprocessing

### Protein is prepared by [Protoss](https://proteins.plus/)

### Ligand library preparation by Gypsum-DL for 3D conformers generation

In [4]:
from data_preparation import run_gypsumdl

# The prepared library goes to the results directory as "<library stem>_prepared.sdf".
prepared_library_path = OUTPUT.joinpath(ligand_library.stem + "_prepared.sdf")

# Hand the raw library and the destination path to the Gypsum-DL preparation step.
run_gypsumdl(ligand_library, prepared_library_path)

Molecules are already prepared


In [5]:
# Read the prepared library into a DataFrame and preview its first five rows.
df_prepared = PandasTools.LoadSDF(
    str(prepared_library_path)
)
df_prepared.head()

Unnamed: 0,ID,ROMol
0,HIPS6128,<rdkit.Chem.rdchem.Mol object at 0x7f0071d7cc80>
1,HIPS449,<rdkit.Chem.rdchem.Mol object at 0x7f00761aab90>
2,HIPS6989,<rdkit.Chem.rdchem.Mol object at 0x7f011035a500>
3,HIPS7002,<rdkit.Chem.rdchem.Mol object at 0x7f0071d7ccf0>
4,HIPS7000,<rdkit.Chem.rdchem.Mol object at 0x7f0071d7cd60>


In [6]:
# Docking engines handed to the docking step.
docking_programs = ['GNINA', 'SMINA', 'local_diffdock', 'PLANTS', 'flexx']

# Docking settings; both values are passed to the docking call further down.
n_poses = 10
exhaustiveness = 8

# Left empty here; not referenced by the cells visible in this notebook section.
consensus_methods = []

# Docking

1. Local DiffDock
2. PLANTS (Implemented by DockM8)
3. SMINA (Implemented by DockM8)
4. GNINA (Implemented by DockM8)
5. FlexX

NOTE: The output of the docking step should have at least two columns:
 1. **ID**: Name of the compound, name of the docking tool, and pose number, e.g. `compoundX_diffdock_01`
 2. **Molecule**: The poses produced by each docking tool

In [None]:
from docking import docking

# Dock the prepared library against the protein with every selected engine,
# passing the reference ligand file and the docking settings along.
docking(docking_programs, protein_file, prepared_library_path, ref_file, exhaustiveness, n_poses)

Extracting ligand coordinates supports either SDF files or PDB files...
Compounds are already docked with GNINA v 1.0
Compounds are already docked with SMINA
Binding pocket is already extracted


Local DiffDock is running ...:   0%|          | 0/212 [00:00<?, ?it/s]

Compound HIPS6128 is already docked with Local DiffDock
Compound HIPS449 is already docked with Local DiffDock
Compound HIPS6989 is already docked with Local DiffDock
Compound HIPS7002 is already docked with Local DiffDock
Compound HIPS7000 is already docked with Local DiffDock
Compound HIPS6994 is already docked with Local DiffDock
Compound HIPS6991 is already docked with Local DiffDock
Compound HIPS6998 is already docked with Local DiffDock
Compound HIPS7006 is already docked with Local DiffDock
Compound HIPS7004 is already docked with Local DiffDock
Compound HIPS7001 is already docked with Local DiffDock
Compound HIPS6992 is already docked with Local DiffDock
Compound HIPS6981 is already docked with Local DiffDock
Compound HIPS6999 is already docked with Local DiffDock
Compound HIPS7242 is already docked with Local DiffDock
Compound HIPS470 is already docked with Local DiffDock
Compound HIPS6984 is already docked with Local DiffDock
Compound HIPS6990 is already docked with Local Dif

# Rescoring

### Choose the desired scoring functions from the following list

In [15]:
# Scoring functions handed to the rescoring step.
rescoring = [
    'gnina_rescoring', 'ad4', 'linf9', 'rtmscore', 'vinardo', 'scorch',
    'chemplp', 'rfscorevs_v1', 'rfscorevs_v2', 'rfscorevs_v3',
    'vina_hydrophobic', 'vina_intra_hydrophobic',
]

# Load important dataframes


In [16]:
from rescoring import rescoring_function

# SDF of docked poses (in the results directory) that is handed to the rescoring step.
docked_library_path = OUTPUT / "allposes.sdf"

# Apply every selected scoring function to the docked poses.
rescoring_function(rescoring, protein_file, docked_library_path, ref_file)

protein is already converted to mol2


Now rescoring with GNINA_RESCORING ... ⌛⌛ 
gnina_rescoring is already excuted
gnina_rescoring is already read


Now rescoring with AD4 ... ⌛⌛ 
ad4 is already excuted
ad4 is already read


Now rescoring with LINF9 ... ⌛⌛ 
linf9 is already excuted
linf9 is already read


Now rescoring with RTMSCORE ... ⌛⌛ 
rtmscore is already excuted
rtmscore is already read


Now rescoring with VINARDO ... ⌛⌛ 
vinardo is already excuted
vinardo is already read


Now rescoring with SCORCH ... ⌛⌛ 
scorch is already excuted
scorch is already read


Now rescoring with CHEMPLP ... ⌛⌛ 
chemplp is already excuted
chemplp is already read


Now rescoring with RFSCOREVS_V1 ... ⌛⌛ 
rfscorevs_v1 is already excuted
rfscorevs_v1 is already read


Now rescoring with RFSCOREVS_V2 ... ⌛⌛ 
rfscorevs_v2 is already excuted
rfscorevs_v2 is already read


Now rescoring with RFSCOREVS_V3 ... ⌛⌛ 
rfscorevs_v3 is already excuted
rfscorevs_v3 is already read


Now rescoring with VINA_HYDROP

# Consensus ranking methods (Implemented by DockM8)
### You can also choose the ranking methods according to your preference

In [7]:
# Consensus ranking methods handed to the ranking step.
ranking_methods = [
    'best_ECR', 'ECR_average', 'average_ECR',
    'rank_by_rank', 'rank_by_vote',
    'best_Zscore', 'average_Zscore',
]

In [8]:
def _to_numeric_if_possible(column):
    """Return `column` converted to a numeric dtype, or unchanged if it cannot be parsed.

    Explicit replacement for `pd.to_numeric(..., errors='ignore')`, which is
    deprecated in recent pandas releases; the result is the same.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Rescoring results from the results directory; columns that parse as numbers become numeric,
# all other columns are kept as they were read.
df_rescored = pd.read_csv(str(OUTPUT / 'all_rescoring_results.csv')).apply(_to_numeric_if_possible)

# Ground-truth table: compound ID plus the library's 'score' property, renamed to 'true_value'.
# NOTE(review): 'true_value' is not converted to a numeric dtype here — confirm that
# poses_ranking handles whatever dtype LoadSDF returns for SDF properties.
df_scores = (
    PandasTools.LoadSDF(str(ligand_library))[['ID', 'score']]
    .rename(columns={'score': 'true_value'})
)

### Run Ranking methods

In [9]:
from ranking import *

# Run the selected ranking methods on the rescored poses, passing the results
# directory and the ground-truth scores along.
poses_ranking(ranking_methods, df_rescored, OUTPUT, df_scores)


Number of possible combinations for every ranking method: 1015777
 With total combinations : 7110439
Parallelizing best_ECR...


100%|██████████| 145111/145111 [1:17:19<00:00, 31.28it/s]
 92%|█████████▏| 133057/145111 [1:17:28<08:09, 24.61it/s]

Execution time: 4648.840747117996 seconds


100%|██████████| 145111/145111 [1:24:14<00:00, 28.71it/s]
 88%|████████▊ | 127380/145111 [1:24:21<09:05, 32.51it/s]

Execution time: 5062.0362293720245 seconds


100%|██████████| 145111/145111 [1:25:19<00:00, 28.35it/s]
 89%|████████▉ | 129648/145111 [1:25:25<07:20, 35.13it/s]

Execution time: 5125.631903886795 seconds


100%|██████████| 145111/145111 [1:28:16<00:00, 27.40it/s]
 89%|████████▉ | 129646/145111 [1:28:22<07:15, 35.53it/s]

Execution time: 5302.76443195343 seconds


100%|██████████| 145111/145111 [1:30:13<00:00, 26.80it/s]
 98%|█████████▊| 141502/145111 [1:30:18<01:19, 45.51it/s]

Execution time: 5418.703077316284 seconds


100%|██████████| 145111/145111 [1:31:34<00:00, 26.41it/s]
 95%|█████████▍| 137482/145111 [1:31:38<03:18, 38.38it/s]

Execution time: 5498.658306598663 seconds


100%|██████████| 145111/145111 [1:34:24<00:00, 25.62it/s]


Execution time: 5668.64576125145 seconds
Parallelizing ECR_average...


 84%|████████▍ | 121767/145111 [1:45:52<15:25, 25.24it/s] 
 92%|█████████▏| 132883/145111 [1:46:00<07:35, 26.87it/s]

Execution time: 6360.388710260391 seconds


100%|██████████| 145111/145111 [1:50:52<00:00, 21.81it/s]
 95%|█████████▌| 138058/145111 [1:50:59<03:19, 35.30it/s]

Execution time: 6659.607899188995 seconds


100%|██████████| 145111/145111 [1:51:15<00:00, 21.74it/s]
 93%|█████████▎| 134435/145111 [1:51:21<04:56, 35.98it/s]

Execution time: 6681.409529685974 seconds


100%|██████████| 145111/145111 [1:53:55<00:00, 21.23it/s]
 96%|█████████▌| 139068/145111 [1:54:01<02:40, 37.69it/s]

Execution time: 6841.373648405075 seconds


100%|██████████| 145111/145111 [1:55:32<00:00, 20.93it/s]
 99%|█████████▉| 143353/145111 [1:55:37<00:40, 43.29it/s]

Execution time: 6937.5613758563995 seconds


100%|██████████| 145111/145111 [1:56:13<00:00, 20.81it/s]
 95%|█████████▌| 138334/145111 [1:56:17<02:33, 44.19it/s]

Execution time: 6978.16214466095 seconds


100%|██████████| 145111/145111 [1:58:44<00:00, 20.37it/s]


Execution time: 7128.486462831497 seconds
Parallelizing average_ECR...


100%|██████████| 145111/145111 [1:07:14<00:00, 35.97it/s]
 91%|█████████▏| 132764/145111 [1:07:21<06:05, 33.76it/s]

Execution time: 4041.5251874923706 seconds

 96%|█████████▌| 139569/145111 [1:07:21<02:56, 31.36it/s]




100%|██████████| 145111/145111 [1:09:46<00:00, 34.66it/s]
 97%|█████████▋| 141143/145111 [1:09:53<01:48, 36.55it/s]

Execution time: 4193.101531982422 seconds

 96%|█████████▌| 138658/145111 [1:09:52<03:03, 35.12it/s]




100%|██████████| 145111/145111 [1:09:56<00:00, 34.58it/s]
 96%|█████████▌| 139286/145111 [1:10:02<03:03, 31.75it/s]

Execution time: 4203.152593135834 seconds

 94%|█████████▍| 136235/145111 [1:10:02<04:17, 34.47it/s]




100%|██████████| 145111/145111 [1:11:27<00:00, 33.85it/s]
 99%|█████████▊| 143051/145111 [1:11:33<00:49, 41.54it/s]

Execution time: 4293.282702207565 seconds


100%|██████████| 145111/145111 [1:12:00<00:00, 33.58it/s]
100%|█████████▉| 144589/145111 [1:12:05<00:10, 47.86it/s]

Execution time: 4326.0263686180115 seconds


100%|██████████| 145111/145111 [1:12:15<00:00, 33.47it/s]
 98%|█████████▊| 141896/145111 [1:12:20<01:11, 45.07it/s]

Execution time: 4340.373472690582 seconds


100%|██████████| 145111/145111 [1:13:26<00:00, 32.93it/s]


Execution time: 4410.4860916137695 seconds
Parallelizing rank_by_rank...


100%|██████████| 145111/145111 [52:53<00:00, 45.72it/s] 
 85%|████████▌ | 123457/145111 [53:00<09:41, 37.25it/s]

Execution time: 3180.882963657379 seconds

 89%|████████▊ | 128645/145111 [53:00<06:33, 41.85it/s]




100%|██████████| 145111/145111 [55:55<00:00, 43.25it/s]
 94%|█████████▍| 136309/145111 [56:01<03:25, 42.73it/s]

Execution time: 3361.6917741298676 seconds


100%|██████████| 145111/145111 [56:23<00:00, 42.88it/s]
 97%|█████████▋| 140936/145111 [56:30<01:30, 46.25it/s]

Execution time: 3390.277195930481 seconds


100%|██████████| 145111/145111 [57:49<00:00, 41.83it/s]
 94%|█████████▍| 136722/145111 [57:54<02:55, 47.74it/s]

Execution time: 3474.490665435791 seconds


100%|██████████| 145111/145111 [58:39<00:00, 41.23it/s]
 96%|█████████▌| 139450/145111 [58:44<01:43, 54.56it/s]

Execution time: 3525.20352101326 seconds


100%|██████████| 145111/145111 [58:56<00:00, 41.03it/s]
 97%|█████████▋| 140374/145111 [59:00<01:24, 55.86it/s]

Execution time: 3541.020654439926 seconds


100%|██████████| 145111/145111 [1:00:17<00:00, 40.12it/s]


Execution time: 3621.545594215393 seconds
Parallelizing rank_by_vote...


 87%|████████▋ | 126491/145111 [1:16:40<11:17, 27.46it/s]
 95%|█████████▌| 137922/145111 [1:16:47<04:24, 27.18it/s]

Execution time: 4607.541902065277 seconds


100%|██████████| 145111/145111 [1:20:42<00:00, 29.97it/s]
100%|██████████| 145111/145111 [1:20:43<00:00, 29.96it/s]
 95%|█████████▍| 137295/145111 [1:20:48<05:06, 25.48it/s]

Execution time: 4849.039310932159 seconds


 92%|█████████▏| 133914/145111 [1:20:49<06:23, 29.19it/s]

Execution time: 4850.401869535446 seconds


100%|██████████| 145111/145111 [1:22:44<00:00, 29.23it/s]
 98%|█████████▊| 142026/145111 [1:22:50<01:24, 36.62it/s]

Execution time: 4970.670939683914 seconds


100%|██████████| 145111/145111 [1:23:59<00:00, 28.79it/s]
100%|█████████▉| 145021/145111 [1:24:04<00:02, 39.51it/s]

Execution time: 5044.602001905441 seconds


100%|██████████| 145111/145111 [1:24:06<00:00, 28.75it/s]
 97%|█████████▋| 140851/145111 [1:24:10<01:53, 37.49it/s]

Execution time: 5051.002393960953 seconds


100%|██████████| 145111/145111 [1:25:57<00:00, 28.14it/s]


Execution time: 5161.737718105316 seconds
Parallelizing best_Zscore...


100%|██████████| 145111/145111 [1:17:08<00:00, 31.35it/s]
 80%|███████▉  | 115583/145111 [1:17:15<17:16, 28.48it/s]

Execution time: 4635.835487604141 seconds


 95%|█████████▌| 137980/145111 [1:23:16<03:58, 29.86it/s]
 87%|████████▋ | 126036/145111 [1:23:22<12:17, 25.85it/s]

Execution time: 5003.291789770126 seconds


 93%|█████████▎| 134316/145111 [1:23:37<06:30, 27.64it/s]
 96%|█████████▌| 138834/145111 [1:23:43<03:29, 29.91it/s]

Execution time: 5023.478915929794 seconds


100%|██████████| 145111/145111 [1:26:44<00:00, 27.88it/s]
 97%|█████████▋| 140271/145111 [1:26:50<02:50, 28.40it/s]

Execution time: 5210.56543302536 seconds


100%|██████████| 145111/145111 [1:28:34<00:00, 27.31it/s]
 94%|█████████▍| 136266/145111 [1:28:38<04:50, 30.43it/s]

Execution time: 5318.990394115448 seconds


100%|██████████| 145111/145111 [1:29:33<00:00, 27.01it/s]
 95%|█████████▌| 138434/145111 [1:29:37<03:10, 34.97it/s]

Execution time: 5377.605813026428 seconds


100%|██████████| 145111/145111 [1:32:30<00:00, 26.14it/s]


Execution time: 5554.950061559677 seconds
Parallelizing average_Zscore...


 89%|████████▉ | 129678/145111 [1:18:03<09:41, 26.54it/s]
 86%|████████▌ | 124312/145111 [1:18:10<17:14, 20.10it/s]

Execution time: 4690.96949672699 seconds


100%|██████████| 145111/145111 [1:23:18<00:00, 29.03it/s]
 99%|█████████▉| 144369/145111 [1:23:25<00:20, 35.67it/s]

Execution time: 5005.255040645599 seconds

 92%|█████████▏| 134008/145111 [1:23:24<06:19, 29.29it/s]




100%|██████████| 145111/145111 [1:23:44<00:00, 28.88it/s]
 94%|█████████▍| 136217/145111 [1:23:50<04:48, 30.82it/s]

Execution time: 5030.858374834061 seconds


100%|██████████| 145111/145111 [1:26:02<00:00, 28.11it/s]
 93%|█████████▎| 134231/145111 [1:26:08<05:55, 30.64it/s]

Execution time: 5168.49117064476 seconds


100%|██████████| 145111/145111 [1:27:44<00:00, 27.56it/s]
 99%|█████████▉| 143780/145111 [1:27:49<00:35, 37.70it/s]

Execution time: 5269.506246328354 seconds


100%|██████████| 145111/145111 [1:28:21<00:00, 27.37it/s]
 96%|█████████▌| 139082/145111 [1:28:25<02:41, 37.22it/s]

Execution time: 5306.037352800369 seconds


100%|██████████| 145111/145111 [1:31:00<00:00, 26.57it/s]


Execution time: 5464.490411520004 seconds
Finished average_Zscore...
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/home/ibrahim/mambaforge-pypy3/envs/dockm8/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_983724/1857051794.py", line 4, in <module>
    poses_ranking(
  File "/home/ibrahim/Github/ECFT-VS-pipeline/ranking.py", line 150, in poses_ranking
  File "/home/ibrahim/Github/ECFT-VS-pipeline/ranking.py", line 150, in <listcomp>
  File "/home/ibrahim/mambaforge-pypy3/envs/dockm8/lib/python3.10/site-packages/pandas/io/parsers/readers.py", line 912, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "/home/ibrahim/mambaforge-pypy3/envs/dockm8/lib/python3.10/site-packages/pandas/io/parsers/readers.py", line 577, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/home/ibrahim/mambaforge-pypy3/envs/dockm8/lib/python3.10/site-packages/pandas/io/parsers/readers.py", line 1407, in __init__
    self._engine = self