# This pipeline is a fork of DockM8

In [1]:
#Import required libraries and scripts
from DockM8.docking_functions import *
from DockM8.rescoring_functions import *
from DockM8.consensus_methods import *
# from DockM8.scripts.dogsitescorer import *
# from DockM8.scripts.get_pocket import *
from tqdm.autonotebook import tqdm

In [2]:
protein_name  = 'chain_cd_noligand.pdb'
ligand_library = 'ecft_scores_new_cleaned.sdf'
reference_ligand = 'protein_protoss_noligand_pocket.pdb'
snapshot_IDs = ['p9', 'p11']

In [3]:
HERE = Path(_dh[-1])
DATA = (HERE / "data")

# Move input data (protein pdb, docking library and reference ligand) to data directory
software = (HERE / "software")
protein_file = (DATA  / protein_name)
ligand_library = (DATA / ligand_library)
ref_file = (DATA / reference_ligand)


OUTPUT = DATA / "results"


### Move snapshots to another different folders

### Load Ground truth data with 2D compounds ['ID', '2D structure', 'Activity score']

# Data-preprocessing

### Protein is prepared by [Protoss](https://proteins.plus/)

### Ligand library preparation by Gypsum-DL for 3D conformers generation

In [4]:
from data_preparation import run_gypsumdl


prepared_library_path = OUTPUT / f"{ligand_library.stem}_prepared.sdf"
run_gypsumdl(ligand_library, prepared_library_path)


Molecules are already prepared


In [5]:
df_prepared = PandasTools.LoadSDF(str(prepared_library_path))
df_prepared.head(5)

Unnamed: 0,ID,ROMol
0,HIPS6128,<rdkit.Chem.rdchem.Mol object at 0x7fea69651000>
1,HIPS449,<rdkit.Chem.rdchem.Mol object at 0x7fea69651070>
2,HIPS6989,<rdkit.Chem.rdchem.Mol object at 0x7fea696510e0>
3,HIPS7002,<rdkit.Chem.rdchem.Mol object at 0x7fea69651150>
4,HIPS7000,<rdkit.Chem.rdchem.Mol object at 0x7fea696511c0>


In [6]:
docking_programs = [
                # 'GNINA', 
                # 'SMINA',
                # 'local_diffdock', 
                # 'PLANTS', 
                'flexx',
                ]


consensus_methods = []
n_poses = 10
exhaustiveness = 8

# Docking

1. Local DiffDock
2. PLANTS (Implemented by DockM8)
3. SMINA (Implemented by DockM8)
4. GNINA (Implemented by DockM8)
5. FlexX

NOTE : Output of docking step should have at least two columns 
 1. **ID** : Name of the compound , name of docking tool and number of pose e.g. (compoundX_diffdock_01)
 2. **Molecule** : Poses of every docking tool

In [7]:
from docking import docking
docking(
        docking_programs,
        protein_file,
        prepared_library_path,
        ref_file,
        exhaustiveness,
        n_poses
        )

Extracting ligand coordinates supports either SDF files or PDB files...
Reference ligand is already in SDF format
/home/ibrahim/Github/general_pipeline/data/results/flexx/flexx_poses.sdf
Compounds are already docked with FlexX v 6.0


Compounds are already docked and concatenated, CHECK /home/ibrahim/Github/general_pipeline/data/results/flexx/flexx_poses.sdf


Unnamed: 0,ID,Molecule
0,HIPS6128_01,<rdkit.Chem.rdchem.Mol object at 0x7fea6918e730>
1,HIPS6128_02,<rdkit.Chem.rdchem.Mol object at 0x7fea6918d2a0>
2,HIPS6128_03,<rdkit.Chem.rdchem.Mol object at 0x7fea6918d5b0>
3,HIPS6128_04,<rdkit.Chem.rdchem.Mol object at 0x7fea690a33e0>
4,HIPS6128_05,<rdkit.Chem.rdchem.Mol object at 0x7fea690a3450>
...,...,...
2094,HIPS6787_06,<rdkit.Chem.rdchem.Mol object at 0x7fea69118f90>
2095,HIPS6787_07,<rdkit.Chem.rdchem.Mol object at 0x7fea69119000>
2096,HIPS6787_08,<rdkit.Chem.rdchem.Mol object at 0x7fea69119070>
2097,HIPS6787_09,<rdkit.Chem.rdchem.Mol object at 0x7fea691190e0>


In [13]:
PandasTools.WriteSDF(df, str(sdf_output),idName='ID', molColName='ROMol', properties=list(df.columns), allNumeric=False)

In [11]:
sdf_output='/home/ibrahim/Github/general_pipeline/data/results/flexx/flexx_poses.sdf'

df = PandasTools.LoadSDF(str(sdf_output))[['ID', 'ROMol']]
display(df)
if df['ID'].str.split('_').str.len().max() == 2:
    print('ID format is incorrect, fixing it ...')
    df['ID'] = df['ID'].str.split('_').str[0] + '_flexx_' + df['ID'].str.split('_').str[1]
    PandasTools.WriteSDF(df, str(sdf_output),idName='ID', molColName='ROMol', properties=list(df.columns), allNumeric=False)
else:
    print('ID format is correct')


KeyError: "None of [Index(['ID', 'ROMol'], dtype='object')] are in the [columns]"

In [None]:
def calculate_rmsd_matrix(mol):
    """Calculate an RMSD matrix for the conformers in a molecule."""
    rmsd_matrix = np.zeros((10, 10))

    for i in range(len(supplier)):
        for j in range(len(supplier)):
            rmsd_value = AllChem.GetBestRMS(mol[i], mol[j])
            rmsd_matrix[i][j] = round(rmsd_value, 2)
            rmsd_matrix[j][i] = round(rmsd_value, 2)
    
    return rmsd_matrix

    # Load molecule with multiple conformers (example: SDF file)
supplier = Chem.SDMolSupplier(str(OUTPUT / 'local_diffdock'/ 'local_diffdock_poses.sdf'))
rmsd_matrix = calculate_rmsd_matrix(supplier)
display(rmsd_matrix)

RuntimeError: No sub-structure match found between the reference and probe mol

In [None]:
@ TODO : Rescore poses with all rescoring functions

# Rescoring

### Choose wanted scoring function from the next list

In [None]:
rescoring = [
    'gnina', 
    'AD4',  
    'LinF9', 
    'RTMScore', 
    'vinardo', 
    'SCORCH',
        
    'CHEMPLP', 
    'rfscorevs_V1',
    'rfscorevs_V2',
    'rfscorevs_V3', 
    'vina_hydrophobic', 
    'vina_intra_hydrophobic'
    ]


### Run Rescoring

# Consensus ranking methods (Implemented by DockM8)
### You can also choose the ranking methods according to you preference

### Run Ranking methods