# 1. Install dependancies.

In [1]:
!pip install pandas pandarallel tqdm ipynb ipywidgets==7.7.2



# 2. Import and working directory

In [7]:
import pandas as pd
import tqdm.notebook as tqdm
from tqdm import tqdm as tq
import glob
import pathlib
import os
import re
import numpy as np
import subprocess
tq.pandas()

from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True, nb_workers=6) #6 process, each will take 8 cores, which leave me with 48 other cores (96 cores in total)

#Regex for best mode energy
regex_best_energy = re.compile(r"^1 +(-?[0-9]+\.[0-9]+) +0\.000 +0.000 +")



INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [8]:
os.chdir("/home/thibault/work/projects/other/Sylvie/SMINA")

# 3. Searching for pdbqt files

This will search all `pdbqt` files in `ligands` folder and put them in a pandas dataframe (used for quick parallelisation)

In [9]:
ligand_list = glob.glob("ligands/*.pdbqt")
df = pd.DataFrame(ligand_list, columns=["LigandFile"])

# Run processess

In [13]:
def run_smina(row):
    file=row.iloc[0]
    basename = pathlib.Path(file).stem
    outfolder = f"all_poses/{basename}"
    if not os.path.exists(outfolder):
        os.makedirs(outfolder)

    outputposes = f"{outfolder}/{basename}.pdbqt"
    outputlog = f"{outfolder}/{basename}.log"

    #default success = False. Only change it when it worked.
    success = False
    best_energy = np.nan
    #Check if the logfile already exist
    #if os.path.exists(outputlog):
    if False:
        with open(outputlog, "r") as log:
            stdout = log.readlines()
        success = True
    else:
        results = subprocess.run([
            "./smina.static", 
            "--config","ACDC_B.inp",
            "--ligand", file, 
            "--out", outputposes, 
            "--log",outputlog,
            "--cpu","8", 
            "--scoring","vinardo"],
            capture_output=True)
        if results.stdout == 0:
            success = True
        if success == True:
            stdout = results.stdout.decode("utf-8").split("\n")
        

    if success == True:
        #Get the best energy
        for line in stdout:
            match = regex_best_energy.findall(line)
            if match:
                best_energy=float(match[0])
                break
            else:
                best_energy = np.nan


    return pd.Series(
            {
            "name":basename,
            "filename":file,
            "success":success,
            "BestEnergy":best_energy
            }
        )
results = df[:20].parallel_apply(lambda x: run_smina(x), axis=1)

results.to_csv("results.csv",sep=";")

100%|██████████| 20/20 [05:05<00:00, 15.29s/it]


In [12]:
results

Unnamed: 0,name,filename,success,BestEnergy
0,TCMDC-140876,ligands/TCMDC-140876.pdbqt,True,-8.3
1,TCMDC-124866,ligands/TCMDC-124866.pdbqt,True,-5.3
2,TCMDC-132783,ligands/TCMDC-132783.pdbqt,True,-7.6
3,TCMDC-125318,ligands/TCMDC-125318.pdbqt,True,-6.7
4,TCMDC-137170,ligands/TCMDC-137170.pdbqt,True,-7.6
5,TCMDC-133954,ligands/TCMDC-133954.pdbqt,True,-9.0
6,TCMDC-138571,ligands/TCMDC-138571.pdbqt,True,-7.2
7,TCMDC-139954,ligands/TCMDC-139954.pdbqt,True,-9.8
8,TCMDC-133911,ligands/TCMDC-133911.pdbqt,True,-8.3
9,TCMDC-134380,ligands/TCMDC-134380.pdbqt,True,-6.6
