In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, auc

from rdkit.Chem import AllChem as Chem
from rdkit import DataStructs

import warnings
warnings.filterwarnings("ignore")

In [2]:
#LBVS
def Compute_RocAuc(Diverse_ligands_PS, Final_decoys, fptype="ECFP4", decoys_per_ligand=39):
    """Per-ligand ROC AUC of a fingerprint-similarity search.

    Every ligand is used once as the query. For query ``k`` the remaining
    ligands are labelled 1 and the decoys are labelled 0, each scored by
    ``DataStructs.FingerprintSimilarity`` (library default metric) against the
    query fingerprint; a ROC AUC is computed from those labels and scores.

    Parameters
    ----------
    Diverse_ligands_PS : path or buffer accepted by ``pandas.read_csv``
        CSV with a ``SMILES`` column holding the ligands.
    Final_decoys : path or buffer accepted by ``pandas.read_csv``
        CSV with a ``SMILES`` column holding the decoys. Rows
        ``[k * decoys_per_ligand, (k + 1) * decoys_per_ligand)`` are left out
        when ligand ``k`` is the query.
        NOTE(review): presumably those rows are the decoys generated for
        ligand ``k`` -- confirm against the dataset layout.
    fptype : str, default "ECFP4"
        ``"ECFP4"`` selects a 1024-bit Morgan fingerprint of radius 2. Any
        other value selects a 1024-bit feature-based Morgan fingerprint of
        radius 3 (the notebook passes ``"FCFP6"`` for this).
    decoys_per_ligand : int, default 39
        Size of the per-ligand decoy block excluded for each query.

    Returns
    -------
    tuple
        ``(aucs, mean, std)``: the list of per-query AUC values, followed by
        their mean and (population) standard deviation.

    Raises
    ------
    ValueError
        If a ``SMILES`` entry is missing or cannot be parsed by RDKit.
    """

    def load_mols(csv_path):
        # Parse every SMILES in file order. Rows cannot simply be dropped on
        # failure: the decoy exclusion below is purely positional, so a
        # skipped row would shift every later block.
        mols = []
        for row, smi in enumerate(pd.read_csv(csv_path)['SMILES'].tolist()):
            mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
            if mol is None:
                raise ValueError(
                    f"{csv_path}: row {row} has a missing or unparsable SMILES: {smi!r}"
                )
            mols.append(mol)
        return mols

    if fptype == "ECFP4":
        radius, extra = 2, {}
    else:
        radius, extra = 3, {"useFeatures": True}

    def fingerprint(mol):
        return Chem.GetMorganFingerprintAsBitVect(mol, radius, nBits=1024, **extra)

    fps_ligand = [fingerprint(m) for m in load_mols(Diverse_ligands_PS)]
    fps_decoy = [fingerprint(m) for m in load_mols(Final_decoys)]

    aucs = []
    for k, query in enumerate(fps_ligand):
        own_start = decoys_per_ligand * k
        own_stop = own_start + decoys_per_ligand

        labels, scores = [], []
        for i, fp in enumerate(fps_ligand):
            if i != k:  # the query is never scored against itself
                labels.append(1)
                scores.append(DataStructs.FingerprintSimilarity(query, fp))
        for j, fp in enumerate(fps_decoy):
            if not (own_start <= j < own_stop):
                labels.append(0)
                scores.append(DataStructs.FingerprintSimilarity(query, fp))

        # roc_curve orders the scores itself, so no pre-sorting is needed.
        fpr, tpr, _ = roc_curve(labels, scores)
        aucs.append(auc(fpr, tpr))

    aucs_np = np.array(aucs)
    return aucs, aucs_np.mean(), aucs_np.std()

# ---- LBVS driver: per-case ROC AUC tables for the ECFP4 and FCFP6 fingerprints ----
rows_ecfp = []
rows_fcfp = []
lbvs_dir = os.path.join(os.getcwd(), "datasets_ext_val_classical_VS", "LBVS")
cases = ["HIVRT", "HSP90A", "ESR1", "ESR2", "FAK1"]

# (benchmark label, ligand CSV, decoy CSV), both relative to a case directory.
# MUBDreal and MUBDsyn share the MUBDreal ligand file and differ only in decoys.
lbvs_benchmarks = [
    ("MUBDreal", "MUBDreal/Diverse_ligands_PS.csv", "MUBDreal/Final_decoys.csv"),
    ("MUBDsyn", "MUBDreal/Diverse_ligands_PS.csv", "MUBDsyn/Final_decoys.csv"),
    ("MUV", "MUV/Diverse_ligands_PS.csv", "MUV/Final_decoys.csv"),
    ("DUDE", "DUDE/Diverse_ligands_PS.csv", "DUDE/Final_decoys.csv"),
]

# Per-query AUC lists, one inner list per case, in the order of `cases`.
ecfp4_MUBDreal, ecfp4_MUBDsyn, ecfp4_MUV, ecfp4_DUDE = [], [], [], []
fcfp6_MUBDreal, fcfp6_MUBDsyn, fcfp6_MUV, fcfp6_DUDE = [], [], [], []
ecfp4_auc_lists = {"MUBDreal": ecfp4_MUBDreal, "MUBDsyn": ecfp4_MUBDsyn,
                   "MUV": ecfp4_MUV, "DUDE": ecfp4_DUDE}
fcfp6_auc_lists = {"MUBDreal": fcfp6_MUBDreal, "MUBDsyn": fcfp6_MUBDsyn,
                   "MUV": fcfp6_MUV, "DUDE": fcfp6_DUDE}

# Scan the dataset root once instead of once per path list.
lbvs_case_entries = [entry for entry in os.scandir(lbvs_dir) if entry.is_dir()]

for case in cases:
    # Match on the directory NAME only: matching on the full path would make
    # every entry match whenever a parent directory happens to contain `case`.
    case_matches = [entry.path for entry in lbvs_case_entries if case in entry.name]
    if not case_matches:
        # Fail loudly rather than silently reusing the previous case's files.
        raise FileNotFoundError(f"No LBVS directory found for case {case!r} under {lbvs_dir}")
    case_path = case_matches[-1]  # last match wins when several directories match

    ecfp_row = {"Case": case}
    fcfp_row = {"Case": case}
    for label, ligand_rel, decoy_rel in lbvs_benchmarks:
        ligand_csv = os.path.join(case_path, ligand_rel)
        decoy_csv = os.path.join(case_path, decoy_rel)

        auc_list, auc_mean, auc_std = Compute_RocAuc(ligand_csv, decoy_csv)
        ecfp4_auc_lists[label].append(auc_list)
        ecfp_row[label + "_mean"] = auc_mean
        ecfp_row[label + "_s"] = auc_std

        auc_list, auc_mean, auc_std = Compute_RocAuc(ligand_csv, decoy_csv, "FCFP6")
        fcfp6_auc_lists[label].append(auc_list)
        fcfp_row[label + "_mean"] = auc_mean
        # NOTE(review): the FCFP6 table names this column "_std" while the
        # ECFP4 table uses "_s". Kept as-is so existing readers of the CSVs
        # keep working; consider unifying.
        fcfp_row[label + "_std"] = auc_std

    rows_ecfp.append(ecfp_row)
    rows_fcfp.append(fcfp_row)

df_ecfp = pd.DataFrame(rows_ecfp)
df_ecfp.to_csv("LBVS_table_ecfp4.csv", index=False)
df_fcfp = pd.DataFrame(rows_fcfp)
df_fcfp.to_csv("LBVS_table_fcfp6.csv", index=False)

In [3]:
#SBVS
def read_score(file_path):
    """Collect the lowest ``minimizedAffinity`` per molecule from a docking output file.

    The file is scanned line by line. A line containing ``CHEMBL``, ``ZINC``,
    ``active`` or ``decoy`` is taken as the identifier of the current molecule.
    Whenever a line equal to ``> <minimizedAffinity>`` is met, the following
    line is parsed as a float and kept if it is lower than the best value seen
    so far for the current identifier. Identifiers that never receive a score
    keep the placeholder value 9999.

    Parameters
    ----------
    file_path : str, os.PathLike or os.DirEntry
        File to read. Its base name decides the label: 1 when the name
        contains ``actives``, otherwise 0.

    Returns
    -------
    list
        Flat list ``[label, score, label, score, ...]`` with one pair per
        identifier, in order of first appearance.

    Raises
    ------
    ValueError
        If a score tag appears before any identifier line, if the tag is the
        last line of the file, or if the score line is not a valid float.
    """
    with open(file_path) as f:
        content = [line.rstrip() for line in f]

    best = {}
    current = None
    for i, line in enumerate(content):
        if ('CHEMBL' in line) or ('ZINC' in line) or ('active' in line) or ('decoy' in line):
            best.setdefault(line, 9999)
            current = line

        if line == '> <minimizedAffinity>':
            if current is None:
                raise ValueError(
                    f"{os.fspath(file_path)}: score tag on line {i + 1} precedes any molecule identifier"
                )
            if i + 1 >= len(content):
                raise ValueError(
                    f"{os.fspath(file_path)}: score tag on line {i + 1} has no value line after it"
                )
            score = float(content[i + 1])
            if score < best[current]:
                best[current] = score

    # Works for plain strings as well as DirEntry / Path objects.
    base_name = os.path.basename(os.fspath(file_path))
    label = 1 if 'actives' in base_name else 0

    Dock = []
    for val in best.values():
        Dock.append(label)
        Dock.append(val)

    return Dock

def get_args(Dock_l):
    """Compute ROC curve points and AUC from a flat ``[label, score, ...]`` list.

    Parameters
    ----------
    Dock_l : list
        Flat sequence of alternating label and score values; it is reshaped
        into two columns (label, score).

    Returns
    -------
    tuple
        ``(fpr, tpr, roc_auc)`` as produced by ``roc_curve`` / ``auc``.

    Notes
    -----
    ``roc_curve`` is called with ``pos_label=0``, so rows labelled 0 are the
    positive class and the returned ``fpr`` / ``tpr`` are expressed from that
    point of view.
    NOTE(review): presumably this is how "lower score is better" is encoded
    for the label-1 rows -- confirm before plotting the returned curve.
    """
    pairs = np.array(Dock_l).reshape(-1, 2)
    ranked = pd.DataFrame(pairs, columns=['type', 'dock']).sort_values(by="dock", ascending=True)

    labels = list(ranked['type'])
    scores = list(ranked['dock'])

    fpr, tpr, _ = roc_curve(labels, scores, pos_label=0)
    return fpr, tpr, auc(fpr, tpr)

# ---- SBVS driver: per-case ROC AUC table from the smina docking outputs ----
sbvs_dir = os.path.join(os.getcwd(), "datasets_ext_val_classical_VS", "SBVS")
cases = ["HIVRT", "HSP90A", "ESR1", "ESR2", "FAK1"]

# Files that must exist, under these exact names, in every case directory.
sbvs_named_files = {
    "DUDE_actives": "smina_out_actives_DUDE.sdf",
    "DUDE_decoys": "smina_out_decoys_DUDE.sdf",
    "MUV_actives": "smina_out_actives_MUV.sdf",
    "MUV_decoys": "smina_out_decoys_MUV.sdf",
    "MUBD_actives": "smina_out_actives_MUBD.sdf",
    "MUBDreal_decoys": "smina_out_decoys_MUBDreal.sdf",
}
# (table column, actives role, decoys role)
sbvs_benchmarks = [
    ("DUDE", "DUDE_actives", "DUDE_decoys"),
    ("MUV", "MUV_actives", "MUV_decoys"),
    ("MUBDreal", "MUBD_actives", "MUBDreal_decoys"),
    ("MUBDsyn", "MUBD_actives", "MUBDsyn_decoys"),
]

rows = []
sbvs_case_entries = [entry for entry in os.scandir(sbvs_dir) if entry.is_dir()]

for case in cases:
    case_matches = [entry for entry in sbvs_case_entries if case in entry.name]
    if not case_matches:
        # Fail loudly rather than silently reusing the previous case's directory.
        raise FileNotFoundError(f"No SBVS directory found for case {case!r} under {sbvs_dir}")
    case_dir = case_matches[-1]  # last match wins when several directories match

    entries_by_name = {entry.name: entry for entry in os.scandir(case_dir)}

    # Resolve the exactly-named files; a missing one is an error, not a
    # reason to fall back on whatever the previous case used.
    score_files = {}
    for role, file_name in sbvs_named_files.items():
        if file_name not in entries_by_name:
            raise FileNotFoundError(f"{case_dir.path}: missing {file_name}")
        score_files[role] = entries_by_name[file_name]

    # The MUBDsyn decoy file is whatever regular file remains. With several
    # leftovers, only an unambiguous name hint is accepted, so a stray file
    # can no longer be picked up by accident.
    leftovers = [entry for name, entry in entries_by_name.items()
                 if name not in sbvs_named_files.values() and entry.is_file()]
    hinted = [entry for entry in leftovers if "MUBDsyn" in entry.name]
    if len(leftovers) == 1:
        score_files["MUBDsyn_decoys"] = leftovers[0]
    elif len(hinted) == 1:
        score_files["MUBDsyn_decoys"] = hinted[0]
    else:
        raise FileNotFoundError(
            f"{case_dir.path}: cannot identify the MUBDsyn decoy file among "
            f"{sorted(entry.name for entry in leftovers)}"
        )

    # Parse every file once; the MUBD actives are shared by two benchmarks.
    scores = {role: read_score(entry) for role, entry in score_files.items()}

    roc_aucs = {}
    for column, actives_role, decoys_role in sbvs_benchmarks:
        # `+` builds a new list, so the shared actives list is never mutated.
        _fpr, _tpr, roc_auc = get_args(scores[actives_role] + scores[decoys_role])
        roc_aucs[column] = roc_auc

    rows.append({"Case": case, "MUBDreal": roc_aucs["MUBDreal"],
                 "MUBDsyn": roc_aucs["MUBDsyn"],
                 "MUV": roc_aucs["MUV"],
                 "DUDE": roc_aucs["DUDE"]})

df_smina = pd.DataFrame(rows)
df_smina.to_csv("SBVS_table_smina.csv", index=False)